diff --git a/apps/showcases/PorousMediaGPU/CMakeLists.txt b/apps/showcases/PorousMediaGPU/CMakeLists.txt
index 5de0f2ccfad8fddca102f5bc9b08416a2751cf81..c0622b0c32e0e062267ebe977c9e2aec6aecf2bd 100644
--- a/apps/showcases/PorousMediaGPU/CMakeLists.txt
+++ b/apps/showcases/PorousMediaGPU/CMakeLists.txt
@@ -18,11 +18,11 @@ waLBerla_generate_target_from_python(NAME PorousMediaGPUCodeGen
 
 if (WALBERLA_BUILD_WITH_CUDA OR WALBERLA_BUILD_WITH_HIP)
     waLBerla_add_executable ( NAME PorousMediaGPU
-            FILES PorousMediaGPU.cpp GridGeneration.h Setup.h Types.h InitializerFunctions.cpp TPMS.cpp Inflow.cpp util.cpp utilGPU.cu
+            FILES PorousMediaGPU.cpp GridGeneration.h Setup.h Types.h InitializerFunctions.cpp TPMS.cpp Inflow.cpp Probe.cpp util.cpp utilGPU.cu
             DEPENDS blockforest boundary core field gpu lbm_generated postprocessing stencil timeloop vtk PorousMediaGPUCodeGen )
 else()
     waLBerla_add_executable ( NAME PorousMediaGPU
-            FILES PorousMediaGPU.cpp GridGeneration.h Setup.h Types.h InitializerFunctions.cpp TPMS.cpp Inflow.cpp util.cpp
+            FILES PorousMediaGPU.cpp GridGeneration.h Setup.h Types.h InitializerFunctions.cpp TPMS.cpp Inflow.cpp Probe.cpp util.cpp
             DEPENDS blockforest boundary core field lbm_generated postprocessing stencil timeloop vtk PorousMediaGPUCodeGen )
 
 endif(WALBERLA_BUILD_WITH_CUDA OR WALBERLA_BUILD_WITH_HIP)
diff --git a/apps/showcases/PorousMediaGPU/Inflow.cpp b/apps/showcases/PorousMediaGPU/Inflow.cpp
index f3b009b41155696fef3201f21e1984dc6033a0a3..03ee9423c5d7d6fa80d95e18b06950c7ea9c8d43 100644
--- a/apps/showcases/PorousMediaGPU/Inflow.cpp
+++ b/apps/showcases/PorousMediaGPU/Inflow.cpp
@@ -13,7 +13,7 @@
 //  You should have received a copy of the GNU General Public License along
 //  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
 //
-//! \file InitializerFunctions.h
+//! \file Inflow.cpp
 //! \author Markus Holzer <markus.holzer@fau.de>
 //
 //======================================================================================================================
@@ -101,24 +101,23 @@ void Inflow::init(){
     epsilonVKP_      = std::pow(urms_, real_c(3.0)) / le_;
     kappaKolmogorov_ = std::pow(epsilonVKP_ / std::pow(nu_, real_c(3.0)), real_c(1.0)/real_c(4.0));
 
-    firstMode_ = std::log10(kappaVKP_ / real_c(1000.0));
-    lastMode_  = std::log10(kappaKolmogorov_ * real_c(1000.0));
+    firstMode_ = std::log10(kappaVKP_ / real_c(10.0));
+    lastMode_  = std::log10(kappaKolmogorov_ * real_c(10.0));
     const real_t linDeltaKappa = (lastMode_ - firstMode_) / (real_c(n_) - real_c(1.0));
 
-    kappa_.resize(n_);
-    ut_.resize(n_);
     real_t utSquaredSum = 0.0;
     for(uint_t i = 0; i < n_; ++i){
         const real_t kappaN  = std::pow(real_c(10.0), firstMode_ + (real_c(i) * linDeltaKappa));
         const real_t kappaN1 = std::pow(real_c(10.0), firstMode_ + (real_c(i + 1) * linDeltaKappa));
         const real_t deltaKappa = kappaN1 - kappaN;
-
-        kappa_[i] = kappaN;
         const real_t tmp = std::sqrt(getKarmanPaoEnergy(kappaN, alpha_, kappaVKP_, kappaKolmogorov_, urms_) * deltaKappa);
-        ut_[i] = tmp;
+
+        kappa_.emplace_back(kappaN);
+        ut_.emplace_back(tmp);
         utSquaredSum += tmp*tmp;
     }
 
+    n_ = uint_c(kappa_.size());
     const real_t correctionTKE = std::sqrt(((real_c(3.0) / real_c(2.0)) * (urms_*urms_)) / utSquaredSum);
 
     for(uint_t i = 0; i < n_; ++i){
@@ -130,9 +129,6 @@ void Inflow::init(){
     const Vector3<uint_t> s(n_, setup_.cellsPerBlock[1], setup_.cellsPerBlock[2]);
     getBlockAndGridSize(s, block_, grid_);
 
-    WALBERLA_LOG_DEVEL_VAR(block_)
-    WALBERLA_LOG_DEVEL_VAR(grid_)
-
     WALBERLA_GPU_CHECK(gpuMalloc((void **)&kappaGPU_, sizeof(real_t) * kappa_.size() ));
     WALBERLA_GPU_CHECK(gpuMemcpy( kappaGPU_, &kappa_[0], sizeof(real_t) * kappa_.size(), gpuMemcpyHostToDevice ));
 
diff --git a/apps/showcases/PorousMediaGPU/Inflow.h b/apps/showcases/PorousMediaGPU/Inflow.h
index 6950214eb494145672f1315a4c1f43f312ff784e..7b5aa1efd1f24726fa70eacf9abd3e22877a5a38 100644
--- a/apps/showcases/PorousMediaGPU/Inflow.h
+++ b/apps/showcases/PorousMediaGPU/Inflow.h
@@ -26,8 +26,6 @@
 #include "gpu/ErrorChecking.h"
 #include "gpu/FieldCopy.h"
 
-// #include <curand_kernel.h>
-
 #include "Setup.h"
 #include "util.h"
 #include "utilGPU.h"
diff --git a/apps/showcases/PorousMediaGPU/PorousMediaGPU.cpp b/apps/showcases/PorousMediaGPU/PorousMediaGPU.cpp
index 63354ab574da6de1da6ce6a72370bc803e9a299b..8245b0eb69776e0e66188a3e48217e950239f7b6 100644
--- a/apps/showcases/PorousMediaGPU/PorousMediaGPU.cpp
+++ b/apps/showcases/PorousMediaGPU/PorousMediaGPU.cpp
@@ -83,6 +83,7 @@
 #include "InitializerFunctions.h"
 #include "GridGeneration.h"
 #include "Setup.h"
+#include "Probe.h"
 #include "TPMS.h"
 #include "Types.h"
 
@@ -195,29 +196,18 @@ int main(int argc, char** argv){
 
    auto inflow = std::make_shared<Inflow>(blocks, setup, ids);
 
-   const bool cpu = false; 
-   if(cpu){
-      WALBERLA_LOG_INFO_ON_ROOT("CPU Version")
-      inflow->updateCPU();
-   }
-   else{
-      WALBERLA_LOG_INFO_ON_ROOT("GPU Version")
-      inflow->updateGPU();
-      gpu::fieldCpy< VelocityField_T, gpu::GPUField< VelocityField_T::value_type > >(blocks, ids.velocityFieldInflowSlice, ids.velocityFieldInflowSliceGPU);
-   }
-
+   auto probe = std::make_shared<Probe>(blocks, setup, ids);
 
-   for (auto& block : *blocks){
-      auto velocityFieldInflowSlice = block.getData< VelocityField_T >(ids.velocityFieldInflowSlice);
+   // Vector3<real_t> test(1.5, 0.5, 0.5);
+   probe->addPoint(Vector3<real_t>(1.5, 0.5, 0.5 ));
+   probe->addPoint(Vector3<real_t>(3.51, 0.5, 0.5));
+   probe->addPoint(Vector3<real_t>(100.51, 0.5, 0.5));
 
-      WALBERLA_LOG_DEVEL_VAR(velocityFieldInflowSlice->get(0,20,20,0))
-      WALBERLA_LOG_DEVEL_VAR(velocityFieldInflowSlice->get(0,20,20,1))
-      WALBERLA_LOG_DEVEL_VAR(velocityFieldInflowSlice->get(0,20,20,2))
-   }
+   probe->initialise();
 
    return EXIT_SUCCESS;
 
-
+ 
    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    ///                                      LB SWEEPS AND BOUNDARY HANDLING                                       ///
    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/apps/showcases/PorousMediaGPU/PorousMediaGPU.prm b/apps/showcases/PorousMediaGPU/PorousMediaGPU.prm
index f3243cf9faab4a2fb95272cf88f4017f5ba1eb7d..5757f1d68a791c27116774368a630434f8cb8cf4 100644
--- a/apps/showcases/PorousMediaGPU/PorousMediaGPU.prm
+++ b/apps/showcases/PorousMediaGPU/PorousMediaGPU.prm
@@ -7,7 +7,7 @@ Parameters
     latticeVelocity 0.05;
     initialiseWithInletVelocity true;
 
-    timesteps 1;
+    timesteps 10001;
 
     processMemoryLimit 512; // MiB
     innerOuterSplit <1, 1, 1>;
@@ -19,10 +19,10 @@ Parameters
 Turbulence
 {
     useGrid false;
-    U_rms  0.05;
+    U_rms  0.01;
     characteristicLength 0.4;
-    numberOfFourrierModes 100;
-    U_c < 0.05, 0.0, 0.0 >;
+    numberOfFourrierModes 128;
+    U_c < 2.0, 0.0, 0.0 >;
 }
 
 TPMS
@@ -30,17 +30,18 @@ TPMS
     waveNumber 6.2831853072;
     scaling 10.0;
     strut 1.0;
-    start 2.0;
-    end 5.0;
+    start 2.5;
+    end 5.5;
     type gyroid;
 }
 
 //! [domainSetup]
 DomainSetup
 {
-    domainSize    < 7, 1, 1 >;
-    // cellsPerBlock < 1344, 192, 192 >;
-    cellsPerBlock < 336, 48, 48 >;
+    domainSize    < 8, 1, 1 >;
+    cellsPerBlock < 384, 48, 48 >;
+    // cellsPerBlock < 768, 96, 96 >;
+    // cellsPerBlock < 1536, 192, 192 >;
     blocks    < 1, 1, 1 >;
     periodic    < false, true, true >;
     refinementLevels 0;
@@ -65,13 +66,13 @@ StabilityChecker
 
 VTKWriter
 {
-    vtkWriteFrequency 0;
+    vtkWriteFrequency 10000;
     vtkGhostLayers 0;
     velocity true;
-    writeVelocityAsMagnitude false;
+    writeVelocityAsMagnitude true;
     density false;
-    flag true;
-    writeOnlySlice false;
+    flag false;
+    writeOnlySlice true;
     sliceThickness 1;
     writeXZSlice false;
     amrFileFormat false;
diff --git a/apps/showcases/PorousMediaGPU/Probe.cpp b/apps/showcases/PorousMediaGPU/Probe.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6e88e548287f5ed9b74fda4802b5bef4371bddb6
--- /dev/null
+++ b/apps/showcases/PorousMediaGPU/Probe.cpp
@@ -0,0 +1,68 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file Probe.cpp
+//! \author Markus Holzer <markus.holzer@fau.de>
+//
+//======================================================================================================================
+#include "Probe.h"
+
+namespace walberla{
+
+void Probe::initialise(){
+
+    std::map<IBlock*, std::vector<Vector3<real_t>> > pointsPerBlock;
+
+    const AABB domain = blocks_->getDomain();
+    const uint_t timesteps = setup_.timesteps;
+ 
+    for(auto p: probePoints_){
+        if(!domain.contains(p)){continue;}
+        IBlock* block = blocks_->getBlock(p);
+        if(block != nullptr){
+            const Cell localCell = blocks_->getBlockLocalCell(*block, p);
+            Vector3<real_t> localCellCenter;
+            blocks_->getCellCenter(localCellCenter, localCell);
+
+            pointsPerBlock[block].emplace_back(localCellCenter);
+        }
+    }
+
+    for (auto& block : *blocks_){
+        auto * dataVector = block.getData< DataVector > ( probeDataID );
+        auto & cpuCellVector = dataVector->cpuCellVector();
+        auto & cpuDataVector = dataVector->cpuDataVector();
+
+        const uint_t n = uint_c(pointsPerBlock[&block].size());
+        cpuCellVector.resize(3 * n);
+
+        uint_t i = 0;
+        for(auto p: pointsPerBlock[&block]){
+            const Cell localCell = blocks_->getBlockLocalCell(block, p);
+
+            cpuCellVector[i] = localCell[0];
+            cpuCellVector[i + n] = localCell[1];
+            cpuCellVector[i + 2*n] = localCell[2];
+            ++i;
+        }
+
+        cpuDataVector.resize(4 * n * timesteps);
+        dataVector->syncGPU();
+    }
+}
+
+
+
+} // namespace walberla
diff --git a/apps/showcases/PorousMediaGPU/Probe.h b/apps/showcases/PorousMediaGPU/Probe.h
new file mode 100644
index 0000000000000000000000000000000000000000..4f6017aaa913781bf38c19a9719544571b99d2df
--- /dev/null
+++ b/apps/showcases/PorousMediaGPU/Probe.h
@@ -0,0 +1,115 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file Probe.h
+//! \author Markus Holzer <markus.holzer@fau.de>
+//
+//======================================================================================================================
+#pragma once
+
+#include "domain_decomposition/IBlock.h"
+
+#include "core/logging/Initialization.h"
+#include "core/math/Constants.h"
+
+#include "gpu/GPUWrapper.h"
+#include "gpu/ErrorChecking.h"
+#include "gpu/FieldCopy.h"
+
+#include "Setup.h"
+#include "utilGPU.h"
+
+namespace walberla{
+
+class Probe
+{
+  public:
+  class DataVector{
+    public:
+        DataVector() = default;
+        bool operator==(DataVector const &other) const { return other.cpuCellVector_ == cpuCellVector_ && other.cpuDataVector_ == cpuDataVector_; }
+
+        ~DataVector() {
+            if(synced){
+              if(!cpuCellVector_.empty()){WALBERLA_GPU_CHECK(gpuFree(gpuCellVector_))}
+              if(!cpuDataVector_.empty()){WALBERLA_GPU_CHECK(gpuFree(gpuDataVector_))}
+            }
+        }
+        
+        std::vector<int64_t>& cpuCellVector() { return cpuCellVector_; }
+        std::vector<real_t>& cpuDataVector() { return cpuDataVector_; }
+
+        int64_t* gpuCellVector() {return gpuCellVector_;}
+        real_t* gpuDataVector() {return gpuDataVector_;}
+
+        void syncGPU()
+        {
+          WALBERLA_GPU_CHECK(gpuMalloc((void **)&gpuCellVector_, sizeof(int64_t) * cpuCellVector_.size() ));
+          WALBERLA_GPU_CHECK(gpuMemcpy( gpuCellVector_, &cpuCellVector_[0], sizeof(int64_t) * cpuCellVector_.size(), gpuMemcpyHostToDevice ));
+
+          WALBERLA_GPU_CHECK(gpuMalloc((void **)&gpuDataVector_, sizeof(real_t) * cpuDataVector_.size() ));
+          WALBERLA_GPU_CHECK(gpuMemcpy( gpuDataVector_, &cpuDataVector_[0], sizeof(real_t) * cpuDataVector_.size(), gpuMemcpyHostToDevice ));
+
+          cur_ = gpuDataVector_;
+
+          synced = true;
+        }
+        
+    private:
+        std::vector<int64_t> cpuCellVector_;
+        std::vector<real_t> cpuDataVector_;
+
+        int64_t *gpuCellVector_;
+        real_t *gpuDataVector_;
+
+        real_t *cur_ = nullptr;
+
+        bool synced{false};
+  };
+
+
+
+  Probe(const std::shared_ptr< StructuredBlockForest >& blocks, const Setup& setup, const IDs& ids)
+      : blocks_(blocks), setup_(setup), ids_(ids), frequency_(uint_c(1))
+    {
+        auto createProbeData = []( IBlock * const , StructuredBlockStorage * const ) { return new DataVector(); };
+        probeDataID = blocks_->addStructuredBlockData< DataVector >( createProbeData, "Probe Data");
+
+
+    }
+ 
+  void addPoint(const Vector3<real_t>& point) {
+    probePoints_.emplace_back(point);
+
+  }
+
+  void initialise();
+
+
+  private:
+  const std::shared_ptr< StructuredBlockForest > blocks_;
+  const Setup setup_;
+  const IDs ids_;
+  BlockDataID probeDataID;
+
+  uint_t frequency_;
+
+  std::vector<Vector3<real_t>> probePoints_;
+
+  uint_t executionCounter_;
+
+
+}; // class Probe
+} // namespace walberla
diff --git a/apps/showcases/PorousMediaGPU/Setup.h b/apps/showcases/PorousMediaGPU/Setup.h
index d45dfa2dc9d63fb0c1643c8d1006a0c5e143fdde..5d5b9f6ec6bcd84e1c1141f217fb1ae4ba8e180f 100644
--- a/apps/showcases/PorousMediaGPU/Setup.h
+++ b/apps/showcases/PorousMediaGPU/Setup.h
@@ -48,7 +48,7 @@ struct Setup
       cellsPerBlock       = domainParameters.getParameter< Vector3< uint_t > >("cellsPerBlock");
       rootBlocks          = domainParameters.getParameter< Vector3< uint_t > >("blocks");
       cells               = Vector3<uint_t>(rootBlocks[0] * cellsPerBlock[0], rootBlocks[1] * cellsPerBlock[1], rootBlocks[2] * cellsPerBlock[2]);
-      periodic           = domainParameters.getParameter< Vector3< bool > >("periodic");
+      periodic            = domainParameters.getParameter< Vector3< bool > >("periodic");
       refinementLevels    = domainParameters.getParameter< uint_t >("refinementLevels");
 
       const real_t coarseMeshSize = domainSize[0] / real_c(rootBlocks[0] * cellsPerBlock[0]);
diff --git a/apps/showcases/PorousMediaGPU/util.cpp b/apps/showcases/PorousMediaGPU/util.cpp
index 90d79836a241ba8bf276c7f837ba869e1ae24ff3..cfab8533b63b1bca90d8b2bf11691eb3ed41303e 100644
--- a/apps/showcases/PorousMediaGPU/util.cpp
+++ b/apps/showcases/PorousMediaGPU/util.cpp
@@ -26,16 +26,16 @@ Vector3<real_t> summOfFourrierModes(const Vector3<real_t>& p, const Vector3<real
                                     const std::vector<real_t>& kappa, const std::vector<real_t>& ut, const real_t t){
 
     Vector3<real_t> result(real_c(0.0), real_c(0.0), real_c(0.0));
-    const real_t pi2 = real_c(2.0) * math::pi;
+    const real_t pi2 = real_c(6.28318530717958647693);
     const uint_t n = uint_c(kappa.size());
 
     for(uint_t i = 0; i < n; ++i){
-        const real_t a     = 0.1; // math::realRandom(real_c(0.0), real_c(1.0));
-        const real_t phi   = 0.2; //math::realRandom(real_c(0.0), real_c(1.0)) * pi2;
-        const real_t psi   = 0.3; //math::realRandom(real_c(0.0), real_c(1.0)) * pi2;
-        const real_t zetaX = -0.4; //math::realRandom(real_c(-1.0), real_c(1.0));
-        const real_t zetaY = -0.6; //math::realRandom(real_c(-1.0), real_c(1.0));
-        const real_t zetaZ = 0.8; // math::realRandom(real_c(-1.0), real_c(1.0));
+        const real_t a     = math::realRandom(real_c(0.0), real_c(1.0));
+        const real_t phi   = math::realRandom(real_c(0.0), real_c(1.0)) * pi2;
+        const real_t psi   = math::realRandom(real_c(0.0), real_c(1.0)) * pi2;
+        const real_t zetaX = math::realRandom(real_c(-1.0), real_c(1.0));
+        const real_t zetaY = math::realRandom(real_c(-1.0), real_c(1.0));
+        const real_t zetaZ = math::realRandom(real_c(-1.0), real_c(1.0));
 
         const real_t theta = std::acos(real_c(2.0) * a - real_c(1.0));
         
@@ -51,10 +51,8 @@ Vector3<real_t> summOfFourrierModes(const Vector3<real_t>& p, const Vector3<real
         const real_t zetaXNormalized = zetaX / zettaLength;
         const real_t zetaYNormalized = zetaY / zettaLength;
         const real_t zetaZNormalized = zetaZ / zettaLength;
-
         // WALBERLA_LOG_INFO_ON_ROOT("zeta: [" << zetaXNormalized << ", " << zetaYNormalized << ", " << zetaZNormalized << "]")
         
-
         const real_t sigmaX = kappaY * zetaZNormalized - kappaZ * zetaYNormalized;
         const real_t sigmaY = kappaZ * zetaXNormalized - kappaX * zetaZNormalized;
         const real_t sigmaZ = kappaX * zetaYNormalized - kappaY * zetaXNormalized;
@@ -65,7 +63,6 @@ Vector3<real_t> summOfFourrierModes(const Vector3<real_t>& p, const Vector3<real
         const real_t sigmaXNormalized = sigmaX / sigmaLength;
         const real_t sigmaYNormalized = sigmaY / sigmaLength;
         const real_t sigmaZNormalized = sigmaZ / sigmaLength;
-
         // WALBERLA_LOG_INFO_ON_ROOT("Sigma: [" << sigmaXNormalized << ", " << sigmaYNormalized << ", " << sigmaZNormalized << "]")
 
         const real_t pX = p[0] - (t * uc[0]);
diff --git a/apps/showcases/PorousMediaGPU/utilGPU.cu b/apps/showcases/PorousMediaGPU/utilGPU.cu
index 7cb5912b78e2ccdcbc5fef2af615c5e15b265aab..87ae21283fc2be0c164349019d335933dc736ded 100644
--- a/apps/showcases/PorousMediaGPU/utilGPU.cu
+++ b/apps/showcases/PorousMediaGPU/utilGPU.cu
@@ -13,39 +13,100 @@
 //  You should have received a copy of the GNU General Public License along
 //  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
 //
-//! \file util.cu
+//! \file utilGPU.cu
 //! \author Markus Holzer <markus.holzer@fau.de>
 //
 //======================================================================================================================
 #include "utilGPU.h"
+#include <curand_kernel.h>
 
 namespace walberla
 {
 
 namespace internal{
 
+static __device__ void densityVelocityEven(double* const _data_pdfs,
+                                           double* density, double* uX, double* uY, double* uZ, 
+                                           int64_t const ctr_0, int64_t const ctr_1, int64_t const ctr_2,
+                                           int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3)
+{
+   const double vel0Term = _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 20*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 22*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 24*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 26*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3];
+   const double momdensity_0 = vel0Term - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 19*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 21*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 23*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 25*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3];
+   const double vel1Term = _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 21*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 25*_stride_pdfs_3];
+   const double momdensity_1 = vel1Term - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 19*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 20*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 22*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 23*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 24*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 26*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_3];
+   const double vel2Term = _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 23*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 6*_stride_pdfs_3];
+   const double delta_rho = vel0Term + vel1Term + vel2Term + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 19*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 5*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2];
+   const double momdensity_2 = vel2Term - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 19*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 20*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 21*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 22*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 24*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 25*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 26*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 5*_stride_pdfs_3];
+   const double rho = delta_rho + 1.0;
+   const double u_0 = momdensity_0*((1.0) / (rho));
+   const double u_1 = momdensity_1*((1.0) / (rho));
+   const double u_2 = momdensity_2*((1.0) / (rho));
+   *density = rho;
+   *uX = u_0;
+   *uY = u_1;
+   *uZ = u_2;
+}
+
+static __device__ void densityVelocityOdd(double* const _data_pdfs,
+                                          double* density, double* uX, double* uY, double* uZ, 
+                                          int64_t const ctr_0, int64_t const ctr_1, int64_t const ctr_2,
+                                          int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3)
+{
+   const double vel0Term = _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 19*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 23*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 14*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 18*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 21*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 25*_stride_pdfs_3];
+   const double momdensity_0 = vel0Term - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 20*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 24*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 13*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 17*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 22*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 26*_stride_pdfs_3];
+   const double vel1Term = _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 11*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 15*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 20*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 24*_stride_pdfs_3];
+   const double momdensity_1 = vel1Term + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 19*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 23*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 21*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 25*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 12*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 16*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 22*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 26*_stride_pdfs_3];
+   const double vel2Term = _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 5*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 12*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 13*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 22*_stride_pdfs_3];
+   const double delta_rho = vel0Term + vel1Term + vel2Term + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 6*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 16*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 17*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 26*_stride_pdfs_3];
+   const double momdensity_2 = vel2Term + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 19*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 23*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 14*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 18*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 21*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 25*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 11*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 15*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 6*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 16*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 20*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 24*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 17*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 26*_stride_pdfs_3];
+   const double rho = delta_rho + 1.0;
+   const double u_0 = momdensity_0*((1.0) / (rho));
+   const double u_1 = momdensity_1*((1.0) / (rho));
+   const double u_2 = momdensity_2*((1.0) / (rho));
+   *density = rho;
+   *uX = u_0;
+   *uY = u_1;
+   *uZ = u_2;
+}
+
+__device__ void warpReduce(volatile double *uX, volatile double *uY, volatile double *uZ, unsigned int tid) 
+{
+   uX[tid] += uX[tid+32]; uY[tid] += uY[tid+32]; uZ[tid] += uZ[tid+32];
+   uX[tid] += uX[tid+16]; uY[tid] += uY[tid+16]; uZ[tid] += uZ[tid+16];
+   uX[tid] += uX[tid+ 8]; uY[tid] += uY[tid+ 8]; uZ[tid] += uZ[tid+ 8];
+   uX[tid] += uX[tid+ 4]; uY[tid] += uY[tid+ 4]; uZ[tid] += uZ[tid+ 4];
+   uX[tid] += uX[tid+ 2]; uY[tid] += uY[tid+ 2]; uZ[tid] += uZ[tid+ 2];
+   uX[tid] += uX[tid+ 1]; uY[tid] += uY[tid+ 1]; uZ[tid] += uZ[tid+ 1];
+}
+
 static __device__ void singleMode(double* rX, double* rY, double* rZ, const double kappaN, const double ut,
-                                  const double pX, const double pY, const double pZ)
+                                  const double pX, const double pY, const double pZ, const int64_t linear_index)
 {
 
-   /*
-   curandState state;
-   curand_init(42, xStride*x + yStride*y + zStride*z, 0, &state);
+   const real_t pi2 = 6.28318530717958647693;
 
-   const real_t a   = curand_uniform(&state);
-   const real_t phi = curand_uniform(&state) * pi2;
-   const real_t psi = curand_uniform(&state) * pi2;
-   const real_t zetaX = real_t(2.0) * curand_uniform(&state) - real_t(1.0);
-   const real_t zetaY = real_t(2.0) * curand_uniform(&state) - real_t(1.0);
-   const real_t zetaZ = real_t(2.0) * curand_uniform(&state) - real_t(1.0);
-   */
+   curandStatePhilox4_32_10_t state;
+   curand_init(42, linear_index, 0, &state);
 
+   const float4 r1 = curand_uniform4(&state);
+   const float4 r2 = curand_uniform4(&state);
+
+   const real_t a   = real_t(r1.x);
+   const real_t phi = real_t(r1.y) * pi2;
+   const real_t psi = real_t(r1.z) * pi2;
+   const real_t zetaX = real_t(2.0) * real_t(r1.w) - real_t(1.0);
+   const real_t zetaY = real_t(2.0) * real_t(r2.x) - real_t(1.0);
+   const real_t zetaZ = real_t(2.0) * real_t(r2.y) - real_t(1.0);
+   
+/*
+   // For debugging
    const real_t a     = 0.1; // math::realRandom(real_c(0.0), real_c(1.0));
    const real_t phi   = 0.2; //math::realRandom(real_c(0.0), real_c(1.0)) * pi2;
    const real_t psi   = 0.3; //math::realRandom(real_c(0.0), real_c(1.0)) * pi2;
    const real_t zetaX = -0.4; //math::realRandom(real_c(-1.0), real_c(1.0));
    const real_t zetaY = -0.6; //math::realRandom(real_c(-1.0), real_c(1.0));
    const real_t zetaZ = 0.8; // math::realRandom(real_c(-1.0), real_c(1.0));
+*/
 
    const real_t theta = acos(real_t(2.0) * a - real_t(1.0));
 
@@ -53,21 +114,21 @@ static __device__ void singleMode(double* rX, double* rY, double* rZ, const doub
    const real_t kappaY = sin(theta) * sin(phi) * kappaN;
    const real_t kappaZ = cos(theta) * kappaN;
 
-   const real_t zettaLength = sqrt(zetaX*zetaX + zetaY*zetaY + zetaZ*zetaZ);
+   const real_t zettaLength = rsqrt(zetaX*zetaX + zetaY*zetaY + zetaZ*zetaZ);
    
-   const real_t zetaXNormalized = zetaX / zettaLength;
-   const real_t zetaYNormalized = zetaY / zettaLength;
-   const real_t zetaZNormalized = zetaZ / zettaLength;
+   const real_t zetaXNormalized = zetaX * zettaLength;
+   const real_t zetaYNormalized = zetaY * zettaLength;
+   const real_t zetaZNormalized = zetaZ * zettaLength;
 
    const real_t sigmaX = kappaY * zetaZNormalized - kappaZ * zetaYNormalized;
    const real_t sigmaY = kappaZ * zetaXNormalized - kappaX * zetaZNormalized;
    const real_t sigmaZ = kappaX * zetaYNormalized - kappaY * zetaXNormalized;
 
-   const real_t sigmaLength = sqrt(sigmaX*sigmaX + sigmaY*sigmaY + sigmaZ*sigmaZ);
+   const real_t sigmaLength = rsqrt(sigmaX*sigmaX + sigmaY*sigmaY + sigmaZ*sigmaZ);
 
-   const real_t sigmaXNormalized = sigmaX / sigmaLength;
-   const real_t sigmaYNormalized = sigmaY / sigmaLength;
-   const real_t sigmaZNormalized = sigmaZ / sigmaLength;
+   const real_t sigmaXNormalized = sigmaX * sigmaLength;
+   const real_t sigmaYNormalized = sigmaY * sigmaLength;
+   const real_t sigmaZNormalized = sigmaZ * sigmaLength;
 
    const real_t arg =  real_t(2.0) * ut * cos(kappaX * pX + kappaY * pY + kappaZ * pZ + psi);
 
@@ -82,46 +143,77 @@ static __global__ void summOfFourrierModesKernel(double * velocity, double * con
                                                  const real_t yMin, const real_t zMin, const real_t minCornerY, const real_t minCornerZ,
                                                  const real_t dx, const real_t ucX, const real_t ucY, const real_t ucZ,
                                                  const real_t t, const uint_t n, const real_t latticeVelocity,
-                                                 const int64_t xSize, const int64_t ySize, const int64_t zSize,
-                                                 const int64_t xStride, const int64_t yStride, const int64_t zStride, const int64_t fStride)
+                                                 const int64_t ySize, const int64_t zSize,
+                                                 const int64_t yStride, const int64_t zStride, const int64_t fStride)
 {
-   if(blockDim.x*blockIdx.x + threadIdx.x < xSize && blockDim.y*blockIdx.y + threadIdx.y < ySize && blockDim.z*blockIdx.z + threadIdx.z < zSize)
+   if(blockDim.x*blockIdx.x + threadIdx.x < n && blockDim.y*blockIdx.y + threadIdx.y < ySize && blockDim.z*blockIdx.z + threadIdx.z < zSize)
    {
-    const int64_t x = 0; // blockDim.x*blockIdx.x + threadIdx.x;
-    const int64_t y = blockDim.y*blockIdx.y + threadIdx.y;
-    const int64_t z = blockDim.z*blockIdx.z + threadIdx.z;
-     
+      extern __shared__ double sharedMemory[];
 
-    real_t resultX = 0.0;
-    real_t resultY = 0.0;
-    real_t resultZ = 0.0;
+      double* uPrimeX = &sharedMemory[0];
+      double* uPrimeY = &sharedMemory[n];
+      double* uPrimeZ = &sharedMemory[2*n];
 
-    const real_t xCenter = real_t(0.0);
-    const real_t yCenter = (yMin + ( real_t( y ) + real_t(0.5) ) * dx) + minCornerY;
-    const real_t zCenter = (zMin + ( real_t( z ) + real_t(0.5) ) * dx) + minCornerZ;
-    const real_t pi2 = real_t(2.0) * real_t(3.14159265358979323846);
+      // const int64_t thx = threadIdx.x;
+      const int64_t x   = blockDim.x*blockIdx.x + threadIdx.x;
+      const int64_t y   = blockDim.y*blockIdx.y + threadIdx.y;
+      const int64_t z   = blockDim.z*blockIdx.z + threadIdx.z;
 
-    const real_t pX = xCenter - (t * ucX);
-    const real_t pY = yCenter - (t * ucY);
-    const real_t pZ = zCenter - (t * ucZ);
+      const int64_t linear_index = x + (y * n) + (z * n * ySize);
+     
+      const real_t xCenter = real_t(0.0);
+      const real_t yCenter = (yMin + ( real_t( y ) + real_t(0.5) ) * dx) + minCornerY;
+      const real_t zCenter = (zMin + ( real_t( z ) + real_t(0.5) ) * dx) + minCornerZ;
 
-    for(int64_t i = 0; i < n; ++i){
+      const real_t pX = xCenter - (t * ucX);
+      const real_t pY = yCenter - (t * ucY);
+      const real_t pZ = zCenter - (t * ucZ);
 
       double rX, rY, rZ;
-      singleMode(&rX, &rY, &rZ, kappa[i], ut[i], pX, pY, pZ);
-
-      resultX += rX;
-      resultY += rY;
-      resultZ += rZ;
-    }
-
-        
-    velocity[xStride*x + yStride*y + zStride*z + 0*fStride] = resultX;
-    velocity[xStride*x + yStride*y + zStride*z + 1*fStride] = resultY;
-    velocity[xStride*x + yStride*y + zStride*z + 2*fStride] = resultZ;
-
+      singleMode(&rX, &rY, &rZ, kappa[x], ut[x], pX, pY, pZ, linear_index);
+
+      uPrimeX[x] = rX;
+      uPrimeY[x] = rY;
+      uPrimeZ[x] = rZ;
+
+      __syncthreads();
+
+/*
+      for(unsigned int stride = (blockDim.x/2); stride > 32 ; stride /=2){
+         if(thx < stride){
+            uPrimeX[thx] += uPrimeX[thx + stride];
+            uPrimeY[thx] += uPrimeY[thx + stride];
+            uPrimeZ[thx] += uPrimeZ[thx + stride];
+         }
+      }
+
+      if(thx < 32){
+         warpReduce(uPrimeX, uPrimeY, uPrimeZ, thx);
+      }
+
+      if(thx == 0){
+         velocity[yStride*y + zStride*z + 0*fStride] = uPrimeX[0];
+         velocity[yStride*y + zStride*z + 1*fStride] = uPrimeY[0];
+         velocity[yStride*y + zStride*z + 2*fStride] = uPrimeZ[0];
+      }     
+*/
+      
+      if(x == 0){
+         real_t resultX = 0.0;
+         real_t resultY = 0.0;
+         real_t resultZ = 0.0;
+
+         for(int64_t i = 0; i < n; ++i){
+            resultX += uPrimeX[i];
+            resultY += uPrimeY[i];
+            resultZ += uPrimeZ[i];
+         }
+
+         velocity[yStride*y + zStride*z + 0*fStride] = latticeVelocity + resultX;
+         velocity[yStride*y + zStride*z + 1*fStride] = resultY;
+         velocity[yStride*y + zStride*z + 2*fStride] = resultZ;
+      }
    }
-   
 }
 }
 
@@ -150,20 +242,21 @@ void updateVelocityPlane(gpu::GPUField<real_t>* velocityGPU, real_t* kappa, real
 
    dim3 block_(block[0], block[1], block[2]);
    dim3 grid_(grid[0], grid[1], grid[2]);
+
+   // In shared memory we store the result of the u_prime (x, y, z)
+   const uint_t sharedMemorySize = 3 * n * sizeof(real_t);
  
    double * data_velocity = velocityGPU->dataAt(0, 0, 0, 0);
-   const int64_t size_velocity_0 = int64_c(velocityGPU->xSize());
    const int64_t size_velocity_1 = int64_c(velocityGPU->ySize());
    const int64_t size_velocity_2 = int64_c(velocityGPU->zSize());
-   const int64_t stride_velocity_0 = int64_c(velocityGPU->xStride());
    const int64_t stride_velocity_1 = int64_c(velocityGPU->yStride());
    const int64_t stride_velocity_2 = int64_c(velocityGPU->zStride());
    const int64_t stride_velocity_3 = int64_c(velocityGPU->fStride());
-   internal::summOfFourrierModesKernel<<<grid_, block_, 0, nullptr>>>(data_velocity, kappa, ut,
+   internal::summOfFourrierModesKernel<<<grid_, block_, sharedMemorySize, nullptr>>>(data_velocity, kappa, ut,
                                                                       globalMinCorner[1], globalMinCorner[2], blockMinCorner[1], blockMinCorner[2],
                                                                       dx, uc[0], uc[1], uc[2], t, n, latticeVelocity,
-                                                                      size_velocity_0, size_velocity_1, size_velocity_2,
-                                                                      stride_velocity_0, stride_velocity_1, stride_velocity_2, stride_velocity_3);
+                                                                      size_velocity_1, size_velocity_2,
+                                                                      stride_velocity_1, stride_velocity_2, stride_velocity_3);
 
 }