diff --git a/apps/showcases/PorousMediaGPU/DragEvaluation.cpp b/apps/showcases/PorousMediaGPU/DragEvaluation.cpp
index 624f466ea87ac857c10c955b54b8669c87630430..0617986e5e8f3264c25b568807eefb922bb1903f 100644
--- a/apps/showcases/PorousMediaGPU/DragEvaluation.cpp
+++ b/apps/showcases/PorousMediaGPU/DragEvaluation.cpp
@@ -39,12 +39,12 @@ DragEvaluation::DragEvaluation(std::shared_ptr< StructuredBlockForest >& blocks,
    maxLevel_ = blocks->getDepth();
    blocks->getBlocks(finestBlocks_, maxLevel_);
 
-   dragFilename_   = parameters.getParameter< std::string >("filename");
    checkFrequency_ = parameters.getParameter< uint_t >("evaluationFrequency");
    writeCounter_   = 1000 * checkFrequency_;
    dt_             = setup.dt / real_c( 1 << setup.refinementLevels );
    timesteps_      = setup.timesteps * ( 1 << setup.refinementLevels );
    totalExecutionCounter_ = 0;
+   rampUpTime_ = parameters.getParameter< uint_t >("evaluationStart") * ( 1 << setup.refinementLevels );
 
    if(writeCounter_ == uint_c(0))
       writeCounter_++;
@@ -52,10 +52,16 @@ DragEvaluation::DragEvaluation(std::shared_ptr< StructuredBlockForest >& blocks,
    const real_t radius   = wallParameters.getParameter< real_t >("hemispheresRadius");
    const real_t distance = wallParameters.getParameter< real_t >("hemispheresDistance");
 
-   filesystem::path hdfFilePath( dragFilename_.c_str() );
-   if( filesystem::exists( hdfFilePath ) )
-      std::remove( hdfFilePath.string().c_str() );
+   const auto outputFolder = setup_.outputFolder;
+   const auto fileStem = setup_.fileStem;
+   const auto fileName = outputFolder + "/" + fileStem + "Drag.h5"; 
+   filesystem::path hdfFilePath( fileName.c_str() );
 
+   WALBERLA_ROOT_SECTION()
+   {
+      if( filesystem::exists( hdfFilePath ) )
+         std::remove( hdfFilePath.string().c_str() );
+   }
 
    extendSize[0] = 0;
    selectStart[0] = 0;
@@ -111,11 +117,13 @@ DragEvaluation::DragEvaluation(std::shared_ptr< StructuredBlockForest >& blocks,
    WALBERLA_CHECK_GREATER_EQUAL(surfaceAreaWall_, real_c(0.0))
 
    meanVelocity_ = setup.latticeVelocity;
+   totalSurface_ = surfaceAreaHemiSpheres_ + surfaceAreaWall_;
 
    WALBERLA_LOG_INFO_ON_ROOT(
       "Evaluation initialised:"
          "\n   + Total surface area of Hemispheres:     " << surfaceAreaHemiSpheres_
       << "\n   + Surface area of the wall:              " << surfaceAreaWall_
+      << "\n   + Total Surface area:                    " << totalSurface_
    )
 
 }
@@ -125,7 +133,7 @@ void DragEvaluation::forceCalculation(const uint_t level)
    if(level == maxLevel_){
       ++totalExecutionCounter_;
    }
-   if (checkFrequency_ == uint_c(0) || (totalExecutionCounter_ % checkFrequency_) != uint_c(0)){
+   if (checkFrequency_ == uint_c(0) || rampUpTime_ > totalExecutionCounter_ || (totalExecutionCounter_ % checkFrequency_) != uint_c(0)){
       return;
    }
    
@@ -139,8 +147,8 @@ void DragEvaluation::forceCalculation(const uint_t level)
          const double t = double_c(totalExecutionCounter_ - 1) * dt_;
          const double meanU2 = double_c(meanVelocity_) * double_c(meanVelocity_);
 
-         const double cDRealArea = (double_c(2.0) * force_[0]) / (meanU2 * double_c(surfaceAreaHemiSpheres_));
-         const double cLRealArea = (double_c(2.0) * force_[1]) / (meanU2 * double_c(surfaceAreaHemiSpheres_));
+         const double cDRealArea = (double_c(2.0) * force_[0]) / (meanU2 * double_c(totalSurface_));
+         const double cLRealArea = (double_c(2.0) * force_[1]) / (meanU2 * double_c(totalSurface_));
 
          dragResults_[quantities_["timestep"]*writeCounter_ + executionCounter_] = float_c(t);
          dragResults_[quantities_["Fx"]*writeCounter_ + executionCounter_] = float_c(force_[0]);
@@ -150,10 +158,13 @@ void DragEvaluation::forceCalculation(const uint_t level)
          dragResults_[quantities_["CLReal"]*writeCounter_ + executionCounter_] = float_c(cLRealArea);
       }
 
+      sumFx_ += force_[0];
+
       force_[0] = double_c(0.0);
       force_[1] = double_c(0.0);
       force_[2] = double_c(0.0);
 
+      ++meanQuantitiesCounter_;
       ++executionCounter_;
       if(executionCounter_ >= writeCounter_ || totalExecutionCounter_ == timesteps_){
          writeResults();
@@ -185,4 +196,170 @@ void DragEvaluation::writeResults()
    executionCounter_ = 0;
 }
 
+Plotter::Plotter(std::shared_ptr< StructuredBlockForest >& blocks, const Setup& setup, const IDs& ids)
+   : blocks_(blocks), setup_(setup), ids_(ids)
+{
+
+   const auto outputFolder = setup_.outputFolder;
+   const auto fileStem = setup_.fileStem;
+   const auto fileName = outputFolder + "/" + fileStem + "MeanQuantities.h5"; 
+   filesystem::path hdfFilePath( fileName.c_str() );
+
+   WALBERLA_ROOT_SECTION()
+   {
+      if( filesystem::exists( hdfFilePath ) )
+         std::remove( hdfFilePath.string().c_str() );
+   }
+
+   H5::FileAccPropList fileAccProps;
+#ifdef WALBERLA_BUILD_WITH_MPI
+   //  Enable MPI-IO
+   H5Pset_fapl_mpio(fileAccProps.getId(), mpi::MPIManager::instance()->comm(), MPI_INFO_NULL);
+#endif
+   h5file_ = std::make_unique<H5::H5File>(hdfFilePath.c_str(), H5F_ACC_CREAT | H5F_ACC_RDWR, H5::FileCreatPropList::DEFAULT, fileAccProps);
+   H5::Group dataGroup = h5file_->createGroup("Data");
+}
+
+void Plotter::writeResult(const real_t frictionVelocity, const real_t welfordCounter){
+
+   auto finalDomain = blocks_->getDomain();
+
+   std::map<real_t, uint_t> centerToIntex;
+   uint_t idx = 0;
+
+   Vector3<real_t> p = finalDomain.min();
+   IBlock* block = blocks_->getBlock(p);
+   while(block){
+      const uint_t ny = blocks_->getNumberOfYCells(*block);
+      for(uint_t y = 0; y < ny; ++y){
+            const Cell localCell(cell_idx_c(0), cell_idx_c(y), cell_idx_c(0));
+            Vector3<real_t> globalCellCenter;
+            blocks_->getBlockLocalCellCenter(*block, localCell, globalCellCenter);
+            centerToIntex[globalCellCenter[1]] = idx;
+            idx++;
+      }
+      p[1] = block->getAABB().yMax() + setup_.dxF;
+      block = blocks_->getBlock(p);
+      if(block == nullptr){
+         WALBERLA_CHECK_GREATER(p[1], finalDomain.yMax(), "This method does is not yet implemented for MPI parallel simulations")
+      }
+   }
+
+   const real_t viscosity = setup_.viscosity;
+   const real_t channelHalfHeight = setup_.referenceLength;
+
+   addAttribute(viscosity, "viscosity");
+   addAttribute(channelHalfHeight, "channelHalfHeight");
+   addAttribute(frictionVelocity, "frictionVelocity");
+   addAttribute(welfordCounter, "welfordCounter");
+
+   std::vector<real_t> yCellCenter(centerToIntex.size(), real_c(0.0));
+   std::vector<real_t> yRel(centerToIntex.size(), real_c(0.0));
+   std::vector<real_t> yPlus(centerToIntex.size(), real_c(0.0));
+   std::vector<uint_t> cellCounterI(centerToIntex.size(), uint_c(0));
+   std::vector<uint_t> cellCounterE(centerToIntex.size(), uint_c(0));
+
+   std::vector<real_t> meanVelocityI(centerToIntex.size(), real_c(0.0));
+   std::vector<real_t> meanVelocityE(centerToIntex.size(), real_c(0.0));
+   std::vector<real_t> uPlusI(centerToIntex.size(), real_c(0.0));
+   std::vector<real_t> uPlusE(centerToIntex.size(), real_c(0.0));
+   std::vector<real_t> reynoldsStressVector(centerToIntex.size() * TensorField_T::F_SIZE, 0_r);
+
+   for (auto const& it : centerToIntex){
+      yCellCenter[it.second] = it.first;
+      yRel[it.second] = real_c(it.second) / channelHalfHeight;
+      yPlus[it.second] = real_c(it.second) * frictionVelocity / viscosity;
+   }
+
+   for( auto bIt = blocks_->begin(); bIt != blocks_->end(); ++bIt) {
+      auto * meanVelocityField = bIt->template getData<VelocityField_T>(ids_.meanVelocityField);
+      auto * sosField = bIt->template getData<TensorField_T>(ids_.sosField);
+      auto * flagField = bIt->template getData<FlagField_T>(ids_.flagField);
+      auto domainFlag = flagField->getFlag(setup_.fluidUID);
+      const auto ci = meanVelocityField->xyzSize();
+      for(auto cellIt = ci.begin(); cellIt != ci.end(); ++cellIt) {
+         Vector3<real_t> globalCellCenter;
+         blocks_->getBlockLocalCellCenter(*bIt, *cellIt, globalCellCenter);
+
+         if (!centerToIntex.count(globalCellCenter[1])){
+            WALBERLA_ABORT("y coordinate of center to index map was not contained")
+         }
+
+         idx = centerToIntex[globalCellCenter[1]];
+         cellCounterI[idx]++;
+
+         if(isFlagSet(flagField->get(*cellIt), domainFlag)){
+            cellCounterE[idx]++;
+            meanVelocityE[idx] += meanVelocityField->get(*cellIt, 0);
+            meanVelocityI[idx] += meanVelocityField->get(*cellIt, 0);
+
+            for(uint_t i = 0; i < TensorField_T::F_SIZE; ++i) {
+               reynoldsStressVector[i * centerToIntex.size() + idx] = sosField->get(*cellIt, i) / welfordCounter;
+            }
+         }
+      }
+   }
+
+   for(uint_t n = 0; n < centerToIntex.size(); ++n){
+      meanVelocityI[n] /= real_c(cellCounterI[n]);
+      meanVelocityE[n] /= real_c(cellCounterE[n]);
+
+      for(uint_t i = 0; i < TensorField_T::F_SIZE; ++i) {
+         reynoldsStressVector[i * centerToIntex.size() + n] /= real_c(cellCounterE[n]);
+      }
+
+      uPlusI[n] = meanVelocityI[n] / frictionVelocity;
+      uPlusE[n] = meanVelocityE[n] / frictionVelocity;
+   }
+
+   writeData(yCellCenter.data(), centerToIntex.size(), "y");
+   writeData(yRel.data(), centerToIntex.size(), "yRel");
+   writeData(yPlus.data(), centerToIntex.size(), "yPlus");
+   writeData(meanVelocityI.data(), centerToIntex.size(), "meanVelocityI");
+   writeData(meanVelocityE.data(), centerToIntex.size(), "meanVelocityE");
+   writeData(uPlusI.data(), centerToIntex.size(), "uPlusI");
+   writeData(uPlusE.data(), centerToIntex.size(), "uPlusE");
+   writeData(&reynoldsStressVector[0 * centerToIntex.size()], centerToIntex.size(), "uu_rms");
+   writeData(&reynoldsStressVector[1 * centerToIntex.size()], centerToIntex.size(), "uv_rms");
+   writeData(&reynoldsStressVector[2 * centerToIntex.size()], centerToIntex.size(), "uw_rms");
+   writeData(&reynoldsStressVector[3 * centerToIntex.size()], centerToIntex.size(), "vu_rms");
+   writeData(&reynoldsStressVector[4 * centerToIntex.size()], centerToIntex.size(), "vv_rms");
+   writeData(&reynoldsStressVector[5 * centerToIntex.size()], centerToIntex.size(), "vw_rms");
+   writeData(&reynoldsStressVector[6 * centerToIntex.size()], centerToIntex.size(), "wu_rms");
+   writeData(&reynoldsStressVector[7 * centerToIntex.size()], centerToIntex.size(), "wv_rms");
+   writeData(&reynoldsStressVector[8 * centerToIntex.size()], centerToIntex.size(), "ww_rms");
+}
+
+void Plotter::writeData(const real_t* data, const uint_t size, const std::string& dataSetName){
+   const hsize_t dsetSize[1]{ 1 };
+   const hsize_t maxSize[1]{ size };
+   const hsize_t chunkSize[1]{ size };
+
+   H5::DataSpace arrayDataSpace(1, dsetSize, maxSize);
+   H5::DSetCreatPropList props;
+   props.setChunk(1, chunkSize);
+
+   const hsize_t selectCount[1] { size };
+   const hsize_t selectStart[1] { 0 };
+
+   auto group = h5file_->openGroup("Data");
+   H5::DataSet dataset = group.createDataSet(dataSetName, io::getH5ExportType<real_t>(), arrayDataSpace, props);
+   dataset.extend( selectCount );
+
+   H5::DataSpace dataspace = dataset.getSpace();
+   dataspace.selectHyperslab(H5S_SELECT_SET, selectCount, selectStart);
+      
+   H5::DataSpace memSpace(1, selectCount);
+   dataset.write(data, io::getH5ExportType<real_t>(), memSpace, dataspace);
+}
+
+void Plotter::addAttribute(const real_t value, const std::string& attributeName){
+   hsize_t attributeShape[1]{ 1 };
+   H5::DataSpace attributeShapeDspace(1, attributeShape, attributeShape);
+   auto group = h5file_->openGroup("Data");
+   H5::Attribute attr = group.createAttribute(attributeName, io::getH5ExportType<real_t>(), attributeShapeDspace);
+   real_t valueArray[1]{value};
+   attr.write(io::getH5ExportType<real_t>(), valueArray);
+}
+
 }
\ No newline at end of file
diff --git a/apps/showcases/PorousMediaGPU/DragEvaluation.h b/apps/showcases/PorousMediaGPU/DragEvaluation.h
index 7e0a4461d685a76781d822306a057d8185e747a0..4d621325b6d1374dd4b87bc3b8e185ed5257b6c1 100644
--- a/apps/showcases/PorousMediaGPU/DragEvaluation.h
+++ b/apps/showcases/PorousMediaGPU/DragEvaluation.h
@@ -58,6 +58,13 @@ class DragEvaluation
       return [this](uint_t level) { forceCalculation(level); };
    }
 
+   double getMeanFrictionVelocity(){
+    if(meanQuantitiesCounter_ == 0){
+      return double_c(0.0);
+    }
+    return (sumFx_ / double_c(meanQuantitiesCounter_)) / totalSurface_;
+   }
+
  protected:
    bool initialized_{false};
 
@@ -72,11 +79,13 @@ class DragEvaluation
    real_t dt_;
    uint_t timesteps_;
 
-   std::string dragFilename_;
    uint_t checkFrequency_;
    uint_t writeCounter_;
    uint_t executionCounter_{0};
    uint_t totalExecutionCounter_{0};
+   uint_t rampUpTime_{100000};
+
+   uint_t meanQuantitiesCounter_{0};
 
    std::unique_ptr<H5::H5File> h5file_;
    hsize_t extendSize[1];
@@ -86,10 +95,41 @@ class DragEvaluation
 
    double surfaceAreaHemiSpheres_;
    double surfaceAreaWall_;
+   double totalSurface_;
    double meanVelocity_;
 
    Vector3< double > force_;
 
+   double sumFx_{0.0};
+
 }; // class DragEvaluation
 
+class Plotter
+{
+ public:
+   Plotter(std::shared_ptr< StructuredBlockForest >& blocks, const Setup& setup, const IDs& ids);
+
+   // void forceCalculation(const uint_t level); // for calculating the force
+   // void writeResults();
+
+   void writeResult(const real_t frictionVelocity, const real_t welfordCounter);
+   void writeData(const real_t* data, const uint_t size, const std::string& dataSetName);
+   void addAttribute(const real_t value, const std::string& attributeName);
+
+ protected:
+   bool initialized_{false};
+
+   std::shared_ptr< StructuredBlockForest > blocks_;
+   Setup setup_;
+   IDs ids_;
+
+   std::unique_ptr<H5::H5File> h5file_;
+
+  uint_t executionCounter_{0};
+  uint_t totalExecutionCounter_{0};
+
+  uint_t rampUpTime_{100000};
+  uint_t checkFrequency_{10000};
+}; // class Plotter
+
 }
\ No newline at end of file
diff --git a/apps/showcases/PorousMediaGPU/InitializerFunctions.cpp b/apps/showcases/PorousMediaGPU/InitializerFunctions.cpp
index b0c7c191dd9d82d587fb23de243697da0c8e60ab..fcae3b718a1a6e99b98bc5243bf1e88b79a97009 100644
--- a/apps/showcases/PorousMediaGPU/InitializerFunctions.cpp
+++ b/apps/showcases/PorousMediaGPU/InitializerFunctions.cpp
@@ -101,15 +101,16 @@ void setVelocityFieldsAsmuth(const shared_ptr< StructuredBlockStorage >& blocks,
    const real_t kappa = real_c(0.41);
 
    auto domainSize = blocks->getDomain().max();
+   const real_t delta = domainSize[1] / real_c(2.0);
+
    domainSize /= setup.dxC;
 
    const real_t frictionVelocity = setup.frictionVelocity;
+   const real_t dx = setup.dxC;
+   const real_t viscosity = setup.viscosity;
 
    for( auto block = blocks->begin(); block != blocks->end(); ++block ) {
       const uint_t level = blocks->getLevel(*block.get());
-      const real_t levelScaleFactor = real_c(uint_c(1) << level);
-      const real_t dx = setup.dxC / (real_c(1.0) / levelScaleFactor);
-
       auto * velocityField = block->template getData<VelocityField_T>(ids.velocityField);
 
       const auto ci = velocityField->xyzSizeWithGhostLayer();
@@ -118,29 +119,49 @@ void setVelocityFieldsAsmuth(const shared_ptr< StructuredBlockStorage >& blocks,
          Cell globalCell(*cellIt);
          blocks->transformBlockLocalToGlobalCell(globalCell, *block);
          Vector3<real_t> cellCenter;
-         blocks->getCellCenter(cellCenter, globalCell);
+         blocks->getCellCenter(cellCenter, globalCell, level);
 
+         const auto y = cellCenter[1];
          const auto rel_x = cellCenter[0] / domainSize[0];
-         const auto rel_y = cellCenter[1] / domainSize[1];
+         // const auto rel_y = cellCenter[1] / domainSize[1];
          const auto rel_z = cellCenter[2] / domainSize[2];
 
+         const real_t pos = std::max(delta - std::abs(y - delta - 0_r), 0.05_r);
+         const auto rel_y = pos / delta;
+
+         auto initialVel = frictionVelocity * (std::log(frictionVelocity * pos / viscosity) / kappa + B);
+
+         Vector3<real_t> vel;
+         vel[0] = initialVel;
+
+         vel[2] = 2_r * frictionVelocity / kappa * std::sin(math::pi * 16_r/dx * rel_x) *
+                        std::sin(math::pi * 8_r  * rel_y) / (std::pow(rel_y, 2_r) + 1_r);
+
+         vel[1] = 8_r * frictionVelocity / kappa *
+                           (std::sin(math::pi * 8_r/dx * rel_z) * std::sin(math::pi * 8_r * rel_y) +
+                           std::sin(math::pi * 8_r/dx * rel_x)) / (std::pow(0.5_r * delta - pos, 2_r) + 1_r);
+
+
+/*
+         const real_t pos = std::max(delta - std::abs(y - delta), real_c(2.0));
          const real_t t1 = std::sin(math::pi * 8.0 / dx * rel_z) * std::sin(math::pi * 8.0 / dx * rel_y) + std::sin(math::pi * 8.0 / dx * rel_x);
-         const real_t t2 = real_c(1.0); // (np.power(0.5 * delta - pos, 2.0) + 1.0);
+         const real_t t2 = std::pow(1.0_r * delta - pos, 2_r) + 1_r;
          const real_t t3 = std::sin(math::pi * 16.0_r / dx * rel_x) * std::sin(math::pi * 8.0_r / dx * rel_y);
 
          Vector3<real_t> vel;
-
-         vel[0] = setup.latticeVelocity;
-         vel[1] = 5.0_r * frictionVelocity / kappa * t1 / t2;
+         
+         const real_t intVelX = frictionVelocity * (std::log(frictionVelocity * pos / viscosity) / kappa + B);
+         vel[0] = setup.latticeVelocity; // intVelX > setup.latticeVelocity ? setup.latticeVelocity : intVelX;
+         vel[1] = 1.0_r * frictionVelocity / kappa * t1 / t2;
          vel[2] = 2.0_r * frictionVelocity / kappa * t3 / (std::pow(rel_y, 2.0_r) + 1.0_r);
-
+         
+*/
          velocityField->get(*cellIt, 0) = vel[0];
          velocityField->get(*cellIt, 1) = vel[1];
          velocityField->get(*cellIt, 2) = vel[2];
 
       }
    }
-
 } // function setVelocityFieldsHenrik
 
 } // namespace walberla
diff --git a/apps/showcases/PorousMediaGPU/InitializerFunctions.h b/apps/showcases/PorousMediaGPU/InitializerFunctions.h
index cb69db2eb743ff15e024424b54c5fb68121203c4..de76286fc600dc33fdb7c997783ebbeedb5629cb 100644
--- a/apps/showcases/PorousMediaGPU/InitializerFunctions.h
+++ b/apps/showcases/PorousMediaGPU/InitializerFunctions.h
@@ -36,6 +36,7 @@
 namespace walberla{
 void initGrid(const shared_ptr< StructuredBlockStorage >& blocks, const BlockDataID flagFieldID, const Setup& setup);
 void initHemisphers(const shared_ptr< StructuredBlockStorage >& blocks, const BlockDataID flagFieldID, const Setup& setup, const Config::BlockHandle& wallParameters);
+void initGenericPorousMatrix(const shared_ptr< StructuredBlockStorage >& blocks, const BlockDataID flagFieldID, const Setup& setup, const Config::BlockHandle& gpmParameters);
 bool hemispheresContains(const Vector3<real_t>& point, const Vector3<real_t>& center, const real_t radiusSquared);
 void setVelocityFieldsAsmuth(const shared_ptr< StructuredBlockStorage >& blocks, const Setup& setup, const IDs& ids);
 } // namespace walberla
diff --git a/apps/showcases/PorousMediaGPU/JinCase.prm b/apps/showcases/PorousMediaGPU/JinCase.prm
new file mode 100644
index 0000000000000000000000000000000000000000..8215f12bf9e8fe4ed022bebce95ceab08aac95db
--- /dev/null
+++ b/apps/showcases/PorousMediaGPU/JinCase.prm
@@ -0,0 +1,108 @@
+Parameters
+{
+    reynoldsNumber 400;
+    bulkReynoldsNumber 4466;
+    latticeVelocity 0.05;
+    initialiseWithlatticeVelocity true;
+    initialiseWithTurbulence true;
+    forceUpdateFrequency 10;
+
+    welfordSamplingStart 200000;
+    welfordSamplingFrequency 1;
+
+    timesteps 10;
+
+    processMemoryLimit 512; // MiB
+    innerOuterSplit <1, 1, 1>;
+
+    // GPU related Parameters, only used if GPU is enabled
+    gpuEnabledMPI false;
+}
+
+Turbulence
+{
+    useGrid false;
+    turbulentInflow false;
+    U_rms  0.01;
+    characteristicLength 0.4;
+    numberOfFourrierModes 128;
+    U_c < 2.0, 0.0, 0.0 >;
+}
+
+RoughWall
+{
+    hemispheresRadius 1;
+    hemispheresDistance 4;
+}
+
+DomainSetup
+{
+    domainSize    < 48.4, 40, 12 >;
+    cellsPerBlock < 121, 10, 30 >;
+    blocks    < 1, 10, 1 >;
+
+    // domainSize    < 48.4, 40, 12 >;
+    // cellsPerBlock < 242, 20, 60 >;
+    // blocks    < 1, 10, 1 >;
+
+    periodic    < true, false, true >;
+    refinementLevels 0;
+    numberProcesses 1; // This is for load balancing, overwritten if more than one proc is used
+    blockForestFilestem porousMediaBlockForest;
+}
+//! [domainSetup]
+
+Boundaries
+{
+    Border { direction N;    walldistance -1;  flag Obstacle; }
+    Border { direction S;    walldistance -1;  flag Obstacle; }
+}
+
+
+StabilityChecker
+{
+    checkFrequency 0;
+    streamOutput   false;
+    vtkOutput      true;
+}
+
+VTKWriter
+{
+    vtkWriteFrequency 2000;
+    vtkGhostLayers 0;
+    velocity true;
+    writeVelocityAsMagnitude false;
+    density false;
+    meanVelocity true;
+    flag false;
+    writeOnlySlice true;
+    sliceThickness 1;
+    writeXZSlice false;
+    amrFileFormat false;
+    oneFilePerProcess false;
+    samplingResolution -1;
+    initialWriteCallsToSkip 0;
+}
+
+Logging
+{
+    logLevel info;  // info progress detail tracing
+    writeSetupForestAndReturn false; // When only one process is used the decomposition is writen and the program terminates
+    readSetupFromFile false;
+    remainingTimeLoggerFrequency 20; // in seconds
+    writeSurfaceMeshOfTPMS false;
+    outputFolder output;
+    filestem Re400;
+}
+
+Probes
+{
+    evaluationFrequency 1;
+    writeCounter 1000; // after how many evaluations the results are written to file
+}
+
+DragEvaluation
+{
+    evaluationStart 200000;
+    evaluationFrequency 10;
+}
\ No newline at end of file
diff --git a/apps/showcases/PorousMediaGPU/PorousMediaGPU.cpp b/apps/showcases/PorousMediaGPU/PorousMediaGPU.cpp
index 387d10542b73a1b0b5e0abe6abd2e208e64bd48e..35af02ce8f28bb5727b7d9a61b89f4360855e181 100644
--- a/apps/showcases/PorousMediaGPU/PorousMediaGPU.cpp
+++ b/apps/showcases/PorousMediaGPU/PorousMediaGPU.cpp
@@ -92,16 +92,12 @@
 #include "PorousMediaGPUStaticDefines.h"
 
 using namespace walberla;
-using macroFieldType = VelocityField_T::value_type;
 
 #if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
-using GPUPdfField_T = lbm_generated::GPUPdfField< StorageSpecification_T >;
 using gpu::communication::NonUniformGPUScheme;
 using lbm_generated::NonuniformGeneratedGPUPdfPackInfo;
 #else
-using PdfField_T = lbm_generated::PdfField< StorageSpecification_T >;
 using blockforest::communication::UniformBufferedScheme;
-
 using lbm_generated::UniformGeneratedPdfPackInfo;
 #endif
 
@@ -121,15 +117,17 @@ class HemisphereWallDistance
       const real_t yMin = domain.yMin();
       const real_t yMax = domain.yMax();
 
+      // WALBERLA_CHECK(!hemispheresContains(fluid, center, radius_), "fluid cell is in contained in Hemisphere (" << fluid[0] << ", " << fluid[1] << ", " << fluid[2] << "). The block local cell is " << fluidCell)
+      // WALBERLA_CHECK(hemispheresContains(boundary, center, radius_), "boundary cell is not in contained in Hemisphere (" << boundary[0] << ", " << boundary[1] << ", " << boundary[2] << "). The block local cell is " << boundaryCell)
+
       Vector3< real_t > boundary = SbF->getBlockLocalCellCenter( block, boundaryCell );
       Vector3< real_t > fluid    = SbF->getBlockLocalCellCenter( block, fluidCell );
 
       auto center = getCenterOfHemisphere(boundary, hemispheresDistance_, yMin, yMax);
 
-      hemispheresContains(boundary, center, radius_);
-
-      WALBERLA_CHECK(!hemispheresContains(fluid, center, radius_), "fluid cell is in contained in Hemisphere (" << fluid[0] << ", " << fluid[1] << ", " << fluid[2] << "). The block local cell is " << fluidCell)
-      WALBERLA_CHECK(hemispheresContains(boundary, center, radius_), "boundary cell is not in contained in Hemisphere (" << boundary[0] << ", " << boundary[1] << ", " << boundary[2] << "). The block local cell is " << boundaryCell)
+      if(hemispheresContains(fluid, center, radius_) || !hemispheresContains(boundary, center, radius_)){
+         return real_c(0.5);
+      }
 
       return delta(fluid, boundary, center);
    }
@@ -190,6 +188,31 @@ void refineWalls(SetupBlockForest& forest, const real_t hemispheresRadius, const
    }
 }
 
+real_t calculateBulkVelocity(const shared_ptr< StructuredBlockStorage >& blocks, const IDs& ids, const FlagUID domainFlagUID) {
+   real_t bulkVelocity = real_c(0.0);
+   real_t numCells = real_c(0);
+
+   for( auto block = blocks->begin(); block != blocks->end(); ++block) {
+      const uint_t level = blocks->getLevel(*block);
+      const real_t scaleFactor = real_c(1.0) / real_c( 1 << level );
+      const real_t cellVolume = scaleFactor * scaleFactor * scaleFactor;
+      auto * velocityField = block->template getData<VelocityField_T>(ids.velocityField);
+      auto * flagField = block->template getData<FlagField_T>(ids.flagField);
+      auto domainFlag = flagField->getFlag(domainFlagUID);
+      const auto ci = velocityField->xyzSize();
+      for(auto cellIt = ci.begin(); cellIt != ci.end(); ++cellIt) {
+         if(isFlagSet(flagField->get(*cellIt), domainFlag)){
+            bulkVelocity += (velocityField->get(*cellIt, 0) * cellVolume);
+            numCells += cellVolume;
+         }
+      }
+   }
+
+   mpi::allReduceInplace< real_t >(bulkVelocity, mpi::SUM);
+   bulkVelocity /= real_c(numCells);
+   return bulkVelocity;
+}
+
 
 
 int main(int argc, char** argv){
@@ -223,6 +246,7 @@ int main(int argc, char** argv){
                               "\n   + omega:               " << setup.omegaF << " (on the finest grid)" <<
                               "\n   + inflow velocity:     " << setup.referenceVelocity << " [m/s]" <<
                               "\n   + lattice velocity:    " << setup.latticeVelocity << " [dx/dt]" <<
+                              "\n   + friction velocity:   " << setup.frictionVelocity << " [dx/dt]" <<
                               "\n   + Reynolds number:     " << setup.reynoldsNumber <<
                               "\n   + Mach number:         " << setup.machNumber <<
                               "\n   + driving force:       " << setup.drivingForce <<
@@ -231,7 +255,8 @@ int main(int argc, char** argv){
                               "\n   + dt (coarsest grid):  " << setup.dt << " [s]" <<
                               "\n   + conversion Faktor:   " << setup.conversionFactor <<
                               "\n   + #time steps:         " << setup.timesteps << " (on the coarsest grid, " << ( real_c(1.0) / setup.dt ) << " for 1s of real time)" <<
-                              "\n   + simulation time:     " << ( real_c( setup.timesteps ) * setup.dt ) << " [s]" )
+                              "\n   + simulation time:     " << ( real_c( setup.timesteps ) * setup.dt ) << " [s]"
+                              "\n   + flow through cycles: " << ( setup.flowThroughCycles ))
 
    bool writeSetupForestAndReturn = loggingParameters.getParameter< bool >("writeSetupForestAndReturn", false);
    if (uint_c(MPIManager::instance()->numProcesses()) > 1) writeSetupForestAndReturn = false;
@@ -261,22 +286,24 @@ int main(int argc, char** argv){
    auto allocator = make_shared< gpu::HostFieldAllocator<macroFieldType> >(); // use pinned memory allocator for faster CPU-GPU memory transfers
    ids.velocityField = field::addToStorage< VelocityField_T >(blocks, "vel", macroFieldType(0.0), field::fzyx, setup.numGhostLayers, allocator);
    ids.meanVelocityField = field::addToStorage< VelocityField_T >(blocks, "mean velocity", macroFieldType(0.0), field::fzyx, setup.numGhostLayers, allocator);
+   ids.sosField = field::addToStorage< TensorField_T >(blocks, "sos velocity", macroFieldType(0.0), field::fzyx, setup.numGhostLayers, allocator);
    ids.densityField  = field::addToStorage< ScalarField_T >(blocks, "density", macroFieldType(1.0), field::fzyx, setup.numGhostLayers, allocator);
    ids.flagField     = field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field", setup.numGhostLayers);
 
    ids.pdfFieldGPU      = lbm_generated::addGPUPdfFieldToStorage< PdfField_T >(blocks, ids.pdfField, StorageSpec, "pdfs on GPU", true);
    ids.velocityFieldGPU = gpu::addGPUFieldToStorage< VelocityField_T >(blocks, ids.velocityField, "velocity on GPU", true);
    ids.meanVelocityFieldGPU = gpu::addGPUFieldToStorage< VelocityField_T >(blocks, ids.meanVelocityField, "mean velocity on GPU", true);
+   ids.sosFieldGPU = gpu::addGPUFieldToStorage< TensorField_T >(blocks, ids.sosField, "sos field on GPU", true);
    ids.densityFieldGPU  = gpu::addGPUFieldToStorage< ScalarField_T >(blocks, ids.densityField, "velocity on GPU", true);
+   ids.flagFieldGPU     = gpu::addGPUFieldToStorage< FlagField_T >(blocks, ids.flagField, "flag on GPU", true);
 
    std::function< Vector3< uint_t > (const shared_ptr< StructuredBlockStorage >&, IBlock * const) > getInflowSliceSize = inflowSliceSize;
    ids.velocityFieldInflowSlice    = field::addToStorage< VelocityField_T >(blocks, "velocity Inflow", getInflowSliceSize, macroFieldType(0.0), field::fzyx, setup.numGhostLayers);
    ids.velocityFieldInflowSliceGPU = gpu::addGPUFieldToStorage< VelocityField_T >(blocks, ids.velocityFieldInflowSlice, "velocity Inflow on GPU", true);
 
    const Cell innerOuterSplit = Cell(parameters.getParameter< Vector3<cell_idx_t> >("innerOuterSplit", Vector3<cell_idx_t>(1, 1, 1)));
-   SweepCollection_T sweepCollection(blocks, ids.pdfFieldGPU, ids.densityFieldGPU, ids.velocityFieldGPU, setup.drivingForce, setup.omega, innerOuterSplit);
-   WelfordSweep_T welfordSweep(ids.meanVelocityFieldGPU, ids.velocityFieldGPU, real_c(0.0));
-
+   SweepCollection_T sweepCollection(blocks, ids.densityFieldGPU, ids.pdfFieldGPU, ids.velocityFieldGPU, setup.drivingForce, setup.omega, innerOuterSplit);
+   WelfordSweep_T welfordSweep(ids.meanVelocityFieldGPU, ids.sosFieldGPU, ids.velocityFieldGPU, real_c(0.0));
    WALBERLA_LOG_INFO_ON_ROOT(++step << ". Initialising data")
 
    if(setup.initialiseWithlatticeVelocity){
@@ -288,13 +315,13 @@ int main(int argc, char** argv){
       }
    }
 
-   setVelocityFieldsAsmuth(blocks, setup, ids);
+   if(parameters.getParameter< bool >("initialiseWithTurbulence", false)){
+      setVelocityFieldsAsmuth(blocks, setup, ids);
+   }
 
    gpu::fieldCpy< gpu::GPUField< VelocityField_T::value_type >, VelocityField_T >(blocks, ids.velocityFieldGPU, ids.velocityField);
-
    for (auto& block : *blocks){
       sweepCollection.initialise(&block, cell_idx_c(setup.numGhostLayers - 1));
-
       auto velocityInflow = block.getData< VelocityField_T >(ids.velocityFieldInflowSlice);
       WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(velocityInflow,
          velocityInflow->get(x, y, z, 0) = setup.latticeVelocity;
@@ -305,6 +332,8 @@ int main(int argc, char** argv){
 
    auto inflow = std::make_shared<Inflow>(blocks, setup, ids);
    auto probe = std::make_shared<Probe>(blocks, setup, ids, config->getOneBlock("Probes"));
+   auto plotter = std::make_shared<Plotter>(blocks, setup, ids);
+
 
 /*
    const real_t lenght = 8.0;
@@ -316,9 +345,16 @@ int main(int argc, char** argv){
    }
 */ 
 
+   const uint_t nx = blocks->getXSize() * blocks->getNumberOfXCells();
+   for(uint_t i = 0; i < nx; ++i){
+      probe->addPoint(Vector3<real_t>(real_c(i) * setup.dxC, finalDomain.center()[1], finalDomain.center()[2] ));
+   }
+
+   probe->addPoint(finalDomain.center());
    probe->initialise();
 
 
+
    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    ///                                      LB SWEEPS AND BOUNDARY HANDLING                                       ///
    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -373,6 +409,9 @@ int main(int argc, char** argv){
       }
    }
 
+   gpu::fieldCpy< GPUFlagField_T, FlagField_T >(blocks, ids.flagFieldGPU, ids.flagField);
+   auto bvc = std::make_shared<BulkVelocityCalculator>(blocks, ids, setup);
+
    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    ///                                           COMMUNICATION SCHEME                                             ///
    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -405,65 +444,41 @@ int main(int argc, char** argv){
       LBMMeshRefinement.addPostBoundaryHandlingBlockFunction(evaluation->forceCalculationFunctor());
    }
 
-
-   const AABB sliceXY(finalDomain.xMin(), finalDomain.yMin(), finalDomain.center()[2] - setup.dxF,
-                      finalDomain.xMax(), finalDomain.yMax(), finalDomain.center()[2] + setup.dxF);
-
-/*                      
-   CellInterval globalCellInterval;
-   blocks->getCellBBFromAABB( globalCellInterval, sliceXY, uint_c(0) );
-
-   std::map<IBlock*, CellInterval> sliceCellIntervals;
-
-   for (auto& block : *blocks){
-      CellInterval cellBB;
-      blocks->getCellBBFromAABB( cellBB, sliceXY, blocks->getLevel(block) );
-      blocks->transformGlobalToBlockLocalCellInterval( cellBB, block );
-
-      Cell maxCell(cell_idx_c(0), cell_idx_c(0), cell_idx_c(0));
-      Cell minCell(cell_idx_c( blocks->getNumberOfXCells( block ) ), cell_idx_c( blocks->getNumberOfYCells( block ) ), cell_idx_c( blocks->getNumberOfZCells( block ) ));
-
-      for( cell_idx_t z = cell_idx_c(0); z != cell_idx_c( blocks->getNumberOfZCells( block ) ); ++z ){
-         for( cell_idx_t y = cell_idx_c(0); y != cell_idx_c( blocks->getNumberOfYCells( block ) ); ++y ){
-            for( cell_idx_t x = cell_idx_c(0); x != cell_idx_c( blocks->getNumberOfXCells( block ) ); ++x ){
-               if( cellBB.contains(x,y,z) ){
-                  if(Cell(x, y, z) < minCell){
-                     minCell = Cell(x, y, z);
-                  }
-                  if(!(Cell(x, y, z) < maxCell)){
-                     maxCell = Cell(x, y, z);
-                  }
-               }
-            }
+   const uint_t forceUpdateFrequency = parameters.getParameter< uint_t >("forceUpdateFrequency", 0);
+   if(forceUpdateFrequency > 0){
+      auto updateForce = [&](){
+      if(timeloop.getCurrentTimeStep() % forceUpdateFrequency  == 0) {
+            const real_t correctedDrivingForce = bvc->getDrivingForce();
+            sweepCollection.setFx(correctedDrivingForce);
          }
-      }
-      CellInterval localCellInterval(minCell, maxCell);
-      if(!localCellInterval.empty()){
-         sliceCellIntervals[&block] = localCellInterval;
-      }
+      };
+      timeloop.addFuncBeforeTimeStep(updateForce, "Update force");
    }
 
-   timeloop.add() << BeforeFunction([&]()
-   {
-      for (auto const& sliceCellInterval : sliceCellIntervals){
-         sweepCollection.calculateMacroscopicParametersCellInterval(sliceCellInterval.first, sliceCellInterval.second);
-      }
-
-      const uint_t velCtr = uint_c(welfordSweep.getCounter());
-      welfordSweep.setCounter(welfordSweep.getCounter() + real_c(1.0));
-   }) << Sweep(welfordSweep.getSweepOnCellInterval(blocks, globalCellInterval), "welford sweep");
-
-*/
+   const uint_t welfordSamplingStart = parameters.getParameter< uint_t >("welfordSamplingStart", 0);
+   const uint_t welfordSamplingFrequency = parameters.getParameter< uint_t >("welfordSamplingFrequency", 0);
 
-   /*SweepTimeloop timeloop(blocks->getBlockStorage(), setup.timesteps);
-   timeloop.add() << BeforeFunction(communication.getCommunicateFunctor(), "communication")
-                  << Sweep(boundaryCollection.getSweep(BoundaryCollection_T::ALL, defaultStream), "Boundary Conditions");
-   timeloop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL, defaultStream), "LBM StreamCollide");*/
+   if(welfordSamplingFrequency > 0){
+      auto welfordUpdate = [&](){
+         if(timeloop.getCurrentTimeStep() > welfordSamplingStart  && (timeloop.getCurrentTimeStep() % welfordSamplingFrequency) == 0) {
+            const uint_t velCtr = uint_c(welfordSweep.getCounter());
+            welfordSweep.setCounter(welfordSweep.getCounter() + real_c(1.0));
+            for (auto& block : *blocks){
+               welfordSweep.run(&block);
+            }
+         };
+      };
+      timeloop.addFuncBeforeTimeStep(welfordUpdate, "Welford Sampling");
+   }
 
    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    ///                                             VTK Output                                                     ///
    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
+   auto copyVelocity     = gpu::fieldCpyFunctor< VelocityField_T, gpu::GPUField< VelocityField_T::value_type > >(ids.velocityField, ids.velocityFieldGPU);
+   auto copyDensity      = gpu::fieldCpyFunctor< ScalarField_T, gpu::GPUField< ScalarField_T::value_type > >(ids.densityField, ids.densityFieldGPU);
+   auto copyMeanVelocity = gpu::fieldCpyFunctor< VelocityField_T, gpu::GPUField< VelocityField_T::value_type > >(ids.meanVelocityField, ids.meanVelocityFieldGPU);
+
    // VTK
    auto VTKWriter = config->getOneBlock("VTKWriter");
    const uint_t vtkWriteFrequency       = VTKWriter.getParameter< uint_t >("vtkWriteFrequency", 0);
@@ -475,28 +490,30 @@ int main(int argc, char** argv){
    const real_t sliceThickness          = VTKWriter.getParameter< real_t >("sliceThickness", real_c(1.0));
 
    if (vtkWriteFrequency > 0){
-      const std::string vtkFolder = "VTKPorousMediaRE_" + std::to_string(uint64_c(setup.reynoldsNumber));
+      const std::string identifier = setup.fileStem;
       auto vtkOutput =
-         vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, vtkGhostLayers, false, vtkFolder,
+         vtk::createVTKOutput_BlockData(*blocks, identifier, vtkWriteFrequency, vtkGhostLayers, false, setup.outputFolder,
                                         "simulation_step", false, true, true, false, 0, amrFileFormat, oneFilePerProcess);
 
       vtkOutput->setInitialWriteCallsToSkip(initialWriteCallsToSkip);
-
       vtkOutput->addBeforeFunction([&]() {
+
          for (auto& block : *blocks){
-            sweepCollection.calculateMacroscopicParameters(&block);
+            if (VTKWriter.getParameter< bool >("velocity")){copyVelocity(&block);}
+            if (VTKWriter.getParameter< bool >("density")){copyDensity(&block);}
+            if (VTKWriter.getParameter< bool >("meanVelocity")){copyMeanVelocity(&block);}
          }
 
+         auto drivingForeceGPU = bvc->getDrivingForce();
+         auto initialDrivingForce = setup.drivingForce;
 
-         // for (auto const& sliceCellInterval : sliceCellIntervals){
-         //    sweepCollection.calculateMacroscopicParametersCellInterval(sliceCellInterval.first, sliceCellInterval.second);
-         // }
+         auto bulkVelocityCPU = calculateBulkVelocity(blocks, ids, setup.fluidUID);
+         auto blukVelocityGPU = bvc->bulkVelocity();
+         WALBERLA_LOG_DEVEL_VAR(bulkVelocityCPU);
+         WALBERLA_LOG_DEVEL_VAR(blukVelocityGPU);
+         WALBERLA_LOG_DEVEL_VAR(blukVelocityGPU);
+         WALBERLA_LOG_DEVEL_VAR(drivingForeceGPU);
 
-#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
-         gpu::fieldCpy< VelocityField_T, gpu::GPUField< VelocityField_T::value_type > >(blocks, ids.velocityField, ids.velocityFieldGPU);
-         // gpu::fieldCpy< VelocityField_T, gpu::GPUField< VelocityField_T::value_type > >(blocks, ids.meanVelocityField, ids.meanVelocityFieldGPU);
-         // gpu::fieldCpy< ScalarField_T, gpu::GPUField< ScalarField_T::value_type > >(blocks, ids.densityField, ids.densityFieldGPU);
-#endif
       });
 
       vtkOutput->setSamplingResolution(samplingResolution);
@@ -507,8 +524,8 @@ int main(int argc, char** argv){
 
 
       if (VTKWriter.getParameter< bool >("writeOnlySlice", true)){
-         // const AABB sliceXY(finalDomain.xMin(), finalDomain.yMin(), finalDomain.center()[2] - sliceThickness * setup.dxF,
-         //                    finalDomain.xMax(), finalDomain.yMax(), finalDomain.center()[2] + sliceThickness * setup.dxF);
+         const AABB sliceXY(finalDomain.xMin(), finalDomain.yMin(), finalDomain.center()[2] - sliceThickness * setup.dxF,
+                            finalDomain.xMax(), finalDomain.yMax(), finalDomain.center()[2] + sliceThickness * setup.dxF);
          vtkOutput->addCellInclusionFilter(vtk::AABBCellFilter(sliceXY));
 
          if (VTKWriter.getParameter< bool >("writeXZSlice", false)){
@@ -589,6 +606,15 @@ int main(int argc, char** argv){
 #endif
    WALBERLA_MPI_BARRIER()
    simTimer.end();
+
+   gpu::fieldCpy< TensorField_T, gpu::GPUField< VelocityField_T::value_type > >(blocks, ids.sosField, ids.sosFieldGPU);
+
+   const double ut = evaluation->getMeanFrictionVelocity();
+   const double welfordCounter = welfordSweep.getCounter();
+   WALBERLA_LOG_DEVEL_VAR(ut)
+   WALBERLA_LOG_DEVEL_VAR(welfordCounter)
+   plotter->writeResult(ut, welfordCounter);
+
    WALBERLA_LOG_INFO_ON_ROOT("Simulation finished")
    double time = double_c(simTimer.max());
    WALBERLA_MPI_SECTION() { walberla::mpi::reduceInplace(time, walberla::mpi::MAX); }
diff --git a/apps/showcases/PorousMediaGPU/PorousMediaGPU.py b/apps/showcases/PorousMediaGPU/PorousMediaGPU.py
index 009f0742b89acbc3163115db810aa1bd2763c7e9..fbd9e7e4f1f3832d8cdffba57b7ecee7b4871100 100644
--- a/apps/showcases/PorousMediaGPU/PorousMediaGPU.py
+++ b/apps/showcases/PorousMediaGPU/PorousMediaGPU.py
@@ -7,7 +7,7 @@ from pystencils.simp import insert_aliases, insert_constants
 from lbmpy import LBStencil, LBMConfig, LBMOptimisation
 from lbmpy.boundaries.boundaryconditions import NoSlip, NoSlipLinearBouzidi, ExtrapolationOutflow, UBB, FixedDensity
 from lbmpy.creationfunctions import create_lb_collision_rule
-from lbmpy.enums import Method, Stencil, ForceModel
+from lbmpy.enums import Method, Stencil, ForceModel, SubgridScaleModel
 from lbmpy.flow_statistics import welford_assignments
 
 from pystencils_walberla import CodeGeneration, generate_info_header, generate_sweep
@@ -39,15 +39,16 @@ with CodeGeneration() as ctx:
     dtype = 'float64' if ctx.double_accuracy else 'float32'
     pdf_dtype = 'float64'
 
-    stencil = LBStencil(Stencil.D3Q27)
+    stencil = LBStencil(Stencil.D3Q19)
     q = stencil.Q
     dim = stencil.D
-
+ 
     streaming_pattern = 'esopull'
 
     pdfs = fields(f"pdfs({stencil.Q}): {pdf_dtype}[3D]", layout='fzyx')
     velocity_field, density_field = fields(f"velocity({dim}), density(1) : {dtype}[{dim}D]", layout='fzyx')
     mean_velocity = fields(f"mean_velocity({dim}): {dtype}[{stencil.D}D]", layout='fzyx')
+    sum_of_squares_field = fields(f"sum_of_squares_field({stencil.D**2}): {dtype}[{stencil.D}D]", layout='fzyx')
 
     macroscopic_fields = {'density': density_field, 'velocity': velocity_field}
 
@@ -59,9 +60,10 @@ with CodeGeneration() as ctx:
         relaxation_rate=omega,
         compressible=True,
         force=force_vector,
-        # subgrid_scale_model=SubgridScaleModel.QR,
-        fourth_order_correction=0.01 if method_enum == Method.CUMULANT and stencil.Q == 27 else False,
-        streaming_pattern=streaming_pattern
+        force_model=ForceModel.GUO,
+        # fourth_order_correction=0.1 if method_enum == Method.CUMULANT and stencil.Q == 27 else False,
+        streaming_pattern=streaming_pattern,
+        output={'density': density_field, 'velocity': velocity_field}
     )
 
     lbm_opt = LBMOptimisation(cse_global=False, cse_pdfs=False, field_layout="fzyx")
@@ -110,11 +112,12 @@ with CodeGeneration() as ctx:
                          target=target, data_type=dtype, pdfs_data_type=pdf_dtype,
                          max_threads=max_threads)
 
-    welford_update = welford_assignments(field=velocity_field, mean_field=mean_velocity)
+    welford_update = welford_assignments(field=velocity_field, mean_field=mean_velocity, sum_of_squares_field=sum_of_squares_field)
     generate_sweep(ctx, "Welford", welford_update, target=target)
 
     field_typedefs = {'VelocityField_T': velocity_field,
-                      'ScalarField_T': density_field}
+                      'ScalarField_T': density_field,
+                      'TensorField_T': sum_of_squares_field}
 
     # Info header containing correct template definitions for stencil and field
     generate_info_header(ctx, f'{name_prefix}InfoHeader',
diff --git a/apps/showcases/PorousMediaGPU/Probe.cpp b/apps/showcases/PorousMediaGPU/Probe.cpp
index 160297ac6dd76b180e7f79b9737127ce1155ae7a..37323d336e2e5ff1a8be124faa5fc3c3892f386c 100644
--- a/apps/showcases/PorousMediaGPU/Probe.cpp
+++ b/apps/showcases/PorousMediaGPU/Probe.cpp
@@ -23,7 +23,6 @@
 namespace walberla{
 
 void Probe::initialise(){
-
     if(probePoints_.empty()){return;}
 
     const AABB domain = blocks_->getDomain();
@@ -35,22 +34,26 @@ void Probe::initialise(){
             const Cell localCell = blocks_->getBlockLocalCell(*block, p);
             Vector3<real_t> localCellCenter;
             blocks_->getCellCenter(localCellCenter, localCell);
-
             pointsPerBlock_[block].emplace_back(localCellCenter);
         }
     }
 
-    for (auto& block : *blocks_){
-        auto * dataVector = block.getData< DataVector > ( probeDataID );
+    uint_t totalNumberOfPoints = 0;
+    for (auto const& it : pointsPerBlock_){
+        IBlock* block = it.first;
+        std::vector<Vector3<real_t>> points = it.second;
+
+        auto * dataVector = block->getData< DataVector > ( probeDataID );
         auto & cpuCellVector = dataVector->cpuCellVector();
         auto & cpuDataVector = dataVector->cpuDataVector();
 
-        const uint_t n = uint_c(pointsPerBlock_[&block].size());
+        const uint_t n = points.size();
+        totalNumberOfPoints += n;
         cpuCellVector.resize(3 * n);
 
         uint_t i = 0;
-        for(auto p: pointsPerBlock_[&block]){
-            const Cell localCell = blocks_->getBlockLocalCell(block, p);
+        for(auto p: points){
+            const Cell localCell = blocks_->getBlockLocalCell(*block, p);
 
             cpuCellVector[i] = localCell[0];
             cpuCellVector[i + n] = localCell[1];
@@ -67,14 +70,15 @@ void Probe::initialise(){
 
         getBlockAndGridSize(s, threadPerBlock, numberBlocks);
 
-        gpuBlock_[&block] = threadPerBlock;
-        gpuGrid_[&block] = numberBlocks;
+        gpuBlock_[block] = threadPerBlock;
+        gpuGrid_[block] = numberBlocks;
     }
 
     extendSize[0] = 0;
     selectStart[0] = 0;
 
     writeHeaderFiles();
+    WALBERLA_LOG_INFO_ON_ROOT("Added " << totalNumberOfPoints << " probes for postprocessing")
 }
 
 void Probe::writeHeaderFiles(){
@@ -87,10 +91,16 @@ void Probe::writeHeaderFiles(){
     H5::DSetCreatPropList props;
     props.setChunk(1, chunkSize);
 
-    filesystem::path hdfFilePath( filename_.c_str() );
-    if( filesystem::exists( hdfFilePath ) )
-        std::remove( hdfFilePath.string().c_str() );
+    const auto outputFolder = setup_.outputFolder;
+    const auto fileStem = setup_.fileStem;
+    const auto fileName = outputFolder + "/" + fileStem + "Probe.h5";
+    filesystem::path hdfFilePath( fileName.c_str() );
 
+    WALBERLA_ROOT_SECTION()
+    {
+        if( filesystem::exists( hdfFilePath ) )
+             std::remove( hdfFilePath.string().c_str() );
+    }
 
     H5::FileAccPropList fileAccProps;
 #ifdef WALBERLA_BUILD_WITH_MPI
@@ -103,8 +113,8 @@ void Probe::writeHeaderFiles(){
     timeGroup.createDataSet("t", io::h5Float(), arrayDataSpace, props);
 
     uint_t counter = 0;
-    for (auto blockIterator = blocks_->begin(); blockIterator != blocks_->end(); ++blockIterator){
-        auto block = blockIterator.get();
+    for (auto const& it : pointsPerBlock_){
+        IBlock* block = it.first;
         auto* dataVector = block->getData< DataVector > ( probeDataID );
         auto & cpuCellVector = dataVector->cpuCellVector();
         const uint_t n = uint_c(dataVector->size());
@@ -144,8 +154,8 @@ void Probe::writeProbes(){
 
     writeData(tmp, "time", "t");
 
-    for (auto blockIterator = blocks_->begin(); blockIterator != blocks_->end(); ++blockIterator){
-        auto block = blockIterator.get();
+    for (auto const& it : pointsPerBlock_){
+        IBlock* block = it.first;
         auto* dataVector = block->getData< DataVector > ( probeDataID );
         dataVector->syncCPU();
         dataVector->clear();
@@ -153,8 +163,8 @@ void Probe::writeProbes(){
 
     const uint_t nDatasets = dataSetNames.size();
 
-    for (auto blockIterator = blocks_->begin(); blockIterator != blocks_->end(); ++blockIterator){
-        auto block = blockIterator.get();
+    for (auto const& it : pointsPerBlock_){
+        IBlock* block = it.first;
         auto* dataVector = block->getData< DataVector > ( probeDataID );
         auto & cpuDataVector = dataVector->cpuDataVector();
         const uint_t n = uint_c(dataVector->size());
@@ -193,19 +203,19 @@ void Probe::update(){
     ++totalExecutionCounter_;
     if (probePoints_.empty() || (totalExecutionCounter_ - uint_c(1)) % frequency_ != 0) return;
 
-    const real_t velocityScalingFactor = setup_.conversionFactor;
-
-    for (auto& block : *blocks_){
+    const float velocityScalingFactor = float_c(setup_.conversionFactor);
 
-        auto pdfs = block.getData< gpu::GPUField<double> >(ids_.pdfFieldGPU);
+    for (auto const& it : pointsPerBlock_){
+        IBlock* block = it.first;
+        auto density = block->getData< gpu::GPUField<double> >(ids_.densityFieldGPU);
+        auto velocity = block->getData< gpu::GPUField<double> >(ids_.velocityFieldGPU);
+        auto* dataVector = block->getData< DataVector > ( probeDataID );
 
-        auto* dataVector = block.getData< DataVector > ( probeDataID );
         int64_t* gpuCellVector = dataVector->gpuCellVector();
         float* gpuDataVector = dataVector->advanceGPUDataVector();
-
         const int64_t n = dataVector->size();
 
-        updateProbe(pdfs, gpuCellVector, gpuDataVector, n, velocityScalingFactor, gpuBlock_[&block], gpuGrid_[&block]);
+        updateProbe(density, velocity, gpuCellVector, gpuDataVector, n, velocityScalingFactor, gpuBlock_[block], gpuGrid_[block]);
     }
 
     ++executionCounter_;
diff --git a/apps/showcases/PorousMediaGPU/Probe.h b/apps/showcases/PorousMediaGPU/Probe.h
index 7dec08cf8ee025362edd42cf6e61dc45db3ae7d1..f787233adfd2e4d310be8d401ba3575b2f0fb729 100644
--- a/apps/showcases/PorousMediaGPU/Probe.h
+++ b/apps/showcases/PorousMediaGPU/Probe.h
@@ -106,7 +106,6 @@ class Probe
     {
 
         frequency_ = probeParameters.getParameter< uint_t >("evaluationFrequency");
-        filename_ = probeParameters.getParameter< std::string >("filename");
         writeCounter_ = probeParameters.getParameter< uint_t >("writeCounter");
 
         auto createProbeData = []( IBlock * const , StructuredBlockStorage * const ) { return new DataVector(); };
@@ -130,7 +129,6 @@ class Probe
   BlockDataID probeDataID;
 
   uint_t frequency_;
-  std::string filename_;
   uint_t writeCounter_;
   uint_t executionCounter_;
   uint_t totalExecutionCounter_;
diff --git a/apps/showcases/PorousMediaGPU/Setup.h b/apps/showcases/PorousMediaGPU/Setup.h
index 09d0723dbf70d78463efc97b642ec0f10df56bb1..d156bccc23ac1f68288bd794c6a8a05191fc9a24 100644
--- a/apps/showcases/PorousMediaGPU/Setup.h
+++ b/apps/showcases/PorousMediaGPU/Setup.h
@@ -28,14 +28,12 @@
 #include <string>
 
 #include "Types.h"
+#include "util.h"
+#include "core/Filesystem.h"
 
 namespace walberla
 {
 
-inline real_t calculateDrivingForce(const real_t frictionVelocity, const real_t channelHalfHeight){
-   return frictionVelocity * frictionVelocity / channelHalfHeight;
-}
-
 struct Setup
 {
 
@@ -75,22 +73,23 @@ struct Setup
       uc                    = turbulence.getParameter< Vector3< real_t > >("U_c");
 
       if(config->getNumBlocks("RoughWall") > 0){
-         kinViscosity                    = parameters.getParameter< real_t >("viscosity");
-         const real_t bulkReynoldsNumber = parameters.getParameter< real_t >("bulkReynoldsNumber");
+         // kinViscosity                    = parameters.getParameter< real_t >("viscosity");
          referenceLength *= real_c(0.5);
-
-         const real_t bulkVelocity = (bulkReynoldsNumber * kinViscosity) / referenceLength;
-         dt  = latticeVelocity / bulkVelocity * coarseMeshSize;
-         viscosity = (kinViscosity * dt) / (coarseMeshSize * coarseMeshSize);
          const real_t latticeReferenceLength = referenceLength / coarseMeshSize;
+         const real_t bulkReynoldsNumber = parameters.getParameter< real_t >("bulkReynoldsNumber");
+
+         viscosity = latticeReferenceLength * latticeVelocity / bulkReynoldsNumber;
+         referenceVelocity = latticeVelocity;
+         dt  = latticeVelocity / referenceVelocity * coarseMeshSize;
+
+         kinViscosity = referenceLength * referenceVelocity / bulkReynoldsNumber;
 
-         referenceVelocity = (reynoldsNumber * kinViscosity) / referenceLength;
          const real_t speedOfSound = real_c(real_c(1.0) / std::sqrt( real_c(3.0) ));
          machNumber = latticeVelocity / speedOfSound;
          omega      = real_c(real_c(1.0) / (real_c(3.0) * viscosity + real_c(0.5)));
 
          frictionVelocity = (reynoldsNumber * viscosity) / latticeReferenceLength;
-         drivingForce = calculateDrivingForce(frictionVelocity, latticeReferenceLength);
+         drivingForce = calculateDrivingForce(frictionVelocity, latticeReferenceLength, latticeVelocity, latticeVelocity);
       }
       else{
          referenceVelocity = parameters.getParameter< real_t >("referenceVelocity");
@@ -116,6 +115,8 @@ struct Setup
       timesteps = parameters.getParameter< uint_t >("timesteps");
       numGhostLayers = refinementLevels == 0 ? uint_c(1) : uint_c(2);
 
+      flowThroughCycles = (real_c(timesteps) * dt) / (domainSize[0] / (latticeVelocity * conversionFactor));
+
       valuesPerCell = (Stencil_T::Q + VelocityField_T::F_SIZE + uint_c(1) * ScalarField_T::F_SIZE);
       memoryPerCell = memory_t(valuesPerCell * sizeof(PdfField_T::value_type) + uint_c(1));
       processMemoryLimit = parameters.getParameter< memory_t >( "processMemoryLimit", memory_t( 512 ) ) * memory_t( 1024 * 1024 );
@@ -133,6 +134,14 @@ struct Setup
 
       writeSetupForestAndReturn    = logging.getParameter< bool >("writeSetupForestAndReturn", false);
       remainingTimeLoggerFrequency = logging.getParameter< real_t >("remainingTimeLoggerFrequency", 3.0);
+      outputFolder                 = logging.getParameter< std::string >("outputFolder", "output");
+      fileStem                     = logging.getParameter< std::string >("filestem", "data");
+
+      WALBERLA_ROOT_SECTION(){
+         if (!filesystem::is_directory(outputFolder) || !filesystem::exists(outputFolder)) { // Check if output folder exists
+            filesystem::create_directory(outputFolder); // create output folder
+         }
+      }
    }
 
    std::string blockForestFilestem;
@@ -172,6 +181,7 @@ struct Setup
    real_t dxC;
    real_t dxF;
    real_t dt;
+   real_t flowThroughCycles;
 
    real_t drivingForce;
    real_t conversionFactor;
@@ -201,6 +211,8 @@ struct Setup
 
    bool writeSetupForestAndReturn;
    real_t remainingTimeLoggerFrequency;
+   std::string outputFolder;
+   std::string fileStem;
 };
 
 }
\ No newline at end of file
diff --git a/apps/showcases/PorousMediaGPU/Types.h b/apps/showcases/PorousMediaGPU/Types.h
index 5d2fbe11f65c22752d59d698a3c4dbaa78ca2f3d..d27d1fbf78e93c68935940c08f61c6343e24ffbb 100644
--- a/apps/showcases/PorousMediaGPU/Types.h
+++ b/apps/showcases/PorousMediaGPU/Types.h
@@ -36,11 +36,21 @@ using BoundaryCollection_T = lbm::PorousMediaGPUBoundaryCollection< FlagField_T
 using SweepCollection_T = lbm::PorousMediaGPUSweepCollection;
 using WelfordSweep_T = pystencils::Welford;
 
+using macroFieldType = VelocityField_T::value_type;
+
+#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+#include "lbm_generated/gpu/GPUPdfField.h"
+using GPUPdfField_T = lbm_generated::GPUPdfField< StorageSpecification_T >;
+using GPUField_T = gpu::GPUField< VelocityField_T::value_type >;
+using GPUFlagField_T = gpu::GPUField< FlagField_T::value_type >;
+#endif
+
 struct IDs
 {
    BlockDataID pdfField;
    BlockDataID velocityField;
    BlockDataID meanVelocityField;
+   BlockDataID sosField;
    BlockDataID velocityFieldInflowSlice;
    BlockDataID densityField;
    BlockDataID flagField;
@@ -48,6 +58,8 @@ struct IDs
    BlockDataID pdfFieldGPU;
    BlockDataID velocityFieldGPU;
    BlockDataID meanVelocityFieldGPU;
+   BlockDataID sosFieldGPU;
    BlockDataID velocityFieldInflowSliceGPU;
    BlockDataID densityFieldGPU;
+   BlockDataID flagFieldGPU;
 };
\ No newline at end of file
diff --git a/apps/showcases/PorousMediaGPU/roughWall.prm b/apps/showcases/PorousMediaGPU/roughWall.prm
index a23472152e1ed181709a8ab8aa16f4a624a898fa..fc4b740ff9bc6e4baa03ef0500a3290447982397 100644
--- a/apps/showcases/PorousMediaGPU/roughWall.prm
+++ b/apps/showcases/PorousMediaGPU/roughWall.prm
@@ -2,11 +2,15 @@ Parameters
 {
     reynoldsNumber 400;
     bulkReynoldsNumber 4466;
-    viscosity 1.48e-5;
     latticeVelocity 0.05;
     initialiseWithlatticeVelocity true;
+    initialiseWithTurbulence true;
+    forceUpdateFrequency 10;
 
-    timesteps 1;
+    welfordSamplingStart 200000;
+    welfordSamplingFrequency 1;
+
+    timesteps 10;
 
     processMemoryLimit 512; // MiB
     innerOuterSplit <1, 1, 1>;
@@ -25,20 +29,24 @@ Turbulence
     U_c < 2.0, 0.0, 0.0 >;
 }
 
-RoughWall
+genericPorousMatrix
 {
-    hemispheresRadius 1;
-    hemispheresDistance 4;
+    poreSize 2;
+    barSize 1;
 }
 
 DomainSetup
 {
-    domainSize    < 34.6, 40, 4 >;
-    cellsPerBlock < 346, 40, 40 >;
-
+    domainSize    < 48.4, 40, 12 >;
+    cellsPerBlock < 121, 10, 30 >;
     blocks    < 1, 10, 1 >;
+
+    // domainSize    < 48.4, 40, 12 >;
+    // cellsPerBlock < 242, 20, 60 >;
+    // blocks    < 1, 10, 1 >;
+
     periodic    < true, false, true >;
-    refinementLevels 2;
+    refinementLevels 0;
     numberProcesses 1; // This is for load balancing, overwritten if more than one proc is used
     blockForestFilestem porousMediaBlockForest;
 }
@@ -46,8 +54,8 @@ DomainSetup
 
 Boundaries
 {
-    Border { direction N;    walldistance -1;  flag NoSlip; }
-    Border { direction S;    walldistance -1;  flag NoSlip; }
+    Border { direction N;    walldistance -1;  flag Obstacle; }
+    Border { direction S;    walldistance -1;  flag Obstacle; }
 }
 
 
@@ -60,12 +68,12 @@ StabilityChecker
 
 VTKWriter
 {
-    vtkWriteFrequency 1000;
+    vtkWriteFrequency 2000;
     vtkGhostLayers 0;
     velocity true;
     writeVelocityAsMagnitude false;
     density false;
-    meanVelocity false;
+    meanVelocity true;
     flag false;
     writeOnlySlice true;
     sliceThickness 1;
@@ -83,17 +91,18 @@ Logging
     readSetupFromFile false;
     remainingTimeLoggerFrequency 20; // in seconds
     writeSurfaceMeshOfTPMS false;
+    outputFolder output;
+    filestem Re400;
 }
 
 Probes
 {
     evaluationFrequency 1;
     writeCounter 1000; // after how many evaluations the results are written to file
-    filename probeData.h5;
 }
 
 DragEvaluation
 {
-    evaluationFrequency 40;
-    filename drag.h5;
+    evaluationStart 200000;
+    evaluationFrequency 10;
 }
\ No newline at end of file
diff --git a/apps/showcases/PorousMediaGPU/util.cpp b/apps/showcases/PorousMediaGPU/util.cpp
index bc7e9ceae4cb5808330ebd68c0fe9440055d73be..c1a374962b11f51ac25d569b2dffd67cebd9e4bc 100644
--- a/apps/showcases/PorousMediaGPU/util.cpp
+++ b/apps/showcases/PorousMediaGPU/util.cpp
@@ -130,4 +130,8 @@ void getAllHemisphereCenters(std::vector<Vector3<real_t>>& centers, const real_t
     }
 }
 
+real_t calculateDrivingForce(const real_t targetFrictionVelocity, const real_t referenceLenght, const real_t targetBulkVelocity, const real_t bulkVelocity){
+   return targetFrictionVelocity * targetFrictionVelocity / referenceLenght + (targetBulkVelocity - bulkVelocity) * targetBulkVelocity / referenceLenght;
+}
+
 } // namespace walberla
\ No newline at end of file
diff --git a/apps/showcases/PorousMediaGPU/util.h b/apps/showcases/PorousMediaGPU/util.h
index 2e05b671b3b9553c8206712cea633ee7850413e3..f2c76729e684f325ab1ec6e6283cf3c02edf5a34 100644
--- a/apps/showcases/PorousMediaGPU/util.h
+++ b/apps/showcases/PorousMediaGPU/util.h
@@ -35,6 +35,8 @@ Vector3<real_t> getCenterOfHemisphere(const Vector3<real_t>& p, const real_t hem
                                       const real_t domainYMin, const real_t domainYMax);
 
 bool hemispheresContains(const Vector3<real_t>& point, const Vector3<real_t>& center, const real_t radius);
-void getAllHemisphereCenters(std::vector<Vector3<real_t>>& centers, const real_t hemispheresRadius, const real_t hemispheresDistance, const math::AABB& domain);                         
+void getAllHemisphereCenters(std::vector<Vector3<real_t>>& centers, const real_t hemispheresRadius, const real_t hemispheresDistance, const math::AABB& domain);     
+
+real_t calculateDrivingForce(const real_t targetFrictionVelocity, const real_t referenceLenght, const real_t targetBulkVelocity, const real_t bulkVelocity);
 
 } // namespace walberla
\ No newline at end of file
diff --git a/apps/showcases/PorousMediaGPU/utilGPU.cu b/apps/showcases/PorousMediaGPU/utilGPU.cu
index d6b56626cfe5c67ba92bfedc021648c1ff64e3b0..c1c6f47cc95c6240dcd93bac34ea54b4daca10e6 100644
--- a/apps/showcases/PorousMediaGPU/utilGPU.cu
+++ b/apps/showcases/PorousMediaGPU/utilGPU.cu
@@ -25,50 +25,22 @@ namespace walberla
 
 namespace internal{
 
-static __device__ void densityVelocityEven(double* const _data_pdfs,
-                                           double* density, double* uX, double* uY, double* uZ, 
-                                           int64_t const ctr_0, int64_t const ctr_1, int64_t const ctr_2,
-                                           int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3)
+#if __CUDA_ARCH__ < 600
+__device__ double atomicAdd(double* address, double val)
 {
-   const double vel0Term = _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 20*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 22*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 24*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 26*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3];
-   const double momdensity_0 = vel0Term - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 19*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 21*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 23*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 25*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3];
-   const double vel1Term = _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 21*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 25*_stride_pdfs_3];
-   const double momdensity_1 = vel1Term - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 19*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 20*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 22*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 23*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 24*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 26*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_3];
-   const double vel2Term = _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 23*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 6*_stride_pdfs_3];
-   const double delta_rho = vel0Term + vel1Term + vel2Term + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 19*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 5*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2];
-   const double momdensity_2 = vel2Term - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 19*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 20*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 21*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 22*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 24*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 25*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 26*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 5*_stride_pdfs_3];
-   const double rho = delta_rho + 1.0;
-   const double u_0 = momdensity_0*((1.0) / (rho));
-   const double u_1 = momdensity_1*((1.0) / (rho));
-   const double u_2 = momdensity_2*((1.0) / (rho));
-   *density = rho;
-   *uX = u_0;
-   *uY = u_1;
-   *uZ = u_2;
-}
+    unsigned long long int* address_as_ull = (unsigned long long int*)address;
+    unsigned long long int old = *address_as_ull, assumed;
 
-static __device__ void densityVelocityOdd(double* const _data_pdfs,
-                                          double* density, double* uX, double* uY, double* uZ, 
-                                          int64_t const ctr_0, int64_t const ctr_1, int64_t const ctr_2,
-                                          int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3)
-{
-   const double vel0Term = _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 19*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 23*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 14*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 18*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 21*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 25*_stride_pdfs_3];
-   const double momdensity_0 = vel0Term - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 20*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 24*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 13*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 17*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 22*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 26*_stride_pdfs_3];
-   const double vel1Term = _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 11*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 15*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 20*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 24*_stride_pdfs_3];
-   const double momdensity_1 = vel1Term + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 19*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 23*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 21*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 25*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 12*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 16*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 22*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 26*_stride_pdfs_3];
-   const double vel2Term = _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 5*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 12*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 13*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 22*_stride_pdfs_3];
-   const double delta_rho = vel0Term + vel1Term + vel2Term + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 6*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 16*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 17*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 26*_stride_pdfs_3];
-   const double momdensity_2 = vel2Term + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 19*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 23*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 14*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 18*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 21*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 25*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 11*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 15*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 6*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 16*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 20*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 24*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 17*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 26*_stride_pdfs_3];
-   const double rho = delta_rho + 1.0;
-   const double u_0 = momdensity_0*((1.0) / (rho));
-   const double u_1 = momdensity_1*((1.0) / (rho));
-   const double u_2 = momdensity_2*((1.0) / (rho));
-   *density = rho;
-   *uX = u_0;
-   *uY = u_1;
-   *uZ = u_2;
-}
+    do {
+        assumed = old;
+        old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
 
+    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
+    } while (assumed != old);
+
+    return __longlong_as_double(old);
+}
+#endif
 
 static __device__ void singleMode(double* rX, double* rY, double* rZ, const double kappaN, const double ut,
                                   const double pX, const double pY, const double pZ, const int64_t linear_index)
@@ -195,9 +167,10 @@ static __global__ void summOfFourrierModesKernel(double * velocity, double * con
 }
 
 
-static __global__ void evaluateProbePoints(double* const pdfs, int64_t* cellData, float* probeData,
-                                           const int64_t n, const uint8_t timestep, const real_t velocityScalingFactor,
-                                           const int64_t xStride, const int64_t yStride, const int64_t zStride, const int64_t fStride)
+static __global__ void evaluateProbePoints(double* const density, double* const velocity, int64_t* cellData, float* probeData,
+                                           const int64_t n, const float velocityScalingFactor,
+                                           const int64_t xStrideDensity, const int64_t yStrideDensity, const int64_t zStrideDensity,
+                                           const int64_t xStrideVelocity, const int64_t yStrideVelocity, const int64_t zStrideVelocity, const int64_t fStrideVelocity)
 {
    if(blockDim.x*blockIdx.x + threadIdx.x < n)
    {
@@ -207,19 +180,34 @@ static __global__ void evaluateProbePoints(double* const pdfs, int64_t* cellData
       const int64_t y = cellData[ctr + n];
       const int64_t z = cellData[ctr + 2*n];
 
-      double rho, uX, uY, uZ; 
+      const float rho = ((float)(density[xStrideDensity*x + yStrideDensity*y + zStrideDensity*z]));
+      const float uX = ((float)(velocity[xStrideVelocity*x + yStrideVelocity*y + zStrideVelocity*z]));
+      const float uY = ((float)(velocity[xStrideVelocity*x + yStrideVelocity*y + zStrideVelocity*z + fStrideVelocity]));
+      const float uZ = ((float)(velocity[xStrideVelocity*x + yStrideVelocity*y + zStrideVelocity*z + 2*fStrideVelocity]));
 
-      if(((timestep & 1) ^ 1)) {
-         densityVelocityEven(pdfs, &rho, &uX, &uY, &uZ, x, y, z, xStride, yStride, zStride, fStride);
-      }
-      else{
-         densityVelocityOdd(pdfs, &rho, &uX, &uY, &uZ, x, y, z, xStride, yStride, zStride, fStride);
-      }
+      probeData[ctr] = rho;
+      probeData[ctr + n]   = velocityScalingFactor * uX;
+      probeData[ctr + 2*n] = velocityScalingFactor * uY;
+      probeData[ctr + 3*n] = velocityScalingFactor * uZ;
+   }
+}
+
+static __global__ void sumVelocity(double* result, double* const velocity, uint8_t* const flagField, const int64_t n, const double cellVolume, const uint8_t domainFlag,
+                                   const int64_t xSize, const int64_t ySize, const int64_t zSize,
+                                   const int64_t xStrideVelocity, const int64_t yStrideVelocity, const int64_t zStrideVelocity, const int64_t fStrideVelocity,
+                                   const int64_t xStrideFlag, const int64_t yStrideFlag, const int64_t zStrideFlag)
+{
+   if(blockDim.x*blockIdx.x + threadIdx.x < xSize && blockDim.y*blockIdx.y + threadIdx.y < ySize && blockDim.z*blockIdx.z + threadIdx.z < zSize)
+   {
+      const int64_t x = blockDim.x*blockIdx.x + threadIdx.x;
+      const int64_t y = blockDim.y*blockIdx.y + threadIdx.y;
+      const int64_t z = blockDim.z*blockIdx.z + threadIdx.z;
+
+      const double uX = cellVolume * velocity[xStrideVelocity*x + yStrideVelocity*y + zStrideVelocity*z + 0*fStrideVelocity];
 
-      probeData[ctr] = ((float)(rho));
-      probeData[ctr + n] = ((float)(velocityScalingFactor)) * ((float)(uX));
-      probeData[ctr + 2*n] = ((float)(velocityScalingFactor)) * ((float)(uY));
-      probeData[ctr + 3*n] = ((float)(velocityScalingFactor)) * ((float)(uZ));
+      if( ( flagField[xStrideFlag*x + yStrideFlag*y + zStrideFlag*z] & domainFlag) == domainFlag ){
+         atomicAdd(&result[n], uX);
+      }
    }
 }
 
@@ -273,22 +261,75 @@ void updateVelocityPlane(gpu::GPUField<real_t>* velocityGPU, real_t* kappa, real
 
 }
 
-void updateProbe(gpu::GPUField<real_t>* pdfs, int64_t* cellData, float* probeData, const int64_t n, const real_t velocityScalingFactor,
+void updateProbe(gpu::GPUField<real_t>* density, gpu::GPUField<real_t>* velocity, int64_t* cellData, float* probeData, const int64_t n, const float velocityScalingFactor,
                  const Vector3<uint32_t>& block, const Vector3<uint32_t>& grid){
 
    dim3 block_(block[0], block[1], block[2]);
    dim3 grid_(grid[0], grid[1], grid[2]);
 
-   double * data_pdfs = pdfs->dataAt(0, 0, 0, 0);
-   const uint8_t timestep = pdfs->getTimestep();
+   double * data_density = density->dataAt(0, 0, 0, 0);
+   double * data_velocity = velocity->dataAt(0, 0, 0, 0);
   
-   const int64_t xStride = int64_c(pdfs->xStride());
-   const int64_t yStride = int64_c(pdfs->yStride());
-   const int64_t zStride = int64_c(pdfs->zStride());
-   const int64_t fStride = int64_c(pdfs->fStride());
-   internal::evaluateProbePoints<<<grid_, block_, 0, nullptr>>>(data_pdfs, cellData, probeData, n, timestep, velocityScalingFactor,
-                                                                xStride, yStride, zStride, fStride);
+   const int64_t xStrideDensity = int64_c(density->xStride());
+   const int64_t yStrideDensity = int64_c(density->yStride());
+   const int64_t zStrideDensity = int64_c(density->zStride());
+
+   const int64_t xStrideVelocity = int64_c(velocity->xStride());
+   const int64_t yStrideVelocity = int64_c(velocity->yStride());
+   const int64_t zStrideVelocity = int64_c(velocity->zStride());
+   const int64_t fStrideVelocity = int64_c(velocity->fStride());
+   internal::evaluateProbePoints<<<grid_, block_, 0, nullptr>>>(data_density, data_velocity, cellData, probeData, n, velocityScalingFactor,
+                                                                xStrideDensity, yStrideDensity, zStrideDensity, xStrideVelocity, yStrideVelocity, zStrideVelocity, fStrideVelocity);
+
+}
+
+void BulkVelocityCalculator::calculateBulkVelocity() {
+   dim3 b(block_[0], block_[1], block_[2]);
+   dim3 g(grid_[0], grid_[1], grid_[2]);
+
+   int64_t counter = 0;
+   for( auto it = blocks_->begin(); it != blocks_->end(); ++it ){
+      auto* block = dynamic_cast< Block* >(it.get());
+      const uint_t level = block->getLevel();
+      const double scaleFactor = double_c(1.0) / double_c( 1 << level );
+      const double cellVolume = scaleFactor * scaleFactor * scaleFactor;
+      GPUField_T* velocityField = block->template getData<GPUField_T>(ids_.velocityFieldGPU);
+      GPUFlagField_T* flagField = block->template getData<GPUFlagField_T>(ids_.flagFieldGPU);
+      
+      double* const data_velocity = velocityField->dataAt(0, 0, 0, 0);
+      uint8_t* const data_flagField = flagField->dataAt(0, 0, 0, 0);
+
+      const int64_t xSize = int64_c(velocityField->xSize());
+      const int64_t ySize = int64_c(velocityField->ySize());
+      const int64_t zSize = int64_c(velocityField->zSize());
+
+      const int64_t xStrideVelocity = int64_c(velocityField->xStride());
+      const int64_t yStrideVelocity = int64_c(velocityField->yStride());
+      const int64_t zStrideVelocity = int64_c(velocityField->zStride());
+      const int64_t fStrideVelocity = int64_c(velocityField->fStride());
+
+      const int64_t xStrideFlag = int64_c(flagField->xStride());
+      const int64_t yStrideFlag = int64_c(flagField->yStride());
+      const int64_t zStrideFlag = int64_c(flagField->zStride());
+
+      internal::sumVelocity<<<g, b, 0, nullptr>>>(resultGPU_, data_velocity, data_flagField, counter, cellVolume, setup_.fluidFlag,
+                                                  xSize, ySize, zSize,
+                                                  xStrideVelocity, yStrideVelocity, zStrideVelocity, fStrideVelocity,
+                                                  xStrideFlag, yStrideFlag, zStrideFlag);
+
+      
+   }
+   WALBERLA_GPU_CHECK(gpuMemcpy(&resultCPU_[0], resultGPU_, sizeof(double) * resultCPU_.size(), gpuMemcpyDeviceToHost ));
+
+   double bulkVelocity = double_c(0.0);
+   for (auto& bulkVelocityPerBlock : resultCPU_){
+      bulkVelocity += bulkVelocityPerBlock;
+   }
+
+   mpi::allReduceInplace< double >(bulkVelocity, mpi::SUM);
+   bulkVelocity_ = double_c(bulkVelocity) / volume_;
 
+   WALBERLA_GPU_CHECK(gpuMemset(resultGPU_, 0.0, sizeof(double) * resultCPU_.size()));
 }
 
 
diff --git a/apps/showcases/PorousMediaGPU/utilGPU.h b/apps/showcases/PorousMediaGPU/utilGPU.h
index 6b0ac4cf9ce1c3cca0f3303b842bcb50888b0d90..7fff9bffc84046754d3e7cfcd8c52f1fdb2ddc5c 100644
--- a/apps/showcases/PorousMediaGPU/utilGPU.h
+++ b/apps/showcases/PorousMediaGPU/utilGPU.h
@@ -19,6 +19,8 @@
 //======================================================================================================================
 #pragma once
 
+#include "blockforest/Block.h"
+
 #include "core/DataTypes.h"
 #include "core/math/Constants.h"
 
@@ -27,14 +29,87 @@
 #include "gpu/FieldCopy.h"
 #include "gpu/GPUField.h"
 
+#include "Types.h"
+#include "Setup.h"
+
 namespace walberla{
+
 void getBlockAndGridSize(Vector3<uint_t> s, Vector3<uint32_t>& block, Vector3<uint32_t>& grid);
 
 void updateVelocityPlane(gpu::GPUField<real_t>* velocityGPU, real_t* kappa, real_t* ut, const Vector3<real_t>& globalMinCorner, const Vector3<real_t>& blockMinCorner, const Vector3<real_t>& uc,
                          const real_t dx, const real_t t, const uint_t n, const real_t latticeVelocity, const int64_t nGhostLayers,
                          const Vector3<uint32_t>& block, const Vector3<uint32_t>& grid);
 
-void updateProbe(gpu::GPUField<real_t>* pdfs, int64_t* cellData, float* probeData, const int64_t n, const real_t velocityScalingFactor,
+void updateProbe(gpu::GPUField<real_t>* density, gpu::GPUField<real_t>* velocity, int64_t* cellData, float* probeData, const int64_t n, const float velocityScalingFactor,
                  const Vector3<uint32_t>& block, const Vector3<uint32_t>& grid);
 
+
+
+class BulkVelocityCalculator {
+public:
+    BulkVelocityCalculator(const std::shared_ptr<StructuredBlockForest> & blocks, const IDs& ids, const Setup& setup) 
+    : blocks_(blocks), setup_(setup), ids_(ids)
+    {
+#if __CUDA_ARCH__ < 600
+    WALBERLA_LOG_WARNING_ON_ROOT("Warning atomic add in double precision not supported. Falling back to software implementation")
+#endif
+
+        volume_ = 0.0;
+        uint_t nBlocks = 0;
+        for( auto it = blocks_->begin(); it != blocks_->end(); ++it ){
+            auto* block = dynamic_cast< Block* >(it.get());
+            const uint_t level = block->getLevel();
+
+            const double scaleFactor = double_c(1.0) / double_c( 1 << level );
+            const double cellVolume = scaleFactor * scaleFactor * scaleFactor;
+            auto * flagField = block->template getData<FlagField_T>(ids.flagField);
+            auto domainFlag = flagField->getFlag(setup.fluidUID);
+            const auto ci = flagField->xyzSize();
+            for(auto cellIt = ci.begin(); cellIt != ci.end(); ++cellIt) {
+                if(isFlagSet(flagField->get(*cellIt), domainFlag)){
+                    volume_ += cellVolume;
+                }
+            }
+            nBlocks++;
+        }
+
+        const Vector3<uint_t> s(blocks_->getNumberOfXCellsPerBlock(), blocks_->getNumberOfYCellsPerBlock(), blocks_->getNumberOfZCellsPerBlock());
+        getBlockAndGridSize(s, block_, grid_);
+        mpi::allReduceInplace< double >(volume_, mpi::SUM);
+
+
+        resultCPU_.resize(nBlocks);
+
+        WALBERLA_GPU_CHECK(gpuMalloc((void **)&resultGPU_, sizeof(double) * resultCPU_.size() ));
+        WALBERLA_GPU_CHECK(gpuMemset(resultGPU_, 0.0, sizeof(double) * resultCPU_.size()));
+
+    }
+
+    double bulkVelocity() const { return bulkVelocity_; }
+    void setBulkVelocity(const double bulkVelocity) { bulkVelocity_ = bulkVelocity; }
+    void calculateBulkVelocity();
+
+    real_t getDrivingForce() {
+        calculateBulkVelocity();
+
+        const real_t latticeReferenceLength = setup_.referenceLength / setup_.dxC;
+        return calculateDrivingForce(setup_.frictionVelocity, latticeReferenceLength, setup_.latticeVelocity, bulkVelocity_);
+    }
+
+private:
+    const std::shared_ptr< StructuredBlockForest > blocks_;
+    const Setup setup_;
+    const IDs ids_;
+
+    double volume_;
+    double bulkVelocity_{0.0};
+
+    std::vector<double> resultCPU_;
+    double* resultGPU_;
+
+    Vector3<uint32_t> block_;
+    Vector3<uint32_t> grid_;
+};
+
+
 } // namespace walberla
\ No newline at end of file
diff --git a/python/pystencils_walberla/jinja_filters.py b/python/pystencils_walberla/jinja_filters.py
index b1c61b589aaa585406b554e19861bf2f6c8e193d..d61adf2a5c156c7e610c7f4f5a53e6932bb404be 100644
--- a/python/pystencils_walberla/jinja_filters.py
+++ b/python/pystencils_walberla/jinja_filters.py
@@ -43,7 +43,7 @@ delete_loop = """
 """
 
 standard_parameter_registration = """
-for (uint_t level = 0; level < blocks->getNumberOfLevels(); level++)
+for (uint_t level = 0; level < blocks_->getNumberOfLevels(); level++)
 {{
     const {dtype} level_scale_factor = {dtype}(uint_t(1) << level);
     {function_rules}
@@ -760,11 +760,19 @@ def generate_setter(kernel_infos, parameter_registration=None):
     for kernel_info in kernel_infos:
         for param in kernel_info.parameters:
             if not param.is_field_parameter and param.symbol.name not in params_to_skip:
-                if param.symbol.name in scaling_identifiers:
-                    continue
                 dtype = param.symbol.dtype
                 name = param.symbol.name
-                code_line = f"inline void set{name.capitalize()}(const {dtype} value) {{ {name}_ = value; }}"
+
+                if param.symbol.name in scaling_identifiers:
+                    idx = scaling_identifiers.index(param.symbol.name)
+                    lambda_expr = parameter_registration.scaling_info[idx][2]
+                    raw_scaling = convert_to_raw_form(name, lambda_expr, variable_name="value", setter=True)
+                    function_rule = raw_scaling.format(dtype=dtype)
+                    code = standard_parameter_registration.format(dtype=dtype, function_rules=function_rule)
+                    code_line = f"inline void set{name.capitalize()}(const {dtype} value) {{ {code}}}"
+                else:
+                    code_line = f"inline void set{name.capitalize()}(const {dtype} value) {{ {name}_ = value; }}"
+
                 result.append(code_line)
                 params_to_skip.append(param.symbol.name)
 
@@ -803,7 +811,9 @@ def generate_parameter_registration(kernel_infos, parameter_registration):
     return "\n".join(result)
 
 
-def convert_to_raw_form(name, expression):
+def convert_to_raw_form(name, expression, variable_name=None, setter=False):
+    if variable_name is None:
+        variable_name = name
     expr_str = str(expression)
 
     # Extract the body of the lambda expression
@@ -812,8 +822,12 @@ def convert_to_raw_form(name, expression):
 
     # Replace numeric constants with the {dtype}(number) format
     body = re.sub(r'(?<!\w)(\d+(\.\d*)?|\.\d+)(?!\w)', lambda match: f'{{dtype}}({match.group(1)})', body)
+    body = body.replace(name, variable_name)
 
-    return f"{name}Vector.push_back({{dtype}}({body}));"
+    if setter:
+        return f"{name}Vector[level] = {{dtype}}({body});"
+    else:
+        return f"{name}Vector.push_back({{dtype}}({body}));"
 
 
 def generate_constructor(kernel_infos, parameter_registration):
diff --git a/src/io/hdfWrapper.h b/src/io/hdfWrapper.h
index bd090771383ea156dbb2c8612bd7affd8b9d4175..46dcafd56520106df540fdf0ebae67e5c8b22746 100644
--- a/src/io/hdfWrapper.h
+++ b/src/io/hdfWrapper.h
@@ -36,6 +36,28 @@ H5::FloatType h5Double();
 H5::FloatType h5Float();
 H5::FloatType h5FileFloat();
 
+template< typename T >
+H5::DataType getH5ExportType();
+
+template<>
+inline H5::DataType getH5ExportType< uint8_t >()
+{
+  return H5::FloatType(H5::PredType::STD_U8LE);
+}
+
+template<>
+inline H5::DataType getH5ExportType< float >()
+{
+  return H5::FloatType(H5::PredType::IEEE_F32LE);
+}
+
+template<>
+inline H5::DataType getH5ExportType< double >()
+{
+  return H5::FloatType(H5::PredType::IEEE_F64LE);
+}
+
+
 
 
 }
\ No newline at end of file