diff --git a/apps/showcases/PorousMediaGPU/CMakeLists.txt b/apps/showcases/PorousMediaGPU/CMakeLists.txt
index 9c59796b9b73f04c95ae4c6cdef70be476cb320c..087cab7fb27ee5fb37914277b230395b5c055b7a 100644
--- a/apps/showcases/PorousMediaGPU/CMakeLists.txt
+++ b/apps/showcases/PorousMediaGPU/CMakeLists.txt
@@ -10,19 +10,21 @@ waLBerla_generate_target_from_python(NAME PorousMediaGPUCodeGen
         OUT_FILES PorousMediaGPUSweepCollection.h PorousMediaGPUSweepCollection.${CODEGEN_FILE_SUFFIX}
         PorousMediaGPUStorageSpecification.h PorousMediaGPUStorageSpecification.${CODEGEN_FILE_SUFFIX}
         NoSlip.h NoSlip.${CODEGEN_FILE_SUFFIX}
+        Obstacle.h Obstacle.${CODEGEN_FILE_SUFFIX}
         UBB.h UBB.${CODEGEN_FILE_SUFFIX}
         Outflow.h Outflow.${CODEGEN_FILE_SUFFIX}
+        Welford.h Welford.${CODEGEN_FILE_SUFFIX}
         PorousMediaGPUBoundaryCollection.h
         PorousMediaGPUInfoHeader.h
         PorousMediaGPUStaticDefines.h)
 
 if (WALBERLA_BUILD_WITH_CUDA OR WALBERLA_BUILD_WITH_HIP)
     waLBerla_add_executable ( NAME PorousMediaGPU
-            FILES PorousMediaGPU.cpp GridGeneration.h Setup.h Types.h InitializerFunctions.cpp TPMS.cpp Inflow.cpp Probe.cpp util.cpp utilGPU.cu
+            FILES PorousMediaGPU.cpp GridGeneration.h Setup.h Types.h DragEvaluation.cpp InitializerFunctions.cpp TPMS.cpp Inflow.cpp Probe.cpp util.cpp utilGPU.cu
             DEPENDS blockforest boundary core field gpu io lbm_generated postprocessing stencil timeloop vtk PorousMediaGPUCodeGen )
 else()
     waLBerla_add_executable ( NAME PorousMediaGPU
-            FILES PorousMediaGPU.cpp GridGeneration.h Setup.h Types.h InitializerFunctions.cpp TPMS.cpp Inflow.cpp Probe.cpp util.cpp
+            FILES PorousMediaGPU.cpp GridGeneration.h Setup.h Types.h DragEvaluation.cpp InitializerFunctions.cpp TPMS.cpp Inflow.cpp Probe.cpp util.cpp
             DEPENDS blockforest boundary core field io lbm_generated postprocessing stencil timeloop vtk PorousMediaGPUCodeGen )
 
 # target_include_directories(PorousMediaGPU PUBLIC ${HDF5_CXX_INCLUDE_DIRS})
diff --git a/apps/showcases/PorousMediaGPU/DragEvaluation.cpp b/apps/showcases/PorousMediaGPU/DragEvaluation.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d8107e6df742ca088338a141550ed64a151fb073
--- /dev/null
+++ b/apps/showcases/PorousMediaGPU/DragEvaluation.cpp
@@ -0,0 +1,180 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file Evaluation.cpp
+//! \author Markus Holzer <markus.holzer@fau.de>
+//
+//======================================================================================================================
+#include "DragEvaluation.h"
+
+
+namespace walberla
+{
+
+DragEvaluation::DragEvaluation(std::shared_ptr< StructuredBlockForest >& blocks, BoundaryCollection_T & boundaryCollection,
+                               const IDs& ids, const Setup& setup, std::shared_ptr<Config>& config)
+   : blocks_(blocks), boundaryCollection_(boundaryCollection), ids_(ids), setup_(setup)
+{
+   quantities_ = {{"timestep", 0}, {"Fx", 1}, {"Fy", 2}, {"Fz", 3}, {"CDReal", 4}, {"CLReal", 5}};
+
+   WALBERLA_ASSERT_NOT_NULLPTR(blocks)
+   maxLevel_ = blocks->getDepth();
+   blocks->getBlocks(finestBlocks_, maxLevel_);
+
+   auto parameters     = config->getOneBlock("DragEvaluation");
+   auto wallParameters = config->getOneBlock("RoughWall");
+
+   dragFilename_   = parameters.getParameter< std::string >("filename");
+   checkFrequency_ = parameters.getParameter< uint_t >("evaluationFrequency");
+   writeCounter_   = 1000 * checkFrequency_;
+   dt_             = setup.dt / real_c( 1 << setup.refinementLevels );
+   timesteps_      = setup.timesteps * ( 1 << setup.refinementLevels );
+
+   const real_t radius   = wallParameters.getParameter< real_t >("hemispheresRadius");
+   const real_t distance = wallParameters.getParameter< real_t >("hemispheresDistance");
+
+   filesystem::path hdfFilePath( dragFilename_.c_str() );
+   if( filesystem::exists( hdfFilePath ) )
+      std::remove( hdfFilePath.string().c_str() );
+
+
+   extendSize[0] = 0;
+   selectStart[0] = 0;
+   H5::FileAccPropList fileAccProps;
+#ifdef WALBERLA_BUILD_WITH_MPI
+   //  Enable MPI-IO
+   H5Pset_fapl_mpio(fileAccProps.getId(), mpi::MPIManager::instance()->comm(), MPI_INFO_NULL);
+#endif
+   h5file_ = std::make_unique<H5::H5File>(hdfFilePath.c_str(), H5F_ACC_CREAT | H5F_ACC_RDWR, H5::FileCreatPropList::DEFAULT, fileAccProps);
+   H5::Group dataGroup = h5file_->createGroup("Data");
+
+   const hsize_t dsetSize[1]{ 1 };
+   const hsize_t maxSize[1]{ H5S_UNLIMITED };
+   const hsize_t chunkSize[1]{writeCounter_};
+
+   H5::DataSpace arrayDataSpace(1, dsetSize, maxSize);
+   H5::DSetCreatPropList props;
+   props.setChunk(1, chunkSize);
+
+   for (auto quantitie = quantities_.begin(); quantitie != quantities_.end(); quantitie++){
+      dataGroup.createDataSet(quantitie->first, io::h5Float(), arrayDataSpace, props);
+   }
+   dragResults_.resize(quantities_.size() * writeCounter_);
+
+   auto domain = blocks->getDomain();
+   const real_t yMin = domain.yMin();
+   const real_t yMax = domain.yMax();
+
+   std::vector<Vector3<real_t>> hemisphereCenters;
+   getAllHemisphereCenters(hemisphereCenters, radius, distance, domain);
+
+   const double pi = double_c(3.14159265358979323846);
+   surfaceAreaHemiSpheres_ = double_c(0.0);
+   double totalCircleArea = double_c(0.0);
+
+   for (auto center : hemisphereCenters){
+      double surfaceAreaHemiSphere = double_c(2.0) * pi * double_c(radius * radius);
+      double circleArea = pi * double_c(radius * radius);
+      if(center[0] < (domain.xMin() + setup_.dxF) || center[0] > (domain.xMax() - setup_.dxF)){
+         surfaceAreaHemiSphere *= double_c(0.5);
+         circleArea *= double_c(0.5);
+      }
+      if(center[2] < (domain.zMin() + setup_.dxF) || center[2] > (domain.zMax() - setup_.dxF)){
+         surfaceAreaHemiSphere *= double_c(0.5);
+         circleArea *= double_c(0.5);
+      }
+      surfaceAreaHemiSpheres_ += surfaceAreaHemiSphere;
+      totalCircleArea += circleArea;
+   }
+
+   surfaceAreaWall_ = double_c(2.0) * (domain.xMax() - domain.xMin()) * (domain.zMax() - domain.zMin());
+   surfaceAreaWall_ -= totalCircleArea;
+   WALBERLA_CHECK_GREATER_EQUAL(surfaceAreaWall_, real_c(0.0))
+
+   // TODO
+   meanVelocity_        = 1.0;
+
+   WALBERLA_LOG_INFO_ON_ROOT(
+      "Evaluation initialised:"
+         "\n   + Total surface area of Hemispheres:     " << surfaceAreaHemiSpheres_
+      << "\n   + Surface area of the wall:              " << surfaceAreaWall_
+   )
+
+}
+
+void DragEvaluation::forceCalculation(const uint_t level)
+{
+   if (checkFrequency_ == uint_c(0))
+      return;
+
+   ++totalExecutionCounter_;
+
+   if(level == maxLevel_){
+      for (auto b : finestBlocks_){
+         force_ += Vector3<double>(boundaryCollection_.ObstacleObject->getForce(b));
+      }
+
+      mpi::reduceInplace(force_, mpi::SUM);
+      WALBERLA_ROOT_SECTION(){
+         const double t = double_c(totalExecutionCounter_) * dt_;
+
+         const double meanU2 = double_c(meanVelocity_) * double_c(meanVelocity_);
+
+         const double cDRealArea = (double_c(8.0) * force_[0]) / (meanU2 * double_c(surfaceAreaHemiSpheres_));
+         const double cLRealArea = (double_c(8.0) * force_[1]) / (meanU2 * double_c(surfaceAreaHemiSpheres_));
+
+         dragResults_[quantities_["timestep"]*writeCounter_ + executionCounter_] = float_c(t);
+         dragResults_[quantities_["Fx"]*writeCounter_ + executionCounter_] = float_c(force_[0]);
+         dragResults_[quantities_["Fy"]*writeCounter_ + executionCounter_] = float_c(force_[1]);
+         dragResults_[quantities_["Fz"]*writeCounter_ + executionCounter_] = float_c(force_[2]);
+         dragResults_[quantities_["CDReal"]*writeCounter_ + executionCounter_] = float_c(cDRealArea);
+         dragResults_[quantities_["CLReal"]*writeCounter_ + executionCounter_] = float_c(cLRealArea);
+      }
+   }
+   force_[0] = double_c(0.0);
+   force_[1] = double_c(0.0);
+   force_[2] = double_c(0.0);
+
+   ++executionCounter_;
+   if(executionCounter_ >= writeCounter_ || totalExecutionCounter_ == timesteps_){
+      writeResults();
+   }
+}
+
+
+void DragEvaluation::writeResults()
+{
+   extendSize[0] += executionCounter_;
+   const hsize_t selectCount[1] { executionCounter_ };
+   WALBERLA_ROOT_SECTION()
+   {
+      auto group = h5file_->openGroup("Data");
+      for (auto quantitie = quantities_.begin(); quantitie != quantities_.end(); quantitie++){
+         
+         H5::DataSet dataset = group.openDataSet(quantitie->first);
+         dataset.extend(extendSize);
+         
+         H5::DataSpace dataspace = dataset.getSpace();
+         dataspace.selectHyperslab(H5S_SELECT_SET, selectCount, selectStart);
+            
+         H5::DataSpace memSpace(1, selectCount);
+         dataset.write(&dragResults_[quantitie->second * writeCounter_], io::h5Float(), memSpace, dataspace);
+      }
+   }
+   selectStart[0] += (executionCounter_);
+   executionCounter_ = 0;
+}
+
+}
\ No newline at end of file
diff --git a/apps/showcases/PorousMediaGPU/DragEvaluation.h b/apps/showcases/PorousMediaGPU/DragEvaluation.h
new file mode 100644
index 0000000000000000000000000000000000000000..7e0a4461d685a76781d822306a057d8185e747a0
--- /dev/null
+++ b/apps/showcases/PorousMediaGPU/DragEvaluation.h
@@ -0,0 +1,95 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file Evaluation.cpp
+//! \author Markus Holzer <markus.holzer@fau.de>
+//
+//======================================================================================================================
+# pragma once
+
+#include "core/config/Config.h"
+#include "core/math/Constants.h"
+#include "core/Filesystem.h"
+#include "core/math/Sample.h"
+#include "core/math/Vector3.h"
+
+#include "lbm_generated/field/PdfField.h"
+
+#include "io/hdfWrapper.h"
+
+#include "Types.h"
+#include "Setup.h"
+#include "util.h"
+
+#include <deque>
+#include <fstream>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+using namespace walberla;
+
+namespace walberla
+{
+class DragEvaluation
+{
+ public:
+   DragEvaluation(std::shared_ptr< StructuredBlockForest >& blocks, BoundaryCollection_T & boundaryCollection,
+                  const IDs& ids, const Setup& setup, std::shared_ptr<Config>& config);
+
+   void forceCalculation(const uint_t level); // for calculating the force
+   void writeResults();
+
+   std::function<void (const uint_t)> forceCalculationFunctor()
+   {
+      return [this](uint_t level) { forceCalculation(level); };
+   }
+
+ protected:
+   bool initialized_{false};
+
+   std::shared_ptr< StructuredBlockForest > blocks_;
+   BoundaryCollection_T & boundaryCollection_;
+   IDs ids_;
+   Setup setup_;
+
+   std::map<std::string, uint_t> quantities_;
+   std::vector<Block *> finestBlocks_;
+   uint_t maxLevel_;
+   real_t dt_;
+   uint_t timesteps_;
+
+   std::string dragFilename_;
+   uint_t checkFrequency_;
+   uint_t writeCounter_;
+   uint_t executionCounter_{0};
+   uint_t totalExecutionCounter_{0};
+
+   std::unique_ptr<H5::H5File> h5file_;
+   hsize_t extendSize[1];
+   hsize_t selectStart[1];
+
+   std::vector<float> dragResults_;
+
+   double surfaceAreaHemiSpheres_;
+   double surfaceAreaWall_;
+   double meanVelocity_;
+
+   Vector3< double > force_;
+
+}; // class DragEvaluation
+
+}
\ No newline at end of file
diff --git a/apps/showcases/PorousMediaGPU/Inflow.cpp b/apps/showcases/PorousMediaGPU/Inflow.cpp
index 7341db6add36c06e2f4dfa2feb07c2f19624a07b..a97dfd74cb90f38549e4f7af41490428cc666ac1 100644
--- a/apps/showcases/PorousMediaGPU/Inflow.cpp
+++ b/apps/showcases/PorousMediaGPU/Inflow.cpp
@@ -36,6 +36,9 @@ void Inflow::logging(){
 }
 
 void Inflow::updateCPU(){
+    if(!setup_.turbulentInflow){
+        return;
+    }
 
     const real_t t = real_c(executionCounter_) * dt_;
     for (auto& block : *blocks_){
@@ -68,6 +71,9 @@ void Inflow::updateCPU(){
 }
 
 void Inflow::updateGPU(){
+    if(!setup_.turbulentInflow){
+        return;
+    }
 
     const real_t t = real_c(executionCounter_) * dt_;
     for (auto& block : *blocks_){
@@ -86,7 +92,7 @@ void Inflow::init(){
     nu_    = setup_.kinViscosity;
     urms_  = setup_.urms;
     le_    = setup_.characteristicLength;
-    n_     = setup_.numberOfFourrierModes;
+    n_     = blockDimX_ * (1 + uint_c(std::floor(setup_.numberOfFourrierModes / blockDimX_)));
     dt_    = setup_.dt;
     dx_    = setup_.dxC;
     uc_[0] = setup_.uc[0];
@@ -126,7 +132,7 @@ void Inflow::init(){
 
     globalMinCorner_ = blocks_->getDomain().min();
 
-    const Vector3<uint_t> s(n_, setup_.cellsPerBlock[1] + 2 * nGhostLayers_, setup_.cellsPerBlock[2] + 2 * nGhostLayers_);
+    const Vector3<uint_t> s(blockDimX_, setup_.cellsPerBlock[1] + 2 * nGhostLayers_, setup_.cellsPerBlock[2] + 2 * nGhostLayers_);
     getBlockAndGridSize(s, block_, grid_);
 
     WALBERLA_GPU_CHECK(gpuMalloc((void **)&kappaGPU_, sizeof(real_t) * kappa_.size() ));
diff --git a/apps/showcases/PorousMediaGPU/Inflow.h b/apps/showcases/PorousMediaGPU/Inflow.h
index d089bf3195207a1531e754ed47ff25077ef9e03e..192106a124a88f82706fc07c4683c690e7a103de 100644
--- a/apps/showcases/PorousMediaGPU/Inflow.h
+++ b/apps/showcases/PorousMediaGPU/Inflow.h
@@ -78,6 +78,7 @@ class Inflow
    Vector3<real_t> uc_;
    real_t latticeVelocity_;
    const int64_t nGhostLayers_{1};
+   const uint_t blockDimX_{128};
 
    real_t firstMode_;
    real_t lastMode_;
diff --git a/apps/showcases/PorousMediaGPU/InitializerFunctions.cpp b/apps/showcases/PorousMediaGPU/InitializerFunctions.cpp
index e94eab551caee207f679acdb1d3cd5d53508cf88..73a62abc2ee8d933291dcfacf86c73cd9a0e3263 100644
--- a/apps/showcases/PorousMediaGPU/InitializerFunctions.cpp
+++ b/apps/showcases/PorousMediaGPU/InitializerFunctions.cpp
@@ -68,74 +68,31 @@ void initGrid(const shared_ptr< StructuredBlockStorage >& blocks, const BlockDat
 void initHemisphers(const shared_ptr< StructuredBlockStorage >& blocks, const BlockDataID flagFieldID, const Setup& setup,
                     const Config::BlockHandle& wallParameters){
 
-
-   WALBERLA_LOG_INFO_ON_ROOT("Creating rough wall")
-
    const real_t radius   = wallParameters.getParameter< real_t >("hemispheresRadius");
    const real_t hemispheresDistance = wallParameters.getParameter< real_t >("hemispheresDistance");
 
-   const real_t diameter = real_c(2.0) * radius;
-   const real_t spaceX   = (math::root_three * std::sqrt(hemispheresDistance * hemispheresDistance) ) / real_c(2.0);
-
-   const real_t radiusSquared = radius * radius;
-   std::vector<AABB> aabbs;
-
    auto domain = blocks->getDomain();
-
-   const real_t xMax = domain.xMax();
+   const real_t yMin = domain.yMin();
    const real_t yMax = domain.yMax();
-   const real_t zMax = domain.zMax();
    
-   const uint_t nx = uint_c(std::ceil(xMax / spaceX) + 1);
-   const uint_t nz = uint_c(std::ceil(zMax / hemispheresDistance) + 1);
-
-   for(uint_t i = 0; i < nx; ++i){
-      for(uint_t j = 0; j < nz; ++j){
-         const real_t xc = real_c(i) * spaceX;
-         real_t zc = real_c(j) * hemispheresDistance;
-         if(i % 2 != 0){
-            zc += diameter;
-         }
-
-         const Vector3<real_t> minCornerBottom(xc - radius, real_c(0.0), zc - radius);
-         const Vector3<real_t> maxCornerBottom(xc + radius, radius, zc + radius);
-         const Vector3<real_t> minCornerTop(xc - radius, yMax - radius, zc - radius);
-         const Vector3<real_t> maxCornerTop(xc + radius, yMax, zc + radius);
-
-         const AABB minaabb(minCornerBottom, maxCornerBottom);
-         const AABB maxaabb(minCornerTop, maxCornerTop);
-
-         aabbs.emplace_back(minaabb);
-         aabbs.emplace_back(maxaabb);
-      }
-   }
-
    for (auto& block : *blocks){
       const auto flagField    = block.getData< FlagField_T >(flagFieldID);
       const auto boundaryFlag = flagField->getOrRegisterFlag(setup.obstacleUID);
       
-      WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(flagField, Cell globalCell;
+      WALBERLA_FOR_ALL_CELLS_XYZ(flagField, Cell globalCell;
          blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z));
          Vector3<real_t> p;
          blocks->getCellCenter(p, globalCell, 0);
+         if(p[1] > radius && p[1] < yMax - radius){
+            continue;
+         }
 
-         for(auto aabb: aabbs){
-            if (aabb.contains(p)){
-               if(hemispheresContains(p, aabb.center(), radiusSquared)) {
-                  
-                  addFlag(flagField->get(x, y, z), boundaryFlag);
-               }
-            }
+         auto center = getCenterOfHemisphere(p, radius, hemispheresDistance, yMin, yMax);
+         if(hemispheresContains(p, center, radius)) {
+            addFlag(flagField->get(x, y, z), boundaryFlag);
          }
       )
    }
-
-   
-}
-
-bool hemispheresContains(const Vector3<real_t>& point, const Vector3<real_t>& center, const real_t radiusSquared )
-{
-   return (point - center).sqrLength() <= radiusSquared;
 }
 
 } // namespace walberla
diff --git a/apps/showcases/PorousMediaGPU/InitializerFunctions.h b/apps/showcases/PorousMediaGPU/InitializerFunctions.h
index 004441e68f690bb60a4f5286e91e7ac3ef2f95bd..1499a29ac228ffa16c33cd788ed619ca73a90a68 100644
--- a/apps/showcases/PorousMediaGPU/InitializerFunctions.h
+++ b/apps/showcases/PorousMediaGPU/InitializerFunctions.h
@@ -27,6 +27,7 @@
 #include "field/vtk/VTKWriter.h"
 
 #include "Setup.h"
+#include "util.h"
 
 namespace walberla{
 void initGrid(const shared_ptr< StructuredBlockStorage >& blocks, const BlockDataID flagFieldID, const Setup& setup);
diff --git a/apps/showcases/PorousMediaGPU/PorousMediaGPU.cpp b/apps/showcases/PorousMediaGPU/PorousMediaGPU.cpp
index b26fface76636d1ae507268f8853e470a522eda6..7d32b03521b7e6d5821ed890117aa8c6495a3e6f 100644
--- a/apps/showcases/PorousMediaGPU/PorousMediaGPU.cpp
+++ b/apps/showcases/PorousMediaGPU/PorousMediaGPU.cpp
@@ -79,6 +79,7 @@
 #include <iostream>
 #include <memory>
 
+#include "DragEvaluation.h"
 #include "Inflow.h"
 #include "InitializerFunctions.h"
 #include "GridGeneration.h"
@@ -86,17 +87,13 @@
 #include "Probe.h"
 #include "TPMS.h"
 #include "Types.h"
-
-
+#include "util.h"
 
 #include "PorousMediaGPUStaticDefines.h"
 
 using namespace walberla;
 using macroFieldType = VelocityField_T::value_type;
 
-using BoundaryCollection_T = lbm::PorousMediaGPUBoundaryCollection< FlagField_T >;
-using SweepCollection_T = lbm::PorousMediaGPUSweepCollection;
-
 #if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
 using GPUPdfField_T = lbm_generated::GPUPdfField< StorageSpecification_T >;
 using gpu::communication::NonUniformGPUScheme;
@@ -112,6 +109,68 @@ inline Vector3< uint_t > inflowSliceSize( const shared_ptr< StructuredBlockStora
    return Vector3<uint_t>( uint_c(1), blocks->getNumberOfYCells( *block ), blocks->getNumberOfZCells( *block ) );
 }
 
+class HemisphereWallDistance
+{
+ public:
+   HemisphereWallDistance(const real_t radius, const real_t hemispheresDistance) : radius_(radius), hemispheresDistance_(hemispheresDistance){} 
+
+   real_t operator()(const Cell& fluidCell, const Cell& boundaryCell,
+                     const shared_ptr< StructuredBlockForest >& SbF, IBlock& block){
+
+      auto domain = SbF->getDomain();
+      const real_t yMin = domain.yMin();
+      const real_t yMax = domain.yMax();
+
+      Vector3< real_t > boundary = SbF->getBlockLocalCellCenter( block, boundaryCell );
+      Vector3< real_t > fluid    = SbF->getBlockLocalCellCenter( block, fluidCell );
+
+      auto center = getCenterOfHemisphere(boundary, radius_, hemispheresDistance_, yMin, yMax);
+
+      hemispheresContains(boundary, center, radius_);
+
+      WALBERLA_CHECK(!hemispheresContains(fluid, center, radius_), "fluid cell is in contained in Hemisphere (" << fluid[0] << ", " << fluid[1] << ", " << fluid[2] << "). The block local cell is " << fluidCell)
+      WALBERLA_CHECK(hemispheresContains(boundary, center, radius_), "boundary cell is not in contained in Hemisphere (" << boundary[0] << ", " << boundary[1] << ", " << boundary[2] << "). The block local cell is " << boundaryCell)
+
+      return delta(fluid, boundary, center);
+   }
+
+   real_t delta(const Vector3< real_t >& fluid, const Vector3< real_t >& boundary, const Vector3< real_t >& center) const
+   {
+      // http://devmag.org.za/2009/04/17/basic-collision-detection-in-2d-part-2/
+      const Vector3< real_t > f = fluid - center;
+      const Vector3< real_t > d = (boundary - center) - f;
+
+      const real_t dDotd = d[0] * d[0] + d[1] * d[1] + d[2] * d[2];
+      const real_t fDotf = f[0] * f[0] + f[1] * f[1] + f[2] * f[2];
+      const real_t dDotf = d[0] * f[0] + d[1] * f[1] + d[2] * f[2];
+
+      const real_t b = real_c(2.0) * dDotf;
+      const real_t c = fDotf - (radius_ * radius_);
+
+      const real_t bb4ac = b * b - (real_c(4.0) * dDotd * c);
+      WALBERLA_CHECK_GREATER_EQUAL(bb4ac, real_c(0.0))
+
+      const real_t sqrtbb4ac = std::sqrt(bb4ac);
+      const real_t alpha = std::min((-b + sqrtbb4ac) / (real_c(2.0) * dDotd), (-b - sqrtbb4ac) / (real_c(2.0) * dDotd));
+
+      WALBERLA_CHECK_GREATER_EQUAL(alpha, real_c(0.0))
+      WALBERLA_CHECK_LESS_EQUAL(alpha, real_c(1.0))
+
+      return alpha;
+   }
+
+
+private:
+   const real_t radius_;
+   const real_t hemispheresDistance_;
+};
+
+real_t defaultWallDistance(const Cell& /*fluidCell*/, const Cell& /*boundaryCell*/, const shared_ptr< StructuredBlockForest >& /*SbF*/, IBlock& /*block*/){
+   return real_c(0.5);
+}
+
+
+
 int main(int argc, char** argv){
    Environment env( argc, argv );
    mpi::MPIManager::instance()->useWorldComm();
@@ -175,25 +234,35 @@ int main(int argc, char** argv){
 
    auto allocator = make_shared< gpu::HostFieldAllocator<macroFieldType> >(); // use pinned memory allocator for faster CPU-GPU memory transfers
    ids.velocityField = field::addToStorage< VelocityField_T >(blocks, "vel", macroFieldType(0.0), field::fzyx, uint_c(1), allocator);
+   ids.meanVelocityField = field::addToStorage< VelocityField_T >(blocks, "mean velocity", macroFieldType(0.0), field::fzyx, uint_c(1), allocator);
    ids.densityField  = field::addToStorage< ScalarField_T >(blocks, "density", macroFieldType(1.0), field::fzyx, uint_c(1), allocator);
    ids.flagField     = field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field");
 
    ids.pdfFieldGPU      = lbm_generated::addGPUPdfFieldToStorage< PdfField_T >(blocks, ids.pdfField, StorageSpec, "pdfs on GPU", true);
    ids.velocityFieldGPU = gpu::addGPUFieldToStorage< VelocityField_T >(blocks, ids.velocityField, "velocity on GPU", true);
+   ids.meanVelocityFieldGPU = gpu::addGPUFieldToStorage< VelocityField_T >(blocks, ids.meanVelocityField, "mean velocity on GPU", true);
    ids.densityFieldGPU  = gpu::addGPUFieldToStorage< ScalarField_T >(blocks, ids.densityField, "velocity on GPU", true);
 
+
    std::function< Vector3< uint_t > (const shared_ptr< StructuredBlockStorage >&, IBlock * const) > getInflowSliceSize = inflowSliceSize;
    ids.velocityFieldInflowSlice    = field::addToStorage< VelocityField_T >(blocks, "velocity Inflow", getInflowSliceSize, macroFieldType(0.0), field::fzyx, uint_c(1));
    ids.velocityFieldInflowSliceGPU = gpu::addGPUFieldToStorage< VelocityField_T >(blocks, ids.velocityFieldInflowSlice, "velocity Inflow on GPU", true);
 
    const Cell innerOuterSplit = Cell(parameters.getParameter< Vector3<cell_idx_t> >("innerOuterSplit", Vector3<cell_idx_t>(1, 1, 1)));
    SweepCollection_T sweepCollection(blocks, ids.pdfFieldGPU, ids.densityFieldGPU, ids.velocityFieldGPU, setup.drivingForce, setup.omega, innerOuterSplit);
+   WelfordSweep_T welfordSweep(ids.meanVelocityFieldGPU, ids.velocityFieldGPU, real_c(0.0));
 
    WALBERLA_LOG_INFO_ON_ROOT(++step << ". Initialising data")
    for (auto& block : *blocks){
       sweepCollection.initialise(&block);
+
+      auto velocityInflow = block.getData< VelocityField_T >(ids.velocityFieldInflowSlice);
+      WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(velocityInflow,
+         velocityInflow->get(x, y, z, 0) = setup.latticeVelocity;
+      )
    }
    sweepCollection.initialiseBlockPointer();
+   gpu::fieldCpy< gpu::GPUField< VelocityField_T::value_type >, VelocityField_T >(blocks, ids.velocityFieldInflowSliceGPU, ids.velocityFieldInflowSlice);
 
    auto inflow = std::make_shared<Inflow>(blocks, setup, ids);
    auto probe = std::make_shared<Probe>(blocks, setup, ids, config->getOneBlock("Probes"));
@@ -202,11 +271,12 @@ int main(int argc, char** argv){
    const real_t res = 3.0;
    const real_t start = 0.0;
 
+/*
    const uint_t nx = uint_c(std::floor(lenght / (setup.dxC * res)));
    for(uint_t i = 0; i < nx; ++i){
       probe->addPoint(Vector3<real_t>(start + real_c(i) * (setup.dxC * res), 0.5, 0.5 ));
    }
-   
+*/ 
    probe->initialise();
 
    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -220,10 +290,13 @@ int main(int argc, char** argv){
       setup.inflowFlag   = flagField->registerFlag(setup.inflowUID);
       setup.outflowFlag  = flagField->registerFlag(setup.outflowUID);
       setup.obstacleFlag = flagField->registerFlag(setup.obstacleUID);
+      setup.noSlipFlag   = flagField->registerFlag(setup.noSlipUID);
       setup.fluidFlag    = flagField->registerFlag(setup.fluidUID);
    }
 
+
    geometry::initBoundaryHandling< FlagField_T >(*blocks, ids.flagField, boundariesConfig);
+   std::function< VelocityField_T::value_type (const Cell&, const Cell&, const shared_ptr< StructuredBlockForest >&, IBlock&) > wallDistanceFunctor = defaultWallDistance;
    if(setup.useGrid){
       WALBERLA_LOG_INFO_ON_ROOT(++step << ". Setting Grid after inflow to produce turbulence structure")
       initGrid(blocks, ids.flagField, setup);
@@ -231,7 +304,12 @@ int main(int argc, char** argv){
 
    if(config->getNumBlocks("RoughWall") > 0){
       WALBERLA_LOG_INFO_ON_ROOT(++step << ". Using RoughWall setup")
+      auto wallParameters = config->getOneBlock("RoughWall");
+      const real_t radius   = wallParameters.getParameter< real_t >("hemispheresRadius");
+      const real_t hemispheresDistance = wallParameters.getParameter< real_t >("hemispheresDistance");
       initHemisphers(blocks, ids.flagField, setup, config->getOneBlock("RoughWall"));
+      HemisphereWallDistance wallDist(radius, hemispheresDistance);
+      wallDistanceFunctor = wallDist;
    }
 
    if(config->getNumBlocks("TPMS") > 0){
@@ -240,9 +318,11 @@ int main(int argc, char** argv){
       tpms->setupBoundary(blocks, ids.flagField);
    }
 
+
    geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, ids.flagField, setup.fluidUID);
-   // BoundaryCollection_T boundaryCollection(blocks, ids.flagField, ids.pdfFieldGPU, fluidFlagUID, latticeVelocity);
-   BoundaryCollection_T boundaryCollection(blocks, ids.flagField, ids.pdfFieldGPU, setup.fluidUID, ids.velocityFieldInflowSliceGPU, ids.pdfField);
+   // BoundaryCollection_T boundaryCollection(blocks, ids.flagField, ids.pdfFieldGPU, setup.fluidUID, ids.velocityFieldInflowSliceGPU, setup.drivingForce);
+   BoundaryCollection_T boundaryCollection(blocks, ids.flagField, ids.pdfFieldGPU, setup.fluidUID, ids.velocityFieldInflowSliceGPU, wallDistanceFunctor, ids.pdfField);
+   // BoundaryCollection_T boundaryCollection(blocks, ids.flagField, ids.pdfFieldGPU, setup.fluidUID, ids.pdfField); 
 
    if(config->getNumBlocks("TPMS") > 0 && loggingParameters.getParameter< bool >("writeSurfaceMeshOfTPMS", false) ){
       WALBERLA_LOG_INFO_ON_ROOT(++step << ". Writing TPMS as surface file")
@@ -251,6 +331,7 @@ int main(int argc, char** argv){
          geometry::writeMesh("TPMS.obj", *mesh);
       }
    }
+
    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    ///                                           COMMUNICATION SCHEME                                             ///
    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -275,6 +356,25 @@ int main(int argc, char** argv){
 
    LBMMeshRefinement.addRefinementToTimeLoop(timeloop, uint_c(0));
 
+   shared_ptr< DragEvaluation > evaluation;
+   if(config->getNumBlocks("RoughWall") > 0){
+      WALBERLA_LOG_INFO_ON_ROOT(++step << ". Setup Drag Evaluation")
+      evaluation = make_shared<DragEvaluation>(blocks, boundaryCollection, ids, setup, config);
+      LBMMeshRefinement.addPostBoundaryHandlingBlockFunction(evaluation->forceCalculationFunctor());
+
+   }
+
+   
+   timeloop.add() << BeforeFunction([&]()
+      {
+         for (auto& block : *blocks){
+            sweepCollection.calculateMacroscopicParameters(&block);
+         }
+         const uint_t velCtr = uint_c(welfordSweep.getCounter());
+         welfordSweep.setCounter(welfordSweep.getCounter() + real_c(1.0));
+      }) << Sweep(welfordSweep.getSweep(), "welford sweep");
+
+
    /*SweepTimeloop timeloop(blocks->getBlockStorage(), setup.timesteps);
    timeloop.add() << BeforeFunction(communication.getCommunicateFunctor(), "communication")
                   << Sweep(boundaryCollection.getSweep(BoundaryCollection_T::ALL, defaultStream), "Boundary Conditions");
@@ -304,12 +404,15 @@ int main(int argc, char** argv){
       vtkOutput->setInitialWriteCallsToSkip(initialWriteCallsToSkip);
 
       vtkOutput->addBeforeFunction([&]() {
+         /*
          for (auto& block : *blocks){
             sweepCollection.calculateMacroscopicParameters(&block);
          }
+         */
 
 #if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
          gpu::fieldCpy< VelocityField_T, gpu::GPUField< VelocityField_T::value_type > >(blocks, ids.velocityField, ids.velocityFieldGPU);
+         gpu::fieldCpy< VelocityField_T, gpu::GPUField< VelocityField_T::value_type > >(blocks, ids.meanVelocityField, ids.meanVelocityFieldGPU);
          gpu::fieldCpy< ScalarField_T, gpu::GPUField< ScalarField_T::value_type > >(blocks, ids.densityField, ids.densityFieldGPU);
 #endif
       });
@@ -342,6 +445,10 @@ int main(int argc, char** argv){
             auto velWriter = make_shared< field::VTKWriter< VelocityField_T, float32 > >(ids.velocityField, "velocity");
             vtkOutput->addCellDataWriter(velWriter);
          }
+      }
+      if (VTKWriter.getParameter< bool >("meanVelocity", false)){
+         auto meanVelWriter = make_shared< field::VTKWriter< VelocityField_T, float32 > >(ids.meanVelocityField, "meanVelocity");
+         vtkOutput->addCellDataWriter(meanVelWriter);
 
       }
       if (VTKWriter.getParameter< bool >("density")){
diff --git a/apps/showcases/PorousMediaGPU/PorousMediaGPU.py b/apps/showcases/PorousMediaGPU/PorousMediaGPU.py
index b617aabc3e060483721d720926c503014e2080ae..6138be6526786178d2d4f16943faaef5074c9268 100644
--- a/apps/showcases/PorousMediaGPU/PorousMediaGPU.py
+++ b/apps/showcases/PorousMediaGPU/PorousMediaGPU.py
@@ -5,11 +5,12 @@ from pystencils.field import fields
 from pystencils.simp import insert_aliases, insert_constants
 
 from lbmpy import LBStencil, LBMConfig, LBMOptimisation
-from lbmpy.boundaries.boundaryconditions import ExtrapolationOutflow, UBB, NoSlip, FixedDensity
+from lbmpy.boundaries.boundaryconditions import NoSlip, NoSlipLinearBouzidi, ExtrapolationOutflow, UBB, FixedDensity
 from lbmpy.creationfunctions import create_lb_collision_rule
 from lbmpy.enums import Method, Stencil, ForceModel
+from lbmpy.flow_statistics import welford_assignments
 
-from pystencils_walberla import CodeGeneration, generate_info_header
+from pystencils_walberla import CodeGeneration, generate_info_header, generate_sweep
 from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator
 
 info_header = """
@@ -24,7 +25,7 @@ omega = sp.symbols("omega")
 inlet_velocity = sp.symbols("u_x")
 force_vector = tuple([sp.Symbol("F_x"), 0.0, 0.0])
 max_threads = 256
- 
+
 with CodeGeneration() as ctx:
     target = Target.GPU if ctx.gpu else Target.CPU
     sweep_params = {'block_size': (128, 1, 1)} if ctx.gpu else {}
@@ -43,11 +44,12 @@ with CodeGeneration() as ctx:
 
     pdfs = fields(f"pdfs({stencil.Q}): {pdf_dtype}[3D]", layout='fzyx')
     velocity_field, density_field = fields(f"velocity({dim}), density(1) : {dtype}[{dim}D]", layout='fzyx')
+    mean_velocity = fields(f"mean_velocity({dim}): {dtype}[{stencil.D}D]", layout='fzyx')
 
     macroscopic_fields = {'density': density_field, 'velocity': velocity_field}
 
-    method_enum = Method.CUMULANT
-    # method_enum = Method.SRT
+    # method_enum = Method.CUMULANT
+    method_enum = Method.SRT
     lbm_config = LBMConfig(
         method=method_enum,
         stencil=stencil,
@@ -58,7 +60,7 @@ with CodeGeneration() as ctx:
         # subgrid_scale_model=SubgridScaleModel.QR,
         fourth_order_correction=0.01 if method_enum == Method.CUMULANT and stencil.Q == 27 else False,
         field_name='pdfs',
-        streaming_pattern=streaming_pattern,
+        streaming_pattern=streaming_pattern
     )
 
     lbm_opt = LBMOptimisation(cse_global=False, cse_pdfs=False, field_layout="fzyx",
@@ -72,26 +74,35 @@ with CodeGeneration() as ctx:
 
     no_slip = lbm_boundary_generator(class_name='NoSlip', flag_uid='NoSlip',
                                      boundary_object=NoSlip(), field_data_type=pdf_dtype)
+ 
+    quadratic_bounce_back = NoSlipLinearBouzidi(calculate_force_on_boundary=True, data_type=dtype)
+    no_slip_interpolated = lbm_boundary_generator(class_name='Obstacle', flag_uid='Obstacle',
+                                                  boundary_object=quadratic_bounce_back,
+                                                  field_data_type=pdf_dtype)
 
     ubb = lbm_boundary_generator(class_name='UBB', flag_uid='UBB',
                                  boundary_object=UBB(velocity=velocity_field.center_vector, density=1.0, data_type=dtype, dim=dim),
                                  field_data_type=pdf_dtype)
 
-    outflow_boundary = ExtrapolationOutflow(stencil[4], lb_method, data_type=pdf_dtype, density=1.0)
+    # outflow_boundary = ExtrapolationOutflow(stencil[4], lb_method, data_type=pdf_dtype, density=1.0)
+    outflow_boundary = ExtrapolationOutflow(stencil[4], lb_method, data_type=pdf_dtype)
     outflow = lbm_boundary_generator(class_name='Outflow', flag_uid='Outflow',
                                      boundary_object=outflow_boundary,
                                      field_data_type=pdf_dtype)
 
     # outflow = lbm_boundary_generator(class_name='Outflow', flag_uid='Outflow',
-    #                                  boundary_object=FixedDensity(1.0),
+    #                                 boundary_object=FixedDensity(1.0),
     #                                  field_data_type=pdf_dtype)
 
     generate_lbm_package(ctx, name="PorousMediaGPU", collision_rule=collision_rule,
                          lbm_config=lbm_config, lbm_optimisation=lbm_opt,
-                         nonuniform=True, boundaries=[no_slip, ubb, outflow],
+                         nonuniform=True, boundaries=[no_slip, no_slip_interpolated, ubb, outflow],
                          macroscopic_fields=macroscopic_fields, gpu_indexing_params=sweep_params,
                          target=target, data_type=dtype, pdfs_data_type=pdf_dtype,
                          max_threads=max_threads)
+    
+    welford_update = welford_assignments(field=velocity_field, mean_field=mean_velocity)
+    generate_sweep(ctx, "Welford", welford_update, target=target)
 
     field_typedefs = {'VelocityField_T': velocity_field,
                       'ScalarField_T': density_field}
diff --git a/apps/showcases/PorousMediaGPU/Probe.cpp b/apps/showcases/PorousMediaGPU/Probe.cpp
index 5e85bb4dea31c2b90544e77b604a193c3346c485..160297ac6dd76b180e7f79b9737127ce1155ae7a 100644
--- a/apps/showcases/PorousMediaGPU/Probe.cpp
+++ b/apps/showcases/PorousMediaGPU/Probe.cpp
@@ -91,8 +91,15 @@ void Probe::writeHeaderFiles(){
     if( filesystem::exists( hdfFilePath ) )
         std::remove( hdfFilePath.string().c_str() );
 
-    h5file_ = io::createHdf5file(hdfFilePath);
-    H5::Group timeGroup = h5file_.createGroup("time");
+
+    H5::FileAccPropList fileAccProps;
+#ifdef WALBERLA_BUILD_WITH_MPI
+    //  Enable MPI-IO
+    H5Pset_fapl_mpio(fileAccProps.getId(), mpi::MPIManager::instance()->comm(), MPI_INFO_NULL);
+#endif
+    h5file_ = std::make_unique<H5::H5File>(hdfFilePath.c_str(), H5F_ACC_CREAT | H5F_ACC_RDWR, H5::FileCreatPropList::DEFAULT, fileAccProps);
+
+    H5::Group timeGroup = h5file_->createGroup("time");
     timeGroup.createDataSet("t", io::h5Float(), arrayDataSpace, props);
 
     uint_t counter = 0;
@@ -107,8 +114,8 @@ void Probe::writeHeaderFiles(){
             blocks_->getBlockLocalCellCenter(*block, localCell, p);
 
             std::string probeName = "Probe" + std::to_string(counter);
-            if (h5file_.exists(probeName)) { continue; }
-            H5::Group probeGroup = h5file_.createGroup(probeName);
+            if (h5file_->exists(probeName)) { continue; }
+            H5::Group probeGroup = h5file_->createGroup(probeName);
 
             hsize_t locationShape[1]{ 3 };
             H5::DataSpace spacingAttrDspace(1, locationShape, locationShape);
@@ -157,7 +164,7 @@ void Probe::writeProbes(){
             for(uint_t j = 0; j < nDatasets; ++j){
                 auto dataSetName = dataSetNames[j];
                 for(uint_t k = 0; k < executionCounter_; ++k){
-                    tmp[k] = float_c(cpuDataVector[k*nDatasets*n + j*n + i]);
+                    tmp[k] = cpuDataVector[k*nDatasets*n + j*n + i];
                 }
                 writeData(tmp, probeName, dataSetName);
             }
@@ -171,7 +178,7 @@ void Probe::writeProbes(){
 void Probe::writeData(const std::vector<float>& data, const std::string& groupName, const std::string& dataSetName){
     const hsize_t selectCount[1] { executionCounter_ };
 
-    auto group = h5file_.openGroup(groupName);
+    auto group = h5file_->openGroup(groupName);
     H5::DataSet dataset = group.openDataSet(dataSetName);
 
     dataset.extend(extendSize);
@@ -182,7 +189,7 @@ void Probe::writeData(const std::vector<float>& data, const std::string& groupNa
     dataset.write(data.data(), io::h5Float(), memSpace, dataspace);
 }
 
-void Probe::update(){
+void Probe::update(){    
     ++totalExecutionCounter_;
     if (probePoints_.empty() || (totalExecutionCounter_ - uint_c(1)) % frequency_ != 0) return;
 
@@ -194,7 +201,7 @@ void Probe::update(){
 
         auto* dataVector = block.getData< DataVector > ( probeDataID );
         int64_t* gpuCellVector = dataVector->gpuCellVector();
-        real_t* gpuDataVector = dataVector->advanceGPUDataVector();
+        float* gpuDataVector = dataVector->advanceGPUDataVector();
 
         const int64_t n = dataVector->size();
 
diff --git a/apps/showcases/PorousMediaGPU/Probe.h b/apps/showcases/PorousMediaGPU/Probe.h
index 25a7dba9c1e34e50c67e84b3edbc91a1cc1e6ec7..7dec08cf8ee025362edd42cf6e61dc45db3ae7d1 100644
--- a/apps/showcases/PorousMediaGPU/Probe.h
+++ b/apps/showcases/PorousMediaGPU/Probe.h
@@ -54,13 +54,13 @@ class Probe
         }
         
         std::vector<int64_t>& cpuCellVector() { return cpuCellVector_; }
-        std::vector<real_t>& cpuDataVector() { return cpuDataVector_; }
+        std::vector<float>& cpuDataVector() { return cpuDataVector_; }
 
         int64_t* gpuCellVector() {return gpuCellVector_;}
-        real_t* gpuDataVector() {return gpuDataVector_;}
+        float* gpuDataVector() {return gpuDataVector_;}
 
-        real_t* advanceGPUDataVector() {
-          real_t* result = cur_;
+        float* advanceGPUDataVector() {
+          float* result = cur_;
           cur_ += int64_c(4) * size();
           return result;
         }
@@ -73,8 +73,8 @@ class Probe
           WALBERLA_GPU_CHECK(gpuMalloc((void **)&gpuCellVector_, sizeof(int64_t) * cpuCellVector_.size() ));
           WALBERLA_GPU_CHECK(gpuMemcpy( gpuCellVector_, &cpuCellVector_[0], sizeof(int64_t) * cpuCellVector_.size(), gpuMemcpyHostToDevice ));
 
-          WALBERLA_GPU_CHECK(gpuMalloc((void **)&gpuDataVector_, sizeof(real_t) * cpuDataVector_.size() ));
-          WALBERLA_GPU_CHECK(gpuMemcpy( gpuDataVector_, &cpuDataVector_[0], sizeof(real_t) * cpuDataVector_.size(), gpuMemcpyHostToDevice ));
+          WALBERLA_GPU_CHECK(gpuMalloc((void **)&gpuDataVector_, sizeof(float) * cpuDataVector_.size() ));
+          WALBERLA_GPU_CHECK(gpuMemcpy( gpuDataVector_, &cpuDataVector_[0], sizeof(float) * cpuDataVector_.size(), gpuMemcpyHostToDevice ));
 
           begin_ = gpuDataVector_;
           cur_ = gpuDataVector_;
@@ -83,18 +83,18 @@ class Probe
         }
 
         void syncCPU(){
-          WALBERLA_GPU_CHECK(gpuMemcpy(&cpuDataVector_[0], gpuDataVector_, sizeof(real_t) * cpuDataVector_.size(), gpuMemcpyDeviceToHost ));
+          WALBERLA_GPU_CHECK(gpuMemcpy(&cpuDataVector_[0], gpuDataVector_, sizeof(float) * cpuDataVector_.size(), gpuMemcpyDeviceToHost ));
         }
         
     private:
         std::vector<int64_t> cpuCellVector_;
-        std::vector<real_t> cpuDataVector_;
+        std::vector<float> cpuDataVector_;
 
         int64_t *gpuCellVector_;
-        real_t *gpuDataVector_;
+        float *gpuDataVector_;
 
-        real_t *begin_ = nullptr;
-        real_t *cur_ = nullptr;
+        float *begin_ = nullptr;
+        float *cur_ = nullptr;
 
         bool synced{false};
   };
@@ -135,7 +135,8 @@ class Probe
   uint_t executionCounter_;
   uint_t totalExecutionCounter_;
 
-  H5::H5File h5file_;
+  std::unique_ptr<H5::H5File> h5file_;
+
   hsize_t extendSize[1];
   hsize_t selectStart[1];
   const std::vector<std::string> dataSetNames{"rho", "ux", "uy", "uz"};
diff --git a/apps/showcases/PorousMediaGPU/Setup.h b/apps/showcases/PorousMediaGPU/Setup.h
index 2a4ce7dd6de8a8124da70a10043ddf3f163dadec..a0a87b5faa874368c4a7d11b8583abf07c0bf554 100644
--- a/apps/showcases/PorousMediaGPU/Setup.h
+++ b/apps/showcases/PorousMediaGPU/Setup.h
@@ -30,8 +30,8 @@
 namespace walberla
 {
 
-inline real_t calculateDrivingForce(const real_t frictionVelocity, const real_t channelHeight){
-   return frictionVelocity * frictionVelocity / channelHeight;
+inline real_t calculateDrivingForce(const real_t frictionVelocity, const real_t channelHalfHeight){
+   return frictionVelocity * frictionVelocity / channelHalfHeight;
 }
 
 struct Setup
@@ -61,35 +61,49 @@ struct Setup
 
       referenceLength   = domainSize[1];
       reynoldsNumber    = parameters.getParameter< real_t >("reynoldsNumber");
-      referenceVelocity = parameters.getParameter< real_t >("referenceVelocity");
       latticeVelocity   = parameters.getParameter< real_t >("latticeVelocity");
 
       useGrid               = turbulence.getParameter< bool >("useGrid");
+      turbulentInflow       = turbulence.getParameter< bool >("turbulentInflow");
       urms                  = turbulence.getParameter< real_t >("U_rms");
       characteristicLength  = turbulence.getParameter< real_t >("characteristicLength");
       numberOfFourrierModes = turbulence.getParameter< uint_t >("numberOfFourrierModes");
       uc                    = turbulence.getParameter< Vector3< real_t > >("U_c");
 
-      const real_t latticeReferenceLength = referenceLength / coarseMeshSize;
-      const real_t speedOfSound = real_c(real_c(1.0) / std::sqrt( real_c(3.0) ));
-      machNumber = latticeVelocity / speedOfSound;
-      viscosity  = real_c((latticeVelocity * latticeReferenceLength) / reynoldsNumber);
-      omega      = real_c(real_c(1.0) / (real_c(3.0) * viscosity + real_c(0.5)));
-
-      rho = real_c(1.0);
-      dxC = coarseMeshSize;
-      dxF = real_c(coarseMeshSize) / real_c( 1 << refinementLevels );
-      dt  = latticeVelocity / referenceVelocity * coarseMeshSize;
-
-      drivingForce = real_c(0.0);
       if(config->getNumBlocks("RoughWall") > 0){
+         kinViscosity = parameters.getParameter< real_t >("viscosity");
+         referenceLength *= real_c(0.5);
+         
+         referenceVelocity = (reynoldsNumber * kinViscosity) / referenceLength;
+         dt  = latticeVelocity / referenceVelocity * coarseMeshSize;
+         viscosity = (kinViscosity * dt) / (coarseMeshSize * coarseMeshSize);
+
+         const real_t latticeReferenceLength = referenceLength / coarseMeshSize;
+         const real_t speedOfSound = real_c(real_c(1.0) / std::sqrt( real_c(3.0) ));
+         machNumber = latticeVelocity / speedOfSound;
+         // viscosity  = real_c((latticeVelocity * latticeReferenceLength) / reynoldsNumber);
+         omega      = real_c(real_c(1.0) / (real_c(3.0) * viscosity + real_c(0.5)));
          drivingForce = calculateDrivingForce(latticeVelocity, latticeReferenceLength);
       }
+      else{
+         referenceVelocity = parameters.getParameter< real_t >("referenceVelocity");
+         const real_t latticeReferenceLength = referenceLength / coarseMeshSize;
+         const real_t speedOfSound = real_c(real_c(1.0) / std::sqrt( real_c(3.0) ));
+         machNumber = latticeVelocity / speedOfSound;
+         viscosity  = real_c((latticeVelocity * latticeReferenceLength) / reynoldsNumber);
+         omega      = real_c(real_c(1.0) / (real_c(3.0) * viscosity + real_c(0.5)));
+         drivingForce = real_c(0.0);
+
+         dt  = latticeVelocity / referenceVelocity * coarseMeshSize;
+         kinViscosity = viscosity * coarseMeshSize * coarseMeshSize / dt;
+      }
 
+      rho = real_c(1.0);
+      dxC = coarseMeshSize;
+      dxF = real_c(coarseMeshSize) / real_c( 1 << refinementLevels );
+      
       conversionFactor = dxC / dt;
 
-      kinViscosity = viscosity * coarseMeshSize * coarseMeshSize / dt;
-
       timesteps = parameters.getParameter< uint_t >("timesteps");
       numGhostLayers = refinementLevels == 0 ? uint_c(1) : uint_c(2);
 
@@ -102,7 +116,8 @@ struct Setup
       collisionModel   = infoMap["collisionOperator"];
 
       fluidUID    = FlagUID("Fluid");
-      obstacleUID = FlagUID("NoSlip");
+      noSlipUID   = FlagUID("NoSlip");
+      obstacleUID = FlagUID("Obstacle");
       inflowUID   = FlagUID("UBB");
       outflowUID  = FlagUID("Outflow");
 
@@ -128,6 +143,7 @@ struct Setup
    real_t latticeVelocity;
 
    bool useGrid;
+   bool turbulentInflow;
    real_t urms;
    real_t characteristicLength;
    uint_t numberOfFourrierModes;
@@ -157,12 +173,14 @@ struct Setup
    std::string collisionModel;
 
    FlagUID fluidUID;
+   FlagUID noSlipUID;
    FlagUID obstacleUID;
    FlagUID inflowUID;
    FlagUID outflowUID;
 
    uint8_t inflowFlag;
    uint8_t outflowFlag;
+   uint8_t noSlipFlag;
    uint8_t obstacleFlag;
    uint8_t fluidFlag;
 
diff --git a/apps/showcases/PorousMediaGPU/TPMS.prm b/apps/showcases/PorousMediaGPU/TPMS.prm
index 98ff6f5b08e4ac98979e7ac346a177528cccfc3e..b7ee4728ec9a9459a5114eb41a1c0ba912e219f0 100644
--- a/apps/showcases/PorousMediaGPU/TPMS.prm
+++ b/apps/showcases/PorousMediaGPU/TPMS.prm
@@ -4,10 +4,10 @@ Parameters
 
     referenceVelocity 1.0;
     referenceLength 1.0;
-    latticeVelocity 0.05;
+    latticeVelocity 0.025;
     initialiseWithInletVelocity true;
 
-    timesteps 10000;
+    timesteps 100001;
 
     processMemoryLimit 512; // MiB
     innerOuterSplit <1, 1, 1>;
@@ -19,11 +19,13 @@ Parameters
 Turbulence
 {
     useGrid false;
-    U_rms  0.01;
-    characteristicLength 0.4;
-    numberOfFourrierModes 128;
-    U_c < 2.0, 0.0, 0.0 >;
+    turbulentInflow true;
+    U_rms  0.025;
+    characteristicLength 2e-4;
+    numberOfFourrierModes 1000;
+    U_c < 1, 1, 1 >;
 }
+
 /*
 TPMS
 {
@@ -39,9 +41,9 @@ TPMS
 DomainSetup
 {
     domainSize    < 8, 1, 1 >;
-    cellsPerBlock < 384, 48, 48 >;
+    // cellsPerBlock < 384, 48, 48 >;
     // cellsPerBlock < 768, 96, 96 >;
-    // cellsPerBlock < 1536, 192, 192 >;
+    cellsPerBlock < 1536, 192, 192 >;
     blocks    < 1, 1, 1 >;
     periodic    < false, true, true >;
     refinementLevels 0;
@@ -66,13 +68,13 @@ StabilityChecker
 
 VTKWriter
 {
-    vtkWriteFrequency 0;
+    vtkWriteFrequency 50000;
     vtkGhostLayers 0;
     velocity true;
     writeVelocityAsMagnitude false;
     density false;
     flag false;
-    writeOnlySlice false;
+    writeOnlySlice true;
     sliceThickness 1;
     writeXZSlice false;
     amrFileFormat false;
@@ -93,6 +95,6 @@ Logging
 Probes
 {
     evaluationFrequency 1;
-    writeCounter 5000; // after how many evaluations the results are written to file
+    writeCounter 1000; // after how many evaluations the results are written to file
     filename probeData.h5;
 }
\ No newline at end of file
diff --git a/apps/showcases/PorousMediaGPU/Types.h b/apps/showcases/PorousMediaGPU/Types.h
index 6ded786456e34f55b15ad79c98f746d22ae5688c..5d2fbe11f65c22752d59d698a3c4dbaa78ca2f3d 100644
--- a/apps/showcases/PorousMediaGPU/Types.h
+++ b/apps/showcases/PorousMediaGPU/Types.h
@@ -32,16 +32,22 @@ using CommunicationStencil_T = StorageSpecification_T::CommunicationStencil;
 using PdfField_T           = lbm_generated::PdfField< StorageSpecification_T >;
 using FlagField_T          = FlagField< uint8_t >;
 
+using BoundaryCollection_T = lbm::PorousMediaGPUBoundaryCollection< FlagField_T >;
+using SweepCollection_T = lbm::PorousMediaGPUSweepCollection;
+using WelfordSweep_T = pystencils::Welford;
+
 struct IDs
 {
    BlockDataID pdfField;
    BlockDataID velocityField;
+   BlockDataID meanVelocityField;
    BlockDataID velocityFieldInflowSlice;
    BlockDataID densityField;
    BlockDataID flagField;
 
    BlockDataID pdfFieldGPU;
    BlockDataID velocityFieldGPU;
+   BlockDataID meanVelocityFieldGPU;
    BlockDataID velocityFieldInflowSliceGPU;
    BlockDataID densityFieldGPU;
 };
\ No newline at end of file
diff --git a/apps/showcases/PorousMediaGPU/roughWall.prm b/apps/showcases/PorousMediaGPU/roughWall.prm
index e85a7a2190f14fef7dcccd8ec36e1ecdcf45013e..c54b0bc6bacfe77c4b5d222d8fbc76048d533048 100644
--- a/apps/showcases/PorousMediaGPU/roughWall.prm
+++ b/apps/showcases/PorousMediaGPU/roughWall.prm
@@ -1,13 +1,10 @@
 Parameters
 {
     reynoldsNumber 400;
-
-    referenceVelocity 1.0;
-    referenceLength 1.0;
+    viscosity 1.48e-5;
     latticeVelocity 0.05;
-    initialiseWithInletVelocity true;
 
-    timesteps 10001;
+    timesteps 5001;
 
     processMemoryLimit 512; // MiB
     innerOuterSplit <1, 1, 1>;
@@ -19,6 +16,7 @@ Parameters
 Turbulence
 {
     useGrid false;
+    turbulentInflow false;
     U_rms  0.01;
     characteristicLength 0.4;
     numberOfFourrierModes 128;
@@ -33,10 +31,11 @@ RoughWall
 
 DomainSetup
 {
-    domainSize    < 60, 40, 12 >;
-    cellsPerBlock < 600, 400, 120 >;
-    // cellsPerBlock < 768, 96, 96 >;
-    // cellsPerBlock < 1536, 192, 192 >;
+    // domainSize    < 6.9, 3.2, 12 >;
+    // cellsPerBlock < 69, 32, 120 >;
+
+    domainSize    < 55.4, 40, 12 >;
+    cellsPerBlock < 554, 400, 120 >;
     blocks    < 1, 1, 1 >;
     periodic    < true, false, false >;
     refinementLevels 0;
@@ -66,6 +65,7 @@ VTKWriter
     velocity true;
     writeVelocityAsMagnitude true;
     density false;
+    meanVelocity true;
     flag false;
     writeOnlySlice true;
     sliceThickness 1;
@@ -89,5 +89,11 @@ Probes
 {
     evaluationFrequency 1;
     writeCounter 1000; // after how many evaluations the results are written to file
-    filename probes;
+    filename probeData.h5;
+}
+
+DragEvaluation
+{
+    evaluationFrequency 1;
+    filename drag.h5;
 }
\ No newline at end of file
diff --git a/apps/showcases/PorousMediaGPU/util.cpp b/apps/showcases/PorousMediaGPU/util.cpp
index cfab8533b63b1bca90d8b2bf11691eb3ed41303e..514d37d3451c97673a641e7fb5ce96d3bf35218c 100644
--- a/apps/showcases/PorousMediaGPU/util.cpp
+++ b/apps/showcases/PorousMediaGPU/util.cpp
@@ -17,6 +17,7 @@
 //! \author Markus Holzer <markus.holzer@fau.de>
 //
 //======================================================================================================================
+#include "core/logging/Logging.h"
 #include "util.h"
 
 namespace walberla
@@ -86,4 +87,47 @@ real_t getKarmanPaoEnergy(const real_t k, const real_t alpha, const real_t kappa
     return a1 * a2 / a3 * a4;
 }
 
+Vector3<real_t> getCenterOfHemisphere(const Vector3<real_t>& p, const real_t hemispheresRadius, const real_t hemispheresDistance,
+                                      const real_t domainYMin, const real_t domainYMax){
+
+    const real_t spaceX     = hemispheresDistance / real_c(2.0) * math::root_three;
+    const real_t zShift     = hemispheresDistance / real_c(2.0);
+    const real_t domainHalf = (domainYMax - domainYMin) / real_c(2.0);
+
+    const real_t xc = std::round(p[0] / spaceX) * spaceX;
+    const real_t yc = p[1] > domainHalf ? real_c(domainYMax) : real_c(domainYMin);
+    const real_t zs = real_c((uint_c(std::round(p[0] / spaceX)) % 2)) * zShift;
+    const real_t zc = (std::round((p[2] - zs) / hemispheresDistance) * (hemispheresDistance)) + zs;
+
+    return Vector3<real_t>(xc, yc, zc);
+}
+
+bool hemispheresContains(const Vector3<real_t>& point, const Vector3<real_t>& center, const real_t radius){
+   return (point - center).sqrLength() <= (radius * radius);
+}
+
+void getAllHemisphereCenters(std::vector<Vector3<real_t>>& centers, const real_t hemispheresRadius, const real_t hemispheresDistance, const math::AABB& domain){
+
+    const real_t spaceX     = hemispheresDistance / real_c(2.0) * math::root_three;
+    const real_t zShift     = hemispheresDistance / real_c(2.0);
+
+    const uint_t nx = uint_c((domain.xMax() - domain.xMin()) / spaceX) + 1;
+    const uint_t nz = uint_c((domain.zMax() - domain.zMin()) / hemispheresDistance) + 1;
+
+    for(uint_t x = 0; x <= nx; ++x){
+        for(uint_t z = 0; z <= nz; ++z){
+            const real_t cx = real_c(x) * spaceX;
+            real_t cz = real_c(z) * hemispheresDistance;
+            if(x%2 != 0){cz += zShift;}
+
+            if(cx > (domain.xMax() + hemispheresRadius) || cz > (domain.zMax() + hemispheresRadius )){
+                continue;
+            }
+
+            centers.emplace_back(Vector3<real_t>(cx, domain.yMin(), cz));
+            centers.emplace_back(Vector3<real_t>(cx, domain.yMax(), cz));
+        }
+    }
+}
+
 } // namespace walberla
\ No newline at end of file
diff --git a/apps/showcases/PorousMediaGPU/util.h b/apps/showcases/PorousMediaGPU/util.h
index 55750c8e4435cec1bf0b67d61c77409f9ab313dc..82e804bfd13f008aec7178fcea11fcb149112567 100644
--- a/apps/showcases/PorousMediaGPU/util.h
+++ b/apps/showcases/PorousMediaGPU/util.h
@@ -23,6 +23,7 @@
 #include "core/math/Constants.h"
 #include "core/math/Vector3.h"
 #include "core/math/Random.h"
+#include "core/math/AABB.h"
 
 namespace walberla{
 Vector3<real_t> summOfFourrierModes(const Vector3<real_t>& p, const Vector3<real_t>& uc,
@@ -30,4 +31,10 @@ Vector3<real_t> summOfFourrierModes(const Vector3<real_t>& p, const Vector3<real
 
 real_t getKarmanPaoEnergy(const real_t k, const real_t alpha, const real_t kappaVKP, const real_t kappaKolmogorov, const real_t uRMS);
 
+Vector3<real_t> getCenterOfHemisphere(const Vector3<real_t>& p, const real_t hemispheresRadius, const real_t hemispheresDistance,
+                                      const real_t domainYMin, const real_t domainYMax);
+
+bool hemispheresContains(const Vector3<real_t>& point, const Vector3<real_t>& center, const real_t radius);
+void getAllHemisphereCenters(std::vector<Vector3<real_t>>& centers, const real_t hemispheresRadius, const real_t hemispheresDistance, const math::AABB& domain);                         
+
 } // namespace walberla
\ No newline at end of file
diff --git a/apps/showcases/PorousMediaGPU/utilGPU.cu b/apps/showcases/PorousMediaGPU/utilGPU.cu
index 75abc7a4d014eb9b2aabf07b482c61dbe8073e1e..abafc7efac81b36868cf6432a48cd3e6599ff0cc 100644
--- a/apps/showcases/PorousMediaGPU/utilGPU.cu
+++ b/apps/showcases/PorousMediaGPU/utilGPU.cu
@@ -68,17 +68,8 @@ static __device__ void densityVelocityOdd(double* const _data_pdfs,
    *uY = u_1;
    *uZ = u_2;
 }
-/*
-static __device__ void warpReduce(volatile double *uX, volatile double *uY, volatile double *uZ, unsigned int tid) 
-{
-   uX[tid] += uX[tid+32]; uY[tid] += uY[tid+32]; uZ[tid] += uZ[tid+32];
-   uX[tid] += uX[tid+16]; uY[tid] += uY[tid+16]; uZ[tid] += uZ[tid+16];
-   uX[tid] += uX[tid+ 8]; uY[tid] += uY[tid+ 8]; uZ[tid] += uZ[tid+ 8];
-   uX[tid] += uX[tid+ 4]; uY[tid] += uY[tid+ 4]; uZ[tid] += uZ[tid+ 4];
-   uX[tid] += uX[tid+ 2]; uY[tid] += uY[tid+ 2]; uZ[tid] += uZ[tid+ 2];
-   uX[tid] += uX[tid+ 1]; uY[tid] += uY[tid+ 1]; uZ[tid] += uZ[tid+ 1];
-}
-*/
+
+
 static __device__ void singleMode(double* rX, double* rY, double* rZ, const double kappaN, const double ut,
                                   const double pX, const double pY, const double pZ, const int64_t linear_index)
 {
@@ -146,21 +137,22 @@ static __global__ void summOfFourrierModesKernel(double * velocity, double * con
                                                  const int64_t ySize, const int64_t zSize,
                                                  const int64_t yStride, const int64_t zStride, const int64_t fStride)
 {
-   if(blockDim.x*blockIdx.x + threadIdx.x < n && blockDim.y*blockIdx.y + threadIdx.y < ySize && blockDim.z*blockIdx.z + threadIdx.z < zSize)
+   if(blockDim.y*blockIdx.y + threadIdx.y < ySize && blockDim.z*blockIdx.z + threadIdx.z < zSize)
    {
       extern __shared__ double sharedMemory[];
 
       double* uPrimeX = &sharedMemory[0];
-      double* uPrimeY = &sharedMemory[n];
-      double* uPrimeZ = &sharedMemory[2*n];
+      double* uPrimeY = &sharedMemory[blockDim.x];
+      double* uPrimeZ = &sharedMemory[2*blockDim.x];
 
-      // const int64_t thx = threadIdx.x;
-      const int64_t x   = blockDim.x*blockIdx.x + threadIdx.x;
+      const int64_t thx = threadIdx.x;
       const int64_t y   = blockDim.y*blockIdx.y + threadIdx.y;
       const int64_t z   = blockDim.z*blockIdx.z + threadIdx.z;
 
-      const int64_t linear_index = x + (y * n) + (z * n * ySize);
-     
+      uPrimeX[thx] = 0.0;
+      uPrimeY[thx] = 0.0;
+      uPrimeZ[thx] = 0.0;
+
       const real_t xCenter = real_t(0.0);
       const real_t yCenter = (yMin + ( real_t( y - nGhostLayer ) + real_t(0.5) ) * dx) + minCornerY;
       const real_t zCenter = (zMin + ( real_t( z - nGhostLayer ) + real_t(0.5) ) * dx) + minCornerZ;
@@ -169,61 +161,46 @@ static __global__ void summOfFourrierModesKernel(double * velocity, double * con
       const real_t pY = yCenter - (t * ucY);
       const real_t pZ = zCenter - (t * ucZ);
 
-      double rX, rY, rZ;
-      singleMode(&rX, &rY, &rZ, kappa[x], ut[x], pX, pY, pZ, linear_index);
+      for(uint_t i = 0; i < n; ++i){
+         const int64_t x = thx + i * blockDim.x;
+         const int64_t linear_index = x + (y * n * blockDim.x) + (z * n * blockDim.x * ySize);
 
-      uPrimeX[x] = rX;
-      uPrimeY[x] = rY;
-      uPrimeZ[x] = rZ;
+         double rX = 0.0;
+         double rY = 0.0;
+         double rZ = 0.0;
 
-      __syncthreads();
+         singleMode(&rX, &rY, &rZ, kappa[x], ut[x], pX, pY, pZ, linear_index);
 
-/*
-      for(unsigned int stride = (blockDim.x/2); stride > 32 ; stride /=2){
-         if(thx < stride){
-            uPrimeX[thx] += uPrimeX[thx + stride];
-            uPrimeY[thx] += uPrimeY[thx + stride];
-            uPrimeZ[thx] += uPrimeZ[thx + stride];
-         }
+         uPrimeX[thx] += rX;
+         uPrimeY[thx] += rY;
+         uPrimeZ[thx] += rZ;
       }
 
-      if(thx < 32){
-         warpReduce(uPrimeX, uPrimeY, uPrimeZ, thx);
-      }
+      __syncthreads();
 
-      if(thx == 0){
-         velocity[yStride*y + zStride*z + 0*fStride] = uPrimeX[0];
-         velocity[yStride*y + zStride*z + 1*fStride] = uPrimeY[0];
-         velocity[yStride*y + zStride*z + 2*fStride] = uPrimeZ[0];
-      }     
-*/
-      
-      if(x == 0){
-         real_t resultX = 0.0;
-         real_t resultY = 0.0;
-         real_t resultZ = 0.0;
-
-         for(int64_t i = 0; i < n; ++i){
-            resultX += uPrimeX[i];
-            resultY += uPrimeY[i];
-            resultZ += uPrimeZ[i];
+      for (uint_t s = 1; s < blockDim.x; s *= 2) {
+         int index = 2 * s * thx;
+         if (index < blockDim.x) {
+            uPrimeX[index] += uPrimeX[index + s];
+            uPrimeY[index] += uPrimeY[index + s];
+            uPrimeZ[index] += uPrimeZ[index + s];
          }
-
-         velocity[yStride*y + zStride*z + 0*fStride] = latticeVelocity + resultX;
-         velocity[yStride*y + zStride*z + 1*fStride] = resultY;
-         velocity[yStride*y + zStride*z + 2*fStride] = resultZ;
+      __syncthreads();
       }
+
+      velocity[yStride*y + zStride*z + 0*fStride] = latticeVelocity + uPrimeX[0];
+      velocity[yStride*y + zStride*z + 1*fStride] = uPrimeY[0];
+      velocity[yStride*y + zStride*z + 2*fStride] = uPrimeZ[0];
    }
 }
 
 
-static __global__ void evaluateProbePoints(double* const pdfs, int64_t* cellData, double* probeData,
+static __global__ void evaluateProbePoints(double* const pdfs, int64_t* cellData, float* probeData,
                                            const int64_t n, const uint8_t timestep, const real_t velocityScalingFactor,
                                            const int64_t xStride, const int64_t yStride, const int64_t zStride, const int64_t fStride)
 {
    if(blockDim.x*blockIdx.x + threadIdx.x < n)
    {
-
       const int64_t ctr = blockDim.x*blockIdx.x + threadIdx.x;
 
       const int64_t x = cellData[ctr];
@@ -239,10 +216,10 @@ static __global__ void evaluateProbePoints(double* const pdfs, int64_t* cellData
          densityVelocityOdd(pdfs, &rho, &uX, &uY, &uZ, x, y, z, xStride, yStride, zStride, fStride);
       }
 
-      probeData[ctr] = rho;
-      probeData[ctr + n] = velocityScalingFactor * uX;
-      probeData[ctr + 2*n] = velocityScalingFactor * uY;
-      probeData[ctr + 3*n] = velocityScalingFactor * uZ;
+      probeData[ctr] = ((float)(rho));
+      probeData[ctr + n] = ((float)(velocityScalingFactor)) * ((float)(uX));
+      probeData[ctr + 2*n] = ((float)(velocityScalingFactor)) * ((float)(uY));
+      probeData[ctr + 3*n] = ((float)(velocityScalingFactor)) * ((float)(uZ));
    }
 }
 
@@ -274,8 +251,11 @@ void updateVelocityPlane(gpu::GPUField<real_t>* velocityGPU, real_t* kappa, real
    dim3 block_(block[0], block[1], block[2]);
    dim3 grid_(grid[0], grid[1], grid[2]);
 
+   WALBERLA_ASSERT_EQUAL(grid[0], 0, "The summOfFourrierModesKernel kernel contains a reduction which is solved using a loop inside the block of threads. This does not work with multiple threads.")
+
    // In shared memory we store the result of the u_prime (x, y, z)
-   const uint_t sharedMemorySize = 3 * n * sizeof(real_t);
+   const uint_t sharedMemorySize = 3 * block[0] * sizeof(real_t);
+   const uint_t factor = n / block[0];
  
    double * data_velocity = velocityGPU->dataAt(-nGhostLayers, -nGhostLayers, -nGhostLayers, 0);
    const int64_t size_velocity_1 = int64_c(velocityGPU->ySize()) + 2 * nGhostLayers;
@@ -285,13 +265,13 @@ void updateVelocityPlane(gpu::GPUField<real_t>* velocityGPU, real_t* kappa, real
    const int64_t stride_velocity_3 = int64_c(velocityGPU->fStride());
    internal::summOfFourrierModesKernel<<<grid_, block_, sharedMemorySize, nullptr>>>(data_velocity, kappa, ut,
                                                                       globalMinCorner[1], globalMinCorner[2], blockMinCorner[1], blockMinCorner[2],
-                                                                      dx, uc[0], uc[1], uc[2], t, n, latticeVelocity, nGhostLayers,
+                                                                      dx, uc[0], uc[1], uc[2], t, factor, latticeVelocity, nGhostLayers,
                                                                       size_velocity_1, size_velocity_2,
                                                                       stride_velocity_1, stride_velocity_2, stride_velocity_3);
 
 }
 
-void updateProbe(gpu::GPUField<real_t>* pdfs, int64_t* cellData, real_t* probeData, const int64_t n, const real_t velocityScalingFactor,
+void updateProbe(gpu::GPUField<real_t>* pdfs, int64_t* cellData, float* probeData, const int64_t n, const real_t velocityScalingFactor,
                  const Vector3<uint32_t>& block, const Vector3<uint32_t>& grid){
 
    dim3 block_(block[0], block[1], block[2]);
diff --git a/apps/showcases/PorousMediaGPU/utilGPU.h b/apps/showcases/PorousMediaGPU/utilGPU.h
index cdaf6ff40f4404472522bd9beb41d8eac8bb3c00..6b0ac4cf9ce1c3cca0f3303b842bcb50888b0d90 100644
--- a/apps/showcases/PorousMediaGPU/utilGPU.h
+++ b/apps/showcases/PorousMediaGPU/utilGPU.h
@@ -34,7 +34,7 @@ void updateVelocityPlane(gpu::GPUField<real_t>* velocityGPU, real_t* kappa, real
                          const real_t dx, const real_t t, const uint_t n, const real_t latticeVelocity, const int64_t nGhostLayers,
                          const Vector3<uint32_t>& block, const Vector3<uint32_t>& grid);
 
-void updateProbe(gpu::GPUField<real_t>* pdfs, int64_t* cellData, real_t* probeData, const int64_t n, const real_t velocityScalingFactor,
+void updateProbe(gpu::GPUField<real_t>* pdfs, int64_t* cellData, float* probeData, const int64_t n, const real_t velocityScalingFactor,
                  const Vector3<uint32_t>& block, const Vector3<uint32_t>& grid);
 
 } // namespace walberla
\ No newline at end of file
diff --git a/python/lbmpy_walberla/additional_data_handler.py b/python/lbmpy_walberla/additional_data_handler.py
index b6d8a98f4457691a34f4f63b36045711adc6b606..9b04df4fdbd541be42ce1d3d27370d678a7a53de 100644
--- a/python/lbmpy_walberla/additional_data_handler.py
+++ b/python/lbmpy_walberla/additional_data_handler.py
@@ -17,11 +17,12 @@ from pystencils_walberla.additional_data_handler import AdditionalDataHandler
 
 interpolation_bc_check_template = """
 if(!isFlagSet(it.neighbor({cx}, {cy}, {cz}, 0), domainFlag)){{ 
-   //Linear-Bouzidi requires 2 fluid nodes: if the 2nd node is not available abort, 
-   //apply Bounce Back at that point. This clearly lowers the accuracy and makes inconsistent the
-   //calculation of the total force
+   // Linear-Bouzidi requires 2 fluid nodes: if the 2nd node is available apply Bounce Back at that point.
+   // This lowers the accuracy and leads to inconsitencies in force calculation
    element.q = -1.0;
-   WALBERLA_LOG_INFO_ON_ROOT("Warning: Bouzidi cannot be applied at least on one boundary link.")
+   #ifdef WALBERLA_LOGLEVEL_DETAIL
+   WALBERLA_LOG_WARNING_ON_ROOT("Bouzidi cannot be applied at cell: " << it.cell() << " on link: " << {f})
+   #endif
 }} //end if to check Bouzidi applicability  
 """
 
@@ -184,7 +185,7 @@ class NoSlipLinearBouzidiAdditionalDataHandler(AdditionalDataHandler):
         cz = self._walberla_stencil[direction][2]
         fluid_cell = "Cell(it.x(), it.y(), it.z())"
         boundary_cell = f"Cell(it.x() + {cx}, it.y() + {cy}, it.z() + {cz})"
-        check_str = interpolation_bc_check_template.format(cx=-cx, cy=-cy, cz=-cz)
+        check_str = interpolation_bc_check_template.format(cx=-cx, cy=-cy, cz=-cz, f=direction)
         init_element = f"elementInitialiser({fluid_cell}, {boundary_cell}, blocks, *block)"
         init_list = [f"const {self._dtype} q = (({self._dtype}) {init_element});",
                      "element.q = q;",