From 6138d5b86cb9044cb3898994d443a8c0c85b5069 Mon Sep 17 00:00:00 2001
From: Markus Holzer <mholzer@ldmpe865h.onecert.fr>
Date: Fri, 21 Mar 2025 16:05:17 +0100
Subject: [PATCH] [skip ci] Update

---
 CMakeLists.txt                                |   1 +
 apps/showcases/PorousMediaGPU/CMakeLists.txt  |   5 +
 .../PorousMediaGPU/DragEvaluation.cpp         | 170 ++++++++++--------
 .../showcases/PorousMediaGPU/DragEvaluation.h |   3 +
 .../PorousMediaGPU/InitializerFunctions.cpp   |  35 ++++
 apps/showcases/PorousMediaGPU/JinCase.prm     |  43 ++---
 .../PorousMediaGPU/PorousMediaGPU.cpp         | 166 ++++++++++-------
 apps/showcases/PorousMediaGPU/Probe.cpp       |  97 +++++-----
 apps/showcases/PorousMediaGPU/Probe.h         |   5 +-
 apps/showcases/PorousMediaGPU/Setup.h         |  21 ++-
 apps/showcases/PorousMediaGPU/TPMS.prm        |  16 +-
 apps/showcases/PorousMediaGPU/roughWall.prm   |   1 +
 apps/showcases/PorousMediaGPU/utilGPU.cu      |  12 +-
 apps/showcases/PorousMediaGPU/utilGPU.h       |  45 ++++-
 14 files changed, 388 insertions(+), 232 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7b21f6268..ec69c641d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1069,6 +1069,7 @@ if ( WALBERLA_BUILD_WITH_CUDA )
         endif()
 
         enable_language(CUDA)
+        find_package(CUDAToolkit REQUIRED)
 
         #include directories and cudart lib is needed for cpp files that use cuda headers/libs
         include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
diff --git a/apps/showcases/PorousMediaGPU/CMakeLists.txt b/apps/showcases/PorousMediaGPU/CMakeLists.txt
index 9b7a4e28a..b97dc1247 100644
--- a/apps/showcases/PorousMediaGPU/CMakeLists.txt
+++ b/apps/showcases/PorousMediaGPU/CMakeLists.txt
@@ -23,6 +23,11 @@ if (WALBERLA_BUILD_WITH_CUDA OR WALBERLA_BUILD_WITH_HIP)
     waLBerla_add_executable ( NAME PorousMediaGPU
             FILES PorousMediaGPU.cpp GridGeneration.h Setup.h Types.h DragEvaluation.cpp InitializerFunctions.cpp TPMS.cpp Inflow.cpp Probe.cpp util.cpp utilGPU.cu
             DEPENDS blockforest boundary core field gpu io lbm_generated postprocessing stencil timeloop vtk PorousMediaGPUCodeGen )
+
+if(TARGET CUDA::curand)
+    target_link_libraries(PorousMediaGPU CUDA::curand)
+endif()
+
 else()
     waLBerla_add_executable ( NAME PorousMediaGPU
             FILES PorousMediaGPU.cpp GridGeneration.h Setup.h Types.h DragEvaluation.cpp InitializerFunctions.cpp TPMS.cpp Inflow.cpp Probe.cpp util.cpp
diff --git a/apps/showcases/PorousMediaGPU/DragEvaluation.cpp b/apps/showcases/PorousMediaGPU/DragEvaluation.cpp
index 0617986e5..8d84b7906 100644
--- a/apps/showcases/PorousMediaGPU/DragEvaluation.cpp
+++ b/apps/showcases/PorousMediaGPU/DragEvaluation.cpp
@@ -61,29 +61,28 @@ DragEvaluation::DragEvaluation(std::shared_ptr< StructuredBlockForest >& blocks,
    {
       if( filesystem::exists( hdfFilePath ) )
          std::remove( hdfFilePath.string().c_str() );
-   }
 
-   extendSize[0] = 0;
-   selectStart[0] = 0;
-   H5::FileAccPropList fileAccProps;
-#ifdef WALBERLA_BUILD_WITH_MPI
-   //  Enable MPI-IO
-   H5Pset_fapl_mpio(fileAccProps.getId(), mpi::MPIManager::instance()->comm(), MPI_INFO_NULL);
-#endif
-   h5file_ = std::make_unique<H5::H5File>(hdfFilePath.c_str(), H5F_ACC_CREAT | H5F_ACC_RDWR, H5::FileCreatPropList::DEFAULT, fileAccProps);
-   H5::Group dataGroup = h5file_->createGroup("Data");
+      extendSize[0] = 0;
+      selectStart[0] = 0;
+      H5::FileAccPropList fileAccProps;
+      h5file_ = std::make_unique<H5::H5File>(hdfFilePath.c_str(), H5F_ACC_CREAT | H5F_ACC_RDWR, H5::FileCreatPropList::DEFAULT, fileAccProps);
+      H5::Group dataGroup = h5file_->createGroup("Data");
 
-   const hsize_t dsetSize[1]{ 1 };
-   const hsize_t maxSize[1]{ H5S_UNLIMITED };
-   const hsize_t chunkSize[1]{writeCounter_};
+      const hsize_t dsetSize[1]{ 1 };
+      const hsize_t maxSize[1]{ H5S_UNLIMITED };
+      const hsize_t chunkSize[1]{writeCounter_};
 
-   H5::DataSpace arrayDataSpace(1, dsetSize, maxSize);
-   H5::DSetCreatPropList props;
-   props.setChunk(1, chunkSize);
+      H5::DataSpace arrayDataSpace(1, dsetSize, maxSize);
+      H5::DSetCreatPropList props;
+      props.setChunk(1, chunkSize);
+
+      for (auto quantitie = quantities_.begin(); quantitie != quantities_.end(); quantitie++){
+         dataGroup.createDataSet(quantitie->first, io::h5Float(), arrayDataSpace, props);
+      }
 
-   for (auto quantitie = quantities_.begin(); quantitie != quantities_.end(); quantitie++){
-      dataGroup.createDataSet(quantitie->first, io::h5Float(), arrayDataSpace, props);
    }
+   WALBERLA_MPI_BARRIER()
+
    dragResults_.resize(quantities_.size() * writeCounter_);
 
    auto domain = blocks->getDomain();
@@ -142,7 +141,10 @@ void DragEvaluation::forceCalculation(const uint_t level)
          force_ += Vector3<double>(boundaryCollection_.ObstacleObject->getForce(b));
       }
 
-      mpi::reduceInplace(force_, mpi::SUM);
+      // Gather force on root rank and distrute result to all other processes
+      mpi::reduceInplace(force_, mpi::SUM, 0, MPIManager::instance()->comm());
+      mpi::broadcastObject(force_, 0, MPIManager::instance()->comm());
+
       WALBERLA_ROOT_SECTION(){
          const double t = double_c(totalExecutionCounter_ - 1) * dt_;
          const double meanU2 = double_c(meanVelocity_) * double_c(meanVelocity_);
@@ -167,7 +169,9 @@ void DragEvaluation::forceCalculation(const uint_t level)
       ++meanQuantitiesCounter_;
       ++executionCounter_;
       if(executionCounter_ >= writeCounter_ || totalExecutionCounter_ == timesteps_){
-         writeResults();
+         WALBERLA_ROOT_SECTION(){
+            writeResults();
+         }
       }
    }
 }
@@ -209,49 +213,49 @@ Plotter::Plotter(std::shared_ptr< StructuredBlockForest >& blocks, const Setup&
    {
       if( filesystem::exists( hdfFilePath ) )
          std::remove( hdfFilePath.string().c_str() );
-   }
 
-   H5::FileAccPropList fileAccProps;
-#ifdef WALBERLA_BUILD_WITH_MPI
-   //  Enable MPI-IO
-   H5Pset_fapl_mpio(fileAccProps.getId(), mpi::MPIManager::instance()->comm(), MPI_INFO_NULL);
-#endif
-   h5file_ = std::make_unique<H5::H5File>(hdfFilePath.c_str(), H5F_ACC_CREAT | H5F_ACC_RDWR, H5::FileCreatPropList::DEFAULT, fileAccProps);
-   H5::Group dataGroup = h5file_->createGroup("Data");
+      H5::FileAccPropList fileAccProps;
+      h5file_ = std::make_unique<H5::H5File>(hdfFilePath.c_str(), H5F_ACC_CREAT | H5F_ACC_RDWR, H5::FileCreatPropList::DEFAULT, fileAccProps);
+      H5::Group dataGroup = h5file_->createGroup("Data");
+   }
 }
 
 void Plotter::writeResult(const real_t frictionVelocity, const real_t welfordCounter){
 
-   auto finalDomain = blocks_->getDomain();
-
-   std::map<real_t, uint_t> centerToIntex;
-   uint_t idx = 0;
-
-   Vector3<real_t> p = finalDomain.min();
-   IBlock* block = blocks_->getBlock(p);
-   while(block){
-      const uint_t ny = blocks_->getNumberOfYCells(*block);
+   std::set<real_t> localYCoordinates;
+   for (auto& block : *blocks_){
+      const uint_t ny = blocks_->getNumberOfYCells(block);
       for(uint_t y = 0; y < ny; ++y){
-            const Cell localCell(cell_idx_c(0), cell_idx_c(y), cell_idx_c(0));
-            Vector3<real_t> globalCellCenter;
-            blocks_->getBlockLocalCellCenter(*block, localCell, globalCellCenter);
-            centerToIntex[globalCellCenter[1]] = idx;
-            idx++;
-      }
-      p[1] = block->getAABB().yMax() + setup_.dxF;
-      block = blocks_->getBlock(p);
-      if(block == nullptr){
-         WALBERLA_CHECK_GREATER(p[1], finalDomain.yMax(), "This method does is not yet implemented for MPI parallel simulations")
+         const Cell localCell(cell_idx_c(0), cell_idx_c(y), cell_idx_c(0));
+         Vector3<real_t> globalCellCenter;
+         blocks_->getBlockLocalCellCenter(block, localCell, globalCellCenter);
+         localYCoordinates.insert(globalCellCenter[1]);
       }
    }
 
-   const real_t viscosity = setup_.viscosity;
+   std::vector<real_t> allYCoordinates(localYCoordinates.begin(), localYCoordinates.end());
+
+   // gather all coordinates on the root rank and broadcast to all other ranks
+   allYCoordinates = mpi::gatherv(allYCoordinates, 0, MPIManager::instance()->comm());
+   mpi::broadcastObject(allYCoordinates, 0, MPIManager::instance()->comm());
+
+   std::sort( allYCoordinates.begin(), allYCoordinates.end() );
+   allYCoordinates.erase( std::unique( allYCoordinates.begin(), allYCoordinates.end() ), allYCoordinates.end() );
+
+   std::map<real_t, uint_t> centerToIntex;
+   for(uint_t idx = 0; idx < allYCoordinates.size(); ++idx){
+      centerToIntex[allYCoordinates[idx]] = idx;
+   }
+
+   const real_t viscosity = setup_.kinViscosity;
    const real_t channelHalfHeight = setup_.referenceLength;
 
-   addAttribute(viscosity, "viscosity");
-   addAttribute(channelHalfHeight, "channelHalfHeight");
-   addAttribute(frictionVelocity, "frictionVelocity");
-   addAttribute(welfordCounter, "welfordCounter");
+   WALBERLA_ROOT_SECTION(){
+      addAttribute(viscosity, "viscosity");
+      addAttribute(channelHalfHeight, "channelHalfHeight");
+      addAttribute(frictionVelocity, "frictionVelocity");
+      addAttribute(welfordCounter, "welfordCounter");
+   }
 
    std::vector<real_t> yCellCenter(centerToIntex.size(), real_c(0.0));
    std::vector<real_t> yRel(centerToIntex.size(), real_c(0.0));
@@ -285,7 +289,7 @@ void Plotter::writeResult(const real_t frictionVelocity, const real_t welfordCou
             WALBERLA_ABORT("y coordinate of center to index map was not contained")
          }
 
-         idx = centerToIntex[globalCellCenter[1]];
+         const uint_t idx = centerToIntex[globalCellCenter[1]];
          cellCounterI[idx]++;
 
          if(isFlagSet(flagField->get(*cellIt), domainFlag)){
@@ -294,40 +298,50 @@ void Plotter::writeResult(const real_t frictionVelocity, const real_t welfordCou
             meanVelocityI[idx] += meanVelocityField->get(*cellIt, 0);
 
             for(uint_t i = 0; i < TensorField_T::F_SIZE; ++i) {
-               reynoldsStressVector[i * centerToIntex.size() + idx] = sosField->get(*cellIt, i) / welfordCounter;
+               reynoldsStressVector[i * centerToIntex.size() + idx] += sosField->get(*cellIt, i) / welfordCounter;
             }
          }
       }
    }
 
-   for(uint_t n = 0; n < centerToIntex.size(); ++n){
-      meanVelocityI[n] /= real_c(cellCounterI[n]);
-      meanVelocityE[n] /= real_c(cellCounterE[n]);
+   mpi::reduceInplace(cellCounterI, mpi::SUM, 0, MPIManager::instance()->comm());
+   mpi::reduceInplace(cellCounterE, mpi::SUM, 0, MPIManager::instance()->comm());
+   mpi::reduceInplace(meanVelocityE, mpi::SUM, 0, MPIManager::instance()->comm());
+   mpi::reduceInplace(meanVelocityI, mpi::SUM, 0, MPIManager::instance()->comm());
+   mpi::reduceInplace(reynoldsStressVector, mpi::SUM, 0, MPIManager::instance()->comm());
+
 
-      for(uint_t i = 0; i < TensorField_T::F_SIZE; ++i) {
-         reynoldsStressVector[i * centerToIntex.size() + n] /= real_c(cellCounterE[n]);
+   WALBERLA_ROOT_SECTION(){
+      for(uint_t n = 0; n < centerToIntex.size(); ++n){
+         meanVelocityI[n] /= real_c(cellCounterI[n]);
+         meanVelocityE[n] /= real_c(cellCounterE[n]);
+
+         for(uint_t i = 0; i < TensorField_T::F_SIZE; ++i) {
+            reynoldsStressVector[i * centerToIntex.size() + n] /= real_c(cellCounterE[n]);
+         }
+
+         uPlusI[n] = meanVelocityI[n] / frictionVelocity;
+         uPlusE[n] = meanVelocityE[n] / frictionVelocity;
       }
 
-      uPlusI[n] = meanVelocityI[n] / frictionVelocity;
-      uPlusE[n] = meanVelocityE[n] / frictionVelocity;
-   }
+      writeData(yCellCenter.data(), centerToIntex.size(), "y");
+      writeData(yRel.data(), centerToIntex.size(), "yRel");
+      writeData(yPlus.data(), centerToIntex.size(), "yPlus");
+      writeData(meanVelocityI.data(), centerToIntex.size(), "meanVelocityI");
+      writeData(meanVelocityE.data(), centerToIntex.size(), "meanVelocityE");
+      writeData(uPlusI.data(), centerToIntex.size(), "uPlusI");
+      writeData(uPlusE.data(), centerToIntex.size(), "uPlusE");
+      writeData(&reynoldsStressVector[0 * centerToIntex.size()], centerToIntex.size(), "uu_rms");
+      writeData(&reynoldsStressVector[1 * centerToIntex.size()], centerToIntex.size(), "uv_rms");
+      writeData(&reynoldsStressVector[2 * centerToIntex.size()], centerToIntex.size(), "uw_rms");
+      writeData(&reynoldsStressVector[3 * centerToIntex.size()], centerToIntex.size(), "vu_rms");
+      writeData(&reynoldsStressVector[4 * centerToIntex.size()], centerToIntex.size(), "vv_rms");
+      writeData(&reynoldsStressVector[5 * centerToIntex.size()], centerToIntex.size(), "vw_rms");
+      writeData(&reynoldsStressVector[6 * centerToIntex.size()], centerToIntex.size(), "wu_rms");
+      writeData(&reynoldsStressVector[7 * centerToIntex.size()], centerToIntex.size(), "wv_rms");
+      writeData(&reynoldsStressVector[8 * centerToIntex.size()], centerToIntex.size(), "ww_rms");
 
-   writeData(yCellCenter.data(), centerToIntex.size(), "y");
-   writeData(yRel.data(), centerToIntex.size(), "yRel");
-   writeData(yPlus.data(), centerToIntex.size(), "yPlus");
-   writeData(meanVelocityI.data(), centerToIntex.size(), "meanVelocityI");
-   writeData(meanVelocityE.data(), centerToIntex.size(), "meanVelocityE");
-   writeData(uPlusI.data(), centerToIntex.size(), "uPlusI");
-   writeData(uPlusE.data(), centerToIntex.size(), "uPlusE");
-   writeData(&reynoldsStressVector[0 * centerToIntex.size()], centerToIntex.size(), "uu_rms");
-   writeData(&reynoldsStressVector[1 * centerToIntex.size()], centerToIntex.size(), "uv_rms");
-   writeData(&reynoldsStressVector[2 * centerToIntex.size()], centerToIntex.size(), "uw_rms");
-   writeData(&reynoldsStressVector[3 * centerToIntex.size()], centerToIntex.size(), "vu_rms");
-   writeData(&reynoldsStressVector[4 * centerToIntex.size()], centerToIntex.size(), "vv_rms");
-   writeData(&reynoldsStressVector[5 * centerToIntex.size()], centerToIntex.size(), "vw_rms");
-   writeData(&reynoldsStressVector[6 * centerToIntex.size()], centerToIntex.size(), "wu_rms");
-   writeData(&reynoldsStressVector[7 * centerToIntex.size()], centerToIntex.size(), "wv_rms");
-   writeData(&reynoldsStressVector[8 * centerToIntex.size()], centerToIntex.size(), "ww_rms");
+   }
 }
 
 void Plotter::writeData(const real_t* data, const uint_t size, const std::string& dataSetName){
diff --git a/apps/showcases/PorousMediaGPU/DragEvaluation.h b/apps/showcases/PorousMediaGPU/DragEvaluation.h
index 4d621325b..a344180b4 100644
--- a/apps/showcases/PorousMediaGPU/DragEvaluation.h
+++ b/apps/showcases/PorousMediaGPU/DragEvaluation.h
@@ -24,6 +24,9 @@
 #include "core/Filesystem.h"
 #include "core/math/Sample.h"
 #include "core/math/Vector3.h"
+#include "core/mpi/Broadcast.h"
+#include "core/mpi/Gatherv.h"
+#include "core/mpi/Reduce.h"
 
 #include "lbm_generated/field/PdfField.h"
 
diff --git a/apps/showcases/PorousMediaGPU/InitializerFunctions.cpp b/apps/showcases/PorousMediaGPU/InitializerFunctions.cpp
index fcae3b718..6c45517a9 100644
--- a/apps/showcases/PorousMediaGPU/InitializerFunctions.cpp
+++ b/apps/showcases/PorousMediaGPU/InitializerFunctions.cpp
@@ -94,6 +94,41 @@ void initHemisphers(const shared_ptr< StructuredBlockStorage >& blocks, const Bl
    }
 }
 
+void initGenericPorousMatrix(const shared_ptr< StructuredBlockStorage >& blocks, const BlockDataID flagFieldID, const Setup& setup,
+                             const Config::BlockHandle& gpmParameters){
+
+   const real_t poreSize     = gpmParameters.getParameter< real_t >("poreSize");
+   const real_t barSize      = gpmParameters.getParameter< real_t >("barSize");
+   const real_t halfPoreSize = poreSize * real_c(0.5);
+   const real_t PoreSize2    = poreSize * real_c(2.0);
+   const real_t halfbarSize  = barSize * real_c(0.5);
+
+   auto domain = blocks->getDomain();
+   const real_t zMin = domain.zMin();
+   const real_t zMax = domain.zMax();
+
+   for (auto bIt = blocks->begin(); bIt != blocks->end(); ++bIt){
+      Block& b             = dynamic_cast< Block& >(*bIt);
+      const auto flagField    = b.getData< FlagField_T >(flagFieldID);
+      const auto boundaryFlag = flagField->getOrRegisterFlag(setup.noSlipUID);
+
+      for( auto it = flagField->beginWithGhostLayer(0); it != flagField->end(); ++it ){
+         Vector3< real_t > cellCenter = blocks->getBlockLocalCellCenter( b, it.cell() );
+         blocks->mapToPeriodicDomain(cellCenter);
+
+         const real_t xc = std::round(cellCenter[0] / poreSize) * poreSize;
+         const real_t ys = real_c((uint_c(std::round(cellCenter[0] / poreSize)) % 2)) * poreSize;
+         const real_t yc = (std::round((cellCenter[1] - ys) / PoreSize2) * (PoreSize2)) + ys;
+
+         const AABB barAABB(xc - halfbarSize, yc - halfbarSize, zMin, xc + halfbarSize, yc + halfbarSize, zMax);
+         if(barAABB.contains(cellCenter)) {
+            addFlag(flagField->get(it.x(), it.y(), it.z()), boundaryFlag);
+         }
+         
+      }
+   }
+}
+
 void setVelocityFieldsAsmuth(const shared_ptr< StructuredBlockStorage >& blocks,
                              const Setup& setup, const IDs& ids) {
 
diff --git a/apps/showcases/PorousMediaGPU/JinCase.prm b/apps/showcases/PorousMediaGPU/JinCase.prm
index 8215f12bf..6ace3c854 100644
--- a/apps/showcases/PorousMediaGPU/JinCase.prm
+++ b/apps/showcases/PorousMediaGPU/JinCase.prm
@@ -1,16 +1,13 @@
 Parameters
 {
-    reynoldsNumber 400;
-    bulkReynoldsNumber 4466;
+    reynoldsNumber 700;
     latticeVelocity 0.05;
     initialiseWithlatticeVelocity true;
-    initialiseWithTurbulence true;
-    forceUpdateFrequency 10;
+    initialiseWithTurbulence false;
+    forceUpdateFrequency 0;
+    pressureGradient 0.188;
 
-    welfordSamplingStart 200000;
-    welfordSamplingFrequency 1;
-
-    timesteps 10;
+    timesteps 100001;
 
     processMemoryLimit 512; // MiB
     innerOuterSplit <1, 1, 1>;
@@ -29,23 +26,19 @@ Turbulence
     U_c < 2.0, 0.0, 0.0 >;
 }
 
-RoughWall
+genericPorousMatrix 
 {
-    hemispheresRadius 1;
-    hemispheresDistance 4;
+    poreSize 2;
+    barSize 1;
 }
 
 DomainSetup
 {
-    domainSize    < 48.4, 40, 12 >;
-    cellsPerBlock < 121, 10, 30 >;
-    blocks    < 1, 10, 1 >;
-
-    // domainSize    < 48.4, 40, 12 >;
-    // cellsPerBlock < 242, 20, 60 >;
-    // blocks    < 1, 10, 1 >;
+    domainSize    < 12, 8, 4 >;
+    cellsPerBlock < 960, 640, 320 >;
+    blocks    < 1, 1, 1 >;
 
-    periodic    < true, false, true >;
+    periodic    < true, true, true >;
     refinementLevels 0;
     numberProcesses 1; // This is for load balancing, overwritten if more than one proc is used
     blockForestFilestem porousMediaBlockForest;
@@ -54,10 +47,8 @@ DomainSetup
 
 Boundaries
 {
-    Border { direction N;    walldistance -1;  flag Obstacle; }
-    Border { direction S;    walldistance -1;  flag Obstacle; }
-}
 
+}
 
 StabilityChecker
 {
@@ -68,7 +59,7 @@ StabilityChecker
 
 VTKWriter
 {
-    vtkWriteFrequency 2000;
+    vtkWriteFrequency 10000;
     vtkGhostLayers 0;
     velocity true;
     writeVelocityAsMagnitude false;
@@ -92,7 +83,7 @@ Logging
     remainingTimeLoggerFrequency 20; // in seconds
     writeSurfaceMeshOfTPMS false;
     outputFolder output;
-    filestem Re400;
+    filestem Jin;
 }
 
 Probes
@@ -103,6 +94,6 @@ Probes
 
 DragEvaluation
 {
-    evaluationStart 200000;
-    evaluationFrequency 10;
+    evaluationStart 0;
+    evaluationFrequency 0;
 }
\ No newline at end of file
diff --git a/apps/showcases/PorousMediaGPU/PorousMediaGPU.cpp b/apps/showcases/PorousMediaGPU/PorousMediaGPU.cpp
index 35af02ce8..2d70f31d3 100644
--- a/apps/showcases/PorousMediaGPU/PorousMediaGPU.cpp
+++ b/apps/showcases/PorousMediaGPU/PorousMediaGPU.cpp
@@ -285,15 +285,11 @@ int main(int argc, char** argv){
 
    auto allocator = make_shared< gpu::HostFieldAllocator<macroFieldType> >(); // use pinned memory allocator for faster CPU-GPU memory transfers
    ids.velocityField = field::addToStorage< VelocityField_T >(blocks, "vel", macroFieldType(0.0), field::fzyx, setup.numGhostLayers, allocator);
-   ids.meanVelocityField = field::addToStorage< VelocityField_T >(blocks, "mean velocity", macroFieldType(0.0), field::fzyx, setup.numGhostLayers, allocator);
-   ids.sosField = field::addToStorage< TensorField_T >(blocks, "sos velocity", macroFieldType(0.0), field::fzyx, setup.numGhostLayers, allocator);
    ids.densityField  = field::addToStorage< ScalarField_T >(blocks, "density", macroFieldType(1.0), field::fzyx, setup.numGhostLayers, allocator);
    ids.flagField     = field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field", setup.numGhostLayers);
 
    ids.pdfFieldGPU      = lbm_generated::addGPUPdfFieldToStorage< PdfField_T >(blocks, ids.pdfField, StorageSpec, "pdfs on GPU", true);
    ids.velocityFieldGPU = gpu::addGPUFieldToStorage< VelocityField_T >(blocks, ids.velocityField, "velocity on GPU", true);
-   ids.meanVelocityFieldGPU = gpu::addGPUFieldToStorage< VelocityField_T >(blocks, ids.meanVelocityField, "mean velocity on GPU", true);
-   ids.sosFieldGPU = gpu::addGPUFieldToStorage< TensorField_T >(blocks, ids.sosField, "sos field on GPU", true);
    ids.densityFieldGPU  = gpu::addGPUFieldToStorage< ScalarField_T >(blocks, ids.densityField, "velocity on GPU", true);
    ids.flagFieldGPU     = gpu::addGPUFieldToStorage< FlagField_T >(blocks, ids.flagField, "flag on GPU", true);
 
@@ -303,7 +299,6 @@ int main(int argc, char** argv){
 
    const Cell innerOuterSplit = Cell(parameters.getParameter< Vector3<cell_idx_t> >("innerOuterSplit", Vector3<cell_idx_t>(1, 1, 1)));
    SweepCollection_T sweepCollection(blocks, ids.densityFieldGPU, ids.pdfFieldGPU, ids.velocityFieldGPU, setup.drivingForce, setup.omega, innerOuterSplit);
-   WelfordSweep_T welfordSweep(ids.meanVelocityFieldGPU, ids.sosFieldGPU, ids.velocityFieldGPU, real_c(0.0));
    WALBERLA_LOG_INFO_ON_ROOT(++step << ". Initialising data")
 
    if(setup.initialiseWithlatticeVelocity){
@@ -330,30 +325,6 @@ int main(int argc, char** argv){
    sweepCollection.initialiseBlockPointer();
    gpu::fieldCpy< gpu::GPUField< VelocityField_T::value_type >, VelocityField_T >(blocks, ids.velocityFieldInflowSliceGPU, ids.velocityFieldInflowSlice);
 
-   auto inflow = std::make_shared<Inflow>(blocks, setup, ids);
-   auto probe = std::make_shared<Probe>(blocks, setup, ids, config->getOneBlock("Probes"));
-   auto plotter = std::make_shared<Plotter>(blocks, setup, ids);
-
-
-/*
-   const real_t lenght = 8.0;
-   const real_t res = 3.0;
-   const real_t start = 0.0;
-   const uint_t nx = uint_c(std::floor(lenght / (setup.dxC * res)));
-   for(uint_t i = 0; i < nx; ++i){
-      probe->addPoint(Vector3<real_t>(start + real_c(i) * (setup.dxC * res), 0.5, 0.5 ));
-   }
-*/ 
-
-   const uint_t nx = blocks->getXSize() * blocks->getNumberOfXCells();
-   for(uint_t i = 0; i < nx; ++i){
-      probe->addPoint(Vector3<real_t>(real_c(i) * setup.dxC, finalDomain.center()[1], finalDomain.center()[2] ));
-   }
-
-   probe->addPoint(finalDomain.center());
-   probe->initialise();
-
-
 
    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    ///                                      LB SWEEPS AND BOUNDARY HANDLING                                       ///
@@ -371,7 +342,6 @@ int main(int argc, char** argv){
       setup.fluidFlag           = flagField->registerFlag(setup.fluidUID);
    }
 
-
    geometry::initBoundaryHandling< FlagField_T >(*blocks, ids.flagField, boundariesConfig);
    std::function< VelocityField_T::value_type (const Cell&, const Cell&, const shared_ptr< StructuredBlockForest >&, IBlock&) > wallDistanceFunctor = defaultWallDistance;
    if(setup.useGrid){
@@ -379,7 +349,6 @@ int main(int argc, char** argv){
       initGrid(blocks, ids.flagField, setup);
    }
 
-
    if(config->getNumBlocks("RoughWall") > 0){
       WALBERLA_LOG_INFO_ON_ROOT(++step << ". Using RoughWall setup")
       auto wallParameters = config->getOneBlock("RoughWall");
@@ -390,13 +359,17 @@ int main(int argc, char** argv){
       wallDistanceFunctor = wallDist;
    }
 
+   if(config->getNumBlocks("genericPorousMatrix") > 0){
+      WALBERLA_LOG_INFO_ON_ROOT(++step << ". Using Generic Porous Matrix setup")
+      initGenericPorousMatrix(blocks, ids.flagField, setup, config->getOneBlock("genericPorousMatrix"));
+   }
+
    if(config->getNumBlocks("TPMS") > 0){
       WALBERLA_LOG_INFO_ON_ROOT(++step << ". Placing TPMS material")
       auto tpms = std::make_shared<TPMS>(config->getOneBlock("TPMS"), setup);
       tpms->setupBoundary(blocks, ids.flagField);
    }
 
-
    geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, ids.flagField, setup.fluidUID, cell_idx_c(0));
    BoundaryCollection_T boundaryCollection(blocks, ids.flagField, ids.pdfFieldGPU, setup.fluidUID,
                                            ids.velocityFieldInflowSliceGPU, setup.latticeVelocity, wallDistanceFunctor, ids.pdfField);
@@ -408,9 +381,7 @@ int main(int argc, char** argv){
          geometry::writeMesh("TPMS.obj", *mesh);
       }
    }
-
    gpu::fieldCpy< GPUFlagField_T, FlagField_T >(blocks, ids.flagFieldGPU, ids.flagField);
-   auto bvc = std::make_shared<BulkVelocityCalculator>(blocks, ids, setup);
 
    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    ///                                           COMMUNICATION SCHEME                                             ///
@@ -424,13 +395,11 @@ int main(int argc, char** argv){
    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    ///                                          TIME STEP DEFINITIONS                                             ///
    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
    const int streamLowPriority = 0;
    auto defaultStream = gpu::StreamRAII::newPriorityStream(streamLowPriority);
    SweepTimeloop timeloop(blocks->getBlockStorage(), setup.timesteps);
 
-   timeloop.addFuncBeforeTimeStep(inflow->updateGPUFunctor(), "Update inflow velocity");
-   timeloop.addFuncBeforeTimeStep(probe->updateFunctor(), "Update Probe Data");
-
    lbm_generated::BasicRecursiveTimeStepGPU< GPUPdfField_T, SweepCollection_T, BoundaryCollection_T >
       LBMMeshRefinement(blocks, ids.pdfFieldGPU, sweepCollection, boundaryCollection, nonUniformCommunication, nonUniformPackInfo);
 
@@ -445,26 +414,49 @@ int main(int argc, char** argv){
    }
 
    const uint_t forceUpdateFrequency = parameters.getParameter< uint_t >("forceUpdateFrequency", 0);
+   auto bvc = std::make_shared<BulkVelocityCalculator>(blocks, ids, setup, forceUpdateFrequency);
    if(forceUpdateFrequency > 0){
       auto updateForce = [&](){
       if(timeloop.getCurrentTimeStep() % forceUpdateFrequency  == 0) {
             const real_t correctedDrivingForce = bvc->getDrivingForce();
-            sweepCollection.setFx(correctedDrivingForce);
+            if(parameters.getParameter< bool >("adaptDrivingForce", false)){
+               sweepCollection.setFx(correctedDrivingForce);
+            }
+            
          }
       };
       timeloop.addFuncBeforeTimeStep(updateForce, "Update force");
    }
 
+   std::function<void(IBlock*)> copyVelocity = gpu::fieldCpyFunctor< VelocityField_T, gpu::GPUField< VelocityField_T::value_type > >(ids.velocityField, ids.velocityFieldGPU);
+   std::function<void(IBlock*)> copyDensity  = gpu::fieldCpyFunctor< ScalarField_T, gpu::GPUField< ScalarField_T::value_type > >(ids.densityField, ids.densityFieldGPU);
+   std::function<void(IBlock*)> copyMeanVelocity;
+   std::function<void(IBlock*)> copySOSField;
+
    const uint_t welfordSamplingStart = parameters.getParameter< uint_t >("welfordSamplingStart", 0);
    const uint_t welfordSamplingFrequency = parameters.getParameter< uint_t >("welfordSamplingFrequency", 0);
+   std::shared_ptr<WelfordSweep_T> welfordSweep;
 
    if(welfordSamplingFrequency > 0){
+
+      WALBERLA_LOG_INFO_ON_ROOT(++step << ". Adding welford algorithm to evaluate mean field quantities")
+
+      ids.meanVelocityField = field::addToStorage< VelocityField_T >(blocks, "mean velocity", macroFieldType(0.0), field::fzyx, setup.numGhostLayers, allocator);
+      ids.sosField = field::addToStorage< TensorField_T >(blocks, "sos velocity", macroFieldType(0.0), field::fzyx, setup.numGhostLayers, allocator);
+      ids.meanVelocityFieldGPU = gpu::addGPUFieldToStorage< VelocityField_T >(blocks, ids.meanVelocityField, "mean velocity on GPU", true);
+      ids.sosFieldGPU = gpu::addGPUFieldToStorage< TensorField_T >(blocks, ids.sosField, "sos field on GPU", true);
+
+      copyMeanVelocity = gpu::fieldCpyFunctor< VelocityField_T, gpu::GPUField< VelocityField_T::value_type > >(ids.meanVelocityField, ids.meanVelocityFieldGPU);
+      copySOSField     = gpu::fieldCpyFunctor< TensorField_T, gpu::GPUField< TensorField_T::value_type > >(ids.sosField, ids.sosFieldGPU);
+
+      welfordSweep = make_shared<WelfordSweep_T>(ids.meanVelocityFieldGPU, ids.sosFieldGPU, ids.velocityFieldGPU, real_c(0.0));
+
       auto welfordUpdate = [&](){
          if(timeloop.getCurrentTimeStep() > welfordSamplingStart  && (timeloop.getCurrentTimeStep() % welfordSamplingFrequency) == 0) {
-            const uint_t velCtr = uint_c(welfordSweep.getCounter());
-            welfordSweep.setCounter(welfordSweep.getCounter() + real_c(1.0));
+            const uint_t velCtr = uint_c(welfordSweep->getCounter());
+            welfordSweep->setCounter(welfordSweep->getCounter() + real_c(1.0));
             for (auto& block : *blocks){
-               welfordSweep.run(&block);
+               welfordSweep->run(&block);
             }
          };
       };
@@ -472,12 +464,54 @@ int main(int argc, char** argv){
    }
 
    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-   ///                                             VTK Output                                                     ///
+   ///                                          Post Processing                                                   ///
    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
-   auto copyVelocity     = gpu::fieldCpyFunctor< VelocityField_T, gpu::GPUField< VelocityField_T::value_type > >(ids.velocityField, ids.velocityFieldGPU);
-   auto copyDensity      = gpu::fieldCpyFunctor< ScalarField_T, gpu::GPUField< ScalarField_T::value_type > >(ids.densityField, ids.densityFieldGPU);
-   auto copyMeanVelocity = gpu::fieldCpyFunctor< VelocityField_T, gpu::GPUField< VelocityField_T::value_type > >(ids.meanVelocityField, ids.meanVelocityFieldGPU);
+   auto inflow = std::make_shared<Inflow>(blocks, setup, ids);
+   auto probe = std::make_shared<Probe>(blocks, setup, ids, config->getOneBlock("Probes"));
+
+
+/*
+   const real_t lenght = 8.0;
+   const real_t res = 3.0;
+   const real_t start = 0.0;
+   const uint_t nx = uint_c(std::floor(lenght / (setup.dxC * res)));
+   for(uint_t i = 0; i < nx; ++i){
+      probe->addPoint(Vector3<real_t>(start + real_c(i) * (setup.dxC * res), 0.5, 0.5 ));
+   }
+*/ 
+
+   // const uint_t nx = blocks->getXSize() * blocks->getNumberOfXCells();
+   // for(uint_t i = 0; i < nx; ++i){
+   //    probe->addPoint(Vector3<real_t>(real_c(i) * setup.dxC, finalDomain.center()[1], finalDomain.center()[2] ));
+   // }
+   // probe->initialise();
+
+   
+
+   for(uint_t i = 0; i < blocks->getXSize() * blocks->getNumberOfXCells(); ++i){
+       probe->addPoint(Vector3<real_t>(real_c(i) * setup.dxC, finalDomain.center()[1], finalDomain.center()[2] ));
+   }
+
+/*
+   for(uint_t i = 0; i < blocks->getYSize() * blocks->getNumberOfYCells(); ++i){
+       probe->addPoint(Vector3<real_t>(finalDomain.center()[0], real_c(i) * setup.dxC, finalDomain.center()[2] ));
+   }
+
+   for(uint_t i = 0; i < blocks->getZSize() * blocks->getNumberOfZCells(); ++i){
+       probe->addPoint(Vector3<real_t>(finalDomain.center()[0], finalDomain.center()[1], real_c(i) * setup.dxC));
+   }
+
+*/
+   probe->initialise();
+
+   
+   timeloop.addFuncBeforeTimeStep(inflow->updateGPUFunctor(), "Update inflow velocity");
+   timeloop.addFuncBeforeTimeStep(probe->updateFunctor(), "Update Probe Data");
+
+   //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+   ///                                             VTK Output                                                     ///
+   //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
    // VTK
    auto VTKWriter = config->getOneBlock("VTKWriter");
@@ -488,6 +522,7 @@ int main(int argc, char** argv){
    const real_t samplingResolution      = VTKWriter.getParameter< real_t >("samplingResolution", real_c(-1.0));
    const uint_t initialWriteCallsToSkip = VTKWriter.getParameter< uint_t >("initialWriteCallsToSkip", uint_c(0.0));
    const real_t sliceThickness          = VTKWriter.getParameter< real_t >("sliceThickness", real_c(1.0));
+   const bool lastStepAsVolumeFile      = VTKWriter.getParameter< bool >("lastStepAsVolumeFile", false);
 
    if (vtkWriteFrequency > 0){
       const std::string identifier = setup.fileStem;
@@ -501,18 +536,12 @@ int main(int argc, char** argv){
          for (auto& block : *blocks){
             if (VTKWriter.getParameter< bool >("velocity")){copyVelocity(&block);}
             if (VTKWriter.getParameter< bool >("density")){copyDensity(&block);}
-            if (VTKWriter.getParameter< bool >("meanVelocity")){copyMeanVelocity(&block);}
+            if (VTKWriter.getParameter< bool >("meanVelocity") && copyMeanVelocity){copyMeanVelocity(&block);}
          }
 
-         auto drivingForeceGPU = bvc->getDrivingForce();
-         auto initialDrivingForce = setup.drivingForce;
-
-         auto bulkVelocityCPU = calculateBulkVelocity(blocks, ids, setup.fluidUID);
-         auto blukVelocityGPU = bvc->bulkVelocity();
-         WALBERLA_LOG_DEVEL_VAR(bulkVelocityCPU);
-         WALBERLA_LOG_DEVEL_VAR(blukVelocityGPU);
-         WALBERLA_LOG_DEVEL_VAR(blukVelocityGPU);
-         WALBERLA_LOG_DEVEL_VAR(drivingForeceGPU);
+         if(lastStepAsVolumeFile && timeloop.getCurrentTimeStep() == setup.timesteps - 1){
+            vtkOutput->removeCellInclusionFilters();
+         }
 
       });
 
@@ -520,6 +549,7 @@ int main(int argc, char** argv){
 
       field::FlagFieldCellFilter<FlagField_T> fluidFilter( ids.flagField );
       fluidFilter.addFlag( setup.obstacleUID );
+      fluidFilter.addFlag( setup.noSlipUID );
       vtkOutput->addCellExclusionFilter(fluidFilter);
 
 
@@ -545,7 +575,7 @@ int main(int argc, char** argv){
             vtkOutput->addCellDataWriter(velWriter);
          }
       }
-      if (VTKWriter.getParameter< bool >("meanVelocity", false)){
+      if (VTKWriter.getParameter< bool >("meanVelocity", false) && ids.meanVelocityField){
          auto meanVelWriter = make_shared< field::VTKWriter< VelocityField_T, float32 > >(ids.meanVelocityField, "meanVelocity");
          vtkOutput->addCellDataWriter(meanVelWriter);
 
@@ -606,17 +636,25 @@ int main(int argc, char** argv){
 #endif
    WALBERLA_MPI_BARRIER()
    simTimer.end();
+   WALBERLA_LOG_INFO_ON_ROOT("Simulation finished")
+   double time = double_c(simTimer.max());
 
-   gpu::fieldCpy< TensorField_T, gpu::GPUField< VelocityField_T::value_type > >(blocks, ids.sosField, ids.sosFieldGPU);
+   if(copySOSField && evaluation){
+      WALBERLA_CHECK_NOT_NULLPTR(welfordSweep)
+      WALBERLA_LOG_INFO_ON_ROOT("Evaluating mean field")
+      auto plotter = std::make_shared<Plotter>(blocks, setup, ids);
+      for (auto& block : *blocks){
+         copySOSField(&block);
+      }
 
-   const double ut = evaluation->getMeanFrictionVelocity();
-   const double welfordCounter = welfordSweep.getCounter();
-   WALBERLA_LOG_DEVEL_VAR(ut)
-   WALBERLA_LOG_DEVEL_VAR(welfordCounter)
-   plotter->writeResult(ut, welfordCounter);
+      const double ut = evaluation->getMeanFrictionVelocity();
+      const double welfordCounter = welfordSweep->getCounter();
+      WALBERLA_LOG_DEVEL_VAR(ut)
+      WALBERLA_LOG_DEVEL_VAR(welfordCounter)
+
+      plotter->writeResult(ut, welfordCounter);
+   }
 
-   WALBERLA_LOG_INFO_ON_ROOT("Simulation finished")
-   double time = double_c(simTimer.max());
    WALBERLA_MPI_SECTION() { walberla::mpi::reduceInplace(time, walberla::mpi::MAX); }
    performance.logResultOnRoot(setup.timesteps, time);
 
diff --git a/apps/showcases/PorousMediaGPU/Probe.cpp b/apps/showcases/PorousMediaGPU/Probe.cpp
index 37323d336..aa06bcebc 100644
--- a/apps/showcases/PorousMediaGPU/Probe.cpp
+++ b/apps/showcases/PorousMediaGPU/Probe.cpp
@@ -25,35 +25,46 @@ namespace walberla{
 void Probe::initialise(){
     if(probePoints_.empty()){return;}
 
-    const AABB domain = blocks_->getDomain();
+    uint_t idx = 0;
+    std::vector<Vector3<real_t>> cellCenters;
  
     for(auto p: probePoints_){
-        if(!domain.contains(p)){continue;}
         IBlock* block = blocks_->getBlock(p);
         if(block != nullptr){
+            const uint_t level = blocks_->getLevel(*block);
+            auto * flagField = block->template getData<FlagField_T>(ids_.flagField);
+            auto domainFlag = flagField->getFlag(setup_.fluidUID);
             const Cell localCell = blocks_->getBlockLocalCell(*block, p);
-            Vector3<real_t> localCellCenter;
-            blocks_->getCellCenter(localCellCenter, localCell);
-            pointsPerBlock_[block].emplace_back(localCellCenter);
+            if(isFlagSet(flagField->get(localCell), domainFlag)){
+                Cell globalCell;
+                blocks_->transformBlockLocalToGlobalCell(globalCell, *block, localCell);
+                Vector3<real_t> gloablCellCenter = blocks_->getCellCenter(globalCell, level);
+                if(std::find(cellCenters.begin(), cellCenters.end(), gloablCellCenter) == cellCenters.end()){
+                    cellCenters.emplace_back(gloablCellCenter);
+
+                    globalPointsPerBlock_[block].emplace_back(std::make_pair(gloablCellCenter, idx));
+                    localCellsPerBlock_[block].emplace_back(localCell);
+                    idx++;
+                }
+            }
         }
     }
 
-    uint_t totalNumberOfPoints = 0;
-    for (auto const& it : pointsPerBlock_){
+    if(globalPointsPerBlock_.empty()){return;}
+
+    for (auto const& it : localCellsPerBlock_){
         IBlock* block = it.first;
-        std::vector<Vector3<real_t>> points = it.second;
+        auto localCells = it.second;
 
         auto * dataVector = block->getData< DataVector > ( probeDataID );
         auto & cpuCellVector = dataVector->cpuCellVector();
         auto & cpuDataVector = dataVector->cpuDataVector();
 
-        const uint_t n = points.size();
-        totalNumberOfPoints += n;
+        const uint_t n = localCells.size();
         cpuCellVector.resize(3 * n);
 
         uint_t i = 0;
-        for(auto p: points){
-            const Cell localCell = blocks_->getBlockLocalCell(*block, p);
+        for(const auto localCell: localCells){
 
             cpuCellVector[i] = localCell[0];
             cpuCellVector[i + n] = localCell[1];
@@ -78,7 +89,6 @@ void Probe::initialise(){
     selectStart[0] = 0;
 
     writeHeaderFiles();
-    WALBERLA_LOG_INFO_ON_ROOT("Added " << totalNumberOfPoints << " probes for postprocessing")
 }
 
 void Probe::writeHeaderFiles(){
@@ -93,37 +103,34 @@ void Probe::writeHeaderFiles(){
 
     const auto outputFolder = setup_.outputFolder;
     const auto fileStem = setup_.fileStem;
-    const auto fileName = outputFolder + "/" + fileStem + "Probe.h5";
-    filesystem::path hdfFilePath( fileName.c_str() );
+    const auto probeFolderString = outputFolder + "/" + fileStem + "Probes";
 
+    filesystem::path probeFolder( probeFolderString );
     WALBERLA_ROOT_SECTION()
     {
-        if( filesystem::exists( hdfFilePath ) )
-             std::remove( hdfFilePath.string().c_str() );
+        if( filesystem::exists( probeFolder ))
+            filesystem::remove_all( probeFolder );
+
+        if ( !filesystem::exists( probeFolder ) )
+            filesystem::create_directories( probeFolder );
     }
 
+    WALBERLA_MPI_BARRIER()
+
+    const auto fileName = probeFolderString + "/" + "ProbeRank_" + std::to_string(MPIManager::instance()->rank()) + ".h5";
+    filesystem::path hdfFilePath( fileName.c_str() );
+
     H5::FileAccPropList fileAccProps;
-#ifdef WALBERLA_BUILD_WITH_MPI
-    //  Enable MPI-IO
-    H5Pset_fapl_mpio(fileAccProps.getId(), mpi::MPIManager::instance()->comm(), MPI_INFO_NULL);
-#endif
     h5file_ = std::make_unique<H5::H5File>(hdfFilePath.c_str(), H5F_ACC_CREAT | H5F_ACC_RDWR, H5::FileCreatPropList::DEFAULT, fileAccProps);
 
     H5::Group timeGroup = h5file_->createGroup("time");
     timeGroup.createDataSet("t", io::h5Float(), arrayDataSpace, props);
 
-    uint_t counter = 0;
-    for (auto const& it : pointsPerBlock_){
-        IBlock* block = it.first;
-        auto* dataVector = block->getData< DataVector > ( probeDataID );
-        auto & cpuCellVector = dataVector->cpuCellVector();
-        const uint_t n = uint_c(dataVector->size());
-        for(uint_t i = 0; i < n; ++i){
-            const Cell localCell(cell_idx_c(cpuCellVector[i]), cell_idx_c(cpuCellVector[i + n]), cell_idx_c(cpuCellVector[i + 2*n]));
-            Vector3< real_t > p;
-            blocks_->getBlockLocalCellCenter(*block, localCell, p);
-
-            std::string probeName = "Probe" + std::to_string(counter);
+    for (auto const& it : globalPointsPerBlock_){
+        for(uint_t i = 0; i < it.second.size(); ++i){
+            const auto p = it.second[i].first;
+            const auto idx = it.second[i].second;
+            std::string probeName = "Probe_" + std::to_string(idx);
             if (h5file_->exists(probeName)) { continue; }
             H5::Group probeGroup = h5file_->createGroup(probeName);
 
@@ -137,24 +144,23 @@ void Probe::writeHeaderFiles(){
             for(auto name: dataSetNames){
                 probeGroup.createDataSet(name, io::h5Float(), arrayDataSpace, props);
             }
-            counter++;
         }
     }
 }
 
 void Probe::writeProbes(){
+    if(globalPointsPerBlock_.empty()){return;}
 
     extendSize[0] += executionCounter_;
-    
-    std::vector<float> tmp(executionCounter_);
+    std::vector<float> timeData(executionCounter_);
     const real_t t = (real_c(totalExecutionCounter_ + frequency_ - uint_c(1)) * setup_.dt) - ( real_c(executionCounter_ * frequency_) * setup_.dt);
     for(uint_t i = 0; i < executionCounter_; ++i){
-        tmp[i] = float_c(t) + float_c(i)*float_c(setup_.dt);
+        timeData[i] = float_c(t) + float_c(i)*float_c(setup_.dt);
     }
 
-    writeData(tmp, "time", "t");
+    writeData(timeData, "time", "t");
 
-    for (auto const& it : pointsPerBlock_){
+    for (auto const& it : globalPointsPerBlock_){
         IBlock* block = it.first;
         auto* dataVector = block->getData< DataVector > ( probeDataID );
         dataVector->syncCPU();
@@ -163,14 +169,19 @@ void Probe::writeProbes(){
 
     const uint_t nDatasets = dataSetNames.size();
 
-    for (auto const& it : pointsPerBlock_){
+    std::vector<float> tmp(executionCounter_);
+    for (auto const& it : globalPointsPerBlock_){
         IBlock* block = it.first;
+
         auto* dataVector = block->getData< DataVector > ( probeDataID );
         auto & cpuDataVector = dataVector->cpuDataVector();
-        const uint_t n = uint_c(dataVector->size());
 
+        const uint_t n = uint_c(dataVector->size());
+        WALBERLA_CHECK_EQUAL(n, it.second.size())
         for(uint_t i = 0; i < n; ++i){
-            const std::string probeName = "Probe" + std::to_string(i);
+            const auto idx = it.second[i].second;
+
+            const std::string probeName = "Probe_" + std::to_string(idx);
             for(uint_t j = 0; j < nDatasets; ++j){
                 auto dataSetName = dataSetNames[j];
                 for(uint_t k = 0; k < executionCounter_; ++k){
@@ -205,7 +216,7 @@ void Probe::update(){
 
     const float velocityScalingFactor = float_c(setup_.conversionFactor);
 
-    for (auto const& it : pointsPerBlock_){
+    for (auto const& it : globalPointsPerBlock_){
         IBlock* block = it.first;
         auto density = block->getData< gpu::GPUField<double> >(ids_.densityFieldGPU);
         auto velocity = block->getData< gpu::GPUField<double> >(ids_.velocityFieldGPU);
diff --git a/apps/showcases/PorousMediaGPU/Probe.h b/apps/showcases/PorousMediaGPU/Probe.h
index f787233ad..374547f88 100644
--- a/apps/showcases/PorousMediaGPU/Probe.h
+++ b/apps/showcases/PorousMediaGPU/Probe.h
@@ -24,6 +24,8 @@
 #include "core/logging/Initialization.h"
 #include "core/math/Constants.h"
 #include "core/Filesystem.h"
+#include "core/mpi/Broadcast.h"
+#include "core/mpi/Gatherv.h"
 
 #include "gpu/GPUWrapper.h"
 #include "gpu/ErrorChecking.h"
@@ -140,7 +142,8 @@ class Probe
   const std::vector<std::string> dataSetNames{"rho", "ux", "uy", "uz"};
 
   std::vector<Vector3<real_t>> probePoints_;
-  std::map<IBlock*, std::vector<Vector3<real_t>> > pointsPerBlock_;
+  std::map<IBlock*, std::vector<    std::pair< Vector3<real_t>, uint_t > > > globalPointsPerBlock_;
+  std::map<IBlock*, std::vector<Cell> > localCellsPerBlock_;
 
   std::map<IBlock*, Vector3<uint32_t>> gpuBlock_;
   std::map<IBlock*, Vector3<uint32_t>> gpuGrid_;
diff --git a/apps/showcases/PorousMediaGPU/Setup.h b/apps/showcases/PorousMediaGPU/Setup.h
index d156bccc2..44ce970eb 100644
--- a/apps/showcases/PorousMediaGPU/Setup.h
+++ b/apps/showcases/PorousMediaGPU/Setup.h
@@ -72,6 +72,8 @@ struct Setup
       numberOfFourrierModes = turbulence.getParameter< uint_t >("numberOfFourrierModes");
       uc                    = turbulence.getParameter< Vector3< real_t > >("U_c");
 
+      const real_t speedOfSound = real_c(real_c(1.0) / std::sqrt( real_c(3.0) ));
+
       if(config->getNumBlocks("RoughWall") > 0){
          // kinViscosity                    = parameters.getParameter< real_t >("viscosity");
          referenceLength *= real_c(0.5);
@@ -84,17 +86,32 @@ struct Setup
 
          kinViscosity = referenceLength * referenceVelocity / bulkReynoldsNumber;
 
-         const real_t speedOfSound = real_c(real_c(1.0) / std::sqrt( real_c(3.0) ));
          machNumber = latticeVelocity / speedOfSound;
          omega      = real_c(real_c(1.0) / (real_c(3.0) * viscosity + real_c(0.5)));
 
          frictionVelocity = (reynoldsNumber * viscosity) / latticeReferenceLength;
          drivingForce = calculateDrivingForce(frictionVelocity, latticeReferenceLength, latticeVelocity, latticeVelocity);
       }
+      else if(config->getNumBlocks("genericPorousMatrix") > 0){
+         auto gpm = config->getOneBlock("genericPorousMatrix");
+
+         referenceLength = gpm.getParameter< real_t >("barSize");
+         const real_t latticeReferenceLength = referenceLength / coarseMeshSize;
+         viscosity = latticeReferenceLength * latticeVelocity / reynoldsNumber;
+         referenceVelocity = latticeVelocity;
+         dt  = latticeVelocity / referenceVelocity * coarseMeshSize;
+         kinViscosity = referenceLength * referenceVelocity / reynoldsNumber;
+         machNumber = latticeVelocity / speedOfSound;
+         omega      = real_c(real_c(1.0) / (real_c(3.0) * viscosity + real_c(0.5)));
+
+         const real_t pressureGradient = parameters.getParameter< real_t >("pressureGradient");
+         const real_t XYslice = real_c(rootBlocks[0] * cellsPerBlock[0] * rootBlocks[1] * cellsPerBlock[1]);
+         drivingForce = pressureGradient / XYslice;
+      }
       else{
          referenceVelocity = parameters.getParameter< real_t >("referenceVelocity");
          const real_t latticeReferenceLength = referenceLength / coarseMeshSize;
-         const real_t speedOfSound = real_c(real_c(1.0) / std::sqrt( real_c(3.0) ));
+         
          machNumber = latticeVelocity / speedOfSound;
          viscosity  = real_c((latticeVelocity * latticeReferenceLength) / reynoldsNumber);
          omega      = real_c(real_c(1.0) / (real_c(3.0) * viscosity + real_c(0.5)));
diff --git a/apps/showcases/PorousMediaGPU/TPMS.prm b/apps/showcases/PorousMediaGPU/TPMS.prm
index 019bd4890..abae5ca9e 100644
--- a/apps/showcases/PorousMediaGPU/TPMS.prm
+++ b/apps/showcases/PorousMediaGPU/TPMS.prm
@@ -5,9 +5,12 @@ Parameters
     referenceVelocity 1.0;
     referenceLength 1.0;
     latticeVelocity 0.025;
-    initialiseWithlatticeVelocity true;
+    initialiseWithlatticeVelocity false;
 
-    timesteps 100001;
+    welfordSamplingStart 0;
+    welfordSamplingFrequency 1;
+
+    timesteps 21;
 
     processMemoryLimit 512; // MiB
     innerOuterSplit <1, 1, 1>;
@@ -41,9 +44,9 @@ TPMS
 DomainSetup
 {
     domainSize    < 8, 1, 1 >;
-    // cellsPerBlock < 384, 48, 48 >;
+    cellsPerBlock < 384, 48, 48 >;
     // cellsPerBlock < 768, 96, 96 >;
-    cellsPerBlock < 1536, 192, 192 >;
+    // cellsPerBlock < 1536, 192, 192 >;
     blocks    < 1, 1, 1 >;
     periodic    < false, true, true >;
     refinementLevels 0;
@@ -68,10 +71,11 @@ StabilityChecker
 
 VTKWriter
 {
-    vtkWriteFrequency 50000;
+    vtkWriteFrequency 10;
     vtkGhostLayers 0;
     velocity true;
     writeVelocityAsMagnitude false;
+    meanVelocity true;
     density false;
     flag false;
     writeOnlySlice true;
@@ -81,6 +85,7 @@ VTKWriter
     oneFilePerProcess false;
     samplingResolution -1;
     initialWriteCallsToSkip 0;
+    lastStepAsVolumeFile true;
 }
 
 Logging
@@ -90,6 +95,7 @@ Logging
     readSetupFromFile false;
     remainingTimeLoggerFrequency 20; // in seconds
     writeSurfaceMeshOfTPMS false;
+    filestem TPMS;
 }
 
 Probes
diff --git a/apps/showcases/PorousMediaGPU/roughWall.prm b/apps/showcases/PorousMediaGPU/roughWall.prm
index 8215f12bf..bf9a0f5f5 100644
--- a/apps/showcases/PorousMediaGPU/roughWall.prm
+++ b/apps/showcases/PorousMediaGPU/roughWall.prm
@@ -6,6 +6,7 @@ Parameters
     initialiseWithlatticeVelocity true;
     initialiseWithTurbulence true;
     forceUpdateFrequency 10;
+    adaptDrivingForce false;
 
     welfordSamplingStart 200000;
     welfordSamplingFrequency 1;
diff --git a/apps/showcases/PorousMediaGPU/utilGPU.cu b/apps/showcases/PorousMediaGPU/utilGPU.cu
index c1c6f47cc..741cb6688 100644
--- a/apps/showcases/PorousMediaGPU/utilGPU.cu
+++ b/apps/showcases/PorousMediaGPU/utilGPU.cu
@@ -18,7 +18,7 @@
 //
 //======================================================================================================================
 #include "utilGPU.h"
-// #include <curand_kernel.h>
+#include <curand_kernel.h>
 
 namespace walberla
 {
@@ -47,7 +47,7 @@ static __device__ void singleMode(double* rX, double* rY, double* rZ, const doub
 {
 
    const real_t pi2 = 6.28318530717958647693;
-/*
+
    curandStatePhilox4_32_10_t state;
    curand_init(42, linear_index, 0, &state);
 
@@ -60,8 +60,8 @@ static __device__ void singleMode(double* rX, double* rY, double* rZ, const doub
    const real_t zetaX = real_t(2.0) * real_t(r1.w) - real_t(1.0);
    const real_t zetaY = real_t(2.0) * real_t(r2.x) - real_t(1.0);
    const real_t zetaZ = real_t(2.0) * real_t(r2.y) - real_t(1.0);
-*/  
 
+/*
    // For debugging
    const real_t a     = 0.1; // math::realRandom(real_c(0.0), real_c(1.0));
    const real_t phi   = 0.2 * pi2; //math::realRandom(real_c(0.0), real_c(1.0)) * pi2;
@@ -69,7 +69,7 @@ static __device__ void singleMode(double* rX, double* rY, double* rZ, const doub
    const real_t zetaX = -0.4; //math::realRandom(real_c(-1.0), real_c(1.0));
    const real_t zetaY = -0.6; //math::realRandom(real_c(-1.0), real_c(1.0));
    const real_t zetaZ = 0.8; // math::realRandom(real_c(-1.0), real_c(1.0));
-
+*/
 
    const real_t theta = acos(real_t(2.0) * a - real_t(1.0));
 
@@ -236,8 +236,6 @@ void updateVelocityPlane(gpu::GPUField<real_t>* velocityGPU, real_t* kappa, real
                          const real_t dx, const real_t t, const uint_t n, const real_t latticeVelocity, const int64_t nGhostLayers,
                          const Vector3<uint32_t>& block, const Vector3<uint32_t>& grid){
 
-   WALBERLA_ABORT("Turbulent Inflow not usuable at the moment")
-
    dim3 block_(block[0], block[1], block[2]);
    dim3 grid_(grid[0], grid[1], grid[2]);
 
@@ -326,7 +324,7 @@ void BulkVelocityCalculator::calculateBulkVelocity() {
       bulkVelocity += bulkVelocityPerBlock;
    }
 
-   mpi::allReduceInplace< double >(bulkVelocity, mpi::SUM);
+   mpi::allReduceInplace< double >(bulkVelocity, mpi::SUM, MPIManager::instance()->comm());
    bulkVelocity_ = double_c(bulkVelocity) / volume_;
 
    WALBERLA_GPU_CHECK(gpuMemset(resultGPU_, 0.0, sizeof(double) * resultCPU_.size()));
diff --git a/apps/showcases/PorousMediaGPU/utilGPU.h b/apps/showcases/PorousMediaGPU/utilGPU.h
index 7fff9bffc..1d1bbc6c7 100644
--- a/apps/showcases/PorousMediaGPU/utilGPU.h
+++ b/apps/showcases/PorousMediaGPU/utilGPU.h
@@ -23,6 +23,8 @@
 
 #include "core/DataTypes.h"
 #include "core/math/Constants.h"
+#include "core/Filesystem.h"
+#include "core/mpi/Reduce.h"
 
 #include "gpu/GPUWrapper.h"
 #include "gpu/ErrorChecking.h"
@@ -47,8 +49,8 @@ void updateProbe(gpu::GPUField<real_t>* density, gpu::GPUField<real_t>* velocity
 
 class BulkVelocityCalculator {
 public:
-    BulkVelocityCalculator(const std::shared_ptr<StructuredBlockForest> & blocks, const IDs& ids, const Setup& setup) 
-    : blocks_(blocks), setup_(setup), ids_(ids)
+    BulkVelocityCalculator(const std::shared_ptr<StructuredBlockForest> & blocks, const IDs& ids, const Setup& setup, const uint_t forceUpdateFrequency) 
+    : blocks_(blocks), setup_(setup), ids_(ids), forceUpdateFrequency_(forceUpdateFrequency), executionCounter_{0}
     {
 #if __CUDA_ARCH__ < 600
     WALBERLA_LOG_WARNING_ON_ROOT("Warning atomic add in double precision not supported. Falling back to software implementation")
@@ -75,14 +77,30 @@ public:
 
         const Vector3<uint_t> s(blocks_->getNumberOfXCellsPerBlock(), blocks_->getNumberOfYCellsPerBlock(), blocks_->getNumberOfZCellsPerBlock());
         getBlockAndGridSize(s, block_, grid_);
-        mpi::allReduceInplace< double >(volume_, mpi::SUM);
-
-
+        mpi::allReduceInplace< double >(volume_, mpi::SUM, MPIManager::instance()->comm());
+        
         resultCPU_.resize(nBlocks);
 
         WALBERLA_GPU_CHECK(gpuMalloc((void **)&resultGPU_, sizeof(double) * resultCPU_.size() ));
         WALBERLA_GPU_CHECK(gpuMemset(resultGPU_, 0.0, sizeof(double) * resultCPU_.size()));
 
+        const auto outputFolder = setup_.outputFolder;
+        const auto fileStem = setup_.fileStem;
+        fileName_ = outputFolder + "/" + fileStem + "DrivingForce.csv"; 
+        filesystem::path drivingForceOuputFile( fileName_.c_str() );
+
+        WALBERLA_ROOT_SECTION()
+        {
+            if( filesystem::exists( drivingForceOuputFile ) )
+                std::remove( drivingForceOuputFile.string().c_str() );
+        
+            std::ofstream outfile( drivingForceOuputFile.c_str() );
+            outfile << "SEP=," << "\n";
+
+            outfile << "timestep," << "force," << "U_b" << "\n";
+            outfile.close();
+        }
+
     }
 
     double bulkVelocity() const { return bulkVelocity_; }
@@ -93,7 +111,18 @@ public:
         calculateBulkVelocity();
 
         const real_t latticeReferenceLength = setup_.referenceLength / setup_.dxC;
-        return calculateDrivingForce(setup_.frictionVelocity, latticeReferenceLength, setup_.latticeVelocity, bulkVelocity_);
+        const real_t drivingForce = calculateDrivingForce(setup_.frictionVelocity, latticeReferenceLength, setup_.latticeVelocity, bulkVelocity_);
+        const real_t timestep = real_c(executionCounter_ * forceUpdateFrequency_) * setup_.dt;
+
+        WALBERLA_ROOT_SECTION()
+        {
+            std::ofstream outfile( fileName_.c_str(), std::ios_base::app );
+            outfile << timestep << "," << drivingForce << "," << bulkVelocity_ << "\n";
+            outfile.close();
+        }
+
+        executionCounter_++;
+        return drivingForce;
     }
 
 private:
@@ -101,6 +130,10 @@ private:
     const Setup setup_;
     const IDs ids_;
 
+    const uint_t forceUpdateFrequency_;
+    uint_t executionCounter_;
+    std::string fileName_;
+
     double volume_;
     double bulkVelocity_{0.0};
 
-- 
GitLab