From 0ac31586a2189878b99769dba93bd9e3272da568 Mon Sep 17 00:00:00 2001
From: Markus Holzer <markus.holzer@onera.fr>
Date: Wed, 16 Apr 2025 16:24:36 +0200
Subject: [PATCH] [skip ci] update

---
 .../showcases/PorousMediaGPU/DragEvaluation.h | 146 ++++++++++++++++++
 apps/showcases/PorousMediaGPU/JinCase.prm     |  14 +-
 .../PorousMediaGPU/JinCaseInflow.prm          |  11 +-
 .../PorousMediaGPU/PorousMediaGPU.cpp         |  85 ++++++----
 apps/showcases/PorousMediaGPU/Probe.h         |   1 +
 apps/showcases/PorousMediaGPU/Setup.h         |   3 +
 apps/showcases/PorousMediaGPU/TPMS.h          |   2 -
 .../showcases/PorousMediaGPU/TPMSPeriodic.prm |  39 +++--
 apps/showcases/PorousMediaGPU/roughWall.prm   |  35 ++++-
 apps/showcases/PorousMediaGPU/utilGPU.cu      |  41 +++--
 apps/showcases/PorousMediaGPU/utilGPU.h       |  45 ++++--
 11 files changed, 337 insertions(+), 85 deletions(-)

diff --git a/apps/showcases/PorousMediaGPU/DragEvaluation.h b/apps/showcases/PorousMediaGPU/DragEvaluation.h
index 9b57b4958..0c9a2661e 100644
--- a/apps/showcases/PorousMediaGPU/DragEvaluation.h
+++ b/apps/showcases/PorousMediaGPU/DragEvaluation.h
@@ -68,6 +68,10 @@ class DragEvaluation
     return (sumFx_ / double_c(meanQuantitiesCounter_)) / totalSurface_;
    }
 
+  void setTotalExecutionCounter(const uint_t counter){
+    totalExecutionCounter_ = counter * ( 1 << setup_.refinementLevels );
+   }
+
  protected:
    bool initialized_{false};
 
@@ -135,4 +139,146 @@ class Plotter
   uint_t checkFrequency_{10000};
 }; // class Plotter
 
+
+class PressureDropEvaluator
+{
+ public:
+   PressureDropEvaluator(const shared_ptr< StructuredBlockForest >& blocks, const Setup& setup, const IDs& ids, const Config::BlockHandle& pressureParameters)
+                         : blocks_(blocks), setup_(setup), ids_(ids)
+      {
+
+        evalAABB_     = pressureParameters.getParameter< AABB >("evaluationAABB");
+        evalInterval_ = pressureParameters.getParameter< uint_t >("evaluationFrequency");
+
+        pressureDrop_ = real_c(0.0);
+        executionCounter_ = uint_c(0);
+
+        fileName_ = setup_.outputFolder + "/" + setup_.fileStem + "PressureDrop.csv"; 
+        filesystem::path pressureDropOuputFile( fileName_.c_str() );
+
+        WALBERLA_ROOT_SECTION()
+        {
+            if(!setup_.loadSnapshot){
+                if( filesystem::exists( pressureDropOuputFile ) )
+                    std::remove( pressureDropOuputFile.string().c_str() );
+            }
+
+            if(!filesystem::exists( pressureDropOuputFile ) ){
+                std::ofstream outfile( pressureDropOuputFile.c_str() );
+                outfile << "SEP=," << "\n";
+
+                outfile << "timestep," << "time," << "pressureDrop," << "x0," << "x1" << "\n";
+                outfile.close();
+            }
+        }
+      }
+
+  std::function<void()>  updateFunctor() {return [this](){ updatePressureDrop(); };}
+  void setInitialTimestep(const uint_t initialTimestep){
+      executionCounter_ = initialTimestep;
+  }
+
+
+
+ private:
+   const shared_ptr< StructuredBlockForest > blocks_;
+   Setup setup_;
+   IDs ids_;
+   AABB evalAABB_;
+   uint_t evalInterval_;
+   real_t pressureDrop_;
+
+   uint_t executionCounter_; // number of times operator() has been called
+   std::string fileName_;
+
+   void syncDensity(){
+      gpu::fieldCpy< ScalarField_T , gpu::GPUField< ScalarField_T::value_type > >(blocks_, ids_.densityField, ids_.densityFieldGPU);
+   }
+
+   void updatePressureDrop(){
+
+      ++executionCounter_;
+      if (executionCounter_ % evalInterval_ != uint_c(0)) { return; }
+
+      syncDensity();
+
+      real_t lowerDensity       = real_c(0);
+      real_t upperDensity       = real_c(0);
+      uint_t numCellsLowerSlice = uint_c(0);
+      uint_t numCellsUpperSlice = uint_c(0);
+
+      auto domainFlag = setup_.fluidFlag;
+
+      for (auto blockIterator = blocks_->begin(); blockIterator != blocks_->end(); ++blockIterator)
+      {
+         const uint_t level = blocks_->getLevel(*blockIterator);
+
+         // global lower cell BB slice for density measurement (shifted 1 cell away from porous medium in
+         // z-direction)
+         CellInterval lowerCellBB = blocks_->getCellBBFromAABB(evalAABB_, level);
+         lowerCellBB.xMax()       = lowerCellBB.xMin() + cell_idx_c(1);
+         // lowerCellBB.xMin()       = lowerCellBB.xMin();
+         numCellsLowerSlice       = lowerCellBB.numCells();
+
+         // global upper cell BB slice for density measurement (shifted 1 cell away from porous medium in
+         // z-direction)
+         CellInterval upperCellBB = blocks_->getCellBBFromAABB(evalAABB_, level);
+         upperCellBB.xMin()       = upperCellBB.xMax() - cell_idx_c(1);
+         // upperCellBB.xMax()       = upperCellBB.xMin();
+         numCellsUpperSlice       = upperCellBB.numCells();
+
+         // block's cell BB that intersects the global slices (still in global coordinates)
+         CellInterval lowerBlockCellBB = blocks_->getBlockCellBB(*blockIterator);
+         lowerBlockCellBB.intersect(lowerCellBB);
+         CellInterval upperBlockCellBB = blocks_->getBlockCellBB(*blockIterator);
+         upperBlockCellBB.intersect(upperCellBB);
+
+         // transform the global coordinates of relevant cells to block local coordinates
+         CellInterval lowerBlockLocalCellBB;
+         CellInterval upperBlockLocalCellBB;
+         blocks_->transformGlobalToBlockLocalCellInterval(lowerBlockLocalCellBB, *blockIterator, lowerBlockCellBB);
+         blocks_->transformGlobalToBlockLocalCellInterval(upperBlockLocalCellBB, *blockIterator, upperBlockCellBB);
+
+         auto densityField = blockIterator->template getData< ScalarField_T >(ids_.densityField);
+         auto flagField = blockIterator->template getData< FlagField_T >(ids_.flagField);
+
+         // sum density of relevant cells
+         // clang-format off
+         WALBERLA_FOR_ALL_CELLS_IN_INTERVAL_XYZ(lowerBlockLocalCellBB,
+                                                if (isFlagSet(flagField->get(x, y, z), domainFlag))
+                                                {
+                                                   lowerDensity += densityField->get(x, y, z);
+                                                }
+         ) // WALBERLA_FOR_ALL_CELLS_IN_INTERVAL_XYZ
+
+         WALBERLA_FOR_ALL_CELLS_IN_INTERVAL_XYZ(upperBlockLocalCellBB,
+                                                if (isFlagSet(flagField->get(x, y, z), domainFlag))
+                                                {
+                                                   upperDensity += densityField->get(x, y, z);
+                                                }
+         ) // WALBERLA_FOR_ALL_CELLS_IN_INTERVAL_XYZ
+         // clang-format on
+      }
+
+      mpi::allReduceInplace< real_t >(lowerDensity, mpi::SUM);
+      mpi::allReduceInplace< real_t >(upperDensity, mpi::SUM);
+
+      // average density
+      lowerDensity /= real_c(numCellsLowerSlice);
+      upperDensity /= real_c(numCellsUpperSlice);
+
+      pressureDrop_ = (upperDensity - lowerDensity) / real_c(3.0);
+      const real_t timestep = real_c(executionCounter_) * setup_.dt;
+
+
+      WALBERLA_ROOT_SECTION(){
+          std::ofstream outfile( fileName_.c_str(), std::ios_base::app );
+          outfile << std::setprecision(16);
+          outfile << executionCounter_ << "," << timestep << "," << pressureDrop_ << "," << evalAABB_.xMin() << "," << evalAABB_.xMax() << "\n";
+          outfile.close();
+      }
+
+   }
+}; // class PressureDropEvaluator
+
 }
\ No newline at end of file
diff --git a/apps/showcases/PorousMediaGPU/JinCase.prm b/apps/showcases/PorousMediaGPU/JinCase.prm
index dd57dc225..2984378f2 100644
--- a/apps/showcases/PorousMediaGPU/JinCase.prm
+++ b/apps/showcases/PorousMediaGPU/JinCase.prm
@@ -6,7 +6,7 @@ Parameters
     initialiseWithTurbulence false;
     pressureGradient 0.190;
 
-    timesteps 5000001;
+    timesteps 11;
 
     processMemoryLimit 512; // MiB
     innerOuterSplit <1, 1, 1>;
@@ -17,8 +17,8 @@ Parameters
 
 Snapshot
 {
-    loadSnapshot        false;
-    storeSnapshot       true;
+    loadSnapshot        true;
+    storeSnapshot       false;
     recreateBlockForest true;
     snapshotFrequency   10;
     snapshotBaseFolder  snapshots;
@@ -28,6 +28,8 @@ ForcingParameters
 {
     forceUpdateFrequency 1000;
     adaptDrivingForce    false;
+    writeFrequency       100;
+    targetBulkVelocity   0.04;
     proportionalGain     2e-4;
     derivativeGain       1e-6;
     integralGain         2e-4;
@@ -54,9 +56,9 @@ genericPorousMatrix
 DomainSetup
 {
     domainSize    < 12, 8, 4 >;
-    cellsPerBlock < 360, 240, 120 >;
     // cellsPerBlock < 360, 240, 120 >;
-    blocks    < 2, 2, 2 >;
+    cellsPerBlock < 120, 160, 80 >;
+    blocks    < 2, 1, 1 >;
 
     periodic    < true, true, true >;
     refinementLevels 0;
@@ -79,7 +81,7 @@ StabilityChecker
 
 VTKWriter
 {
-    vtkWriteFrequency 100000;
+    vtkWriteFrequency 10;
     vtkGhostLayers 0;
     velocity true;
     writeVelocityAsMagnitude false;
diff --git a/apps/showcases/PorousMediaGPU/JinCaseInflow.prm b/apps/showcases/PorousMediaGPU/JinCaseInflow.prm
index 1a14df767..74ba0e0be 100644
--- a/apps/showcases/PorousMediaGPU/JinCaseInflow.prm
+++ b/apps/showcases/PorousMediaGPU/JinCaseInflow.prm
@@ -6,7 +6,7 @@ Parameters
     initialiseWithTurbulence false;
     pressureGradient 0.190;
 
-    timesteps 100001;
+    timesteps 101;
 
     processMemoryLimit 512; // MiB
     innerOuterSplit <1, 1, 1>;
@@ -19,7 +19,7 @@ Snapshot
 {
     loadSnapshot       false;
     storeSnapshot      true;
-    snapshotFrequency  100000;
+    snapshotFrequency  100;
     snapshotBaseFolder snapshots;
 }
 
@@ -42,9 +42,10 @@ genericPorousMatrix
 DomainSetup
 {
     domainSize    < 12, 8, 4 >;
-    cellsPerBlock < 360, 240, 120 >;
     // cellsPerBlock < 360, 240, 120 >;
-    blocks    < 2, 2, 2 >;
+    cellsPerBlock < 240, 160, 80 >;
+    // blocks    < 2, 2, 2 >;
+    blocks    < 1, 1, 1 >;
 
     periodic    < false, true, true >;
     refinementLevels 0;
@@ -68,7 +69,7 @@ StabilityChecker
 
 VTKWriter
 {
-    vtkWriteFrequency 100000;
+    vtkWriteFrequency 100;
     vtkGhostLayers 0;
     velocity true;
     writeVelocityAsMagnitude false;
diff --git a/apps/showcases/PorousMediaGPU/PorousMediaGPU.cpp b/apps/showcases/PorousMediaGPU/PorousMediaGPU.cpp
index d2d09661e..873e0c112 100644
--- a/apps/showcases/PorousMediaGPU/PorousMediaGPU.cpp
+++ b/apps/showcases/PorousMediaGPU/PorousMediaGPU.cpp
@@ -299,6 +299,8 @@ int main(int argc, char** argv){
       WALBERLA_LOG_INFO_ON_ROOT(" - beginTimeStep = " << beginTimeStep)
    }
 
+   setup.timesteps += beginTimeStep;
+
    const StorageSpecification_T StorageSpec = StorageSpecification_T();
    // Creating fields
    uint_t step = 0;
@@ -384,10 +386,10 @@ int main(int argc, char** argv){
       gpu::fieldCpy< gpu::GPUField< VelocityField_T::value_type >, VelocityField_T >(blocks, ids.velocityFieldInflowSliceGPU, ids.velocityFieldInflowSlice);
    
    }
-   else{
-      for (auto& block : *blocks)
-         sweepCollection.calculateMacroscopicParameters(&block, cell_idx_c(setup.numGhostLayers - 1));
-   }
+   // else{
+   //    for (auto& block : *blocks)
+   //       sweepCollection.calculateMacroscopicParameters(&block, cell_idx_c(setup.numGhostLayers - 1));
+   // }
 
    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    ///                                      LB SWEEPS AND BOUNDARY HANDLING                                       ///
@@ -455,6 +457,23 @@ int main(int argc, char** argv){
    auto nonUniformPackInfo      = lbm_generated::setupNonuniformGPUPdfCommunication< GPUPdfField_T >(blocks, ids.pdfFieldGPU);
    nonUniformCommunication->addPackInfo(nonUniformPackInfo);
 
+   if(setup.loadSnapshot){
+      const uint_t maxLevel = blocks->getDepth();
+      for (uint_t level = 0; level <= maxLevel; level++){
+         nonUniformCommunication->communicateEqualLevel(level);
+         if(level + 1 <= maxLevel){
+            nonUniformCommunication->communicateCoarseToFine(level + 1);
+            nonUniformCommunication->communicateFineToCoarse(level + 1);
+         }
+      }
+   }
+
+   WALBERLA_MPI_BARRIER()
+#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+   WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+   WALBERLA_GPU_CHECK(gpuPeekAtLastError())
+#endif
+
    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    ///                                          TIME STEP DEFINITIONS                                             ///
    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -462,6 +481,7 @@ int main(int argc, char** argv){
    const int streamLowPriority = 0;
    auto defaultStream = gpu::StreamRAII::newPriorityStream(streamLowPriority);
    SweepTimeloop timeloop(blocks->getBlockStorage(), setup.timesteps);
+   timeloop.setCurrentTimeStep(beginTimeStep);
 
    lbm_generated::BasicRecursiveTimeStepGPU< GPUPdfField_T, SweepCollection_T, BoundaryCollection_T >
       LBMMeshRefinement(blocks, ids.pdfFieldGPU, sweepCollection, boundaryCollection, nonUniformCommunication, nonUniformPackInfo);
@@ -473,6 +493,7 @@ int main(int argc, char** argv){
    if(config->getNumBlocks("RoughWall") > 0){
       WALBERLA_LOG_INFO_ON_ROOT(++step << ". Setup Drag Evaluation")
       evaluation = make_shared<DragEvaluation>(blocks, boundaryCollection, ids, setup, config);
+      evaluation->setTotalExecutionCounter(beginTimeStep);
       LBMMeshRefinement.addPostBoundaryHandlingBlockFunction(evaluation->forceCalculationFunctor());
    }
 
@@ -483,18 +504,26 @@ int main(int argc, char** argv){
 
       if(bvc->forceUpdateFrequency() > 0){
          auto updateForce = [&](){
-         if(timeloop.getCurrentTimeStep() % bvc->forceUpdateFrequency()  == 0) {
+         if(timeloop.getCurrentTimeStep() % bvc->forceUpdateFrequency() == 0) {
                const real_t correctedDrivingForce = bvc->getDrivingForce();
-               if(bvc->adaptDrivingForce()){
-                  WALBERLA_LOG_DEVEL_VAR(correctedDrivingForce)
-                  sweepCollection.setFx(correctedDrivingForce);
-               }
+               sweepCollection.setFx(correctedDrivingForce);
             }
          };
          timeloop.addFuncBeforeTimeStep(updateForce, "Update force");
       }
    }
 
+   shared_ptr< PressureDropEvaluator > pde;
+   if(config->getNumBlocks("PressureEvaluation") > 0){
+      WALBERLA_LOG_INFO_ON_ROOT(++step << ". Add pressure drop calculator")
+
+      pde = std::make_shared<PressureDropEvaluator>(blocks, setup, ids, config->getOneBlock("PressureEvaluation"));
+      pde->setInitialTimestep(beginTimeStep);
+      timeloop.addFuncBeforeTimeStep(pde->updateFunctor(), "Pressure Drop Evaluation");
+
+   }
+
+
    std::function<void(IBlock*)> copyPDFs = gpu::fieldCpyFunctor< PdfField_T, gpu::GPUField< PdfField_T::value_type > >(ids.pdfField, ids.pdfFieldGPU);
    std::function<void(IBlock*)> copyVelocityInflow = gpu::fieldCpyFunctor< VelocityField_T, gpu::GPUField< VelocityField_T::value_type > >(ids.velocityFieldInflowSlice, ids.velocityFieldInflowSliceGPU);
    std::function<void(IBlock*)> copyVelocity = gpu::fieldCpyFunctor< VelocityField_T, gpu::GPUField< VelocityField_T::value_type > >(ids.velocityField, ids.velocityFieldGPU);
@@ -541,29 +570,23 @@ int main(int argc, char** argv){
    auto probe = std::make_shared<Probe>(blocks, setup, ids, config->getOneBlock("Probes"));
    probe->setInitialTimestep(beginTimeStep);
 
-/*
-   const real_t lenght = 8.0;
-   const real_t res = 3.0;
-   const real_t start = 0.0;
-   const uint_t nx = uint_c(std::floor(lenght / (setup.dxC * res)));
-   for(uint_t i = 0; i < nx; ++i){
-      probe->addPoint(Vector3<real_t>(start + real_c(i) * (setup.dxC * res), 0.5, 0.5 ));
-   }
-*/ 
-
-   // const uint_t nx = blocks->getXSize() * blocks->getNumberOfXCells();
-   // for(uint_t i = 0; i < nx; ++i){
-   //    probe->addPoint(Vector3<real_t>(real_c(i) * setup.dxC, finalDomain.center()[1], finalDomain.center()[2] ));
-   // }
-   // probe->initialise();
 
    probe->addPoint(finalDomain.center());
+   const real_t xStart = finalDomain.xMin();
+   const uint_t nx = uint_c(finalDomain.xSize() / setup.dxC);
+   for(uint_t i = 0; i <= nx; i+=4){
+      probe->addPoint(Vector3<real_t>(xStart + real_c(i) * setup.dxC, finalDomain.center()[1], finalDomain.center()[2] ));
+   }
+
 
+/*
    const real_t xStart = finalDomain.xMin();
    const uint_t spacing = uint_c(4);
    const uint_t nx = uint_c(finalDomain.xSize() / setup.dxC);
 
-   for(uint_t j = 1; j < 2; ++j){
+   for(uint_t j = 0; j < 3; ++j){
+      const Vector3<real_t> lineCenter(finalDomain.center()[0], real_c(2.0) + real_c(j) * real_c(2.0), finalDomain.center()[2]);
+      probe->addPoint(lineCenter);
       for(uint_t i = 0; i <= nx; i+=spacing){
          probe->addPoint(Vector3<real_t>(xStart + real_c(i) * setup.dxC, real_c(2.0) + real_c(j) * real_c(2.0), finalDomain.center()[2] ));
       }
@@ -572,16 +595,15 @@ int main(int argc, char** argv){
    const real_t yStart = finalDomain.yMin();
    const uint_t ny = uint_c(finalDomain.ySize() / setup.dxC);
 
-   for(uint_t j = 2; j < 3; ++j){
+   for(uint_t j = 0; j < 5; ++j){
+      const Vector3<real_t> lineCenter(real_c(2.0) + real_c(j) * real_c(2.0), finalDomain.center()[1], finalDomain.center()[2]);
+      probe->addPoint(lineCenter);
       for(uint_t i = 0; i <= ny; i+=spacing){
          probe->addPoint(Vector3<real_t>(real_c(2.0) + real_c(j) * real_c(2.0), yStart + real_c(i) * setup.dxC, finalDomain.center()[2] ));
       }
    }
 
-   // for(uint_t i = 0; i < blocks->getYSize() * blocks->getNumberOfYCells(); ++i){
-   //     probe->addPoint(Vector3<real_t>(finalDomain.center()[0], real_c(i) * setup.dxC, finalDomain.center()[2] ));
-   // }
-
+*/
 
    probe->initialise();
 
@@ -695,6 +717,7 @@ int main(int argc, char** argv){
    if(setup.storeSnapshot){
       auto snapShotFunction = [&](){
          if(timeloop.getCurrentTimeStep() > uint_c(0)  && ( (timeloop.getCurrentTimeStep() + 1) % setup.snapshotFrequency) == 0) {
+            WALBERLA_LOG_INFO_ON_ROOT("Writing Snapshot")
 
             for (auto& block : *blocks){
                copyPDFs(&block);
@@ -702,7 +725,7 @@ int main(int argc, char** argv){
                if(copyMeanVelocity){copyMeanVelocity(&block);}
                if(copySOSField){copySOSField(&block);}
             }
-            
+
             blocks->saveBlockData(setup.pdfFieldFile + "_tmp", ids.pdfField);
             blocks->saveBlockData(setup.inflowFile + "_tmp", ids.velocityFieldInflowSlice);
 
diff --git a/apps/showcases/PorousMediaGPU/Probe.h b/apps/showcases/PorousMediaGPU/Probe.h
index 4bf483853..7534da555 100644
--- a/apps/showcases/PorousMediaGPU/Probe.h
+++ b/apps/showcases/PorousMediaGPU/Probe.h
@@ -133,6 +133,7 @@ class Probe
   void update();
   void setInitialTimestep(const uint_t initialTimestep){
     totalExecutionCounter_ = initialTimestep;
+    rampUpTime_ += initialTimestep;
   }
 
 
diff --git a/apps/showcases/PorousMediaGPU/Setup.h b/apps/showcases/PorousMediaGPU/Setup.h
index 950234c94..8749aa716 100644
--- a/apps/showcases/PorousMediaGPU/Setup.h
+++ b/apps/showcases/PorousMediaGPU/Setup.h
@@ -124,6 +124,9 @@ struct Setup
          kinViscosity = viscosity * coarseMeshSize * coarseMeshSize / dt;
       }
 
+      if( parameters.isDefined("drivingForce") )
+         drivingForce = parameters.getParameter< real_t >("drivingForce");
+
       omegaF = lbm_generated::relaxationRateScaling(omega, refinementLevels);
 
       rho = real_c(1.0);
diff --git a/apps/showcases/PorousMediaGPU/TPMS.h b/apps/showcases/PorousMediaGPU/TPMS.h
index 32c3d5afc..e72a72a7e 100644
--- a/apps/showcases/PorousMediaGPU/TPMS.h
+++ b/apps/showcases/PorousMediaGPU/TPMS.h
@@ -56,8 +56,6 @@ class TPMS
       tpmsBeginn_ = tpmsParameters.getParameter< real_t >("start");
       tpmsEnd_    = tpmsParameters.getParameter< real_t >("end");
 
-      WALBERLA_LOG_DEVEL_VAR(waveNumber_)
-
       const std::string tpmsString = tpmsParameters.getParameter< std::string >("type");
 
       if (tpmsString.compare("gyroid") == 0){
diff --git a/apps/showcases/PorousMediaGPU/TPMSPeriodic.prm b/apps/showcases/PorousMediaGPU/TPMSPeriodic.prm
index bb947e407..ed3599bf7 100644
--- a/apps/showcases/PorousMediaGPU/TPMSPeriodic.prm
+++ b/apps/showcases/PorousMediaGPU/TPMSPeriodic.prm
@@ -6,11 +6,12 @@ Parameters
     referenceLength 1.0;
     latticeVelocity 0.025;
     initialiseWithlatticeVelocity false;
+    drivingForce 1e-8;
 
     welfordSamplingStart 0;
     welfordSamplingFrequency 1;
 
-    timesteps 10;
+    timesteps 101;
 
     processMemoryLimit 512; // MiB
     innerOuterSplit <1, 1, 1>;
@@ -23,14 +24,15 @@ Snapshot
 {
     loadSnapshot       true;
     storeSnapshot      true;
-    snapshotFrequency  10;
+    snapshotFrequency  100;
     snapshotBaseFolder snapshots;
 }
 
 ForcingParameters
 {
-    forceUpdateFrequency 0;
+    forceUpdateFrequency 25;
     adaptDrivingForce false;
+    writeFrequency 1;
     // targetMeanVelocityMagnitude 0.02;
     proportionalGain            2e-4;
     derivativeGain              1e-6;
@@ -38,6 +40,12 @@ ForcingParameters
     maxRamp                     1e-4;
 }
 
+PressureEvaluation
+{
+    evaluationAABB [ <0, 0, 0>, <4.5, 1, 1> ];
+    evaluationFrequency 25;
+}
+
 Turbulence
 {
     useGrid false;
@@ -51,7 +59,8 @@ Turbulence
 
 TPMS
 {
-    waveNumber 18.84955592;
+    // waveNumber 18.8495559215;
+    waveNumber 6.2831853072;
     scaling 1.0;
     strut 0.75;
     start 0.0;
@@ -62,11 +71,16 @@ TPMS
 //! [domainSetup]
 DomainSetup
 {
+    // domainSize    < 1, 1, 1 >;
+    // cellsPerBlock < 128, 128, 128 >;
+
+
     domainSize    < 9, 1, 1 >;
-    cellsPerBlock < 270, 30, 30 >;
-    // cellsPerBlock < 2700, 300, 300 >;
+    cellsPerBlock < 288, 32, 32 >;
+    // cellsPerBlock < 720, 320, 320 >;
     // cellsPerBlock < 768, 96, 96 >;
     // cellsPerBlock < 1536, 192, 192 >;
+    // blocks    < 4, 1, 1 >;
     blocks    < 1, 1, 1 >;
     periodic    < true, true, true >;
     refinementLevels 0;
@@ -77,7 +91,8 @@ DomainSetup
 
 Boundaries
 {
-
+    // Border { direction W;    walldistance -1;  flag TurbulentInflow; }
+    // Border { direction E;    walldistance -1;  flag Outflow; }
 }
 
 StabilityChecker
@@ -89,7 +104,7 @@ StabilityChecker
 
 VTKWriter
 {
-    vtkWriteFrequency 10;
+    vtkWriteFrequency 100;
     vtkGhostLayers 0;
     velocity true;
     writeVelocityAsMagnitude false;
@@ -103,7 +118,7 @@ VTKWriter
     oneFilePerProcess false;
     samplingResolution -1;
     initialWriteCallsToSkip 0;
-    lastStepAsVolumeFile true;
+    lastStepAsVolumeFile false;
 }
 
 Logging
@@ -113,12 +128,12 @@ Logging
     readSetupFromFile false;
     remainingTimeLoggerFrequency 20; // in seconds
     writeSurfaceMeshOfTPMS true;
-    filestem TPMS;
+    filestem TPMSPeriodic;
 }
 
 Probes
 {
     evaluationFrequency 1;
-    writeCounter 1000; // after how many evaluations the results are written to file
-    filename probeData.h5;
+    writeCounter 50; // after how many evaluations the results are written to file
+    rampUpTime 50;
 }
\ No newline at end of file
diff --git a/apps/showcases/PorousMediaGPU/roughWall.prm b/apps/showcases/PorousMediaGPU/roughWall.prm
index bf9a0f5f5..aeaa20eab 100644
--- a/apps/showcases/PorousMediaGPU/roughWall.prm
+++ b/apps/showcases/PorousMediaGPU/roughWall.prm
@@ -5,13 +5,11 @@ Parameters
     latticeVelocity 0.05;
     initialiseWithlatticeVelocity true;
     initialiseWithTurbulence true;
-    forceUpdateFrequency 10;
-    adaptDrivingForce false;
 
     welfordSamplingStart 200000;
     welfordSamplingFrequency 1;
 
-    timesteps 10;
+    timesteps 11;
 
     processMemoryLimit 512; // MiB
     innerOuterSplit <1, 1, 1>;
@@ -20,6 +18,27 @@ Parameters
     gpuEnabledMPI false;
 }
 
+Snapshot
+{
+    loadSnapshot       false;
+    storeSnapshot      true;
+    snapshotFrequency  10;
+    snapshotBaseFolder snapshots;
+}
+
+
+ForcingParameters
+{
+    forceUpdateFrequency 10;
+    adaptDrivingForce    true;
+    writeFrequency       100;
+    targetBulkVelocity   0.05;
+    proportionalGain     2e-4;
+    derivativeGain       1e-6;
+    integralGain         2e-4;
+    maxRamp              1e-4;
+}
+
 Turbulence
 {
     useGrid false;
@@ -38,8 +57,8 @@ RoughWall
 
 DomainSetup
 {
-    domainSize    < 48.4, 40, 12 >;
-    cellsPerBlock < 121, 10, 30 >;
+    domainSize    < 40, 40, 12 >;
+    cellsPerBlock < 100, 10, 30 >;
     blocks    < 1, 10, 1 >;
 
     // domainSize    < 48.4, 40, 12 >;
@@ -47,7 +66,7 @@ DomainSetup
     // blocks    < 1, 10, 1 >;
 
     periodic    < true, false, true >;
-    refinementLevels 0;
+    refinementLevels 1;
     numberProcesses 1; // This is for load balancing, overwritten if more than one proc is used
     blockForestFilestem porousMediaBlockForest;
 }
@@ -104,6 +123,6 @@ Probes
 
 DragEvaluation
 {
-    evaluationStart 200000;
-    evaluationFrequency 10;
+    evaluationStart 10;
+    evaluationFrequency 1;
 }
\ No newline at end of file
diff --git a/apps/showcases/PorousMediaGPU/utilGPU.cu b/apps/showcases/PorousMediaGPU/utilGPU.cu
index 741cb6688..c5fddfd81 100644
--- a/apps/showcases/PorousMediaGPU/utilGPU.cu
+++ b/apps/showcases/PorousMediaGPU/utilGPU.cu
@@ -192,22 +192,40 @@ static __global__ void evaluateProbePoints(double* const density, double* const
    }
 }
 
-static __global__ void sumVelocity(double* result, double* const velocity, uint8_t* const flagField, const int64_t n, const double cellVolume, const uint8_t domainFlag,
+static __global__ void sumVelocity(double* result, double* const velocity, uint8_t* const flagField, const int64_t n, const double cellVolume, const uint_t fluidBitNr,
                                    const int64_t xSize, const int64_t ySize, const int64_t zSize,
                                    const int64_t xStrideVelocity, const int64_t yStrideVelocity, const int64_t zStrideVelocity, const int64_t fStrideVelocity,
                                    const int64_t xStrideFlag, const int64_t yStrideFlag, const int64_t zStrideFlag)
 {
-   if(blockDim.x*blockIdx.x + threadIdx.x < xSize && blockDim.y*blockIdx.y + threadIdx.y < ySize && blockDim.z*blockIdx.z + threadIdx.z < zSize)
+   if(blockDim.y*blockIdx.y + threadIdx.y < ySize && blockDim.z*blockIdx.z + threadIdx.z < zSize)
    {
-      const int64_t x = blockDim.x*blockIdx.x + threadIdx.x;
+
+      extern __shared__ double sdata[];
+
+      const int64_t thx = threadIdx.x;
+      const int64_t x = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
       const int64_t y = blockDim.y*blockIdx.y + threadIdx.y;
       const int64_t z = blockDim.z*blockIdx.z + threadIdx.z;
 
-      const double uX = cellVolume * velocity[xStrideVelocity*x + yStrideVelocity*y + zStrideVelocity*z + 0*fStrideVelocity];
+      // const double ux = ((flagField[xStrideFlag*x + yStrideFlag*y + zStrideFlag*z]) >> (fluidBitNr) & 1) * cellVolume * velocity[xStrideVelocity*x + yStrideVelocity*y + zStrideVelocity*z + 0*fStrideVelocity];
+      // atomicAdd(&result[n], ux);
+
+      
+      double firstSum = (x < xSize) ? ((flagField[xStrideFlag*x + yStrideFlag*y + zStrideFlag*z]) >> (fluidBitNr) & 1) * cellVolume * velocity[xStrideVelocity*x + yStrideVelocity*y + zStrideVelocity*z + 0*fStrideVelocity] : 0;
+      if (x + blockDim.x < xSize) firstSum += ((flagField[xStrideFlag*(x + blockDim.x) + yStrideFlag*y + zStrideFlag*z]) >> (fluidBitNr) & 1) * cellVolume * velocity[xStrideVelocity*(x + blockDim.x) + yStrideVelocity*y + zStrideVelocity*z + 0*fStrideVelocity];
+      sdata[thx] = firstSum;
+      __syncthreads();
 
-      if( ( flagField[xStrideFlag*x + yStrideFlag*y + zStrideFlag*z] & domainFlag) == domainFlag ){
-         atomicAdd(&result[n], uX);
+      for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
+         if (thx < s) {
+            sdata[thx] += sdata[thx + s];
+         }
+         __syncthreads();
       }
+
+      if (thx == 0) atomicAdd(&result[n], sdata[0]);
+      
+
    }
 }
 
@@ -285,6 +303,8 @@ void BulkVelocityCalculator::calculateBulkVelocity() {
    dim3 b(block_[0], block_[1], block_[2]);
    dim3 g(grid_[0], grid_[1], grid_[2]);
 
+   const uint_t sharedMemorySize = block_[0] * sizeof(double);
+
    int64_t counter = 0;
    for( auto it = blocks_->begin(); it != blocks_->end(); ++it ){
       auto* block = dynamic_cast< Block* >(it.get());
@@ -310,10 +330,11 @@ void BulkVelocityCalculator::calculateBulkVelocity() {
       const int64_t yStrideFlag = int64_c(flagField->yStride());
       const int64_t zStrideFlag = int64_c(flagField->zStride());
 
-      internal::sumVelocity<<<g, b, 0, nullptr>>>(resultGPU_, data_velocity, data_flagField, counter, cellVolume, setup_.fluidFlag,
-                                                  xSize, ySize, zSize,
-                                                  xStrideVelocity, yStrideVelocity, zStrideVelocity, fStrideVelocity,
-                                                  xStrideFlag, yStrideFlag, zStrideFlag);
+      internal::sumVelocity<<<g, b, sharedMemorySize, nullptr>>>(resultGPU_, data_velocity, data_flagField, counter, cellVolume, fluidBitNr_,
+                                                                 xSize, ySize, zSize,
+                                                                 xStrideVelocity, yStrideVelocity, zStrideVelocity, fStrideVelocity,
+                                                                 xStrideFlag, yStrideFlag, zStrideFlag);
+      counter++;
 
       
    }
diff --git a/apps/showcases/PorousMediaGPU/utilGPU.h b/apps/showcases/PorousMediaGPU/utilGPU.h
index 99bbc9fe5..ad732b0c7 100644
--- a/apps/showcases/PorousMediaGPU/utilGPU.h
+++ b/apps/showcases/PorousMediaGPU/utilGPU.h
@@ -50,26 +50,30 @@ class BulkVelocityCalculator {
 public:
     BulkVelocityCalculator(const std::shared_ptr<StructuredBlockForest> & blocks, const IDs& ids, const Setup& setup, const Config::BlockHandle& forceParameters) 
     : blocks_(blocks), setup_(setup), ids_(ids) {
-#if __CUDA_ARCH__ < 600
-        WALBERLA_LOG_WARNING_ON_ROOT("Warning atomic add in double precision not supported. Falling back to software implementation")
-#endif
 
         bool snapshotExists = false;
         const std::string snapshotFilename = setup_.snapshotBaseFolder + "/" + setup_.fileStem + "DrivingForceSnapshot.file";
-        filesystem::path snapshotFilePath( fileName_.c_str() );
+        filesystem::path snapshotFilePath( snapshotFilename.c_str() );
         if( filesystem::exists( snapshotFilePath ) )
             snapshotExists = true;
 
-
         if(snapshotExists && setup_.loadSnapshot){
             WALBERLA_LOG_INFO_ON_ROOT("Intialise PID Controller from Snapshot")
             forceUpdateFrequency_ = forceParameters.getParameter< uint_t >("forceUpdateFrequency", 0);
             adaptDrivingForce_ = forceParameters.getParameter< bool >("adaptDrivingForce", false);
+            writeFrequency_ = forceParameters.getParameter< uint_t >("writeFrequency", 100);
+            targetBulkVelocity_ = forceParameters.isDefined("targetBulkVelocity") ? forceParameters.getParameter< real_t >("targetBulkVelocity") : setup_.latticeVelocity;
             readStateFromFile();
+
+            if(!adaptDrivingForce_){
+                drivingForce_ = setup_.drivingForce;
+            }
         }
         else{
             forceUpdateFrequency_ = forceParameters.getParameter< uint_t >("forceUpdateFrequency", 0);
             adaptDrivingForce_ = forceParameters.getParameter< bool >("adaptDrivingForce", false);
+            writeFrequency_ = forceParameters.getParameter< uint_t >("writeFrequency", 100);
+            targetBulkVelocity_ = forceParameters.isDefined("targetBulkVelocity") ? forceParameters.getParameter< real_t >("targetBulkVelocity") : setup_.latticeVelocity;
             bulkVelocity_ = setup_.latticeVelocity;
             drivingForce_ = setup_.drivingForce;
 
@@ -119,6 +123,8 @@ public:
         WALBERLA_GPU_CHECK(gpuMalloc((void **)&resultGPU_, sizeof(double) * resultCPU_.size() ));
         WALBERLA_GPU_CHECK(gpuMemset(resultGPU_, 0.0, sizeof(double) * resultCPU_.size()));
 
+        fluidBitNr_ = uint_c(std::log(setup_.fluidFlag) / std::log(2));
+
         fileName_ = setup_.outputFolder + "/" + setup_.fileStem + "DrivingForce.csv"; 
         filesystem::path drivingForceOuputFile( fileName_.c_str() );
 
@@ -190,7 +196,20 @@ public:
     real_t getDrivingForce() {
         calculateBulkVelocity();
 
-        const real_t error = setup_.latticeVelocity - bulkVelocity_;
+        const real_t timestep = real_c(executionCounter_ * forceUpdateFrequency_) * setup_.dt;
+        if(!adaptDrivingForce_){
+            if(executionCounter_ % writeFrequency_ == 0){
+                WALBERLA_ROOT_SECTION(){
+                    std::ofstream outfile( fileName_.c_str(), std::ios_base::app );
+                    outfile << executionCounter_ * forceUpdateFrequency_ << "," << timestep << "," << drivingForce_ << "," << bulkVelocity_ << "\n";
+                    outfile.close();
+                }
+            }
+            executionCounter_++;
+            return drivingForce_;
+        }
+
+        const real_t error = targetBulkVelocity_ - bulkVelocity_;
         const real_t d = (error + real_c(3.0) * errorHistory_[0] - real_c(3.0) * errorHistory_[1] - errorHistory_[2]) * (real_c(1.0) / real_c(6.0));
 
         std::rotate(errorHistory_, errorHistory_ + 1, errorHistory_ + sizeof(errorHistory_) / sizeof(real_t));
@@ -208,11 +227,12 @@ public:
 
         drivingForce_ = std::min(std::max(minValue, newDrivingForce), maxValue);
 
-        const real_t timestep = real_c(executionCounter_ * forceUpdateFrequency_) * setup_.dt;
-        WALBERLA_ROOT_SECTION(){
-            std::ofstream outfile( fileName_.c_str(), std::ios_base::app );
-            outfile << executionCounter_ * forceUpdateFrequency_ << "," << timestep << "," << drivingForce_ << "," << bulkVelocity_ << "\n";
-            outfile.close();
+        if(executionCounter_ % writeFrequency_ == 0){
+            WALBERLA_ROOT_SECTION(){
+                std::ofstream outfile( fileName_.c_str(), std::ios_base::app );
+                outfile << executionCounter_ * forceUpdateFrequency_ << "," << timestep << "," << drivingForce_ << "," << bulkVelocity_ << "\n";
+                outfile.close();
+            }
         }
 
         executionCounter_++;
@@ -225,10 +245,12 @@ private:
     const IDs ids_;
 
     uint_t forceUpdateFrequency_;
+    uint_t writeFrequency_;
     bool adaptDrivingForce_;
 
     real_t bulkVelocity_;
     real_t drivingForce_;
+    real_t targetBulkVelocity_;
 
     real_t proportionalGain_;
     real_t derivateGain_;
@@ -245,6 +267,7 @@ private:
     std::string fileName_;
 
     double volume_;
+    uint_t fluidBitNr_;
 
     std::vector<double> resultCPU_;
     double* resultGPU_;
-- 
GitLab