From 902a0196237a1ab9356bc0f1835fa6f324a75db4 Mon Sep 17 00:00:00 2001
From: Markus Holzer <markus.holzer@onera.fr>
Date: Fri, 21 Feb 2025 14:01:14 +0100
Subject: [PATCH] [skip ci] update

---
 apps/benchmarks/NonUniformGridGPU/LdcSetup.h  |  17 +-
 .../NonUniformGridGPU/NonUniformGridGPU.cpp   |   3 +-
 .../NonUniformGridGPU/NonUniformGridGPU.py    |  12 +-
 .../simulation_setup/benchmark_configs.py     |  15 +-
 apps/showcases/Debug/CMakeLists.txt           |  49 ++
 apps/showcases/Debug/Debug.cpp                | 590 ++++++++++++++++++
 apps/showcases/Debug/Debug.py                 | 126 ++++
 apps/showcases/PorousMediaGPU/CMakeLists.txt  |  19 +-
 apps/showcases/PorousMediaGPU/Debug.cpp       | 468 ++++++++++++++
 apps/showcases/PorousMediaGPU/Debug.py        | 127 ++++
 .../PorousMediaGPU/DragEvaluation.cpp         |  48 +-
 .../showcases/PorousMediaGPU/GridGeneration.h |  19 +-
 .../PorousMediaGPU/InitializerFunctions.cpp   |  27 +-
 .../PorousMediaGPU/PorousMediaGPU.cpp         | 151 +++--
 .../PorousMediaGPU/PorousMediaGPU.py          |  39 +-
 apps/showcases/PorousMediaGPU/Setup.h         |  44 +-
 apps/showcases/PorousMediaGPU/TPMS.prm        |   4 +-
 apps/showcases/PorousMediaGPU/roughWall.prm   |  24 +-
 python/lbmpy_walberla/packing_kernels.py      |   6 +-
 python/lbmpy_walberla/sweep_collection.py     |  14 -
 python/lbmpy_walberla/walberla_lbm_package.py |  25 +-
 python/pystencils_walberla/jinja_filters.py   |  88 ++-
 .../gpu/BasicRecursiveTimeStepGPU.impl.h      |   2 +-
 .../refinement/RefinementScaling.h            |  17 +-
 24 files changed, 1753 insertions(+), 181 deletions(-)
 create mode 100644 apps/showcases/Debug/CMakeLists.txt
 create mode 100644 apps/showcases/Debug/Debug.cpp
 create mode 100644 apps/showcases/Debug/Debug.py
 create mode 100644 apps/showcases/PorousMediaGPU/Debug.cpp
 create mode 100644 apps/showcases/PorousMediaGPU/Debug.py

diff --git a/apps/benchmarks/NonUniformGridGPU/LdcSetup.h b/apps/benchmarks/NonUniformGridGPU/LdcSetup.h
index b8431f2f2..49564790f 100644
--- a/apps/benchmarks/NonUniformGridGPU/LdcSetup.h
+++ b/apps/benchmarks/NonUniformGridGPU/LdcSetup.h
@@ -48,13 +48,18 @@ class LDCRefinement
    {
       const AABB & domain = forest.getDomain();
 
-      const AABB leftCorner( 0, domain.yMax() -1, 0, 1, domain.yMax() , domain.zMax() );
-      const AABB rightCorner( domain.xMax() - 1, domain.yMax() -1, 0, domain.xMax(), domain.yMax() , domain.zMax() );
+      const real_t t = 1.0;
+
+      const AABB minAABB(domain.xMin(), domain.yMin(), domain.zMin(), domain.xMax(), domain.yMin() + t, domain.zMax());
+      const AABB maxAABB(domain.xMin(), domain.yMax() - t, domain.zMin(), domain.xMax(), domain.yMax(), domain.zMax());
+
+      // const AABB leftCorner( 0, domain.yMax() -1, 0, 1, domain.yMax() , domain.zMax() );
+      // const AABB rightCorner( domain.xMax() - 1, domain.yMax() -1, 0, domain.xMax(), domain.yMax() , domain.zMax() );
 
       for(auto & block : forest)
       {
          auto & aabb = block.getAABB();
-         if( leftCorner.intersects( aabb ) || rightCorner.intersects( aabb ) )
+         if( minAABB.intersects( aabb ) || maxAABB.intersects( aabb ) )
          {
             if( block.getLevel() < refinementDepth_)
                block.setMarker( true );
@@ -88,14 +93,14 @@ class LDC
          const uint_t level       = b.getLevel();
          auto flagField     = b.getData< FlagField_T >(flagFieldID);
          const uint8_t noslipFlag = flagField->registerFlag(noSlipFlagUID_);
-         const uint8_t ubbFlag    = flagField->registerFlag(ubbFlagUID_);
+         // const uint8_t ubbFlag    = flagField->registerFlag(ubbFlagUID_);
          for (auto cIt = flagField->beginWithGhostLayerXYZ(2); cIt != flagField->end(); ++cIt)
          {
             const Cell localCell = cIt.cell();
             Cell globalCell(localCell);
             sbfs.transformBlockLocalToGlobalCell(globalCell, b);
-            if (globalCell.y() >= cell_idx_c(sbfs.getNumberOfYCells(level))) { flagField->addFlag(localCell, ubbFlag); }
-            else if (globalCell.y() < 0 || globalCell.x() < 0 || globalCell.x() >= cell_idx_c(sbfs.getNumberOfXCells(level)))
+            if (globalCell.y() >= cell_idx_c(sbfs.getNumberOfYCells(level))) { flagField->addFlag(localCell, noslipFlag); }
+            else if (globalCell.y() < 0)
             {
                flagField->addFlag(localCell, noslipFlag);
             }
diff --git a/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp b/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp
index 818f612b9..b824434af 100644
--- a/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp
+++ b/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp
@@ -98,6 +98,7 @@ int main(int argc, char** argv)
       // Reading parameters
       auto parameters   = config->getOneBlock("Parameters");
       const real_t omega             = parameters.getParameter< real_t >("omega", real_c(1.95));
+      const real_t fx             = parameters.getParameter< real_t >("force", real_c(1e-6));
       const uint_t timesteps         = parameters.getParameter< uint_t >("timesteps", uint_c(50));
       const bool gpuEnabledMPI       = parameters.getParameter< bool >("gpuEnabledMPI", false);
 
@@ -145,7 +146,7 @@ int main(int argc, char** argv)
          Cell(parameters.getParameter< Vector3< cell_idx_t > >("innerOuterSplit", Vector3< cell_idx_t >(1, 1, 1)));
       Vector3< int32_t > gpuBlockSize =
          parameters.getParameter< Vector3< int32_t > >("gpuBlockSize", Vector3< int32_t >(256, 1, 1));
-      SweepCollection_T sweepCollection(blocks, pdfFieldGpuID, densityFieldGpuID, velFieldGpuID, gpuBlockSize[0],
+      SweepCollection_T sweepCollection(blocks, pdfFieldGpuID, densityFieldGpuID, velFieldGpuID, fx, gpuBlockSize[0],
                                         gpuBlockSize[1], gpuBlockSize[2], omega, innerOuterSplit);
 
       for (auto& iBlock : *blocks){
diff --git a/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.py b/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.py
index a1f5bff23..ebe42a676 100644
--- a/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.py
+++ b/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.py
@@ -11,6 +11,7 @@ from lbmpy import LBMConfig, LBMOptimisation, Stencil, Method, LBStencil, Subgri
 
 from pystencils_walberla import CodeGeneration, generate_info_header
 from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator
+from lbmpy_walberla import RefinementScaling
 
 omega = sp.symbols("omega")
 omega_free = sp.Symbol("omega_free")
@@ -36,8 +37,8 @@ with CodeGeneration() as ctx:
 
     streaming_pattern = 'esopull'
     timesteps = get_timesteps(streaming_pattern)
-    stencil = LBStencil(Stencil.D3Q19)
-    method_enum = Method.CUMULANT
+    stencil = LBStencil(Stencil.D3Q27)
+    method_enum = Method.SRT
 
     fourth_order_correction = 0.01 if method_enum == Method.CUMULANT and stencil.Q == 27 else False
     collision_setup = "cumulant-K17" if fourth_order_correction else method_enum.name.lower()
@@ -47,7 +48,9 @@ with CodeGeneration() as ctx:
     density_field, velocity_field = ps.fields(f"density, velocity(3) : {field_type}[3D]", layout='fzyx')
     macroscopic_fields = {'density': density_field, 'velocity': velocity_field}
 
+    fx = sp.Symbol("Fx")
     lbm_config = LBMConfig(stencil=stencil, method=method_enum, relaxation_rate=omega, compressible=True,
+                           force=tuple([fx, 0.0, 0.0]),
                            fourth_order_correction=fourth_order_correction,
                            streaming_pattern=streaming_pattern)
     lbm_opt = LBMOptimisation(cse_global=False, field_layout='fzyx')
@@ -60,10 +63,13 @@ with CodeGeneration() as ctx:
     ubb = lbm_boundary_generator(class_name='UBB', flag_uid='UBB',
                                  boundary_object=UBB([0.05, 0, 0], data_type=field_type))
 
+    refinement_scaling = RefinementScaling()
+    refinement_scaling.add_standard_relaxation_rate_scaling(omega)
+    refinement_scaling.add_force_scaling(fx)
     generate_lbm_package(ctx, name="NonUniformGridGPU",
                          collision_rule=collision_rule,
                          lbm_config=lbm_config, lbm_optimisation=lbm_opt,
-                         nonuniform=True, boundaries=[no_slip, ubb],
+                         nonuniform=True, refinement_scaling=refinement_scaling, boundaries=[no_slip, ubb],
                          macroscopic_fields=macroscopic_fields,
                          target=ps.Target.GPU, gpu_indexing_params=gpu_indexing_params,
                          max_threads=max_threads)
diff --git a/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py
index 6f7b6820a..f876ae86f 100644
--- a/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py
+++ b/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py
@@ -40,7 +40,7 @@ class Scenario:
         self.domain_size = domain_size
         self.root_blocks = root_blocks
         self.cells_per_block = cells_per_block
-        self.periodic = (0, 0, 1)
+        self.periodic = (1, 0, 1)
 
         self.refinement_depth = refinement_depth
         self.num_processes = num_processes
@@ -72,16 +72,17 @@ class Scenario:
                 'blockForestFilestem': self.bfs_filestem,
                 'writeVtk': self.write_setup_vtk,
                 'outputStatistics': True,
-                'writeSetupForestAndReturn': True,
+                'writeSetupForestAndReturn': False,
             },
             'Parameters': {
                 'omega': 1.95,
+                'force': 1e-6,
                 'timesteps': self.timesteps,
                 'remainingTimeLoggerFrequency': self.logger_frequency,
                 'vtkWriteFrequency': self.vtk_write_frequency,
-                'useVTKAMRWriter': True,
+                'useVTKAMRWriter': False,
                 'oneFilePerProcess': False,
-                'writeOnlySlice': False,
+                'writeOnlySlice': True,
                 'gpuEnabledMPI': self.gpu_enabled_mpi,
                 'gpuBlockSize': (128, 1, 1),
             },
@@ -144,9 +145,9 @@ def validation_run():
     scenario = Scenario(domain_size=domain_size,
                         root_blocks=root_blocks,
                         cells_per_block=cells_per_block,
-                        timesteps=0,
-                        vtk_write_frequency=0,
-                        refinement_depth=3,
+                        timesteps=1001,
+                        vtk_write_frequency=1000,
+                        refinement_depth=1,
                         gpu_enabled_mpi=False)
     scenarios.add(scenario)
 
diff --git a/apps/showcases/Debug/CMakeLists.txt b/apps/showcases/Debug/CMakeLists.txt
new file mode 100644
index 000000000..31af001f3
--- /dev/null
+++ b/apps/showcases/Debug/CMakeLists.txt
@@ -0,0 +1,49 @@
+waLBerla_link_files_to_builddir( *.py )
+waLBerla_link_files_to_builddir( *.prm )
+waLBerla_link_files_to_builddir( *.png )
+waLBerla_link_files_to_builddir( *.obj )
+waLBerla_link_files_to_builddir( *.stl )
+waLBerla_link_files_to_builddir( *.mtl )
+
+waLBerla_generate_target_from_python(NAME PorousMediaGPUCodeGen
+        FILE PorousMediaGPU.py
+        OUT_FILES PorousMediaGPUSweepCollection.h PorousMediaGPUSweepCollection.${CODEGEN_FILE_SUFFIX}
+        PorousMediaGPUStorageSpecification.h PorousMediaGPUStorageSpecification.${CODEGEN_FILE_SUFFIX}
+        NoSlip.h NoSlip.${CODEGEN_FILE_SUFFIX}
+        Obstacle.h Obstacle.${CODEGEN_FILE_SUFFIX}
+        UBB.h UBB.${CODEGEN_FILE_SUFFIX}
+        TurbulentInflow.h TurbulentInflow.${CODEGEN_FILE_SUFFIX}
+        Outflow.h Outflow.${CODEGEN_FILE_SUFFIX}
+        Welford.h Welford.${CODEGEN_FILE_SUFFIX}
+        PorousMediaGPUBoundaryCollection.h
+        PorousMediaGPUInfoHeader.h
+        PorousMediaGPUStaticDefines.h)
+
+if (WALBERLA_BUILD_WITH_CUDA OR WALBERLA_BUILD_WITH_HIP)
+    waLBerla_add_executable ( NAME PorousMediaGPU
+            FILES PorousMediaGPU.cpp GridGeneration.h Setup.h Types.h DragEvaluation.cpp InitializerFunctions.cpp TPMS.cpp Inflow.cpp Probe.cpp util.cpp utilGPU.cu
+            DEPENDS blockforest boundary core field gpu io lbm_generated postprocessing stencil timeloop vtk PorousMediaGPUCodeGen )
+else()
+    waLBerla_add_executable ( NAME PorousMediaGPU
+            FILES PorousMediaGPU.cpp GridGeneration.h Setup.h Types.h DragEvaluation.cpp InitializerFunctions.cpp TPMS.cpp Inflow.cpp Probe.cpp util.cpp
+            DEPENDS blockforest boundary core field io lbm_generated postprocessing stencil timeloop vtk PorousMediaGPUCodeGen )
+
+endif(WALBERLA_BUILD_WITH_CUDA OR WALBERLA_BUILD_WITH_HIP)
+
+waLBerla_generate_target_from_python(NAME DebugCodeGen
+        FILE Debug.py
+        OUT_FILES DebugSweepCollection.h DebugSweepCollection.cpp
+        DebugStorageSpecification.h DebugStorageSpecification.cpp
+        NoSlip.h NoSlip.cpp
+        Obstacle.h Obstacle.cpp
+        UBB.h UBB.cpp
+        TurbulentInflow.h TurbulentInflow.cpp
+        Outflow.h Outflow.cpp
+        Welford.h Welford.cpp
+        DebugBoundaryCollection.h
+        DebugInfoHeader.h
+        DebugStaticDefines.h)
+
+waLBerla_add_executable ( NAME Debug
+        FILES Debug.cpp GridGeneration.h Setup.h Types.h DragEvaluation.cpp InitializerFunctions.cpp TPMS.cpp Inflow.cpp Probe.cpp util.cpp
+        DEPENDS blockforest boundary core field io lbm_generated postprocessing stencil timeloop vtk DebugCodeGen )
diff --git a/apps/showcases/Debug/Debug.cpp b/apps/showcases/Debug/Debug.cpp
new file mode 100644
index 000000000..c7b8a949f
--- /dev/null
+++ b/apps/showcases/Debug/Debug.cpp
@@ -0,0 +1,590 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file PorousMediaGPU.cpp
+//! \author Markus Holzer <markus.holzer@fau.de>
+//
+//======================================================================================================================
+
+#include "blockforest/Initialization.h"
+#include "blockforest/StructuredBlockForest.h"
+#include "blockforest/communication/UniformBufferedScheme.h"
+
+#include "core/Abort.h"
+#include "core/DataTypes.h"
+#include "core/SharedFunctor.h"
+#include "core/debug/CheckFunctions.h"
+#include "core/logging/Logging.h"
+#include "core/math/Vector3.h"
+#include "core/math/Constants.h"
+#include "core/Environment.h"
+#include "core/mpi/MPIManager.h"
+#include "core/MemoryUsage.h"
+#include "core/mpi/Reduce.h"
+#include "core/timing/RemainingTimeLogger.h"
+#include "core/timing/TimingPool.h"
+
+#include "field/AddToStorage.h"
+#include "field/CellCounter.h"
+#include "field/FlagField.h"
+#include "field/StabilityChecker.h"
+#include "field/adaptors/AdaptorCreators.h"
+#include "field/iterators/FieldIterator.h"
+#include "field/vtk/VTKWriter.h"
+#include "field/vtk/FlagFieldCellFilter.h"
+
+#include "geometry/InitBoundaryHandling.h"
+#include "geometry/mesh/TriangleMeshIO.h"
+
+#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+#   include "gpu/AddGPUFieldToStorage.h"
+#   include "gpu/DeviceSelectMPI.h"
+#   include "gpu/ErrorChecking.h"
+#   include "gpu/FieldCopy.h"
+#   include "gpu/HostFieldAllocator.h"
+#   include "gpu/ParallelStreams.h"
+#   include "gpu/communication/NonUniformGPUScheme.h"
+#endif
+
+#include "lbm_generated/communication/UniformGeneratedPdfPackInfo.h"
+#include "lbm_generated/evaluation/PerformanceEvaluation.h"
+#include "lbm_generated/field/AddToStorage.h"
+#include "lbm_generated/field/PdfField.h"
+
+#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+#   include "lbm_generated/gpu/AddToStorage.h"
+#   include "lbm_generated/gpu/GPUPdfField.h"
+#   include "lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.h"
+#   include "lbm_generated/gpu/BasicRecursiveTimeStepGPU.h"
+#endif
+
+#include "postprocessing/FieldToSurfaceMesh.h"
+
+#include "timeloop/SweepTimeloop.h"
+#include "vtk/VTKOutput.h"
+
+#include <cstdlib>
+#include <iostream>
+#include <memory>
+
+#include "DragEvaluation.h"
+#include "Inflow.h"
+#include "InitializerFunctions.h"
+#include "GridGeneration.h"
+#include "Setup.h"
+#include "Probe.h"
+#include "TPMS.h"
+#include "Types.h"
+#include "util.h"
+
+#include "PorousMediaGPUStaticDefines.h"
+
+using namespace walberla;
+using macroFieldType = VelocityField_T::value_type;
+
+#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+using GPUPdfField_T = lbm_generated::GPUPdfField< StorageSpecification_T >;
+using gpu::communication::NonUniformGPUScheme;
+using lbm_generated::NonuniformGeneratedGPUPdfPackInfo;
+#else
+using PdfField_T = lbm_generated::PdfField< StorageSpecification_T >;
+using blockforest::communication::UniformBufferedScheme;
+
+using lbm_generated::UniformGeneratedPdfPackInfo;
+#endif
+
+inline Vector3< uint_t > inflowSliceSize( const shared_ptr< StructuredBlockStorage > & blocks, IBlock * const block ){
+   return Vector3<uint_t>( uint_c(1), blocks->getNumberOfYCells( *block ), blocks->getNumberOfZCells( *block ) );
+}
+
+class HemisphereWallDistance
+{
+ public:
+   HemisphereWallDistance(const real_t radius, const real_t hemispheresDistance) : radius_(radius), hemispheresDistance_(hemispheresDistance){} 
+
+   real_t operator()(const Cell& fluidCell, const Cell& boundaryCell,
+                     const shared_ptr< StructuredBlockForest >& SbF, IBlock& block){
+
+      auto domain = SbF->getDomain();
+      const real_t yMin = domain.yMin();
+      const real_t yMax = domain.yMax();
+
+      Vector3< real_t > boundary = SbF->getBlockLocalCellCenter( block, boundaryCell );
+      Vector3< real_t > fluid    = SbF->getBlockLocalCellCenter( block, fluidCell );
+
+      auto center = getCenterOfHemisphere(boundary, radius_, hemispheresDistance_, yMin, yMax);
+
+      hemispheresContains(boundary, center, radius_);
+
+      WALBERLA_CHECK(!hemispheresContains(fluid, center, radius_), "fluid cell is in contained in Hemisphere (" << fluid[0] << ", " << fluid[1] << ", " << fluid[2] << "). The block local cell is " << fluidCell)
+      WALBERLA_CHECK(hemispheresContains(boundary, center, radius_), "boundary cell is not in contained in Hemisphere (" << boundary[0] << ", " << boundary[1] << ", " << boundary[2] << "). The block local cell is " << boundaryCell)
+
+      return delta(fluid, boundary, center);
+   }
+
+   real_t delta(const Vector3< real_t >& fluid, const Vector3< real_t >& boundary, const Vector3< real_t >& center) const
+   {
+      // http://devmag.org.za/2009/04/17/basic-collision-detection-in-2d-part-2/
+      const Vector3< real_t > f = fluid - center;
+      const Vector3< real_t > d = (boundary - center) - f;
+
+      const real_t dDotd = d[0] * d[0] + d[1] * d[1] + d[2] * d[2];
+      const real_t fDotf = f[0] * f[0] + f[1] * f[1] + f[2] * f[2];
+      const real_t dDotf = d[0] * f[0] + d[1] * f[1] + d[2] * f[2];
+
+      const real_t b = real_c(2.0) * dDotf;
+      const real_t c = fDotf - (radius_ * radius_);
+
+      const real_t bb4ac = b * b - (real_c(4.0) * dDotd * c);
+      WALBERLA_CHECK_GREATER_EQUAL(bb4ac, real_c(0.0))
+
+      const real_t sqrtbb4ac = std::sqrt(bb4ac);
+      const real_t alpha = std::min((-b + sqrtbb4ac) / (real_c(2.0) * dDotd), (-b - sqrtbb4ac) / (real_c(2.0) * dDotd));
+
+      WALBERLA_CHECK_GREATER_EQUAL(alpha, real_c(0.0))
+      WALBERLA_CHECK_LESS_EQUAL(alpha, real_c(1.0))
+
+      return alpha;
+   }
+
+
+private:
+   const real_t radius_;
+   const real_t hemispheresDistance_;
+};
+
+real_t defaultWallDistance(const Cell& /*fluidCell*/, const Cell& /*boundaryCell*/, const shared_ptr< StructuredBlockForest >& /*SbF*/, IBlock& /*block*/){
+   return real_c(0.5);
+}
+
+void refineWalls(SetupBlockForest& forest, const real_t hemispheresRadius, const real_t dx, const uint_t level)
+{
+   auto domain = forest.getDomain();
+   const real_t yMin = domain.yMin();
+   const real_t yMax = domain.yMax();
+
+   const real_t t = hemispheresRadius + dx;
+
+   const AABB minAABB(domain.xMin(), domain.yMin(), domain.zMin(), domain.xMax(), domain.yMin() + t, domain.zMax());
+   // const AABB maxAABB(domain.xMin(), domain.yMax() - t, domain.zMin(), domain.xMax(), domain.yMax(), domain.zMax());
+
+   for (auto block = forest.begin(); block != forest.end(); ++block){
+      const AABB& aabb = block->getAABB();
+
+      // if (block->getLevel() < level && (minAABB.intersects(aabb) || maxAABB.intersects(aabb)) ){
+      if (block->getLevel() < level && minAABB.intersects(aabb) ){
+         block->setMarker(true);
+      }
+   }
+}
+
+
+
+int main(int argc, char** argv){
+   Environment env( argc, argv );
+   mpi::MPIManager::instance()->useWorldComm();
+#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+   gpu::selectDeviceBasedOnMpiRank();
+#endif
+
+   auto config = env.config();
+
+   //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+   ///                                        SETUP AND CONFIGURATION                                             ///
+   //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+   auto parameters        = config->getOneBlock("Parameters");
+   auto loggingParameters = config->getOneBlock("Logging");
+   auto boundariesConfig  = config->getBlock("Boundaries");
+
+   Setup setup(config, infoMap);
+   WALBERLA_LOG_INFO_ON_ROOT( "Simulation run data:"
+                              "\n- simulation parameters:"   << std::setprecision(16) <<
+                              "\n   + collision model:     " << setup.collisionModel <<
+                              "\n   + stencil:             " << setup.stencil <<
+                              "\n   + streaming:           " << setup.streamingPattern <<
+                              "\n   + compressible:        " << ( StorageSpecification_T::compressible ? "yes" : "no" ) <<
+                              "\n- simulation properties:  "
+                              "\n   + kin. viscosity:      " << setup.kinViscosity << " [m^2/s] (on the coarsest grid)" <<
+                              "\n   + viscosity LB:        " << setup.viscosity  << " [dx*dx/dt] (on the coarsest grid)" <<
+                              "\n   + omega:               " << setup.omega << " (on the coarsest grid)" <<
+                              "\n   + omega:               " << setup.omegaF << " (on the finest grid)" <<
+                              "\n   + inflow velocity:     " << setup.referenceVelocity << " [m/s]" <<
+                              "\n   + lattice velocity:    " << setup.latticeVelocity << " [dx/dt]" <<
+                              "\n   + Reynolds number:     " << setup.reynoldsNumber <<
+                              "\n   + Mach number:         " << setup.machNumber <<
+                              "\n   + driving force:       " << setup.drivingForce <<
+                              "\n   + dx (coarsest grid):  " << setup.dxC << " [m]" <<
+                              "\n   + dx (finest grid):    " << setup.dxF << " [m]" <<
+                              "\n   + dt (coarsest grid):  " << setup.dt << " [s]" <<
+                              "\n   + conversion Faktor:   " << setup.conversionFactor <<
+                              "\n   + #time steps:         " << setup.timesteps << " (on the coarsest grid, " << ( real_c(1.0) / setup.dt ) << " for 1s of real time)" <<
+                              "\n   + simulation time:     " << ( real_c( setup.timesteps ) * setup.dt ) << " [s]" )
+
+   bool writeSetupForestAndReturn = loggingParameters.getParameter< bool >("writeSetupForestAndReturn", false);
+   if (uint_c(MPIManager::instance()->numProcesses()) > 1) writeSetupForestAndReturn = false;
+   shared_ptr< BlockForest > bfs;
+
+   auto refinementSelection = std::bind(refineWalls, _1, real_c(1.0), setup.dxF, setup.refinementLevels);
+
+   createBlockForest(bfs, setup, refinementSelection);
+
+   if (writeSetupForestAndReturn && mpi::MPIManager::instance()->numProcesses() == 1){
+      WALBERLA_LOG_INFO_ON_ROOT("BlockForest has been created and writen to file. Returning program")
+      logging::Logging::printFooterOnStream();
+      return EXIT_SUCCESS;
+   }
+
+   auto blocks = std::make_shared< StructuredBlockForest >(bfs, setup.cellsPerBlock[0], setup.cellsPerBlock[1], setup.cellsPerBlock[2]);
+   blocks->createCellBoundingBoxes();
+   auto finalDomain = blocks->getDomain();
+
+   const StorageSpecification_T StorageSpec = StorageSpecification_T();
+   // Creating fields
+   uint_t step = 0;
+   WALBERLA_LOG_INFO_ON_ROOT(++step << ". Allocating data")
+   IDs ids;
+   ids.pdfField  = lbm_generated::addPdfFieldToStorage(blocks, "pdfs", StorageSpec, uint_c(1), field::fzyx);
+
+   auto allocator = make_shared< gpu::HostFieldAllocator<macroFieldType> >(); // use pinned memory allocator for faster CPU-GPU memory transfers
+   ids.velocityField = field::addToStorage< VelocityField_T >(blocks, "vel", macroFieldType(0.0), field::fzyx, uint_c(1), allocator);
+   ids.meanVelocityField = field::addToStorage< VelocityField_T >(blocks, "mean velocity", macroFieldType(0.0), field::fzyx, uint_c(1), allocator);
+   ids.densityField  = field::addToStorage< ScalarField_T >(blocks, "density", macroFieldType(1.0), field::fzyx, uint_c(1), allocator);
+   ids.flagField     = field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field");
+
+   ids.pdfFieldGPU      = lbm_generated::addGPUPdfFieldToStorage< PdfField_T >(blocks, ids.pdfField, StorageSpec, "pdfs on GPU", true);
+   ids.velocityFieldGPU = gpu::addGPUFieldToStorage< VelocityField_T >(blocks, ids.velocityField, "velocity on GPU", true);
+   ids.meanVelocityFieldGPU = gpu::addGPUFieldToStorage< VelocityField_T >(blocks, ids.meanVelocityField, "mean velocity on GPU", true);
+   ids.densityFieldGPU  = gpu::addGPUFieldToStorage< ScalarField_T >(blocks, ids.densityField, "velocity on GPU", true);
+
+
+   std::function< Vector3< uint_t > (const shared_ptr< StructuredBlockStorage >&, IBlock * const) > getInflowSliceSize = inflowSliceSize;
+   ids.velocityFieldInflowSlice    = field::addToStorage< VelocityField_T >(blocks, "velocity Inflow", getInflowSliceSize, macroFieldType(0.0), field::fzyx, uint_c(1));
+   ids.velocityFieldInflowSliceGPU = gpu::addGPUFieldToStorage< VelocityField_T >(blocks, ids.velocityFieldInflowSlice, "velocity Inflow on GPU", true);
+
+   const Cell innerOuterSplit = Cell(parameters.getParameter< Vector3<cell_idx_t> >("innerOuterSplit", Vector3<cell_idx_t>(1, 1, 1)));
+   SweepCollection_T sweepCollection(blocks, ids.pdfFieldGPU, ids.densityFieldGPU, ids.velocityFieldGPU, setup.drivingForce, setup.omega, innerOuterSplit);
+   WelfordSweep_T welfordSweep(ids.meanVelocityFieldGPU, ids.velocityFieldGPU, real_c(0.0));
+
+   WALBERLA_LOG_INFO_ON_ROOT(++step << ". Initialising data")
+   for (auto& block : *blocks){
+      sweepCollection.initialise(&block);
+
+      auto velocityInflow = block.getData< VelocityField_T >(ids.velocityFieldInflowSlice);
+      WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(velocityInflow,
+         velocityInflow->get(x, y, z, 0) = setup.latticeVelocity;
+      )
+   }
+   sweepCollection.initialiseBlockPointer();
+   gpu::fieldCpy< gpu::GPUField< VelocityField_T::value_type >, VelocityField_T >(blocks, ids.velocityFieldInflowSliceGPU, ids.velocityFieldInflowSlice);
+
+   auto inflow = std::make_shared<Inflow>(blocks, setup, ids);
+   auto probe = std::make_shared<Probe>(blocks, setup, ids, config->getOneBlock("Probes"));
+
+   const real_t lenght = 8.0;
+   const real_t res = 3.0;
+   const real_t start = 0.0;
+
+/*
+   const uint_t nx = uint_c(std::floor(lenght / (setup.dxC * res)));
+   for(uint_t i = 0; i < nx; ++i){
+      probe->addPoint(Vector3<real_t>(start + real_c(i) * (setup.dxC * res), 0.5, 0.5 ));
+   }
+*/ 
+   probe->initialise();
+
+   //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+   ///                                      LB SWEEPS AND BOUNDARY HANDLING                                       ///
+   //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+   // Boundaries
+   WALBERLA_LOG_INFO_ON_ROOT(++step << ". Setting boundary conditions")
+   for (auto& block : *blocks){
+      auto flagField = block.getData< FlagField_T >(ids.flagField);
+      setup.inflowFlag          = flagField->registerFlag(setup.inflowUID);
+      setup.turbulentInflowFlag = flagField->registerFlag(setup.turbulentInflowUID);
+      setup.outflowFlag         = flagField->registerFlag(setup.outflowUID);
+      setup.obstacleFlag        = flagField->registerFlag(setup.obstacleUID);
+      setup.noSlipFlag          = flagField->registerFlag(setup.noSlipUID);
+      setup.fluidFlag           = flagField->registerFlag(setup.fluidUID);
+   }
+
+
+   geometry::initBoundaryHandling< FlagField_T >(*blocks, ids.flagField, boundariesConfig);
+   std::function< VelocityField_T::value_type (const Cell&, const Cell&, const shared_ptr< StructuredBlockForest >&, IBlock&) > wallDistanceFunctor = defaultWallDistance;
+   if(setup.useGrid){
+      WALBERLA_LOG_INFO_ON_ROOT(++step << ". Setting Grid after inflow to produce turbulence structure")
+      initGrid(blocks, ids.flagField, setup);
+   }
+
+/*
+   if(config->getNumBlocks("RoughWall") > 0){
+      WALBERLA_LOG_INFO_ON_ROOT(++step << ". Using RoughWall setup")
+      auto wallParameters = config->getOneBlock("RoughWall");
+      const real_t radius   = wallParameters.getParameter< real_t >("hemispheresRadius");
+      const real_t hemispheresDistance = wallParameters.getParameter< real_t >("hemispheresDistance");
+      initHemisphers(blocks, ids.flagField, setup, config->getOneBlock("RoughWall"));
+      HemisphereWallDistance wallDist(radius, hemispheresDistance);
+      wallDistanceFunctor = wallDist;
+   }
+*/
+   if(config->getNumBlocks("TPMS") > 0){
+      WALBERLA_LOG_INFO_ON_ROOT(++step << ". Placing TPMS material")
+      auto tpms = std::make_shared<TPMS>(config->getOneBlock("TPMS"), setup);
+      tpms->setupBoundary(blocks, ids.flagField);
+   }
+
+
+   geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, ids.flagField, setup.fluidUID);
+   // BoundaryCollection_T boundaryCollection(blocks, ids.flagField, ids.pdfFieldGPU, setup.fluidUID, ids.velocityFieldInflowSliceGPU, setup.drivingForce);
+   BoundaryCollection_T boundaryCollection(blocks, ids.flagField, ids.pdfFieldGPU, setup.fluidUID, ids.velocityFieldInflowSliceGPU, setup.latticeVelocity, wallDistanceFunctor, ids.pdfField);
+   // BoundaryCollection_T boundaryCollection(blocks, ids.flagField, ids.pdfFieldGPU, setup.fluidUID, ids.pdfField); 
+
+   if(config->getNumBlocks("TPMS") > 0 && loggingParameters.getParameter< bool >("writeSurfaceMeshOfTPMS", false) ){
+      WALBERLA_LOG_INFO_ON_ROOT(++step << ". Writing TPMS as surface file")
+      auto mesh = postprocessing::flagFieldToSurfaceMesh< FlagField_T >(blocks, ids.flagField, setup.obstacleFlag, true);
+      WALBERLA_EXCLUSIVE_WORLD_SECTION(0){
+         geometry::writeMesh("TPMS.obj", *mesh);
+      }
+   }
+
+   //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+   ///                                           COMMUNICATION SCHEME                                             ///
+   //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+   const bool gpuEnabledMPI = parameters.getParameter< bool >("gpuEnabledMPI", false);
+   auto nonUniformCommunication = std::make_shared< NonUniformGPUScheme< CommunicationStencil_T > >(blocks, gpuEnabledMPI);
+   auto nonUniformPackInfo      = lbm_generated::setupNonuniformGPUPdfCommunication< GPUPdfField_T >(blocks, ids.pdfFieldGPU);
+   nonUniformCommunication->addPackInfo(nonUniformPackInfo);
+
+   //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+   ///                                          TIME STEP DEFINITIONS                                             ///
+   //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+   const int streamLowPriority = 0;
+   auto defaultStream = gpu::StreamRAII::newPriorityStream(streamLowPriority);
+   SweepTimeloop timeloop(blocks->getBlockStorage(), setup.timesteps);
+
+   timeloop.addFuncBeforeTimeStep(inflow->updateGPUFunctor(), "Update inflow velocity");
+   timeloop.addFuncBeforeTimeStep(probe->updateFunctor(), "Update Probe Data");
+
+   lbm_generated::BasicRecursiveTimeStepGPU< GPUPdfField_T, SweepCollection_T, BoundaryCollection_T >
+      LBMMeshRefinement(blocks, ids.pdfFieldGPU, sweepCollection, boundaryCollection, nonUniformCommunication, nonUniformPackInfo);
+
+   // timeloop.addFuncBeforeTimeStep(LBMMeshRefinement, "Refinement timestep");
+   LBMMeshRefinement.addRefinementToTimeLoop(timeloop, uint_c(0));
+
+   shared_ptr< DragEvaluation > evaluation;
+   if(config->getNumBlocks("RoughWall") > 0){
+      WALBERLA_LOG_INFO_ON_ROOT(++step << ". Setup Drag Evaluation")
+      evaluation = make_shared<DragEvaluation>(blocks, boundaryCollection, ids, setup, config);
+      LBMMeshRefinement.addPostBoundaryHandlingBlockFunction(evaluation->forceCalculationFunctor());
+
+   }
+
+
+   const AABB sliceXY(finalDomain.xMin(), finalDomain.yMin(), finalDomain.center()[2] - setup.dxF,
+                      finalDomain.xMax(), finalDomain.yMax(), finalDomain.center()[2] + setup.dxF);
+
+/*                      
+   CellInterval globalCellInterval;
+   blocks->getCellBBFromAABB( globalCellInterval, sliceXY, uint_c(0) );
+
+   std::map<IBlock*, CellInterval> sliceCellIntervals;
+
+   for (auto& block : *blocks){
+      CellInterval cellBB;
+      blocks->getCellBBFromAABB( cellBB, sliceXY, blocks->getLevel(block) );
+      blocks->transformGlobalToBlockLocalCellInterval( cellBB, block );
+
+      Cell maxCell(cell_idx_c(0), cell_idx_c(0), cell_idx_c(0));
+      Cell minCell(cell_idx_c( blocks->getNumberOfXCells( block ) ), cell_idx_c( blocks->getNumberOfYCells( block ) ), cell_idx_c( blocks->getNumberOfZCells( block ) ));
+
+      for( cell_idx_t z = cell_idx_c(0); z != cell_idx_c( blocks->getNumberOfZCells( block ) ); ++z ){
+         for( cell_idx_t y = cell_idx_c(0); y != cell_idx_c( blocks->getNumberOfYCells( block ) ); ++y ){
+            for( cell_idx_t x = cell_idx_c(0); x != cell_idx_c( blocks->getNumberOfXCells( block ) ); ++x ){
+               if( cellBB.contains(x,y,z) ){
+                  if(Cell(x, y, z) < minCell){
+                     minCell = Cell(x, y, z);
+                  }
+                  if(!(Cell(x, y, z) < maxCell)){
+                     maxCell = Cell(x, y, z);
+                  }
+               }
+            }
+         }
+      }
+      CellInterval localCellInterval(minCell, maxCell);
+      if(!localCellInterval.empty()){
+         sliceCellIntervals[&block] = localCellInterval;
+      }
+   }
+
+   timeloop.add() << BeforeFunction([&]()
+   {
+      for (auto const& sliceCellInterval : sliceCellIntervals){
+         sweepCollection.calculateMacroscopicParametersCellInterval(sliceCellInterval.first, sliceCellInterval.second);
+      }
+
+      const uint_t velCtr = uint_c(welfordSweep.getCounter());
+      welfordSweep.setCounter(welfordSweep.getCounter() + real_c(1.0));
+   }) << Sweep(welfordSweep.getSweepOnCellInterval(blocks, globalCellInterval), "welford sweep");
+
+*/
+
+   /*SweepTimeloop timeloop(blocks->getBlockStorage(), setup.timesteps);
+   timeloop.add() << BeforeFunction(communication.getCommunicateFunctor(), "communication")
+                  << Sweep(boundaryCollection.getSweep(BoundaryCollection_T::ALL, defaultStream), "Boundary Conditions");
+   timeloop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL, defaultStream), "LBM StreamCollide");*/
+
+   //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+   ///                                             VTK Output                                                     ///
+   //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+   // VTK
+   auto VTKWriter = config->getOneBlock("VTKWriter");
+   const uint_t vtkWriteFrequency       = VTKWriter.getParameter< uint_t >("vtkWriteFrequency", 0);
+   const uint_t vtkGhostLayers          = VTKWriter.getParameter< uint_t >("vtkGhostLayers", 0);
+   const bool amrFileFormat             = VTKWriter.getParameter< bool >("amrFileFormat", false);
+   const bool oneFilePerProcess         = VTKWriter.getParameter< bool >("oneFilePerProcess", false);
+   const real_t samplingResolution      = VTKWriter.getParameter< real_t >("samplingResolution", real_c(-1.0));
+   const uint_t initialWriteCallsToSkip = VTKWriter.getParameter< uint_t >("initialWriteCallsToSkip", uint_c(0.0));
+   const real_t sliceThickness          = VTKWriter.getParameter< real_t >("sliceThickness", real_c(1.0));
+
+   if (vtkWriteFrequency > 0){
+      const std::string vtkFolder = "VTKPorousMediaRE_" + std::to_string(uint64_c(setup.reynoldsNumber));
+      auto vtkOutput =
+         vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, vtkGhostLayers, false, vtkFolder,
+                                        "simulation_step", false, true, true, false, 0, amrFileFormat, oneFilePerProcess);
+
+      vtkOutput->setInitialWriteCallsToSkip(initialWriteCallsToSkip);
+
+      vtkOutput->addBeforeFunction([&]() {
+         for (auto& block : *blocks){
+            sweepCollection.calculateMacroscopicParameters(&block);
+         }
+
+
+         // for (auto const& sliceCellInterval : sliceCellIntervals){
+         //    sweepCollection.calculateMacroscopicParametersCellInterval(sliceCellInterval.first, sliceCellInterval.second);
+         // }
+
+#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+         gpu::fieldCpy< VelocityField_T, gpu::GPUField< VelocityField_T::value_type > >(blocks, ids.velocityField, ids.velocityFieldGPU);
+         // gpu::fieldCpy< VelocityField_T, gpu::GPUField< VelocityField_T::value_type > >(blocks, ids.meanVelocityField, ids.meanVelocityFieldGPU);
+         // gpu::fieldCpy< ScalarField_T, gpu::GPUField< ScalarField_T::value_type > >(blocks, ids.densityField, ids.densityFieldGPU);
+#endif
+      });
+
+      vtkOutput->setSamplingResolution(samplingResolution);
+
+      field::FlagFieldCellFilter<FlagField_T> fluidFilter( ids.flagField );
+      fluidFilter.addFlag( setup.obstacleUID );
+      vtkOutput->addCellExclusionFilter(fluidFilter);
+
+
+      if (VTKWriter.getParameter< bool >("writeOnlySlice", true)){
+         // const AABB sliceXY(finalDomain.xMin(), finalDomain.yMin(), finalDomain.center()[2] - sliceThickness * setup.dxF,
+         //                    finalDomain.xMax(), finalDomain.yMax(), finalDomain.center()[2] + sliceThickness * setup.dxF);
+         vtkOutput->addCellInclusionFilter(vtk::AABBCellFilter(sliceXY));
+
+         if (VTKWriter.getParameter< bool >("writeXZSlice", false)){
+            const AABB sliceXZ(finalDomain.xMin(), finalDomain.center()[1] - setup.dxF, finalDomain.zMin(),
+                               finalDomain.xMax(), finalDomain.center()[1] + setup.dxF, finalDomain.zMax());
+            vtkOutput->addCellInclusionFilter(vtk::AABBCellFilter(sliceXZ));
+         }
+      }
+
+      if (VTKWriter.getParameter< bool >("velocity")){
+         if (VTKWriter.getParameter< bool >("writeVelocityAsMagnitude")){
+            auto velWriter = make_shared< field::VTKMagnitudeWriter< VelocityField_T, float32 > >(ids.velocityField, "velocityMagnitude");
+            vtkOutput->addCellDataWriter(velWriter);
+         }
+         else{
+            auto velWriter = make_shared< field::VTKWriter< VelocityField_T, float32 > >(ids.velocityField, "velocity");
+            vtkOutput->addCellDataWriter(velWriter);
+         }
+      }
+      if (VTKWriter.getParameter< bool >("meanVelocity", false)){
+         auto meanVelWriter = make_shared< field::VTKWriter< VelocityField_T, float32 > >(ids.meanVelocityField, "meanVelocity");
+         vtkOutput->addCellDataWriter(meanVelWriter);
+
+      }
+      if (VTKWriter.getParameter< bool >("density")){
+         auto densityWriter = make_shared< field::VTKWriter< ScalarField_T, float32 > >(ids.densityField, "density");
+         vtkOutput->addCellDataWriter(densityWriter);
+      }
+      if (VTKWriter.getParameter< bool >("flag")){
+         auto flagWriter = make_shared< field::VTKWriter< FlagField_T > >(ids.flagField, "flag");
+         vtkOutput->addCellDataWriter(flagWriter);
+      }
+      timeloop.addFuncAfterTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
+   }
+
+   // log remaining time
+   if (uint_c(setup.remainingTimeLoggerFrequency) > 0){
+      timeloop.addFuncAfterTimeStep(
+         timing::RemainingTimeLogger(timeloop.getNrOfTimeSteps(), setup.remainingTimeLoggerFrequency),
+         "remaining time logger");
+   }
+
+   auto CheckerParameters      = config->getOneBlock("StabilityChecker");
+   const uint_t checkFrequency = CheckerParameters.getParameter< uint_t >("checkFrequency", uint_t(0));
+   if (checkFrequency > 0){
+      auto checkFunction = [](VelocityField_T::value_type value) {  return value < math::abs(VelocityField_T::value_type(10)); };
+      timeloop.addFuncAfterTimeStep(
+         makeSharedFunctor(field::makeStabilityChecker< VelocityField_T, FlagField_T >(
+            config, blocks, ids.velocityField, ids.flagField, setup.fluidUID, checkFunction)),
+         "Stability check");
+   }
+
+   WALBERLA_MPI_BARRIER()
+#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+   WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+   WALBERLA_GPU_CHECK(gpuPeekAtLastError())
+#endif
+
+   const lbm_generated::PerformanceEvaluation< FlagField_T > performance(blocks, ids.flagField, setup.fluidUID);
+   field::CellCounter< FlagField_T > fluidCells(blocks, ids.flagField, setup.fluidUID);
+   fluidCells();
+
+   WALBERLA_LOG_INFO_ON_ROOT(++step << ". Starting Simulation with " << setup.timesteps << " timesteps")
+   WcTimingPool timeloopTiming;
+   WcTimer simTimer;
+
+   WALBERLA_MPI_BARRIER()
+#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+   WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+   WALBERLA_GPU_CHECK(gpuPeekAtLastError())
+#endif
+
+   simTimer.start();
+   timeloop.run(timeloopTiming);
+#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+   WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+   WALBERLA_GPU_CHECK(gpuPeekAtLastError())
+#endif
+   WALBERLA_MPI_BARRIER()
+   simTimer.end();
+   WALBERLA_LOG_INFO_ON_ROOT("Simulation finished")
+   double time = double_c(simTimer.max());
+   WALBERLA_MPI_SECTION() { walberla::mpi::reduceInplace(time, walberla::mpi::MAX); }
+   performance.logResultOnRoot(setup.timesteps, time);
+
+   const auto reducedTimeloopTiming = timeloopTiming.getReduced();
+   WALBERLA_LOG_RESULT_ON_ROOT("Time loop timing:\n" << *reducedTimeloopTiming)
+
+   printResidentMemoryStatistics();
+   return EXIT_SUCCESS;
+}
\ No newline at end of file
diff --git a/apps/showcases/Debug/Debug.py b/apps/showcases/Debug/Debug.py
new file mode 100644
index 000000000..dfb101c55
--- /dev/null
+++ b/apps/showcases/Debug/Debug.py
@@ -0,0 +1,126 @@
+import sympy as sp
+from pystencils import Target
+
+from pystencils.field import fields
+from pystencils.simp import insert_aliases, insert_constants
+
+from lbmpy import LBStencil, LBMConfig, LBMOptimisation
+from lbmpy.boundaries.boundaryconditions import NoSlip, NoSlipLinearBouzidi, ExtrapolationOutflow, UBB, FixedDensity
+from lbmpy.creationfunctions import create_lb_collision_rule
+from lbmpy.enums import Method, Stencil, ForceModel
+from lbmpy.flow_statistics import welford_assignments
+
+from pystencils_walberla import CodeGeneration, generate_info_header, generate_sweep
+from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator
+
+info_header = """
+#pragma once
+#include <map>
+std::map<std::string, std::string> infoMap{{{{"stencil", "{stencil}"}},
+                                           {{"streamingPattern", "{streaming_pattern}"}}, 
+                                           {{"collisionOperator", "{collision_operator}"}}}};
+"""
+
+omega = sp.symbols("omega")
+inlet_velocity = sp.symbols("u_x")
+force_vector = tuple([sp.Symbol("Fx"), 0.0, 0.0])
+max_threads = 256
+ 
+with CodeGeneration() as ctx:
+    target = Target.CPU # Target.GPU if ctx.gpu else Target.CPU
+    name_prefix = "Debug" # "PorousMediaGPU"
+    sweep_params = {'block_size': (128, 1, 1)} if ctx.gpu else {}
+
+    # The application is meant to be compiled with double precision. For single precision, the pdf_dtype can be switched
+    # to float32. In this way the calculations are still performed in double precision while the application can profit
+    # from enhanced performance due to the lower precision of the PDF field
+    dtype = 'float64' if ctx.double_accuracy else 'float32'
+    pdf_dtype = 'float64'
+
+    stencil = LBStencil(Stencil.D3Q27)
+    q = stencil.Q
+    dim = stencil.D
+
+    streaming_pattern = 'esotwist'
+
+    pdfs = fields(f"pdfs({stencil.Q}): {pdf_dtype}[3D]", layout='fzyx')
+    velocity_field, density_field = fields(f"velocity({dim}), density(1) : {dtype}[{dim}D]", layout='fzyx')
+    mean_velocity = fields(f"mean_velocity({dim}): {dtype}[{stencil.D}D]", layout='fzyx')
+
+    macroscopic_fields = {'density': density_field, 'velocity': velocity_field}
+
+    # method_enum = Method.CENTRAL_MOMENT
+    method_enum = Method.SRT
+    lbm_config = LBMConfig(
+        method=method_enum,
+        stencil=stencil,
+        relaxation_rate=omega,
+        compressible=True,
+        force_model=ForceModel.GUO,
+        force=force_vector,
+        # subgrid_scale_model=SubgridScaleModel.QR,
+        fourth_order_correction=0.01 if method_enum == Method.CUMULANT and stencil.Q == 27 else False,
+        field_name='pdfs',
+        streaming_pattern=streaming_pattern
+    )
+
+    lbm_opt = LBMOptimisation(cse_global=False, cse_pdfs=False, field_layout="fzyx",
+                              symbolic_field=pdfs)
+
+    collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt)
+    if lbm_config.method == Method.CUMULANT:
+        collision_rule = insert_constants(collision_rule)
+        collision_rule = insert_aliases(collision_rule)
+    lb_method = collision_rule.method
+
+    no_slip = lbm_boundary_generator(class_name='NoSlip', flag_uid='NoSlip',
+                                     boundary_object=NoSlip(), field_data_type=pdf_dtype)
+ 
+    quadratic_bounce_back = NoSlipLinearBouzidi(calculate_force_on_boundary=True, data_type=dtype)
+    no_slip_interpolated = lbm_boundary_generator(class_name='Obstacle', flag_uid='Obstacle',
+                                                  boundary_object=quadratic_bounce_back,
+                                                  field_data_type=pdf_dtype)
+
+
+    turbulentubb = lbm_boundary_generator(class_name='TurbulentInflow', flag_uid='TurbulentInflow',
+                                          boundary_object=UBB(velocity=velocity_field.center_vector, density=1.0, data_type=dtype, dim=dim),
+                                          field_data_type=pdf_dtype)
+
+    ubb = lbm_boundary_generator(class_name='UBB', flag_uid='UBB',
+                                 boundary_object=UBB(velocity=(inlet_velocity, 0.0, 0.0), density=1.0, data_type=dtype, dim=dim),
+                                 field_data_type=pdf_dtype)
+
+    # outflow_boundary = ExtrapolationOutflow(stencil[4], lb_method, data_type=pdf_dtype, density=1.0)
+    outflow_boundary = ExtrapolationOutflow(stencil[4], lb_method, data_type=pdf_dtype)
+    outflow = lbm_boundary_generator(class_name='Outflow', flag_uid='Outflow',
+                                     boundary_object=outflow_boundary,
+                                     field_data_type=pdf_dtype)
+
+    # outflow = lbm_boundary_generator(class_name='Outflow', flag_uid='Outflow',
+    #                                 boundary_object=FixedDensity(1.0),
+    #                                  field_data_type=pdf_dtype)
+
+    generate_lbm_package(ctx, name=name_prefix, collision_rule=collision_rule,
+                         lbm_config=lbm_config, lbm_optimisation=lbm_opt,
+                         nonuniform=True, boundaries=[no_slip, no_slip_interpolated, ubb, turbulentubb, outflow],
+                         macroscopic_fields=macroscopic_fields, gpu_indexing_params=sweep_params,
+                         target=target, data_type=dtype, pdfs_data_type=pdf_dtype,
+                         max_threads=max_threads)
+    
+    welford_update = welford_assignments(field=velocity_field, mean_field=mean_velocity)
+    generate_sweep(ctx, "Welford", welford_update, target=target)
+
+    field_typedefs = {'VelocityField_T': velocity_field,
+                      'ScalarField_T': density_field}
+
+    # Info header containing correct template definitions for stencil and field
+    generate_info_header(ctx, f'{name_prefix}InfoHeader',
+                         field_typedefs=field_typedefs)
+
+    infoHeaderParams = {
+        'stencil': stencil.name.lower(),
+        'streaming_pattern': streaming_pattern,
+        'collision_operator': lbm_config.method.name.lower(),
+    }
+
+    ctx.write_file(f"{name_prefix}StaticDefines.h", info_header.format(**infoHeaderParams))
\ No newline at end of file
diff --git a/apps/showcases/PorousMediaGPU/CMakeLists.txt b/apps/showcases/PorousMediaGPU/CMakeLists.txt
index 087cab7fb..9b7a4e28a 100644
--- a/apps/showcases/PorousMediaGPU/CMakeLists.txt
+++ b/apps/showcases/PorousMediaGPU/CMakeLists.txt
@@ -12,6 +12,7 @@ waLBerla_generate_target_from_python(NAME PorousMediaGPUCodeGen
         NoSlip.h NoSlip.${CODEGEN_FILE_SUFFIX}
         Obstacle.h Obstacle.${CODEGEN_FILE_SUFFIX}
         UBB.h UBB.${CODEGEN_FILE_SUFFIX}
+        TurbulentInflow.h TurbulentInflow.${CODEGEN_FILE_SUFFIX}
         Outflow.h Outflow.${CODEGEN_FILE_SUFFIX}
         Welford.h Welford.${CODEGEN_FILE_SUFFIX}
         PorousMediaGPUBoundaryCollection.h
@@ -27,7 +28,19 @@ else()
             FILES PorousMediaGPU.cpp GridGeneration.h Setup.h Types.h DragEvaluation.cpp InitializerFunctions.cpp TPMS.cpp Inflow.cpp Probe.cpp util.cpp
             DEPENDS blockforest boundary core field io lbm_generated postprocessing stencil timeloop vtk PorousMediaGPUCodeGen )
 
-# target_include_directories(PorousMediaGPU PUBLIC ${HDF5_CXX_INCLUDE_DIRS})
-# target_link_libraries(PorousMediaGPU PUBLIC ${HDF5_LIBRARIES} )
-
 endif(WALBERLA_BUILD_WITH_CUDA OR WALBERLA_BUILD_WITH_HIP)
+
+waLBerla_generate_target_from_python(NAME DebugCodeGen
+        FILE Debug.py
+        CODEGEN_CFG debug_codegen
+        OUT_FILES DebugSweepCollection.h DebugSweepCollection.cpp
+        DebugStorageSpecification.h DebugStorageSpecification.cpp
+        NoSlip.h NoSlip.cpp
+        UBB.h UBB.cpp
+        DebugBoundaryCollection.h
+        DebugInfoHeader.h
+        DebugStaticDefines.h)
+
+waLBerla_add_executable ( NAME Debug
+        FILES Debug.cpp
+        DEPENDS blockforest boundary core field io lbm_generated postprocessing stencil timeloop vtk DebugCodeGen )
diff --git a/apps/showcases/PorousMediaGPU/Debug.cpp b/apps/showcases/PorousMediaGPU/Debug.cpp
new file mode 100644
index 000000000..130cb9251
--- /dev/null
+++ b/apps/showcases/PorousMediaGPU/Debug.cpp
@@ -0,0 +1,468 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file PorousMediaGPU.cpp
+//! \author Markus Holzer <markus.holzer@fau.de>
+//
+//======================================================================================================================
+
+#include "blockforest/Initialization.h"
+#include "blockforest/SetupBlock.h"
+#include "blockforest/SetupBlockForest.h"
+#include "blockforest/loadbalancing/StaticCurve.h"
+#include "blockforest/StructuredBlockForest.h"
+#include "blockforest/communication/NonUniformBufferedScheme.h"
+
+#include "core/Abort.h"
+#include "core/DataTypes.h"
+#include "core/SharedFunctor.h"
+#include "core/debug/CheckFunctions.h"
+#include "core/logging/Logging.h"
+#include "core/math/Vector3.h"
+#include "core/math/Constants.h"
+#include "core/Environment.h"
+#include "core/mpi/MPIManager.h"
+#include "core/MemoryUsage.h"
+#include "core/mpi/Reduce.h"
+#include "core/timing/RemainingTimeLogger.h"
+#include "core/timing/TimingPool.h"
+
+#include "field/AddToStorage.h"
+#include "field/CellCounter.h"
+#include "field/FlagField.h"
+#include "field/StabilityChecker.h"
+#include "field/adaptors/AdaptorCreators.h"
+#include "field/iterators/FieldIterator.h"
+#include "field/vtk/VTKWriter.h"
+#include "field/vtk/FlagFieldCellFilter.h"
+
+#include "geometry/InitBoundaryHandling.h"
+#include "geometry/mesh/TriangleMeshIO.h"
+
+#include "lbm_generated/communication/NonuniformGeneratedPdfPackInfo.h"
+#include "lbm_generated/evaluation/PerformanceEvaluation.h"
+#include "lbm_generated/field/AddToStorage.h"
+#include "lbm_generated/field/PdfField.h"
+#include "lbm_generated/refinement/BasicRecursiveTimeStep.h"
+
+#include "timeloop/SweepTimeloop.h"
+#include "vtk/VTKOutput.h"
+
+#include <cstdlib>
+#include <iostream>
+#include <memory>
+
+#include "DebugStaticDefines.h"
+#include "DebugInfoHeader.h"
+
+using namespace walberla;
+
+using StorageSpecification_T = lbm::DebugStorageSpecification;
+using Stencil_T              = StorageSpecification_T::Stencil;
+using CommunicationStencil_T = StorageSpecification_T::CommunicationStencil;
+
+using PdfField_T           = lbm_generated::PdfField< StorageSpecification_T >;
+using FlagField_T          = FlagField< uint8_t >;
+
+using BoundaryCollection_T = lbm::DebugBoundaryCollection< FlagField_T >;
+using SweepCollection_T = lbm::DebugSweepCollection;
+
+
+using macroFieldType = VelocityField_T::value_type;
+using RefinementSelectionFunction = std::function<void (SetupBlockForest &)>;
+
+struct IDs
+{
+   BlockDataID pdfField;
+   BlockDataID velocityField;
+   BlockDataID meanVelocityField;
+   BlockDataID velocityFieldInflowSlice;
+   BlockDataID densityField;
+   BlockDataID flagField;
+};
+
+void refineWalls(SetupBlockForest& forest, const real_t hemispheresRadius, const real_t dx, const uint_t level)
+{
+   auto domain = forest.getDomain();
+   const real_t yMin = domain.yMin();
+   const real_t yMax = domain.yMax();
+
+   const real_t t = hemispheresRadius + dx;
+
+   const AABB minAABB(domain.xMin(), domain.yMin(), domain.zMin(), domain.xMax(), domain.yMin() + t, domain.zMax());
+   // const AABB maxAABB(domain.xMin(), domain.yMax() - t, domain.zMin(), domain.xMax(), domain.yMax(), domain.zMax());
+
+   for (auto block = forest.begin(); block != forest.end(); ++block){
+      const AABB& aabb = block->getAABB();
+
+      // if (block->getLevel() < level && (minAABB.intersects(aabb) || maxAABB.intersects(aabb)) ){
+      if (block->getLevel() < level && minAABB.intersects(aabb) ){
+         block->setMarker(true);
+      }
+   }
+}
+
+void createSetupBlockForest(SetupBlockForest& setupBfs, std::shared_ptr<Config>& config, const RefinementSelectionFunction& refinementSelection, const bool useMPIManager=false)
+{
+   auto domainParameters  = config->getOneBlock("DomainSetup");
+
+   WALBERLA_LOG_INFO_ON_ROOT("Generating SetupBlockForest...")
+
+   uint_t numProcesses        = domainParameters.getParameter< uint_t >("numberProcesses");
+
+   const std::string blockForestFilestem = "Debug";
+
+   if(useMPIManager) {numProcesses = uint_c(mpi::MPIManager::instance()->numProcesses());}
+
+   // TPMSRefinementSelection tpmsRefinementSelection( setup );
+   setupBfs.addRefinementSelectionFunction(refinementSelection);
+
+   // SphereBlockExclusion SphereBlockExclusion( Sphere );
+   // setupBfs.addBlockExclusionFunction(SphereBlockExclusion);
+
+   Vector3< real_t > domainSize = domainParameters.getParameter< Vector3< real_t > >("domainSize");
+   Vector3< uint_t > rootBlocks          = domainParameters.getParameter< Vector3< uint_t > >("blocks");
+   Vector3< bool >  periodic            = domainParameters.getParameter< Vector3< bool > >("periodic");
+   uint_t refinementLevels    = domainParameters.getParameter< uint_t >("refinementLevels");
+
+   const AABB domain(real_c(0.0), real_c(0.0), real_c(0.0), domainSize[0], domainSize[1], domainSize[2]);
+   setupBfs.addWorkloadMemorySUIDAssignmentFunction(blockforest::uniformWorkloadAndMemoryAssignment);
+   setupBfs.init(domain, rootBlocks[0], rootBlocks[1], rootBlocks[2],
+                         periodic[0], periodic[1], periodic[2]);
+   setupBfs.balanceLoad(blockforest::StaticLevelwiseCurveBalanceWeighted(), numProcesses);
+
+   WALBERLA_LOG_INFO_ON_ROOT("===========================  BLOCK FOREST STATISTICS ============================");
+   WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << setupBfs.getNumberOfBlocks())
+   for (uint_t level = 0; level <= refinementLevels; level++){
+      const uint_t numberOfBlocks = setupBfs.getNumberOfBlocks(level);
+      WALBERLA_LOG_INFO_ON_ROOT("Level " << level << " Blocks: " << numberOfBlocks)
+   }
+
+   const real_t avgBlocksPerProc = real_c(setupBfs.getNumberOfBlocks()) / real_c(setupBfs.getNumberOfProcesses());
+   WALBERLA_LOG_INFO_ON_ROOT("Average blocks per process: " << avgBlocksPerProc);
+
+   Vector3< uint_t > c = domainParameters.getParameter< Vector3< uint_t > >("cellsPerBlock");
+
+   const uint_t cellsPerBlock        = c[0] * c[1] * c[2];;
+   const uint_t totalNumberCells     = setupBfs.getNumberOfBlocks() * cellsPerBlock;
+   const real_t averageCellsPerGPU   = avgBlocksPerProc * real_c(cellsPerBlock);
+
+   const uint_t PDFsPerCell            = StorageSpecification_T::inplace ? Stencil_T::Q : 2 * Stencil_T::Q;
+   const uint_t valuesPerCell          = (PDFsPerCell + VelocityField_T::F_SIZE + ScalarField_T::F_SIZE);
+   const uint_t sizePerValue           = sizeof(StorageSpecification_T::value_type);
+   const double expectedMemory         = double_c(totalNumberCells * valuesPerCell * sizePerValue) * 1e-9;
+
+   WALBERLA_LOG_INFO_ON_ROOT( "Total number of cells will be " << totalNumberCells << " fluid cells (in total on all levels)")
+   WALBERLA_LOG_INFO_ON_ROOT( "Expected total memory demand will be " << expectedMemory << " GB")
+
+   WALBERLA_LOG_INFO_ON_ROOT("=================================================================================")
+
+   if(mpi::MPIManager::instance()->numProcesses() > 1)
+      return;
+
+   auto logging           = config->getOneBlock("Logging");
+   bool writeSetupForestAndReturn    = logging.getParameter< bool >("writeSetupForestAndReturn", false);
+
+   if(writeSetupForestAndReturn){
+      std::ostringstream oss;
+      oss << blockForestFilestem << ".bfs";
+      setupBfs.saveToFile(oss.str().c_str());
+
+      setupBfs.writeVTKOutput(blockForestFilestem);
+   }
+}
+
+void createBlockForest(shared_ptr< BlockForest >& bfs, std::shared_ptr<Config>& config, const RefinementSelectionFunction& refinementSelection){
+   if (mpi::MPIManager::instance()->numProcesses() > 1){
+      // Load structured block forest from file
+      std::ostringstream oss;
+      oss << "debug" << ".bfs";
+      const std::string setupBlockForestFilepath = oss.str();
+      std::ifstream infile(setupBlockForestFilepath.c_str());
+      if(!infile.good()){
+         WALBERLA_LOG_WARNING_ON_ROOT("Blockforest was not created beforehand and thus needs to be created on the fly. For large simulation runs this can be a severe problem!")
+         SetupBlockForest setupBfs;
+         createSetupBlockForest(setupBfs, config, refinementSelection, true);
+         bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()), setupBfs);
+      }
+      else{
+         bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()),
+                                               setupBlockForestFilepath.c_str(), false);
+      }
+   }
+   else{
+      SetupBlockForest setupBfs;
+      createSetupBlockForest(setupBfs, config, refinementSelection);
+      bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()), setupBfs);
+   }
+}
+
+
+
+int main(int argc, char** argv){
+   Environment env( argc, argv );
+   mpi::MPIManager::instance()->useWorldComm();
+
+   auto config = env.config();
+
+   //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+   ///                                        SETUP AND CONFIGURATION                                             ///
+   //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+   auto parameters        = config->getOneBlock("Parameters");
+   auto loggingParameters = config->getOneBlock("Logging");
+   auto boundariesConfig  = config->getBlock("Boundaries");
+   auto domainParameters  = config->getOneBlock("DomainSetup");
+
+
+   bool writeSetupForestAndReturn = loggingParameters.getParameter< bool >("writeSetupForestAndReturn", false);
+   if (uint_c(MPIManager::instance()->numProcesses()) > 1) writeSetupForestAndReturn = false;
+   shared_ptr< BlockForest > bfs;
+
+   Vector3< real_t > domainSize          = domainParameters.getParameter< Vector3< real_t > >("domainSize");
+   Vector3< uint_t > cellsPerBlock       = domainParameters.getParameter< Vector3< uint_t > >("cellsPerBlock");
+   Vector3< uint_t > rootBlocks          = domainParameters.getParameter< Vector3< uint_t > >("blocks");
+   uint_t refinementLevels    = domainParameters.getParameter< uint_t >("refinementLevels");
+   uint_t timesteps = parameters.getParameter< uint_t >("timesteps");
+
+   const real_t dx = domainSize[0] / real_c(rootBlocks[0] * cellsPerBlock[0]);
+   WALBERLA_CHECK_FLOAT_EQUAL_EPSILON(dx, domainSize[1] / real_c(rootBlocks[1] * cellsPerBlock[1]), real_c(1e-12), "Resulting resolution is not equal in x and y direxction. This is not supported");
+   WALBERLA_CHECK_FLOAT_EQUAL_EPSILON(dx, domainSize[2] / real_c(rootBlocks[2] * cellsPerBlock[2]), real_c(1e-12), "Resulting resolution is not equal in x and z direxction. This is not supported");
+
+   auto dxF = dx / real_c( 1 << refinementLevels );
+   auto refinementSelection = std::bind(refineWalls, std::placeholders::_1, real_c(1.0), dxF, refinementLevels);
+
+   createBlockForest(bfs, config, refinementSelection);
+
+   if (writeSetupForestAndReturn && mpi::MPIManager::instance()->numProcesses() == 1){
+      WALBERLA_LOG_INFO_ON_ROOT("BlockForest has been created and writen to file. Returning program")
+      logging::Logging::printFooterOnStream();
+      return EXIT_SUCCESS;
+   }
+
+   auto blocks = std::make_shared< StructuredBlockForest >(bfs, cellsPerBlock[0], cellsPerBlock[1], cellsPerBlock[2]);
+   blocks->createCellBoundingBoxes();
+   auto finalDomain = blocks->getDomain();
+
+   const StorageSpecification_T StorageSpec = StorageSpecification_T();
+   // Creating fields
+   uint_t step = 0;
+   WALBERLA_LOG_INFO_ON_ROOT(++step << ". Allocating data")
+   IDs ids;
+
+   ids.pdfField  = lbm_generated::addPdfFieldToStorage(blocks, "pdfs", StorageSpec, uint_c(2), field::fzyx);
+   ids.velocityField = field::addToStorage< VelocityField_T >(blocks, "vel", macroFieldType(0.0), field::fzyx, uint_c(2));
+   ids.meanVelocityField = field::addToStorage< VelocityField_T >(blocks, "mean velocity", macroFieldType(0.0), field::fzyx, uint_c(2));
+   ids.densityField  = field::addToStorage< ScalarField_T >(blocks, "density", macroFieldType(1.0), field::fzyx, uint_c(2));
+   ids.flagField     = field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field", uint_c(2));
+ 
+   const real_t drivingForce = 1e-6;
+   const real_t omega = 1.970443349753695;
+   const real_t latticeVelocity = 0.05;
+
+   const Cell innerOuterSplit = Cell(parameters.getParameter< Vector3< cell_idx_t > >("innerOuterSplit", Vector3< cell_idx_t >(1, 1, 1)));
+   SweepCollection_T sweepCollection(blocks, ids.pdfField, ids.densityField, ids.velocityField, drivingForce, omega, innerOuterSplit);
+
+   WALBERLA_LOG_INFO_ON_ROOT(++step << ". Initialising data")
+   for (auto& block : *blocks){
+      sweepCollection.initialise(&block, cell_idx_c(1));
+   }
+
+   //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+   ///                                      LB SWEEPS AND BOUNDARY HANDLING                                       ///
+   //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+   auto fluidUID           = FlagUID("Fluid");
+   auto obstacleUID        = FlagUID("Obstacle");
+
+   // Boundaries
+   WALBERLA_LOG_INFO_ON_ROOT(++step << ". Setting boundary conditions")
+
+
+   geometry::initBoundaryHandling< FlagField_T >(*blocks, ids.flagField, boundariesConfig);
+
+   geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, ids.flagField, fluidUID, 2);
+   BoundaryCollection_T boundaryCollection(blocks, ids.flagField, ids.pdfField, fluidUID, latticeVelocity);
+
+
+   //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+   ///                                           COMMUNICATION SCHEME                                             ///
+   //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+   auto communication = std::make_shared< NonUniformBufferedScheme< CommunicationStencil_T > >(blocks);
+   auto packInfo      = lbm_generated::setupNonuniformPdfCommunication< PdfField_T >(blocks, ids.pdfField);
+   communication->addPackInfo(packInfo);
+
+   //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+   ///                                          TIME STEP DEFINITIONS                                             ///
+   //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+   SweepTimeloop timeloop(blocks->getBlockStorage(), timesteps);
+
+   lbm_generated::BasicRecursiveTimeStep< PdfField_T, SweepCollection_T, BoundaryCollection_T > LBMMeshRefinement(
+         blocks, ids.pdfField, sweepCollection, boundaryCollection, communication, packInfo);
+/*
+   for (auto& block : *blocks){
+      const auto pdfs = block.getData< PdfField_T >(ids.pdfField);
+      real_t level = real_c(blocks->getLevel(block));
+      for( auto it = pdfs->beginWithGhostLayer(2); it != pdfs->end(); ++it ){
+         for(uint_t i = 0; i < Stencil_T::Q; ++i){
+            pdfs->get(it.x(), it.y(), it.z(), i) = real_c(Stencil_T::Q)*level + real_c(i);
+         }
+      }
+   }
+
+   return 0;
+
+   for (auto& block : *blocks){
+      const auto pdfs = block.getData< PdfField_T >(ids.pdfField);
+      auto level = blocks->getLevel(block);
+
+      if(level == 0){
+         for( auto it = pdfs->beginGhostLayerOnly(stencil::Direction::S); it != pdfs->end(); ++it ){
+
+            WALBERLA_LOG_DEVEL_VAR(it)
+         }
+      }
+
+   }
+
+   return 0;
+
+// LBMMeshRefinement.addRefinementToTimeLoop(timeloop);
+// timeloop.addFuncAfterTimeStep(LBMMeshRefinement, "test");
+// timeloop.run();
+// WALBERLA_LOG_INFO_ON_ROOT("test")
+
+for(auto i = 0; i < 50; ++i){
+   for (auto& block : *blocks){
+      boundaryCollection(&block);
+   }
+}
+*/
+// for(auto i = 0; i < 50; ++i)
+//    LBMMeshRefinement();
+
+// return 0;
+
+   // timeloop.addFuncBeforeTimeStep(LBMMeshRefinement, "Refinement timestep");
+   LBMMeshRefinement.addRefinementToTimeLoop(timeloop);
+
+   const AABB sliceXY(finalDomain.xMin(), finalDomain.yMin(), finalDomain.center()[2] - dxF,
+                      finalDomain.xMax(), finalDomain.yMax(), finalDomain.center()[2] + dxF);
+
+
+   //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+   ///                                             VTK Output                                                     ///
+   //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+   // VTtimeloop.addFuncAfterTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");K
+   auto VTKWriter = config->getOneBlock("VTKWriter");
+   const uint_t vtkWriteFrequency       = VTKWriter.getParameter< uint_t >("vtkWriteFrequency", 0);
+   const real_t sliceThickness          = VTKWriter.getParameter< real_t >("sliceThickness", real_c(1.0));
+
+   if (vtkWriteFrequency > 0){
+      const std::string vtkFolder = "debug";
+      auto vtkOutput =
+         vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, vtkFolder,
+                                        "simulation_step", false, true, true, false, 0, false, false);
+
+
+      vtkOutput->addBeforeFunction([&]() {
+         for (auto& block : *blocks){
+            sweepCollection.calculateMacroscopicParameters(&block);
+         }
+      });
+
+
+      field::FlagFieldCellFilter<FlagField_T> fluidFilter( ids.flagField );
+      fluidFilter.addFlag(obstacleUID );
+      vtkOutput->addCellExclusionFilter(fluidFilter);
+
+
+      if (VTKWriter.getParameter< bool >("writeOnlySlice", true)){
+         // const AABB sliceXY(finalDomain.xMin(), finalDomain.yMin(), finalDomain.center()[2] - sliceThickness * setup.dxF,
+         //                    finalDomain.xMax(), finalDomain.yMax(), finalDomain.center()[2] + sliceThickness * setup.dxF);
+         vtkOutput->addCellInclusionFilter(vtk::AABBCellFilter(sliceXY));
+
+         if (VTKWriter.getParameter< bool >("writeXZSlice", false)){
+            const AABB sliceXZ(finalDomain.xMin(), finalDomain.center()[1] - dxF, finalDomain.zMin(),
+                               finalDomain.xMax(), finalDomain.center()[1] + dxF, finalDomain.zMax());
+            vtkOutput->addCellInclusionFilter(vtk::AABBCellFilter(sliceXZ));
+         }
+      }
+
+      if (VTKWriter.getParameter< bool >("velocity")){
+         if (VTKWriter.getParameter< bool >("writeVelocityAsMagnitude")){
+            auto velWriter = make_shared< field::VTKMagnitudeWriter< VelocityField_T, float32 > >(ids.velocityField, "velocityMagnitude");
+            vtkOutput->addCellDataWriter(velWriter);
+         }
+         else{
+            auto velWriter = make_shared< field::VTKWriter< VelocityField_T, float32 > >(ids.velocityField, "velocity");
+            vtkOutput->addCellDataWriter(velWriter);
+         }
+      }
+      if (VTKWriter.getParameter< bool >("density")){
+         auto densityWriter = make_shared< field::VTKWriter< ScalarField_T, float32 > >(ids.densityField, "density");
+         vtkOutput->addCellDataWriter(densityWriter);
+      }
+      if (VTKWriter.getParameter< bool >("flag")){
+         auto flagWriter = make_shared< field::VTKWriter< FlagField_T > >(ids.flagField, "flag");
+         vtkOutput->addCellDataWriter(flagWriter);
+      }
+      timeloop.addFuncAfterTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
+   }
+
+   // log remaining time
+   timeloop.addFuncAfterTimeStep(
+      timing::RemainingTimeLogger(timeloop.getNrOfTimeSteps(), 20),
+      "remaining time logger");
+
+   auto CheckerParameters      = config->getOneBlock("StabilityChecker");
+   const uint_t checkFrequency = CheckerParameters.getParameter< uint_t >("checkFrequency", uint_t(0));
+   if (checkFrequency > 0){
+      auto checkFunction = [](VelocityField_T::value_type value) {  return value < math::abs(VelocityField_T::value_type(10)); };
+      timeloop.addFuncAfterTimeStep(
+         makeSharedFunctor(field::makeStabilityChecker< VelocityField_T, FlagField_T >(
+            config, blocks, ids.velocityField, ids.flagField, fluidUID, checkFunction)),
+         "Stability check");
+   }
+
+   WALBERLA_MPI_BARRIER()
+
+   const lbm_generated::PerformanceEvaluation< FlagField_T > performance(blocks, ids.flagField, fluidUID);
+   field::CellCounter< FlagField_T > fluidCells(blocks, ids.flagField, fluidUID);
+   fluidCells();
+
+   WALBERLA_LOG_INFO_ON_ROOT(++step << ". Starting Simulation with " << timesteps << " timesteps")
+   WcTimingPool timeloopTiming;
+   WcTimer simTimer;
+
+   WALBERLA_MPI_BARRIER()
+
+   simTimer.start();
+   timeloop.run(timeloopTiming);
+   WALBERLA_MPI_BARRIER()
+   simTimer.end();
+   WALBERLA_LOG_INFO_ON_ROOT("Simulation finished")
+   double time = double_c(simTimer.max());
+   WALBERLA_MPI_SECTION() { walberla::mpi::reduceInplace(time, walberla::mpi::MAX); }
+   performance.logResultOnRoot(timesteps, time);
+
+   const auto reducedTimeloopTiming = timeloopTiming.getReduced();
+   WALBERLA_LOG_RESULT_ON_ROOT("Time loop timing:\n" << *reducedTimeloopTiming)
+
+   printResidentMemoryStatistics();
+   return EXIT_SUCCESS;
+}
\ No newline at end of file
diff --git a/apps/showcases/PorousMediaGPU/Debug.py b/apps/showcases/PorousMediaGPU/Debug.py
new file mode 100644
index 000000000..cb1bc609a
--- /dev/null
+++ b/apps/showcases/PorousMediaGPU/Debug.py
@@ -0,0 +1,127 @@
+import sympy as sp
+from pystencils import Target
+
+from pystencils.field import fields
+from pystencils.simp import insert_aliases, insert_constants
+
+from lbmpy import LBStencil, LBMConfig, LBMOptimisation
+from lbmpy.boundaries.boundaryconditions import NoSlip, NoSlipLinearBouzidi, ExtrapolationOutflow, UBB, FixedDensity
+from lbmpy.creationfunctions import create_lb_collision_rule
+from lbmpy.enums import Method, Stencil, ForceModel
+from lbmpy.flow_statistics import welford_assignments
+
+from pystencils_walberla import CodeGeneration, generate_info_header, generate_sweep
+from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator
+from lbmpy_walberla import RefinementScaling
+
+info_header = """
+#pragma once
+#include <map>
+std::map<std::string, std::string> infoMap{{{{"stencil", "{stencil}"}},
+                                           {{"streamingPattern", "{streaming_pattern}"}}, 
+                                           {{"collisionOperator", "{collision_operator}"}}}};
+"""
+
+omega = sp.symbols("omega")
+inlet_velocity = sp.symbols("u_x")
+force_vector = tuple([sp.Symbol("Fx"), 0.0, 0.0])
+max_threads = 256
+ 
+with CodeGeneration() as ctx:
+    target = Target.CPU # Target.GPU if ctx.gpu else Target.CPU
+    name_prefix = "Debug" # "PorousMediaGPU"
+    sweep_params = {'block_size': (128, 1, 1)} if ctx.gpu else {}
+
+    # The application is meant to be compiled with double precision. For single precision, the pdf_dtype can be switched
+    # to float32. In this way the calculations are still performed in double precision while the application can profit
+    # from enhanced performance due to the lower precision of the PDF field
+    dtype = 'float64' if ctx.double_accuracy else 'float32'
+    pdf_dtype = 'float64'
+
+    stencil = LBStencil(Stencil.D3Q19)
+    q = stencil.Q
+    dim = stencil.D
+
+    streaming_pattern = 'aa'
+
+    pdfs = fields(f"pdfs({stencil.Q}): {pdf_dtype}[3D]", layout='fzyx')
+    velocity_field, density_field = fields(f"velocity({dim}), density(1) : {dtype}[{dim}D]", layout='fzyx')
+    mean_velocity = fields(f"mean_velocity({dim}): {dtype}[{stencil.D}D]", layout='fzyx')
+
+    macroscopic_fields = {'density': density_field, 'velocity': velocity_field}
+
+    # method_enum = Method.CENTRAL_MOMENT
+    method_enum = Method.SRT
+    lbm_config = LBMConfig(
+        method=method_enum,
+        stencil=stencil,
+        relaxation_rate=omega,
+        compressible=True,
+        force=force_vector,
+        # subgrid_scale_model=SubgridScaleModel.QR,
+        fourth_order_correction=0.01 if method_enum == Method.CUMULANT and stencil.Q == 27 else False,
+        field_name='pdfs',
+        streaming_pattern=streaming_pattern
+    )
+
+    lbm_opt = LBMOptimisation(cse_global=False, cse_pdfs=False, field_layout="fzyx",
+                              symbolic_field=pdfs)
+
+    collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt)
+    if lbm_config.method == Method.CUMULANT:
+        collision_rule = insert_constants(collision_rule)
+        collision_rule = insert_aliases(collision_rule)
+    lb_method = collision_rule.method
+
+    no_slip = lbm_boundary_generator(class_name='NoSlip', flag_uid='NoSlip',
+                                     boundary_object=NoSlip(), field_data_type=pdf_dtype)
+ 
+    quadratic_bounce_back = NoSlipLinearBouzidi(calculate_force_on_boundary=True, data_type=dtype)
+    no_slip_interpolated = lbm_boundary_generator(class_name='Obstacle', flag_uid='Obstacle',
+                                                  boundary_object=quadratic_bounce_back,
+                                                  field_data_type=pdf_dtype)
+
+
+    turbulentubb = lbm_boundary_generator(class_name='TurbulentInflow', flag_uid='TurbulentInflow',
+                                          boundary_object=UBB(velocity=velocity_field.center_vector, density=1.0, data_type=dtype, dim=dim),
+                                          field_data_type=pdf_dtype)
+
+    ubb = lbm_boundary_generator(class_name='UBB', flag_uid='UBB',
+                                 boundary_object=UBB(velocity=(inlet_velocity, 0.0, 0.0), density=1.0, data_type=dtype, dim=dim),
+                                 field_data_type=pdf_dtype)
+
+    # outflow_boundary = ExtrapolationOutflow(stencil[4], lb_method, data_type=pdf_dtype, density=1.0)
+    outflow_boundary = ExtrapolationOutflow(stencil[4], lb_method, data_type=pdf_dtype)
+    outflow = lbm_boundary_generator(class_name='Outflow', flag_uid='Outflow',
+                                     boundary_object=outflow_boundary,
+                                     field_data_type=pdf_dtype)
+
+    # outflow = lbm_boundary_generator(class_name='Outflow', flag_uid='Outflow',
+    #                                 boundary_object=FixedDensity(1.0),
+    #                                  field_data_type=pdf_dtype)
+
+    refinement_scaling = RefinementScaling()
+    refinement_scaling.add_standard_relaxation_rate_scaling(omega)
+    refinement_scaling.add_force_scaling(sp.Symbol("Fx"))
+
+    generate_lbm_package(ctx, name=name_prefix, collision_rule=collision_rule,
+                         lbm_config=lbm_config, lbm_optimisation=lbm_opt,
+                         nonuniform=True, refinement_scaling=refinement_scaling, boundaries=[no_slip, ubb],
+                         macroscopic_fields=macroscopic_fields, gpu_indexing_params=sweep_params,
+                         target=target, data_type=dtype, pdfs_data_type=pdf_dtype,
+                         max_threads=max_threads)
+    
+    field_typedefs = {'VelocityField_T': velocity_field,
+                      'ScalarField_T': density_field}
+
+    # Info header containing correct template definitions for stencil and field
+    generate_info_header(ctx, f'{name_prefix}InfoHeader',
+                         field_typedefs=field_typedefs)
+
+    infoHeaderParams = {
+        'stencil': stencil.name.lower(),
+        'streaming_pattern': streaming_pattern,
+        'collision_operator': lbm_config.method.name.lower(),
+    }
+
+    ctx.write_file(f"{name_prefix}StaticDefines.h", info_header.format(**infoHeaderParams))
\ No newline at end of file
diff --git a/apps/showcases/PorousMediaGPU/DragEvaluation.cpp b/apps/showcases/PorousMediaGPU/DragEvaluation.cpp
index d8107e6df..624f466ea 100644
--- a/apps/showcases/PorousMediaGPU/DragEvaluation.cpp
+++ b/apps/showcases/PorousMediaGPU/DragEvaluation.cpp
@@ -27,20 +27,27 @@ DragEvaluation::DragEvaluation(std::shared_ptr< StructuredBlockForest >& blocks,
                                const IDs& ids, const Setup& setup, std::shared_ptr<Config>& config)
    : blocks_(blocks), boundaryCollection_(boundaryCollection), ids_(ids), setup_(setup)
 {
+   auto parameters     = config->getOneBlock("DragEvaluation");
+   auto wallParameters = config->getOneBlock("RoughWall");
+
+   // if(parameters.getParameter< uint_t >("evaluationFrequency") == uint_c(0))
+   //    return;
+   
    quantities_ = {{"timestep", 0}, {"Fx", 1}, {"Fy", 2}, {"Fz", 3}, {"CDReal", 4}, {"CLReal", 5}};
 
    WALBERLA_ASSERT_NOT_NULLPTR(blocks)
    maxLevel_ = blocks->getDepth();
    blocks->getBlocks(finestBlocks_, maxLevel_);
 
-   auto parameters     = config->getOneBlock("DragEvaluation");
-   auto wallParameters = config->getOneBlock("RoughWall");
-
    dragFilename_   = parameters.getParameter< std::string >("filename");
    checkFrequency_ = parameters.getParameter< uint_t >("evaluationFrequency");
    writeCounter_   = 1000 * checkFrequency_;
    dt_             = setup.dt / real_c( 1 << setup.refinementLevels );
    timesteps_      = setup.timesteps * ( 1 << setup.refinementLevels );
+   totalExecutionCounter_ = 0;
+
+   if(writeCounter_ == uint_c(0))
+      writeCounter_++;
 
    const real_t radius   = wallParameters.getParameter< real_t >("hemispheresRadius");
    const real_t distance = wallParameters.getParameter< real_t >("hemispheresDistance");
@@ -103,8 +110,7 @@ DragEvaluation::DragEvaluation(std::shared_ptr< StructuredBlockForest >& blocks,
    surfaceAreaWall_ -= totalCircleArea;
    WALBERLA_CHECK_GREATER_EQUAL(surfaceAreaWall_, real_c(0.0))
 
-   // TODO
-   meanVelocity_        = 1.0;
+   meanVelocity_ = setup.latticeVelocity;
 
    WALBERLA_LOG_INFO_ON_ROOT(
       "Evaluation initialised:"
@@ -116,11 +122,13 @@ DragEvaluation::DragEvaluation(std::shared_ptr< StructuredBlockForest >& blocks,
 
 void DragEvaluation::forceCalculation(const uint_t level)
 {
-   if (checkFrequency_ == uint_c(0))
+   if(level == maxLevel_){
+      ++totalExecutionCounter_;
+   }
+   if (checkFrequency_ == uint_c(0) || (totalExecutionCounter_ % checkFrequency_) != uint_c(0)){
       return;
-
-   ++totalExecutionCounter_;
-
+   }
+   
    if(level == maxLevel_){
       for (auto b : finestBlocks_){
          force_ += Vector3<double>(boundaryCollection_.ObstacleObject->getForce(b));
@@ -128,12 +136,11 @@ void DragEvaluation::forceCalculation(const uint_t level)
 
       mpi::reduceInplace(force_, mpi::SUM);
       WALBERLA_ROOT_SECTION(){
-         const double t = double_c(totalExecutionCounter_) * dt_;
-
+         const double t = double_c(totalExecutionCounter_ - 1) * dt_;
          const double meanU2 = double_c(meanVelocity_) * double_c(meanVelocity_);
 
-         const double cDRealArea = (double_c(8.0) * force_[0]) / (meanU2 * double_c(surfaceAreaHemiSpheres_));
-         const double cLRealArea = (double_c(8.0) * force_[1]) / (meanU2 * double_c(surfaceAreaHemiSpheres_));
+         const double cDRealArea = (double_c(2.0) * force_[0]) / (meanU2 * double_c(surfaceAreaHemiSpheres_));
+         const double cLRealArea = (double_c(2.0) * force_[1]) / (meanU2 * double_c(surfaceAreaHemiSpheres_));
 
          dragResults_[quantities_["timestep"]*writeCounter_ + executionCounter_] = float_c(t);
          dragResults_[quantities_["Fx"]*writeCounter_ + executionCounter_] = float_c(force_[0]);
@@ -142,14 +149,15 @@ void DragEvaluation::forceCalculation(const uint_t level)
          dragResults_[quantities_["CDReal"]*writeCounter_ + executionCounter_] = float_c(cDRealArea);
          dragResults_[quantities_["CLReal"]*writeCounter_ + executionCounter_] = float_c(cLRealArea);
       }
-   }
-   force_[0] = double_c(0.0);
-   force_[1] = double_c(0.0);
-   force_[2] = double_c(0.0);
 
-   ++executionCounter_;
-   if(executionCounter_ >= writeCounter_ || totalExecutionCounter_ == timesteps_){
-      writeResults();
+      force_[0] = double_c(0.0);
+      force_[1] = double_c(0.0);
+      force_[2] = double_c(0.0);
+
+      ++executionCounter_;
+      if(executionCounter_ >= writeCounter_ || totalExecutionCounter_ == timesteps_){
+         writeResults();
+      }
    }
 }
 
diff --git a/apps/showcases/PorousMediaGPU/GridGeneration.h b/apps/showcases/PorousMediaGPU/GridGeneration.h
index 53e67e02d..0dbafcbd2 100644
--- a/apps/showcases/PorousMediaGPU/GridGeneration.h
+++ b/apps/showcases/PorousMediaGPU/GridGeneration.h
@@ -52,8 +52,17 @@ uint_t blockCells(const Setup& setup, const bool withGhostLayers){
    return cells;
 }
 
+using RefinementSelectionFunction = std::function<void (SetupBlockForest &)>;
 
-void createSetupBlockForest(SetupBlockForest& setupBfs, const Setup& setup, const bool useMPIManager=false)
+void noRefinement(SetupBlockForest &)
+{
+    return;
+}
+
+RefinementSelectionFunction defaultRefinement = noRefinement;
+
+
+void createSetupBlockForest(SetupBlockForest& setupBfs, const Setup& setup, const RefinementSelectionFunction& refinementSelection, const bool useMPIManager=false)
 {
    WALBERLA_LOG_INFO_ON_ROOT("Generating SetupBlockForest...")
 
@@ -63,7 +72,7 @@ void createSetupBlockForest(SetupBlockForest& setupBfs, const Setup& setup, cons
    if(useMPIManager) {numProcesses = uint_c(mpi::MPIManager::instance()->numProcesses());}
 
    // TPMSRefinementSelection tpmsRefinementSelection( setup );
-   // setupBfs.addRefinementSelectionFunction(std::function<void(SetupBlockForest &)>(tpmsRefinementSelection));
+   setupBfs.addRefinementSelectionFunction(refinementSelection);
 
    // SphereBlockExclusion SphereBlockExclusion( Sphere );
    // setupBfs.addBlockExclusionFunction(SphereBlockExclusion);
@@ -119,7 +128,7 @@ void createSetupBlockForest(SetupBlockForest& setupBfs, const Setup& setup, cons
    }
 }
 
-void createBlockForest(shared_ptr< BlockForest >& bfs, const Setup& setup){
+void createBlockForest(shared_ptr< BlockForest >& bfs, const Setup& setup, const RefinementSelectionFunction& refinementSelection=defaultRefinement){
    if (mpi::MPIManager::instance()->numProcesses() > 1){
       // Load structured block forest from file
       std::ostringstream oss;
@@ -129,7 +138,7 @@ void createBlockForest(shared_ptr< BlockForest >& bfs, const Setup& setup){
       if(!infile.good()){
          WALBERLA_LOG_WARNING_ON_ROOT("Blockforest was not created beforehand and thus needs to be created on the fly. For large simulation runs this can be a severe problem!")
          SetupBlockForest setupBfs;
-         createSetupBlockForest(setupBfs, setup, true);
+         createSetupBlockForest(setupBfs, setup, refinementSelection, true);
          bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()), setupBfs);
       }
       else{
@@ -139,7 +148,7 @@ void createBlockForest(shared_ptr< BlockForest >& bfs, const Setup& setup){
    }
    else{
       SetupBlockForest setupBfs;
-      createSetupBlockForest(setupBfs, setup);
+      createSetupBlockForest(setupBfs, setup, refinementSelection);
       bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()), setupBfs);
    }
 }
\ No newline at end of file
diff --git a/apps/showcases/PorousMediaGPU/InitializerFunctions.cpp b/apps/showcases/PorousMediaGPU/InitializerFunctions.cpp
index 73a62abc2..3db223d20 100644
--- a/apps/showcases/PorousMediaGPU/InitializerFunctions.cpp
+++ b/apps/showcases/PorousMediaGPU/InitializerFunctions.cpp
@@ -74,24 +74,23 @@ void initHemisphers(const shared_ptr< StructuredBlockStorage >& blocks, const Bl
    auto domain = blocks->getDomain();
    const real_t yMin = domain.yMin();
    const real_t yMax = domain.yMax();
-   
-   for (auto& block : *blocks){
-      const auto flagField    = block.getData< FlagField_T >(flagFieldID);
+
+   for (auto bIt = blocks->begin(); bIt != blocks->end(); ++bIt){
+      Block& b             = dynamic_cast< Block& >(*bIt);
+      const auto flagField    = b.getData< FlagField_T >(flagFieldID);
       const auto boundaryFlag = flagField->getOrRegisterFlag(setup.obstacleUID);
-      
-      WALBERLA_FOR_ALL_CELLS_XYZ(flagField, Cell globalCell;
-         blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z));
-         Vector3<real_t> p;
-         blocks->getCellCenter(p, globalCell, 0);
-         if(p[1] > radius && p[1] < yMax - radius){
+
+      for( auto it = flagField->beginWithGhostLayer(1); it != flagField->end(); ++it ){
+         Vector3< real_t > cellCenter = blocks->getBlockLocalCellCenter( b, it.cell() );
+         blocks->mapToPeriodicDomain(cellCenter);
+         if(cellCenter[1] > radius && cellCenter[1] < yMax - radius){
             continue;
          }
-
-         auto center = getCenterOfHemisphere(p, radius, hemispheresDistance, yMin, yMax);
-         if(hemispheresContains(p, center, radius)) {
-            addFlag(flagField->get(x, y, z), boundaryFlag);
+         auto center = getCenterOfHemisphere(cellCenter, radius, hemispheresDistance, yMin, yMax);
+         if(hemispheresContains(cellCenter, center, radius)) {
+            addFlag(flagField->get(it.x(), it.y(), it.z()), boundaryFlag);
          }
-      )
+      }
    }
 }
 
diff --git a/apps/showcases/PorousMediaGPU/PorousMediaGPU.cpp b/apps/showcases/PorousMediaGPU/PorousMediaGPU.cpp
index 7d32b0352..84b9d75a4 100644
--- a/apps/showcases/PorousMediaGPU/PorousMediaGPU.cpp
+++ b/apps/showcases/PorousMediaGPU/PorousMediaGPU.cpp
@@ -169,6 +169,27 @@ real_t defaultWallDistance(const Cell& /*fluidCell*/, const Cell& /*boundaryCell
    return real_c(0.5);
 }
 
+void refineWalls(SetupBlockForest& forest, const real_t hemispheresRadius, const real_t dx, const uint_t level)
+{
+   auto domain = forest.getDomain();
+   const real_t yMin = domain.yMin();
+   const real_t yMax = domain.yMax();
+
+   const real_t t = hemispheresRadius + dx;
+
+   const AABB minAABB(domain.xMin(), domain.yMin(), domain.zMin(), domain.xMax(), domain.yMin() + t, domain.zMax());
+   const AABB maxAABB(domain.xMin(), domain.yMax() - t, domain.zMin(), domain.xMax(), domain.yMax(), domain.zMax());
+
+   for (auto block = forest.begin(); block != forest.end(); ++block){
+      const AABB& aabb = block->getAABB();
+
+      if (block->getLevel() < level && (minAABB.intersects(aabb) || maxAABB.intersects(aabb)) ){
+      //if (block->getLevel() < level && minAABB.intersects(aabb) ){
+         block->setMarker(true);
+      }
+   }
+}
+
 
 
 int main(int argc, char** argv){
@@ -195,17 +216,18 @@ int main(int argc, char** argv){
                               "\n   + stencil:             " << setup.stencil <<
                               "\n   + streaming:           " << setup.streamingPattern <<
                               "\n   + compressible:        " << ( StorageSpecification_T::compressible ? "yes" : "no" ) <<
-                              "\n   + resolution:          " << setup.dxC << " - on the coarsest grid" <<
                               "\n- simulation properties:  "
                               "\n   + kin. viscosity:      " << setup.kinViscosity << " [m^2/s] (on the coarsest grid)" <<
                               "\n   + viscosity LB:        " << setup.viscosity  << " [dx*dx/dt] (on the coarsest grid)" <<
                               "\n   + omega:               " << setup.omega << " (on the coarsest grid)" <<
+                              "\n   + omega:               " << setup.omegaF << " (on the finest grid)" <<
                               "\n   + inflow velocity:     " << setup.referenceVelocity << " [m/s]" <<
                               "\n   + lattice velocity:    " << setup.latticeVelocity << " [dx/dt]" <<
                               "\n   + Reynolds number:     " << setup.reynoldsNumber <<
                               "\n   + Mach number:         " << setup.machNumber <<
                               "\n   + driving force:       " << setup.drivingForce <<
                               "\n   + dx (coarsest grid):  " << setup.dxC << " [m]" <<
+                              "\n   + dx (finest grid):    " << setup.dxF << " [m]" <<
                               "\n   + dt (coarsest grid):  " << setup.dt << " [s]" <<
                               "\n   + conversion Faktor:   " << setup.conversionFactor <<
                               "\n   + #time steps:         " << setup.timesteps << " (on the coarsest grid, " << ( real_c(1.0) / setup.dt ) << " for 1s of real time)" <<
@@ -214,7 +236,10 @@ int main(int argc, char** argv){
    bool writeSetupForestAndReturn = loggingParameters.getParameter< bool >("writeSetupForestAndReturn", false);
    if (uint_c(MPIManager::instance()->numProcesses()) > 1) writeSetupForestAndReturn = false;
    shared_ptr< BlockForest > bfs;
-   createBlockForest(bfs, setup);
+
+   auto refinementSelection = std::bind(refineWalls, _1, real_c(1.0), setup.dxF, setup.refinementLevels);
+
+   createBlockForest(bfs, setup, refinementSelection);
 
    if (writeSetupForestAndReturn && mpi::MPIManager::instance()->numProcesses() == 1){
       WALBERLA_LOG_INFO_ON_ROOT("BlockForest has been created and writen to file. Returning program")
@@ -224,28 +249,28 @@ int main(int argc, char** argv){
 
    auto blocks = std::make_shared< StructuredBlockForest >(bfs, setup.cellsPerBlock[0], setup.cellsPerBlock[1], setup.cellsPerBlock[2]);
    blocks->createCellBoundingBoxes();
+   auto finalDomain = blocks->getDomain();
 
    const StorageSpecification_T StorageSpec = StorageSpecification_T();
    // Creating fields
    uint_t step = 0;
    WALBERLA_LOG_INFO_ON_ROOT(++step << ". Allocating data")
    IDs ids;
-   ids.pdfField  = lbm_generated::addPdfFieldToStorage(blocks, "pdfs", StorageSpec, uint_c(1), field::fzyx);
+   ids.pdfField  = lbm_generated::addPdfFieldToStorage(blocks, "pdfs", StorageSpec, setup.numGhostLayers, field::fzyx);
 
    auto allocator = make_shared< gpu::HostFieldAllocator<macroFieldType> >(); // use pinned memory allocator for faster CPU-GPU memory transfers
-   ids.velocityField = field::addToStorage< VelocityField_T >(blocks, "vel", macroFieldType(0.0), field::fzyx, uint_c(1), allocator);
-   ids.meanVelocityField = field::addToStorage< VelocityField_T >(blocks, "mean velocity", macroFieldType(0.0), field::fzyx, uint_c(1), allocator);
-   ids.densityField  = field::addToStorage< ScalarField_T >(blocks, "density", macroFieldType(1.0), field::fzyx, uint_c(1), allocator);
-   ids.flagField     = field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field");
+   ids.velocityField = field::addToStorage< VelocityField_T >(blocks, "vel", macroFieldType(0.0), field::fzyx, setup.numGhostLayers, allocator);
+   ids.meanVelocityField = field::addToStorage< VelocityField_T >(blocks, "mean velocity", macroFieldType(0.0), field::fzyx, setup.numGhostLayers, allocator);
+   ids.densityField  = field::addToStorage< ScalarField_T >(blocks, "density", macroFieldType(1.0), field::fzyx, setup.numGhostLayers, allocator);
+   ids.flagField     = field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field", setup.numGhostLayers);
 
    ids.pdfFieldGPU      = lbm_generated::addGPUPdfFieldToStorage< PdfField_T >(blocks, ids.pdfField, StorageSpec, "pdfs on GPU", true);
    ids.velocityFieldGPU = gpu::addGPUFieldToStorage< VelocityField_T >(blocks, ids.velocityField, "velocity on GPU", true);
    ids.meanVelocityFieldGPU = gpu::addGPUFieldToStorage< VelocityField_T >(blocks, ids.meanVelocityField, "mean velocity on GPU", true);
    ids.densityFieldGPU  = gpu::addGPUFieldToStorage< ScalarField_T >(blocks, ids.densityField, "velocity on GPU", true);
 
-
    std::function< Vector3< uint_t > (const shared_ptr< StructuredBlockStorage >&, IBlock * const) > getInflowSliceSize = inflowSliceSize;
-   ids.velocityFieldInflowSlice    = field::addToStorage< VelocityField_T >(blocks, "velocity Inflow", getInflowSliceSize, macroFieldType(0.0), field::fzyx, uint_c(1));
+   ids.velocityFieldInflowSlice    = field::addToStorage< VelocityField_T >(blocks, "velocity Inflow", getInflowSliceSize, macroFieldType(0.0), field::fzyx, setup.numGhostLayers);
    ids.velocityFieldInflowSliceGPU = gpu::addGPUFieldToStorage< VelocityField_T >(blocks, ids.velocityFieldInflowSlice, "velocity Inflow on GPU", true);
 
    const Cell innerOuterSplit = Cell(parameters.getParameter< Vector3<cell_idx_t> >("innerOuterSplit", Vector3<cell_idx_t>(1, 1, 1)));
@@ -253,8 +278,19 @@ int main(int argc, char** argv){
    WelfordSweep_T welfordSweep(ids.meanVelocityFieldGPU, ids.velocityFieldGPU, real_c(0.0));
 
    WALBERLA_LOG_INFO_ON_ROOT(++step << ". Initialising data")
+
+   if(setup.initialiseWithlatticeVelocity){
+      for (auto& block : *blocks){
+         auto velocity = block.getData< VelocityField_T >(ids.velocityField);
+         WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(velocity,
+            velocity->get(x, y, z, 0) = setup.latticeVelocity;
+         )
+      }
+   }
+   gpu::fieldCpy< gpu::GPUField< VelocityField_T::value_type >, VelocityField_T >(blocks, ids.velocityFieldGPU, ids.velocityField);
+
    for (auto& block : *blocks){
-      sweepCollection.initialise(&block);
+      sweepCollection.initialise(&block, cell_idx_c(setup.numGhostLayers - 1));
 
       auto velocityInflow = block.getData< VelocityField_T >(ids.velocityFieldInflowSlice);
       WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(velocityInflow,
@@ -267,18 +303,19 @@ int main(int argc, char** argv){
    auto inflow = std::make_shared<Inflow>(blocks, setup, ids);
    auto probe = std::make_shared<Probe>(blocks, setup, ids, config->getOneBlock("Probes"));
 
+/*
    const real_t lenght = 8.0;
    const real_t res = 3.0;
    const real_t start = 0.0;
-
-/*
    const uint_t nx = uint_c(std::floor(lenght / (setup.dxC * res)));
    for(uint_t i = 0; i < nx; ++i){
       probe->addPoint(Vector3<real_t>(start + real_c(i) * (setup.dxC * res), 0.5, 0.5 ));
    }
 */ 
+
    probe->initialise();
 
+
    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    ///                                      LB SWEEPS AND BOUNDARY HANDLING                                       ///
    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -287,11 +324,12 @@ int main(int argc, char** argv){
    WALBERLA_LOG_INFO_ON_ROOT(++step << ". Setting boundary conditions")
    for (auto& block : *blocks){
       auto flagField = block.getData< FlagField_T >(ids.flagField);
-      setup.inflowFlag   = flagField->registerFlag(setup.inflowUID);
-      setup.outflowFlag  = flagField->registerFlag(setup.outflowUID);
-      setup.obstacleFlag = flagField->registerFlag(setup.obstacleUID);
-      setup.noSlipFlag   = flagField->registerFlag(setup.noSlipUID);
-      setup.fluidFlag    = flagField->registerFlag(setup.fluidUID);
+      setup.inflowFlag          = flagField->registerFlag(setup.inflowUID);
+      setup.turbulentInflowFlag = flagField->registerFlag(setup.turbulentInflowUID);
+      setup.outflowFlag         = flagField->registerFlag(setup.outflowUID);
+      setup.obstacleFlag        = flagField->registerFlag(setup.obstacleUID);
+      setup.noSlipFlag          = flagField->registerFlag(setup.noSlipUID);
+      setup.fluidFlag           = flagField->registerFlag(setup.fluidUID);
    }
 
 
@@ -302,6 +340,7 @@ int main(int argc, char** argv){
       initGrid(blocks, ids.flagField, setup);
    }
 
+
    if(config->getNumBlocks("RoughWall") > 0){
       WALBERLA_LOG_INFO_ON_ROOT(++step << ". Using RoughWall setup")
       auto wallParameters = config->getOneBlock("RoughWall");
@@ -319,10 +358,9 @@ int main(int argc, char** argv){
    }
 
 
-   geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, ids.flagField, setup.fluidUID);
-   // BoundaryCollection_T boundaryCollection(blocks, ids.flagField, ids.pdfFieldGPU, setup.fluidUID, ids.velocityFieldInflowSliceGPU, setup.drivingForce);
-   BoundaryCollection_T boundaryCollection(blocks, ids.flagField, ids.pdfFieldGPU, setup.fluidUID, ids.velocityFieldInflowSliceGPU, wallDistanceFunctor, ids.pdfField);
-   // BoundaryCollection_T boundaryCollection(blocks, ids.flagField, ids.pdfFieldGPU, setup.fluidUID, ids.pdfField); 
+   geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, ids.flagField, setup.fluidUID, cell_idx_c(0));
+   BoundaryCollection_T boundaryCollection(blocks, ids.flagField, ids.pdfFieldGPU, setup.fluidUID,
+                                           ids.velocityFieldInflowSliceGPU, setup.latticeVelocity, wallDistanceFunctor, ids.pdfField);
 
    if(config->getNumBlocks("TPMS") > 0 && loggingParameters.getParameter< bool >("writeSurfaceMeshOfTPMS", false) ){
       WALBERLA_LOG_INFO_ON_ROOT(++step << ". Writing TPMS as surface file")
@@ -354,6 +392,7 @@ int main(int argc, char** argv){
    lbm_generated::BasicRecursiveTimeStepGPU< GPUPdfField_T, SweepCollection_T, BoundaryCollection_T >
       LBMMeshRefinement(blocks, ids.pdfFieldGPU, sweepCollection, boundaryCollection, nonUniformCommunication, nonUniformPackInfo);
 
+   // timeloop.addFuncBeforeTimeStep(LBMMeshRefinement, "Refinement timestep");
    LBMMeshRefinement.addRefinementToTimeLoop(timeloop, uint_c(0));
 
    shared_ptr< DragEvaluation > evaluation;
@@ -361,19 +400,57 @@ int main(int argc, char** argv){
       WALBERLA_LOG_INFO_ON_ROOT(++step << ". Setup Drag Evaluation")
       evaluation = make_shared<DragEvaluation>(blocks, boundaryCollection, ids, setup, config);
       LBMMeshRefinement.addPostBoundaryHandlingBlockFunction(evaluation->forceCalculationFunctor());
+   }
+
+
+   const AABB sliceXY(finalDomain.xMin(), finalDomain.yMin(), finalDomain.center()[2] - setup.dxF,
+                      finalDomain.xMax(), finalDomain.yMax(), finalDomain.center()[2] + setup.dxF);
 
+/*                      
+   CellInterval globalCellInterval;
+   blocks->getCellBBFromAABB( globalCellInterval, sliceXY, uint_c(0) );
+
+   std::map<IBlock*, CellInterval> sliceCellIntervals;
+
+   for (auto& block : *blocks){
+      CellInterval cellBB;
+      blocks->getCellBBFromAABB( cellBB, sliceXY, blocks->getLevel(block) );
+      blocks->transformGlobalToBlockLocalCellInterval( cellBB, block );
+
+      Cell maxCell(cell_idx_c(0), cell_idx_c(0), cell_idx_c(0));
+      Cell minCell(cell_idx_c( blocks->getNumberOfXCells( block ) ), cell_idx_c( blocks->getNumberOfYCells( block ) ), cell_idx_c( blocks->getNumberOfZCells( block ) ));
+
+      for( cell_idx_t z = cell_idx_c(0); z != cell_idx_c( blocks->getNumberOfZCells( block ) ); ++z ){
+         for( cell_idx_t y = cell_idx_c(0); y != cell_idx_c( blocks->getNumberOfYCells( block ) ); ++y ){
+            for( cell_idx_t x = cell_idx_c(0); x != cell_idx_c( blocks->getNumberOfXCells( block ) ); ++x ){
+               if( cellBB.contains(x,y,z) ){
+                  if(Cell(x, y, z) < minCell){
+                     minCell = Cell(x, y, z);
+                  }
+                  if(!(Cell(x, y, z) < maxCell)){
+                     maxCell = Cell(x, y, z);
+                  }
+               }
+            }
+         }
+      }
+      CellInterval localCellInterval(minCell, maxCell);
+      if(!localCellInterval.empty()){
+         sliceCellIntervals[&block] = localCellInterval;
+      }
    }
 
-   
    timeloop.add() << BeforeFunction([&]()
-      {
-         for (auto& block : *blocks){
-            sweepCollection.calculateMacroscopicParameters(&block);
-         }
-         const uint_t velCtr = uint_c(welfordSweep.getCounter());
-         welfordSweep.setCounter(welfordSweep.getCounter() + real_c(1.0));
-      }) << Sweep(welfordSweep.getSweep(), "welford sweep");
+   {
+      for (auto const& sliceCellInterval : sliceCellIntervals){
+         sweepCollection.calculateMacroscopicParametersCellInterval(sliceCellInterval.first, sliceCellInterval.second);
+      }
+
+      const uint_t velCtr = uint_c(welfordSweep.getCounter());
+      welfordSweep.setCounter(welfordSweep.getCounter() + real_c(1.0));
+   }) << Sweep(welfordSweep.getSweepOnCellInterval(blocks, globalCellInterval), "welford sweep");
 
+*/
 
    /*SweepTimeloop timeloop(blocks->getBlockStorage(), setup.timesteps);
    timeloop.add() << BeforeFunction(communication.getCommunicateFunctor(), "communication")
@@ -394,7 +471,6 @@ int main(int argc, char** argv){
    const uint_t initialWriteCallsToSkip = VTKWriter.getParameter< uint_t >("initialWriteCallsToSkip", uint_c(0.0));
    const real_t sliceThickness          = VTKWriter.getParameter< real_t >("sliceThickness", real_c(1.0));
 
-   auto finalDomain = blocks->getDomain();
    if (vtkWriteFrequency > 0){
       const std::string vtkFolder = "VTKPorousMediaRE_" + std::to_string(uint64_c(setup.reynoldsNumber));
       auto vtkOutput =
@@ -404,16 +480,19 @@ int main(int argc, char** argv){
       vtkOutput->setInitialWriteCallsToSkip(initialWriteCallsToSkip);
 
       vtkOutput->addBeforeFunction([&]() {
-         /*
          for (auto& block : *blocks){
             sweepCollection.calculateMacroscopicParameters(&block);
          }
-         */
+
+
+         // for (auto const& sliceCellInterval : sliceCellIntervals){
+         //    sweepCollection.calculateMacroscopicParametersCellInterval(sliceCellInterval.first, sliceCellInterval.second);
+         // }
 
 #if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
          gpu::fieldCpy< VelocityField_T, gpu::GPUField< VelocityField_T::value_type > >(blocks, ids.velocityField, ids.velocityFieldGPU);
-         gpu::fieldCpy< VelocityField_T, gpu::GPUField< VelocityField_T::value_type > >(blocks, ids.meanVelocityField, ids.meanVelocityFieldGPU);
-         gpu::fieldCpy< ScalarField_T, gpu::GPUField< ScalarField_T::value_type > >(blocks, ids.densityField, ids.densityFieldGPU);
+         // gpu::fieldCpy< VelocityField_T, gpu::GPUField< VelocityField_T::value_type > >(blocks, ids.meanVelocityField, ids.meanVelocityFieldGPU);
+         // gpu::fieldCpy< ScalarField_T, gpu::GPUField< ScalarField_T::value_type > >(blocks, ids.densityField, ids.densityFieldGPU);
 #endif
       });
 
@@ -425,8 +504,8 @@ int main(int argc, char** argv){
 
 
       if (VTKWriter.getParameter< bool >("writeOnlySlice", true)){
-         const AABB sliceXY(finalDomain.xMin(), finalDomain.yMin(), finalDomain.center()[2] - sliceThickness * setup.dxF,
-                            finalDomain.xMax(), finalDomain.yMax(), finalDomain.center()[2] + sliceThickness * setup.dxF);
+         // const AABB sliceXY(finalDomain.xMin(), finalDomain.yMin(), finalDomain.center()[2] - sliceThickness * setup.dxF,
+         //                    finalDomain.xMax(), finalDomain.yMax(), finalDomain.center()[2] + sliceThickness * setup.dxF);
          vtkOutput->addCellInclusionFilter(vtk::AABBCellFilter(sliceXY));
 
          if (VTKWriter.getParameter< bool >("writeXZSlice", false)){
diff --git a/apps/showcases/PorousMediaGPU/PorousMediaGPU.py b/apps/showcases/PorousMediaGPU/PorousMediaGPU.py
index 6138be652..009f0742b 100644
--- a/apps/showcases/PorousMediaGPU/PorousMediaGPU.py
+++ b/apps/showcases/PorousMediaGPU/PorousMediaGPU.py
@@ -12,6 +12,7 @@ from lbmpy.flow_statistics import welford_assignments
 
 from pystencils_walberla import CodeGeneration, generate_info_header, generate_sweep
 from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator
+from lbmpy_walberla import RefinementScaling
 
 info_header = """
 #pragma once
@@ -23,11 +24,13 @@ std::map<std::string, std::string> infoMap{{{{"stencil", "{stencil}"}},
 
 omega = sp.symbols("omega")
 inlet_velocity = sp.symbols("u_x")
-force_vector = tuple([sp.Symbol("F_x"), 0.0, 0.0])
+fx = sp.Symbol("Fx")
+force_vector = tuple([fx, 0.0, 0.0])
 max_threads = 256
-
+ 
 with CodeGeneration() as ctx:
     target = Target.GPU if ctx.gpu else Target.CPU
+    name_prefix = "PorousMediaGPU"
     sweep_params = {'block_size': (128, 1, 1)} if ctx.gpu else {}
 
     # The application is meant to be compiled with double precision. For single precision, the pdf_dtype can be switched
@@ -40,7 +43,7 @@ with CodeGeneration() as ctx:
     q = stencil.Q
     dim = stencil.D
 
-    streaming_pattern = 'aa'
+    streaming_pattern = 'esopull'
 
     pdfs = fields(f"pdfs({stencil.Q}): {pdf_dtype}[3D]", layout='fzyx')
     velocity_field, density_field = fields(f"velocity({dim}), density(1) : {dtype}[{dim}D]", layout='fzyx')
@@ -48,23 +51,20 @@ with CodeGeneration() as ctx:
 
     macroscopic_fields = {'density': density_field, 'velocity': velocity_field}
 
-    # method_enum = Method.CUMULANT
-    method_enum = Method.SRT
+    # method_enum = Method.CENTRAL_MOMENT
+    method_enum = Method.CUMULANT
     lbm_config = LBMConfig(
         method=method_enum,
         stencil=stencil,
         relaxation_rate=omega,
         compressible=True,
-        force_model=ForceModel.GUO,
         force=force_vector,
         # subgrid_scale_model=SubgridScaleModel.QR,
         fourth_order_correction=0.01 if method_enum == Method.CUMULANT and stencil.Q == 27 else False,
-        field_name='pdfs',
         streaming_pattern=streaming_pattern
     )
 
-    lbm_opt = LBMOptimisation(cse_global=False, cse_pdfs=False, field_layout="fzyx",
-                              symbolic_field=pdfs)
+    lbm_opt = LBMOptimisation(cse_global=False, cse_pdfs=False, field_layout="fzyx")
 
     collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt)
     if lbm_config.method == Method.CUMULANT:
@@ -80,8 +80,13 @@ with CodeGeneration() as ctx:
                                                   boundary_object=quadratic_bounce_back,
                                                   field_data_type=pdf_dtype)
 
+
+    turbulentubb = lbm_boundary_generator(class_name='TurbulentInflow', flag_uid='TurbulentInflow',
+                                          boundary_object=UBB(velocity=velocity_field.center_vector, density=1.0, data_type=dtype, dim=dim),
+                                          field_data_type=pdf_dtype)
+
     ubb = lbm_boundary_generator(class_name='UBB', flag_uid='UBB',
-                                 boundary_object=UBB(velocity=velocity_field.center_vector, density=1.0, data_type=dtype, dim=dim),
+                                 boundary_object=UBB(velocity=(inlet_velocity, 0.0, 0.0), density=1.0, data_type=dtype, dim=dim),
                                  field_data_type=pdf_dtype)
 
     # outflow_boundary = ExtrapolationOutflow(stencil[4], lb_method, data_type=pdf_dtype, density=1.0)
@@ -94,13 +99,17 @@ with CodeGeneration() as ctx:
     #                                 boundary_object=FixedDensity(1.0),
     #                                  field_data_type=pdf_dtype)
 
-    generate_lbm_package(ctx, name="PorousMediaGPU", collision_rule=collision_rule,
+    refinement_scaling = RefinementScaling()
+    refinement_scaling.add_standard_relaxation_rate_scaling(omega)
+    refinement_scaling.add_force_scaling(fx)
+
+    generate_lbm_package(ctx, name=name_prefix, collision_rule=collision_rule,
                          lbm_config=lbm_config, lbm_optimisation=lbm_opt,
-                         nonuniform=True, boundaries=[no_slip, no_slip_interpolated, ubb, outflow],
+                         nonuniform=True, refinement_scaling=refinement_scaling, boundaries=[no_slip, no_slip_interpolated, ubb, turbulentubb, outflow],
                          macroscopic_fields=macroscopic_fields, gpu_indexing_params=sweep_params,
                          target=target, data_type=dtype, pdfs_data_type=pdf_dtype,
                          max_threads=max_threads)
-    
+
     welford_update = welford_assignments(field=velocity_field, mean_field=mean_velocity)
     generate_sweep(ctx, "Welford", welford_update, target=target)
 
@@ -108,7 +117,7 @@ with CodeGeneration() as ctx:
                       'ScalarField_T': density_field}
 
     # Info header containing correct template definitions for stencil and field
-    generate_info_header(ctx, 'PorousMediaGPUInfoHeader',
+    generate_info_header(ctx, f'{name_prefix}InfoHeader',
                          field_typedefs=field_typedefs)
 
     infoHeaderParams = {
@@ -117,4 +126,4 @@ with CodeGeneration() as ctx:
         'collision_operator': lbm_config.method.name.lower(),
     }
 
-    ctx.write_file("PorousMediaGPUStaticDefines.h", info_header.format(**infoHeaderParams))
\ No newline at end of file
+    ctx.write_file(f"{name_prefix}StaticDefines.h", info_header.format(**infoHeaderParams))
\ No newline at end of file
diff --git a/apps/showcases/PorousMediaGPU/Setup.h b/apps/showcases/PorousMediaGPU/Setup.h
index a0a87b5fa..13fe8a56a 100644
--- a/apps/showcases/PorousMediaGPU/Setup.h
+++ b/apps/showcases/PorousMediaGPU/Setup.h
@@ -23,6 +23,8 @@
 #include "core/DataTypes.h"
 #include "core/math/Vector3.h"
 
+#include "lbm_generated/refinement/RefinementScaling.h"
+
 #include <string>
 
 #include "Types.h"
@@ -59,9 +61,10 @@ struct Setup
       WALBERLA_CHECK_FLOAT_EQUAL_EPSILON(coarseMeshSize, domainSize[1] / real_c(rootBlocks[1] * cellsPerBlock[1]), real_c(1e-12), "Resulting resolution is not equal in x and y direxction. This is not supported");
       WALBERLA_CHECK_FLOAT_EQUAL_EPSILON(coarseMeshSize, domainSize[2] / real_c(rootBlocks[2] * cellsPerBlock[2]), real_c(1e-12), "Resulting resolution is not equal in x and z direxction. This is not supported");
 
-      referenceLength   = domainSize[1];
-      reynoldsNumber    = parameters.getParameter< real_t >("reynoldsNumber");
-      latticeVelocity   = parameters.getParameter< real_t >("latticeVelocity");
+      referenceLength               = domainSize[1];
+      reynoldsNumber                = parameters.getParameter< real_t >("reynoldsNumber");
+      latticeVelocity               = parameters.getParameter< real_t >("latticeVelocity");
+      initialiseWithlatticeVelocity = parameters.getParameter< bool >("initialiseWithlatticeVelocity");
 
       useGrid               = turbulence.getParameter< bool >("useGrid");
       turbulentInflow       = turbulence.getParameter< bool >("turbulentInflow");
@@ -71,19 +74,23 @@ struct Setup
       uc                    = turbulence.getParameter< Vector3< real_t > >("U_c");
 
       if(config->getNumBlocks("RoughWall") > 0){
-         kinViscosity = parameters.getParameter< real_t >("viscosity");
+         kinViscosity                    = parameters.getParameter< real_t >("viscosity");
+         const real_t bulkReynoldsNumber = parameters.getParameter< real_t >("bulkReynoldsNumber");
          referenceLength *= real_c(0.5);
-         
-         referenceVelocity = (reynoldsNumber * kinViscosity) / referenceLength;
-         dt  = latticeVelocity / referenceVelocity * coarseMeshSize;
-         viscosity = (kinViscosity * dt) / (coarseMeshSize * coarseMeshSize);
 
+         const real_t bulkVelocity = (bulkReynoldsNumber * kinViscosity) / referenceLength;
+         dt  = latticeVelocity / bulkVelocity * coarseMeshSize;
+         viscosity = (kinViscosity * dt) / (coarseMeshSize * coarseMeshSize);
          const real_t latticeReferenceLength = referenceLength / coarseMeshSize;
+
+         referenceVelocity = (reynoldsNumber * kinViscosity) / referenceLength;
          const real_t speedOfSound = real_c(real_c(1.0) / std::sqrt( real_c(3.0) ));
          machNumber = latticeVelocity / speedOfSound;
-         // viscosity  = real_c((latticeVelocity * latticeReferenceLength) / reynoldsNumber);
          omega      = real_c(real_c(1.0) / (real_c(3.0) * viscosity + real_c(0.5)));
-         drivingForce = calculateDrivingForce(latticeVelocity, latticeReferenceLength);
+
+         const real_t frictionVelocity = (reynoldsNumber * viscosity) / latticeReferenceLength;
+
+         drivingForce = calculateDrivingForce(frictionVelocity, latticeReferenceLength);
       }
       else{
          referenceVelocity = parameters.getParameter< real_t >("referenceVelocity");
@@ -98,6 +105,8 @@ struct Setup
          kinViscosity = viscosity * coarseMeshSize * coarseMeshSize / dt;
       }
 
+      omegaF = lbm_generated::relaxationRateScaling(omega, refinementLevels);
+
       rho = real_c(1.0);
       dxC = coarseMeshSize;
       dxF = real_c(coarseMeshSize) / real_c( 1 << refinementLevels );
@@ -115,11 +124,12 @@ struct Setup
       streamingPattern = infoMap["streamingPattern"];
       collisionModel   = infoMap["collisionOperator"];
 
-      fluidUID    = FlagUID("Fluid");
-      noSlipUID   = FlagUID("NoSlip");
-      obstacleUID = FlagUID("Obstacle");
-      inflowUID   = FlagUID("UBB");
-      outflowUID  = FlagUID("Outflow");
+      fluidUID           = FlagUID("Fluid");
+      noSlipUID          = FlagUID("NoSlip");
+      obstacleUID        = FlagUID("Obstacle");
+      inflowUID          = FlagUID("UBB");
+      turbulentInflowUID = FlagUID("TurbulentInflow");
+      outflowUID         = FlagUID("Outflow");
 
       writeSetupForestAndReturn    = logging.getParameter< bool >("writeSetupForestAndReturn", false);
       remainingTimeLoggerFrequency = logging.getParameter< real_t >("remainingTimeLoggerFrequency", 3.0);
@@ -141,6 +151,7 @@ struct Setup
    real_t referenceVelocity;
    real_t referenceLength;
    real_t latticeVelocity;
+   bool initialiseWithlatticeVelocity;
 
    bool useGrid;
    bool turbulentInflow;
@@ -155,6 +166,7 @@ struct Setup
    real_t viscosity;
    real_t kinViscosity;
    real_t omega;
+   real_t omegaF;
    real_t rho;
    real_t dxC;
    real_t dxF;
@@ -176,9 +188,11 @@ struct Setup
    FlagUID noSlipUID;
    FlagUID obstacleUID;
    FlagUID inflowUID;
+   FlagUID turbulentInflowUID;
    FlagUID outflowUID;
 
    uint8_t inflowFlag;
+   uint8_t turbulentInflowFlag;
    uint8_t outflowFlag;
    uint8_t noSlipFlag;
    uint8_t obstacleFlag;
diff --git a/apps/showcases/PorousMediaGPU/TPMS.prm b/apps/showcases/PorousMediaGPU/TPMS.prm
index b7ee4728e..019bd4890 100644
--- a/apps/showcases/PorousMediaGPU/TPMS.prm
+++ b/apps/showcases/PorousMediaGPU/TPMS.prm
@@ -5,7 +5,7 @@ Parameters
     referenceVelocity 1.0;
     referenceLength 1.0;
     latticeVelocity 0.025;
-    initialiseWithInletVelocity true;
+    initialiseWithlatticeVelocity true;
 
     timesteps 100001;
 
@@ -54,7 +54,7 @@ DomainSetup
 
 Boundaries
 {
-    Border { direction W;    walldistance -1;  flag UBB; }
+    Border { direction W;    walldistance -1;  flag TurbulentInflow; }
     Border { direction E;    walldistance -1;  flag Outflow; }
 }
 
diff --git a/apps/showcases/PorousMediaGPU/roughWall.prm b/apps/showcases/PorousMediaGPU/roughWall.prm
index c54b0bc6b..23e69c1e1 100644
--- a/apps/showcases/PorousMediaGPU/roughWall.prm
+++ b/apps/showcases/PorousMediaGPU/roughWall.prm
@@ -1,10 +1,12 @@
 Parameters
 {
     reynoldsNumber 400;
+    bulkReynoldsNumber 4466;
     viscosity 1.48e-5;
     latticeVelocity 0.05;
+    initialiseWithlatticeVelocity true;
 
-    timesteps 5001;
+    timesteps 800001;
 
     processMemoryLimit 512; // MiB
     innerOuterSplit <1, 1, 1>;
@@ -31,14 +33,12 @@ RoughWall
 
 DomainSetup
 {
-    // domainSize    < 6.9, 3.2, 12 >;
-    // cellsPerBlock < 69, 32, 120 >;
+    domainSize    < 34.6, 40, 4 >;
+    cellsPerBlock < 346, 40, 40 >;
 
-    domainSize    < 55.4, 40, 12 >;
-    cellsPerBlock < 554, 400, 120 >;
-    blocks    < 1, 1, 1 >;
-    periodic    < true, false, false >;
-    refinementLevels 0;
+    blocks    < 1, 10, 1 >;
+    periodic    < true, false, true >;
+    refinementLevels 2;
     numberProcesses 1; // This is for load balancing, overwritten if more than one proc is used
     blockForestFilestem porousMediaBlockForest;
 }
@@ -60,12 +60,12 @@ StabilityChecker
 
 VTKWriter
 {
-    vtkWriteFrequency 5000;
+    vtkWriteFrequency 1000;
     vtkGhostLayers 0;
     velocity true;
     writeVelocityAsMagnitude true;
     density false;
-    meanVelocity true;
+    meanVelocity false;
     flag false;
     writeOnlySlice true;
     sliceThickness 1;
@@ -73,7 +73,7 @@ VTKWriter
     amrFileFormat false;
     oneFilePerProcess false;
     samplingResolution -1;
-    initialWriteCallsToSkip 0;
+    initialWriteCallsToSkip 500000;
 }
 
 Logging
@@ -94,6 +94,6 @@ Probes
 
 DragEvaluation
 {
-    evaluationFrequency 1;
+    evaluationFrequency 40;
     filename drag.h5;
 }
\ No newline at end of file
diff --git a/python/lbmpy_walberla/packing_kernels.py b/python/lbmpy_walberla/packing_kernels.py
index af7be15cc..862a6cdaa 100644
--- a/python/lbmpy_walberla/packing_kernels.py
+++ b/python/lbmpy_walberla/packing_kernels.py
@@ -103,9 +103,9 @@ class PackingKernelsCodegen:
         self.accessors = [get_accessor(streaming_pattern, t) for t in get_timesteps(streaming_pattern)]
         self.mask_field = fields(f'mask : uint32 [{self.dim}D]', layout=src_field.layout)
 
-        # self.block_wise = True
-        # if not self.inplace or not self.config.target == Target.GPU:
-        self.block_wise = False
+        self.block_wise = True
+        if not self.inplace or not self.config.target == Target.GPU:
+            self.block_wise = False
 
         self.index = TypedSymbol("index", dtype=BasicType(np.int64))
         self.index_shape = TypedSymbol("_size_0", dtype=BasicType(np.int64))
diff --git a/python/lbmpy_walberla/sweep_collection.py b/python/lbmpy_walberla/sweep_collection.py
index 164dd94d3..a54fa088c 100644
--- a/python/lbmpy_walberla/sweep_collection.py
+++ b/python/lbmpy_walberla/sweep_collection.py
@@ -158,20 +158,6 @@ def generate_lbm_sweep_collection(ctx, class_name: str, collision_rule: LbmColli
     ctx.write_file(f"{class_name}.{source_extension}", source)
 
 
-class RefinementScaling:
-    def __init__(self):
-        self.scaling_info = []
-
-    def add_standard_relaxation_rate_scaling(self, viscosity_relaxation_rate):
-        self.add_scaling(viscosity_relaxation_rate)
-
-    def add_scaling(self, parameter):
-        if isinstance(parameter, sp.Symbol):
-            self.scaling_info.append(parameter.name)
-        else:
-            raise ValueError("Only pure symbols allowed")
-
-
 def lbm_kernel_family(class_name, kernel_name,
                       collision_rule, streaming_pattern, src_field, dst_field, config: CreateKernelConfig):
 
diff --git a/python/lbmpy_walberla/walberla_lbm_package.py b/python/lbmpy_walberla/walberla_lbm_package.py
index 7465a45bb..25da6c610 100644
--- a/python/lbmpy_walberla/walberla_lbm_package.py
+++ b/python/lbmpy_walberla/walberla_lbm_package.py
@@ -1,20 +1,25 @@
 from typing import Callable, List, Dict
 
 from pystencils import Target, Field
+from sympy import Symbol
 
 from lbmpy.creationfunctions import LbmCollisionRule, LBMConfig, LBMOptimisation
+from lbmpy.relaxationrates import get_shear_relaxation_rate
 
 from pystencils_walberla.cmake_integration import CodeGenerationContext
 
 from lbmpy_walberla.boundary_collection import generate_boundary_collection
 from lbmpy_walberla.storage_specification import generate_lbm_storage_specification
-from lbmpy_walberla.sweep_collection import generate_lbm_sweep_collection, RefinementScaling
+from lbmpy_walberla.sweep_collection import generate_lbm_sweep_collection
+
+from lbmpy_walberla import RefinementScaling
 
 
 def generate_lbm_package(ctx: CodeGenerationContext, name: str,
                          collision_rule: LbmCollisionRule,
                          lbm_config: LBMConfig, lbm_optimisation: LBMOptimisation,
-                         nonuniform: bool = False, boundaries: List[Callable] = None,
+                         nonuniform: bool = False, refinement_scaling: RefinementScaling = None, 
+                         boundaries: List[Callable] = None,
                          macroscopic_fields: Dict[str, Field] = None,
                          target: Target = Target.CPU,
                          data_type=None, pdfs_data_type=None,
@@ -33,13 +38,23 @@ def generate_lbm_package(ctx: CodeGenerationContext, name: str,
                                        data_type=pdfs_data_type,
                                        cpu_openmp=cpu_openmp)
 
-    if nonuniform:
-        omega = lbm_config.relaxation_rate
+    if refinement_scaling is None:
         refinement_scaling = RefinementScaling()
-        refinement_scaling.add_standard_relaxation_rate_scaling(omega)
+
+    if nonuniform:
+        if not refinement_scaling.scaling_info:
+            omega = get_shear_relaxation_rate(method)
+            refinement_scaling.add_standard_relaxation_rate_scaling(omega)
+
+            if lbm_config.force:
+                for f_i in lbm_config.force:
+                    if f_i != 0:
+                        assert isinstance(f_i, Symbol)
+                        refinement_scaling.add_force_scaling(f_i)  
     else:
         refinement_scaling = None
 
+
     generate_lbm_sweep_collection(ctx, f'{name}SweepCollection', collision_rule,
                                   lbm_config=lbm_config, lbm_optimisation=lbm_optimisation,
                                   refinement_scaling=refinement_scaling,
diff --git a/python/pystencils_walberla/jinja_filters.py b/python/pystencils_walberla/jinja_filters.py
index 22ae0ecea..b1c61b589 100644
--- a/python/pystencils_walberla/jinja_filters.py
+++ b/python/pystencils_walberla/jinja_filters.py
@@ -5,6 +5,7 @@ except ImportError:
     from jinja2 import contextfilter as jinja2_context_decorator
 
 from collections.abc import Iterable
+import re
 import sympy as sp
 
 from pystencils import Target, Backend
@@ -45,10 +46,7 @@ standard_parameter_registration = """
 for (uint_t level = 0; level < blocks->getNumberOfLevels(); level++)
 {{
     const {dtype} level_scale_factor = {dtype}(uint_t(1) << level);
-    const {dtype} one                = {dtype}(1.0);
-    const {dtype} half               = {dtype}(0.5);
-    
-    {name}Vector.push_back( {dtype}({name} / (level_scale_factor * (-{name} * half + one) + {name} * half)) );
+    {function_rules}
 }}
 """
 
@@ -263,11 +261,17 @@ def generate_refs_for_kernel_parameters(kernel_info, prefix, parameters_to_ignor
         symbols.difference_update(pointer_symbols)
     type_information = {p.symbol.name: p.symbol.dtype for p in kernel_info.parameters if not p.is_field_parameter}
     result = []
-    registered_parameters = [] if not parameter_registration else parameter_registration.scaling_info
+
+    if parameter_registration is None:
+        registered_parameters = []
+    else:
+        registered_parameters = [scaling[1] for scaling in parameter_registration.scaling_info]
+
     for s in symbols:
         if s in registered_parameters:
             dtype = type_information[s].c_name
             if not level_known:
+                level_known = True
                 result.append("const uint_t level = block->getBlockStorage().getLevel(*block);")
             result.append(f"{dtype} & {s} = {s}Vector[level];")
         else:
@@ -487,7 +491,9 @@ def generate_constructor_initializer_list(kernel_infos, parameters_to_ignore=Non
 
     # Then free parameters
     if parameter_registration is not None:
-        parameters_to_skip.extend(parameter_registration.scaling_info)
+        for scaling in parameter_registration.scaling_info:
+            parameters_to_skip.append(scaling[1])
+
 
     for kernel_info in kernel_infos:
         for param in kernel_info.parameters:
@@ -589,12 +595,18 @@ def generate_members(ctx, kernel_infos, parameters_to_ignore=None, only_fields=F
                 result.append(f"BlockDataID {param.field_name}ID;")
                 params_to_skip.append(param.field_name)
 
+    if parameter_registration is None:
+        scaling_identifiers = []
+    else:
+        scaling_identifiers = [scaling[1] for scaling in parameter_registration.scaling_info]
+
+
     for kernel_info in kernel_infos:
         for param in kernel_info.parameters:
             if only_fields and not param.is_field_parameter:
                 continue
             if not param.is_field_parameter and param.symbol.name not in params_to_skip:
-                if parameter_registration and param.symbol.name in parameter_registration.scaling_info:
+                if param.symbol.name in scaling_identifiers:
                     result.append(f"std::vector<{param.symbol.dtype}> {param.symbol.name}Vector;")
                 else:
                     result.append(f"{param.symbol.dtype} {param.symbol.name}_;")
@@ -719,10 +731,13 @@ def generate_getter(kernel_infos, parameter_registration=None):
 
     params_to_skip = []
     result = []
+    scaling_identifiers = []
+    if parameter_registration:
+        scaling_identifiers = [scaling[1] for scaling in parameter_registration.scaling_info]
     for kernel_info in kernel_infos:
         for param in kernel_info.parameters:
             if not param.is_field_parameter and param.symbol.name not in params_to_skip:
-                if parameter_registration and param.symbol.name in parameter_registration.scaling_info:
+                if param.symbol.name in scaling_identifiers:
                     continue
                 dtype = param.symbol.dtype
                 name = param.symbol.name
@@ -739,10 +754,13 @@ def generate_setter(kernel_infos, parameter_registration=None):
 
     params_to_skip = []
     result = []
+    scaling_identifiers = []
+    if parameter_registration:
+        scaling_identifiers = [scaling[1] for scaling in parameter_registration.scaling_info]
     for kernel_info in kernel_infos:
         for param in kernel_info.parameters:
             if not param.is_field_parameter and param.symbol.name not in params_to_skip:
-                if parameter_registration and param.symbol.name in parameter_registration.scaling_info:
+                if param.symbol.name in scaling_identifiers:
                     continue
                 dtype = param.symbol.dtype
                 name = param.symbol.name
@@ -761,34 +779,70 @@ def generate_parameter_registration(kernel_infos, parameter_registration):
 
     params_to_skip = []
     result = []
+
+    scaling_identifiers = [scaling[1] for scaling in parameter_registration.scaling_info]
+    function_rules = []
+
     for kernel_info in kernel_infos:
         for param in kernel_info.parameters:
             if not param.is_field_parameter and param.symbol.name not in params_to_skip:
-                if param.symbol.name in parameter_registration.scaling_info:
-                    result.append(standard_parameter_registration.format(dtype=param.symbol.dtype,
-                                                                         name=param.symbol.name))
-                    params_to_skip.append(param.symbol.name)
+                if param.symbol.name in scaling_identifiers:
+                    name = param.symbol.name
+                    dtype = param.symbol.dtype
+
+                    idx = scaling_identifiers.index(param.symbol.name)
+                    lambda_expr = parameter_registration.scaling_info[idx][2]
+                    raw_scaling = convert_to_raw_form(name, lambda_expr)
+                    function_rules.append(raw_scaling.format(dtype=dtype))
+                    params_to_skip.append(name)
+
+    function_rules_baked = "\n    ".join(function_rules)
+    result.append(standard_parameter_registration.format(dtype=dtype, function_rules=function_rules_baked))
+
 
     return "\n".join(result)
 
 
+def convert_to_raw_form(name, expression):
+    expr_str = str(expression)
+
+    # Extract the body of the lambda expression
+    start_idx = expr_str.find(":") + 1
+    body = expr_str[start_idx:].strip()
+
+    # Replace numeric constants with the {dtype}(number) format
+    body = re.sub(r'(?<!\w)(\d+(\.\d*)?|\.\d+)(?!\w)', lambda match: f'{{dtype}}({match.group(1)})', body)
+
+    return f"{name}Vector.push_back({{dtype}}({body}));"
+
+
 def generate_constructor(kernel_infos, parameter_registration):
     if parameter_registration is None:
         return ""
     if not isinstance(kernel_infos, Iterable):
         kernel_infos = [kernel_infos]
 
-    params_to_skip = []
     result = []
+    params_to_skip = []
+
+    scaling_identifiers = [scaling[1] for scaling in parameter_registration.scaling_info]
+    function_rules = []
+
     for kernel_info in kernel_infos:
         for param in kernel_info.parameters:
             if not param.is_field_parameter and param.symbol.name not in params_to_skip:
-                if param.symbol.name in parameter_registration.scaling_info:
+               if param.symbol.name in scaling_identifiers:
                     name = param.symbol.name
                     dtype = param.symbol.dtype
-                    result.append(standard_parameter_registration.format(dtype=dtype, name=name))
-                    params_to_skip.append(name)
 
+                    idx = scaling_identifiers.index(param.symbol.name)
+                    lambda_expr = parameter_registration.scaling_info[idx][2]
+                    raw_scaling = convert_to_raw_form(name, lambda_expr)
+                    function_rules.append(raw_scaling.format(dtype=dtype))
+                    params_to_skip.append(name)
+    
+    function_rules_baked = "\n    ".join(function_rules)
+    result.append(standard_parameter_registration.format(dtype=dtype, function_rules=function_rules_baked))
     return "\n".join(result)
 
 
diff --git a/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.impl.h b/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.impl.h
index 0327b9b52..054b492b9 100644
--- a/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.impl.h
+++ b/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.impl.h
@@ -63,7 +63,7 @@ void BasicRecursiveTimeStepGPU< PdfField_T, SweepCollection_T, BoundaryCollectio
 
    // 2.1 Collision and Ghost-Layer Propagation
    for(auto b: blocks_[level]){
-      ghostLayerPropagation(b);  // GL-Propagation first without swapping arrays...
+      ghostLayerPropagation(b, nullptr);  // GL-Propagation first without swapping arrays...
       sweepCollection_.streamCollide(b);                // then Stream-Collide on interior, and swap arrays
    }
 
diff --git a/src/lbm_generated/refinement/RefinementScaling.h b/src/lbm_generated/refinement/RefinementScaling.h
index abee51e71..2a2e1ff54 100644
--- a/src/lbm_generated/refinement/RefinementScaling.h
+++ b/src/lbm_generated/refinement/RefinementScaling.h
@@ -21,6 +21,7 @@
 #pragma once
 
 #include "core/DataTypes.h"
+#include "core/math/Vector3.h"
 
 namespace walberla::lbm_generated
 {
@@ -28,10 +29,22 @@ namespace walberla::lbm_generated
 inline real_t relaxationRateScaling( real_t relaxationRate, uint_t refinementLevel )
 {
    const real_t levelScaleFactor = real_c(uint_c(1) << refinementLevel);
-   const real_t one                = real_c(1.0);
-   const real_t half               = real_c(0.5);
+   const real_t one              = real_c(1.0);
+   const real_t half             = real_c(0.5);
 
    return real_c(relaxationRate / (levelScaleFactor * (-relaxationRate * half + one) + relaxationRate * half));
 }
 
+inline Vector3< real_t > levelDependentBodyForce(const Vector3< real_t > & forceDensity, const uint_t refinementLevel)
+{
+   const real_t levelScaleFactor = real_c(uint_c(1) << refinementLevel);
+   return forceDensity * ( real_c(1.0) / levelScaleFactor );
+}
+
+inline real_t levelDependentBodyForce(const real_t & forceDensity, const uint_t refinementLevel)
+{
+   const real_t levelScaleFactor = real_c(uint_c(1) << refinementLevel);
+   return forceDensity * ( real_c(1.0) / levelScaleFactor );
+}
+
 } // namespace walberla::lbm_generated
\ No newline at end of file
-- 
GitLab