Compare revisions

bc7d45bd · bc7d45bd · bc7d45bd · bc7d45bd · bc7d45bd · bc7d45bd
--- a/apps/benchmarks/PhaseFieldAllenCahn/CMakeLists.txt
+++ b/apps/benchmarks/PhaseFieldAllenCahn/CMakeLists.txt
@@ -15,10 +15,10 @@ waLBerla_generate_target_from_python(NAME BenchmarkPhaseFieldCodeGen
 if (WALBERLA_BUILD_WITH_GPU_SUPPORT )
    waLBerla_add_executable(NAME benchmark_multiphase
            FILES benchmark_multiphase.cpp InitializerFunctions.cpp multiphase_codegen.py
-            DEPENDS blockforest core gpu field postprocessing python_coupling lbm geometry timeloop gui BenchmarkPhaseFieldCodeGen)
+            DEPENDS blockforest core gpu field postprocessing python_coupling lbm_generated geometry timeloop gui BenchmarkPhaseFieldCodeGen)
 else ()
    waLBerla_add_executable(NAME benchmark_multiphase
            FILES benchmark_multiphase.cpp InitializerFunctions.cpp multiphase_codegen.py
-            DEPENDS blockforest core field postprocessing python_coupling lbm geometry timeloop gui BenchmarkPhaseFieldCodeGen)
+            DEPENDS blockforest core field postprocessing python_coupling lbm_generated geometry timeloop gui BenchmarkPhaseFieldCodeGen)
 endif (WALBERLA_BUILD_WITH_GPU_SUPPORT )
--- a/apps/benchmarks/PhaseFieldAllenCahn/benchmark.py
+++ b/apps/benchmarks/PhaseFieldAllenCahn/benchmark.py
@@ -8,6 +8,11 @@ from waLBerla.tools.config import block_decomposition
 import sys
 from math import prod
+try:
+    import machinestate as ms
+except ImportError:
+    ms = None
 def domain_block_size_ok(block_size, total_mem, gls=1, q_phase=15, q_hydro=27, size_per_value=8):
    """Checks if a single block of given size fits into GPU memory"""
@@ -20,7 +25,9 @@ def domain_block_size_ok(block_size, total_mem, gls=1, q_phase=15, q_hydro=27, s
 class Scenario:
-    def __init__(self, time_step_strategy, cuda_block_size, cells_per_block=(256, 256, 256),
+    def __init__(self, time_step_strategy,
+                 cuda_block_size,
+                 cells_per_block=(256, 256, 256),
                 cuda_enabled_mpi=False):
        # output frequencies
        self.vtkWriteFrequency = 0
@@ -89,6 +96,14 @@ class Scenario:
        data['compile_flags'] = wlb.build_info.compiler_flags
        data['walberla_version'] = wlb.build_info.version
        data['build_machine'] = wlb.build_info.build_machine
+        if ms:
+            state = ms.MachineState(extended=False, anonymous=True)
+            state.generate()                        # generate subclasses
+            state.update()                          # read information
+            data["MachineState"] = str(state.get())
+        else:
+            print("MachineState module is not available. MachineState was not saved")
        sequenceValuesToScalars(data)
        df = pd.DataFrame.from_records([data])
@@ -101,43 +116,19 @@ class Scenario:
 def benchmark():
    scenarios = wlb.ScenarioManager()
-    gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 8))
+    gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 40))
    gpu_mem = gpu_mem_gb * (2 ** 30)
-    block_size = (256, 256, 256)
+    block_size = (320, 320, 320)
+    cuda_enabled_mpi = True
    if not domain_block_size_ok(block_size, gpu_mem):
        wlb.log_info_on_root(f"Block size {block_size} would exceed GPU memory. Skipping.")
    else:
-        scenarios.add(Scenario(time_step_strategy='normal', cuda_block_size=(256, 1, 1), cells_per_block=block_size))
+        scenarios.add(Scenario(time_step_strategy='normal',
+                               cuda_block_size=(128, 1, 1),
+                               cells_per_block=block_size,
+                               cuda_enabled_mpi=cuda_enabled_mpi))
-def kernel_benchmark():
-    scenarios = wlb.ScenarioManager()
-    gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 8))
-    gpu_mem = gpu_mem_gb * (2 ** 30)
-    block_sizes = [(i, i, i) for i in (32, 64, 128, 256, 320, 384, 448, 512)]
+benchmark()
-    cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1),
-                   (32, 2, 1), (64, 2, 1), (128, 2, 1),
-                   (32, 4, 1), (64, 4, 1),
-                   (32, 4, 2),
-                   (32, 8, 1),
-                   (16, 16, 1)]
-    for time_step_strategy in ['phase_only', 'hydro_only', 'kernel_only', 'normal']:
-        for cuda_block in cuda_blocks:
-            for block_size in block_sizes:
-                if not domain_block_size_ok(block_size, gpu_mem):
-                    wlb.log_info_on_root(f"Block size {block_size} would exceed GPU memory. Skipping.")
-                    continue
-                scenario = Scenario(time_step_strategy=time_step_strategy,
-                                    cuda_block_size=cuda_block,
-                                    cells_per_block=block_size)
-                scenarios.add(scenario)
-# benchmark()
-kernel_benchmark()
--- a/apps/benchmarks/PhaseFieldAllenCahn/benchmark_multiphase.cpp
+++ b/apps/benchmarks/PhaseFieldAllenCahn/benchmark_multiphase.cpp
@@ -29,6 +29,7 @@
 #include "field/vtk/VTKWriter.h"
 #include "geometry/InitBoundaryHandling.h"
+#include "lbm_generated/evaluation/PerformanceEvaluation.h"
 #include "python_coupling/CreateConfig.h"
 #include "python_coupling/DictWrapper.h"
@@ -78,14 +79,10 @@ int main(int argc, char** argv)
      logging::configureLogging(config);
      shared_ptr< StructuredBlockForest > blocks = blockforest::createUniformBlockGridFromConfig(config);
-      Vector3< uint_t > cellsPerBlock =
-         config->getBlock("DomainSetup").getParameter< Vector3< uint_t > >("cellsPerBlock");
      // Reading parameters
      auto parameters                    = config->getOneBlock("Parameters");
      const std::string timeStepStrategy = parameters.getParameter< std::string >("timeStepStrategy", "normal");
      const uint_t timesteps             = parameters.getParameter< uint_t >("timesteps", uint_c(50));
-      const real_t remainingTimeLoggerFrequency =
-         parameters.getParameter< real_t >("remainingTimeLoggerFrequency", real_c(3.0));
      const uint_t scenario = parameters.getParameter< uint_t >("scenario", uint_c(1));
      const uint_t warmupSteps  = parameters.getParameter< uint_t >("warmupSteps", uint_t(2));
@@ -102,6 +99,7 @@ int main(int argc, char** argv)
         gpu::addGPUFieldToStorage< VelocityField_T >(blocks, vel_field, "velocity field on GPU", true);
      BlockDataID phase_field_gpu =
         gpu::addGPUFieldToStorage< PhaseField_T >(blocks, phase_field, "phase field on GPU", true);
+      BlockDataID phase_field_tmp = gpu::addGPUFieldToStorage< PhaseField_T >(blocks, phase_field, "temporary phasefield", true);
 #else
      BlockDataID lb_phase_field =
         field::addToStorage< PdfField_phase_T >(blocks, "lb phase field", real_c(0.0), field::fzyx);
@@ -109,6 +107,7 @@ int main(int argc, char** argv)
         field::addToStorage< PdfField_hydro_T >(blocks, "lb velocity field", real_c(0.0), field::fzyx);
      BlockDataID vel_field   = field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx);
      BlockDataID phase_field = field::addToStorage< PhaseField_T >(blocks, "phase", real_c(0.0), field::fzyx);
+      BlockDataID phase_field_tmp = field::addToStorage< PhaseField_T >(blocks, "phase tmp", real_c(0.0), field::fzyx);
 #endif
      if (timeStepStrategy != "phase_only" && timeStepStrategy != "hydro_only" && timeStepStrategy != "kernel_only")
@@ -139,47 +138,80 @@ int main(int argc, char** argv)
      pystencils::initialize_velocity_based_distributions init_g(lb_velocity_field_gpu, vel_field_gpu);
      pystencils::phase_field_LB_step phase_field_LB_step(
-         lb_phase_field_gpu, phase_field_gpu, vel_field_gpu, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2]);
+         lb_phase_field_gpu, phase_field_gpu, phase_field_tmp, vel_field_gpu, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2]);
      pystencils::hydro_LB_step hydro_LB_step(lb_velocity_field_gpu, phase_field_gpu, vel_field_gpu, gpuBlockSize[0],
                                              gpuBlockSize[1], gpuBlockSize[2]);
 #else
      pystencils::initialize_phase_field_distributions init_h(lb_phase_field, phase_field, vel_field);
      pystencils::initialize_velocity_based_distributions init_g(lb_velocity_field, vel_field);
-      pystencils::phase_field_LB_step phase_field_LB_step(lb_phase_field, phase_field, vel_field);
+      pystencils::phase_field_LB_step phase_field_LB_step(lb_phase_field, phase_field, phase_field_tmp, vel_field);
      pystencils::hydro_LB_step hydro_LB_step(lb_velocity_field, phase_field, vel_field);
 #endif
 // add communication
 #if defined(WALBERLA_BUILD_WITH_CUDA)
-      const bool cudaEnabledMpi = parameters.getParameter< bool >("cudaEnabledMpi", false);
+      const bool gpuEnabledMpi = parameters.getParameter< bool >("cudaEnabledMpi", false);
-      auto Comm_velocity_based_distributions =
+      const int streamLowPriority  = 0;
-         make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi);
+      const int streamHighPriority = 0;
-      auto generatedPackInfo_velocity_based_distributions =
+      auto defaultStream     = gpu::StreamRAII::newPriorityStream(streamLowPriority);
-         make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field_gpu);
+      auto innerOuterStreams = gpu::ParallelStreams(streamHighPriority);
-      Comm_velocity_based_distributions->addPackInfo(generatedPackInfo_velocity_based_distributions);
+      auto generatedPackInfo_phase_field_distributions = make_shared< lbm::PackInfo_phase_field_distributions>(lb_phase_field_gpu);
+      auto generatedPackInfo_velocity_based_distributions = make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field_gpu);
      auto generatedPackInfo_phase_field = make_shared< pystencils::PackInfo_phase_field >(phase_field_gpu);
-      Comm_velocity_based_distributions->addPackInfo(generatedPackInfo_phase_field);
-      auto Comm_phase_field_distributions =
+      auto UniformGPUSchemeVelocityBasedDistributions = make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, gpuEnabledMpi, false);
-         make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi);
+      auto UniformGPUSchemePhaseFieldDistributions = make_shared< gpu::communication::UniformGPUScheme< Full_Stencil_T > >(blocks, gpuEnabledMpi, false);
-      auto generatedPackInfo_phase_field_distributions =
+      auto UniformGPUSchemePhaseField = make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, gpuEnabledMpi, false, 65432);
-         make_shared< lbm::PackInfo_phase_field_distributions >(lb_phase_field_gpu);
-      Comm_phase_field_distributions->addPackInfo(generatedPackInfo_phase_field_distributions);
+      UniformGPUSchemeVelocityBasedDistributions->addPackInfo(generatedPackInfo_velocity_based_distributions);
-#else
+      UniformGPUSchemePhaseFieldDistributions->addPackInfo(generatedPackInfo_phase_field_distributions);
+      UniformGPUSchemePhaseField->addPackInfo(generatedPackInfo_phase_field);
+      auto Comm_velocity_based_distributions_start = std::function< void() >([&]() { UniformGPUSchemeVelocityBasedDistributions->startCommunication(); });
+      auto Comm_velocity_based_distributions_wait = std::function< void() >([&]() { UniformGPUSchemeVelocityBasedDistributions->wait(); });
-      blockforest::communication::UniformBufferedScheme< Stencil_hydro_T > Comm_velocity_based_distributions(blocks);
+      auto Comm_phase_field_distributions_start = std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->startCommunication(); });
+      auto Comm_phase_field_distributions_wait = std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->wait(); });
+      auto Comm_phase_field = std::function< void() >([&]() { UniformGPUSchemePhaseField->communicate(); });
+      auto swapPhaseField = std::function< void(IBlock *) >([&](IBlock * b)
+        {
+           auto phaseField    = b->getData< gpu::GPUField<real_t> >(phase_field_gpu);
+           auto phaseFieldTMP = b->getData< gpu::GPUField<real_t> >(phase_field_tmp);
+           phaseField->swapDataPointers(phaseFieldTMP);
+        });
+#else
+      auto generatedPackInfo_phase_field_distributions = make_shared< lbm::PackInfo_phase_field_distributions>(lb_phase_field);
+      auto generatedPackInfo_velocity_based_distributions = make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field);
      auto generatedPackInfo_phase_field = make_shared< pystencils::PackInfo_phase_field >(phase_field);
-      auto generatedPackInfo_velocity_based_distributions =
-         make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field);
-      Comm_velocity_based_distributions.addPackInfo(generatedPackInfo_phase_field);
+      auto UniformGPUSchemeVelocityBasedDistributions = make_shared< blockforest::communication::UniformBufferedScheme< Full_Stencil_T > >(blocks);
-      Comm_velocity_based_distributions.addPackInfo(generatedPackInfo_velocity_based_distributions);
+      auto UniformGPUSchemePhaseFieldDistributions = make_shared< blockforest::communication::UniformBufferedScheme< Full_Stencil_T > >(blocks);
+      auto UniformGPUSchemePhaseField = make_shared< blockforest::communication::UniformBufferedScheme< Full_Stencil_T > >(blocks, 65432);
+      UniformGPUSchemeVelocityBasedDistributions->addPackInfo(generatedPackInfo_velocity_based_distributions);
+      UniformGPUSchemePhaseFieldDistributions->addPackInfo(generatedPackInfo_phase_field_distributions);
+      UniformGPUSchemePhaseField->addPackInfo(generatedPackInfo_phase_field);
-      blockforest::communication::UniformBufferedScheme< Stencil_hydro_T > Comm_phase_field_distributions(blocks);
+      auto Comm_velocity_based_distributions_start = std::function< void() >([&]() { UniformGPUSchemeVelocityBasedDistributions->startCommunication(); });
-      auto generatedPackInfo_phase_field_distributions =
+      auto Comm_velocity_based_distributions_wait = std::function< void() >([&]() { UniformGPUSchemeVelocityBasedDistributions->wait(); });
-         make_shared< lbm::PackInfo_phase_field_distributions >(lb_phase_field);
-      Comm_phase_field_distributions.addPackInfo(generatedPackInfo_phase_field_distributions);
+      auto Comm_phase_field_distributions = std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->communicate(); });
+      auto Comm_phase_field_distributions_start = std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->startCommunication(); });
+      auto Comm_phase_field_distributions_wait = std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->wait(); });
+      auto Comm_phase_field = std::function< void() >([&]() { UniformGPUSchemePhaseField->communicate(); });
+      auto swapPhaseField = std::function< void(IBlock *) >([&](IBlock * b)
+        {
+           auto phaseField    = b->getData< PhaseField_T >(phase_field);
+           auto phaseFieldTMP = b->getData< PhaseField_T >(phase_field_tmp);
+           phaseField->swapDataPointers(phaseFieldTMP);
+        });
 #endif
      BlockDataID const flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field");
@@ -201,99 +233,37 @@ int main(int argc, char** argv)
            init_h(&block);
            init_g(&block);
         }
+         WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+         WALBERLA_GPU_CHECK(gpuPeekAtLastError())
+         WALBERLA_MPI_BARRIER()
         WALBERLA_LOG_INFO_ON_ROOT("initialization of the distributions done")
      }
+      SweepTimeloop timeloop(blocks->getBlockStorage(), timesteps);
 #if defined(WALBERLA_BUILD_WITH_CUDA)
-      int const streamLowPriority  = 0;
+      timeloop.add() << BeforeFunction(Comm_velocity_based_distributions_start, "Start Hydro PDFs Communication")
-      int const streamHighPriority = 0;
+                     << Sweep(phase_field_LB_step.getSweep(defaultStream), "Phase LB Step")
-      auto defaultStream     = gpu::StreamRAII::newPriorityStream(streamLowPriority);
+                     << AfterFunction(Comm_velocity_based_distributions_wait, "Wait Hydro PDFs Communication");
-      auto innerOuterStreams = gpu::ParallelStreams(streamHighPriority);
-#endif
-      auto timeLoop = make_shared< SweepTimeloop >(blocks->getBlockStorage(), timesteps);
+      timeloop.add() << BeforeFunction(Comm_phase_field_distributions_start, "Start Phase PDFs Communication")
-#if defined(WALBERLA_BUILD_WITH_CUDA)
+                     << Sweep(hydro_LB_step.getSweep(defaultStream), "Hydro LB Step");
-      auto normalTimeStep = [&]() {
+      timeloop.add() << Sweep(swapPhaseField, "Swap PhaseField")
-         Comm_velocity_based_distributions->startCommunication();
+                     << AfterFunction(Comm_phase_field_distributions_wait, "Wait Phase PDFs Communication");
-         for (auto& block : *blocks)
-            phase_field_LB_step(&block, defaultStream);
+      timeloop.addFuncAfterTimeStep(Comm_phase_field, "Communication Phase field");
-         Comm_velocity_based_distributions->wait();
-         Comm_phase_field_distributions->startCommunication();
-         for (auto& block : *blocks)
-            hydro_LB_step(&block, defaultStream);
-         Comm_phase_field_distributions->wait();
-      };
-      auto phase_only = [&]() {
-         for (auto& block : *blocks)
-            phase_field_LB_step(&block);
-      };
-      auto hydro_only = [&]() {
-         for (auto& block : *blocks)
-            hydro_LB_step(&block);
-      };
-      auto without_comm = [&]() {
-         for (auto& block : *blocks)
-            phase_field_LB_step(&block);
-         for (auto& block : *blocks)
-            hydro_LB_step(&block);
-      };
 #else
-      auto normalTimeStep = [&]() {
+      timeloop.add() << BeforeFunction(Comm_velocity_based_distributions_start, "Start Hydro PDFs Communication")
-            Comm_velocity_based_distributions.startCommunication();
+                     << Sweep(phase_field_LB_step.getSweep(), "Phase LB Step")
-            for (auto& block : *blocks)
+                     << AfterFunction(Comm_velocity_based_distributions_wait, "Wait Hydro PDFs Communication");
-               phase_field_LB_step(&block);
-            Comm_velocity_based_distributions.wait();
-            Comm_phase_field_distributions.startCommunication();
-            for (auto& block : *blocks)
-               hydro_LB_step(&block);
-            Comm_phase_field_distributions.wait();
-      };
-      auto phase_only = [&]() {
-         for (auto& block : *blocks)
-            phase_field_LB_step(&block);
-      };
-      auto hydro_only = [&]() {
-         for (auto& block : *blocks)
-            hydro_LB_step(&block);
-      };
-      auto without_comm = [&]() {
-         for (auto& block : *blocks)
-            phase_field_LB_step(&block);
-         for (auto& block : *blocks)
-            hydro_LB_step(&block);
-      };
-#endif
-      std::function< void() > timeStep;
-      if (timeStepStrategy == "phase_only")
-      {
-         timeStep = std::function< void() >(phase_only);
-         WALBERLA_LOG_INFO_ON_ROOT("started only phasefield step without communication for benchmarking")
-      }
-      else if (timeStepStrategy == "hydro_only")
-      {
-         timeStep = std::function< void() >(hydro_only);
-         WALBERLA_LOG_INFO_ON_ROOT("started only hydro step without communication for benchmarking")
-      }
-      else if (timeStepStrategy == "kernel_only")
-      {
-         timeStep = std::function< void() >(without_comm);
-         WALBERLA_LOG_INFO_ON_ROOT("started complete phasefield model without communication for benchmarking")
-      }
-      else
-      {
-         timeStep = std::function< void() >(normalTimeStep);
-         WALBERLA_LOG_INFO_ON_ROOT("normal timestep with overlapping")
-      }
-      timeLoop->add() << BeforeFunction(timeStep) << Sweep([](IBlock*) {}, "time step");
+      timeloop.add() << BeforeFunction(Comm_phase_field_distributions_start, "Start Phase PDFs Communication")
+                     << Sweep(hydro_LB_step.getSweep(), "Hydro LB Step");
+      timeloop.add() << Sweep(swapPhaseField, "Swap PhaseField")
+                     << AfterFunction(Comm_phase_field_distributions_wait, "Wait Phase PDFs Communication");
-      // remaining time logger
+      timeloop.addFuncAfterTimeStep(Comm_phase_field, "Communication Phase field");
-      if (remainingTimeLoggerFrequency > 0)
+#endif
-         timeLoop->addFuncAfterTimeStep(
-            timing::RemainingTimeLogger(timeLoop->getNrOfTimeSteps(), remainingTimeLoggerFrequency),
-            "remaining time logger");
      uint_t const vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0);
      if (vtkWriteFrequency > 1)
@@ -307,40 +277,60 @@ int main(int argc, char** argv)
         auto phaseWriter = make_shared< field::VTKWriter< PhaseField_T > >(phase_field, "phase");
         vtkOutput->addCellDataWriter(phaseWriter);
-         timeLoop->addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
+         timeloop.addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
      }
+      lbm_generated::PerformanceEvaluation< FlagField_T > const performance(blocks, flagFieldID, fluidFlagUID);
+      field::CellCounter< FlagField_T > fluidCells(blocks, flagFieldID, fluidFlagUID);
+      fluidCells();
+      WALBERLA_LOG_INFO_ON_ROOT("Multiphase benchmark with " << fluidCells.numberOfCells() << " fluid cells")
+      WALBERLA_LOG_INFO_ON_ROOT("Running " << warmupSteps << " timesteps to warm up the system")
      for (uint_t i = 0; i < warmupSteps; ++i)
-         timeLoop->singleStep();
+         timeloop.singleStep();
+      WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+      WALBERLA_GPU_CHECK(gpuPeekAtLastError())
+      WALBERLA_MPI_BARRIER()
+      WALBERLA_LOG_INFO_ON_ROOT("Warmup timesteps done")
-      timeLoop->setCurrentTimeStepToZero();
+      timeloop.setCurrentTimeStepToZero();
+      WALBERLA_MPI_BARRIER()
      WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps")
+      WcTimingPool timeloopTiming;
      WcTimer simTimer;
 #if defined(WALBERLA_BUILD_WITH_CUDA)
-      cudaDeviceSynchronize();
+      WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
 #endif
      simTimer.start();
-      timeLoop->run();
+      timeloop.run(timeloopTiming);
 #if defined(WALBERLA_BUILD_WITH_CUDA)
-      cudaDeviceSynchronize();
+      WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+      WALBERLA_GPU_CHECK(gpuPeekAtLastError())
 #endif
+      WALBERLA_MPI_BARRIER()
      simTimer.end();
      WALBERLA_LOG_INFO_ON_ROOT("Simulation finished")
-      auto time            = real_c(simTimer.last());
+      double time = simTimer.max();
-      auto nrOfCells       = real_c(cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2]);
+      WALBERLA_MPI_SECTION() { walberla::mpi::reduceInplace(time, walberla::mpi::MAX); }
-      auto mlupsPerProcess = nrOfCells * real_c(timesteps) / time * 1e-6;
+      performance.logResultOnRoot(timesteps, time);
-      WALBERLA_LOG_RESULT_ON_ROOT("MLUPS per process: " << mlupsPerProcess)
+      const auto reducedTimeloopTiming = timeloopTiming.getReduced();
+      WALBERLA_LOG_RESULT_ON_ROOT("Time loop timing:\n" << *reducedTimeloopTiming)
+      WALBERLA_LOG_RESULT_ON_ROOT("MLUPS per process: " << performance.mlupsPerProcess(timesteps, time))
      WALBERLA_LOG_RESULT_ON_ROOT("Time per time step: " << time / real_c(timesteps) << " s")
      WALBERLA_ROOT_SECTION()
      {
         python_coupling::PythonCallback pythonCallbackResults("results_callback");
         if (pythonCallbackResults.isCallable())
         {
-            pythonCallbackResults.data().exposeValue("mlupsPerProcess", mlupsPerProcess);
+            pythonCallbackResults.data().exposeValue("mlupsPerProcess", performance.mlupsPerProcess(timesteps, time));
            pythonCallbackResults.data().exposeValue("stencil_phase", StencilNamePhase);
            pythonCallbackResults.data().exposeValue("stencil_hydro", StencilNameHydro);
            #if defined(WALBERLA_BUILD_WITH_CUDA)
-               pythonCallbackResults.data().exposeValue("cuda_enabled_mpi", cudaEnabledMpi);
+               pythonCallbackResults.data().exposeValue("cuda_enabled_mpi", gpuEnabledMpi);
            #endif
            // Call Python function to report results
            pythonCallbackResults();

--- a/apps/benchmarks/PhaseFieldAllenCahn/multiphase_codegen.py
+++ b/apps/benchmarks/PhaseFieldAllenCahn/multiphase_codegen.py
@@ -19,7 +19,7 @@ with CodeGeneration() as ctx:
    field_type = "float64" if ctx.double_accuracy else "float32"
    stencil_phase = LBStencil(Stencil.D3Q15)
-    stencil_hydro = LBStencil(Stencil.D3Q27)
+    stencil_hydro = LBStencil(Stencil.D3Q19)
    assert (stencil_phase.D == stencil_hydro.D)
    ########################
@@ -76,13 +76,13 @@ with CodeGeneration() as ctx:
                                 delta_equilibrium=False,
                                 force=sp.symbols(f"F_:{stencil_phase.D}"), velocity_input=u,
                                 weighted=True, relaxation_rates=rates,
-                                 output={'density': C_tmp}, kernel_type='stream_pull_collide')
+                                 output={'density': C_tmp})
    method_phase = create_lb_method(lbm_config=lbm_config_phase)
    lbm_config_hydro = LBMConfig(stencil=stencil_hydro, method=Method.MRT, compressible=False,
                                 weighted=True, relaxation_rate=omega,
                                 force=sp.symbols(f"F_:{stencil_hydro.D}"),
-                                 output={'velocity': u}, kernel_type='collide_stream_push')
+                                 output={'velocity': u})
    method_hydro = create_lb_method(lbm_config=lbm_config_hydro)
    # create the kernels for the initialization of the g and h field
@@ -137,7 +137,8 @@ with CodeGeneration() as ctx:
    sweep_params = {'block_size': sweep_block_size}
    stencil_typedefs = {'Stencil_phase_T': stencil_phase,
-                        'Stencil_hydro_T': stencil_hydro}
+                        'Stencil_hydro_T': stencil_hydro,
+                        'Full_Stencil_T': LBStencil(Stencil.D3Q27)}
    field_typedefs = {'PdfField_phase_T': h,
                      'PdfField_hydro_T': g,
                      'VelocityField_T': u,
@@ -156,7 +157,7 @@ with CodeGeneration() as ctx:
        generate_sweep(ctx, 'initialize_velocity_based_distributions', g_updates, target=Target.CPU)
        generate_sweep(ctx, 'phase_field_LB_step', phase_field_LB_step,
-                       field_swaps=[(h, h_tmp), (C, C_tmp)],
+                       field_swaps=[(h, h_tmp)],
                       inner_outer_split=True,
                       cpu_vectorize_info=cpu_vec,
                       target=Target.CPU)
@@ -172,7 +173,7 @@ with CodeGeneration() as ctx:
                              streaming_pattern='pull', target=Target.CPU)
        generate_lb_pack_info(ctx, 'PackInfo_velocity_based_distributions', stencil_hydro, g,
-                              streaming_pattern='push', target=Target.CPU)
+                              streaming_pattern='pull', target=Target.CPU)
        generate_pack_info_for_field(ctx, 'PackInfo_phase_field', C, target=Target.CPU)
@@ -183,7 +184,7 @@ with CodeGeneration() as ctx:
                       g_updates, target=Target.GPU)
        generate_sweep(ctx, 'phase_field_LB_step', phase_field_LB_step,
-                       field_swaps=[(h, h_tmp), (C, C_tmp)],
+                       field_swaps=[(h, h_tmp)],
                       target=Target.GPU,
                       gpu_indexing_params=sweep_params,
                       varying_parameters=vp)
@@ -198,7 +199,7 @@ with CodeGeneration() as ctx:
                              streaming_pattern='pull', target=Target.GPU)
        generate_lb_pack_info(ctx, 'PackInfo_velocity_based_distributions', stencil_hydro, g,
-                              streaming_pattern='push', target=Target.GPU)
+                              streaming_pattern='pull', target=Target.GPU)
        generate_pack_info_for_field(ctx, 'PackInfo_phase_field', C, target=Target.GPU)

--- a/apps/benchmarks/SchaeferTurek/SchaeferTurek.cpp
+++ b/apps/benchmarks/SchaeferTurek/SchaeferTurek.cpp
@@ -37,6 +37,7 @@
 #include "core/debug/CheckFunctions.h"
 #include "core/debug/Debug.h"
 #include "core/debug/TestSubsystem.h"
+#include "core/logging/Initialization.h"
 #include "core/logging/Logging.h"
 #include "core/math/Constants.h"
 #include "core/math/Sample.h"
@@ -2889,10 +2890,9 @@ int main( int argc, char **argv )
                      "             ('fzyx') data layout!" << std::endl;
      }
      return EXIT_SUCCESS;
-   }   
+   }
   logging::Logging::printHeaderOnStream();
-   //WALBERLA_ROOT_SECTION() { logging::Logging::instance()->setLogLevel( logging::Logging::PROGRESS ); }
 #ifdef _OPENMP
   if( std::getenv( "OMP_NUM_THREADS" ) == nullptr )
@@ -2903,6 +2903,7 @@ int main( int argc, char **argv )
   // open configuration file
   shared_ptr< Config > config = make_shared< Config >();
+   logging::configureLogging(config);
   config->readParameterFile( argv[1] );
   Config::BlockHandle configBlock = config->getBlock( "SchaeferTurek" );

--- a/apps/benchmarks/SchaeferTurek/input.dat
+++ b/apps/benchmarks/SchaeferTurek/input.dat
@@ -29,31 +29,31 @@ SchaeferTurek
   //////////////////////////////
   memoryPerCell      153; // in bytes
-   processMemoryLimit 2048; // in MiB !
+   processMemoryLimit 4048; // in MiB !
-   yzBlocks 1; // blocks in y- and z-direction [2D: only 1 block in z-direction]
+   yzBlocks 4; // blocks in y- and z-direction [2D: only 1 block in z-direction]
   // The number of blocks in x-direction is calculated from: H, L, yzBlocks, xCells, and yzCells.
-   xCells  40; // number of cells in x-direction for each block
+   xCells  64; // number of cells in x-direction for each block
-   yzCells 40; // number of cells in y- and z-direction for each block [2D: only affects y-direction]
+   yzCells 16; // number of cells in y- and z-direction for each block [2D: only affects y-direction]
   H 0.41; // [m]
-   L 20.0; // [m] (2.5 [3D], 2.2 [2D])
+   L 2.5; // [m] (2.5 [3D], 2.2 [2D])
   strictlyObeyL false;//true; // true = outflow will be located exactly at "L", false = outflow might be slightly further away than "L"
   cylinderxPosition    0.5;  // [m] (0.5 [3D], 0.2 [2D])
   cylinderyPosition    0.2;  // [m]
-   cylinderRadius       0.18; // [m]
+   cylinderRadius       0.05; // [m]
   circularCrossSection true; // true = obstacle is a true cylinder, false = obstacle is a box
   kinViscosity   0.001; // [m^2/s]
   rho            1;     // [kg/m^3]
-   inflowVelocity   2; // [m/s] (0.45 [3D], 2.25 [3D], 0.3 [2D], 1.5 [2D])
+   inflowVelocity   0.45; // [m/s] (0.45 [3D], 2.25 [3D], 0.3 [2D], 1.5 [2D])
-   raisingTime    0.3; // [s] (0 == immediatelly full velocity)
+   raisingTime      0; // [s] (0 == immediatelly full velocity)
   sinPeriod        0; // [s] (0 == no sinus term)
-   obstacleBoundary 0; // 0 = staircase, 1 = curved
+   obstacleBoundary 1; // 0 = staircase, 1 = curved
   outletType       1; // 0 = pressure, 1 = outlet (2/1), 2 = outlet (4/3)
   initWithVelocity false; // at the beginning of the simulation everything is set to: ( 4 * inflowVelocity ) / 9 [3D] or ( 2 * inflowVelocity ) / 3 [2D]
@@ -63,7 +63,7 @@ SchaeferTurek
   nbrOfEvaluationPointsForCoefficientExtremas 100;
-   evaluatePressure true;
+   evaluatePressure false;
   pAlpha < 0.45, 0.2, 0.205 >; // points for evaluating
   pOmega < 0.55, 0.2, 0.205 >; // the pressure difference
@@ -74,9 +74,9 @@ SchaeferTurek
   // WHERE TO REFINE ? //
   ///////////////////////
-   useCylinderForRefinement false;
+   useCylinderForRefinement true;
-   cylinderRefinementLevel  2; // if "useCylinderForRefinement" is true, everything around the cylinder is refined to this level
+   cylinderRefinementLevel  0; // if "useCylinderForRefinement" is true, everything around the cylinder is refined to this level
-   cylinderRefinementBuffer 0.05; // [m] - additional space around the cylinder that is also guaranteed to be at level "cylinderRefinementLevel"
+   cylinderRefinementBuffer 0.0; // [m] - additional space around the cylinder that is also guaranteed to be at level "cylinderRefinementLevel"
   AABBRefinementSelection
   {
@@ -149,11 +149,11 @@ SchaeferTurek
   ////////////////////////////
 //minSimulationTime  10; // in [s] - ATTENTION: _IF_ 'minSimulationTime' is specified, 'outerTimeSteps' is adapted accordingly!
-   outerTimeSteps      2; // total number of time steps = outerTimeSteps * innerTimeSteps
+   outerTimeSteps      1; // total number of time steps = outerTimeSteps * innerTimeSteps
-   innerTimeSteps   5000; // For each outer loop, performance data is logged.
+   innerTimeSteps   20001; // For each outer loop, performance data is logged.
-   evaluationCheckFrequency 0; // 0 = disable evaluation
+   evaluationCheckFrequency 100; // 0 = disable evaluation
-   evaluationLogToStream    false;
+   evaluationLogToStream    true;
   evaluationLogToFile      true;
   evaluationFilename       SchaeferTurek.txt;
@@ -175,7 +175,7 @@ SchaeferTurek
      vtkMPIIO           true;
   }
-   remainingTimeLoggerFrequency 5; // in seconds
+   remainingTimeLoggerFrequency 10; // in seconds
   vtkBeforeTimeStep true; // false = at the end of time step
@@ -243,18 +243,20 @@ SchaeferTurek
   checkStrouhalNbrDiscreteDUpperBound  1E6; // upper bound for the Strouhal number (only checked if "evaluateStrouhal" is set to true - "discrete" cylinder diameter used for evaluation)
 }
+Logging
+{
+    logLevel info;  // info progress detail tracing
+}
 VTK
 {
-   /*
   fluid_field
   {
      baseFolder vtk;
    //initialWriteCallsToSkip 55130;
      writeFrequency 304;//102;
-      ghostLayers      1;
+      ghostLayers      0;
    //AABB_filter_0 {
    //   min < 0,   0,    0.2049999 >;
@@ -276,18 +278,16 @@ VTK
      }
      writers {
-       //VelocityFromPDF;
+         VelocityFromPDF;
         VelocityMagnitudeFromPDF;
-       //DensityFromPDF;
+         DensityFromPDF;
       //NonEquPart;
       //PDF;
      }
      incompatibleBlockStates empty;
   }
-   */
-   /*
   flag_field
   {
      baseFolder vtk;
@@ -302,8 +302,7 @@ VTK
      incompatibleBlockStates empty;
   }
-   */
   /*
   domain_decomposition
   {

--- a/apps/benchmarks/UniformGridCPU/CMakeLists.txt
+++ b/apps/benchmarks/UniformGridCPU/CMakeLists.txt
@@ -3,14 +3,16 @@ waLBerla_link_files_to_builddir( "*.py" )
 waLBerla_link_files_to_builddir( "simulation_setup" )
-foreach(streaming_pattern pull push aa esotwist)
+foreach(streaming_pattern pull push aa esotwist esopull esopush)
    foreach(stencil d3q19 d3q27)
-        foreach (collision_setup srt trt w-mrt r-w-mrt cm r-cm k r-k entropic smagorinsky)
+        foreach (collision_setup srt trt mrt mrt-overrelax central central-overrelax cumulant cumulant-overrelax cumulant-K17 entropic smagorinsky qr)
 	    # KBC methods only for D2Q9 and D3Q27 defined
 	    if (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
 		    continue()
-	    endif (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
+        endif (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
+        if (${collision_setup} STREQUAL "cumulant-K17" AND ${stencil} STREQUAL "d3q19")
+            continue()
+        endif (${collision_setup} STREQUAL "cumulant-K17" AND ${stencil} STREQUAL "d3q19")
            set(config ${stencil}_${streaming_pattern}_${collision_setup})
            waLBerla_generate_target_from_python(NAME UniformGridCPUGenerated_${config}
                    FILE UniformGridCPU.py

--- a/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp
+++ b/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp
@@ -235,12 +235,16 @@ int main(int argc, char** argv)
                  pythonCallbackResults.data().exposeValue("numProcesses", performance.processes());
                  pythonCallbackResults.data().exposeValue("numThreads", performance.threads());
                  pythonCallbackResults.data().exposeValue("numCores", performance.cores());
+                  pythonCallbackResults.data().exposeValue("numberOfCells", performance.numberOfCells());
+                  pythonCallbackResults.data().exposeValue("numberOfFluidCells", performance.numberOfFluidCells());
                  pythonCallbackResults.data().exposeValue("mlups", performance.mlups(timesteps, time));
                  pythonCallbackResults.data().exposeValue("mlupsPerCore", performance.mlupsPerCore(timesteps, time));
                  pythonCallbackResults.data().exposeValue("mlupsPerProcess", performance.mlupsPerProcess(timesteps, time));
                  pythonCallbackResults.data().exposeValue("stencil", infoStencil);
                  pythonCallbackResults.data().exposeValue("streamingPattern", infoStreamingPattern);
                  pythonCallbackResults.data().exposeValue("collisionSetup", infoCollisionSetup);
+                  pythonCallbackResults.data().exposeValue("vectorised", vectorised);
+                  pythonCallbackResults.data().exposeValue("nontemporal", nontemporal);
                  pythonCallbackResults.data().exposeValue("cse_global", infoCseGlobal);
                  pythonCallbackResults.data().exposeValue("cse_pdfs", infoCsePdfs);
                  // Call Python function to report results

--- a/apps/benchmarks/UniformGridCPU/UniformGridCPU.py
+++ b/apps/benchmarks/UniformGridCPU/UniformGridCPU.py
@@ -3,17 +3,12 @@ from dataclasses import replace
 import sympy as sp
 import pystencils as ps
-from pystencils.simp.subexpression_insertion import insert_zeros, insert_aliases, insert_constants,\
-    insert_symbol_times_minus_one
 from lbmpy.advanced_streaming import is_inplace
-from lbmpy.advanced_streaming.utility import streaming_patterns, get_accessor, Timestep
+from lbmpy.advanced_streaming.utility import streaming_patterns
 from lbmpy.boundaries import NoSlip, UBB
 from lbmpy.creationfunctions import LBMConfig, LBMOptimisation, LBStencil, create_lb_collision_rule
-from lbmpy.enums import Method, Stencil
+from lbmpy.enums import Method, Stencil, SubgridScaleModel
-from lbmpy.fieldaccess import CollideOnlyInplaceAccessor
 from lbmpy.moments import get_default_moment_set_for_stencil
-from lbmpy.updatekernels import create_stream_only_kernel
 from pystencils_walberla import CodeGeneration, generate_info_header, generate_sweep
 from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator
@@ -21,48 +16,53 @@ from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator
 omega = sp.symbols('omega')
 omega_free = sp.Symbol('omega_free')
-# best configs in terms of FLOPS
 options_dict = {
    'srt': {
        'method': Method.SRT,
        'relaxation_rate': omega,
-        'compressible': True,
+        'compressible': False,
    },
    'trt': {
        'method': Method.TRT,
        'relaxation_rate': omega,
-        'compressible': True,
+        'compressible': False,
    },
-    'r-w-mrt': {
+    'mrt': {
        'method': Method.MRT,
        'relaxation_rates': [omega, 1, 1, 1, 1, 1, 1],
-        'compressible': True,
+        'compressible': False,
    },
-    'w-mrt': {
+    'mrt-overrelax': {
        'method': Method.MRT,
        'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)],
-        'compressible': True,
+        'compressible': False,
    },
-    'r-cm': {
+    'central': {
        'method': Method.CENTRAL_MOMENT,
        'relaxation_rate': omega,
        'compressible': True,
    },
-    'cm': {
+    'central-overrelax': {
        'method': Method.CENTRAL_MOMENT,
        'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)],
        'compressible': True,
    },
-    'r-k': {
+    'cumulant': {
-        'method': Method.CUMULANT,
+        'method': Method.MONOMIAL_CUMULANT,
        'relaxation_rate': omega,
        'compressible': True,
    },
-    'k': {
+    'cumulant-overrelax': {
-        'method': Method.CUMULANT,
+        'method': Method.MONOMIAL_CUMULANT,
        'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 18)],
        'compressible': True,
    },
+    'cumulant-K17': {
+        'method': Method.CUMULANT,
+        'relaxation_rate': omega,
+        'compressible': True,
+        'fourth_order_correction': 0.01
+    },
    'entropic': {
        'method': Method.TRT_KBC_N4,
        'compressible': True,
@@ -73,7 +73,12 @@ options_dict = {
    },
    'smagorinsky': {
        'method': Method.SRT,
-        'smagorinsky': False,
+        'subgrid_scale_model': SubgridScaleModel.SMAGORINSKY,
+        'relaxation_rate': omega,
+    },
+    'qr': {
+        'method': Method.SRT,
+        'subgrid_scale_model': SubgridScaleModel.QR,
        'relaxation_rate': omega,
    }
 }
@@ -83,6 +88,8 @@ info_header = """
 const char * infoStencil = "{stencil}";
 const char * infoStreamingPattern = "{streaming_pattern}";
 const char * infoCollisionSetup = "{collision_setup}";
+const bool vectorised = {vec};
+const bool nontemporal = {nt_stores};
 const bool infoCseGlobal = {cse_global};
 const bool infoCsePdfs = {cse_pdfs};
 """
@@ -90,10 +97,15 @@ const bool infoCsePdfs = {cse_pdfs};
 with CodeGeneration() as ctx:
    openmp = True if ctx.openmp else False
    field_type = "float64" if ctx.double_accuracy else "float32"
-    if ctx.optimize_for_localhost:
+    # This base pointer specification causes introduces temporary pointers in the outer loop such that the inner loop
-        cpu_vec = {"nontemporal": False, "assume_aligned": True}
+    # only contains aligned memory addresses. Doing so NT Stores are much more effective which causes great perfomance
-    else:
+    # gains especially for the pull scheme on skylake architectures
-        cpu_vec = None
+    base_pointer_spec = None  # [['spatialInner0'], ['spatialInner1']]
+    # cpu_vec = {"instruction_set": "best", "nontemporal": False,
+    #            "assume_aligned": True, 'assume_sufficient_line_padding': True}
+    cpu_vec = {"instruction_set": None}
+    nt_stores = False
    config_tokens = ctx.config.split('_')
@@ -110,7 +122,6 @@ with CodeGeneration() as ctx:
        raise ValueError("Only D3Q27 and D3Q19 stencil are supported at the moment")
    assert streaming_pattern in streaming_patterns, f"Invalid streaming pattern: {streaming_pattern}"
    options = options_dict[collision_setup]
    assert stencil.D == 3, "This application supports only three-dimensional stencils"
@@ -121,23 +132,20 @@ with CodeGeneration() as ctx:
    lbm_config = LBMConfig(stencil=stencil, field_name=pdfs.name, streaming_pattern=streaming_pattern, **options)
    lbm_opt = LBMOptimisation(cse_global=True, cse_pdfs=False, symbolic_field=pdfs, field_layout='fzyx')
+    # This creates a simplified version of the central moment collision operator where the bulk and shear viscosity is
+    # not seperated. This is done to get a fair comparison with the monomial cumulants.
    if lbm_config.method == Method.CENTRAL_MOMENT:
        lbm_config = replace(lbm_config, nested_moments=get_default_moment_set_for_stencil(stencil))
    if not is_inplace(streaming_pattern):
        lbm_opt = replace(lbm_opt, symbolic_temporary_field=pdfs_tmp)
-        field_swaps = [(pdfs, pdfs_tmp)]
-    else:
-        field_swaps = []
-    # Sweep for Stream only. This is for benchmarking an empty streaming pattern without LBM.
+    # This is a microbenchmark for testing how fast Q PDFs can be updated per cell. To avoid optimisations from
-    # is_inplace is set to False to ensure that the streaming is done with src and dst field.
+    # the compiler the PDFs are shuffled inside a cell. Otherwise, for common streaming patterns compilers would
-    # If this is not the case the compiler might simplify the streaming in a way that benchmarking makes no sense.
+    # typically remove the copy of the center PDF which results in an overestimation of the maximum performance
-    # accessor = CollideOnlyInplaceAccessor()
+    stream_only_kernel = []
-    accessor = get_accessor(streaming_pattern, Timestep.EVEN)
+    for i in range(stencil.Q):
-    #accessor.is_inplace = False
+        stream_only_kernel.append(ps.Assignment(pdfs(i), pdfs((i + 3) % stencil.Q)))
-    field_swaps_stream_only = () if accessor.is_inplace else [(pdfs, pdfs_tmp)]
-    stream_only_kernel = create_stream_only_kernel(stencil, pdfs, None if accessor.is_inplace else pdfs_tmp, accessor=accessor)
    # LB Sweep
    collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt)
@@ -152,17 +160,26 @@ with CodeGeneration() as ctx:
                         lbm_config=lbm_config, lbm_optimisation=lbm_opt,
                         nonuniform=False, boundaries=[no_slip, ubb],
                         macroscopic_fields=macroscopic_fields,
-                         cpu_openmp=openmp, cpu_vectorize_info=cpu_vec)
+                         cpu_openmp=openmp, cpu_vectorize_info=cpu_vec,
+                         base_pointer_specification=base_pointer_spec)
    # Stream only kernel
+    cpu_vec_stream = None
+    if ctx.optimize_for_localhost:
+        cpu_vec_stream = {"instruction_set": "best", "nontemporal": True,
+                          "assume_aligned": True, 'assume_sufficient_line_padding': True,
+                          "assume_inner_stride_one": True}
    generate_sweep(ctx, 'UniformGridCPU_StreamOnlyKernel', stream_only_kernel,
-                   field_swaps=field_swaps_stream_only,
+                   target=ps.Target.CPU, cpu_openmp=openmp,
-                   target=ps.Target.CPU, cpu_openmp=openmp)
+                   cpu_vectorize_info=cpu_vec_stream, base_pointer_specification=[['spatialInner0'], ['spatialInner1']])
    infoHeaderParams = {
        'stencil': stencil_str,
        'streaming_pattern': streaming_pattern,
        'collision_setup': collision_setup,
+        'vec': int(True if cpu_vec else False),
+        'nt_stores': int(nt_stores),
        'cse_global': int(lbm_opt.cse_global),
        'cse_pdfs': int(lbm_opt.cse_pdfs),
    }

--- a/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs.py
+++ b/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs.py
@@ -4,13 +4,26 @@ from waLBerla.tools.config import block_decomposition
 from waLBerla.tools.sqlitedb import sequenceValuesToScalars, checkAndUpdateSchema, storeSingle
 import sys
 import sqlite3
-from math import prod
+try:
+    import machinestate as ms
+except ImportError:
+    ms = None
 # Number of time steps run for a workload of 128^3 per process
 # if double as many cells are on the process, half as many time steps are run etc.
 # increase this to get more reliable measurements
 TIME_STEPS_FOR_128_BLOCK = 10
 DB_FILE = os.environ.get('DB_FILE', "cpu_benchmark.sqlite3")
+BENCHMARK = int(os.environ.get('BENCHMARK', 0))
+WeakX = int(os.environ.get('WeakX', 128))
+WeakY = int(os.environ.get('WeakY', 128))
+WeakZ = int(os.environ.get('WeakZ', 128))
+StrongX = int(os.environ.get('StrongX', 128))
+StrongY = int(os.environ.get('StrongY', 128))
+StrongZ = int(os.environ.get('StrongZ', 128))
 def num_time_steps(block_size, time_steps_for_128_block=TIME_STEPS_FOR_128_BLOCK):
@@ -35,7 +48,7 @@ class Scenario:
    def __init__(self, cells_per_block=(128, 128, 128), periodic=(1, 1, 1), blocks_per_process=1,
                 timesteps=None, time_step_strategy="normal", omega=1.8, inner_outer_split=(1, 1, 1),
                 warmup_steps=2, outer_iterations=3, init_shear_flow=False, boundary_setup=False,
-                 vtk_write_frequency=0, remaining_time_logger_frequency=-1):
+                 vtk_write_frequency=0, remaining_time_logger_frequency=-1, db_file_name=None):
        if boundary_setup:
            init_shear_flow = False
@@ -58,6 +71,7 @@ class Scenario:
        self.vtk_write_frequency = vtk_write_frequency
        self.remaining_time_logger_frequency = remaining_time_logger_frequency
+        self.db_file_name = DB_FILE if db_file_name is None else db_file_name
        self.config_dict = self.config(print_dict=False)
@@ -101,6 +115,15 @@ class Scenario:
        data['compile_flags'] = wlb.build_info.compiler_flags
        data['walberla_version'] = wlb.build_info.version
        data['build_machine'] = wlb.build_info.build_machine
+        if ms:
+            state = ms.MachineState(extended=False, anonymous=True)
+            state.generate()                        # generate subclasses
+            state.update()                          # read information
+            data["MachineState"] = str(state.get())
+        else:
+            print("MachineState module is not available. MachineState was not saved")
        sequenceValuesToScalars(data)
        result = data
@@ -111,8 +134,8 @@ class Scenario:
        table_name = table_name.replace("-", "_")
        for num_try in range(num_tries):
            try:
-                checkAndUpdateSchema(result, table_name, DB_FILE)
+                checkAndUpdateSchema(result, table_name, self.db_file_name)
-                storeSingle(result, table_name, DB_FILE)
+                storeSingle(result, table_name, self.db_file_name)
                break
            except sqlite3.OperationalError as e:
                wlb.log_warning(f"Sqlite DB writing failed: try {num_try + 1}/{num_tries}  {str(e)}")
@@ -156,18 +179,38 @@ def overlap_benchmark():
            scenarios.add(scenario)
-def scaling_benchmark():
+def weak_scaling_benchmark():
-    """Tests different communication overlapping strategies"""
+    wlb.log_info_on_root("Running weak scaling benchmark with one block per proc")
-    wlb.log_info_on_root("Running scaling benchmark")
    wlb.log_info_on_root("")
    scenarios = wlb.ScenarioManager()
-    cells_per_block = [(32, 32, 32), (128, 128, 128)]
-    for cell_per_block in cells_per_block:
+    for t in ["noOverlap", "simpleOverlap"]:
-        scenarios.add(Scenario(time_step_strategy='noOverlap',
+        scenarios.add(Scenario(time_step_strategy=t,
                               inner_outer_split=(1, 1, 1),
-                               cells_per_block=cell_per_block))
+                               cells_per_block=(WeakX, WeakY, WeakZ),
+                               boundary_setup=True,
+                               outer_iterations=1,
+                               db_file_name="weakScalingUniformGridOneBlock.sqlite3"))
+def strong_scaling_benchmark():
+    wlb.log_info_on_root("Running strong scaling benchmark with one block per proc")
+    wlb.log_info_on_root("")
+    scenarios = wlb.ScenarioManager()
+    domain_size = (StrongX, StrongY, StrongZ)
+    blocks = block_decomposition(wlb.mpi.numProcesses())
+    cells_per_block = tuple([d // b for d, b in zip(domain_size, reversed(blocks))])
+    for t in ["noOverlap", "simpleOverlap"]:
+        scenarios.add(Scenario(cells_per_block=cells_per_block,
+                               time_step_strategy=t,
+                               outer_iterations=1,
+                               timesteps=10,
+                               boundary_setup=True,
+                               db_file_name="strongScalingUniformGridOneBlock.sqlite3"))
 def single_node_benchmark():
@@ -176,13 +219,11 @@ def single_node_benchmark():
    wlb.log_info_on_root("")
    scenarios = wlb.ScenarioManager()
-    block_sizes = [(i, i, i) for i in (8, 16, 32, 64, 128)]
+    scenario = Scenario(cells_per_block=(128, 128, 128),
-    for block_size in block_sizes:
+                        time_step_strategy='kernelOnly',
-        scenario = Scenario(cells_per_block=block_size,
+                        outer_iterations=1,
-                            time_step_strategy='kernelOnly',
+                        timesteps=10)
-                            outer_iterations=1,
+    scenarios.add(scenario)
-                            timesteps=num_time_steps(block_size))
-        scenarios.add(scenario)
 def validation_run():
@@ -211,5 +252,15 @@ wlb.log_info_on_root(f"Batch run of benchmark scenarios, saving result to {DB_FI
 # performance of compute kernel (no communication)
 # overlap_benchmark()  # benchmarks different communication overlap options
 # profiling()  # run only two timesteps on a smaller domain for profiling only
-validation_run()
+# validation_run()
 # scaling_benchmark()
+if BENCHMARK == 0:
+    single_node_benchmark()
+elif BENCHMARK == 1:
+    weak_scaling_benchmark()
+elif BENCHMARK == 2:
+    strong_scaling_benchmark()
+else:
+    validation_run()
--- a/apps/benchmarks/UniformGridGPU/CMakeLists.txt
+++ b/apps/benchmarks/UniformGridGPU/CMakeLists.txt
@@ -3,13 +3,16 @@ waLBerla_link_files_to_builddir( "*.py" )
 waLBerla_link_files_to_builddir( "simulation_setup" )
-foreach(streaming_pattern pull push aa esotwist)
+foreach(streaming_pattern pull push aa esotwist esopull esopush)
    foreach(stencil d3q19 d3q27)
-        foreach (collision_setup srt trt mrt mrt-overrelax central central-overrelax cumulant cumulant-overrelax entropic smagorinsky)
+        foreach (collision_setup srt trt mrt mrt-overrelax central central-overrelax cumulant cumulant-overrelax cumulant-K17 entropic smagorinsky qr)
 	    # KBC methods only for D2Q9 and D3Q27 defined
 	    if (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
 		    continue()
-	    endif (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
+        endif (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
+        if (${collision_setup} STREQUAL "cumulant-K17" AND ${stencil} STREQUAL "d3q19")
+            continue()
+        endif (${collision_setup} STREQUAL "cumulant-K17" AND ${stencil} STREQUAL "d3q19")
            set(config ${stencil}_${streaming_pattern}_${collision_setup})
            waLBerla_generate_target_from_python(NAME UniformGridGPUGenerated_${config}
                    FILE UniformGridGPU.py

--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
@@ -136,7 +136,7 @@ int main(int argc, char** argv)
      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
      ///                                      LB SWEEPS AND BOUNDARY HANDLING                                       ///
      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-      const pystencils::UniformGridGPU_StreamOnlyKernel StreamOnlyKernel(pdfFieldGpuID, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2]);
+      const pystencils::UniformGridGPU_StreamOnlyKernel StreamOnlyKernel(pdfFieldGpuID);
      // Boundaries
      const FlagUID fluidFlagUID("Fluid");
@@ -264,6 +264,13 @@ int main(int argc, char** argv)
            python_coupling::PythonCallback pythonCallbackResults("results_callback");
            if (pythonCallbackResults.isCallable())
            {
+               pythonCallbackResults.data().exposeValue("numProcesses", performance.processes());
+               pythonCallbackResults.data().exposeValue("numThreads", performance.threads());
+               pythonCallbackResults.data().exposeValue("numCores", performance.cores());
+               pythonCallbackResults.data().exposeValue("numberOfCells", performance.numberOfCells());
+               pythonCallbackResults.data().exposeValue("numberOfFluidCells", performance.numberOfFluidCells());
+               pythonCallbackResults.data().exposeValue("mlups", performance.mlups(timesteps, time));
+               pythonCallbackResults.data().exposeValue("mlupsPerCore", performance.mlupsPerCore(timesteps, time));
               pythonCallbackResults.data().exposeValue("mlupsPerProcess", performance.mlupsPerProcess(timesteps, time));
               pythonCallbackResults.data().exposeValue("stencil", infoStencil);
               pythonCallbackResults.data().exposeValue("streamingPattern", infoStreamingPattern);

--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.py
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py
@@ -4,17 +4,16 @@ import pystencils as ps
 from dataclasses import replace
+from pystencils import Assignment
 from pystencils.typing import TypedSymbol
 from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisions
-from lbmpy import LBMConfig, LBMOptimisation, LBStencil, Method, Stencil
+from lbmpy import LBMConfig, LBMOptimisation, LBStencil, Method, Stencil, SubgridScaleModel
 from lbmpy.advanced_streaming import is_inplace
 from lbmpy.advanced_streaming.utility import streaming_patterns
 from lbmpy.boundaries import NoSlip, UBB
 from lbmpy.creationfunctions import create_lb_collision_rule
 from lbmpy.moments import get_default_moment_set_for_stencil
-from lbmpy.updatekernels import create_stream_only_kernel
-from lbmpy.fieldaccess import *
 from pystencils_walberla import CodeGeneration, generate_info_header, generate_sweep
 from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator
@@ -74,6 +73,12 @@ options_dict = {
        'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 18)],
        'compressible': True,
    },
+    'cumulant-K17': {
+        'method': Method.CUMULANT,
+        'relaxation_rate': omega,
+        'compressible': True,
+        'fourth_order_correction': 0.01
+    },
    'entropic': {
        'method': Method.TRT_KBC_N4,
        'compressible': True,
@@ -84,7 +89,12 @@ options_dict = {
    },
    'smagorinsky': {
        'method': Method.SRT,
-        'smagorinsky': False,
+        'subgrid_scale_model': SubgridScaleModel.SMAGORINSKY,
+        'relaxation_rate': omega,
+    },
+    'qr': {
+        'method': Method.SRT,
+        'subgrid_scale_model': SubgridScaleModel.QR,
        'relaxation_rate': omega,
    }
 }
@@ -101,7 +111,8 @@ const bool infoCsePdfs = {cse_pdfs};
 optimize = True
 with CodeGeneration() as ctx:
-    field_type = "float64" if ctx.double_accuracy else "float32"
+    pdf_data_type = "float64"
+    field_data_type = "float64"
    config_tokens = ctx.config.split('_')
    assert len(config_tokens) >= 3
@@ -124,8 +135,8 @@ with CodeGeneration() as ctx:
    options = options_dict[collision_setup]
    assert stencil.D == 3, "This application supports only three-dimensional stencils"
-    pdfs, pdfs_tmp = ps.fields(f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {field_type}[3D]", layout='fzyx')
+    pdfs, pdfs_tmp = ps.fields(f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {pdf_data_type}[3D]", layout='fzyx')
-    density_field, velocity_field = ps.fields(f"density, velocity(3) : {field_type}[3D]", layout='fzyx')
+    density_field, velocity_field = ps.fields(f"density, velocity(3) : {field_data_type}[3D]", layout='fzyx')
    macroscopic_fields = {'density': density_field, 'velocity': velocity_field}
    lbm_config = LBMConfig(stencil=stencil, field_name=pdfs.name, streaming_pattern=streaming_pattern, **options)
@@ -140,13 +151,12 @@ with CodeGeneration() as ctx:
    else:
        field_swaps = []
-    # Sweep for Stream only. This is for benchmarking an empty streaming pattern without LBM.
+    # This is a microbenchmark for testing how fast Q PDFs can be updated per cell. To avoid optimisations from
-    # is_inplace is set to False to ensure that the streaming is done with src and dst field.
+    # the compiler the PDFs are shuffled inside a cell. Otherwise, for common streaming patterns compilers would
-    # If this is not the case the compiler might simplify the streaming in a way that benchmarking makes no sense.
+    # typically remove the copy of the center PDF which results in an overestimation of the maximum performance
-    accessor = CollideOnlyInplaceAccessor()
+    stream_only_kernel = []
-    accessor.is_inplace = False
+    for i in range(stencil.Q):
-    field_swaps_stream_only = [(pdfs, pdfs_tmp)]
+        stream_only_kernel.append(Assignment(pdfs(i), pdfs((i + 3) % stencil.Q)))
-    stream_only_kernel = create_stream_only_kernel(stencil, pdfs, pdfs_tmp, accessor=accessor)
    # LB Sweep
    collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt)
@@ -158,9 +168,10 @@ with CodeGeneration() as ctx:
    lb_method = collision_rule.method
    no_slip = lbm_boundary_generator(class_name='NoSlip', flag_uid='NoSlip',
-                                     boundary_object=NoSlip())
+                                     boundary_object=NoSlip(), field_data_type=pdf_data_type)
    ubb = lbm_boundary_generator(class_name='UBB', flag_uid='UBB',
-                                 boundary_object=UBB([0.05, 0, 0], data_type=field_type))
+                                 boundary_object=UBB([0.05, 0, 0], data_type=field_data_type),
+                                 field_data_type=pdf_data_type)
    generate_lbm_package(ctx, name="UniformGridGPU",
                         collision_rule=collision_rule,
@@ -168,12 +179,12 @@ with CodeGeneration() as ctx:
                         nonuniform=False, boundaries=[no_slip, ubb],
                         macroscopic_fields=macroscopic_fields,
                         target=ps.Target.GPU, gpu_indexing_params=gpu_indexing_params,
+                         data_type=field_data_type, pdfs_data_type=pdf_data_type,
                         max_threads=max_threads)
    # Stream only kernel
-    vp = [('int32_t', 'cudaBlockSize0'), ('int32_t', 'cudaBlockSize1'), ('int32_t', 'cudaBlockSize2')]
+    generate_sweep(ctx, 'UniformGridGPU_StreamOnlyKernel', stream_only_kernel,
-    generate_sweep(ctx, 'UniformGridGPU_StreamOnlyKernel', stream_only_kernel, field_swaps=field_swaps_stream_only,
+                   gpu_indexing_params={'block_size': (128, 1, 1)}, target=ps.Target.GPU,
-                   gpu_indexing_params=gpu_indexing_params, varying_parameters=vp, target=ps.Target.GPU,
                   max_threads=max_threads)
    infoHeaderParams = {

--- a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py
+++ b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py
@@ -6,11 +6,25 @@ import sys
 import sqlite3
 from math import prod
+try:
+    import machinestate as ms
+except ImportError:
+    ms = None
 # Number of time steps run for a workload of 128^3 per GPU
 # if double as many cells are on the GPU, half as many time steps are run etc.
 # increase this to get more reliable measurements
 TIME_STEPS_FOR_128_BLOCK = 1000
 DB_FILE = os.environ.get('DB_FILE', "gpu_benchmark.sqlite3")
+BENCHMARK = int(os.environ.get('BENCHMARK', 0))
+WeakX = int(os.environ.get('WeakX', 128))
+WeakY = int(os.environ.get('WeakY', 128))
+WeakZ = int(os.environ.get('WeakZ', 128))
+StrongX = int(os.environ.get('StrongX', 128))
+StrongY = int(os.environ.get('StrongY', 128))
+StrongZ = int(os.environ.get('StrongZ', 128))
 BASE_CONFIG = {
    'DomainSetup': {
@@ -39,6 +53,8 @@ ldc_setup = {'Border': [
 def num_time_steps(block_size, time_steps_for_128_block=200):
    cells = block_size[0] * block_size[1] * block_size[2]
    time_steps = (128 ** 3 / cells) * time_steps_for_128_block
+    if time_steps < 10:
+        time_steps = 10
    return int(time_steps)
@@ -61,13 +77,13 @@ class Scenario:
                 inner_outer_split=(1, 1, 1), warmup_steps=5, outer_iterations=3,
                 init_shear_flow=False, boundary_setup=False,
                 vtk_write_frequency=0, remaining_time_logger_frequency=-1,
-                 additional_info=None):
+                 additional_info=None, blocks=None, db_file_name=None):
        if boundary_setup:
            init_shear_flow = False
            periodic = (0, 0, 0)
-        self.blocks = block_decomposition(wlb.mpi.numProcesses())
+        self.blocks = blocks if blocks else block_decomposition(wlb.mpi.numProcesses())
        self.cells_per_block = cells_per_block
        self.periodic = periodic
@@ -85,6 +101,7 @@ class Scenario:
        self.vtk_write_frequency = vtk_write_frequency
        self.remaining_time_logger_frequency = remaining_time_logger_frequency
+        self.db_file_name = DB_FILE if db_file_name is None else db_file_name
        self.config_dict = self.config(print_dict=False)
        self.additional_info = additional_info
@@ -97,7 +114,6 @@ class Scenario:
                'blocks': self.blocks,
                'cellsPerBlock': self.cells_per_block,
                'periodic': self.periodic,
-                'oneBlockPerProcess': True
            },
            'Parameters': {
                'omega': self.omega,
@@ -115,7 +131,6 @@ class Scenario:
            'Logging': {
                'logLevel': 'info',  # info progress detail tracing
            }
        }
        if self.boundary_setup:
            config_dict["Boundaries"] = ldc_setup
@@ -140,6 +155,15 @@ class Scenario:
        data['compile_flags'] = wlb.build_info.compiler_flags
        data['walberla_version'] = wlb.build_info.version
        data['build_machine'] = wlb.build_info.build_machine
+        if ms:
+            state = ms.MachineState(extended=False, anonymous=True)
+            state.generate()                        # generate subclasses
+            state.update()                          # read information
+            data["MachineState"] = str(state.get())
+        else:
+            print("MachineState module is not available. MachineState was not saved")
        sequenceValuesToScalars(data)
        result = data
@@ -150,8 +174,8 @@ class Scenario:
        table_name = table_name.replace("-", "_")  # - not allowed for table name would lead to syntax error
        for num_try in range(num_tries):
            try:
-                checkAndUpdateSchema(result, table_name, DB_FILE)
+                checkAndUpdateSchema(result, table_name, self.db_file_name)
-                storeSingle(result, table_name, DB_FILE)
+                storeSingle(result, table_name, self.db_file_name)
                break
            except sqlite3.OperationalError as e:
                wlb.log_warning(f"Sqlite DB writing failed: try {num_try + 1}/{num_tries}  {str(e)}")
@@ -200,12 +224,70 @@ def overlap_benchmark():
        scenarios.add(scenario)
+def no_overlap_scaling(cuda_enabled_mpi=False):
+    """Tests different communication overlapping strategies"""
+    wlb.log_info_on_root("Running scaling benchmark without communication hiding")
+    wlb.log_info_on_root("")
+    scenarios = wlb.ScenarioManager()
+    # no overlap
+    scenarios.add(Scenario(cells_per_block=(256, 256, 256),
+                           cuda_blocks=(128, 1, 1),
+                           time_step_strategy='noOverlap',
+                           inner_outer_split=(1, 1, 1),
+                           cuda_enabled_mpi=cuda_enabled_mpi,
+                           outer_iterations=1))
+def weak_scaling_overlap(cuda_enabled_mpi=False):
+    """Tests different communication overlapping strategies"""
+    wlb.log_info_on_root("Running scaling benchmark with communication hiding")
+    wlb.log_info_on_root("")
+    scenarios = wlb.ScenarioManager()
+    # overlap
+    for t in ["noOverlap", "simpleOverlap"]:
+        scenarios.add(Scenario(cells_per_block=(WeakX, WeakY, WeakZ),
+                               cuda_blocks=(128, 1, 1),
+                               time_step_strategy=t,
+                               inner_outer_split=(64, 64, 64),
+                               cuda_enabled_mpi=cuda_enabled_mpi,
+                               outer_iterations=1,
+                               boundary_setup=True,
+                               db_file_name="weakScalingUniformGrid.sqlite3"))
+def strong_scaling_overlap(cuda_enabled_mpi=False):
+    wlb.log_info_on_root("Running strong scaling benchmark with one block per proc with communication hiding")
+    wlb.log_info_on_root("")
+    scenarios = wlb.ScenarioManager()
+    domain_size = (StrongX, StrongY, StrongZ)
+    blocks = block_decomposition(wlb.mpi.numProcesses())
+    cells_per_block = tuple([d // b for d, b in zip(domain_size, reversed(blocks))])
+    # overlap
+    for t in ["noOverlap", "simpleOverlap"]:
+        scenarios.add(Scenario(cells_per_block=cells_per_block,
+                               cuda_blocks=(128, 1, 1),
+                               time_step_strategy=t,
+                               inner_outer_split=(1, 1, 1),
+                               cuda_enabled_mpi=cuda_enabled_mpi,
+                               outer_iterations=1,
+                               timesteps=50,
+                               blocks=blocks,
+                               boundary_setup=True,
+                               db_file_name="strongScalingUniformGridOneBlock.sqlite3"))
 def single_gpu_benchmark():
    """Benchmarks only the LBM compute kernel"""
    wlb.log_info_on_root("Running single GPU benchmarks")
    wlb.log_info_on_root("")
-    gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 8))
+    gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 40))
    gpu_mem = gpu_mem_gb * (2 ** 30)
    gpu_type = os.environ.get('GPU_TYPE')
@@ -214,12 +296,8 @@ def single_gpu_benchmark():
        additional_info['gpu_type'] = gpu_type
    scenarios = wlb.ScenarioManager()
-    block_sizes = [(i, i, i) for i in (32, 64, 128, 256)]
+    block_sizes = [(i, i, i) for i in (128, 256, 320)]
-    cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1), (512, 1, 1),
+    cuda_blocks = [(128, 1, 1), ]
-                   (32, 2, 1), (64, 2, 1), (128, 2, 1), (256, 2, 1),
-                   (32, 4, 1), (64, 4, 1), (128, 4, 1),
-                   (32, 8, 1), (64, 8, 1),
-                   (32, 16, 1)]
    for block_size in block_sizes:
        for cuda_block_size in cuda_blocks:
            # cuda_block_size = (256, 1, 1) and block_size = (64, 64, 64) would be cut to cuda_block_size = (64, 1, 1)
@@ -266,4 +344,14 @@ wlb.log_info_on_root(f"Batch run of benchmark scenarios, saving result to {DB_FI
 # performance of compute kernel (no communication)
 # overlap_benchmark()  # benchmarks different communication overlap options
 # profiling()  # run only two timesteps on a smaller domain for profiling only
-validation_run()
+# validation_run()
+if BENCHMARK == 0:
+    single_gpu_benchmark()
+elif BENCHMARK == 1:
+    weak_scaling_overlap(True)
+elif BENCHMARK == 2:
+    strong_scaling_overlap(True)
+else:
+    validation_run()
--- a/apps/pythonmodule/CMakeLists.txt
+++ b/apps/pythonmodule/CMakeLists.txt
@@ -17,7 +17,8 @@ if ( WALBERLA_BUILD_WITH_PYTHON )
        set ( pythonModules "-Wl,-whole-archive" ${PYTHON_MODULE_DEPENDENCIES}  "-Wl,-no-whole-archive" )
    endif()
-    add_library( walberla_cpp SHARED PythonModule.cpp )
+    add_library( walberla_cpp SHARED PythonModule.cpp
+            ../showcases/FlowAroundSphere/FlowAroundSphere.cpp)
    target_link_libraries( walberla_cpp ${WALBERLA_LINK_LIBRARIES_KEYWORD} ${pythonModules} ${SERVICE_LIBS} )

--- a/apps/showcases/CMakeLists.txt
+++ b/apps/showcases/CMakeLists.txt
@@ -10,6 +10,11 @@ add_subdirectory( PegIntoSphereBed )
 if ( WALBERLA_BUILD_WITH_CODEGEN)
   add_subdirectory( Antidunes )
+   add_subdirectory( FlowAroundSphere )
+   add_subdirectory( FlowAroundSphereCPU )
+   add_subdirectory( FlowAroundCylinder )
+   add_subdirectory( Channel )
+   add_subdirectory( TaylorGreenVortex )
   if (WALBERLA_BUILD_WITH_PYTHON)
      add_subdirectory( PhaseFieldAllenCahn )

--- a/apps/showcases/Channel/CMakeLists.txt
+++ b/apps/showcases/Channel/CMakeLists.txt
+waLBerla_link_files_to_builddir( "*.prm" )
+waLBerla_link_files_to_builddir( "*.py" )
+waLBerla_generate_target_from_python(NAME ChannelGenerated
+        FILE Channel.py
+        OUT_FILES ChannelStorageSpecification.h ChannelStorageSpecification.cpp
+        ChannelSweepCollection.h ChannelSweepCollection.cpp
+        NoSlip.h NoSlip.cpp
+        UBB.h UBB.cpp
+        ChannelBoundaryCollection.h
+        ChannelHeader.h)
+waLBerla_add_executable ( NAME Channel
+        FILES Channel.cpp
+        DEPENDS ChannelGenerated blockforest core field geometry lbm_generated timeloop )
--- a/apps/showcases/Channel/Channel.cpp
+++ b/apps/showcases/Channel/Channel.cpp
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file Channel.cpp
+//! \author Markus Holzer <markus.holzer@fau.de>
+//! \brief Couette flow
+//
+//======================================================================================================================
+#include "blockforest/Initialization.h"
+#include "blockforest/SetupBlockForest.h"
+#include "blockforest/communication/NonUniformBufferedScheme.h"
+#include "blockforest/loadbalancing/StaticCurve.h"
+#include "core/DataTypes.h"
+#include "core/Environment.h"
+#include "core/MemoryUsage.h"
+#include "core/SharedFunctor.h"
+#include "core/debug/TestSubsystem.h"
+#include "core/logging/Initialization.h"
+#include "core/math/Vector3.h"
+#include "core/timing/RemainingTimeLogger.h"
+#include "field/AddToStorage.h"
+#include "field/FlagField.h"
+#include "field/GhostLayerField.h"
+#include "field/vtk/VTKWriter.h"
+#include "geometry/InitBoundaryHandling.h"
+#include "timeloop/SweepTimeloop.h"
+#include "lbm_generated/communication/NonuniformGeneratedPdfPackInfo.h"
+#include "lbm_generated/evaluation/PerformanceEvaluation.h"
+#include "lbm_generated/field/AddToStorage.h"
+#include "lbm_generated/field/PdfField.h"
+#include "lbm_generated/refinement/BasicRecursiveTimeStep.h"
+// include the generated header file. It includes all generated classes
+#include "ChannelHeader.h"
+using namespace walberla;
+using namespace std::placeholders;
+using StorageSpecification_T = lbm::ChannelStorageSpecification;
+using Stencil_T              = StorageSpecification_T::Stencil;
+using CommunicationStencil_T = StorageSpecification_T::CommunicationStencil;
+using PdfField_T             = lbm_generated::PdfField< StorageSpecification_T >;
+using PackInfo_T             = lbm_generated::NonuniformGeneratedPdfPackInfo< PdfField_T >;
+using SweepCollection_T = lbm::ChannelSweepCollection;
+using VectorField_T = GhostLayerField< real_t, StorageSpecification_T::Stencil::D >;
+using ScalarField_T = GhostLayerField< real_t, 1 >;
+using flag_t               = walberla::uint8_t;
+using FlagField_T          = FlagField< flag_t >;
+using BoundaryCollection_T = lbm::ChannelBoundaryCollection< FlagField_T >;
+using Timestep_T = lbm_generated::BasicRecursiveTimeStep< PdfField_T, SweepCollection_T, BoundaryCollection_T >;
+using blockforest::communication::NonUniformBufferedScheme;
+static const real_t c0o1 = real_c(0.0) / real_c(1.0);
+static const real_t c1o1 = real_c(1.0) / real_c(1.0);
+static const real_t c2o1 = real_c(2.0) / real_c(1.0);
+static const real_t c3o1 = real_c(3.0) / real_c(1.0);
+static const real_t c6o1 = real_c(6.0) / real_c(1.0);
+static const real_t c8o1 = real_c(8.0) / real_c(1.0);
+static const real_t c9o1 = real_c(9.0) / real_c(1.0);
+static const real_t c18o1 = real_c(18.0) / real_c(1.0);
+static const real_t c36o1 = real_c(36.0) / real_c(1.0);
+static const real_t c1o2 = real_c(1.0) / real_c(2.0);
+static const real_t c3o2 = real_c(3.0) / real_c(2.0);
+static const real_t c9o2 = real_c(9.0) / real_c(2.0);
+static const real_t c9o4 = real_c(9.0) / real_c(4.0);
+static const real_t c1o3 = real_c(1.0) / real_c(3.0);
+static const real_t c2o3 = real_c(2.0) / real_c(3.0);
+static const real_t c2o9 = real_c(2.0) / real_c(9.0);
+static const real_t c4o9 = real_c(4.0) / real_c(9.0);
+static const real_t c1o4 = real_c(1.0) / real_c(4.0);
+static const real_t c1o6 = real_c(1.0) / real_c(6.0);
+static const real_t c1o8 = real_c(1.0) / real_c(8.0);
+static const real_t c1o9 = real_c(1.0) / real_c(9.0);
+static const real_t c1o16 = real_c(1.0) / real_c(16.0);
+static const real_t c1o18 = real_c(1.0) / real_c(18.0);
+static const real_t c1o27 = real_c(1.0) / real_c(27.0);
+static const real_t c1o36 = real_c(1.0) / real_c(36.0);
+static const real_t c1o64 = real_c(1.0) / real_c(64.0);
+struct IDs
+{
+   BlockDataID pdfField;
+   BlockDataID pdfFieldTmp;
+   BlockDataID velocityField;
+   BlockDataID densityField;
+   BlockDataID avgVelField;
+   BlockDataID avgVelSqrField;
+   BlockDataID avgPressureField;
+   BlockDataID flagField;
+   BlockDataID pdfFieldGPU;
+   BlockDataID velocityFieldGPU;
+   BlockDataID densityFieldGPU;
+};
+struct Setup
+{
+   Vector3< uint_t > blocks;
+   Vector3< uint_t > cellsPerBlock;
+   Vector3< uint_t > cells;
+   Vector3< bool > periodic;
+   uint_t numGhostLayers;
+};
+class AccuracyEvaluation
+{
+ public:
+   AccuracyEvaluation(std::shared_ptr< StructuredBlockForest >& blocks, const IDs& ids,
+                      SweepCollection_T& sweepCollection, const real_t maxLatticeVelocity, const uint_t logFrequency,
+                      const bool logToStream, const bool logToFile)
+      : blocks_(blocks), ids_(ids), sweepCollection_(sweepCollection), maxLatticeVelocity_(maxLatticeVelocity),
+        executionCounter_(uint_c(0)), filename_("Channel.txt"), normalizationFactor_(real_c(1.0))
+   {
+      logFrequency_  = logToStream ? logFrequency : uint_c(0);
+      plotFrequency_ = logToFile ? logFrequency : uint_c(0);
+   }
+   void setNormalizationFactor(const real_t f) { normalizationFactor_ = f; }
+   void setFilename(const std::string& filename) { filename_ = filename; }
+   real_t L1() const { return L1_; }
+   real_t L2() const { return L2_; }
+   real_t Lmax() const { return Lmax_; }
+   void operator()()
+   {
+      if (logFrequency_ == uint_c(0) && (plotFrequency_ == uint_c(0) || filename_.empty())) return;
+      ++executionCounter_;
+      const bool plot = (plotFrequency_ != uint_c(0) && (executionCounter_ - uint_c(1)) % plotFrequency_ == uint_c(0) &&
+                         !filename_.empty());
+      const bool log  = (logFrequency_ != uint_c(0) && (executionCounter_ - uint_c(1)) % logFrequency_ == uint_c(0));
+      if (!log && !plot) return;
+      const auto& domainAABB = blocks_->getDomain();
+      real_t _L1(real_c(0.0));
+      real_t _L2(real_c(0.0));
+      real_t _Lmax(real_c(0.0));
+      for (auto block = blocks_->begin(); block != blocks_->end(); ++block)
+      {
+         sweepCollection_.calculateMacroscopicParameters(block.get());
+         const VectorField_T* velocityField = block->template getData< const VectorField_T >(ids_.velocityField);
+         const auto level = blocks_->getLevel(*block);
+         const real_t volumeFraction =
+            (blocks_->dx(level) * blocks_->dy(level) * blocks_->dz(level)) / domainAABB.volume();
+         for (auto it = velocityField->beginXYZ(); it != velocityField->end(); ++it)
+         {
+            Vector3< real_t > center = blocks_->getBlockLocalCellCenter(*block, Cell(it.x(), it.y(), it.z()));
+            Vector3< real_t > exactVelocity(
+               real_c(maxLatticeVelocity_ * (center[1] - real_c(domainAABB.yMin())) / real_c(domainAABB.ySize())),
+               real_c(0.0), real_c(0.0));
+            Vector3< real_t > velocity(it.getF(0), it.getF(1), it.getF(2));
+            const auto error  = velocity - exactVelocity;
+            const real_t diff = error.length();
+            _L1 += diff * volumeFraction;
+            _L2 += diff * diff * volumeFraction;
+            _Lmax = std::max(_Lmax, diff);
+         }
+      }
+      mpi::reduceInplace(_L1, mpi::SUM);
+      mpi::reduceInplace(_L2, mpi::SUM);
+      mpi::reduceInplace(_Lmax, mpi::MAX);
+      _L2 = std::sqrt(_L2);
+      L1_   = _L1;
+      L2_   = _L2;
+      Lmax_ = _Lmax;
+      WALBERLA_ROOT_SECTION()
+      {
+         if (plot && executionCounter_ == uint_t(1))
+         {
+            std::ofstream file(filename_.c_str());
+            file << "# accuracy evaluation"
+                 << "# step [1], L1 [2], L2 [3], Lmax [4]" << std::endl;
+            file.close();
+         }
+         if (log)
+         {
+            WALBERLA_LOG_INFO("Evaluation of accuracy:"
+                              << "\n - L1:   " << L1_ << "\n - L2:   " << L2_ << "\n - Lmax: " << Lmax_);
+         }
+         if (plot)
+         {
+            std::ofstream file(filename_.c_str(), std::ofstream::out | std::ofstream::app);
+            file << (executionCounter_ - uint_t(1)) << " " << L1_ << " " << L2_ << " " << Lmax_ << std::endl;
+            file.close();
+         }
+      }
+   };
+ private:
+   std::shared_ptr< StructuredBlockForest > blocks_;
+   IDs ids_;
+   SweepCollection_T& sweepCollection_;
+   real_t maxLatticeVelocity_;
+   uint_t executionCounter_;
+   uint_t plotFrequency_;
+   uint_t logFrequency_;
+   std::string filename_;
+   real_t normalizationFactor_;
+   real_t L1_;
+   real_t L2_;
+   real_t Lmax_;
+}; // class AccuracyEvaluation
+namespace
+{
+void workloadMemoryAndSUIDAssignment(SetupBlockForest& forest, const memory_t memoryPerBlock)
+{
+   for (auto block = forest.begin(); block != forest.end(); ++block)
+   {
+      block->setWorkload(numeric_cast< workload_t >(uint_t(1) << block->getLevel()));
+      block->setMemory(memoryPerBlock);
+   }
+}
+shared_ptr< SetupBlockForest >
+   createSetupBlockForest(const blockforest::RefinementSelectionFunctions& refinementSelectionFunctions,
+                          const Setup& setup, uint_t numberOfProcesses, const memory_t memoryPerCell)
+{
+   shared_ptr< SetupBlockForest > forest = make_shared< SetupBlockForest >();
+   const memory_t memoryPerBlock =
+      numeric_cast< memory_t >((setup.cellsPerBlock[0] + uint_t(2) * setup.numGhostLayers) *
+                               (setup.cellsPerBlock[1] + uint_t(2) * setup.numGhostLayers) *
+                               (setup.cellsPerBlock[2] + uint_t(2) * setup.numGhostLayers)) *
+      memoryPerCell;
+   forest->addRefinementSelectionFunction(refinementSelectionFunctions);
+   forest->addWorkloadMemorySUIDAssignmentFunction(
+      std::bind(workloadMemoryAndSUIDAssignment, std::placeholders::_1, memoryPerBlock));
+   forest->init(
+      AABB(real_c(0), real_c(0), real_c(0), real_c(setup.cells[0]), real_c(setup.cells[1]), real_c(setup.cells[2])),
+      setup.blocks[0], setup.blocks[1], setup.blocks[2], setup.periodic[0], setup.periodic[1], setup.periodic[2]);
+   MPIManager::instance()->useWorldComm();
+   forest->balanceLoad(blockforest::StaticLevelwiseCurveBalanceWeighted(), numberOfProcesses);
+   // WALBERLA_LOG_INFO_ON_ROOT("SetupBlockForest created successfully:\n" << *forest);
+   return forest;
+}
+shared_ptr< blockforest::StructuredBlockForest >
+   createStructuredBlockForest(const blockforest::RefinementSelectionFunctions& refinementSelectionFunctions,
+                               const Setup& setup, const memory_t memoryPerCell)
+{
+   // WALBERLA_LOG_INFO_ON_ROOT("Creating the block structure ...");
+   shared_ptr< SetupBlockForest > sforest = createSetupBlockForest(
+      refinementSelectionFunctions, setup, uint_c(MPIManager::instance()->numProcesses()), memoryPerCell);
+   auto bf  = std::make_shared< blockforest::BlockForest >(uint_c(MPIManager::instance()->rank()), *sforest, false);
+   auto sbf = std::make_shared< blockforest::StructuredBlockForest >(bf, setup.cellsPerBlock[0], setup.cellsPerBlock[1],
+                                                                     setup.cellsPerBlock[2]);
+   sbf->createCellBoundingBoxes();
+   return sbf;
+}
+} // namespace
+class RefinementSelection
+{
+ public:
+   RefinementSelection(const uint_t level) : level_(level) {}
+   void operator()(SetupBlockForest& forest)
+   {
+      const AABB& domain = forest.getDomain();
+      const real_t ySize = (domain.ySize() / real_t(12)) * real_c(0.99);
+      const AABB intersectDomain(domain.xMin(), domain.yMax() - ySize, domain.zMin(), domain.xMax(), domain.yMax(),
+                                 domain.zMax());
+      for (auto& block : forest)
+      {
+         const AABB& aabb = block.getAABB();
+         if (intersectDomain.intersects(aabb))
+         {
+            if (block.getLevel() < level_) block.setMarker(true);
+         }
+      }
+   }
+ private:
+   uint_t level_;
+}; // class RefinementSelection
+class StreamCollide
+{
+ public:
+   StreamCollide(const shared_ptr< StructuredBlockForest > & blocks, const IDs& ids, const real_t omega) : ids_(ids)
+   {
+      for (uint_t level = 0; level < blocks->getNumberOfLevels(); level++)
+      {
+         const double level_scale_factor = double(uint_t(1) << level);
+         const double one                = double(1.0);
+         const double half               = double(0.5);
+         omegaVector.push_back( double(omega / (level_scale_factor * (-omega * half + one) + omega * half)) );
+      }
+   }
+   void operator()(Block * block)
+   {
+      auto pdfField = block->getData<PdfField_T>(ids_.pdfField);
+      auto pdfFieldTmp = block->getData<PdfField_T>(ids_.pdfFieldTmp);
+      CellInterval interval =pdfField->xyzSize();
+      streamCollide(block, interval);
+      pdfField->swapDataPointers(pdfFieldTmp);
+   }
+   void streamCollide(Block * block, const CellInterval & ci)
+   {
+      auto pdfField = block->getData<PdfField_T>(ids_.pdfField);
+      auto pdfFieldTmp = block->getData<PdfField_T>(ids_.pdfFieldTmp);
+      const uint_t level = block->getLevel();
+      double omega = omegaVector[level];
+      const double xi_0 = ((1.0) / (omega*-0.25 + 2.0));
+      const double rr_0 = xi_0*(omega*-2.0 + 4.0);
+      for (auto it: ci)
+      {
+         const cell_idx_t ctr_2 = it.z();
+         const cell_idx_t ctr_1 = it.y();
+         const cell_idx_t ctr_0 = it.x();
+         const double f_0 = pdfField->get(ctr_0, ctr_1, ctr_2, 0);
+         const double f_1 = pdfField->get(ctr_0, ctr_1 - 1, ctr_2, 1);
+         const double f_2 = pdfField->get(ctr_0, ctr_1 + 1, ctr_2, 2);
+         const double f_3 = pdfField->get(ctr_0 + 1, ctr_1, ctr_2, 3);
+         const double f_4 = pdfField->get(ctr_0 - 1, ctr_1, ctr_2, 4);
+         const double f_5 = pdfField->get(ctr_0, ctr_1, ctr_2 - 1, 5);
+         const double f_6 = pdfField->get(ctr_0, ctr_1, ctr_2 + 1, 6);
+         const double f_7 = pdfField->get(ctr_0 + 1, ctr_1 - 1, ctr_2, 7);
+         const double f_8 = pdfField->get(ctr_0 - 1, ctr_1 - 1, ctr_2, 8);
+         const double f_9 = pdfField->get(ctr_0 + 1, ctr_1 + 1, ctr_2, 9);
+         const double f_10 = pdfField->get(ctr_0 - 1, ctr_1 + 1, ctr_2, 10);
+         const double f_11 = pdfField->get(ctr_0, ctr_1 - 1, ctr_2 - 1, 11);
+         const double f_12 = pdfField->get(ctr_0, ctr_1 + 1, ctr_2 - 1, 12);
+         const double f_13 = pdfField->get(ctr_0 + 1, ctr_1, ctr_2 - 1, 13);
+         const double f_14 = pdfField->get(ctr_0 - 1, ctr_1, ctr_2 - 1, 14);
+         const double f_15 = pdfField->get(ctr_0, ctr_1 - 1, ctr_2 + 1, 15);
+         const double f_16 = pdfField->get(ctr_0, ctr_1 + 1, ctr_2 + 1, 16);
+         const double f_17 = pdfField->get(ctr_0 + 1, ctr_1, ctr_2 + 1, 17);
+         const double f_18 = pdfField->get(ctr_0 - 1, ctr_1, ctr_2 + 1, 18);
+         const double f_19 = pdfField->get(ctr_0 - 1, ctr_1 - 1, ctr_2 - 1, 19);
+         const double f_20 = pdfField->get(ctr_0 + 1, ctr_1 - 1, ctr_2 - 1, 20);
+         const double f_21 = pdfField->get(ctr_0 - 1, ctr_1 + 1, ctr_2 - 1, 21);
+         const double f_22 = pdfField->get(ctr_0 + 1, ctr_1 + 1, ctr_2 - 1, 22);
+         const double f_23 = pdfField->get(ctr_0 - 1, ctr_1 - 1, ctr_2 + 1, 23);
+         const double f_24 = pdfField->get(ctr_0 + 1, ctr_1 - 1, ctr_2 + 1, 24);
+         const double f_25 = pdfField->get(ctr_0 - 1, ctr_1 + 1, ctr_2 + 1, 25);
+         const double f_26 = pdfField->get(ctr_0 + 1, ctr_1 + 1, ctr_2 + 1, 26);
+         const double vel0Term = f_10 + f_14 + f_18 + f_19 + f_21 + f_23 + f_25 + f_4 + f_8;
+         const double vel1Term = f_1 + f_11 + f_15 + f_20 + f_24 + f_7;
+         const double vel2Term = f_12 + f_13 + f_22 + f_5;
+         const double delta_rho = f_0 + f_16 + f_17 + f_2 + f_26 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;
+         const double rho = delta_rho + 1.0;
+         const double xi_1 = ((1.0) / (rho));
+         const double u_0 = xi_1*(-f_13 - f_17 - f_20 - f_22 - f_24 - f_26 - f_3 - f_7 - f_9 + vel0Term);
+         const double u_1 = xi_1*(-f_10 - f_12 - f_16 + f_19 - f_2 - f_21 - f_22 + f_23 - f_25 - f_26 + f_8 - f_9 + vel1Term);
+         const double u_2 = xi_1*(f_11 + f_14 - f_15 - f_16 - f_17 - f_18 + f_19 + f_20 + f_21 - f_23 - f_24 - f_25 - f_26 - f_6 + vel2Term);
+         const double u0Mu1 = u_0 - u_1;
+         const double u0Pu1 = u_0 + u_1;
+         const double u1Pu2 = u_1 + u_2;
+         const double u1Mu2 = u_1 - u_2;
+         const double u0Mu2 = u_0 - u_2;
+         const double u0Pu2 = u_0 + u_2;
+         const double f_eq_common = delta_rho + rho*-1.5*(u_0*u_0) + rho*-1.5*(u_1*u_1) + rho*-1.5*(u_2*u_2);
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 0)  = f_0 + omega*(-f_0 + f_eq_common*0.29629629629629628);
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 1)  = f_1 + omega*(f_1*-0.5 + f_2*-0.5 + f_eq_common*0.07407407407407407 + rho*0.33333333333333331*(u_1*u_1)) + rr_0*(f_1*-0.5 + f_2*0.5 + rho*u_1*0.22222222222222221);
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 2)  = f_2 + omega*(f_1*-0.5 + f_2*-0.5 + f_eq_common*0.07407407407407407 + rho*0.33333333333333331*(u_1*u_1)) + rr_0*(f_1*0.5 + f_2*-0.5 + rho*u_1*-0.22222222222222221);
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 3)  = f_3 + omega*(f_3*-0.5 + f_4*-0.5 + f_eq_common*0.07407407407407407 + rho*0.33333333333333331*(u_0*u_0)) + rr_0*(f_3*-0.5 + f_4*0.5 + rho*u_0*-0.22222222222222221);
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 4)  = f_4 + omega*(f_3*-0.5 + f_4*-0.5 + f_eq_common*0.07407407407407407 + rho*0.33333333333333331*(u_0*u_0)) + rr_0*(f_3*0.5 + f_4*-0.5 + rho*u_0*0.22222222222222221);
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 5)  = f_5 + omega*(f_5*-0.5 + f_6*-0.5 + f_eq_common*0.07407407407407407 + rho*0.33333333333333331*(u_2*u_2)) + rr_0*(f_5*-0.5 + f_6*0.5 + rho*u_2*0.22222222222222221);
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 6)  = f_6 + omega*(f_5*-0.5 + f_6*-0.5 + f_eq_common*0.07407407407407407 + rho*0.33333333333333331*(u_2*u_2)) + rr_0*(f_5*0.5 + f_6*-0.5 + rho*u_2*-0.22222222222222221);
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 7)  = f_7 + omega*(f_10*-0.5 + f_7*-0.5 + f_eq_common*0.018518518518518517 + rho*0.083333333333333329*(u0Mu1*u0Mu1)) + rr_0*(f_10*0.5 + f_7*-0.5 + rho*u0Mu1*-0.055555555555555552);
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 8)  = f_8 + omega*(f_8*-0.5 + f_9*-0.5 + f_eq_common*0.018518518518518517 + rho*0.083333333333333329*(u0Pu1*u0Pu1)) + rr_0*(f_8*-0.5 + f_9*0.5 + rho*u0Pu1*0.055555555555555552);
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 9)  = f_9 + omega*(f_8*-0.5 + f_9*-0.5 + f_eq_common*0.018518518518518517 + rho*0.083333333333333329*(u0Pu1*u0Pu1)) + rr_0*(f_8*0.5 + f_9*-0.5 + rho*u0Pu1*-0.055555555555555552);
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 10) = f_10 + omega*(f_10*-0.5 + f_7*-0.5 + f_eq_common*0.018518518518518517 + rho*0.083333333333333329*(u0Mu1*u0Mu1)) + rr_0*(f_10*-0.5 + f_7*0.5 + rho*u0Mu1*0.055555555555555552);
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 11) = f_11 + omega*(f_11*-0.5 + f_16*-0.5 + f_eq_common*0.018518518518518517 + rho*0.083333333333333329*(u1Pu2*u1Pu2)) + rr_0*(f_11*-0.5 + f_16*0.5 + rho*u1Pu2*0.055555555555555552);
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 12) = f_12 + omega*(f_12*-0.5 + f_15*-0.5 + f_eq_common*0.018518518518518517 + rho*0.083333333333333329*(u1Mu2*u1Mu2)) + rr_0*(f_12*-0.5 + f_15*0.5 + rho*u1Mu2*-0.055555555555555552);
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 13) = f_13 + omega*(f_13*-0.5 + f_18*-0.5 + f_eq_common*0.018518518518518517 + rho*0.083333333333333329*(u0Mu2*u0Mu2)) + rr_0*(f_13*-0.5 + f_18*0.5 + rho*u0Mu2*-0.055555555555555552);
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 14) = f_14 + omega*(f_14*-0.5 + f_17*-0.5 + f_eq_common*0.018518518518518517 + rho*0.083333333333333329*(u0Pu2*u0Pu2)) + rr_0*(f_14*-0.5 + f_17*0.5 + rho*u0Pu2*0.055555555555555552);
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 15) = f_15 + omega*(f_12*-0.5 + f_15*-0.5 + f_eq_common*0.018518518518518517 + rho*0.083333333333333329*(u1Mu2*u1Mu2)) + rr_0*(f_12*0.5 + f_15*-0.5 + rho*u1Mu2*0.055555555555555552);
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 16) = f_16 + omega*(f_11*-0.5 + f_16*-0.5 + f_eq_common*0.018518518518518517 + rho*0.083333333333333329*(u1Pu2*u1Pu2)) + rr_0*(f_11*0.5 + f_16*-0.5 + rho*u1Pu2*-0.055555555555555552);
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 17) = f_17 + omega*(f_14*-0.5 + f_17*-0.5 + f_eq_common*0.018518518518518517 + rho*0.083333333333333329*(u0Pu2*u0Pu2)) + rr_0*(f_14*0.5 + f_17*-0.5 + rho*u0Pu2*-0.055555555555555552);
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 18) = f_18 + omega*(f_13*-0.5 + f_18*-0.5 + f_eq_common*0.018518518518518517 + rho*0.083333333333333329*(u0Mu2*u0Mu2)) + rr_0*(f_13*0.5 + f_18*-0.5 + rho*u0Mu2*0.055555555555555552);
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 19) = f_19 + omega*(delta_rho*-0.013888888888888888 + f_19*-0.5 + f_26*-0.5 + f_eq_common*0.018518518518518517 + rho*(0.020833333333333332*(u0Pu1*u0Pu1) + 0.020833333333333332*(u0Pu2*u0Pu2) + 0.020833333333333332*(u1Pu2*u1Pu2))) + rr_0*(f_19*-0.5 + f_26*0.5 + rho*(u0Pu1*0.013888888888888888 + u_2*0.013888888888888888));
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 20) = f_20 + omega*(delta_rho*-0.013888888888888888 + f_20*-0.5 + f_25*-0.5 + f_eq_common*0.018518518518518517 + rho*(0.020833333333333332*(u0Mu1*u0Mu1) + 0.020833333333333332*(u0Mu2*u0Mu2) + 0.020833333333333332*(u1Pu2*u1Pu2))) + rr_0*(f_20*-0.5 + f_25*0.5 + rho*(u0Mu1*-0.013888888888888888 + u_2*0.013888888888888888));
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 21) = f_21 + omega*(delta_rho*-0.013888888888888888 + f_21*-0.5 + f_24*-0.5 + f_eq_common*0.018518518518518517 + rho*(0.020833333333333332*(u0Mu1*u0Mu1) + 0.020833333333333332*(u0Pu2*u0Pu2) + 0.020833333333333332*(u1Mu2*u1Mu2))) + rr_0*(f_21*-0.5 + f_24*0.5 + rho*(u0Mu1*0.013888888888888888 + u_2*0.013888888888888888));
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 22) = f_22 + omega*(delta_rho*-0.013888888888888888 + f_22*-0.5 + f_23*-0.5 + f_eq_common*0.018518518518518517 + rho*(0.020833333333333332*(u0Mu2*u0Mu2) + 0.020833333333333332*(u0Pu1*u0Pu1) + 0.020833333333333332*(u1Mu2*u1Mu2))) + rr_0*(f_22*-0.5 + f_23*0.5 + rho*(u0Pu1*-0.013888888888888888 + u_2*0.013888888888888888));
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 23) = f_23 + omega*(delta_rho*-0.013888888888888888 + f_22*-0.5 + f_23*-0.5 + f_eq_common*0.018518518518518517 + rho*(0.020833333333333332*(u0Mu2*u0Mu2) + 0.020833333333333332*(u0Pu1*u0Pu1) + 0.020833333333333332*(u1Mu2*u1Mu2))) + rr_0*(f_22*0.5 + f_23*-0.5 + rho*(u0Pu1*0.013888888888888888 + u_2*-0.013888888888888888));
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 24) = f_24 + omega*(delta_rho*-0.013888888888888888 + f_21*-0.5 + f_24*-0.5 + f_eq_common*0.018518518518518517 + rho*(0.020833333333333332*(u0Mu1*u0Mu1) + 0.020833333333333332*(u0Pu2*u0Pu2) + 0.020833333333333332*(u1Mu2*u1Mu2))) + rr_0*(f_21*0.5 + f_24*-0.5 + rho*(u0Mu1*-0.013888888888888888 + u_2*-0.013888888888888888));
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 25) = f_25 + omega*(delta_rho*-0.013888888888888888 + f_20*-0.5 + f_25*-0.5 + f_eq_common*0.018518518518518517 + rho*(0.020833333333333332*(u0Mu1*u0Mu1) + 0.020833333333333332*(u0Mu2*u0Mu2) + 0.020833333333333332*(u1Pu2*u1Pu2))) + rr_0*(f_20*0.5 + f_25*-0.5 + rho*(u0Mu1*0.013888888888888888 + u_2*-0.013888888888888888));
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 26) = f_26 + omega*(delta_rho*-0.013888888888888888 + f_19*-0.5 + f_26*-0.5 + f_eq_common*0.018518518518518517 + rho*(0.020833333333333332*(u0Pu1*u0Pu1) + 0.020833333333333332*(u0Pu2*u0Pu2) + 0.020833333333333332*(u1Pu2*u1Pu2))) + rr_0*(f_19*0.5 + f_26*-0.5 + rho*(u0Pu1*-0.013888888888888888 + u_2*-0.013888888888888888));
+      }
+   }
+   void stream(Block * block, const CellInterval & ci)
+   {
+      auto pdfField = block->getData<PdfField_T>(ids_.pdfField);
+      auto pdfFieldTmp = block->getData<PdfField_T>(ids_.pdfFieldTmp);
+      for (auto it: ci)
+      {
+         const cell_idx_t ctr_2 = it.z();
+         const cell_idx_t ctr_1 = it.y();
+         const cell_idx_t ctr_0 = it.x();
+         const double f_0 = pdfField->get(ctr_0, ctr_1, ctr_2, 0);
+         const double f_1 = pdfField->get(ctr_0, ctr_1 - 1, ctr_2, 1);
+         const double f_2 = pdfField->get(ctr_0, ctr_1 + 1, ctr_2, 2);
+         const double f_3 = pdfField->get(ctr_0 + 1, ctr_1, ctr_2, 3);
+         const double f_4 = pdfField->get(ctr_0 - 1, ctr_1, ctr_2, 4);
+         const double f_5 = pdfField->get(ctr_0, ctr_1, ctr_2 - 1, 5);
+         const double f_6 = pdfField->get(ctr_0, ctr_1, ctr_2 + 1, 6);
+         const double f_7 = pdfField->get(ctr_0 + 1, ctr_1 - 1, ctr_2, 7);
+         const double f_8 = pdfField->get(ctr_0 - 1, ctr_1 - 1, ctr_2, 8);
+         const double f_9 = pdfField->get(ctr_0 + 1, ctr_1 + 1, ctr_2, 9);
+         const double f_10 = pdfField->get(ctr_0 - 1, ctr_1 + 1, ctr_2, 10);
+         const double f_11 = pdfField->get(ctr_0, ctr_1 - 1, ctr_2 - 1, 11);
+         const double f_12 = pdfField->get(ctr_0, ctr_1 + 1, ctr_2 - 1, 12);
+         const double f_13 = pdfField->get(ctr_0 + 1, ctr_1, ctr_2 - 1, 13);
+         const double f_14 = pdfField->get(ctr_0 - 1, ctr_1, ctr_2 - 1, 14);
+         const double f_15 = pdfField->get(ctr_0, ctr_1 - 1, ctr_2 + 1, 15);
+         const double f_16 = pdfField->get(ctr_0, ctr_1 + 1, ctr_2 + 1, 16);
+         const double f_17 = pdfField->get(ctr_0 + 1, ctr_1, ctr_2 + 1, 17);
+         const double f_18 = pdfField->get(ctr_0 - 1, ctr_1, ctr_2 + 1, 18);
+         const double f_19 = pdfField->get(ctr_0 - 1, ctr_1 - 1, ctr_2 - 1, 19);
+         const double f_20 = pdfField->get(ctr_0 + 1, ctr_1 - 1, ctr_2 - 1, 20);
+         const double f_21 = pdfField->get(ctr_0 - 1, ctr_1 + 1, ctr_2 - 1, 21);
+         const double f_22 = pdfField->get(ctr_0 + 1, ctr_1 + 1, ctr_2 - 1, 22);
+         const double f_23 = pdfField->get(ctr_0 - 1, ctr_1 - 1, ctr_2 + 1, 23);
+         const double f_24 = pdfField->get(ctr_0 + 1, ctr_1 - 1, ctr_2 + 1, 24);
+         const double f_25 = pdfField->get(ctr_0 - 1, ctr_1 + 1, ctr_2 + 1, 25);
+         const double f_26 = pdfField->get(ctr_0 + 1, ctr_1 + 1, ctr_2 + 1, 26);
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 0)  = f_0;
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 1)  = f_1;
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 2)  = f_2;
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 3)  = f_3;
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 4)  = f_4;
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 5)  = f_5;
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 6)  = f_6;
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 7)  = f_7;
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 8)  = f_8;
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 9)  = f_9;
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 10) = f_10;
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 11) = f_11;
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 12) = f_12;
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 13) = f_13;
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 14) = f_14;
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 15) = f_15;
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 16) = f_16;
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 17) = f_17;
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 18) = f_18;
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 19) = f_19;
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 20) = f_20;
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 21) = f_21;
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 22) = f_22;
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 23) = f_23;
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 24) = f_24;
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 25) = f_25;
+         pdfFieldTmp->get(ctr_0, ctr_1, ctr_2, 26) = f_26;
+      }
+   }
+ private:
+   IDs ids_;
+   std::vector<real_t> omegaVector;
+};
+struct InterpolationCoefficients
+{
+   real_t a000, a100, a010, a001, a200, a020, a002, a110, a101, a011;
+   real_t b000, b100, b010, b001, b200, b020, b002, b110, b101, b011;
+   real_t c000, c100, c010, c001, c200, c020, c002, c110, c101, c011;
+   real_t d000, d100, d010, d001, d110, d101, d011;
+   real_t a111, b111, c111, d111;
+   real_t LaplaceRho;
+};
+struct MomentsOnSourceNode
+{
+   real_t drho;
+   real_t velocityX;
+   real_t velocityY;
+   real_t velocityZ;
+   real_t kxyFromfcNEQ;
+   real_t kyzFromfcNEQ;
+   real_t kxzFromfcNEQ;
+   real_t kxxMyyFromfcNEQ;
+   real_t kxxMzzFromfcNEQ;
+   bool dirty{true};
+   void calculate(const real_t *const f, const real_t omega)
+   {
+      const real_t fP00 = f[4];
+      const real_t fM00 = f[3];
+      const real_t f0P0 = f[1];
+      const real_t f0M0 = f[2];
+      const real_t f00P = f[5];
+      const real_t f00M = f[6];
+      const real_t fPP0 = f[8];
+      const real_t fMM0 = f[9];
+      const real_t fPM0 = f[10];
+      const real_t fMP0 = f[7];
+      const real_t fP0P = f[14];
+      const real_t fM0M = f[17];
+      const real_t fP0M = f[18];
+      const real_t fM0P = f[13];
+      const real_t f0PP = f[11];
+      const real_t f0MM = f[16];
+      const real_t f0PM = f[15];
+      const real_t f0MP = f[12];
+      const real_t fPPP = f[19];
+      const real_t fMPP = f[20];
+      const real_t fPMP = f[21];
+      const real_t fMMP = f[22];
+      const real_t fPPM = f[23];
+      const real_t fMPM = f[24];
+      const real_t fPMM = f[25];
+      const real_t fMMM = f[26];
+      const real_t vel0Term = f[10] + f[14] + f[18] + f[19] + f[21] + f[23] + f[25] + f[4] + f[8];
+      const real_t vel1Term = f[1] + f[11] + f[15] + f[20] + f[24] + f[7];
+      const real_t vel2Term = f[12] + f[13] + f[22] + f[5];
+      this->drho = f[0] + f[16] + f[17] + f[2] + f[26] + f[3] + f[6] + f[9] + vel0Term + vel1Term + vel2Term;
+      const real_t rho = this->drho + 1.0;
+      const real_t oneOverRho = ((1.0) / (rho));
+      this->velocityX = oneOverRho*(-f[13] - f[17] - f[20] - f[22] - f[24] - f[26] - f[3] - f[7] - f[9] + vel0Term);
+      this->velocityY = oneOverRho*(-f[10] - f[12] - f[16] + f[19] - f[2] - f[21] - f[22] + f[23] - f[25] - f[26] + f[8] - f[9] + vel1Term);
+      this->velocityZ = oneOverRho*(f[11] + f[14] - f[15] - f[16] - f[17] - f[18] + f[19] + f[20] + f[21] - f[23] - f[24] - f[25] - f[26] - f[6] + vel2Term);
+      WALBERLA_CHECK_FLOAT_EQUAL(rho, real_c(1.0))
+      WALBERLA_CHECK_FLOAT_EQUAL(this->velocityX, real_c(0.01))
+      WALBERLA_CHECK_FLOAT_EQUAL(this->velocityY, real_c(0.0))
+      WALBERLA_CHECK_FLOAT_EQUAL(this->velocityZ, real_c(0.0))
+      ////////////////////////////////////////////////////////////////////////////////////
+      //! - Calculate second order moments for interpolation
+      //!
+      // example: kxxMzz: moment, second derivative in x direction minus the second derivative in z direction
+      this->kxyFromfcNEQ = -c3o1 * omega *
+                           ((fMM0 + fMMM + fMMP - fMP0 - fMPM - fMPP - fPM0 - fPMM - fPMP + fPP0 + fPPM + fPPP) *
+                               oneOverRho - ((this->velocityX * this->velocityY)));
+      this->kyzFromfcNEQ = -c3o1 * omega *
+                           ((f0MM + fPMM + fMMM - f0MP - fPMP - fMMP - f0PM - fPPM - fMPM + f0PP + fPPP + fMPP) *
+                               oneOverRho -
+                            ((this->velocityY * this->velocityZ)));
+      this->kxzFromfcNEQ = -c3o1 * omega *
+                           ((fM0M + fMMM + fMPM - fM0P - fMMP - fMPP - fP0M - fPMM - fPPM + fP0P + fPMP + fPPP) *
+                               oneOverRho -
+                            ((this->velocityX * this->velocityZ)));
+      this->kxxMyyFromfcNEQ = -c3o2 * omega *
+                              ((fM0M + fM00 + fM0P - f0MM - f0M0 - f0MP - f0PM - f0P0 - f0PP + fP0M + fP00 + fP0P) *
+                                  oneOverRho -
+                               ((this->velocityX * this->velocityX - this->velocityY * this->velocityY)));
+      this->kxxMzzFromfcNEQ = -c3o2 * omega *
+                              ((fMM0 + fM00 + fMP0 - f0MM - f0MP - f00M - f00P - f0PM - f0PP + fPM0 + fP00 + fPP0) *
+                                  oneOverRho -
+                               ((this->velocityX * this->velocityX - this->velocityZ * this->velocityZ)));
+      dirty = false;
+   }
+};
+class MomentsOnSourceNodeSet
+{
+ private:
+   MomentsOnSourceNode momentsPPP;
+   MomentsOnSourceNode momentsMPP;
+   MomentsOnSourceNode momentsPMP;
+   MomentsOnSourceNode momentsMMP;
+   MomentsOnSourceNode momentsPPM;
+   MomentsOnSourceNode momentsMPM;
+   MomentsOnSourceNode momentsPMM;
+   MomentsOnSourceNode momentsMMM;
+ public:
+   void calculatePPP(const real_t *const f, const real_t omega)
+   {
+      momentsPPP.calculate(f, omega);
+   }
+    void calculateMPP(const real_t *const f, const real_t omega)
+   {
+      momentsMPP.calculate(f, omega);
+   }
+   void calculatePMP(const real_t *const f, const real_t omega)
+   {
+      momentsPMP.calculate(f, omega);
+   }
+   void calculateMMP(const real_t *const f, const real_t omega)
+   {
+      momentsMMP.calculate(f, omega);
+   }
+   void calculatePPM(const real_t *const f, const real_t omega)
+   {
+      momentsPPM.calculate(f, omega);
+   }
+   void calculateMPM(const real_t *const f, const real_t omega)
+   {
+      momentsMPM.calculate(f, omega);
+   }
+   void calculatePMM(const real_t *const f, const real_t omega)
+   {
+      momentsPMM.calculate(f, omega);
+   }
+   void calculateMMM(const real_t *const f, const real_t omega)
+   {
+      momentsMMM.calculate(f, omega);
+   }
+   void check()
+   {
+      WALBERLA_CHECK(!momentsPPP.dirty)
+      WALBERLA_CHECK(!momentsMPP.dirty)
+      WALBERLA_CHECK(!momentsPMP.dirty)
+      WALBERLA_CHECK(!momentsMMP.dirty)
+      WALBERLA_CHECK(!momentsPPM.dirty)
+      WALBERLA_CHECK(!momentsMPM.dirty)
+      WALBERLA_CHECK(!momentsPMM.dirty)
+      WALBERLA_CHECK(!momentsMMM.dirty)
+   }
+   void calculateCoefficients(InterpolationCoefficients &coefficients) const
+   {
+      real_t& a000 = coefficients.a000;
+      real_t& b000 = coefficients.b000;
+      real_t& c000 = coefficients.c000;
+      real_t& d000 = coefficients.d000;
+      real_t& a100 = coefficients.a100;
+      real_t& b100 = coefficients.b100;
+      real_t& c100 = coefficients.c100;
+      real_t& d100 = coefficients.d100;
+      real_t& a010 = coefficients.a010;
+      real_t& b010 = coefficients.b010;
+      real_t& c010 = coefficients.c010;
+      real_t& d010 = coefficients.d010;
+      real_t& a001 = coefficients.a001;
+      real_t& b001 = coefficients.b001;
+      real_t& c001 = coefficients.c001;
+      real_t& d001 = coefficients.d001;
+      real_t& d110 = coefficients.d110, &d101 = coefficients.d101, &d011 = coefficients.d011;
+      real_t& a200 = coefficients.a200, &a020 = coefficients.a020, &a002 = coefficients.a002;
+      real_t& b200 = coefficients.b200, &b020 = coefficients.b020, &b002 = coefficients.b002;
+      real_t& c200 = coefficients.c200, &c020 = coefficients.c020, &c002 = coefficients.c002;
+      real_t& a110 = coefficients.a110, &a101 = coefficients.a101, &a011 = coefficients.a011;
+      real_t& b110 = coefficients.b110, &b101 = coefficients.b101, &b011 = coefficients.b011;
+      real_t& c110 = coefficients.c110, &c101 = coefficients.c101, &c011 = coefficients.c011;
+      real_t &a111 = coefficients.a111, &b111 = coefficients.b111, &c111 = coefficients.c111, &d111 = coefficients.d111;
+      real_t &LaplaceRho = coefficients.LaplaceRho;
+      const real_t drhoPPP = momentsPPP.drho, vx1PPP = momentsPPP.velocityX, vx2PPP = momentsPPP.velocityY, vx3PPP = momentsPPP.velocityZ;
+      const real_t drhoMPP = momentsMPP.drho, vx1MPP = momentsMPP.velocityX, vx2MPP = momentsMPP.velocityY, vx3MPP = momentsMPP.velocityZ;
+      const real_t drhoPMP = momentsPMP.drho, vx1PMP = momentsPMP.velocityX, vx2PMP = momentsPMP.velocityY, vx3PMP = momentsPMP.velocityZ;
+      const real_t drhoMMP = momentsMMP.drho, vx1MMP = momentsMMP.velocityX, vx2MMP = momentsMMP.velocityY, vx3MMP = momentsMMP.velocityZ;
+      const real_t drhoPPM = momentsPPM.drho, vx1PPM = momentsPPM.velocityX, vx2PPM = momentsPPM.velocityY, vx3PPM = momentsPPM.velocityZ;
+      const real_t drhoMPM = momentsMPM.drho, vx1MPM = momentsMPM.velocityX, vx2MPM = momentsMPM.velocityY, vx3MPM = momentsMPM.velocityZ;
+      const real_t drhoPMM = momentsPMM.drho, vx1PMM = momentsPMM.velocityX, vx2PMM = momentsPMM.velocityY, vx3PMM = momentsPMM.velocityZ;
+      const real_t drhoMMM = momentsMMM.drho, vx1MMM = momentsMMM.velocityX, vx2MMM = momentsMMM.velocityY, vx3MMM = momentsMMM.velocityZ;
+      // second order moments at the source nodes
+      const real_t kxyFromfcNEQPPP = momentsPPP.kxyFromfcNEQ, kyzFromfcNEQPPP = momentsPPP.kyzFromfcNEQ, kxzFromfcNEQPPP = momentsPPP.kxzFromfcNEQ, kxxMyyFromfcNEQPPP = momentsPPP.kxxMyyFromfcNEQ, kxxMzzFromfcNEQPPP = momentsPPP.kxxMzzFromfcNEQ;
+      const real_t kxyFromfcNEQMPP = momentsMPP.kxyFromfcNEQ, kyzFromfcNEQMPP = momentsMPP.kyzFromfcNEQ, kxzFromfcNEQMPP = momentsMPP.kxzFromfcNEQ, kxxMyyFromfcNEQMPP = momentsMPP.kxxMyyFromfcNEQ, kxxMzzFromfcNEQMPP = momentsMPP.kxxMzzFromfcNEQ;
+      const real_t kxyFromfcNEQPMP = momentsPMP.kxyFromfcNEQ, kyzFromfcNEQPMP = momentsPMP.kyzFromfcNEQ, kxzFromfcNEQPMP = momentsPMP.kxzFromfcNEQ, kxxMyyFromfcNEQPMP = momentsPMP.kxxMyyFromfcNEQ, kxxMzzFromfcNEQPMP = momentsPMP.kxxMzzFromfcNEQ;
+      const real_t kxyFromfcNEQMMP = momentsMMP.kxyFromfcNEQ, kyzFromfcNEQMMP = momentsMMP.kyzFromfcNEQ, kxzFromfcNEQMMP = momentsMMP.kxzFromfcNEQ, kxxMyyFromfcNEQMMP = momentsMMP.kxxMyyFromfcNEQ, kxxMzzFromfcNEQMMP = momentsMMP.kxxMzzFromfcNEQ;
+      const real_t kxyFromfcNEQPPM = momentsPPM.kxyFromfcNEQ, kyzFromfcNEQPPM = momentsPPM.kyzFromfcNEQ, kxzFromfcNEQPPM = momentsPPM.kxzFromfcNEQ, kxxMyyFromfcNEQPPM = momentsPPM.kxxMyyFromfcNEQ, kxxMzzFromfcNEQPPM = momentsPPM.kxxMzzFromfcNEQ;
+      const real_t kxyFromfcNEQMPM = momentsMPM.kxyFromfcNEQ, kyzFromfcNEQMPM = momentsMPM.kyzFromfcNEQ, kxzFromfcNEQMPM = momentsMPM.kxzFromfcNEQ, kxxMyyFromfcNEQMPM = momentsMPM.kxxMyyFromfcNEQ, kxxMzzFromfcNEQMPM = momentsMPM.kxxMzzFromfcNEQ;
+      const real_t kxyFromfcNEQPMM = momentsPMM.kxyFromfcNEQ, kyzFromfcNEQPMM = momentsPMM.kyzFromfcNEQ, kxzFromfcNEQPMM = momentsPMM.kxzFromfcNEQ, kxxMyyFromfcNEQPMM = momentsPMM.kxxMyyFromfcNEQ, kxxMzzFromfcNEQPMM = momentsPMM.kxxMzzFromfcNEQ;
+      const real_t kxyFromfcNEQMMM = momentsMMM.kxyFromfcNEQ, kyzFromfcNEQMMM = momentsMMM.kyzFromfcNEQ, kxzFromfcNEQMMM = momentsMMM.kxzFromfcNEQ, kxxMyyFromfcNEQMMM = momentsMMM.kxxMyyFromfcNEQ, kxxMzzFromfcNEQMMM = momentsMMM.kxxMzzFromfcNEQ;
+      a000 = c1o64 * (
+                        c2o1 * (
+                                  ((kxyFromfcNEQMMM - kxyFromfcNEQPPP) + (kxyFromfcNEQMMP - kxyFromfcNEQPPM)) + ((kxyFromfcNEQPMM - kxyFromfcNEQMPP) + (kxyFromfcNEQPMP - kxyFromfcNEQMPM)) +
+                                  ((kxzFromfcNEQMMM - kxzFromfcNEQPPP) + (kxzFromfcNEQPPM - kxzFromfcNEQMMP)) + ((kxzFromfcNEQPMM - kxzFromfcNEQMPP) + (kxzFromfcNEQMPM - kxzFromfcNEQPMP)) +
+                                  ((vx2PPP + vx2MMM) + (vx2PPM + vx2MMP)) - ((vx2MPP + vx2PMM) + (vx2MPM + vx2PMP)) +
+                                  ((vx3PPP + vx3MMM) - (vx3PPM + vx3MMP)) + ((vx3PMP + vx3MPM) - (vx3MPP + vx3PMM))) +
+                        c8o1 * (((vx1PPP + vx1MMM) + (vx1PPM + vx1MMP)) + ((vx1MPP + vx1PMM) + (vx1PMP + vx1MPM))) +
+                        ((kxxMyyFromfcNEQMMM - kxxMyyFromfcNEQPPP) + (kxxMyyFromfcNEQMMP - kxxMyyFromfcNEQPPM)) +
+                        ((kxxMyyFromfcNEQMPP - kxxMyyFromfcNEQPMM) + (kxxMyyFromfcNEQMPM - kxxMyyFromfcNEQPMP)) +
+                        ((kxxMzzFromfcNEQMMM - kxxMzzFromfcNEQPPP) + (kxxMzzFromfcNEQMMP - kxxMzzFromfcNEQPPM)) +
+                        ((kxxMzzFromfcNEQMPP - kxxMzzFromfcNEQPMM) + (kxxMzzFromfcNEQMPM - kxxMzzFromfcNEQPMP)));
+      b000 = c1o64 * (
+                        c2o1 * (
+                                  ((kxxMyyFromfcNEQPPP - kxxMyyFromfcNEQMMM) + (kxxMyyFromfcNEQPPM - kxxMyyFromfcNEQMMP)) +
+                                  ((kxxMyyFromfcNEQMPP - kxxMyyFromfcNEQPMM) + (kxxMyyFromfcNEQMPM - kxxMyyFromfcNEQPMP)) +
+                                  ((kxyFromfcNEQMMM - kxyFromfcNEQPPP) + (kxyFromfcNEQMMP - kxyFromfcNEQPPM)) +
+                                  ((kxyFromfcNEQMPP - kxyFromfcNEQPMM) + (kxyFromfcNEQMPM - kxyFromfcNEQPMP)) +
+                                  ((kyzFromfcNEQMMM - kyzFromfcNEQPPP) + (kyzFromfcNEQPPM - kyzFromfcNEQMMP)) +
+                                  ((kyzFromfcNEQPMM - kyzFromfcNEQMPP) + (kyzFromfcNEQMPM - kyzFromfcNEQPMP)) +
+                                  ((vx1PPP + vx1MMM) + (vx1PPM + vx1MMP)) - ((vx1MPM + vx1MPP) + (vx1PMM + vx1PMP)) +
+                                  ((vx3PPP + vx3MMM) - (vx3PPM + vx3MMP)) + ((vx3MPP + vx3PMM) - (vx3MPM + vx3PMP))) +
+                        c8o1 * (((vx2PPP + vx2MMM) + (vx2PPM + vx2MMP)) + ((vx2MPP + vx2PMM) + (vx2MPM + vx2PMP))) +
+                        ((kxxMzzFromfcNEQMMM - kxxMzzFromfcNEQPPP) + (kxxMzzFromfcNEQMMP - kxxMzzFromfcNEQPPM)) +
+                        ((kxxMzzFromfcNEQPMM - kxxMzzFromfcNEQMPP) + (kxxMzzFromfcNEQPMP - kxxMzzFromfcNEQMPM)));
+      c000 = c1o64 * (
+                        c2o1 * (
+                                  ((kxxMzzFromfcNEQPPP - kxxMzzFromfcNEQMMM) + (kxxMzzFromfcNEQMMP - kxxMzzFromfcNEQPPM)) +
+                                  ((kxxMzzFromfcNEQMPP - kxxMzzFromfcNEQPMM) + (kxxMzzFromfcNEQPMP - kxxMzzFromfcNEQMPM)) +
+                                  ((kxzFromfcNEQMMM - kxzFromfcNEQPPP) + (kxzFromfcNEQMMP - kxzFromfcNEQPPM)) +
+                                  ((kxzFromfcNEQMPP - kxzFromfcNEQPMM) + (kxzFromfcNEQMPM - kxzFromfcNEQPMP)) +
+                                  ((kyzFromfcNEQMMM - kyzFromfcNEQPPP) + (kyzFromfcNEQMMP - kyzFromfcNEQPPM)) +
+                                  ((kyzFromfcNEQPMM - kyzFromfcNEQMPP) + (kyzFromfcNEQPMP - kyzFromfcNEQMPM)) +
+                                  ((vx1PPP + vx1MMM) - (vx1MMP + vx1PPM)) + ((vx1MPM + vx1PMP) - (vx1MPP + vx1PMM)) +
+                                  ((vx2PPP + vx2MMM) - (vx2MMP + vx2PPM)) + ((vx2MPP + vx2PMM) - (vx2MPM + vx2PMP))) +
+                        c8o1 * (((vx3PPP + vx3MMM) + (vx3PPM + vx3MMP)) + ((vx3PMM + vx3MPP) + (vx3PMP + vx3MPM))) +
+                        ((kxxMyyFromfcNEQMMM - kxxMyyFromfcNEQPPP) + (kxxMyyFromfcNEQPPM - kxxMyyFromfcNEQMMP)) +
+                        ((kxxMyyFromfcNEQPMM - kxxMyyFromfcNEQMPP) + (kxxMyyFromfcNEQMPM - kxxMyyFromfcNEQPMP)));
+      a100 = c1o4 * (((vx1PPP - vx1MMM) + (vx1PPM - vx1MMP)) + ((vx1PMM - vx1MPP) + (vx1PMP - vx1MPM)));
+      b100 = c1o4 * (((vx2PPP - vx2MMM) + (vx2PPM - vx2MMP)) + ((vx2PMM - vx2MPP) + (vx2PMP - vx2MPM)));
+      c100 = c1o4 * (((vx3PPP - vx3MMM) + (vx3PPM - vx3MMP)) + ((vx3PMM - vx3MPP) + (vx3PMP - vx3MPM)));
+      a200 = c1o16 * (
+                        c2o1 * (
+                                  ((vx2PPP + vx2MMM) + (vx2PPM - vx2MPP)) + ((vx2MMP - vx2PMM) - (vx2MPM + vx2PMP)) +
+                                  ((vx3PPP + vx3MMM) - (vx3PPM + vx3MPP)) + ((vx3MPM + vx3PMP) - (vx3MMP + vx3PMM))) +
+                        ((kxxMyyFromfcNEQPPP - kxxMyyFromfcNEQMMM) + (kxxMyyFromfcNEQPPM - kxxMyyFromfcNEQMMP)) +
+                        ((kxxMyyFromfcNEQPMM - kxxMyyFromfcNEQMPP) + (kxxMyyFromfcNEQPMP - kxxMyyFromfcNEQMPM)) +
+                        ((kxxMzzFromfcNEQPPP - kxxMzzFromfcNEQMMM) + (kxxMzzFromfcNEQPPM - kxxMzzFromfcNEQMMP)) +
+                        ((kxxMzzFromfcNEQPMM - kxxMzzFromfcNEQMPP) + (kxxMzzFromfcNEQPMP - kxxMzzFromfcNEQMPM)));
+      b200 = c1o8 * (
+                       c2o1 * (
+                                 -((vx1PPP + vx1MMM) + (vx1PPM + vx1MMP)) + ((vx1MPP + vx1PMM) + (vx1MPM + vx1PMP))) +
+                       ((kxyFromfcNEQPPP - kxyFromfcNEQMMM) + (kxyFromfcNEQPPM - kxyFromfcNEQMMP)) +
+                       ((kxyFromfcNEQPMM - kxyFromfcNEQMPP) + (kxyFromfcNEQPMP - kxyFromfcNEQMPM)));
+      c200 = c1o8 * (
+                       c2o1 * (
+                                 ((vx1PPM + vx1MMP) - (vx1PPP + vx1MMM)) + ((vx1MPP + vx1PMM) - (vx1MPM + vx1PMP))) +
+                       ((kxzFromfcNEQPPP - kxzFromfcNEQMMM) + (kxzFromfcNEQPPM - kxzFromfcNEQMMP)) +
+                       ((kxzFromfcNEQPMM - kxzFromfcNEQMPP) + (kxzFromfcNEQPMP - kxzFromfcNEQMPM)));
+      a010 = c1o4 * (((vx1PPP - vx1MMM) + (vx1PPM - vx1MMP)) + ((vx1MPP - vx1PMM) + (vx1MPM - vx1PMP)));
+      b010 = c1o4 * (((vx2PPP - vx2MMM) + (vx2PPM - vx2MMP)) + ((vx2MPP - vx2PMM) + (vx2MPM - vx2PMP)));
+      c010 = c1o4 * (((vx3PPP - vx3MMM) + (vx3PPM - vx3MMP)) + ((vx3MPP - vx3PMM) + (vx3MPM - vx3PMP)));
+      a020 = c1o8 * (
+                       c2o1 * (-((vx2PPP + vx2MMM) + (vx2MMP + vx2PPM)) + ((vx2MPP + vx2PMM) + (vx2MPM + vx2PMP))) +
+                       ((kxyFromfcNEQPPP - kxyFromfcNEQMMM) + (kxyFromfcNEQPPM - kxyFromfcNEQMMP)) +
+                       ((kxyFromfcNEQMPP - kxyFromfcNEQPMM) + (kxyFromfcNEQMPM - kxyFromfcNEQPMP)));
+      b020 = c1o16 * (
+                        c2o1 * (
+                                  ((kxxMyyFromfcNEQMMM - kxxMyyFromfcNEQPPP) + (kxxMyyFromfcNEQMMP - kxxMyyFromfcNEQPPM)) +
+                                  ((kxxMyyFromfcNEQPMM - kxxMyyFromfcNEQMPP) + (kxxMyyFromfcNEQPMP - kxxMyyFromfcNEQMPM)) +
+                                  ((vx1PPP + vx1MMM) + (vx1PPM + vx1MMP)) - ((vx1MPP + vx1PMM) + (vx1PMP + vx1MPM)) +
+                                  ((vx3PPP + vx3MMM) - (vx3PPM + vx3MMP)) + ((vx3MPP + vx3PMM) - (vx3MPM + vx3PMP))) +
+                        ((kxxMzzFromfcNEQPPP - kxxMzzFromfcNEQMMM) + (kxxMzzFromfcNEQPPM - kxxMzzFromfcNEQMMP)) +
+                        ((kxxMzzFromfcNEQMPP - kxxMzzFromfcNEQPMM) + (kxxMzzFromfcNEQMPM - kxxMzzFromfcNEQPMP)));
+      c020 = c1o8 * (
+                       c2o1 * (((vx2MMP + vx2PPM) - (vx2PPP + vx2MMM)) + ((vx2PMP + vx2MPM) - (vx2MPP + vx2PMM))) +
+                       ((kyzFromfcNEQPPP - kyzFromfcNEQMMM) + (kyzFromfcNEQPPM - kyzFromfcNEQMMP)) +
+                       ((kyzFromfcNEQMPP - kyzFromfcNEQPMM) + (kyzFromfcNEQMPM - kyzFromfcNEQPMP)));
+      a001 = c1o4 * (((vx1PPP - vx1MMM) + (vx1MMP - vx1PPM)) + ((vx1MPP - vx1PMM) + (vx1PMP - vx1MPM)));
+      b001 = c1o4 * (((vx2PPP - vx2MMM) + (vx2MMP - vx2PPM)) + ((vx2MPP - vx2PMM) + (vx2PMP - vx2MPM)));
+      c001 = c1o4 * (((vx3PPP - vx3MMM) + (vx3MMP - vx3PPM)) + ((vx3MPP - vx3PMM) + (vx3PMP - vx3MPM)));
+      a002 = c1o8 * (
+                       c2o1 * (((vx3PPM + vx3MMP) - (vx3PPP + vx3MMM)) + ((vx3MPP + vx3PMM) - (vx3PMP + vx3MPM))) +
+                       ((kxzFromfcNEQPPP - kxzFromfcNEQMMM) + (kxzFromfcNEQMMP - kxzFromfcNEQPPM)) +
+                       ((kxzFromfcNEQPMP - kxzFromfcNEQMPM) + (kxzFromfcNEQMPP - kxzFromfcNEQPMM)));
+      b002 = c1o8 * (
+                       c2o1 * (((vx3PPM + vx3MMP) - (vx3PPP + vx3MMM)) + ((vx3MPM + vx3PMP) - (vx3PMM + vx3MPP))) +
+                       ((kyzFromfcNEQPPP - kyzFromfcNEQMMM) + (kyzFromfcNEQMMP - kyzFromfcNEQPPM)) +
+                       ((kyzFromfcNEQPMP - kyzFromfcNEQMPM) + (kyzFromfcNEQMPP - kyzFromfcNEQPMM)));
+      c002 = c1o16 * (
+                        c2o1 * (
+                                  ((kxxMzzFromfcNEQMMM - kxxMzzFromfcNEQPPP) + (kxxMzzFromfcNEQPPM - kxxMzzFromfcNEQMMP)) +
+                                  ((kxxMzzFromfcNEQMPM - kxxMzzFromfcNEQPMP) + (kxxMzzFromfcNEQPMM - kxxMzzFromfcNEQMPP)) +
+                                  ((vx1PPP + vx1MMM) - (vx1MMP + vx1PPM)) + ((vx1MPM + vx1PMP) - (vx1PMM + vx1MPP)) +
+                                  ((vx2PPP + vx2MMM) - (vx2MMP + vx2PPM)) + ((vx2PMM + vx2MPP) - (vx2MPM + vx2PMP))) +
+                        ((kxxMyyFromfcNEQPPP - kxxMyyFromfcNEQMMM) + (kxxMyyFromfcNEQMMP - kxxMyyFromfcNEQPPM)) +
+                        ((kxxMyyFromfcNEQPMP - kxxMyyFromfcNEQMPM) + (kxxMyyFromfcNEQMPP - kxxMyyFromfcNEQPMM)));
+      a110 = c1o2 * (((vx1PPP + vx1MMM) + (vx1MMP + vx1PPM)) - ((vx1MPM + vx1PMP) + (vx1PMM + vx1MPP)));
+      b110 = c1o2 * (((vx2PPP + vx2MMM) + (vx2MMP + vx2PPM)) - ((vx2MPM + vx2PMP) + (vx2PMM + vx2MPP)));
+      c110 = c1o2 * (((vx3PPP + vx3MMM) + (vx3MMP + vx3PPM)) - ((vx3MPM + vx3PMP) + (vx3PMM + vx3MPP)));
+      a101 = c1o2 * (((vx1PPP + vx1MMM) - (vx1MMP + vx1PPM)) + ((vx1MPM + vx1PMP) - (vx1PMM + vx1MPP)));
+      b101 = c1o2 * (((vx2PPP + vx2MMM) - (vx2MMP + vx2PPM)) + ((vx2MPM + vx2PMP) - (vx2PMM + vx2MPP)));
+      c101 = c1o2 * (((vx3PPP + vx3MMM) - (vx3MMP + vx3PPM)) + ((vx3MPM + vx3PMP) - (vx3PMM + vx3MPP)));
+      a011 = c1o2 * (((vx1PPP + vx1MMM) - (vx1MMP + vx1PPM)) + ((vx1PMM + vx1MPP) - (vx1MPM + vx1PMP)));
+      b011 = c1o2 * (((vx2PPP + vx2MMM) - (vx2MMP + vx2PPM)) + ((vx2PMM + vx2MPP) - (vx2MPM + vx2PMP)));
+      c011 = c1o2 * (((vx3PPP + vx3MMM) - (vx3MMP + vx3PPM)) + ((vx3PMM + vx3MPP) - (vx3MPM + vx3PMP)));
+      a111 = ((vx1PPP - vx1MMM) + (vx1MMP - vx1PPM)) + ((vx1MPM - vx1PMP) + (vx1PMM - vx1MPP));
+      b111 = ((vx2PPP - vx2MMM) + (vx2MMP - vx2PPM)) + ((vx2MPM - vx2PMP) + (vx2PMM - vx2MPP));
+      c111 = ((vx3PPP - vx3MMM) + (vx3MMP - vx3PPM)) + ((vx3MPM - vx3PMP) + (vx3PMM - vx3MPP));
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      //!- Calculate coefficients for the polynomial interpolation of the pressure
+      //!
+      const real_t xoff = 0.0;
+      const real_t yoff = 0.0;
+      const real_t zoff = 0.0;
+      const real_t xoffsq = xoff * xoff;
+      const real_t yoffsq = yoff * yoff;
+      const real_t zoffsq = zoff * zoff;
+      LaplaceRho =
+         ((xoff != c0o1) || (yoff != c0o1) || (zoff != c0o1))
+            ? c0o1 : -c3o1 * (a100 * a100 + b010 * b010 + c001 * c001) - c6o1 * (b100 * a010 + c100 * a001 + c010 * b001);
+      d000 = c1o8 * (((drhoPPP + drhoMMM) + (drhoPPM + drhoMMP)) + ((drhoPMM + drhoMPP) + (drhoPMP + drhoMPM)));
+      d100 = c1o4 * (((drhoPPP - drhoMMM) + (drhoPPM - drhoMMP)) + ((drhoPMM - drhoMPP) + (drhoPMP - drhoMPM)));
+      d010 = c1o4 * (((drhoPPP - drhoMMM) + (drhoPPM - drhoMMP)) + ((drhoMPP - drhoPMM) + (drhoMPM - drhoPMP)));
+      d001 = c1o4 * (((drhoPPP - drhoMMM) + (drhoMMP - drhoPPM)) + ((drhoMPP - drhoPMM) + (drhoPMP - drhoMPM)));
+      d110 = c1o2 * (((drhoPPP + drhoMMM) + (drhoPPM + drhoMMP)) - ((drhoPMM + drhoMPP) + (drhoPMP + drhoMPM)));
+      d101 = c1o2 * (((drhoPPP + drhoMMM) - (drhoPPM + drhoMMP)) + ((drhoPMP + drhoMPM) - (drhoPMM + drhoMPP)));
+      d011 = c1o2 * (((drhoPPP + drhoMMM) - (drhoPPM + drhoMMP)) + ((drhoPMM + drhoMPP) - (drhoPMP + drhoMPM)));
+      d111 = (((drhoPPP - drhoMMM) + (drhoMMP - drhoPPM)) + ((drhoPMM - drhoMPP) + (drhoMPM - drhoPMP)));
+      //////////////////////////////////////////////////////////////////////////
+      //! - Extrapolation for refinement in to the wall (polynomial coefficients)
+      //!
+      ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      //
+      // x------x
+      // |      |
+      // |   ---+--->X
+      // |      |  |
+      // x------x  |
+      //          offset-vector
+      //
+      ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      a000 = a000 + xoff * a100 + yoff * a010 + zoff * a001 + xoffsq * a200 + yoffsq * a020 + zoffsq * a002 +
+             xoff * yoff * a110 + xoff * zoff * a101 + yoff * zoff * a011;
+      a100 = a100 + c2o1 * xoff * a200 + yoff * a110 + zoff * a101;
+      a010 = a010 + c2o1 * yoff * a020 + xoff * a110 + zoff * a011;
+      a001 = a001 + c2o1 * zoff * a002 + xoff * a101 + yoff * a011;
+      b000 = b000 + xoff * b100 + yoff * b010 + zoff * b001 + xoffsq * b200 + yoffsq * b020 + zoffsq * b002 +
+             xoff * yoff * b110 + xoff * zoff * b101 + yoff * zoff * b011;
+      b100 = b100 + c2o1 * xoff * b200 + yoff * b110 + zoff * b101;
+      b010 = b010 + c2o1 * yoff * b020 + xoff * b110 + zoff * b011;
+      b001 = b001 + c2o1 * zoff * b002 + xoff * b101 + yoff * b011;
+      c000 = c000 + xoff * c100 + yoff * c010 + zoff * c001 + xoffsq * c200 + yoffsq * c020 + zoffsq * c002 +
+             xoff * yoff * c110 + xoff * zoff * c101 + yoff * zoff * c011;
+      c100 = c100 + c2o1 * xoff * c200 + yoff * c110 + zoff * c101;
+      c010 = c010 + c2o1 * yoff * c020 + xoff * c110 + zoff * c011;
+      c001 = c001 + c2o1 * zoff * c002 + xoff * c101 + yoff * c011;
+      d000 = d000 + xoff * d100 + yoff * d010 + zoff * d001 +
+             xoff * yoff * d110 + xoff * zoff * d101 + yoff * zoff * d011;
+      d100 = d100 + yoff * d110 + zoff * d101;
+      d010 = d010 + xoff * d110 + zoff * d011;
+      d001 = d001 + xoff * d101 + yoff * d011;
+   }
+};
+class Interpolation
+{
+ public:
+   Interpolation(const shared_ptr< StructuredBlockForest > & blocks, const IDs& ids, const real_t omega)
+      :blocks_(blocks), ids_(ids)
+   {
+      for (uint_t level = 0; level < blocks->getNumberOfLevels(); level++)
+      {
+         const double level_scale_factor = double(uint_t(1) << level);
+         const double one                = double(1.0);
+         const double half               = double(0.5);
+         omegaVector.push_back( double(omega / (level_scale_factor * (-omega * half + one) + omega * half)) );
+         std::vector<Block *> tmp;
+         blocks_->getBlocks(tmp, level);
+         blocksPerLevel_.push_back(tmp);
+         for(auto block: tmp)
+         {
+            fineCoefficients[block].resize(9);
+         }
+      }
+      std::vector<AABB> coarseAABBs;
+      coarseAABBs.resize(8);
+      std::vector<AABB> fineAABBs;
+      for(auto block: blocksPerLevel_[0])
+      {
+         const AABB coarseAABB = block->getAABB();
+         const real_t xMid     = (coarseAABB.xMin() + coarseAABB.xMax()) / real_c(2.0);
+         const real_t yMid     = (coarseAABB.yMin() + coarseAABB.yMax()) / real_c(2.0);
+         const real_t zMid     = (coarseAABB.zMin() + coarseAABB.zMax()) / real_c(2.0);
+         for (uint_t c = 0; c != 8; ++c)
+         {
+            coarseAABBs[c] = AABB(((c & 1) ? xMid : coarseAABB.xMin()),  // xmin (incl.)
+                                  ((c & 2) ? yMid : coarseAABB.yMin()),  // ymin (incl.)
+                                  ((c & 4) ? zMid : coarseAABB.zMin()),  // zmin (incl.)
+                                  ((c & 1) ? coarseAABB.xMax() : xMid),  // xmax (excl.)
+                                  ((c & 2) ? coarseAABB.yMax() : yMid),  // ymax (excl.)
+                                  ((c & 4) ? coarseAABB.zMax() : zMid)); // zmax (excl.))
+         }
+         fineAABBs.clear();
+         for (auto dir = Stencil_T::beginNoCenter(); dir != Stencil_T::end(); ++dir)
+         {
+            const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex(*dir);
+            if (block->getNeighborhoodSectionSize(neighborIdx) == uint_t(0)) continue;
+            if (!(block->neighborhoodSectionHasSmallerBlocks(neighborIdx))) continue;
+            for (uint_t n = 0; n != block->getNeighborhoodSectionSize(neighborIdx); ++n)
+            {
+               auto fineAABB = block->getNeighborAABB(neighborIdx, n);
+               if (std::find(fineAABBs.begin(), fineAABBs.end(), fineAABB) != fineAABBs.end()) continue;
+               fineAABBs.emplace_back(fineAABB);
+            }
+         }
+         for (auto dir = Stencil_T::beginNoCenter(); dir != Stencil_T::end(); ++dir)
+         {
+            for (uint_t c = 0; c != 8; ++c)
+            {
+               CellInterval ci(0, 0, 0, 3, 3, 3);
+               Vector3< real_t > p(((coarseAABBs[c].xMin() + coarseAABBs[c].xMax()) / real_c(2.0)) + real_c(dir.cx() * coarseAABBs[c].xSize()),
+                                   ((coarseAABBs[c].yMin() + coarseAABBs[c].yMax()) / real_c(2.0)) + real_c(dir.cy() * coarseAABBs[c].ySize()),
+                                   ((coarseAABBs[c].zMin() + coarseAABBs[c].zMax()) / real_c(2.0)) + real_c(dir.cz() * coarseAABBs[c].zSize()));
+               blocks_->mapToPeriodicDomain(p);
+               ci.min()[0] = (c & 1) ? cell_idx_c(3) : cell_idx_c(1);
+               ci.min()[1] = (c & 2) ? cell_idx_c(3) : cell_idx_c(1);
+               ci.min()[2] = (c & 4) ? cell_idx_c(3) : cell_idx_c(1);
+               ci.max()[0] = (c & 1) ? cell_idx_c(3) : cell_idx_c(1);
+               ci.max()[1] = (c & 2) ? cell_idx_c(3) : cell_idx_c(1);
+               ci.max()[2] = (c & 4) ? cell_idx_c(3) : cell_idx_c(1);
+               for (auto fineAABB : fineAABBs)
+               {
+                  if (fineAABB.contains(p))
+                  {
+                     auto fineBlock = dynamic_cast< Block* >(blocks->getBlock(p));
+                     auto t = std::make_tuple(block, fineBlock, *dir, ci);
+                     mapping_.emplace_back(t);
+                  }
+               }
+            }
+         }
+      }
+   }
+//   void getCoarsePDFs(Vector3<real_t>& globalPoint, real_t* local)
+//   {
+//      blocks_->mapToPeriodicDomain(globalPoint);
+//      auto b = blocks_->getBlock(globalPoint);
+//      Cell localCell = blocks_->getBlockLocalCell(*b, globalPoint);
+//      auto f = b->getData<PdfField_T>(ids_.pdfField);
+//
+//      for(cell_idx_t i = 0; i < Stencil_T::Q; ++i)
+//      {
+//         local[i] = f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[i],
+//                           localCell.y() + StorageSpecification_T::AccessorEVEN::readY[i],
+//                           localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[i],
+//                           StorageSpecification_T::AccessorEVEN::readD[i]);
+//      }
+//   }
+   void getCoarsePDFs(Block* block, Cell& cell, real_t* local)
+   {
+      auto f = block->getData<PdfField_T>(ids_.pdfField);
+      for(cell_idx_t i = 0; i < Stencil_T::Q; ++i)
+      {
+         local[i] = f->get(cell.x() + StorageSpecification_T::AccessorEVEN::writeX[i],
+                           cell.y() + StorageSpecification_T::AccessorEVEN::writeY[i],
+                           cell.z() + StorageSpecification_T::AccessorEVEN::writeZ[i],
+                           StorageSpecification_T::AccessorEVEN::writeD[i]);
+      }
+   }
+   void getFinePDFs(Block* block, Cell& cell, real_t* local)
+   {
+      auto f = block->getData<PdfField_T>(ids_.pdfField);
+      for(cell_idx_t i = 0; i < Stencil_T::Q; ++i)
+      {
+         local[i] = f->get(cell.x() + StorageSpecification_T::AccessorEVEN::writeX[i],
+                           cell.y() + StorageSpecification_T::AccessorEVEN::writeY[i],
+                           cell.z() + StorageSpecification_T::AccessorEVEN::writeZ[i],
+                           StorageSpecification_T::AccessorEVEN::writeD[i]);
+      }
+   }
+   inline void backwardInverseChimeraWithK(real_t &mfa, real_t &mfb, real_t &mfc, real_t vv,
+                                           real_t v2, real_t Kinverse, real_t K)
+   {
+      const real_t m0 = (((mfc - mfb) * c1o2 + mfb * vv) * Kinverse + (mfa * Kinverse + c1o1) * (v2 - vv) * c1o2) * K;
+      const real_t m1 = (((mfa - mfc) - c2o1 * mfb * vv) * Kinverse + (mfa * Kinverse + c1o1) * (-v2)) * K;
+      mfc = (((mfc + mfb) * c1o2 + mfb * vv) * Kinverse + (mfa * Kinverse + c1o1) * (v2 + vv) * c1o2) * K;
+      mfa = m0;
+      mfb = m1;
+   }
+   inline void backwardChimera(real_t &mfa, real_t &mfb, real_t &mfc, real_t vv, real_t v2)
+   {
+      const real_t ma = (mfc + mfa * (v2 - vv)) * c1o2 + mfb * (vv - c1o2);
+      const real_t mb = ((mfa - mfc) - mfa * v2) - c2o1 * mfb * vv;
+      mfc = (mfc + mfa * (v2 + vv)) * c1o2 + mfb * (vv + c1o2);
+      mfb = mb;
+      mfa = ma;
+   }
+   void interpolateCF(Block* block, Cell& localCell, Vector3<real_t>& offset, const real_t omegaF, const InterpolationCoefficients &coefficients)
+   {
+      const real_t epsnew = c1o2;
+      const real_t x = offset[0];
+      const real_t y = offset[1];
+      const real_t z = offset[2];
+      const real_t useNEQ = c1o1;
+      const real_t kxyAverage    = c0o1;
+      const real_t kyzAverage    = c0o1;
+      const real_t kxzAverage    = c0o1;
+      const real_t kxxMyyAverage = c0o1;
+      const real_t kxxMzzAverage = c0o1;
+      const real_t& a000 = coefficients.a000;
+      const real_t& b000 = coefficients.b000;
+      const real_t& c000 = coefficients.c000;
+      const real_t& d000 = coefficients.d000;
+      const real_t& a100 = coefficients.a100;
+      const real_t& b100 = coefficients.b100;
+      const real_t& c100 = coefficients.c100;
+      const real_t& d100 = coefficients.d100;
+      const real_t& a010 = coefficients.a010;
+      const real_t& b010 = coefficients.b010;
+      const real_t& c010 = coefficients.c010;
+      const real_t& d010 = coefficients.d010;
+      const real_t& a001 = coefficients.a001;
+      const real_t& b001 = coefficients.b001;
+      const real_t& c001 = coefficients.c001;
+      const real_t& d001 = coefficients.d001;
+      const real_t& d110 = coefficients.d110, &d101 = coefficients.d101, &d011 = coefficients.d011;
+      const real_t& a200 = coefficients.a200, &a020 = coefficients.a020, &a002 = coefficients.a002;
+      const real_t& b200 = coefficients.b200, &b020 = coefficients.b020, &b002 = coefficients.b002;
+      const real_t& c200 = coefficients.c200, &c020 = coefficients.c020, &c002 = coefficients.c002;
+      const real_t& a110 = coefficients.a110, &a101 = coefficients.a101, &a011 = coefficients.a011;
+      const real_t& b110 = coefficients.b110, &b101 = coefficients.b101, &b011 = coefficients.b011;
+      const real_t& c110 = coefficients.c110, &c101 = coefficients.c101, &c011 = coefficients.c011;
+      const real_t &a111 = coefficients.a111, &b111 = coefficients.b111, &c111 = coefficients.c111, &d111 = coefficients.d111;
+      const real_t &LaplaceRho = coefficients.LaplaceRho;
+      ////////////////////////////////////////////////////////////////////////////////////
+      //! - Set all moments to zero
+      //!
+      real_t m111 = real_c(0.0);
+      real_t m211 = real_c(0.0);
+      real_t m011 = real_c(0.0);
+      real_t m121 = real_c(0.0);
+      real_t m101 = real_c(0.0);
+      real_t m112 = real_c(0.0);
+      real_t m110 = real_c(0.0);
+      real_t m221 = real_c(0.0);
+      real_t m001 = real_c(0.0);
+      real_t m201 = real_c(0.0);
+      real_t m021 = real_c(0.0);
+      real_t m212 = real_c(0.0);
+      real_t m010 = real_c(0.0);
+      real_t m210 = real_c(0.0);
+      real_t m012 = real_c(0.0);
+      real_t m122 = real_c(0.0);
+      real_t m100 = real_c(0.0);
+      real_t m120 = real_c(0.0);
+      real_t m102 = real_c(0.0);
+      real_t m222 = real_c(0.0);
+      real_t m022 = real_c(0.0);
+      real_t m202 = real_c(0.0);
+      real_t m002 = real_c(0.0);
+      real_t m220 = real_c(0.0);
+      real_t m020 = real_c(0.0);
+      real_t m200 = real_c(0.0);
+      real_t m000 = real_c(0.0);
+      real_t& f000 = m111;
+      real_t& fP00 = m211;
+      real_t& fM00 = m011;
+      real_t& f0P0 = m121;
+      real_t& f0M0 = m101;
+      real_t& f00P = m112;
+      real_t& f00M = m110;
+      real_t& fPP0 = m221;
+      real_t& fMM0 = m001;
+      real_t& fPM0 = m201;
+      real_t& fMP0 = m021;
+      real_t& fP0P = m212;
+      real_t& fM0M = m010;
+      real_t& fP0M = m210;
+      real_t& fM0P = m012;
+      real_t& f0PP = m122;
+      real_t& f0MM = m100;
+      real_t& f0PM = m120;
+      real_t& f0MP = m102;
+      real_t& fPPP = m222;
+      real_t& fMPP = m022;
+      real_t& fPMP = m202;
+      real_t& fMMP = m002;
+      real_t& fPPM = m220;
+      real_t& fMPM = m020;
+      real_t& fPMM = m200;
+      real_t& fMMM = m000;
+      ////////////////////////////////////////////////////////////////////////////////
+      //! - Set macroscopic values on destination node (zeroth and first order moments)
+      //!
+      real_t press = d000 + x * d100 + y * d010 + z * d001 +
+                   x * y * d110 + x * z * d101 + y * z * d011 + x * y * z * d111 + c3o1 * x * x * LaplaceRho;
+      real_t vvx   = a000 + x * a100 + y * a010 + z * a001 +
+                 x * x * a200 + y * y * a020 + z * z * a002 +
+                 x * y * a110 + x * z * a101 + y * z * a011 + x * y * z * a111;
+      real_t vvy   = b000 + x * b100 + y * b010 + z * b001 +
+                 x * x * b200 + y * y * b020 + z * z * b002 +
+                 x * y * b110 + x * z * b101 + y * z * b011 + x * y * z * b111;
+      real_t vvz   = c000 + x * c100 + y * c010 + z * c001 +
+                 x * x * c200 + y * y * c020 + z * z * c002 +
+                 x * y * c110 + x * z * c101 + y * z * c011 + x * y * z * c111;
+      m000 = press; // m000 is press, if drho is interpolated directly
+      WALBERLA_CHECK_FLOAT_EQUAL(press, real_c(0.0))
+      WALBERLA_CHECK_FLOAT_EQUAL(vvx, real_c(0.01))
+      WALBERLA_CHECK_FLOAT_EQUAL(vvy, real_c(0.0))
+      WALBERLA_CHECK_FLOAT_EQUAL(vvz, real_c(0.0))
+      ////////////////////////////////////////////////////////////////////////////////
+      //! - Set moments (second to sixth order) on destination node
+      //!
+      // linear combinations for second order moments
+      real_t mxxPyyPzz = m000;
+      real_t mxxMyy = -c2o3 * (a100 - b010 + kxxMyyAverage + c2o1 * a200 * x - b110 * x + a110 * y
+                             -c2o1 * b020 * y + a101 * z - b011 * z - b111 * x * z + a111 * y * z) * epsnew/ omegaF * (c1o1 + press);
+      real_t mxxMzz = -c2o3 * (a100 - c001 + kxxMzzAverage + c2o1 * a200 * x - c101 * x + a110 * y
+                             -c011 * y - c111 * x * y + a101 * z - c2o1 * c002 * z + a111 * y * z) * epsnew/ omegaF * (c1o1 + press);
+      m011 = -c1o3 * (b001 + c010 + kyzAverage + b101 * x + c110 * x + b011 * y + c2o1 * c020 * y
+                      + b111 * x * y + c2o1 * b002 * z + c011 * z + c111 * x * z) * epsnew / omegaF * (c1o1 + press);
+      m101 = -c1o3 * (a001 + c100 + kxzAverage + a101 * x + c2o1 * c200 * x + a011 * y + c110 * y
+                      + a111 * x * y + c2o1 * a002 * z + c101 * z + c111 * y * z) * epsnew / omegaF * (c1o1 + press);
+      m110 = -c1o3 * (a010 + b100 + kxyAverage + a110 * x + c2o1 * b200 * x + c2o1 * a020 * y
+                      + b110 * y + a011 * z + b101 * z + a111 * x * z + b111 * y * z) * epsnew / omegaF * (c1o1 + press);
+      m200 = c1o3 * (        mxxMyy +        mxxMzz + mxxPyyPzz) * useNEQ;
+      m020 = c1o3 * (-c2o1 * mxxMyy +        mxxMzz + mxxPyyPzz) * useNEQ;
+      m002 = c1o3 * (        mxxMyy - c2o1 * mxxMzz + mxxPyyPzz) * useNEQ;
+      // linear combinations for third order moments
+      m111 = c0o1;
+      real_t mxxyPyzz = c0o1;
+      real_t mxxyMyzz = c0o1;
+      real_t mxxzPyyz = c0o1;
+      real_t mxxzMyyz = c0o1;
+      real_t mxyyPxzz = c0o1;
+      real_t mxyyMxzz = c0o1;
+      m210 = ( mxxyMyzz + mxxyPyzz) * c1o2;
+      m012 = (-mxxyMyzz + mxxyPyzz) * c1o2;
+      m201 = ( mxxzMyyz + mxxzPyyz) * c1o2;
+      m021 = (-mxxzMyyz + mxxzPyyz) * c1o2;
+      m120 = ( mxyyMxzz + mxyyPxzz) * c1o2;
+      m102 = (-mxyyMxzz + mxyyPxzz) * c1o2;
+      // fourth order moments
+      m022 = m000 * c1o9;
+      m202 = m022;
+      m220 = m022;
+      // fifth order moments
+      // sixth order moment
+      m222 = m000 * c1o27;
+      real_t vxsq = vvx * vvx;
+      real_t vysq = vvy * vvy;
+      real_t vzsq = vvz * vvz;
+      ////////////////////////////////////////////////////////////////////////////////////
+      // X - Dir
+      backwardInverseChimeraWithK(m000, m100, m200, vvx, vxsq, c1o1, c1o1);
+      backwardChimera(            m010, m110, m210, vvx, vxsq);
+      backwardInverseChimeraWithK(m020, m120, m220, vvx, vxsq, c3o1, c1o3);
+      backwardChimera(            m001, m101, m201, vvx, vxsq);
+      backwardChimera(            m011, m111, m211, vvx, vxsq);
+      backwardChimera(            m021, m121, m221, vvx, vxsq);
+      backwardInverseChimeraWithK(m002, m102, m202, vvx, vxsq, c3o1, c1o3);
+      backwardChimera(            m012, m112, m212, vvx, vxsq);
+      backwardInverseChimeraWithK(m022, m122, m222, vvx, vxsq, c9o1, c1o9);
+      ////////////////////////////////////////////////////////////////////////////////////
+      // Y - Dir
+      backwardInverseChimeraWithK(m000, m010, m020, vvy, vysq, c6o1, c1o6);
+      backwardChimera(            m001, m011, m021, vvy, vysq);
+      backwardInverseChimeraWithK(m002, m012, m022, vvy, vysq, c18o1, c1o18);
+      backwardInverseChimeraWithK(m100, m110, m120, vvy, vysq, c3o2, c2o3);
+      backwardChimera(            m101, m111, m121, vvy, vysq);
+      backwardInverseChimeraWithK(m102, m112, m122, vvy, vysq, c9o2, c2o9);
+      backwardInverseChimeraWithK(m200, m210, m220, vvy, vysq, c6o1, c1o6);
+      backwardChimera(            m201, m211, m221, vvy, vysq);
+      backwardInverseChimeraWithK(m202, m212, m222, vvy, vysq, c18o1, c1o18);
+      ////////////////////////////////////////////////////////////////////////////////////
+      // Z - Dir
+      backwardInverseChimeraWithK(m000, m001, m002, vvz, vzsq, c36o1, c1o36);
+      backwardInverseChimeraWithK(m010, m011, m012, vvz, vzsq, c9o1,  c1o9);
+      backwardInverseChimeraWithK(m020, m021, m022, vvz, vzsq, c36o1, c1o36);
+      backwardInverseChimeraWithK(m100, m101, m102, vvz, vzsq, c9o1,  c1o9);
+      backwardInverseChimeraWithK(m110, m111, m112, vvz, vzsq, c9o4,  c4o9);
+      backwardInverseChimeraWithK(m120, m121, m122, vvz, vzsq, c9o1,  c1o9);
+      backwardInverseChimeraWithK(m200, m201, m202, vvz, vzsq, c36o1, c1o36);
+      backwardInverseChimeraWithK(m210, m211, m212, vvz, vzsq, c9o1,  c1o9);
+      backwardInverseChimeraWithK(m220, m221, m222, vvz, vzsq, c36o1, c1o36);
+      auto f = block->getData<PdfField_T>(ids_.pdfField);
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[0],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[0],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[0], 0) = f000;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[4],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[4],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[4], 4)= fP00;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[3],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[3],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[3], 3)= fM00;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[1],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[1],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[1], 1)= f0P0;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[2],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[2],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[2], 2)= f0M0;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[5],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[5],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[5], 5)= f00P;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[6],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[6],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[6], 6)= f00M;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[8],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[8],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[8], 8)= fPP0;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[9],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[9],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[9], 9)= fMM0;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[10],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[10],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[10], 10) = fPM0;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[7],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[7],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[7], 7)= fMP0;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[14],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[14],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[14], 14) = fP0P;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[17],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[17],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[17], 17) = fM0M;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[18],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[18],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[18], 18) = fP0M;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[13],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[13],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[13], 13) = fM0P;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[11],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[11],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[11], 11) = f0PP;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[16],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[16],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[16], 16) = f0MM;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[15],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[15],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[15], 15) = f0PM;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[12],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[12],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[12], 12) = f0MP;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[19],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[19],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[19], 19) = fPPP;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[20],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[20],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[20], 20) = fMPP;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[21],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[21],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[21], 21) = fPMP;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[22],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[22],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[22], 22) = fMMP;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[23],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[23],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[23], 23) = fPPM;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[24],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[24],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[24], 24) = fMPM;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[25],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[25],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[25], 25) = fPMM;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[26],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[26],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[26], 26) = fMMM;
+   }
+   void interpolateFC(Block* block, Cell& localCell, const real_t omegaC, const InterpolationCoefficients &coefficients)
+   {
+      const real_t epsnew = c2o1;
+      const real_t kxyAverage    = c0o1;
+      const real_t kyzAverage    = c0o1;
+      const real_t kxzAverage    = c0o1;
+      const real_t kxxMyyAverage = c0o1;
+      const real_t kxxMzzAverage = c0o1;
+      ////////////////////////////////////////////////////////////////////////////////////
+      //! - Set all moments to zero
+      //!
+      real_t m111 = real_c(0.0);
+      real_t m211 = real_c(0.0);
+      real_t m011 = real_c(0.0);
+      real_t m121 = real_c(0.0);
+      real_t m101 = real_c(0.0);
+      real_t m112 = real_c(0.0);
+      real_t m110 = real_c(0.0);
+      real_t m221 = real_c(0.0);
+      real_t m001 = real_c(0.0);
+      real_t m201 = real_c(0.0);
+      real_t m021 = real_c(0.0);
+      real_t m212 = real_c(0.0);
+      real_t m010 = real_c(0.0);
+      real_t m210 = real_c(0.0);
+      real_t m012 = real_c(0.0);
+      real_t m122 = real_c(0.0);
+      real_t m100 = real_c(0.0);
+      real_t m120 = real_c(0.0);
+      real_t m102 = real_c(0.0);
+      real_t m222 = real_c(0.0);
+      real_t m022 = real_c(0.0);
+      real_t m202 = real_c(0.0);
+      real_t m002 = real_c(0.0);
+      real_t m220 = real_c(0.0);
+      real_t m020 = real_c(0.0);
+      real_t m200 = real_c(0.0);
+      real_t m000 = real_c(0.0);
+      real_t& f000 = m111;
+      real_t& fP00 = m211;
+      real_t& fM00 = m011;
+      real_t& f0P0 = m121;
+      real_t& f0M0 = m101;
+      real_t& f00P = m112;
+      real_t& f00M = m110;
+      real_t& fPP0 = m221;
+      real_t& fMM0 = m001;
+      real_t& fPM0 = m201;
+      real_t& fMP0 = m021;
+      real_t& fP0P = m212;
+      real_t& fM0M = m010;
+      real_t& fP0M = m210;
+      real_t& fM0P = m012;
+      real_t& f0PP = m122;
+      real_t& f0MM = m100;
+      real_t& f0PM = m120;
+      real_t& f0MP = m102;
+      real_t& fPPP = m222;
+      real_t& fMPP = m022;
+      real_t& fPMP = m202;
+      real_t& fMMP = m002;
+      real_t& fPPM = m220;
+      real_t& fMPM = m020;
+      real_t& fPMM = m200;
+      real_t& fMMM = m000;
+      ////////////////////////////////////////////////////////////////////////////////
+      //! - Declare local variables for destination nodes
+      //!
+      real_t vvx, vvy, vvz, vxsq, vysq, vzsq;
+      real_t mxxPyyPzz, mxxMyy, mxxMzz, mxxyPyzz, mxxyMyzz, mxxzPyyz, mxxzMyyz, mxyyPxzz, mxyyMxzz;
+      real_t useNEQ = c1o1; // zero; //one;   //.... one = on ..... zero = off
+      real_t press;
+      ////////////////////////////////////////////////////////////////////////////////
+      //! - Set macroscopic values on destination node (zeroth and first order moments)
+      //!
+      press = coefficients.d000 - c2o1 * coefficients.LaplaceRho * c1o8;
+      vvx   = coefficients.a000;
+      vvy   = coefficients.b000;
+      vvz   = coefficients.c000;
+      m000 = press; // m000 is press, if drho is interpolated directly
+      vxsq = vvx * vvx;
+      vysq = vvy * vvy;
+      vzsq = vvz * vvz;
+      ////////////////////////////////////////////////////////////////////////////////
+      //! - Set moments (second to sixth order) on destination node
+      //!
+      // linear combinations for second order moments
+      mxxPyyPzz = m000;
+      mxxMyy = -c2o3 * ((coefficients.a100 - coefficients.b010) + kxxMyyAverage) * epsnew / omegaC * (c1o1 + press);
+      mxxMzz = -c2o3 * ((coefficients.a100 - coefficients.c001) + kxxMzzAverage) * epsnew / omegaC * (c1o1 + press);
+      m011 = -c1o3 * ((coefficients.b001 + coefficients.c010) + kyzAverage) * epsnew / omegaC * (c1o1 + press);
+      m101 = -c1o3 * ((coefficients.a001 + coefficients.c100) + kxzAverage) * epsnew / omegaC * (c1o1 + press);
+      m110 = -c1o3 * ((coefficients.a010 + coefficients.b100) + kxyAverage) * epsnew / omegaC * (c1o1 + press);
+      m200 = c1o3 * (        mxxMyy +        mxxMzz + mxxPyyPzz) * useNEQ;
+      m020 = c1o3 * (-c2o1 * mxxMyy +        mxxMzz + mxxPyyPzz) * useNEQ;
+      m002 = c1o3 * (        mxxMyy - c2o1 * mxxMzz + mxxPyyPzz) * useNEQ;
+      // linear combinations for third order moments
+      m111 = c0o1;
+      mxxyPyzz = c0o1;
+      mxxyMyzz = c0o1;
+      mxxzPyyz = c0o1;
+      mxxzMyyz = c0o1;
+      mxyyPxzz = c0o1;
+      mxyyMxzz = c0o1;
+      m210 = ( mxxyMyzz + mxxyPyzz) * c1o2;
+      m012 = (-mxxyMyzz + mxxyPyzz) * c1o2;
+      m201 = ( mxxzMyyz + mxxzPyyz) * c1o2;
+      m021 = (-mxxzMyyz + mxxzPyyz) * c1o2;
+      m120 = ( mxyyMxzz + mxyyPxzz) * c1o2;
+      m102 = (-mxyyMxzz + mxyyPxzz) * c1o2;
+      // fourth order moments
+      m022 = m000 * c1o9;
+      m202 = m022;
+      m220 = m022;
+      // fifth order moments
+      // sixth order moments
+      m222 = m000 * c1o27;
+      ////////////////////////////////////////////////////////////////////////////////////
+      //! - Chimera transform from central moments to well conditioned distributions as defined in Appendix J in
+      //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015),
+      //! DOI:10.1016/j.camwa.2015.05.001 ]</b></a> see also Eq. (88)-(96) in <a
+      //! href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040
+      //! ]</b></a>
+      //!
+      ////////////////////////////////////////////////////////////////////////////////////
+      // X - Dir
+      backwardInverseChimeraWithK(m000, m100, m200, vvx, vxsq, c1o1, c1o1);
+      backwardChimera(            m010, m110, m210, vvx, vxsq);
+      backwardInverseChimeraWithK(m020, m120, m220, vvx, vxsq, c3o1, c1o3);
+      backwardChimera(            m001, m101, m201, vvx, vxsq);
+      backwardChimera(            m011, m111, m211, vvx, vxsq);
+      backwardChimera(            m021, m121, m221, vvx, vxsq);
+      backwardInverseChimeraWithK(m002, m102, m202, vvx, vxsq, c3o1, c1o3);
+      backwardChimera(            m012, m112, m212, vvx, vxsq);
+      backwardInverseChimeraWithK(m022, m122, m222, vvx, vxsq, c9o1, c1o9);
+      ////////////////////////////////////////////////////////////////////////////////////
+      // Y - Dir
+      backwardInverseChimeraWithK(m000, m010, m020, vvy, vysq, c6o1, c1o6);
+      backwardChimera(            m001, m011, m021, vvy, vysq);
+      backwardInverseChimeraWithK(m002, m012, m022, vvy, vysq, c18o1, c1o18);
+      backwardInverseChimeraWithK(m100, m110, m120, vvy, vysq, c3o2, c2o3);
+      backwardChimera(            m101, m111, m121, vvy, vysq);
+      backwardInverseChimeraWithK(m102, m112, m122, vvy, vysq, c9o2, c2o9);
+      backwardInverseChimeraWithK(m200, m210, m220, vvy, vysq, c6o1, c1o6);
+      backwardChimera(            m201, m211, m221, vvy, vysq);
+      backwardInverseChimeraWithK(m202, m212, m222, vvy, vysq, c18o1, c1o18);
+      ////////////////////////////////////////////////////////////////////////////////////
+      // Z - Dir
+      backwardInverseChimeraWithK(m000, m001, m002, vvz, vzsq, c36o1, c1o36);
+      backwardInverseChimeraWithK(m010, m011, m012, vvz, vzsq, c9o1,  c1o9);
+      backwardInverseChimeraWithK(m020, m021, m022, vvz, vzsq, c36o1, c1o36);
+      backwardInverseChimeraWithK(m100, m101, m102, vvz, vzsq, c9o1,  c1o9);
+      backwardInverseChimeraWithK(m110, m111, m112, vvz, vzsq, c9o4,  c4o9);
+      backwardInverseChimeraWithK(m120, m121, m122, vvz, vzsq, c9o1,  c1o9);
+      backwardInverseChimeraWithK(m200, m201, m202, vvz, vzsq, c36o1, c1o36);
+      backwardInverseChimeraWithK(m210, m211, m212, vvz, vzsq, c9o1,  c1o9);
+      backwardInverseChimeraWithK(m220, m221, m222, vvz, vzsq, c36o1, c1o36);
+      auto f = block->getData<PdfField_T>(ids_.pdfField);
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[0],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[0],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[0], 0) = f000;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[4],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[4],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[4], 4)= fP00;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[3],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[3],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[3], 3)= fM00;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[1],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[1],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[1], 1)= f0P0;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[2],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[2],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[2], 2)= f0M0;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[5],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[5],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[5], 5)= f00P;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[6],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[6],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[6], 6)= f00M;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[8],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[8],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[8], 8)= fPP0;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[9],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[9],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[9], 9)= fMM0;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[10],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[10],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[10], 10) = fPM0;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[7],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[7],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[7], 7)= fMP0;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[14],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[14],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[14], 14) = fP0P;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[17],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[17],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[17], 17) = fM0M;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[18],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[18],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[18], 18) = fP0M;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[13],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[13],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[13], 13) = fM0P;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[11],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[11],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[11], 11) = f0PP;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[16],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[16],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[16], 16) = f0MM;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[15],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[15],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[15], 15) = f0PM;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[12],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[12],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[12], 12) = f0MP;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[19],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[19],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[19], 19) = fPPP;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[20],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[20],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[20], 20) = fMPP;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[21],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[21],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[21], 21) = fPMP;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[22],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[22],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[22], 22) = fMMP;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[23],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[23],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[23], 23) = fPPM;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[24],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[24],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[24], 24) = fMPM;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[25],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[25],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[25], 25) = fPMM;
+      f->get(localCell.x() + StorageSpecification_T::AccessorEVEN::readX[26],
+             localCell.y() + StorageSpecification_T::AccessorEVEN::readY[26],
+             localCell.z() + StorageSpecification_T::AccessorEVEN::readZ[26], 26) = fMMM;
+   }
+   void setCoefficients(Block * sender, Block * receiver, CellInterval& ci, real_t omega, stencil::Direction dir)
+   {
+      real_t f_coarse[27];
+      switch (dir) {
+      case stencil::N : {
+         for (auto it: ci)
+         {
+            Cell c0 = Cell(it.x() + cell_idx_c(-1), it.y() + cell_idx_c(-1), it.z() + cell_idx_c(-1));
+            getCoarsePDFs(sender, c0, f_coarse);
+            fineCoefficients[receiver][0].calculatePMP(f_coarse, omega);
+            fineCoefficients[receiver][1].calculateMMP(f_coarse, omega);
+            fineCoefficients[receiver][3].calculatePMM(f_coarse, omega);
+            fineCoefficients[receiver][4].calculateMMM(f_coarse, omega);
+            Cell c1 = Cell(it.x() + cell_idx_c(-1), it.y() + cell_idx_c(-1), it.z() + cell_idx_c(0));
+            getCoarsePDFs(sender, c1, f_coarse);
+            fineCoefficients[receiver][3].calculatePMP(f_coarse, omega);
+            fineCoefficients[receiver][4].calculateMMP(f_coarse, omega);
+            fineCoefficients[receiver][6].calculatePMM(f_coarse, omega);
+            fineCoefficients[receiver][7].calculateMMM(f_coarse, omega);
+            Cell c2 = Cell(it.x() + cell_idx_c(0), it.y() + cell_idx_c(-1), it.z() + cell_idx_c(0));
+            getCoarsePDFs(sender, c2, f_coarse);
+            fineCoefficients[receiver][4].calculatePMP(f_coarse, omega);
+            fineCoefficients[receiver][5].calculateMMP(f_coarse, omega);
+            fineCoefficients[receiver][7].calculatePMM(f_coarse, omega);
+            fineCoefficients[receiver][8].calculateMMM(f_coarse, omega);
+            Cell c3 = Cell(it.x() + cell_idx_c(0), it.y() + cell_idx_c(-1), it.z() + cell_idx_c(-1));
+            getCoarsePDFs(sender, c3, f_coarse);
+            fineCoefficients[receiver][1].calculatePMP(f_coarse, omega);
+            fineCoefficients[receiver][2].calculateMMP(f_coarse, omega);
+            fineCoefficients[receiver][4].calculatePMM(f_coarse, omega);
+            fineCoefficients[receiver][5].calculateMMM(f_coarse, omega);
+            Cell c4 = Cell(it.x() + cell_idx_c(-1), it.y() + cell_idx_c(0), it.z() + cell_idx_c(-1));
+            getCoarsePDFs(sender, c4, f_coarse);
+            fineCoefficients[receiver][0].calculatePPP(f_coarse, omega);
+            fineCoefficients[receiver][1].calculateMPP(f_coarse, omega);
+            fineCoefficients[receiver][3].calculatePPM(f_coarse, omega);
+            fineCoefficients[receiver][4].calculateMPM(f_coarse, omega);
+            Cell c5 = Cell(it.x() + cell_idx_c(-1), it.y() + cell_idx_c(0), it.z() + cell_idx_c(0));
+            getCoarsePDFs(sender, c5, f_coarse);
+            fineCoefficients[receiver][3].calculatePPP(f_coarse, omega);
+            fineCoefficients[receiver][4].calculateMPP(f_coarse, omega);
+            fineCoefficients[receiver][6].calculatePPM(f_coarse, omega);
+            fineCoefficients[receiver][7].calculateMPM(f_coarse, omega);
+            Cell c6 = Cell(it.x() + cell_idx_c(0), it.y() + cell_idx_c(0), it.z() + cell_idx_c(0));
+            getCoarsePDFs(sender, c6, f_coarse);
+            fineCoefficients[receiver][4].calculatePPP(f_coarse, omega);
+            fineCoefficients[receiver][5].calculateMPP(f_coarse, omega);
+            fineCoefficients[receiver][7].calculatePPM(f_coarse, omega);
+            fineCoefficients[receiver][8].calculateMPM(f_coarse, omega);
+            Cell c7 = Cell(it.x() + cell_idx_c(0), it.y() + cell_idx_c(0), it.z() + cell_idx_c(-1));
+            getCoarsePDFs(sender, c7, f_coarse);
+            fineCoefficients[receiver][1].calculatePPP(f_coarse, omega);
+            fineCoefficients[receiver][2].calculateMPP(f_coarse, omega);
+            fineCoefficients[receiver][4].calculatePPM(f_coarse, omega);
+            fineCoefficients[receiver][5].calculateMPM(f_coarse, omega);
+         }
+         break;
+      }
+      case stencil::NW : {
+         for (auto it: ci)
+         {
+            Cell c0 = Cell(it.x() + cell_idx_c(-1), it.y() + cell_idx_c(-1), it.z() + cell_idx_c(-1));
+            getCoarsePDFs(sender, c0, f_coarse);
+            fineCoefficients[receiver][2].calculatePMP(f_coarse, omega);
+            fineCoefficients[receiver][5].calculatePMM(f_coarse, omega);
+            Cell c1 = Cell(it.x() + cell_idx_c(-1), it.y() + cell_idx_c(-1), it.z() + cell_idx_c(0));
+            getCoarsePDFs(sender, c1, f_coarse);
+            fineCoefficients[receiver][5].calculatePMP(f_coarse, omega);
+            fineCoefficients[receiver][8].calculatePMM(f_coarse, omega);
+            Cell c4 = Cell(it.x() + cell_idx_c(-1), it.y() + cell_idx_c(0), it.z() + cell_idx_c(-1));
+            getCoarsePDFs(sender, c4, f_coarse);
+            fineCoefficients[receiver][2].calculatePPP(f_coarse, omega);
+            fineCoefficients[receiver][5].calculatePPM(f_coarse, omega);
+            Cell c5 = Cell(it.x() + cell_idx_c(-1), it.y() + cell_idx_c(0), it.z() + cell_idx_c(0));
+            getCoarsePDFs(sender, c5, f_coarse);
+            fineCoefficients[receiver][5].calculatePPP(f_coarse, omega);
+            fineCoefficients[receiver][8].calculatePPM(f_coarse, omega);
+         }
+         break;
+      }
+      case stencil::NE : {
+         for (auto it: ci)
+         {
+            Cell c2 = Cell(it.x() + cell_idx_c(0), it.y() + cell_idx_c(-1), it.z() + cell_idx_c(0));
+            getCoarsePDFs(sender, c2, f_coarse);
+            fineCoefficients[receiver][3].calculateMMP(f_coarse, omega);
+            fineCoefficients[receiver][6].calculateMMM(f_coarse, omega);
+            Cell c3 = Cell(it.x() + cell_idx_c(0), it.y() + cell_idx_c(-1), it.z() + cell_idx_c(-1));
+            getCoarsePDFs(sender, c3, f_coarse);
+            fineCoefficients[receiver][0].calculateMMP(f_coarse, omega);
+            fineCoefficients[receiver][3].calculateMMM(f_coarse, omega);
+            Cell c6 = Cell(it.x() + cell_idx_c(0), it.y() + cell_idx_c(0), it.z() + cell_idx_c(0));
+            getCoarsePDFs(sender, c6, f_coarse);
+            fineCoefficients[receiver][3].calculateMPP(f_coarse, omega);
+            fineCoefficients[receiver][6].calculateMPM(f_coarse, omega);
+            Cell c7 = Cell(it.x() + cell_idx_c(0), it.y() + cell_idx_c(0), it.z() + cell_idx_c(-1));
+            getCoarsePDFs(sender, c7, f_coarse);
+            fineCoefficients[receiver][0].calculateMPP(f_coarse, omega);
+            fineCoefficients[receiver][3].calculateMPM(f_coarse, omega);
+         }
+         break;
+      }
+      case stencil::TN : {
+         for (auto it: ci)
+         {
+            Cell c1 = Cell(it.x() + cell_idx_c(-1), it.y() + cell_idx_c(-1), it.z() + cell_idx_c(0));
+            getCoarsePDFs(sender, c1, f_coarse);
+            fineCoefficients[receiver][0].calculatePMM(f_coarse, omega);
+            fineCoefficients[receiver][1].calculateMMM(f_coarse, omega);
+            Cell c2 = Cell(it.x() + cell_idx_c(0), it.y() + cell_idx_c(-1), it.z() + cell_idx_c(0));
+            getCoarsePDFs(sender, c2, f_coarse);
+            fineCoefficients[receiver][1].calculatePMM(f_coarse, omega);
+            fineCoefficients[receiver][2].calculateMMM(f_coarse, omega);
+            Cell c5 = Cell(it.x() + cell_idx_c(-1), it.y() + cell_idx_c(0), it.z() + cell_idx_c(0));
+            getCoarsePDFs(sender, c5, f_coarse);
+            fineCoefficients[receiver][0].calculatePPM(f_coarse, omega);
+            fineCoefficients[receiver][1].calculateMPM(f_coarse, omega);
+            Cell c6 = Cell(it.x() + cell_idx_c(0), it.y() + cell_idx_c(0), it.z() + cell_idx_c(0));
+            getCoarsePDFs(sender, c6, f_coarse);
+            fineCoefficients[receiver][1].calculatePPM(f_coarse, omega);
+            fineCoefficients[receiver][2].calculateMPM(f_coarse, omega);
+         }
+         break;
+      }
+      case stencil::BN : {
+         for (auto it: ci)
+         {
+            Cell c0 = Cell(it.x() + cell_idx_c(-1), it.y() + cell_idx_c(-1), it.z() + cell_idx_c(-1));
+            getCoarsePDFs(sender, c0, f_coarse);
+            fineCoefficients[receiver][6].calculatePMP(f_coarse, omega);
+            fineCoefficients[receiver][7].calculateMMP(f_coarse, omega);
+            Cell c3 = Cell(it.x() + cell_idx_c(0), it.y() + cell_idx_c(-1), it.z() + cell_idx_c(-1));
+            getCoarsePDFs(sender, c3, f_coarse);
+            fineCoefficients[receiver][7].calculatePMP(f_coarse, omega);
+            fineCoefficients[receiver][8].calculateMMP(f_coarse, omega);
+            Cell c4 = Cell(it.x() + cell_idx_c(-1), it.y() + cell_idx_c(0), it.z() + cell_idx_c(-1));
+            getCoarsePDFs(sender, c4, f_coarse);
+            fineCoefficients[receiver][6].calculatePPP(f_coarse, omega);
+            fineCoefficients[receiver][7].calculateMPP(f_coarse, omega);
+            Cell c7 = Cell(it.x() + cell_idx_c(0), it.y() + cell_idx_c(0), it.z() + cell_idx_c(-1));
+            getCoarsePDFs(sender, c7, f_coarse);
+            fineCoefficients[receiver][7].calculatePPP(f_coarse, omega);
+            fineCoefficients[receiver][8].calculateMPP(f_coarse, omega);
+         }
+         break;
+      }
+      case stencil::TNE : {
+         for (auto it: ci)
+         {
+            Cell c2 = Cell(it.x() + cell_idx_c(0), it.y() + cell_idx_c(-1), it.z() + cell_idx_c(0));
+            getCoarsePDFs(sender, c2, f_coarse);
+            fineCoefficients[receiver][0].calculateMMM(f_coarse, omega);
+            Cell c6 = Cell(it.x() + cell_idx_c(0), it.y() + cell_idx_c(0), it.z() + cell_idx_c(0));
+            getCoarsePDFs(sender, c6, f_coarse);
+            fineCoefficients[receiver][0].calculateMPM(f_coarse, omega);
+         }
+         break;
+      }
+      case stencil::TNW : {
+         for (auto it: ci)
+         {
+            Cell c1 = Cell(it.x() + cell_idx_c(-1), it.y() + cell_idx_c(-1), it.z() + cell_idx_c(0));
+            getCoarsePDFs(sender, c1, f_coarse);
+            fineCoefficients[receiver][2].calculatePMM(f_coarse, omega);
+            Cell c5 = Cell(it.x() + cell_idx_c(-1), it.y() + cell_idx_c(0), it.z() + cell_idx_c(0));
+            getCoarsePDFs(sender, c5, f_coarse);
+            fineCoefficients[receiver][2].calculatePPM(f_coarse, omega);
+         }
+         break;
+      }
+      case stencil::BNE : {
+         for (auto it: ci)
+         {
+            Cell c3 = Cell(it.x() + cell_idx_c(0), it.y() + cell_idx_c(-1), it.z() + cell_idx_c(-1));
+            getCoarsePDFs(sender, c3, f_coarse);
+            fineCoefficients[receiver][6].calculateMMP(f_coarse, omega);
+            Cell c7 = Cell(it.x() + cell_idx_c(0), it.y() + cell_idx_c(0), it.z() + cell_idx_c(-1));
+            getCoarsePDFs(sender, c7, f_coarse);
+            fineCoefficients[receiver][6].calculateMPP(f_coarse, omega);
+         }
+         break;
+      }
+      case stencil::BNW : {
+         for (auto it: ci)
+         {
+            Cell c0 = Cell(it.x() + cell_idx_c(-1), it.y() + cell_idx_c(-1), it.z() + cell_idx_c(-1));
+            getCoarsePDFs(sender, c0, f_coarse);
+            fineCoefficients[receiver][8].calculatePMP(f_coarse, omega);
+            Cell c4 = Cell(it.x() + cell_idx_c(-1), it.y() + cell_idx_c(0), it.z() + cell_idx_c(-1));
+            getCoarsePDFs(sender, c4, f_coarse);
+            fineCoefficients[receiver][8].calculatePPP(f_coarse, omega);
+         }
+         break;
+      }default:
+         WALBERLA_ABORT("This should not happen");
+      }
+   }
+   void coarseToFine()
+   {
+      const real_t omegaC = omegaVector[0];
+      const real_t omegaF = omegaVector[1];
+      for(auto t: mapping_)
+      {
+         auto [coarseBlock, fineBlock, dir, ci] = t;
+         setCoefficients(coarseBlock, fineBlock, ci, omegaC, dir);
+      }
+      for(auto block: blocksPerLevel_[1])
+      {
+         const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex(stencil::Direction::S);
+         if (!(block->neighborhoodSectionHasLargerBlock(neighborIdx))) continue;
+         auto momentSets = fineCoefficients[block];
+         for(size_t i = 0; i < 3; ++i)
+         {
+            for(size_t j = 0; j < 3; ++j)
+            {
+               size_t index = 3 * i + j;
+               InterpolationCoefficients coefficients;
+               momentSets[index].calculateCoefficients(coefficients);
+               Vector3<real_t> mmm(-real_c(0.25), -real_c(0.25), -real_c(0.25));
+               Cell c0(-1 + 2*cell_idx_c(j), -2, -1 + 2*cell_idx_c(i));
+               interpolateCF(block, c0, mmm, omegaF, coefficients);
+               Vector3<real_t> mmp(-real_c(0.25), -real_c(0.25), real_c(0.25));
+               Cell c1(-1 + 2*cell_idx_c(j), -2, 0 + 2*cell_idx_c(i));
+               interpolateCF(block, c1, mmp, omegaF, coefficients);
+               Vector3<real_t> pmp(real_c(0.25), -real_c(0.25), real_c(0.25));
+               Cell c2(0 + 2*cell_idx_c(j), -2, 0 + 2*cell_idx_c(i));
+               interpolateCF(block, c2, pmp, omegaF, coefficients);
+               Vector3<real_t> pmm(real_c(0.25), -real_c(0.25), -real_c(0.25));
+               Cell c3(0 + 2*cell_idx_c(j), -2, -1 + 2*cell_idx_c(i));
+               interpolateCF(block, c3, pmm, omegaF, coefficients);
+               Vector3<real_t> mpm(-real_c(0.25), real_c(0.25), -real_c(0.25));
+               Cell c4(-1 + 2*cell_idx_c(j), -1, -1 + 2*cell_idx_c(i));
+               interpolateCF(block, c4, mpm, omegaF, coefficients);
+               Vector3<real_t> mpp(-real_c(0.25), real_c(0.25), real_c(0.25));
+               Cell c5(-1 + 2*cell_idx_c(j), -1, 0 + 2*cell_idx_c(i));
+               interpolateCF(block, c5, mpp, omegaF, coefficients);
+               Vector3<real_t> ppp(real_c(0.25), real_c(0.25), real_c(0.25));
+               Cell c6(0 + 2*cell_idx_c(j), -1, 0 + 2*cell_idx_c(i));
+               interpolateCF(block, c6, ppp, omegaF, coefficients);
+               Vector3<real_t> ppm(real_c(0.25), real_c(0.25), -real_c(0.25));
+               Cell c7(0 + 2*cell_idx_c(j), -1, -1 + 2*cell_idx_c(i));
+               interpolateCF(block, c7, ppm, omegaF, coefficients);
+            }
+         }
+      }
+   }
+   void fineToCoarse()
+   {
+      const real_t omegaC = omegaVector[0];
+      const real_t omegaF = omegaVector[1];
+      for( auto it = blocks_->begin(); it != blocks_->end(); ++it )
+      {
+         if(blocks_->getLevel(*it.get()) != 0)
+            continue;
+         for(cell_idx_t z = 0; z < 4; z++)
+         {
+            for(cell_idx_t x = 0; x < 4; x++)
+            {
+               real_t f_fine[27];
+               MomentsOnSourceNodeSet moments;
+               Vector3<real_t> p(real_c(x), real_c(3.0), real_c(z));
+               blocks_->transformBlockLocalToGlobal(p, *it);
+               Vector3<real_t> cellCenter(p[0] + real_c(0.5), p[1] + real_c(1.5), p[2] + real_c(0.5));
+               Block* fineBlock = dynamic_cast< Block* >(blocks_->getBlock(p[0], p[1] + real_c(2.0), p[2]));
+               Vector3<real_t> mmm(-real_c(0.25), -real_c(0.25), -real_c(0.25));
+               Cell c0 = blocks_->getBlockLocalCell(*fineBlock, cellCenter + mmm);
+               getFinePDFs(fineBlock, c0, f_fine);
+               moments.calculateMMM(f_fine, omegaF);
+               Vector3<real_t> mmp(-real_c(0.25), -real_c(0.25), real_c(0.25));
+               Cell c1 = blocks_->getBlockLocalCell(*fineBlock, cellCenter + mmp);
+               getFinePDFs(fineBlock, c1, f_fine);
+               moments.calculateMMP(f_fine, omegaF);
+               Vector3<real_t> pmp(real_c(0.25), -real_c(0.25), real_c(0.25));
+               Cell c2 = blocks_->getBlockLocalCell(*fineBlock, cellCenter + pmp);
+               getFinePDFs(fineBlock, c2, f_fine);
+               moments.calculatePMP(f_fine, omegaF);
+               Vector3<real_t> pmm(real_c(0.25), -real_c(0.25), -real_c(0.25));
+               Cell c3 = blocks_->getBlockLocalCell(*fineBlock, cellCenter + pmm);
+               getFinePDFs(fineBlock, c3, f_fine);
+               moments.calculatePMM(f_fine, omegaF);
+               Vector3<real_t> mpm(-real_c(0.25), real_c(0.25), -real_c(0.25));
+               Cell c4 = blocks_->getBlockLocalCell(*fineBlock, cellCenter + mpm);
+               getFinePDFs(fineBlock, c4, f_fine);
+               moments.calculateMPM(f_fine, omegaF);
+               Vector3<real_t> mpp(-real_c(0.25), real_c(0.25), real_c(0.25));
+               Cell c5 = blocks_->getBlockLocalCell(*fineBlock, cellCenter + mpp);
+               getFinePDFs(fineBlock, c5, f_fine);
+               moments.calculateMPP(f_fine, omegaF);
+               Vector3<real_t> ppp(real_c(0.25), real_c(0.25), real_c(0.25));
+               Cell c6 = blocks_->getBlockLocalCell(*fineBlock, cellCenter + ppp);
+               getFinePDFs(fineBlock, c6, f_fine);
+               moments.calculatePPP(f_fine, omegaF);
+               Vector3<real_t> ppm(real_c(0.25), real_c(0.25), -real_c(0.25));
+               Cell c7 = blocks_->getBlockLocalCell(*fineBlock, cellCenter + ppm);
+               getFinePDFs(fineBlock, c7, f_fine);
+               moments.calculatePPM(f_fine, omegaF);
+               InterpolationCoefficients coefficients;
+               moments.calculateCoefficients(coefficients);
+               Block* coarseBlock = dynamic_cast< Block* >(it.get());
+               Cell coarseLocalCell(x, cell_idx_c(3), z);
+               interpolateFC(coarseBlock, coarseLocalCell, omegaF, coefficients);
+            }
+         }
+      }
+   }
+ private:
+   std::shared_ptr< StructuredBlockForest > blocks_;
+   IDs ids_;
+   std::vector<real_t> omegaVector;
+   std::vector<std::vector<Block *>> blocksPerLevel_;
+   std::map< Block*, std::vector< MomentsOnSourceNodeSet > > fineCoefficients;
+   std::vector<std::tuple<Block*, Block*, stencil::Direction, CellInterval>> mapping_;
+};
+class Timestep
+{
+ public:
+   Timestep(std::shared_ptr< StructuredBlockForest >& blocks, const IDs& ids,
+            StreamCollide & streamCollide, BoundaryCollection_T& boundaryCollection, Interpolation& interpolation,
+            std::shared_ptr< PackInfo_T >& packInfo, std::shared_ptr< NonUniformBufferedScheme<CommunicationStencil_T> > communication)
+      : blocks_(blocks), ids_(ids), streamCollide_(streamCollide), boundaryCollection_(boundaryCollection), interpolation_(interpolation),
+        packInfo_(packInfo), communication_(communication)
+   {
+      maxLevel_ = blocks_->getDepth();
+      for (uint_t level = 0; level <= maxLevel_; level++)
+      {
+         std::vector<Block *> tmp;
+         blocks_->getBlocks(tmp, level);
+         blocksPerLevel_.push_back(tmp);
+      }
+   }
+   void commLocal(const uint_t level)
+   {
+      for( auto it = blocks_->begin(); it != blocks_->end(); ++it )
+      {
+         if(blocks_->getLevel(*it.get()) != level)
+            continue;
+         auto* sender = dynamic_cast< Block* >(it.get());
+         for( auto dir = Stencil_T::beginNoCenter(); dir != Stencil_T::end(); ++dir )
+         {
+            const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex(*dir);
+            if (!(sender->neighborhoodSectionHasEquallySizedBlock(neighborIdx)))
+               continue;
+            const BlockID & receiverId = sender->getNeighborId( neighborIdx, uint_t(0) );
+            auto receiver = dynamic_cast< Block * >( blocks_->getBlock(receiverId) );
+            const PdfField_T * srcField = sender->getData< PdfField_T >( ids_.pdfField );
+            PdfField_T * dstField = receiver->getData< PdfField_T >( ids_.pdfField );
+            CellInterval srcRegion;
+            CellInterval dstRegion;
+            cell_idx_t gls = 1; //skipsThroughCoarseBlock(sender, *dir) ? 2 : 1;
+            srcField->getSliceBeforeGhostLayer(*dir, srcRegion, gls, false);
+            dstField->getGhostRegion(stencil::inverseDir[*dir], dstRegion, gls, false);
+            WALBERLA_ASSERT_EQUAL(srcRegion.xSize(), dstRegion.xSize())
+            WALBERLA_ASSERT_EQUAL(srcRegion.ySize(), dstRegion.ySize())
+            WALBERLA_ASSERT_EQUAL(srcRegion.zSize(), dstRegion.zSize())
+            //WALBERLA_LOG_DEVEL_VAR(srcRegion)
+            //WALBERLA_LOG_DEVEL_VAR(dstRegion)
+            auto srcIter = srcRegion.begin();
+            auto dstIter = dstRegion.begin();
+            while (srcIter != srcRegion.end())
+            {
+               for( uint_t f = 0; f < Stencil_T::d_per_d_length[*dir]; ++f )
+               {
+                  dstField->get(*dstIter, Stencil_T::idx[ Stencil_T::d_per_d[*dir][f] ]) = srcField->get(*srcIter, Stencil_T::idx[ Stencil_T::d_per_d[*dir][f] ]);
+               }
+               ++srcIter;
+               ++dstIter;
+            }
+            WALBERLA_ASSERT( srcIter == srcRegion.end() )
+            WALBERLA_ASSERT( dstIter == dstRegion.end() )
+         }
+      }
+   }
+   inline bool skipsThroughCoarseBlock(const Block* block, const stencil::Direction dir) const
+   {
+      Vector3< cell_idx_t > dirVec(stencil::cx[dir], stencil::cy[dir], stencil::cz[dir]);
+      bool coarseBlockFound = false;
+      forEachSubdirectionCancel(dirVec, [&](Vector3< cell_idx_t > subdir) {
+         coarseBlockFound =
+            coarseBlockFound || block->neighborhoodSectionHasLargerBlock(
+                                   blockforest::getBlockNeighborhoodSectionIndex(subdir[0], subdir[1], subdir[2]));
+         return !coarseBlockFound;
+      });
+      return coarseBlockFound;
+   }
+   void ghostLayerPropagation(Block * block)
+   {
+      auto pdfField = block->getData<PdfField_T>(ids_.pdfField);
+      for(auto it = CommunicationStencil_T ::beginNoCenter(); it != CommunicationStencil_T ::end(); ++it){
+         uint_t nSecIdx = blockforest::getBlockNeighborhoodSectionIndex(*it);
+         // Propagate on ghost layers shadowing coarse or no blocks
+         if(block->neighborhoodSectionHasLargerBlock(nSecIdx)){
+            CellInterval ci;
+            pdfField->getGhostRegion(*it, ci, 1);
+            streamCollide_.streamCollide(block, ci);
+         }
+      }
+   }
+   void timestep()
+   {
+      for(auto b: blocksPerLevel_[1])
+      {
+         ghostLayerPropagation(b);
+         streamCollide_(b);
+         boundaryCollection_(b);
+      }
+      commLocal(1);
+      for(auto b: blocksPerLevel_[1])
+      {
+         streamCollide_(b);
+         boundaryCollection_(b);
+      }
+      commLocal(1);
+//      for(auto b: blocksPerLevel_[0])
+//      {
+//         streamCollide_(b);
+//         boundaryCollection_(b);
+//      }
+//      commLocal(0);
+      //interpolation_.coarseToFine();
+      // interpolation_.fineToCoarse();
+   }
+   void operator()(){ timestep(); };
+ private:
+   std::shared_ptr< StructuredBlockForest > blocks_;
+   IDs ids_;
+   StreamCollide & streamCollide_;
+   BoundaryCollection_T & boundaryCollection_;
+   Interpolation & interpolation_;
+   std::shared_ptr< PackInfo_T > packInfo_;
+   std::shared_ptr< NonUniformBufferedScheme<CommunicationStencil_T> > communication_;
+   uint_t maxLevel_;
+   std::vector<std::vector<Block *>> blocksPerLevel_;
+   uint_t t{0};
+}; // class RefinementSelection
+int main(int argc, char** argv)
+{
+   mpi::Environment env(argc, argv);
+   shared_ptr< Config > config = make_shared< Config >();
+   config->readParameterFile(argv[1]);
+   logging::configureLogging(config);
+   // read parameters
+   auto parameters           = config->getOneBlock("Parameters");
+   auto loggingParameters    = config->getOneBlock("Logging");
+   auto domainParameters     = config->getOneBlock("DomainSetup");
+   auto EvaluationParameters = config->getOneBlock("Evaluation");
+   const real_t omega              = parameters.getParameter< real_t >("omega");
+   const real_t maxLatticeVelocity = parameters.getParameter< real_t >("maxLatticeVelocity");
+   const uint_t timesteps          = parameters.getParameter< uint_t >("timesteps") + uint_c(1);
+   const uint_t refinementLevels   = domainParameters.getParameter< uint_t >("refinementLevels");
+   Setup setup;
+   setup.blocks        = domainParameters.getParameter< Vector3< uint_t > >("blocks");
+   setup.cellsPerBlock = domainParameters.getParameter< Vector3< uint_t > >("cellsPerBlock");
+   setup.cells = Vector3< uint_t >(setup.blocks[0] * setup.cellsPerBlock[0], setup.blocks[1] * setup.cellsPerBlock[1],
+                                   setup.blocks[2] * setup.cellsPerBlock[2]);
+   setup.periodic       = domainParameters.getParameter< Vector3< bool > >("periodic");
+   setup.numGhostLayers = refinementLevels > 0 ? uint_c(3) : uint_c(1);
+   const uint_t valuesPerCell = (uint_c(2) * Stencil_T::Q + VectorField_T ::F_SIZE + uint_c(2) * ScalarField_T::F_SIZE);
+   const uint_t sizePerValue  = sizeof(PdfField_T::value_type);
+   const memory_t memoryPerCell = memory_t(valuesPerCell * sizePerValue + uint_c(1));
+   blockforest::RefinementSelectionFunctions refinementSelectionFunctions;
+   RefinementSelection refinementSelection(refinementLevels);
+   refinementSelectionFunctions.add(refinementSelection);
+   bool writeSetupForestAndReturn = loggingParameters.getParameter< bool >("writeSetupForestAndReturn", false);
+   if (uint_c(MPIManager::instance()->numProcesses()) > 1) writeSetupForestAndReturn = false;
+   if (writeSetupForestAndReturn)
+   {
+      std::string sbffile = "Channel.bfs";
+      std::ostringstream infoString;
+      infoString << "You have selected the option of just creating the block structure (= domain decomposition) and "
+                    "saving the result to file\n"
+                    "by specifying the output file name \'"
+                 << sbffile << "\' AND also specifying \'saveToFile\'.\n";
+      if (MPIManager::instance()->numProcesses() > 1)
+         WALBERLA_ABORT(infoString.str() << "In this mode you need to start " << argv[0] << " with just one process!")
+      WALBERLA_LOG_INFO_ON_ROOT(infoString.str() << "Creating the block structure ...")
+      const uint_t numberProcesses = domainParameters.getParameter< uint_t >("numberProcesses");
+      shared_ptr< SetupBlockForest > sforest =
+         createSetupBlockForest(refinementSelectionFunctions, setup, numberProcesses, memoryPerCell);
+      sforest->writeVTKOutput("domain_decomposition");
+      sforest->saveToFile(sbffile.c_str());
+      logging::Logging::printFooterOnStream();
+      return EXIT_SUCCESS;
+   }
+   auto blocks = createStructuredBlockForest(refinementSelectionFunctions, setup, memoryPerCell);
+   IDs ids;
+   const StorageSpecification_T StorageSpec = StorageSpecification_T();
+   ids.pdfField      = lbm_generated::addPdfFieldToStorage(blocks, "pdfs", StorageSpec, setup.numGhostLayers, field::fzyx);
+   ids.pdfFieldTmp   = lbm_generated::addPdfFieldToStorage(blocks, "pdfs tmp", StorageSpec, setup.numGhostLayers, field::fzyx);
+   ids.velocityField = field::addToStorage< VectorField_T >(blocks, "vel", real_c(0.0), field::fzyx, setup.numGhostLayers);
+   ids.densityField = field::addToStorage< ScalarField_T >(blocks, "density", real_c(1.0), field::fzyx, setup.numGhostLayers);
+   ids.flagField    = field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field", uint_c(3));
+//   for (auto& block : *blocks)
+//   {
+//      const uint_t level = blocks->getLevel(block);
+//      if(level == 0)
+//      {
+//         auto velocityField = block.getData<VectorField_T>(ids.velocityField);
+//
+//         for (cell_idx_t ctr_2 = 0; ctr_2 < velocityField->zSize(); ++ctr_2)
+//         {
+//            for(cell_idx_t ctr_1 = 0; ctr_1 < velocityField->ySize(); ++ctr_1)
+//            {
+//               for (cell_idx_t ctr_0 = 0; ctr_0 < velocityField->xSize(); ++ctr_0)
+//               {
+//                  Cell cell(ctr_0, ctr_1, ctr_2);
+//                  blocks->transformBlockLocalToGlobalCell(cell, block);
+//                  blocks->mapToPeriodicDomain(cell);
+//                  velocityField->get(ctr_0, ctr_1, ctr_2, 0) = real_c(cell.x());
+//                  velocityField->get(ctr_0, ctr_1, ctr_2, 1) = real_c(cell.y());
+//                  velocityField->get(ctr_0, ctr_1, ctr_2, 2) = real_c(cell.z());
+//               }
+//            }
+//         }
+//      }
+//   }
+      for (auto& block : *blocks)
+      {
+         const uint_t level = blocks->getLevel(block);
+         if(level == 0)
+         {
+            auto velocityField = block.getData<VectorField_T>(ids.velocityField);
+            for (cell_idx_t ctr_2 = 0; ctr_2 < velocityField->zSize(); ++ctr_2)
+            {
+               for(cell_idx_t ctr_1 = 0; ctr_1 < velocityField->ySize(); ++ctr_1)
+               {
+                  for (cell_idx_t ctr_0 = 0; ctr_0 < velocityField->xSize(); ++ctr_0)
+                  {
+                     velocityField->get(ctr_0, ctr_1, ctr_2, 0) = maxLatticeVelocity;
+                  }
+               }
+            }
+         }
+      }
+   SweepCollection_T sweepCollection(blocks, ids.pdfField, ids.densityField, ids.velocityField, omega);
+   for (auto& block : *blocks)
+   {
+      sweepCollection.initialise(&block, cell_idx_c(setup.numGhostLayers - uint_c(1)));
+   }
+   for (auto& block : *blocks)
+   {
+      if(blocks->getLevel(block) == 0)
+      {
+         sweepCollection.streamCollide(&block);
+      }
+   }
+   const FlagUID fluidFlagUID("Fluid");
+   auto boundariesConfig = config->getBlock("Boundaries");
+   geometry::initBoundaryHandling< FlagField_T >(*blocks, ids.flagField, boundariesConfig);
+   geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, ids.flagField, fluidFlagUID, cell_idx_c(0));
+   BoundaryCollection_T boundaryCollection(blocks, ids.flagField, ids.pdfField, fluidFlagUID, maxLatticeVelocity);
+   WALBERLA_LOG_INFO_ON_ROOT("Setting up communication...")
+   auto communication      = std::make_shared< NonUniformBufferedScheme< CommunicationStencil_T > >(blocks);
+   auto nonUniformPackInfo = lbm_generated::setupNonuniformPdfCommunication< PdfField_T >(blocks, ids.pdfField);
+   communication->addPackInfo(nonUniformPackInfo);
+   SweepTimeloop timeloop(blocks->getBlockStorage(), timesteps);
+   auto VTKWriter                 = config->getOneBlock("VTKWriter");
+   const uint_t vtkWriteFrequency = VTKWriter.getParameter< uint_t >("vtkWriteFrequency");
+   const bool writeVelocity       = VTKWriter.getParameter< bool >("velocity");
+   const bool writeDensity        = VTKWriter.getParameter< bool >("density");
+   // const bool writeAverageFields  = VTKWriter.getParameter< bool >("averageFields", false);
+   const bool writeFlag = VTKWriter.getParameter< bool >("flag");
+   // const bool writeOnlySlice      = VTKWriter.getParameter< bool >("writeOnlySlice", true);
+   const bool amrFileFormat     = VTKWriter.getParameter< bool >("amrFileFormat", false);
+   const bool oneFilePerProcess = VTKWriter.getParameter< bool >("oneFilePerProcess", false);
+   if (vtkWriteFrequency > 0)
+   {
+      auto vtkOutput =
+         vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_Channel", "simulation_step",
+                                        false, true, true, false, 0, amrFileFormat, oneFilePerProcess);
+      vtkOutput->addBeforeFunction([&]() {
+         for (auto& block : *blocks)
+         {
+            sweepCollection.calculateMacroscopicParameters(&block);
+         }
+      });
+      if (writeVelocity)
+      {
+         auto velWriter = make_shared< field::VTKWriter< VectorField_T, float32 > >(ids.velocityField, "velocity");
+         vtkOutput->addCellDataWriter(velWriter);
+      }
+      if (writeDensity)
+      {
+         auto densityWriter = make_shared< field::VTKWriter< ScalarField_T, float32 > >(ids.densityField, "density");
+         vtkOutput->addCellDataWriter(densityWriter);
+      }
+      if (writeFlag)
+      {
+         auto flagWriter = make_shared< field::VTKWriter< FlagField_T > >(ids.flagField, "flag");
+         vtkOutput->addCellDataWriter(flagWriter);
+      }
+      timeloop.addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
+   }
+   StreamCollide streamCollide(blocks, ids, omega);
+   Interpolation interpolation(blocks, ids, omega);
+   Timestep timeloopFunction(blocks, ids, streamCollide, boundaryCollection, interpolation, nonUniformPackInfo, communication);
+   timeloop.addFuncBeforeTimeStep(timeloopFunction, "Refinement Cycle");
+   const real_t remainingTimeLoggerFrequency =
+      loggingParameters.getParameter< real_t >("remainingTimeLoggerFrequency", 3.0); // in seconds
+   if (uint_c(remainingTimeLoggerFrequency) > 0)
+   {
+      timeloop.addFuncAfterTimeStep(
+         timing::RemainingTimeLogger(timeloop.getNrOfTimeSteps(), remainingTimeLoggerFrequency),
+         "remaining time logger");
+   }
+   const uint_t evaluationCheckFrequency = EvaluationParameters.getParameter< uint_t >("evaluationCheckFrequency");
+   const bool evaluationLogToStream      = EvaluationParameters.getParameter< bool >("logToStream");
+   const bool evaluationLogToFile        = EvaluationParameters.getParameter< bool >("logToFile");
+   const std::string evaluationFilename  = EvaluationParameters.getParameter< std::string >("filename");
+   std::shared_ptr< AccuracyEvaluation > evaluation =
+      std::make_shared< AccuracyEvaluation >(blocks, ids, sweepCollection, maxLatticeVelocity, evaluationCheckFrequency,
+                                             evaluationLogToStream, evaluationLogToFile);
+   timeloop.addFuncBeforeTimeStep(SharedFunctor< AccuracyEvaluation >(evaluation), "evaluation");
+   //////////////////////
+   /// RUN SIMULATION ///
+   //////////////////////
+   const lbm_generated::PerformanceEvaluation< FlagField_T > performance(blocks, ids.flagField, fluidFlagUID);
+   field::CellCounter< FlagField_T > fluidCells(blocks, ids.flagField, fluidFlagUID);
+   fluidCells();
+   WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << blocks->getNumberOfBlocks())
+   for (uint_t level = 0; level <= refinementLevels; level++)
+   {
+      WALBERLA_LOG_INFO_ON_ROOT("Level " << level << " Blocks: " << blocks->getNumberOfBlocks(level))
+   }
+   WALBERLA_LOG_INFO_ON_ROOT("Starting Simulation")
+   WcTimingPool timeloopTiming;
+   WcTimer simTimer;
+   WALBERLA_MPI_BARRIER()
+#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+   WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+   WALBERLA_GPU_CHECK(gpuPeekAtLastError())
+#endif
+   simTimer.start();
+   timeloop.run(timeloopTiming);
+#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+   WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+   WALBERLA_GPU_CHECK(gpuPeekAtLastError())
+#endif
+   WALBERLA_MPI_BARRIER()
+   simTimer.end();
+   WALBERLA_LOG_INFO_ON_ROOT("Simulation finished")
+   real_t time = simTimer.max();
+   WALBERLA_MPI_SECTION() { walberla::mpi::reduceInplace(time, walberla::mpi::MAX); }
+   performance.logResultOnRoot(timesteps, time);
+   const auto reducedTimeloopTiming = timeloopTiming.getReduced();
+   WALBERLA_LOG_RESULT_ON_ROOT("Time loop timing:\n" << *reducedTimeloopTiming)
+   printResidentMemoryStatistics();
+   return EXIT_SUCCESS;
+}
--- a/apps/showcases/Channel/Channel.prm
+++ b/apps/showcases/Channel/Channel.prm
+Parameters
+{
+	omega              1.4;
+	timesteps          10;
+	maxLatticeVelocity 0.01;
+	remainingTimeLoggerFrequency 0; // in seconds
+	vtkWriteFrequency 0;
+}
+DomainSetup
+{
+    blocks        <  2,  2,  1 >;
+    cellsPerBlock < 4, 4, 4 >;
+    periodic      <  1,  0,  1 >;
+    refinementLevels 1;
+    numberProcesses 1; // This is for load balancing, overwritten if more than one proc is used
+}
+Boundaries 
+{
+    Border { direction S;    walldistance -1;  flag NoSlip; }
+    Border { direction N;    walldistance -1;  flag UBB; }
+}
+VTKWriter
+{
+    vtkWriteFrequency 1;
+    velocity true;
+    density true;
+    averageFields true;
+    flag false;
+    writeOnlySlice false;
+    amrFileFormat false;
+    oneFilePerProcess false;
+}
+Logging
+{
+    logLevel info;  // info progress detail tracing
+    writeSetupForestAndReturn false;
+    remainingTimeLoggerFrequency 60; // in seconds
+}
+Evaluation
+{
+    evaluationCheckFrequency 100;
+    logToStream true;
+    logToFile true;
+    filename Channel.txt;
+}
--- a/apps/showcases/Channel/Channel.py
+++ b/apps/showcases/Channel/Channel.py
+import sympy as sp
+from pystencils import Target
+from pystencils import fields
+from lbmpy.advanced_streaming.utility import get_timesteps
+from lbmpy.boundaries import NoSlip, UBB
+from lbmpy.creationfunctions import create_lb_method, create_lb_collision_rule
+from lbmpy import LBMConfig, LBMOptimisation, Stencil, Method, LBStencil
+from pystencils_walberla import CodeGeneration, generate_info_header
+from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator
+import warnings
+warnings.filterwarnings("ignore")
+with CodeGeneration() as ctx:
+    target = Target.CPU  # Target.GPU if ctx.cuda else Target.CPU
+    data_type = "float64" if ctx.double_accuracy else "float32"
+    pdf_dtype = "float64"
+    streaming_pattern = 'pull'
+    timesteps = get_timesteps(streaming_pattern)
+    omega = sp.symbols("omega")
+    stencil = LBStencil(Stencil.D3Q27)
+    dim = stencil.D
+    pdfs, pdfs_tmp = fields(f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {pdf_dtype}[3D]", layout='fzyx')
+    velocity_field, density_field = fields(f"velocity({dim}), density(1) : {data_type}[{dim}D]", layout='fzyx')
+    macroscopic_fields = {'density': density_field, 'velocity': velocity_field}
+    lbm_config = LBMConfig(stencil=stencil, method=Method.TRT, relaxation_rate=omega,
+                           streaming_pattern=streaming_pattern, compressible=True)
+    lbm_opt = LBMOptimisation(cse_global=False, field_layout='fzyx')
+    method = create_lb_method(lbm_config=lbm_config)
+    collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt)
+    no_slip = lbm_boundary_generator(class_name='NoSlip', flag_uid='NoSlip',
+                                     boundary_object=NoSlip())
+    ubb = lbm_boundary_generator(class_name='UBB', flag_uid='UBB',
+                                 boundary_object=UBB([sp.Symbol("u_x"), 0, 0], data_type=data_type))
+    generate_lbm_package(ctx, name="Channel",
+                         collision_rule=collision_rule,
+                         lbm_config=lbm_config, lbm_optimisation=lbm_opt,
+                         nonuniform=True, boundaries=[no_slip, ubb],
+                         macroscopic_fields=macroscopic_fields, data_type=data_type)
+    generate_info_header(ctx, 'ChannelHeader')
No results found