Compare revisions

bc7d45bd · bc7d45bd · bc7d45bd · bc7d45bd · bc7d45bd · bc7d45bd
--- a/apps/benchmarks/PhaseFieldAllenCahn/CMakeLists.txt
+++ b/apps/benchmarks/PhaseFieldAllenCahn/CMakeLists.txt
@@ -15,10 +15,10 @@ waLBerla_generate_target_from_python(NAME BenchmarkPhaseFieldCodeGen
 if (WALBERLA_BUILD_WITH_GPU_SUPPORT )
    waLBerla_add_executable(NAME benchmark_multiphase
            FILES benchmark_multiphase.cpp InitializerFunctions.cpp multiphase_codegen.py
-            DEPENDS blockforest core gpu field postprocessing python_coupling lbm geometry timeloop gui BenchmarkPhaseFieldCodeGen)
+            DEPENDS blockforest core gpu field postprocessing python_coupling lbm_generated geometry timeloop gui BenchmarkPhaseFieldCodeGen)
 else ()
    waLBerla_add_executable(NAME benchmark_multiphase
            FILES benchmark_multiphase.cpp InitializerFunctions.cpp multiphase_codegen.py
-            DEPENDS blockforest core field postprocessing python_coupling lbm geometry timeloop gui BenchmarkPhaseFieldCodeGen)
+            DEPENDS blockforest core field postprocessing python_coupling lbm_generated geometry timeloop gui BenchmarkPhaseFieldCodeGen)
 endif (WALBERLA_BUILD_WITH_GPU_SUPPORT )

--- a/apps/benchmarks/PhaseFieldAllenCahn/benchmark.py
+++ b/apps/benchmarks/PhaseFieldAllenCahn/benchmark.py
@@ -8,6 +8,11 @@ from waLBerla.tools.config import block_decomposition
 import sys
 from math import prod

+try:
+    import machinestate as ms
+except ImportError:
+    ms = None
+

 def domain_block_size_ok(block_size, total_mem, gls=1, q_phase=15, q_hydro=27, size_per_value=8):
    """Checks if a single block of given size fits into GPU memory"""
@@ -20,7 +25,9 @@ def domain_block_size_ok(block_size, total_mem, gls=1, q_phase=15, q_hydro=27, s


 class Scenario:
-    def __init__(self, time_step_strategy, cuda_block_size, cells_per_block=(256, 256, 256),
+    def __init__(self, time_step_strategy,
+                 cuda_block_size,
+                 cells_per_block=(256, 256, 256),
                 cuda_enabled_mpi=False):
        # output frequencies
        self.vtkWriteFrequency = 0
@@ -89,6 +96,14 @@ class Scenario:
        data['compile_flags'] = wlb.build_info.compiler_flags
        data['walberla_version'] = wlb.build_info.version
        data['build_machine'] = wlb.build_info.build_machine
+        if ms:
+            state = ms.MachineState(extended=False, anonymous=True)
+            state.generate()                        # generate subclasses
+            state.update()                          # read information
+            data["MachineState"] = str(state.get())
+        else:
+            print("MachineState module is not available. MachineState was not saved")
+
        sequenceValuesToScalars(data)

        df = pd.DataFrame.from_records([data])
@@ -101,43 +116,19 @@ class Scenario:
 def benchmark():
    scenarios = wlb.ScenarioManager()

-    gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 8))
+    gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 40))
    gpu_mem = gpu_mem_gb * (2 ** 30)

-    block_size = (256, 256, 256)
+    block_size = (320, 320, 320)
+    cuda_enabled_mpi = True

    if not domain_block_size_ok(block_size, gpu_mem):
        wlb.log_info_on_root(f"Block size {block_size} would exceed GPU memory. Skipping.")
    else:
-        scenarios.add(Scenario(time_step_strategy='normal', cuda_block_size=(256, 1, 1), cells_per_block=block_size))
-
+        scenarios.add(Scenario(time_step_strategy='normal',
+                               cuda_block_size=(128, 1, 1),
+                               cells_per_block=block_size,
+                               cuda_enabled_mpi=cuda_enabled_mpi))

-def kernel_benchmark():
-    scenarios = wlb.ScenarioManager()
-
-    gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 8))
-    gpu_mem = gpu_mem_gb * (2 ** 30)

-    block_sizes = [(i, i, i) for i in (32, 64, 128, 256, 320, 384, 448, 512)]
-
-    cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1),
-                   (32, 2, 1), (64, 2, 1), (128, 2, 1),
-                   (32, 4, 1), (64, 4, 1),
-                   (32, 4, 2),
-                   (32, 8, 1),
-                   (16, 16, 1)]
-
-    for time_step_strategy in ['phase_only', 'hydro_only', 'kernel_only', 'normal']:
-        for cuda_block in cuda_blocks:
-            for block_size in block_sizes:
-                if not domain_block_size_ok(block_size, gpu_mem):
-                    wlb.log_info_on_root(f"Block size {block_size} would exceed GPU memory. Skipping.")
-                    continue
-                scenario = Scenario(time_step_strategy=time_step_strategy,
-                                    cuda_block_size=cuda_block,
-                                    cells_per_block=block_size)
-                scenarios.add(scenario)
-
-
-# benchmark()
-kernel_benchmark()
+benchmark()
--- a/apps/benchmarks/PhaseFieldAllenCahn/benchmark_multiphase.cpp
+++ b/apps/benchmarks/PhaseFieldAllenCahn/benchmark_multiphase.cpp
@@ -29,6 +29,7 @@
 #include "field/vtk/VTKWriter.h"

 #include "geometry/InitBoundaryHandling.h"
+#include "lbm_generated/evaluation/PerformanceEvaluation.h"

 #include "python_coupling/CreateConfig.h"
 #include "python_coupling/DictWrapper.h"
@@ -78,14 +79,10 @@ int main(int argc, char** argv)
      logging::configureLogging(config);
      shared_ptr< StructuredBlockForest > blocks = blockforest::createUniformBlockGridFromConfig(config);

-      Vector3< uint_t > cellsPerBlock =
-         config->getBlock("DomainSetup").getParameter< Vector3< uint_t > >("cellsPerBlock");
      // Reading parameters
      auto parameters                    = config->getOneBlock("Parameters");
      const std::string timeStepStrategy = parameters.getParameter< std::string >("timeStepStrategy", "normal");
      const uint_t timesteps             = parameters.getParameter< uint_t >("timesteps", uint_c(50));
-      const real_t remainingTimeLoggerFrequency =
-         parameters.getParameter< real_t >("remainingTimeLoggerFrequency", real_c(3.0));
      const uint_t scenario = parameters.getParameter< uint_t >("scenario", uint_c(1));
      const uint_t warmupSteps  = parameters.getParameter< uint_t >("warmupSteps", uint_t(2));

@@ -102,6 +99,7 @@ int main(int argc, char** argv)
         gpu::addGPUFieldToStorage< VelocityField_T >(blocks, vel_field, "velocity field on GPU", true);
      BlockDataID phase_field_gpu =
         gpu::addGPUFieldToStorage< PhaseField_T >(blocks, phase_field, "phase field on GPU", true);
+      BlockDataID phase_field_tmp = gpu::addGPUFieldToStorage< PhaseField_T >(blocks, phase_field, "temporary phasefield", true);
 #else
      BlockDataID lb_phase_field =
         field::addToStorage< PdfField_phase_T >(blocks, "lb phase field", real_c(0.0), field::fzyx);
@@ -109,6 +107,7 @@ int main(int argc, char** argv)
         field::addToStorage< PdfField_hydro_T >(blocks, "lb velocity field", real_c(0.0), field::fzyx);
      BlockDataID vel_field   = field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx);
      BlockDataID phase_field = field::addToStorage< PhaseField_T >(blocks, "phase", real_c(0.0), field::fzyx);
+      BlockDataID phase_field_tmp = field::addToStorage< PhaseField_T >(blocks, "phase tmp", real_c(0.0), field::fzyx);
 #endif

      if (timeStepStrategy != "phase_only" && timeStepStrategy != "hydro_only" && timeStepStrategy != "kernel_only")
@@ -139,47 +138,80 @@ int main(int argc, char** argv)
      pystencils::initialize_velocity_based_distributions init_g(lb_velocity_field_gpu, vel_field_gpu);

      pystencils::phase_field_LB_step phase_field_LB_step(
-         lb_phase_field_gpu, phase_field_gpu, vel_field_gpu, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2]);
+         lb_phase_field_gpu, phase_field_gpu, phase_field_tmp, vel_field_gpu, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2]);
      pystencils::hydro_LB_step hydro_LB_step(lb_velocity_field_gpu, phase_field_gpu, vel_field_gpu, gpuBlockSize[0],
                                              gpuBlockSize[1], gpuBlockSize[2]);
 #else
      pystencils::initialize_phase_field_distributions init_h(lb_phase_field, phase_field, vel_field);
      pystencils::initialize_velocity_based_distributions init_g(lb_velocity_field, vel_field);
-      pystencils::phase_field_LB_step phase_field_LB_step(lb_phase_field, phase_field, vel_field);
+      pystencils::phase_field_LB_step phase_field_LB_step(lb_phase_field, phase_field, phase_field_tmp, vel_field);
      pystencils::hydro_LB_step hydro_LB_step(lb_velocity_field, phase_field, vel_field);
 #endif

 // add communication
 #if defined(WALBERLA_BUILD_WITH_CUDA)
-      const bool cudaEnabledMpi = parameters.getParameter< bool >("cudaEnabledMpi", false);
-      auto Comm_velocity_based_distributions =
-         make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi);
-      auto generatedPackInfo_velocity_based_distributions =
-         make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field_gpu);
-      Comm_velocity_based_distributions->addPackInfo(generatedPackInfo_velocity_based_distributions);
+      const bool gpuEnabledMpi = parameters.getParameter< bool >("cudaEnabledMpi", false);
+      const int streamLowPriority  = 0;
+      const int streamHighPriority = 0;
+      auto defaultStream     = gpu::StreamRAII::newPriorityStream(streamLowPriority);
+      auto innerOuterStreams = gpu::ParallelStreams(streamHighPriority);
+
+      auto generatedPackInfo_phase_field_distributions = make_shared< lbm::PackInfo_phase_field_distributions>(lb_phase_field_gpu);
+      auto generatedPackInfo_velocity_based_distributions = make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field_gpu);
      auto generatedPackInfo_phase_field = make_shared< pystencils::PackInfo_phase_field >(phase_field_gpu);
-      Comm_velocity_based_distributions->addPackInfo(generatedPackInfo_phase_field);

-      auto Comm_phase_field_distributions =
-         make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi);
-      auto generatedPackInfo_phase_field_distributions =
-         make_shared< lbm::PackInfo_phase_field_distributions >(lb_phase_field_gpu);
-      Comm_phase_field_distributions->addPackInfo(generatedPackInfo_phase_field_distributions);
-#else
+      auto UniformGPUSchemeVelocityBasedDistributions = make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, gpuEnabledMpi, false);
+      auto UniformGPUSchemePhaseFieldDistributions = make_shared< gpu::communication::UniformGPUScheme< Full_Stencil_T > >(blocks, gpuEnabledMpi, false);
+      auto UniformGPUSchemePhaseField = make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, gpuEnabledMpi, false, 65432);
+
+      UniformGPUSchemeVelocityBasedDistributions->addPackInfo(generatedPackInfo_velocity_based_distributions);
+      UniformGPUSchemePhaseFieldDistributions->addPackInfo(generatedPackInfo_phase_field_distributions);
+      UniformGPUSchemePhaseField->addPackInfo(generatedPackInfo_phase_field);
+
+      auto Comm_velocity_based_distributions_start = std::function< void() >([&]() { UniformGPUSchemeVelocityBasedDistributions->startCommunication(); });
+      auto Comm_velocity_based_distributions_wait = std::function< void() >([&]() { UniformGPUSchemeVelocityBasedDistributions->wait(); });

-      blockforest::communication::UniformBufferedScheme< Stencil_hydro_T > Comm_velocity_based_distributions(blocks);
+      auto Comm_phase_field_distributions_start = std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->startCommunication(); });
+      auto Comm_phase_field_distributions_wait = std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->wait(); });

+      auto Comm_phase_field = std::function< void() >([&]() { UniformGPUSchemePhaseField->communicate(); });
+
+      auto swapPhaseField = std::function< void(IBlock *) >([&](IBlock * b)
+        {
+           auto phaseField    = b->getData< gpu::GPUField<real_t> >(phase_field_gpu);
+           auto phaseFieldTMP = b->getData< gpu::GPUField<real_t> >(phase_field_tmp);
+           phaseField->swapDataPointers(phaseFieldTMP);
+        });
+
+#else
+
+      auto generatedPackInfo_phase_field_distributions = make_shared< lbm::PackInfo_phase_field_distributions>(lb_phase_field);
+      auto generatedPackInfo_velocity_based_distributions = make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field);
      auto generatedPackInfo_phase_field = make_shared< pystencils::PackInfo_phase_field >(phase_field);
-      auto generatedPackInfo_velocity_based_distributions =
-         make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field);

-      Comm_velocity_based_distributions.addPackInfo(generatedPackInfo_phase_field);
-      Comm_velocity_based_distributions.addPackInfo(generatedPackInfo_velocity_based_distributions);
+      auto UniformGPUSchemeVelocityBasedDistributions = make_shared< blockforest::communication::UniformBufferedScheme< Full_Stencil_T > >(blocks);
+      auto UniformGPUSchemePhaseFieldDistributions = make_shared< blockforest::communication::UniformBufferedScheme< Full_Stencil_T > >(blocks);
+      auto UniformGPUSchemePhaseField = make_shared< blockforest::communication::UniformBufferedScheme< Full_Stencil_T > >(blocks, 65432);
+
+      UniformGPUSchemeVelocityBasedDistributions->addPackInfo(generatedPackInfo_velocity_based_distributions);
+      UniformGPUSchemePhaseFieldDistributions->addPackInfo(generatedPackInfo_phase_field_distributions);
+      UniformGPUSchemePhaseField->addPackInfo(generatedPackInfo_phase_field);

-      blockforest::communication::UniformBufferedScheme< Stencil_hydro_T > Comm_phase_field_distributions(blocks);
-      auto generatedPackInfo_phase_field_distributions =
-         make_shared< lbm::PackInfo_phase_field_distributions >(lb_phase_field);
-      Comm_phase_field_distributions.addPackInfo(generatedPackInfo_phase_field_distributions);
+      auto Comm_velocity_based_distributions_start = std::function< void() >([&]() { UniformGPUSchemeVelocityBasedDistributions->startCommunication(); });
+      auto Comm_velocity_based_distributions_wait = std::function< void() >([&]() { UniformGPUSchemeVelocityBasedDistributions->wait(); });
+
+      auto Comm_phase_field_distributions = std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->communicate(); });
+      auto Comm_phase_field_distributions_start = std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->startCommunication(); });
+      auto Comm_phase_field_distributions_wait = std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->wait(); });
+
+      auto Comm_phase_field = std::function< void() >([&]() { UniformGPUSchemePhaseField->communicate(); });
+
+      auto swapPhaseField = std::function< void(IBlock *) >([&](IBlock * b)
+        {
+           auto phaseField    = b->getData< PhaseField_T >(phase_field);
+           auto phaseFieldTMP = b->getData< PhaseField_T >(phase_field_tmp);
+           phaseField->swapDataPointers(phaseFieldTMP);
+        });
 #endif

      BlockDataID const flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field");
@@ -201,99 +233,37 @@ int main(int argc, char** argv)
            init_h(&block);
            init_g(&block);
         }
+         WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+         WALBERLA_GPU_CHECK(gpuPeekAtLastError())
+         WALBERLA_MPI_BARRIER()
         WALBERLA_LOG_INFO_ON_ROOT("initialization of the distributions done")
      }

+      SweepTimeloop timeloop(blocks->getBlockStorage(), timesteps);
 #if defined(WALBERLA_BUILD_WITH_CUDA)
-      int const streamLowPriority  = 0;
-      int const streamHighPriority = 0;
-      auto defaultStream     = gpu::StreamRAII::newPriorityStream(streamLowPriority);
-      auto innerOuterStreams = gpu::ParallelStreams(streamHighPriority);
-#endif
+      timeloop.add() << BeforeFunction(Comm_velocity_based_distributions_start, "Start Hydro PDFs Communication")
+                     << Sweep(phase_field_LB_step.getSweep(defaultStream), "Phase LB Step")
+                     << AfterFunction(Comm_velocity_based_distributions_wait, "Wait Hydro PDFs Communication");

-      auto timeLoop = make_shared< SweepTimeloop >(blocks->getBlockStorage(), timesteps);
-#if defined(WALBERLA_BUILD_WITH_CUDA)
-      auto normalTimeStep = [&]() {
-         Comm_velocity_based_distributions->startCommunication();
-         for (auto& block : *blocks)
-            phase_field_LB_step(&block, defaultStream);
-         Comm_velocity_based_distributions->wait();
+      timeloop.add() << BeforeFunction(Comm_phase_field_distributions_start, "Start Phase PDFs Communication")
+                     << Sweep(hydro_LB_step.getSweep(defaultStream), "Hydro LB Step");
+      timeloop.add() << Sweep(swapPhaseField, "Swap PhaseField")
+                     << AfterFunction(Comm_phase_field_distributions_wait, "Wait Phase PDFs Communication");
+
+      timeloop.addFuncAfterTimeStep(Comm_phase_field, "Communication Phase field");

-         Comm_phase_field_distributions->startCommunication();
-         for (auto& block : *blocks)
-            hydro_LB_step(&block, defaultStream);
-         Comm_phase_field_distributions->wait();
-      };
-      auto phase_only = [&]() {
-         for (auto& block : *blocks)
-            phase_field_LB_step(&block);
-      };
-      auto hydro_only = [&]() {
-         for (auto& block : *blocks)
-            hydro_LB_step(&block);
-      };
-      auto without_comm = [&]() {
-         for (auto& block : *blocks)
-            phase_field_LB_step(&block);
-         for (auto& block : *blocks)
-            hydro_LB_step(&block);
-      };
 #else
-      auto normalTimeStep = [&]() {
-            Comm_velocity_based_distributions.startCommunication();
-            for (auto& block : *blocks)
-               phase_field_LB_step(&block);
-            Comm_velocity_based_distributions.wait();
-
-            Comm_phase_field_distributions.startCommunication();
-            for (auto& block : *blocks)
-               hydro_LB_step(&block);
-            Comm_phase_field_distributions.wait();
-      };
-      auto phase_only = [&]() {
-         for (auto& block : *blocks)
-            phase_field_LB_step(&block);
-      };
-      auto hydro_only = [&]() {
-         for (auto& block : *blocks)
-            hydro_LB_step(&block);
-      };
-      auto without_comm = [&]() {
-         for (auto& block : *blocks)
-            phase_field_LB_step(&block);
-         for (auto& block : *blocks)
-            hydro_LB_step(&block);
-      };
-#endif
-      std::function< void() > timeStep;
-      if (timeStepStrategy == "phase_only")
-      {
-         timeStep = std::function< void() >(phase_only);
-         WALBERLA_LOG_INFO_ON_ROOT("started only phasefield step without communication for benchmarking")
-      }
-      else if (timeStepStrategy == "hydro_only")
-      {
-         timeStep = std::function< void() >(hydro_only);
-         WALBERLA_LOG_INFO_ON_ROOT("started only hydro step without communication for benchmarking")
-      }
-      else if (timeStepStrategy == "kernel_only")
-      {
-         timeStep = std::function< void() >(without_comm);
-         WALBERLA_LOG_INFO_ON_ROOT("started complete phasefield model without communication for benchmarking")
-      }
-      else
-      {
-         timeStep = std::function< void() >(normalTimeStep);
-         WALBERLA_LOG_INFO_ON_ROOT("normal timestep with overlapping")
-      }
+      timeloop.add() << BeforeFunction(Comm_velocity_based_distributions_start, "Start Hydro PDFs Communication")
+                     << Sweep(phase_field_LB_step.getSweep(), "Phase LB Step")
+                     << AfterFunction(Comm_velocity_based_distributions_wait, "Wait Hydro PDFs Communication");

-      timeLoop->add() << BeforeFunction(timeStep) << Sweep([](IBlock*) {}, "time step");
+      timeloop.add() << BeforeFunction(Comm_phase_field_distributions_start, "Start Phase PDFs Communication")
+                     << Sweep(hydro_LB_step.getSweep(), "Hydro LB Step");
+      timeloop.add() << Sweep(swapPhaseField, "Swap PhaseField")
+                     << AfterFunction(Comm_phase_field_distributions_wait, "Wait Phase PDFs Communication");

-      // remaining time logger
-      if (remainingTimeLoggerFrequency > 0)
-         timeLoop->addFuncAfterTimeStep(
-            timing::RemainingTimeLogger(timeLoop->getNrOfTimeSteps(), remainingTimeLoggerFrequency),
-            "remaining time logger");
+      timeloop.addFuncAfterTimeStep(Comm_phase_field, "Communication Phase field");
+#endif

      uint_t const vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0);
      if (vtkWriteFrequency > 1)
@@ -307,40 +277,60 @@ int main(int argc, char** argv)
         auto phaseWriter = make_shared< field::VTKWriter< PhaseField_T > >(phase_field, "phase");
         vtkOutput->addCellDataWriter(phaseWriter);

-         timeLoop->addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
+         timeloop.addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
      }

+      lbm_generated::PerformanceEvaluation< FlagField_T > const performance(blocks, flagFieldID, fluidFlagUID);
+      field::CellCounter< FlagField_T > fluidCells(blocks, flagFieldID, fluidFlagUID);
+      fluidCells();
+
+      WALBERLA_LOG_INFO_ON_ROOT("Multiphase benchmark with " << fluidCells.numberOfCells() << " fluid cells")
+      WALBERLA_LOG_INFO_ON_ROOT("Running " << warmupSteps << " timesteps to warm up the system")
+
      for (uint_t i = 0; i < warmupSteps; ++i)
-         timeLoop->singleStep();
+         timeloop.singleStep();
+
+      WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+      WALBERLA_GPU_CHECK(gpuPeekAtLastError())
+      WALBERLA_MPI_BARRIER()
+      WALBERLA_LOG_INFO_ON_ROOT("Warmup timesteps done")

-      timeLoop->setCurrentTimeStepToZero();
+      timeloop.setCurrentTimeStepToZero();
+      WALBERLA_MPI_BARRIER()
      WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps")
+      WcTimingPool timeloopTiming;
      WcTimer simTimer;
 #if defined(WALBERLA_BUILD_WITH_CUDA)
-      cudaDeviceSynchronize();
+      WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
 #endif
      simTimer.start();
-      timeLoop->run();
+      timeloop.run(timeloopTiming);
 #if defined(WALBERLA_BUILD_WITH_CUDA)
-      cudaDeviceSynchronize();
+      WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+      WALBERLA_GPU_CHECK(gpuPeekAtLastError())
 #endif
+      WALBERLA_MPI_BARRIER()
      simTimer.end();
      WALBERLA_LOG_INFO_ON_ROOT("Simulation finished")
-      auto time            = real_c(simTimer.last());
-      auto nrOfCells       = real_c(cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2]);
-      auto mlupsPerProcess = nrOfCells * real_c(timesteps) / time * 1e-6;
-      WALBERLA_LOG_RESULT_ON_ROOT("MLUPS per process: " << mlupsPerProcess)
+      double time = simTimer.max();
+      WALBERLA_MPI_SECTION() { walberla::mpi::reduceInplace(time, walberla::mpi::MAX); }
+      performance.logResultOnRoot(timesteps, time);
+
+      const auto reducedTimeloopTiming = timeloopTiming.getReduced();
+      WALBERLA_LOG_RESULT_ON_ROOT("Time loop timing:\n" << *reducedTimeloopTiming)
+
+      WALBERLA_LOG_RESULT_ON_ROOT("MLUPS per process: " << performance.mlupsPerProcess(timesteps, time))
      WALBERLA_LOG_RESULT_ON_ROOT("Time per time step: " << time / real_c(timesteps) << " s")
      WALBERLA_ROOT_SECTION()
      {
         python_coupling::PythonCallback pythonCallbackResults("results_callback");
         if (pythonCallbackResults.isCallable())
         {
-            pythonCallbackResults.data().exposeValue("mlupsPerProcess", mlupsPerProcess);
+            pythonCallbackResults.data().exposeValue("mlupsPerProcess", performance.mlupsPerProcess(timesteps, time));
            pythonCallbackResults.data().exposeValue("stencil_phase", StencilNamePhase);
            pythonCallbackResults.data().exposeValue("stencil_hydro", StencilNameHydro);
            #if defined(WALBERLA_BUILD_WITH_CUDA)
-               pythonCallbackResults.data().exposeValue("cuda_enabled_mpi", cudaEnabledMpi);
+               pythonCallbackResults.data().exposeValue("cuda_enabled_mpi", gpuEnabledMpi);
            #endif
            // Call Python function to report results
            pythonCallbackResults();

--- a/apps/benchmarks/PhaseFieldAllenCahn/multiphase_codegen.py
+++ b/apps/benchmarks/PhaseFieldAllenCahn/multiphase_codegen.py
@@ -19,7 +19,7 @@ with CodeGeneration() as ctx:
    field_type = "float64" if ctx.double_accuracy else "float32"

    stencil_phase = LBStencil(Stencil.D3Q15)
-    stencil_hydro = LBStencil(Stencil.D3Q27)
+    stencil_hydro = LBStencil(Stencil.D3Q19)
    assert (stencil_phase.D == stencil_hydro.D)

    ########################
@@ -76,13 +76,13 @@ with CodeGeneration() as ctx:
                                 delta_equilibrium=False,
                                 force=sp.symbols(f"F_:{stencil_phase.D}"), velocity_input=u,
                                 weighted=True, relaxation_rates=rates,
-                                 output={'density': C_tmp}, kernel_type='stream_pull_collide')
+                                 output={'density': C_tmp})
    method_phase = create_lb_method(lbm_config=lbm_config_phase)

    lbm_config_hydro = LBMConfig(stencil=stencil_hydro, method=Method.MRT, compressible=False,
                                 weighted=True, relaxation_rate=omega,
                                 force=sp.symbols(f"F_:{stencil_hydro.D}"),
-                                 output={'velocity': u}, kernel_type='collide_stream_push')
+                                 output={'velocity': u})
    method_hydro = create_lb_method(lbm_config=lbm_config_hydro)

    # create the kernels for the initialization of the g and h field
@@ -137,7 +137,8 @@ with CodeGeneration() as ctx:
    sweep_params = {'block_size': sweep_block_size}

    stencil_typedefs = {'Stencil_phase_T': stencil_phase,
-                        'Stencil_hydro_T': stencil_hydro}
+                        'Stencil_hydro_T': stencil_hydro,
+                        'Full_Stencil_T': LBStencil(Stencil.D3Q27)}
    field_typedefs = {'PdfField_phase_T': h,
                      'PdfField_hydro_T': g,
                      'VelocityField_T': u,
@@ -156,7 +157,7 @@ with CodeGeneration() as ctx:
        generate_sweep(ctx, 'initialize_velocity_based_distributions', g_updates, target=Target.CPU)

        generate_sweep(ctx, 'phase_field_LB_step', phase_field_LB_step,
-                       field_swaps=[(h, h_tmp), (C, C_tmp)],
+                       field_swaps=[(h, h_tmp)],
                       inner_outer_split=True,
                       cpu_vectorize_info=cpu_vec,
                       target=Target.CPU)
@@ -172,7 +173,7 @@ with CodeGeneration() as ctx:
                              streaming_pattern='pull', target=Target.CPU)

        generate_lb_pack_info(ctx, 'PackInfo_velocity_based_distributions', stencil_hydro, g,
-                              streaming_pattern='push', target=Target.CPU)
+                              streaming_pattern='pull', target=Target.CPU)

        generate_pack_info_for_field(ctx, 'PackInfo_phase_field', C, target=Target.CPU)

@@ -183,7 +184,7 @@ with CodeGeneration() as ctx:
                       g_updates, target=Target.GPU)

        generate_sweep(ctx, 'phase_field_LB_step', phase_field_LB_step,
-                       field_swaps=[(h, h_tmp), (C, C_tmp)],
+                       field_swaps=[(h, h_tmp)],
                       target=Target.GPU,
                       gpu_indexing_params=sweep_params,
                       varying_parameters=vp)
@@ -198,7 +199,7 @@ with CodeGeneration() as ctx:
                              streaming_pattern='pull', target=Target.GPU)

        generate_lb_pack_info(ctx, 'PackInfo_velocity_based_distributions', stencil_hydro, g,
-                              streaming_pattern='push', target=Target.GPU)
+                              streaming_pattern='pull', target=Target.GPU)

        generate_pack_info_for_field(ctx, 'PackInfo_phase_field', C, target=Target.GPU)


--- a/apps/benchmarks/SchaeferTurek/SchaeferTurek.cpp
+++ b/apps/benchmarks/SchaeferTurek/SchaeferTurek.cpp
@@ -37,6 +37,7 @@
 #include "core/debug/CheckFunctions.h"
 #include "core/debug/Debug.h"
 #include "core/debug/TestSubsystem.h"
+#include "core/logging/Initialization.h"
 #include "core/logging/Logging.h"
 #include "core/math/Constants.h"
 #include "core/math/Sample.h"
@@ -2889,10 +2890,9 @@ int main( int argc, char **argv )
                      "             ('fzyx') data layout!" << std::endl;
      }
      return EXIT_SUCCESS;
-   }   
+   }

   logging::Logging::printHeaderOnStream();
-   //WALBERLA_ROOT_SECTION() { logging::Logging::instance()->setLogLevel( logging::Logging::PROGRESS ); }

 #ifdef _OPENMP
   if( std::getenv( "OMP_NUM_THREADS" ) == nullptr )
@@ -2903,6 +2903,7 @@ int main( int argc, char **argv )
   // open configuration file

   shared_ptr< Config > config = make_shared< Config >();
   config->readParameterFile( argv[1] );
+   logging::configureLogging(config); // after parsing: an empty Config carries no logging settings to apply

   Config::BlockHandle configBlock = config->getBlock( "SchaeferTurek" );

--- a/apps/benchmarks/SchaeferTurek/input.dat
+++ b/apps/benchmarks/SchaeferTurek/input.dat
@@ -29,31 +29,31 @@ SchaeferTurek
   //////////////////////////////
   
   memoryPerCell      153; // in bytes
-   processMemoryLimit 2048; // in MiB !
+   processMemoryLimit 4048; // in MiB !
    
-   yzBlocks 1; // blocks in y- and z-direction [2D: only 1 block in z-direction]
+   yzBlocks 4; // blocks in y- and z-direction [2D: only 1 block in z-direction]
   // The number of blocks in x-direction is calculated from: H, L, yzBlocks, xCells, and yzCells.
   
-   xCells  40; // number of cells in x-direction for each block
-   yzCells 40; // number of cells in y- and z-direction for each block [2D: only affects y-direction]
+   xCells  64; // number of cells in x-direction for each block
+   yzCells 16; // number of cells in y- and z-direction for each block [2D: only affects y-direction]
   
   H 0.41; // [m]
-   L 20.0; // [m] (2.5 [3D], 2.2 [2D])
+   L 2.5; // [m] (2.5 [3D], 2.2 [2D])
   strictlyObeyL false;//true; // true = outflow will be located exactly at "L", false = outflow might be slightly further away than "L"
   
   cylinderxPosition    0.5;  // [m] (0.5 [3D], 0.2 [2D])
   cylinderyPosition    0.2;  // [m]
-   cylinderRadius       0.18; // [m]
+   cylinderRadius       0.05; // [m]
   circularCrossSection true; // true = obstacle is a true cylinder, false = obstacle is a box
   
   kinViscosity   0.001; // [m^2/s]
   rho            1;     // [kg/m^3]
   
-   inflowVelocity   2; // [m/s] (0.45 [3D], 2.25 [3D], 0.3 [2D], 1.5 [2D])
-   raisingTime    0.3; // [s] (0 == immediatelly full velocity)
+   inflowVelocity   0.45; // [m/s] (0.45 [3D], 2.25 [3D], 0.3 [2D], 1.5 [2D])
+   raisingTime      0; // [s] (0 == immediately full velocity)
   sinPeriod        0; // [s] (0 == no sinus term)
   
-   obstacleBoundary 0; // 0 = staircase, 1 = curved
+   obstacleBoundary 1; // 0 = staircase, 1 = curved
   outletType       1; // 0 = pressure, 1 = outlet (2/1), 2 = outlet (4/3)
   
   initWithVelocity false; // at the beginning of the simulation everything is set to: ( 4 * inflowVelocity ) / 9 [3D] or ( 2 * inflowVelocity ) / 3 [2D]
@@ -63,7 +63,7 @@ SchaeferTurek
   
   nbrOfEvaluationPointsForCoefficientExtremas 100;
   
-   evaluatePressure true;
+   evaluatePressure false;
   pAlpha < 0.45, 0.2, 0.205 >; // points for evaluating
   pOmega < 0.55, 0.2, 0.205 >; // the pressure difference
   
@@ -74,9 +74,9 @@ SchaeferTurek
   // WHERE TO REFINE ? //
   ///////////////////////
   
-   useCylinderForRefinement false;
-   cylinderRefinementLevel  2; // if "useCylinderForRefinement" is true, everything around the cylinder is refined to this level
-   cylinderRefinementBuffer 0.05; // [m] - additional space around the cylinder that is also guaranteed to be at level "cylinderRefinementLevel"
+   useCylinderForRefinement true;
+   cylinderRefinementLevel  0; // if "useCylinderForRefinement" is true, everything around the cylinder is refined to this level
+   cylinderRefinementBuffer 0.0; // [m] - additional space around the cylinder that is also guaranteed to be at level "cylinderRefinementLevel"
   
   AABBRefinementSelection
   {
@@ -149,11 +149,11 @@ SchaeferTurek
   ////////////////////////////
   
 //minSimulationTime  10; // in [s] - ATTENTION: _IF_ 'minSimulationTime' is specified, 'outerTimeSteps' is adapted accordingly!
-   outerTimeSteps      2; // total number of time steps = outerTimeSteps * innerTimeSteps
-   innerTimeSteps   5000; // For each outer loop, performance data is logged.
+   outerTimeSteps      1; // total number of time steps = outerTimeSteps * innerTimeSteps
+   innerTimeSteps   20001; // For each outer loop, performance data is logged.
   
-   evaluationCheckFrequency 0; // 0 = disable evaluation
-   evaluationLogToStream    false;
+   evaluationCheckFrequency 100; // 0 = disable evaluation
+   evaluationLogToStream    true;
   evaluationLogToFile      true;
   evaluationFilename       SchaeferTurek.txt;
   
@@ -175,7 +175,7 @@ SchaeferTurek
      vtkMPIIO           true;
   }
   
-   remainingTimeLoggerFrequency 5; // in seconds
+   remainingTimeLoggerFrequency 10; // in seconds
   
   vtkBeforeTimeStep true; // false = at the end of time step
   
@@ -243,18 +243,20 @@ SchaeferTurek
   checkStrouhalNbrDiscreteDUpperBound  1E6; // upper bound for the Strouhal number (only checked if "evaluateStrouhal" is set to true - "discrete" cylinder diameter used for evaluation)
 }

-
+Logging
+{
+    logLevel info;  // info progress detail tracing
+}

 VTK
 {
-   /*
   fluid_field
   {
      baseFolder vtk;
    //initialWriteCallsToSkip 55130;
  
      writeFrequency 304;//102;
-      ghostLayers      1;
+      ghostLayers      0;
      
    //AABB_filter_0 {
    //   min < 0,   0,    0.2049999 >;
@@ -276,18 +278,16 @@ VTK
      }
      
      writers {
-       //VelocityFromPDF;
+         VelocityFromPDF;
         VelocityMagnitudeFromPDF;
-       //DensityFromPDF;
+         DensityFromPDF;
       //NonEquPart;
       //PDF;
      }
      
      incompatibleBlockStates empty;
   }
-   */
-   
-   /*
+
   flag_field
   {
      baseFolder vtk;
@@ -302,8 +302,7 @@ VTK
      
      incompatibleBlockStates empty;
   }
-   */
-  
+
   /*
   domain_decomposition
   {

--- a/apps/benchmarks/UniformGridCPU/CMakeLists.txt
+++ b/apps/benchmarks/UniformGridCPU/CMakeLists.txt
@@ -3,14 +3,16 @@ waLBerla_link_files_to_builddir( "*.py" )
 waLBerla_link_files_to_builddir( "simulation_setup" )


-foreach(streaming_pattern pull push aa esotwist)
+foreach(streaming_pattern pull push aa esotwist esopull esopush)
    foreach(stencil d3q19 d3q27)
-        foreach (collision_setup srt trt w-mrt r-w-mrt cm r-cm k r-k entropic smagorinsky)
+        foreach (collision_setup srt trt mrt mrt-overrelax central central-overrelax cumulant cumulant-overrelax cumulant-K17 entropic smagorinsky qr)
 	    # KBC methods only for D2Q9 and D3Q27 defined
 	    if (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
 		    continue()
-	    endif (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
-
+        endif (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
+        if (${collision_setup} STREQUAL "cumulant-K17" AND ${stencil} STREQUAL "d3q19")
+            continue()
+        endif (${collision_setup} STREQUAL "cumulant-K17" AND ${stencil} STREQUAL "d3q19")
            set(config ${stencil}_${streaming_pattern}_${collision_setup})
            waLBerla_generate_target_from_python(NAME UniformGridCPUGenerated_${config}
                    FILE UniformGridCPU.py

--- a/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp
+++ b/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp
@@ -235,12 +235,16 @@ int main(int argc, char** argv)
                  pythonCallbackResults.data().exposeValue("numProcesses", performance.processes());
                  pythonCallbackResults.data().exposeValue("numThreads", performance.threads());
                  pythonCallbackResults.data().exposeValue("numCores", performance.cores());
+                  pythonCallbackResults.data().exposeValue("numberOfCells", performance.numberOfCells());
+                  pythonCallbackResults.data().exposeValue("numberOfFluidCells", performance.numberOfFluidCells());
                  pythonCallbackResults.data().exposeValue("mlups", performance.mlups(timesteps, time));
                  pythonCallbackResults.data().exposeValue("mlupsPerCore", performance.mlupsPerCore(timesteps, time));
                  pythonCallbackResults.data().exposeValue("mlupsPerProcess", performance.mlupsPerProcess(timesteps, time));
                  pythonCallbackResults.data().exposeValue("stencil", infoStencil);
                  pythonCallbackResults.data().exposeValue("streamingPattern", infoStreamingPattern);
                  pythonCallbackResults.data().exposeValue("collisionSetup", infoCollisionSetup);
+                  pythonCallbackResults.data().exposeValue("vectorised", vectorised);
+                  pythonCallbackResults.data().exposeValue("nontemporal", nontemporal);
                  pythonCallbackResults.data().exposeValue("cse_global", infoCseGlobal);
                  pythonCallbackResults.data().exposeValue("cse_pdfs", infoCsePdfs);
                  // Call Python function to report results

--- a/apps/benchmarks/UniformGridCPU/UniformGridCPU.py
+++ b/apps/benchmarks/UniformGridCPU/UniformGridCPU.py
@@ -3,17 +3,12 @@ from dataclasses import replace
 import sympy as sp
 import pystencils as ps

-from pystencils.simp.subexpression_insertion import insert_zeros, insert_aliases, insert_constants,\
-    insert_symbol_times_minus_one
-
 from lbmpy.advanced_streaming import is_inplace
-from lbmpy.advanced_streaming.utility import streaming_patterns, get_accessor, Timestep
+from lbmpy.advanced_streaming.utility import streaming_patterns
 from lbmpy.boundaries import NoSlip, UBB
 from lbmpy.creationfunctions import LBMConfig, LBMOptimisation, LBStencil, create_lb_collision_rule
-from lbmpy.enums import Method, Stencil
-from lbmpy.fieldaccess import CollideOnlyInplaceAccessor
+from lbmpy.enums import Method, Stencil, SubgridScaleModel
 from lbmpy.moments import get_default_moment_set_for_stencil
-from lbmpy.updatekernels import create_stream_only_kernel

 from pystencils_walberla import CodeGeneration, generate_info_header, generate_sweep
 from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator
@@ -21,48 +16,53 @@ from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator
 omega = sp.symbols('omega')
 omega_free = sp.Symbol('omega_free')

-# best configs in terms of FLOPS
 options_dict = {
    'srt': {
        'method': Method.SRT,
        'relaxation_rate': omega,
-        'compressible': True,
+        'compressible': False,
    },
    'trt': {
        'method': Method.TRT,
        'relaxation_rate': omega,
-        'compressible': True,
+        'compressible': False,
    },
-    'r-w-mrt': {
+    'mrt': {
        'method': Method.MRT,
        'relaxation_rates': [omega, 1, 1, 1, 1, 1, 1],
-        'compressible': True,
+        'compressible': False,
    },
-    'w-mrt': {
+    'mrt-overrelax': {
        'method': Method.MRT,
        'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)],
-        'compressible': True,
+        'compressible': False,
    },
-    'r-cm': {
+    'central': {
        'method': Method.CENTRAL_MOMENT,
        'relaxation_rate': omega,
        'compressible': True,
    },
-    'cm': {
+    'central-overrelax': {
        'method': Method.CENTRAL_MOMENT,
        'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)],
        'compressible': True,
    },
-    'r-k': {
-        'method': Method.CUMULANT,
+    'cumulant': {
+        'method': Method.MONOMIAL_CUMULANT,
        'relaxation_rate': omega,
        'compressible': True,
    },
-    'k': {
-        'method': Method.CUMULANT,
+    'cumulant-overrelax': {
+        'method': Method.MONOMIAL_CUMULANT,
        'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 18)],
        'compressible': True,
    },
+    'cumulant-K17': {
+        'method': Method.CUMULANT,
+        'relaxation_rate': omega,
+        'compressible': True,
+        'fourth_order_correction': 0.01
+    },
    'entropic': {
        'method': Method.TRT_KBC_N4,
        'compressible': True,
@@ -73,7 +73,12 @@ options_dict = {
    },
    'smagorinsky': {
        'method': Method.SRT,
-        'smagorinsky': False,
+        'subgrid_scale_model': SubgridScaleModel.SMAGORINSKY,
+        'relaxation_rate': omega,
+    },
+    'qr': {
+        'method': Method.SRT,
+        'subgrid_scale_model': SubgridScaleModel.QR,
        'relaxation_rate': omega,
    }
 }
@@ -83,6 +88,8 @@ info_header = """
 const char * infoStencil = "{stencil}";
 const char * infoStreamingPattern = "{streaming_pattern}";
 const char * infoCollisionSetup = "{collision_setup}";
+const bool vectorised = {vec};
+const bool nontemporal = {nt_stores};
 const bool infoCseGlobal = {cse_global};
 const bool infoCsePdfs = {cse_pdfs};
 """
@@ -90,10 +97,15 @@ const bool infoCsePdfs = {cse_pdfs};
 with CodeGeneration() as ctx:
    openmp = True if ctx.openmp else False
    field_type = "float64" if ctx.double_accuracy else "float32"
-    if ctx.optimize_for_localhost:
-        cpu_vec = {"nontemporal": False, "assume_aligned": True}
-    else:
-        cpu_vec = None
+    # This base pointer specification introduces temporary pointers in the outer loop such that the inner loop
+    # only contains aligned memory addresses. This makes NT stores much more effective, which yields great performance
+    # gains, especially for the pull scheme on Skylake architectures
+    base_pointer_spec = None  # [['spatialInner0'], ['spatialInner1']]
+    # cpu_vec = {"instruction_set": "best", "nontemporal": False,
+    #            "assume_aligned": True, 'assume_sufficient_line_padding': True}
+
+    cpu_vec = {"instruction_set": None}
+    nt_stores = False

    config_tokens = ctx.config.split('_')

@@ -110,7 +122,6 @@ with CodeGeneration() as ctx:
        raise ValueError("Only D3Q27 and D3Q19 stencil are supported at the moment")

    assert streaming_pattern in streaming_patterns, f"Invalid streaming pattern: {streaming_pattern}"
-
    options = options_dict[collision_setup]

    assert stencil.D == 3, "This application supports only three-dimensional stencils"
@@ -121,23 +132,20 @@ with CodeGeneration() as ctx:
    lbm_config = LBMConfig(stencil=stencil, field_name=pdfs.name, streaming_pattern=streaming_pattern, **options)
    lbm_opt = LBMOptimisation(cse_global=True, cse_pdfs=False, symbolic_field=pdfs, field_layout='fzyx')

+    # This creates a simplified version of the central moment collision operator where the bulk and shear viscosity are
+    # not separated. This is done to get a fair comparison with the monomial cumulants.
    if lbm_config.method == Method.CENTRAL_MOMENT:
        lbm_config = replace(lbm_config, nested_moments=get_default_moment_set_for_stencil(stencil))

    if not is_inplace(streaming_pattern):
        lbm_opt = replace(lbm_opt, symbolic_temporary_field=pdfs_tmp)
-        field_swaps = [(pdfs, pdfs_tmp)]
-    else:
-        field_swaps = []

-    # Sweep for Stream only. This is for benchmarking an empty streaming pattern without LBM.
-    # is_inplace is set to False to ensure that the streaming is done with src and dst field.
-    # If this is not the case the compiler might simplify the streaming in a way that benchmarking makes no sense.
-    # accessor = CollideOnlyInplaceAccessor()
-    accessor = get_accessor(streaming_pattern, Timestep.EVEN)
-    #accessor.is_inplace = False
-    field_swaps_stream_only = () if accessor.is_inplace else [(pdfs, pdfs_tmp)]
-    stream_only_kernel = create_stream_only_kernel(stencil, pdfs, None if accessor.is_inplace else pdfs_tmp, accessor=accessor)
+    # This is a microbenchmark for testing how fast Q PDFs can be updated per cell. To avoid optimisations from
+    # the compiler the PDFs are shuffled inside a cell. Otherwise, for common streaming patterns compilers would
+    # typically remove the copy of the center PDF which results in an overestimation of the maximum performance
+    stream_only_kernel = []
+    for i in range(stencil.Q):
+        stream_only_kernel.append(ps.Assignment(pdfs(i), pdfs((i + 3) % stencil.Q)))

    # LB Sweep
    collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt)
@@ -152,17 +160,26 @@ with CodeGeneration() as ctx:
                         lbm_config=lbm_config, lbm_optimisation=lbm_opt,
                         nonuniform=False, boundaries=[no_slip, ubb],
                         macroscopic_fields=macroscopic_fields,
-                         cpu_openmp=openmp, cpu_vectorize_info=cpu_vec)
+                         cpu_openmp=openmp, cpu_vectorize_info=cpu_vec,
+                         base_pointer_specification=base_pointer_spec)

    # Stream only kernel
+    cpu_vec_stream = None
+    if ctx.optimize_for_localhost:
+        cpu_vec_stream = {"instruction_set": "best", "nontemporal": True,
+                          "assume_aligned": True, 'assume_sufficient_line_padding': True,
+                          "assume_inner_stride_one": True}
+
    generate_sweep(ctx, 'UniformGridCPU_StreamOnlyKernel', stream_only_kernel,
-                   field_swaps=field_swaps_stream_only,
-                   target=ps.Target.CPU, cpu_openmp=openmp)
+                   target=ps.Target.CPU, cpu_openmp=openmp,
+                   cpu_vectorize_info=cpu_vec_stream, base_pointer_specification=[['spatialInner0'], ['spatialInner1']])

    infoHeaderParams = {
        'stencil': stencil_str,
        'streaming_pattern': streaming_pattern,
        'collision_setup': collision_setup,
+        'vec': int(True if cpu_vec else False),
+        'nt_stores': int(nt_stores),
        'cse_global': int(lbm_opt.cse_global),
        'cse_pdfs': int(lbm_opt.cse_pdfs),
    }

--- a/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs.py
+++ b/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs.py
@@ -4,13 +4,26 @@ from waLBerla.tools.config import block_decomposition
 from waLBerla.tools.sqlitedb import sequenceValuesToScalars, checkAndUpdateSchema, storeSingle
 import sys
 import sqlite3
-from math import prod
+
+try:
+    import machinestate as ms
+except ImportError:
+    ms = None

 # Number of time steps run for a workload of 128^3 per process
 # if double as many cells are on the process, half as many time steps are run etc.
 # increase this to get more reliable measurements
 TIME_STEPS_FOR_128_BLOCK = 10
 DB_FILE = os.environ.get('DB_FILE', "cpu_benchmark.sqlite3")
+BENCHMARK = int(os.environ.get('BENCHMARK', 0))
+
+WeakX = int(os.environ.get('WeakX', 128))
+WeakY = int(os.environ.get('WeakY', 128))
+WeakZ = int(os.environ.get('WeakZ', 128))
+
+StrongX = int(os.environ.get('StrongX', 128))
+StrongY = int(os.environ.get('StrongY', 128))
+StrongZ = int(os.environ.get('StrongZ', 128))


 def num_time_steps(block_size, time_steps_for_128_block=TIME_STEPS_FOR_128_BLOCK):
@@ -35,7 +48,7 @@ class Scenario:
    def __init__(self, cells_per_block=(128, 128, 128), periodic=(1, 1, 1), blocks_per_process=1,
                 timesteps=None, time_step_strategy="normal", omega=1.8, inner_outer_split=(1, 1, 1),
                 warmup_steps=2, outer_iterations=3, init_shear_flow=False, boundary_setup=False,
-                 vtk_write_frequency=0, remaining_time_logger_frequency=-1):
+                 vtk_write_frequency=0, remaining_time_logger_frequency=-1, db_file_name=None):

        if boundary_setup:
            init_shear_flow = False
@@ -58,6 +71,7 @@ class Scenario:

        self.vtk_write_frequency = vtk_write_frequency
        self.remaining_time_logger_frequency = remaining_time_logger_frequency
+        self.db_file_name = DB_FILE if db_file_name is None else db_file_name

        self.config_dict = self.config(print_dict=False)

@@ -101,6 +115,15 @@ class Scenario:
        data['compile_flags'] = wlb.build_info.compiler_flags
        data['walberla_version'] = wlb.build_info.version
        data['build_machine'] = wlb.build_info.build_machine
+
+        if ms:
+            state = ms.MachineState(extended=False, anonymous=True)
+            state.generate()                        # generate subclasses
+            state.update()                          # read information
+            data["MachineState"] = str(state.get())
+        else:
+            print("MachineState module is not available. MachineState was not saved")
+
        sequenceValuesToScalars(data)

        result = data
@@ -111,8 +134,8 @@ class Scenario:
        table_name = table_name.replace("-", "_")
        for num_try in range(num_tries):
            try:
-                checkAndUpdateSchema(result, table_name, DB_FILE)
-                storeSingle(result, table_name, DB_FILE)
+                checkAndUpdateSchema(result, table_name, self.db_file_name)
+                storeSingle(result, table_name, self.db_file_name)
                break
            except sqlite3.OperationalError as e:
                wlb.log_warning(f"Sqlite DB writing failed: try {num_try + 1}/{num_tries}  {str(e)}")
@@ -156,18 +179,38 @@ def overlap_benchmark():
            scenarios.add(scenario)


-def scaling_benchmark():
-    """Tests different communication overlapping strategies"""
-    wlb.log_info_on_root("Running scaling benchmark")
+def weak_scaling_benchmark():
+    wlb.log_info_on_root("Running weak scaling benchmark with one block per proc")
    wlb.log_info_on_root("")

    scenarios = wlb.ScenarioManager()
-    cells_per_block = [(32, 32, 32), (128, 128, 128)]

-    for cell_per_block in cells_per_block:
-        scenarios.add(Scenario(time_step_strategy='noOverlap',
+    for t in ["noOverlap", "simpleOverlap"]:
+        scenarios.add(Scenario(time_step_strategy=t,
                               inner_outer_split=(1, 1, 1),
-                               cells_per_block=cell_per_block))
+                               cells_per_block=(WeakX, WeakY, WeakZ),
+                               boundary_setup=True,
+                               outer_iterations=1,
+                               db_file_name="weakScalingUniformGridOneBlock.sqlite3"))
+
+
+def strong_scaling_benchmark():
+    wlb.log_info_on_root("Running strong scaling benchmark with one block per proc")
+    wlb.log_info_on_root("")
+
+    scenarios = wlb.ScenarioManager()
+
+    domain_size = (StrongX, StrongY, StrongZ)
+    blocks = block_decomposition(wlb.mpi.numProcesses())
+    cells_per_block = tuple([d // b for d, b in zip(domain_size, reversed(blocks))])
+
+    for t in ["noOverlap", "simpleOverlap"]:
+        scenarios.add(Scenario(cells_per_block=cells_per_block,
+                               time_step_strategy=t,
+                               outer_iterations=1,
+                               timesteps=10,
+                               boundary_setup=True,
+                               db_file_name="strongScalingUniformGridOneBlock.sqlite3"))


 def single_node_benchmark():
@@ -176,13 +219,11 @@ def single_node_benchmark():
    wlb.log_info_on_root("")

    scenarios = wlb.ScenarioManager()
-    block_sizes = [(i, i, i) for i in (8, 16, 32, 64, 128)]
-    for block_size in block_sizes:
-        scenario = Scenario(cells_per_block=block_size,
-                            time_step_strategy='kernelOnly',
-                            outer_iterations=1,
-                            timesteps=num_time_steps(block_size))
-        scenarios.add(scenario)
+    scenario = Scenario(cells_per_block=(128, 128, 128),
+                        time_step_strategy='kernelOnly',
+                        outer_iterations=1,
+                        timesteps=10)
+    scenarios.add(scenario)


 def validation_run():
@@ -211,5 +252,15 @@ wlb.log_info_on_root(f"Batch run of benchmark scenarios, saving result to {DB_FI
 # performance of compute kernel (no communication)
 # overlap_benchmark()  # benchmarks different communication overlap options
 # profiling()  # run only two timesteps on a smaller domain for profiling only
-validation_run()
+# validation_run()
 # scaling_benchmark()
+
+
+if BENCHMARK == 0:
+    single_node_benchmark()
+elif BENCHMARK == 1:
+    weak_scaling_benchmark()
+elif BENCHMARK == 2:
+    strong_scaling_benchmark()
+else:
+    validation_run()
--- a/apps/benchmarks/UniformGridGPU/CMakeLists.txt
+++ b/apps/benchmarks/UniformGridGPU/CMakeLists.txt
@@ -3,13 +3,16 @@ waLBerla_link_files_to_builddir( "*.py" )
 waLBerla_link_files_to_builddir( "simulation_setup" )


-foreach(streaming_pattern pull push aa esotwist)
+foreach(streaming_pattern pull push aa esotwist esopull esopush)
    foreach(stencil d3q19 d3q27)
-        foreach (collision_setup srt trt mrt mrt-overrelax central central-overrelax cumulant cumulant-overrelax entropic smagorinsky)
+        foreach (collision_setup srt trt mrt mrt-overrelax central central-overrelax cumulant cumulant-overrelax cumulant-K17 entropic smagorinsky qr)
 	    # KBC methods only for D2Q9 and D3Q27 defined
 	    if (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
 		    continue()
-	    endif (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
+        endif (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
+        if (${collision_setup} STREQUAL "cumulant-K17" AND ${stencil} STREQUAL "d3q19")
+            continue()
+        endif (${collision_setup} STREQUAL "cumulant-K17" AND ${stencil} STREQUAL "d3q19")
            set(config ${stencil}_${streaming_pattern}_${collision_setup})
            waLBerla_generate_target_from_python(NAME UniformGridGPUGenerated_${config}
                    FILE UniformGridGPU.py

--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
@@ -136,7 +136,7 @@ int main(int argc, char** argv)
      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
      ///                                      LB SWEEPS AND BOUNDARY HANDLING                                       ///
      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-      const pystencils::UniformGridGPU_StreamOnlyKernel StreamOnlyKernel(pdfFieldGpuID, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2]);
+      const pystencils::UniformGridGPU_StreamOnlyKernel StreamOnlyKernel(pdfFieldGpuID);

      // Boundaries
      const FlagUID fluidFlagUID("Fluid");
@@ -264,6 +264,13 @@ int main(int argc, char** argv)
            python_coupling::PythonCallback pythonCallbackResults("results_callback");
            if (pythonCallbackResults.isCallable())
            {
+               pythonCallbackResults.data().exposeValue("numProcesses", performance.processes());
+               pythonCallbackResults.data().exposeValue("numThreads", performance.threads());
+               pythonCallbackResults.data().exposeValue("numCores", performance.cores());
+               pythonCallbackResults.data().exposeValue("numberOfCells", performance.numberOfCells());
+               pythonCallbackResults.data().exposeValue("numberOfFluidCells", performance.numberOfFluidCells());
+               pythonCallbackResults.data().exposeValue("mlups", performance.mlups(timesteps, time));
+               pythonCallbackResults.data().exposeValue("mlupsPerCore", performance.mlupsPerCore(timesteps, time));
               pythonCallbackResults.data().exposeValue("mlupsPerProcess", performance.mlupsPerProcess(timesteps, time));
               pythonCallbackResults.data().exposeValue("stencil", infoStencil);
               pythonCallbackResults.data().exposeValue("streamingPattern", infoStreamingPattern);

--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.py
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py
@@ -4,17 +4,16 @@ import pystencils as ps

 from dataclasses import replace

+from pystencils import Assignment
 from pystencils.typing import TypedSymbol
 from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisions

-from lbmpy import LBMConfig, LBMOptimisation, LBStencil, Method, Stencil
+from lbmpy import LBMConfig, LBMOptimisation, LBStencil, Method, Stencil, SubgridScaleModel
 from lbmpy.advanced_streaming import is_inplace
 from lbmpy.advanced_streaming.utility import streaming_patterns
 from lbmpy.boundaries import NoSlip, UBB
 from lbmpy.creationfunctions import create_lb_collision_rule
 from lbmpy.moments import get_default_moment_set_for_stencil
-from lbmpy.updatekernels import create_stream_only_kernel
-from lbmpy.fieldaccess import *

 from pystencils_walberla import CodeGeneration, generate_info_header, generate_sweep
 from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator
@@ -74,6 +73,12 @@ options_dict = {
        'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 18)],
        'compressible': True,
    },
+    'cumulant-K17': {
+        'method': Method.CUMULANT,
+        'relaxation_rate': omega,
+        'compressible': True,
+        'fourth_order_correction': 0.01
+    },
    'entropic': {
        'method': Method.TRT_KBC_N4,
        'compressible': True,
@@ -84,7 +89,12 @@ options_dict = {
    },
    'smagorinsky': {
        'method': Method.SRT,
-        'smagorinsky': False,
+        'subgrid_scale_model': SubgridScaleModel.SMAGORINSKY,
+        'relaxation_rate': omega,
+    },
+    'qr': {
+        'method': Method.SRT,
+        'subgrid_scale_model': SubgridScaleModel.QR,
        'relaxation_rate': omega,
    }
 }
@@ -101,7 +111,8 @@ const bool infoCsePdfs = {cse_pdfs};
 optimize = True

 with CodeGeneration() as ctx:
-    field_type = "float64" if ctx.double_accuracy else "float32"
+    pdf_data_type = "float64"
+    field_data_type = "float64"
    config_tokens = ctx.config.split('_')

    assert len(config_tokens) >= 3
@@ -124,8 +135,8 @@ with CodeGeneration() as ctx:
    options = options_dict[collision_setup]

    assert stencil.D == 3, "This application supports only three-dimensional stencils"
-    pdfs, pdfs_tmp = ps.fields(f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {field_type}[3D]", layout='fzyx')
-    density_field, velocity_field = ps.fields(f"density, velocity(3) : {field_type}[3D]", layout='fzyx')
+    pdfs, pdfs_tmp = ps.fields(f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {pdf_data_type}[3D]", layout='fzyx')
+    density_field, velocity_field = ps.fields(f"density, velocity(3) : {field_data_type}[3D]", layout='fzyx')
    macroscopic_fields = {'density': density_field, 'velocity': velocity_field}

    lbm_config = LBMConfig(stencil=stencil, field_name=pdfs.name, streaming_pattern=streaming_pattern, **options)
@@ -140,13 +151,12 @@ with CodeGeneration() as ctx:
    else:
        field_swaps = []

-    # Sweep for Stream only. This is for benchmarking an empty streaming pattern without LBM.
-    # is_inplace is set to False to ensure that the streaming is done with src and dst field.
-    # If this is not the case the compiler might simplify the streaming in a way that benchmarking makes no sense.
-    accessor = CollideOnlyInplaceAccessor()
-    accessor.is_inplace = False
-    field_swaps_stream_only = [(pdfs, pdfs_tmp)]
-    stream_only_kernel = create_stream_only_kernel(stencil, pdfs, pdfs_tmp, accessor=accessor)
+    # This is a microbenchmark for testing how fast Q PDFs can be updated per cell. To avoid optimisations from
+    # the compiler the PDFs are shuffled inside a cell. Otherwise, for common streaming patterns compilers would
+    # typically remove the copy of the center PDF which results in an overestimation of the maximum performance
+    stream_only_kernel = []
+    for i in range(stencil.Q):
+        stream_only_kernel.append(Assignment(pdfs(i), pdfs((i + 3) % stencil.Q)))

    # LB Sweep
    collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt)
@@ -158,9 +168,10 @@ with CodeGeneration() as ctx:
    lb_method = collision_rule.method

    no_slip = lbm_boundary_generator(class_name='NoSlip', flag_uid='NoSlip',
-                                     boundary_object=NoSlip())
+                                     boundary_object=NoSlip(), field_data_type=pdf_data_type)
    ubb = lbm_boundary_generator(class_name='UBB', flag_uid='UBB',
-                                 boundary_object=UBB([0.05, 0, 0], data_type=field_type))
+                                 boundary_object=UBB([0.05, 0, 0], data_type=field_data_type),
+                                 field_data_type=pdf_data_type)

    generate_lbm_package(ctx, name="UniformGridGPU",
                         collision_rule=collision_rule,
@@ -168,12 +179,12 @@ with CodeGeneration() as ctx:
                         nonuniform=False, boundaries=[no_slip, ubb],
                         macroscopic_fields=macroscopic_fields,
                         target=ps.Target.GPU, gpu_indexing_params=gpu_indexing_params,
+                         data_type=field_data_type, pdfs_data_type=pdf_data_type,
                         max_threads=max_threads)

    # Stream only kernel
-    vp = [('int32_t', 'cudaBlockSize0'), ('int32_t', 'cudaBlockSize1'), ('int32_t', 'cudaBlockSize2')]
-    generate_sweep(ctx, 'UniformGridGPU_StreamOnlyKernel', stream_only_kernel, field_swaps=field_swaps_stream_only,
-                   gpu_indexing_params=gpu_indexing_params, varying_parameters=vp, target=ps.Target.GPU,
+    generate_sweep(ctx, 'UniformGridGPU_StreamOnlyKernel', stream_only_kernel,
+                   gpu_indexing_params={'block_size': (128, 1, 1)}, target=ps.Target.GPU,
                   max_threads=max_threads)

    infoHeaderParams = {

--- a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py
+++ b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py
@@ -6,11 +6,25 @@ import sys
 import sqlite3
 from math import prod

+try:
+    import machinestate as ms
+except ImportError:
+    ms = None
+
 # Number of time steps run for a workload of 128^3 per GPU
 # if double as many cells are on the GPU, half as many time steps are run etc.
 # increase this to get more reliable measurements
 TIME_STEPS_FOR_128_BLOCK = 1000
 DB_FILE = os.environ.get('DB_FILE', "gpu_benchmark.sqlite3")
+BENCHMARK = int(os.environ.get('BENCHMARK', 0))
+
+WeakX = int(os.environ.get('WeakX', 128))
+WeakY = int(os.environ.get('WeakY', 128))
+WeakZ = int(os.environ.get('WeakZ', 128))
+
+StrongX = int(os.environ.get('StrongX', 128))
+StrongY = int(os.environ.get('StrongY', 128))
+StrongZ = int(os.environ.get('StrongZ', 128))

 BASE_CONFIG = {
    'DomainSetup': {
@@ -39,6 +53,8 @@ ldc_setup = {'Border': [
 def num_time_steps(block_size, time_steps_for_128_block=200):
    cells = block_size[0] * block_size[1] * block_size[2]
    time_steps = (128 ** 3 / cells) * time_steps_for_128_block
+    if time_steps < 10:
+        time_steps = 10
    return int(time_steps)


@@ -61,13 +77,13 @@ class Scenario:
                 inner_outer_split=(1, 1, 1), warmup_steps=5, outer_iterations=3,
                 init_shear_flow=False, boundary_setup=False,
                 vtk_write_frequency=0, remaining_time_logger_frequency=-1,
-                 additional_info=None):
+                 additional_info=None, blocks=None, db_file_name=None):

        if boundary_setup:
            init_shear_flow = False
            periodic = (0, 0, 0)

-        self.blocks = block_decomposition(wlb.mpi.numProcesses())
+        self.blocks = blocks if blocks else block_decomposition(wlb.mpi.numProcesses())

        self.cells_per_block = cells_per_block
        self.periodic = periodic
@@ -85,6 +101,7 @@ class Scenario:

        self.vtk_write_frequency = vtk_write_frequency
        self.remaining_time_logger_frequency = remaining_time_logger_frequency
+        self.db_file_name = DB_FILE if db_file_name is None else db_file_name

        self.config_dict = self.config(print_dict=False)
        self.additional_info = additional_info
@@ -97,7 +114,6 @@ class Scenario:
                'blocks': self.blocks,
                'cellsPerBlock': self.cells_per_block,
                'periodic': self.periodic,
-                'oneBlockPerProcess': True
            },
            'Parameters': {
                'omega': self.omega,
@@ -115,7 +131,6 @@ class Scenario:
            'Logging': {
                'logLevel': 'info',  # info progress detail tracing
            }
-
        }
        if self.boundary_setup:
            config_dict["Boundaries"] = ldc_setup
@@ -140,6 +155,15 @@ class Scenario:
        data['compile_flags'] = wlb.build_info.compiler_flags
        data['walberla_version'] = wlb.build_info.version
        data['build_machine'] = wlb.build_info.build_machine
+
+        if ms:
+            state = ms.MachineState(extended=False, anonymous=True)
+            state.generate()                        # generate subclasses
+            state.update()                          # read information
+            data["MachineState"] = str(state.get())
+        else:
+            print("MachineState module is not available. MachineState was not saved")
+
        sequenceValuesToScalars(data)

        result = data
@@ -150,8 +174,8 @@ class Scenario:
        table_name = table_name.replace("-", "_")  # - not allowed for table name would lead to syntax error
        for num_try in range(num_tries):
            try:
-                checkAndUpdateSchema(result, table_name, DB_FILE)
-                storeSingle(result, table_name, DB_FILE)
+                checkAndUpdateSchema(result, table_name, self.db_file_name)
+                storeSingle(result, table_name, self.db_file_name)
                break
            except sqlite3.OperationalError as e:
                wlb.log_warning(f"Sqlite DB writing failed: try {num_try + 1}/{num_tries}  {str(e)}")
@@ -200,12 +224,70 @@ def overlap_benchmark():
        scenarios.add(scenario)


+def no_overlap_scaling(cuda_enabled_mpi=False):
+    """Adds a single scenario with 256^3 cells per block that uses the 'noOverlap' time step strategy"""
+    wlb.log_info_on_root("Running scaling benchmark without communication hiding")
+    wlb.log_info_on_root("")
+
+    scenarios = wlb.ScenarioManager()
+    # only one scenario is registered: no communication hiding, a single outer iteration
+    scenarios.add(Scenario(cells_per_block=(256, 256, 256),
+                           cuda_blocks=(128, 1, 1),
+                           time_step_strategy='noOverlap',
+                           inner_outer_split=(1, 1, 1),
+                           cuda_enabled_mpi=cuda_enabled_mpi,
+                           outer_iterations=1))
+
+
+def weak_scaling_overlap(cuda_enabled_mpi=False):
+    """Adds one scenario per strategy ('noOverlap', 'simpleOverlap') with (WeakX, WeakY, WeakZ) cells per block"""
+    wlb.log_info_on_root("Running scaling benchmark with communication hiding")
+    wlb.log_info_on_root("")
+
+    scenarios = wlb.ScenarioManager()
+
+    # same setup for both time step strategies; results go to a dedicated database file
+    for t in ["noOverlap", "simpleOverlap"]:
+        scenarios.add(Scenario(cells_per_block=(WeakX, WeakY, WeakZ),
+                               cuda_blocks=(128, 1, 1),
+                               time_step_strategy=t,
+                               inner_outer_split=(64, 64, 64),
+                               cuda_enabled_mpi=cuda_enabled_mpi,
+                               outer_iterations=1,
+                               boundary_setup=True,
+                               db_file_name="weakScalingUniformGrid.sqlite3"))
+
+
+def strong_scaling_overlap(cuda_enabled_mpi=False):
+    wlb.log_info_on_root("Running strong scaling benchmark with one block per proc with communication hiding")
+    wlb.log_info_on_root("")
+
+    scenarios = wlb.ScenarioManager()
+
+    domain_size = (StrongX, StrongY, StrongZ)  # read from the StrongX/StrongY/StrongZ environment variables
+    blocks = block_decomposition(wlb.mpi.numProcesses())
+    cells_per_block = tuple([d // b for d, b in zip(domain_size, reversed(blocks))])
+
+    # NOTE(review): cells_per_block divides by reversed(blocks), but blocks is passed on unreversed - confirm
+    for t in ["noOverlap", "simpleOverlap"]:
+        scenarios.add(Scenario(cells_per_block=cells_per_block,
+                               cuda_blocks=(128, 1, 1),
+                               time_step_strategy=t,
+                               inner_outer_split=(1, 1, 1),
+                               cuda_enabled_mpi=cuda_enabled_mpi,
+                               outer_iterations=1,
+                               timesteps=50,
+                               blocks=blocks,
+                               boundary_setup=True,
+                               db_file_name="strongScalingUniformGridOneBlock.sqlite3"))
+
+
 def single_gpu_benchmark():
    """Benchmarks only the LBM compute kernel"""
    wlb.log_info_on_root("Running single GPU benchmarks")
    wlb.log_info_on_root("")

-    gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 8))
+    gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 40))
    gpu_mem = gpu_mem_gb * (2 ** 30)
    gpu_type = os.environ.get('GPU_TYPE')

@@ -214,12 +296,8 @@ def single_gpu_benchmark():
        additional_info['gpu_type'] = gpu_type

    scenarios = wlb.ScenarioManager()
-    block_sizes = [(i, i, i) for i in (32, 64, 128, 256)]
-    cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1), (512, 1, 1),
-                   (32, 2, 1), (64, 2, 1), (128, 2, 1), (256, 2, 1),
-                   (32, 4, 1), (64, 4, 1), (128, 4, 1),
-                   (32, 8, 1), (64, 8, 1),
-                   (32, 16, 1)]
+    block_sizes = [(i, i, i) for i in (128, 256, 320)]
+    cuda_blocks = [(128, 1, 1), ]
    for block_size in block_sizes:
        for cuda_block_size in cuda_blocks:
            # cuda_block_size = (256, 1, 1) and block_size = (64, 64, 64) would be cut to cuda_block_size = (64, 1, 1)
@@ -266,4 +344,14 @@ wlb.log_info_on_root(f"Batch run of benchmark scenarios, saving result to {DB_FI
 # performance of compute kernel (no communication)
 # overlap_benchmark()  # benchmarks different communication overlap options
 # profiling()  # run only two timesteps on a smaller domain for profiling only
-validation_run()
+# validation_run()
+
+if BENCHMARK == 0:  # selected via the BENCHMARK environment variable (default 0)
+    single_gpu_benchmark()
+elif BENCHMARK == 1:
+    weak_scaling_overlap(True)  # True -> cuda_enabled_mpi
+elif BENCHMARK == 2:
+    strong_scaling_overlap(True)  # True -> cuda_enabled_mpi
+else:  # any other value falls back to the validation run
+    validation_run()
+
--- a/apps/pythonmodule/CMakeLists.txt
+++ b/apps/pythonmodule/CMakeLists.txt
@@ -17,7 +17,8 @@ if ( WALBERLA_BUILD_WITH_PYTHON )
        set ( pythonModules "-Wl,-whole-archive" ${PYTHON_MODULE_DEPENDENCIES}  "-Wl,-no-whole-archive" )
    endif()

-    add_library( walberla_cpp SHARED PythonModule.cpp )
+    add_library( walberla_cpp SHARED PythonModule.cpp
+            ../showcases/FlowAroundSphere/FlowAroundSphere.cpp)  # NOTE(review): showcase source in the python module - intended?

    target_link_libraries( walberla_cpp ${WALBERLA_LINK_LIBRARIES_KEYWORD} ${pythonModules} ${SERVICE_LIBS} )


--- a/apps/showcases/CMakeLists.txt
+++ b/apps/showcases/CMakeLists.txt
@@ -10,6 +10,11 @@ add_subdirectory( PegIntoSphereBed )
 if ( WALBERLA_BUILD_WITH_CODEGEN)

   add_subdirectory( Antidunes )
+   add_subdirectory( FlowAroundSphere )
+   add_subdirectory( FlowAroundSphereCPU )
+   add_subdirectory( FlowAroundCylinder )
+   add_subdirectory( Channel )
+   add_subdirectory( TaylorGreenVortex )

   if (WALBERLA_BUILD_WITH_PYTHON)
      add_subdirectory( PhaseFieldAllenCahn )

--- a/apps/showcases/Channel/CMakeLists.txt
+++ b/apps/showcases/Channel/CMakeLists.txt
+waLBerla_link_files_to_builddir( "*.prm" )
+waLBerla_link_files_to_builddir( "*.py" )
+
+waLBerla_generate_target_from_python(NAME ChannelGenerated
+        FILE Channel.py
+        OUT_FILES ChannelStorageSpecification.h ChannelStorageSpecification.cpp
+        ChannelSweepCollection.h ChannelSweepCollection.cpp
+        NoSlip.h NoSlip.cpp
+        UBB.h UBB.cpp
+        ChannelBoundaryCollection.h
+        ChannelHeader.h)
+
+waLBerla_add_executable ( NAME Channel
+        FILES Channel.cpp
+        DEPENDS ChannelGenerated blockforest core field geometry lbm_generated timeloop )
--- a/apps/showcases/Channel/Channel.cpp
+++ b/apps/showcases/Channel/Channel.cpp
--- a/apps/showcases/Channel/Channel.prm
+++ b/apps/showcases/Channel/Channel.prm
+Parameters
+{
+	omega              1.4;
+	timesteps          10;
+	maxLatticeVelocity 0.01;
+
+	remainingTimeLoggerFrequency 0; // in seconds
+	vtkWriteFrequency 0;
+}
+
+DomainSetup
+{
+    blocks        <  2,  2,  1 >;
+    cellsPerBlock < 4, 4, 4 >;
+    periodic      <  1,  0,  1 >;
+    refinementLevels 1;
+    numberProcesses 1; // This is for load balancing, overwritten if more than one proc is used
+}
+
+Boundaries 
+{
+    Border { direction S;    walldistance -1;  flag NoSlip; }
+    Border { direction N;    walldistance -1;  flag UBB; }
+}
+
+VTKWriter
+{
+    vtkWriteFrequency 1;
+    velocity true;
+    density true;
+    averageFields true;
+    flag false;
+    writeOnlySlice false;
+    amrFileFormat false;
+    oneFilePerProcess false;
+}
+
+Logging
+{
+    logLevel info;  // info progress detail tracing
+    writeSetupForestAndReturn false;
+    remainingTimeLoggerFrequency 60; // in seconds
+}
+
+Evaluation
+{
+    evaluationCheckFrequency 100;
+    logToStream true;
+    logToFile true;
+    filename Channel.txt;
+}
--- a/apps/showcases/Channel/Channel.py
+++ b/apps/showcases/Channel/Channel.py
+import sympy as sp
+
+from pystencils import Target
+from pystencils import fields
+
+from lbmpy.advanced_streaming.utility import get_timesteps
+from lbmpy.boundaries import NoSlip, UBB
+from lbmpy.creationfunctions import create_lb_method, create_lb_collision_rule
+from lbmpy import LBMConfig, LBMOptimisation, Stencil, Method, LBStencil
+from pystencils_walberla import CodeGeneration, generate_info_header
+from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator
+
+import warnings
+
+# NOTE(review): this suppresses every warning for the rest of the script, including ones raised during generation
+warnings.filterwarnings("ignore")
+with CodeGeneration() as ctx:
+    # NOTE(review): target is not referenced again in this script; the GPU selection is left commented out
+    target = Target.CPU  # Target.GPU if ctx.cuda else Target.CPU
+    # data_type follows ctx.double_accuracy; pdf_dtype is hard-coded and only used for the pdfs/pdfs_tmp fields below
+    data_type = "float64" if ctx.double_accuracy else "float32"
+    pdf_dtype = "float64"
+
+    streaming_pattern = 'pull'
+    # NOTE(review): timesteps is not referenced again in this script
+    timesteps = get_timesteps(streaming_pattern)
+
+    # symbolic relaxation rate, handed to LBMConfig below
+    omega = sp.symbols("omega")
+
+    stencil = LBStencil(Stencil.D3Q27)
+    dim = stencil.D
+    # NOTE(review): pdfs and pdfs_tmp are not referenced again in this script
+    pdfs, pdfs_tmp = fields(f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {pdf_dtype}[3D]", layout='fzyx')
+    velocity_field, density_field = fields(f"velocity({dim}), density(1) : {data_type}[{dim}D]", layout='fzyx')
+    # these two fields are the only ones passed on to generate_lbm_package
+    macroscopic_fields = {'density': density_field, 'velocity': velocity_field}
+
+    # TRT collision model, compressible, using the streaming pattern chosen above
+    lbm_config = LBMConfig(stencil=stencil, method=Method.TRT, relaxation_rate=omega,
+                           streaming_pattern=streaming_pattern, compressible=True)
+    lbm_opt = LBMOptimisation(cse_global=False, field_layout='fzyx')
+
+    # NOTE(review): method is not referenced again in this script; only collision_rule is passed on
+    method = create_lb_method(lbm_config=lbm_config)
+    collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt)
+
+    # boundary generators; the flag_uid values match the flag names used in Channel.prm (NoSlip, UBB)
+    no_slip = lbm_boundary_generator(class_name='NoSlip', flag_uid='NoSlip',
+                                     boundary_object=NoSlip())
+
+    # UBB gets the velocity (u_x, 0, 0), where u_x is left as a symbol
+    ubb = lbm_boundary_generator(class_name='UBB', flag_uid='UBB',
+                                 boundary_object=UBB([sp.Symbol("u_x"), 0, 0], data_type=data_type))
+
+    # generates the "Channel" package with both boundaries; nonuniform=True is requested explicitly
+    generate_lbm_package(ctx, name="Channel",
+                         collision_rule=collision_rule,
+                         lbm_config=lbm_config, lbm_optimisation=lbm_opt,
+                         nonuniform=True, boundaries=[no_slip, ubb],
+                         macroscopic_fields=macroscopic_fields, data_type=data_type)
+
+    # header name corresponds to the ChannelHeader.h entry in the OUT_FILES list of CMakeLists.txt
+    generate_info_header(ctx, 'ChannelHeader')
No results found