Commit a385588d authored by Helen Schottenhamml

Merge branch 'UniformGridCPU' into 'master'

UniformGridCPU

See merge request walberla/walberla!516
parents ca42e06d a5a51ac0
Showing with 933 additions and 658 deletions
......@@ -22,7 +22,7 @@ if ( WALBERLA_BUILD_WITH_PYTHON )
if ( WALBERLA_BUILD_WITH_CODEGEN )
add_subdirectory( FlowAroundSphereCodeGen )
add_subdirectory( UniformGridGenerated )
add_subdirectory( UniformGridCPU )
add_subdirectory( PhaseFieldAllenCahn )
endif()
......
waLBerla_link_files_to_builddir( "*.prm" )
waLBerla_link_files_to_builddir( "*.py" )
waLBerla_link_files_to_builddir( "simulation_setup" )
foreach(streaming_pattern pull push aa esotwist)
foreach(stencil d3q19 d3q27)
foreach (collision_setup srt trt mrt mrt-overrelax central central-overrelax cumulant cumulant-overrelax entropic smagorinsky)
# KBC methods are only defined for D2Q9 and D3Q27
if (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
continue()
endif (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
set(config ${stencil}_${streaming_pattern}_${collision_setup})
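# For illustration: stencil d3q19, streaming pattern pull and collision setup srt give
# config "d3q19_pull_srt" and thus the targets UniformGridCPUGenerated_d3q19_pull_srt
# and UniformGridCPU_d3q19_pull_srt defined below.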
waLBerla_generate_target_from_python(NAME UniformGridCPUGenerated_${config}
FILE UniformGridCPU.py
CODEGEN_CFG ${config}
OUT_FILES UniformGridCPU_LbKernel.cpp UniformGridCPU_LbKernel.h
UniformGridCPU_PackInfoEven.cpp UniformGridCPU_PackInfoEven.h
UniformGridCPU_PackInfoOdd.cpp UniformGridCPU_PackInfoOdd.h
UniformGridCPU_NoSlip.cpp UniformGridCPU_NoSlip.h
UniformGridCPU_UBB.cpp UniformGridCPU_UBB.h
UniformGridCPU_MacroSetter.cpp UniformGridCPU_MacroSetter.h
UniformGridCPU_MacroGetter.cpp UniformGridCPU_MacroGetter.h
UniformGridCPU_StreamOnlyKernel.cpp UniformGridCPU_StreamOnlyKernel.h
UniformGridCPU_InfoHeader.h
)
waLBerla_add_executable(NAME UniformGridCPU_${config}
FILES UniformGridCPU.cpp
DEPENDS blockforest boundary core domain_decomposition field geometry python_coupling timeloop vtk UniformGridCPUGenerated_${config})
# All configurations except pull d3q27 are excluded from the 'all' target.
if (${streaming_pattern} STREQUAL "pull" AND ${stencil} STREQUAL "d3q27")
set_target_properties( UniformGridCPUGenerated_${config} PROPERTIES EXCLUDE_FROM_ALL FALSE)
set_target_properties( UniformGridCPU_${config} PROPERTIES EXCLUDE_FROM_ALL FALSE)
else()
set_target_properties( UniformGridCPUGenerated_${config} PROPERTIES EXCLUDE_FROM_ALL TRUE)
set_target_properties( UniformGridCPU_${config} PROPERTIES EXCLUDE_FROM_ALL TRUE)
endif(${streaming_pattern} STREQUAL "pull" AND ${stencil} STREQUAL "d3q27")
endforeach ()
endforeach()
endforeach()
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file UniformGridCPU.cpp
//! \author Markus Holzer <markus.holzer@fau.de>
//
//======================================================================================================================
#include "blockforest/Initialization.h"
#include "blockforest/communication/UniformBufferedScheme.h"
#include "core/Environment.h"
#include "core/OpenMP.h"
#include "core/logging/Initialization.h"
#include "core/timing/RemainingTimeLogger.h"
#include "core/timing/TimingPool.h"
#include "domain_decomposition/SharedSweep.h"
#include "field/AddToStorage.h"
#include "field/vtk/VTKWriter.h"
#include "geometry/InitBoundaryHandling.h"
#include "lbm/communication/CombinedInPlaceCpuPackInfo.h"
#include "python_coupling/CreateConfig.h"
#include "python_coupling/DictWrapper.h"
#include "python_coupling/PythonCallback.h"
#include "timeloop/all.h"
#include <iomanip>
#include "InitShearVelocity.h"
#include "ManualKernels.h"
#include "UniformGridCPU_InfoHeader.h"
using namespace walberla;
using PackInfoEven_T = lbm::UniformGridCPU_PackInfoEven;
using PackInfoOdd_T = lbm::UniformGridCPU_PackInfoOdd;
using LbSweep = lbm::UniformGridCPU_LbKernel;
using FlagField_T = FlagField< uint8_t >;
auto pdfFieldAdder = [](IBlock* const block, StructuredBlockStorage* const storage) {
return new PdfField_T(storage->getNumberOfXCells(*block), storage->getNumberOfYCells(*block),
storage->getNumberOfZCells(*block), uint_t(1), field::fzyx,
make_shared< field::AllocateAligned< real_t, 64 > >());
};
int main(int argc, char** argv)
{
mpi::Environment env(argc, argv);
for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg)
{
WALBERLA_MPI_WORLD_BARRIER()
auto config = *cfg;
logging::configureLogging(config);
auto blocks = blockforest::createUniformBlockGridFromConfig(config);
Vector3< uint_t > cellsPerBlock =
config->getBlock("DomainSetup").getParameter< Vector3< uint_t > >("cellsPerBlock");
// Reading parameters
auto parameters = config->getOneBlock("Parameters");
const real_t omega = parameters.getParameter< real_t >("omega", real_c(1.4));
const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(50));
const bool initShearFlow = parameters.getParameter< bool >("initShearFlow", true);
// Creating fields
BlockDataID pdfFieldId = blocks->addStructuredBlockData< PdfField_T >(pdfFieldAdder, "pdfs");
BlockDataID velFieldId = field::addToStorage< VelocityField_T >(blocks, "vel", real_t(0), field::fzyx);
BlockDataID densityFieldId = field::addToStorage< ScalarField_T >(blocks, "density", real_t(1.0), field::fzyx);
// Initialize velocity on cpu
if (initShearFlow)
{
WALBERLA_LOG_INFO_ON_ROOT("Initializing shear flow")
initShearVelocity(blocks, velFieldId);
}
pystencils::UniformGridCPU_MacroSetter setterSweep(densityFieldId, pdfFieldId, velFieldId);
pystencils::UniformGridCPU_MacroGetter getterSweep(densityFieldId, pdfFieldId, velFieldId);
// Set up initial PDF values
for (auto& block : *blocks)
setterSweep(&block);
Vector3< int > innerOuterSplit =
parameters.getParameter< Vector3< int > >("innerOuterSplit", Vector3< int >(1, 1, 1));
for (uint_t i = 0; i < 3; ++i)
{
if (int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2)
{
WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock")
}
}
Cell innerOuterSplitCell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]);
LbSweep lbSweep(pdfFieldId, omega, innerOuterSplitCell);
pystencils::UniformGridCPU_StreamOnlyKernel StreamOnlyKernel(pdfFieldId);
// Boundaries
const FlagUID fluidFlagUID("Fluid");
BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field");
auto boundariesConfig = config->getBlock("Boundaries");
bool boundaries = false;
if (boundariesConfig)
{
WALBERLA_LOG_INFO_ON_ROOT("Setting boundary conditions")
boundaries = true;
geometry::initBoundaryHandling< FlagField_T >(*blocks, flagFieldID, boundariesConfig);
geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldID, fluidFlagUID);
}
lbm::UniformGridCPU_NoSlip noSlip(blocks, pdfFieldId);
noSlip.fillFromFlagField< FlagField_T >(blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID);
lbm::UniformGridCPU_UBB ubb(blocks, pdfFieldId);
ubb.fillFromFlagField< FlagField_T >(blocks, flagFieldID, FlagUID("UBB"), fluidFlagUID);
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// COMMUNICATION SCHEME ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Initial setup is the post-collision state of an even time step
auto tracker = make_shared< lbm::TimestepTracker >(0);
auto packInfo =
make_shared< lbm::CombinedInPlaceCpuPackInfo< PackInfoEven_T , PackInfoOdd_T > >(tracker, pdfFieldId);
blockforest::communication::UniformBufferedScheme< Stencil_T > communication(blocks);
communication.addPackInfo(packInfo);
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// TIME STEP DEFINITIONS ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
auto boundarySweep = [&](IBlock* block, uint8_t t) {
noSlip.run(block, t);
ubb.run(block, t);
};
auto boundaryInner = [&](IBlock* block, uint8_t t) {
noSlip.inner(block, t);
ubb.inner(block, t);
};
auto boundaryOuter = [&](IBlock* block, uint8_t t) {
noSlip.outer(block, t);
ubb.outer(block, t);
};
auto simpleOverlapTimeStep = [&]() {
// Communicate post-collision values of previous timestep...
communication.startCommunication();
for (auto& block : *blocks)
{
if (boundaries) boundaryInner(&block, tracker->getCounter());
lbSweep.inner(&block, tracker->getCounterPlusOne());
}
communication.wait();
for (auto& block : *blocks)
{
if (boundaries) boundaryOuter(&block, tracker->getCounter());
lbSweep.outer(&block, tracker->getCounterPlusOne());
}
tracker->advance();
};
auto normalTimeStep = [&]() {
communication.communicate();
for (auto& block : *blocks)
{
if (boundaries) boundarySweep(&block, tracker->getCounter());
lbSweep(&block, tracker->getCounterPlusOne());
}
tracker->advance();
};
// With two-field patterns, ghost layer cells act as constant stream-in boundaries;
// with in-place patterns, ghost layer cells act as wet-node no-slip boundaries.
auto kernelOnlyFunc = [&]() {
tracker->advance();
for (auto& block : *blocks)
lbSweep(&block, tracker->getCounter());
};
// Stream-only function to test a streaming pattern without executing LBM operations
auto StreamOnlyFunc = [&]() {
for (auto& block : *blocks)
StreamOnlyKernel(&block);
};
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// TIME LOOP SETUP ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
SweepTimeloop timeLoop(blocks->getBlockStorage(), timesteps);
const std::string timeStepStrategy = parameters.getParameter< std::string >("timeStepStrategy", "normal");
std::function< void() > timeStep;
if (timeStepStrategy == "noOverlap")
timeStep = std::function< void() >(normalTimeStep);
else if (timeStepStrategy == "simpleOverlap")
timeStep = simpleOverlapTimeStep;
else if (timeStepStrategy == "kernelOnly")
{
WALBERLA_LOG_INFO_ON_ROOT(
"Running only compute kernel without boundary - this makes only sense for benchmarking!")
// Run initial communication once to provide any missing stream-in populations
communication.communicate();
timeStep = kernelOnlyFunc;
}
else if (timeStepStrategy == "StreamOnly")
{
WALBERLA_LOG_INFO_ON_ROOT(
"Running only streaming kernel without LBM - this makes only sense for benchmarking!")
// Run initial communication once to provide any missing stream-in populations
timeStep = StreamOnlyFunc;
}
else
{
WALBERLA_ABORT_NO_DEBUG_INFO("Invalid value for 'timeStepStrategy'. Allowed values are 'noOverlap', "
"'simpleOverlap', 'kernelOnly'")
}
timeLoop.add() << BeforeFunction(timeStep) << Sweep([](IBlock*) {}, "time step");
uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0);
if (vtkWriteFrequency > 0)
{
auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
"simulation_step", false, true, true, false, 0);
auto velWriter = make_shared< field::VTKWriter< VelocityField_T > >(velFieldId, "vel");
vtkOutput->addCellDataWriter(velWriter);
vtkOutput->addBeforeFunction([&]() {
for (auto& block : *blocks){
getterSweep(&block);}
});
timeLoop.addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// BENCHMARK ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
int warmupSteps = parameters.getParameter< int >("warmupSteps", 2);
int outerIterations = parameters.getParameter< int >("outerIterations", 1);
for (int i = 0; i < warmupSteps; ++i)
timeLoop.singleStep();
real_t remainingTimeLoggerFrequency =
parameters.getParameter< real_t >("remainingTimeLoggerFrequency", -1.0); // in seconds
if (remainingTimeLoggerFrequency > 0)
{
auto logger = timing::RemainingTimeLogger(timeLoop.getNrOfTimeSteps() * uint_c(outerIterations),
remainingTimeLoggerFrequency);
timeLoop.addFuncAfterTimeStep(logger, "remaining time logger");
}
for (int outerIteration = 0; outerIteration < outerIterations; ++outerIteration)
{
timeLoop.setCurrentTimeStepToZero();
WcTimer simTimer;
WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps")
simTimer.start();
timeLoop.run();
simTimer.end();
WALBERLA_LOG_INFO_ON_ROOT("Simulation finished")
real_t time = simTimer.last();
WALBERLA_MPI_SECTION()
{
walberla::mpi::reduceInplace(time, walberla::mpi::MAX);
}
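// MLUPS = million lattice (cell) updates per second, computed per process from the cells
// of one block (one block per process is assumed here) and the maximum runtime over all ranks.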
auto nrOfCells = real_c(cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2]);
auto mlupsPerProcess = nrOfCells * real_c(timesteps) / time * 1e-6;
WALBERLA_LOG_RESULT_ON_ROOT("MLUPS per process " << mlupsPerProcess)
WALBERLA_LOG_RESULT_ON_ROOT("Time per time step " << time / real_c(timesteps))
WALBERLA_ROOT_SECTION()
{
python_coupling::PythonCallback pythonCallbackResults("results_callback");
if (pythonCallbackResults.isCallable())
{
pythonCallbackResults.data().exposeValue("mlupsPerProcess", mlupsPerProcess);
pythonCallbackResults.data().exposeValue("stencil", infoStencil);
pythonCallbackResults.data().exposeValue("streamingPattern", infoStreamingPattern);
pythonCallbackResults.data().exposeValue("collisionSetup", infoCollisionSetup);
pythonCallbackResults.data().exposeValue("cse_global", infoCseGlobal);
pythonCallbackResults.data().exposeValue("cse_pdfs", infoCsePdfs);
// Call Python function to report results
pythonCallbackResults();
}
}
}
}
return EXIT_SUCCESS;
}
from dataclasses import replace
import sympy as sp
import pystencils as ps
from pystencils.fast_approximation import insert_fast_divisions, insert_fast_sqrts
from pystencils.simp.subexpression_insertion import insert_zeros, insert_aliases, insert_constants,\
insert_symbol_times_minus_one
from lbmpy.advanced_streaming import Timestep, is_inplace
from lbmpy.advanced_streaming.utility import streaming_patterns
from lbmpy.boundaries import NoSlip, UBB
from lbmpy.creationfunctions import LBMConfig, LBMOptimisation, LBStencil, create_lb_collision_rule
from lbmpy.enums import Method, Stencil
from lbmpy.fieldaccess import CollideOnlyInplaceAccessor
from lbmpy.macroscopic_value_kernels import macroscopic_values_getter, macroscopic_values_setter
from lbmpy.updatekernels import create_stream_only_kernel
from pystencils_walberla import CodeGeneration, generate_pack_info_from_kernel, generate_sweep,\
generate_mpidtype_info_from_kernel, generate_info_header
from lbmpy_walberla import generate_alternating_lbm_sweep, generate_alternating_lbm_boundary, generate_lb_pack_info
omega = sp.symbols('omega')
omega_free = sp.Symbol('omega_free')
# best configs in terms of FLOPS
options_dict = {
'srt': {
'method': Method.SRT,
'relaxation_rate': omega,
'compressible': False,
},
'trt': {
'method': Method.TRT,
'relaxation_rate': omega,
'compressible': False,
},
'mrt': {
'method': Method.MRT,
'relaxation_rates': [omega, 1, 1, 1, 1, 1, 1],
'compressible': False,
},
'mrt-overrelax': {
'method': Method.MRT,
'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)],
'compressible': False,
},
'central': {
'method': Method.CENTRAL_MOMENT,
'relaxation_rate': omega,
'compressible': True,
},
'central-overrelax': {
'method': Method.CENTRAL_MOMENT,
'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)],
'compressible': True,
},
'cumulant': {
'method': Method.MONOMIAL_CUMULANT,
'relaxation_rate': omega,
'compressible': True,
},
'cumulant-overrelax': {
'method': Method.MONOMIAL_CUMULANT,
'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 18)],
'compressible': True,
},
'entropic': {
'method': Method.TRT_KBC_N4,
'compressible': True,
'relaxation_rates': [omega, omega_free],
'entropic': True,
'entropic_newton_iterations': False
},
'smagorinsky': {
'method': Method.SRT,
'smagorinsky': True,
'relaxation_rate': omega,
}
}
info_header = """
const char * infoStencil = "{stencil}";
const char * infoStreamingPattern = "{streaming_pattern}";
const char * infoCollisionSetup = "{collision_setup}";
const bool infoCseGlobal = {cse_global};
const bool infoCsePdfs = {cse_pdfs};
"""
# DEFAULTS
optimize = True
with CodeGeneration() as ctx:
openmp = bool(ctx.openmp)
field_type = "float64" if ctx.double_accuracy else "float32"
if ctx.optimize_for_localhost:
cpu_vec = {"nontemporal": True, "assume_aligned": True}
else:
cpu_vec = None
config_tokens = ctx.config.split('_')
assert len(config_tokens) >= 3
stencil_str = config_tokens[0]
streaming_pattern = config_tokens[1]
collision_setup = config_tokens[2]
if len(config_tokens) >= 4:
optimize = (config_tokens[3] != 'noopt')
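# For illustration: a CODEGEN_CFG string such as "d3q19_aa_smagorinsky" is split into
# stencil "d3q19", streaming pattern "aa" and collision setup "smagorinsky"; an optional
# fourth token equal to "noopt" would disable the optimisation passes applied below.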
if stencil_str == "d3q27":
stencil = LBStencil(Stencil.D3Q27)
elif stencil_str == "d3q19":
stencil = LBStencil(Stencil.D3Q19)
else:
raise ValueError("Only D3Q27 and D3Q19 stencil are supported at the moment")
assert streaming_pattern in streaming_patterns, f"Invalid streaming pattern: {streaming_pattern}"
options = options_dict[collision_setup]
q = stencil.Q
dim = stencil.D
assert dim == 3, "This app supports only three-dimensional stencils"
pdfs, pdfs_tmp = ps.fields(f"pdfs({q}), pdfs_tmp({q}): {field_type}[3D]", layout='fzyx')
density_field, velocity_field = ps.fields(f"density, velocity(3) : {field_type}[3D]", layout='fzyx')
lbm_config = LBMConfig(stencil=stencil, field_name=pdfs.name, streaming_pattern=streaming_pattern, **options)
lbm_opt = LBMOptimisation(cse_global=True, cse_pdfs=False, symbolic_field=pdfs, field_layout='fzyx')
if not is_inplace(streaming_pattern):
lbm_opt = replace(lbm_opt, symbolic_temporary_field=pdfs_tmp)
field_swaps = [(pdfs, pdfs_tmp)]
else:
field_swaps = []
# Stream-only sweep for benchmarking the bare streaming pattern without LBM collision.
# is_inplace is set to False to ensure that streaming is done with separate src and dst fields.
# Otherwise the compiler might simplify the streaming in a way that makes the benchmark meaningless.
accessor = CollideOnlyInplaceAccessor()
accessor.is_inplace = False
field_swaps_stream_only = [(pdfs, pdfs_tmp)]
stream_only_kernel = create_stream_only_kernel(stencil, pdfs, pdfs_tmp, accessor=accessor)
# LB Sweep
collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt)
if optimize:
collision_rule = insert_fast_divisions(collision_rule)
collision_rule = insert_fast_sqrts(collision_rule)
collision_rule = insert_constants(collision_rule)
collision_rule = insert_zeros(collision_rule)
collision_rule = insert_aliases(collision_rule)
collision_rule = insert_symbol_times_minus_one(collision_rule)
lb_method = collision_rule.method
generate_alternating_lbm_sweep(ctx, 'UniformGridCPU_LbKernel', collision_rule, lbm_config=lbm_config,
lbm_optimisation=lbm_opt, target=ps.Target.CPU,
inner_outer_split=True, field_swaps=field_swaps,
cpu_openmp=openmp, cpu_vectorize_info=cpu_vec)
# getter & setter
setter_assignments = macroscopic_values_setter(lb_method,
density=density_field.center, velocity=velocity_field.center_vector,
pdfs=pdfs,
streaming_pattern=streaming_pattern,
previous_timestep=Timestep.EVEN)
getter_assignments = macroscopic_values_getter(lb_method,
density=density_field, velocity=velocity_field,
pdfs=pdfs,
streaming_pattern=streaming_pattern,
previous_timestep=Timestep.EVEN)
generate_sweep(ctx, 'UniformGridCPU_MacroSetter', setter_assignments, target=ps.Target.CPU, cpu_openmp=openmp)
generate_sweep(ctx, 'UniformGridCPU_MacroGetter', getter_assignments, target=ps.Target.CPU, cpu_openmp=openmp)
# Stream only kernel
generate_sweep(ctx, 'UniformGridCPU_StreamOnlyKernel', stream_only_kernel, field_swaps=field_swaps_stream_only,
target=ps.Target.CPU, cpu_openmp=openmp)
# Boundaries
noslip = NoSlip()
ubb = UBB((0.05, 0, 0), data_type=field_type)
generate_alternating_lbm_boundary(ctx, 'UniformGridCPU_NoSlip', noslip, lb_method, field_name=pdfs.name,
streaming_pattern=streaming_pattern, target=ps.Target.CPU, cpu_openmp=openmp)
generate_alternating_lbm_boundary(ctx, 'UniformGridCPU_UBB', ubb, lb_method, field_name=pdfs.name,
streaming_pattern=streaming_pattern, target=ps.Target.CPU, cpu_openmp=openmp)
# communication
generate_lb_pack_info(ctx, 'UniformGridCPU_PackInfo', stencil, pdfs,
streaming_pattern=streaming_pattern, target=ps.Target.CPU,
always_generate_separate_classes=True)
infoHeaderParams = {
'stencil': stencil_str,
'streaming_pattern': streaming_pattern,
'collision_setup': collision_setup,
'cse_global': int(lbm_opt.cse_global),
'cse_pdfs': int(lbm_opt.cse_pdfs),
}
stencil_typedefs = {'Stencil_T': stencil,
'CommunicationStencil_T': stencil}
field_typedefs = {'PdfField_T': pdfs,
'VelocityField_T': velocity_field,
'ScalarField_T': density_field}
# Info header containing correct template definitions for stencil and field
generate_info_header(ctx, 'UniformGridCPU_InfoHeader',
stencil_typedefs=stencil_typedefs, field_typedefs=field_typedefs,
additional_code=info_header.format(**infoHeaderParams))
import os
import waLBerla as wlb
from waLBerla.tools.config import block_decomposition
from waLBerla.tools.sqlitedb import sequenceValuesToScalars, checkAndUpdateSchema, storeSingle
import sys
import sqlite3
from math import prod
# Number of time steps run for a workload of 128^3 cells per process;
# if twice as many cells are on the process, half as many time steps are run, etc.
# Increase this to get more reliable measurements.
TIME_STEPS_FOR_128_BLOCK = 5
DB_FILE = os.environ.get('DB_FILE', "cpu_benchmark.sqlite3")
def num_time_steps(block_size, time_steps_for_128_block=TIME_STEPS_FOR_128_BLOCK):
cells = block_size[0] * block_size[1] * block_size[2]
time_steps = (128 ** 3 / cells) * time_steps_for_128_block
return int(time_steps)
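# Example (sketch): a 64^3 block holds 1/8 of the cells of a 128^3 block, so
# num_time_steps((64, 64, 64)) returns 8 * TIME_STEPS_FOR_128_BLOCK = 40.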
ldc_setup = {'Border': [
{'direction': 'W', 'walldistance': -1, 'flag': 'NoSlip'},
{'direction': 'S', 'walldistance': -1, 'flag': 'NoSlip'},
{'direction': 'N', 'walldistance': -1, 'flag': 'UBB'},
{'direction': 'E', 'walldistance': -1, 'flag': 'NoSlip'},
{'direction': 'T', 'walldistance': -1, 'flag': 'NoSlip'},
{'direction': 'B', 'walldistance': -1, 'flag': 'NoSlip'},
]}
class Scenario:
def __init__(self, cells_per_block=(128, 128, 128), periodic=(1, 1, 1),
timesteps=None, time_step_strategy="normal", omega=1.8, inner_outer_split=(1, 1, 1),
warmup_steps=2, outer_iterations=3, init_shear_flow=False, boundary_setup=False,
vtk_write_frequency=0, remaining_time_logger_frequency=-1):
if boundary_setup:
init_shear_flow = False
periodic = (0, 0, 0)
self.blocks = block_decomposition(wlb.mpi.numProcesses())
self.cells_per_block = cells_per_block
self.periodic = periodic
self.time_step_strategy = time_step_strategy
self.omega = omega
self.timesteps = timesteps if timesteps else num_time_steps(cells_per_block)
self.inner_outer_split = inner_outer_split
self.init_shear_flow = init_shear_flow
self.boundary_setup = boundary_setup
self.warmup_steps = warmup_steps
self.outer_iterations = outer_iterations
self.vtk_write_frequency = vtk_write_frequency
self.remaining_time_logger_frequency = remaining_time_logger_frequency
self.config_dict = self.config(print_dict=False)
@wlb.member_callback
def config(self, print_dict=True):
from pprint import pformat
config_dict = {
'DomainSetup': {
'blocks': self.blocks,
'cellsPerBlock': self.cells_per_block,
'periodic': self.periodic,
},
'Parameters': {
'omega': self.omega,
'warmupSteps': self.warmup_steps,
'outerIterations': self.outer_iterations,
'timeStepStrategy': self.time_step_strategy,
'timesteps': self.timesteps,
'initShearFlow': self.init_shear_flow,
'innerOuterSplit': self.inner_outer_split,
'vtkWriteFrequency': self.vtk_write_frequency,
'remainingTimeLoggerFrequency': self.remaining_time_logger_frequency
}
}
if self.boundary_setup:
config_dict["Boundaries"] = ldc_setup
if print_dict:
wlb.log_info_on_root("Scenario:\n" + pformat(config_dict))
return config_dict
@wlb.member_callback
def results_callback(self, **kwargs):
data = {}
data.update(self.config_dict['Parameters'])
data.update(self.config_dict['DomainSetup'])
data.update(kwargs)
data['executable'] = sys.argv[0]
data['compile_flags'] = wlb.build_info.compiler_flags
data['walberla_version'] = wlb.build_info.version
data['build_machine'] = wlb.build_info.build_machine
sequenceValuesToScalars(data)
result = data
sequenceValuesToScalars(result)
num_tries = 4
# retry a few times; writing may fail when multiple benchmark processes access the database concurrently
table_name = f"runs_{data['stencil']}_{data['streamingPattern']}_{data['collisionSetup']}_{prod(self.blocks)}"
table_name = table_name.replace("-", "_")
for num_try in range(num_tries):
try:
checkAndUpdateSchema(result, table_name, DB_FILE)
storeSingle(result, table_name, DB_FILE)
break
except sqlite3.OperationalError as e:
wlb.log_warning(f"Sqlite DB writing failed: try {num_try + 1}/{num_tries} {str(e)}")
# -------------------------------------- Profiling -----------------------------------
def profiling():
"""Tests different communication overlapping strategies"""
wlb.log_info_on_root("Running 2 timesteps for profiling")
wlb.log_info_on_root("")
scenarios = wlb.ScenarioManager()
cells = (128, 128, 128)
scenarios.add(Scenario(cells_per_block=cells, time_step_strategy='kernelOnly',
inner_outer_split=(1, 1, 1), timesteps=2,
outer_iterations=1, warmup_steps=0))
# -------------------------------------- Functions trying different parameter sets -----------------------------------
def overlap_benchmark():
"""Tests different communication overlapping strategies"""
wlb.log_info_on_root("Running different communication overlap strategies")
wlb.log_info_on_root("")
scenarios = wlb.ScenarioManager()
inner_outer_splits = [(1, 1, 1), (4, 1, 1)]
cells_per_block = [(i, i, i) for i in (16, 32, 64, 128)]
for cell_per_block in cells_per_block:
scenarios.add(Scenario(time_step_strategy='noOverlap',
inner_outer_split=(1, 1, 1),
cells_per_block=cell_per_block))
for inner_outer_split in inner_outer_splits:
scenario = Scenario(time_step_strategy='simpleOverlap',
inner_outer_split=inner_outer_split,
cells_per_block=cell_per_block)
scenarios.add(scenario)
def scaling_benchmark():
"""Tests different communication overlapping strategies"""
wlb.log_info_on_root("Running scaling benchmark")
wlb.log_info_on_root("")
scenarios = wlb.ScenarioManager()
cells_per_block = [(32, 32, 32), (128, 128, 128)]
for cell_per_block in cells_per_block:
scenarios.add(Scenario(time_step_strategy='noOverlap',
inner_outer_split=(1, 1, 1),
cells_per_block=cell_per_block))
def single_node_benchmark():
"""Benchmarks only the LBM compute kernel"""
wlb.log_info_on_root("Running single Node benchmarks")
wlb.log_info_on_root("")
scenarios = wlb.ScenarioManager()
block_sizes = [(i, i, i) for i in (8, 16, 32, 64, 128)]
for block_size in block_sizes:
scenario = Scenario(cells_per_block=block_size,
time_step_strategy='kernelOnly',
timesteps=num_time_steps(block_size))
scenarios.add(scenario)
def validation_run():
"""Run with full periodic shear flow or boundary scenario (ldc) to check if the code works"""
wlb.log_info_on_root("Validation run")
wlb.log_info_on_root("")
time_step_strategy = 'simpleOverlap' # 'noOverlap'
scenarios = wlb.ScenarioManager()
scenario = Scenario(cells_per_block=(64, 64, 64),
time_step_strategy=time_step_strategy,
timesteps=101,
outer_iterations=1,
warmup_steps=0,
init_shear_flow=True,
boundary_setup=False,
vtk_write_frequency=100,
remaining_time_logger_frequency=10)
scenarios.add(scenario)
wlb.log_info_on_root(f"Batch run of benchmark scenarios, saving result to {DB_FILE}")
# Select the benchmark you want to run
single_node_benchmark() # benchmarks different domain sizes and measures single-process
# performance of the compute kernel (no communication)
# overlap_benchmark() # benchmarks different communication overlap options
# profiling() # run only two timesteps on a smaller domain for profiling only
# validation_run()
# scaling_benchmark()
waLBerla_link_files_to_builddir( "*.prm" )
waLBerla_link_files_to_builddir( "*.py" )
foreach(config srt trt smagorinsky mrt entropic_kbc_n4 cumulant )
waLBerla_generate_target_from_python(NAME UniformGridGenerated_${config}
CODEGEN_CFG ${config}
FILE UniformGridGenerated.py
OUT_FILES GenMacroGetter.cpp GenMacroGetter.h GenMacroSetter.cpp GenMacroSetter.h
GenPackInfo.cpp GenPackInfo.h GenPackInfoAAPush.cpp GenPackInfoAAPush.h GenPackInfoAAPull.cpp GenPackInfoAAPull.h
GenLbKernel.cpp GenLbKernel.h GenLbKernelAAEven.cpp GenLbKernelAAEven.h GenLbKernelAAOdd.cpp GenLbKernelAAOdd.h
GenMpiDtypeInfo.h GenMpiDtypeInfoAAPull.h GenMpiDtypeInfoAAPush.h
GenDefines.h)
waLBerla_add_executable ( NAME UniformGridBenchmarkGenerated_${config}
FILES UniformGridGenerated.cpp
DEPENDS blockforest boundary core domain_decomposition field geometry timeloop vtk gui
UniformGridGenerated_${config} python_coupling)
endforeach()
DomainSetup
{
blocks < 1, 1, 1 >;
cellsPerBlock < 64, 64, 64 >;
periodic < 1, 1, 1 >;
}
Parameters
{
timesteps 200; // time steps of one performance measurement
warmupSteps 1; // number of steps to run before measurement starts
outerIterations 4; // how many measurements to conduct
vtkWriteFrequency 0; // write a VTK file every n'th step, if zero VTK output is disabled
timeStepMode twoField;
//twoFieldKernelType manualD3Q19;
remainingTimeLoggerFrequency 6; // interval in seconds to log the estimated remaining time
directComm 0;
omega 1.8;
shearVelocityMagnitude 0.02;
useGui 0;
}
#include "core/Environment.h"
#include "core/logging/Initialization.h"
#include "core/OpenMP.h"
#include "python_coupling/CreateConfig.h"
#include "python_coupling/PythonCallback.h"
#include "python_coupling/DictWrapper.h"
#include "blockforest/Initialization.h"
#include "field/vtk/VTKWriter.h"
#include "field/AddToStorage.h"
#include "blockforest/communication/UniformBufferedScheme.h"
#include "blockforest/communication/UniformDirectScheme.h"
#include "timeloop/all.h"
#include "core/timing/TimingPool.h"
#include "core/timing/RemainingTimeLogger.h"
#include "core/waLBerlaBuildInfo.h"
#include "domain_decomposition/SharedSweep.h"
#include "gui/Gui.h"
#include "InitShearVelocity.h"
#include "ManualKernels.h"
#include "lbm/lattice_model/D3Q19.h"
#include "GenDefines.h"
#include <iomanip>
using namespace walberla;
int main( int argc, char **argv )
{
mpi::Environment env( argc, argv );
for( auto cfg = python_coupling::configBegin( argc, argv ); cfg != python_coupling::configEnd(); ++cfg )
{
WALBERLA_MPI_WORLD_BARRIER()
auto config = *cfg;
logging::configureLogging( config );
auto blocks = blockforest::createUniformBlockGridFromConfig( config );
Vector3<uint_t> cellsPerBlock = config->getBlock( "DomainSetup" ).getParameter<Vector3<uint_t> >( "cellsPerBlock" );
// Reading parameters
auto parameters = config->getOneBlock( "Parameters" );
const std::string timeStepMode = parameters.getParameter<std::string>( "timeStepMode", "twoField");
const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 ));
uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 60 ));
const real_t shearVelocityMagnitude = parameters.getParameter<real_t>("shearVelocityMagnitude", real_t(0.08));
const bool directComm = parameters.getParameter<bool>("directComm", false);
auto pdfFieldAdder = [](IBlock* const block, StructuredBlockStorage * const storage) {
return new PdfField_T(storage->getNumberOfXCells(*block),
storage->getNumberOfYCells(*block),
storage->getNumberOfZCells(*block),
uint_t(1),
field::fzyx,
make_shared<field::AllocateAligned<real_t, 64>>());
};
// Creating fields
BlockDataID pdfFieldId = blocks->addStructuredBlockData<PdfField_T>(pdfFieldAdder, "pdfs");
BlockDataID velFieldId = field::addToStorage< VelocityField_T >( blocks, "vel", real_t( 0 ), field::fzyx );
pystencils::GenMacroSetter setterKernel(pdfFieldId, velFieldId);
pystencils::GenMacroGetter getterKernel(pdfFieldId, velFieldId);
if( shearVelocityMagnitude > 0 )
initShearVelocity(blocks, velFieldId, shearVelocityMagnitude);
for( auto & b : *blocks)
setterKernel(&b);
// Buffered Comm
blockforest::communication::UniformBufferedScheme< Stencil_T > twoFieldComm(blocks );
twoFieldComm.addPackInfo(make_shared< pystencils::GenPackInfo >(pdfFieldId ) );
blockforest::communication::UniformBufferedScheme< Stencil_T > aaPullComm(blocks);
aaPullComm.addPackInfo(make_shared< pystencils::GenPackInfoAAPull>(pdfFieldId));
blockforest::communication::UniformBufferedScheme< Stencil_T > aaPushComm(blocks);
aaPushComm.addPackInfo(make_shared< pystencils::GenPackInfoAAPush>(pdfFieldId));
// Direct Comm
blockforest::communication::UniformDirectScheme< Stencil_T > twoFieldCommDirect(blocks);
twoFieldCommDirect.addDataToCommunicate(make_shared<pystencils::GenMpiDtypeInfo>(pdfFieldId));
blockforest::communication::UniformDirectScheme< Stencil_T > aaPullCommDirect(blocks);
aaPullCommDirect.addDataToCommunicate(make_shared<pystencils::GenMpiDtypeInfoAAPull>(pdfFieldId));
blockforest::communication::UniformDirectScheme< Stencil_T > aaPushCommDirect(blocks);
aaPushCommDirect.addDataToCommunicate(make_shared<pystencils::GenMpiDtypeInfoAAPush>(pdfFieldId));
const std::string twoFieldKernelType = parameters.getParameter<std::string>( "twoFieldKernelType", "generated");
std::function<void(IBlock*)> twoFieldKernel;
if( twoFieldKernelType == "generated") {
twoFieldKernel = pystencils::GenLbKernel(pdfFieldId, omega);
} else if (twoFieldKernelType == "manualGeneric") {
using MyLM = lbm::D3Q19<lbm::collision_model::SRT>;
BlockDataID tmpPdfFieldId = blocks->addStructuredBlockData<PdfField_T>(pdfFieldAdder, "pdfs");
twoFieldKernel = StreamPullCollideGeneric<MyLM>(pdfFieldId, tmpPdfFieldId, omega);
} else if (twoFieldKernelType == "manualD3Q19") {
using MyLM = lbm::D3Q19<lbm::collision_model::SRT>;
BlockDataID tmpPdfFieldId = blocks->addStructuredBlockData<PdfField_T>(pdfFieldAdder, "pdfs");
twoFieldKernel = StreamPullCollideD3Q19<MyLM>(pdfFieldId, tmpPdfFieldId, omega);
} else {
WALBERLA_ABORT_NO_DEBUG_INFO("Invalid option for \"twoFieldKernelType\", "
"valid options are \"generated\", \"manualGeneric\", \"manualD3Q19\"")
}
using F = std::function<void()>;
SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps / 2 );
if( timeStepMode == "twoField")
{
timeLoop.add() << BeforeFunction(directComm ? F(twoFieldCommDirect) : F(twoFieldComm), "communication" )
<< Sweep( twoFieldKernel, "LB stream & collide1" );
timeLoop.add() << BeforeFunction(directComm ? F(twoFieldCommDirect) : F(twoFieldComm), "communication" )
<< Sweep( twoFieldKernel, "LB stream & collide2" );
} else if ( timeStepMode == "twoFieldKernelOnly") {
timeLoop.add() << Sweep( pystencils::GenLbKernel(pdfFieldId, omega), "LB stream & collide1" );
timeLoop.add() << Sweep( pystencils::GenLbKernel(pdfFieldId, omega), "LB stream & collide2" );
} else if ( timeStepMode == "aa") {
timeLoop.add() << Sweep( pystencils::GenLbKernelAAEven(pdfFieldId, omega), "AA Even" );
timeLoop.add() << BeforeFunction( directComm ? F(aaPullCommDirect) : F(aaPullComm) )
<< Sweep( pystencils::GenLbKernelAAOdd(pdfFieldId, omega), "AA Odd")
<< AfterFunction( directComm ? F(aaPushCommDirect) : F(aaPushComm) );
} else if ( timeStepMode == "aaKernelOnly") {
timeLoop.add() << Sweep( pystencils::GenLbKernelAAEven(pdfFieldId, omega), "AA Even" );
timeLoop.add() << Sweep( pystencils::GenLbKernelAAOdd(pdfFieldId, omega), "AA Odd");
} else {
WALBERLA_ABORT("Invalid value for timeStepMode")
}
int warmupSteps = parameters.getParameter<int>( "warmupSteps", 2 );
int outerIterations = parameters.getParameter<int>( "outerIterations", 1 );
for(int i=0; i < warmupSteps; ++i )
timeLoop.singleStep();
auto remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", -1.0 ); // in seconds
if (remainingTimeLoggerFrequency > 0) {
auto logger = timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps() * uint_c( outerIterations ), remainingTimeLoggerFrequency );
timeLoop.addFuncAfterTimeStep( logger, "remaining time logger" );
}
// VTK
uint_t vtkWriteFrequency = parameters.getParameter<uint_t>( "vtkWriteFrequency", 0 );
if( vtkWriteFrequency > 0 )
{
auto vtkOutput = vtk::createVTKOutput_BlockData( *blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
"simulation_step", false, true, true, false, 0 );
auto velWriter = make_shared< field::VTKWriter< VelocityField_T > >( velFieldId, "vel" );
vtkOutput->addCellDataWriter( velWriter );
vtkOutput->addBeforeFunction( [&]()
{ for( auto & b : *blocks)
getterKernel(&b);
} );
timeLoop.addFuncAfterTimeStep( vtk::writeFiles( vtkOutput ), "VTK Output" );
}
bool useGui = parameters.getParameter<bool>( "useGui", false );
if( useGui )
{
GUI gui( timeLoop, blocks, argc, argv);
gui.run();
}
else
{
for ( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration )
{
timeLoop.setCurrentTimeStepToZero();
WcTimer simTimer;
auto threads = omp_get_max_threads();
simTimer.start();
timeLoop.run();
simTimer.end();
auto time = simTimer.last();
auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] );
auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
using std::setw;
WALBERLA_LOG_INFO_ON_ROOT(setw(18) << timeStepMode <<
" procs: " << setw(6) << MPIManager::instance()->numProcesses() <<
" threads: " << threads <<
" direct_comm: " << directComm <<
" time steps: " << timesteps <<
setw(15) << " block size: " << cellsPerBlock <<
" mlups/core: " << int(mlupsPerProcess/ threads) <<
" mlups: " << int(mlupsPerProcess) * MPIManager::instance()->numProcesses())
WALBERLA_ROOT_SECTION()
{
python_coupling::PythonCallback pythonCallbackResults( "results_callback" );
if ( pythonCallbackResults.isCallable())
{
pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess );
pythonCallbackResults.data().exposeValue( "stencil", infoStencil );
pythonCallbackResults.data().exposeValue( "configName", infoConfigName );
pythonCallbackResults.data().exposeValue( "timeStepMode", timeStepMode );
pythonCallbackResults.data().exposeValue( "twoFieldKernel", twoFieldKernelType );
pythonCallbackResults.data().exposeValue( "optimizations", optimizationDict );
pythonCallbackResults.data().exposeValue( "githash", core::buildinfo::gitSHA1() );
pythonCallbackResults.data().exposeValue( "compilerFlags", core::buildinfo::compilerFlags() );
pythonCallbackResults.data().exposeValue( "buildMachine", core::buildinfo::buildMachine() );
// Call Python function to report results
pythonCallbackResults();
}
}
}
}
}
return 0;
}
import sympy as sp
import pystencils as ps
from lbmpy.creationfunctions import create_lb_update_rule, create_lb_collision_rule
from pystencils_walberla import CodeGeneration, generate_pack_info_from_kernel, generate_sweep,\
generate_mpidtype_info_from_kernel, generate_info_header
from lbmpy import LBMConfig, LBMOptimisation, LBStencil, Method, Stencil
from lbmpy.macroscopic_value_kernels import macroscopic_values_getter, macroscopic_values_setter
from lbmpy.fieldaccess import AAEvenTimeStepAccessor, AAOddTimeStepAccessor
omega = sp.symbols('omega')
omega_free = sp.Symbol('omega_free')
omega_fill = sp.symbols('omega_:10')
options_dict = {
'srt': {
'method': 'srt',
'stencil': 'D3Q19',
'relaxation_rate': omega,
'compressible': False,
},
'trt': {
'method': 'trt',
'stencil': 'D3Q19',
'compressible': False,
'relaxation_rate': omega,
},
'mrt': {
'method': 'mrt',
'stencil': 'D3Q19',
'relaxation_rates': [0.0, omega, 1.3, 1.4, omega, 1.2, 1.1, 1.15, 1.234, 1.4235, 1.242, 1.2567, 0.9, 0.7],
},
'mrt_full': {
'method': 'mrt',
'stencil': 'D3Q19',
'relaxation_rates': [omega_fill[0], omega, omega_fill[1], omega_fill[2],
omega_fill[3], omega_fill[4], omega_fill[5]],
},
'entropic': {
'method': 'mrt',
'stencil': 'D3Q19',
'compressible': True,
'relaxation_rates': [omega, omega, omega_free, omega_free, omega_free],
'entropic': True,
},
'entropic_kbc_n4': {
'method': 'trt_kbc_n4',
'stencil': 'D3Q27',
'compressible': True,
'relaxation_rates': [omega, omega_free],
'entropic': True,
},
'smagorinsky': {
'method': 'srt',
'stencil': 'D3Q19',
'smagorinsky': True,
'relaxation_rate': omega,
},
'cumulant': {
'method': 'cumulant',
'stencil': 'D3Q19',
'compressible': True,
'relaxation_rate': omega,
},
}
with CodeGeneration() as ctx:
common_options = {
'field_name': 'pdfs',
'temporary_field_name': 'pdfs_tmp',
}
opts = {
'two_field_cse_pdfs': False,
'two_field_cse_global': False,
'two_field_split': True,
'two_field_nt_stores': True,
'aa_even_cse_pdfs': False,
'aa_even_cse_global': False,
'aa_even_split': False,
'aa_even_nt_stores': False,
'aa_odd_cse_pdfs': False,
'aa_odd_cse_global': False,
'aa_odd_split': True,
'aa_odd_nt_stores': False,
'compiled_in_boundaries': False,
}
config_name = ctx.config
noopt = False
d3q27 = False
if config_name.endswith('_d3q27'):
d3q27 = True
config_name = config_name[:-len('_d3q27')]
if config_name == '':
config_name = 'trt'
options = options_dict[config_name].copy()
options.update(common_options)
if d3q27:
stencil = LBStencil(Stencil.D3Q27)
options['stencil'] = stencil
else:
stencil = LBStencil(options['stencil'])
dtype_string = 'float64' if ctx.double_accuracy else 'float32'
pdfs, velocity_field = ps.fields(f'pdfs({stencil.Q}), velocity(3) : {dtype_string}[3D]', layout='fzyx')
lbm_config = LBMConfig(**options)
lbm_optimisation = LBMOptimisation(symbolic_field=pdfs, split=opts['two_field_split'],
cse_global=opts['two_field_cse_global'], cse_pdfs=opts['two_field_cse_pdfs'])
update_rule_two_field = create_lb_update_rule(lbm_config=lbm_config, lbm_optimisation=lbm_optimisation)
if opts['compiled_in_boundaries']:
from lbmpy.boundaries import NoSlip, UBB
from lbmpy.boundaries.boundaries_in_kernel import update_rule_with_push_boundaries
from collections import OrderedDict
boundaries = OrderedDict((
((1, 0, 0), NoSlip()),
((-1, 0, 0), NoSlip()),
((0, 1, 0), NoSlip()),
((0, -1, 0), NoSlip()),
((0, 0, 1), UBB([0.05, 0, 0])),
((0, 0, -1), NoSlip()),
))
cr_even = create_lb_collision_rule(lbm_config=LBMConfig(stencil=LBStencil(Stencil.D3Q19), compressible=False),
lbm_optimisation=LBMOptimisation(cse_global=opts['aa_even_cse_global'],
cse_pdfs=opts['aa_even_cse_pdfs']))
cr_odd = create_lb_collision_rule(lbm_config=LBMConfig(stencil=LBStencil(Stencil.D3Q19), compressible=False),
lbm_optimisation=LBMOptimisation(cse_global=opts['aa_odd_cse_global'],
cse_pdfs=opts['aa_odd_cse_pdfs']))
update_rule_aa_even = update_rule_with_push_boundaries(cr_even, pdfs, boundaries,
AAEvenTimeStepAccessor, AAOddTimeStepAccessor.read)
update_rule_aa_odd = update_rule_with_push_boundaries(cr_odd, pdfs, boundaries,
AAOddTimeStepAccessor, AAEvenTimeStepAccessor.read)
else:
lbm_opt_even = LBMOptimisation(symbolic_field=pdfs, split=opts['aa_even_split'],
cse_global=opts['aa_even_cse_global'], cse_pdfs=opts['aa_even_cse_pdfs'])
update_rule_aa_even = create_lb_update_rule(lbm_config=LBMConfig(**options,
kernel_type=AAEvenTimeStepAccessor()),
lbm_optimisation=lbm_opt_even)
lbm_opt_odd = LBMOptimisation(symbolic_field=pdfs, split=opts['aa_odd_split'],
cse_global=opts['aa_odd_cse_global'], cse_pdfs=opts['aa_odd_cse_pdfs'])
update_rule_aa_odd = create_lb_update_rule(lbm_config=LBMConfig(**options,
kernel_type=AAOddTimeStepAccessor()),
lbm_optimisation=lbm_opt_odd)
vec = {'assume_aligned': True, 'assume_inner_stride_one': True}
# check if OpenMP is enabled in CMake
openmp_enabled = bool(ctx.openmp)
# Sweeps
vec['nontemporal'] = opts['two_field_nt_stores']
generate_sweep(ctx, 'GenLbKernel', update_rule_two_field, field_swaps=[('pdfs', 'pdfs_tmp')],
cpu_vectorize_info=vec)
vec['nontemporal'] = opts['aa_even_nt_stores']
generate_sweep(ctx, 'GenLbKernelAAEven', update_rule_aa_even, cpu_vectorize_info=vec,
cpu_openmp=openmp_enabled, ghost_layers=1)
vec['nontemporal'] = opts['aa_odd_nt_stores']
generate_sweep(ctx, 'GenLbKernelAAOdd', update_rule_aa_odd, cpu_vectorize_info=vec,
cpu_openmp=openmp_enabled, ghost_layers=1)
setter_assignments = macroscopic_values_setter(update_rule_two_field.method, velocity=velocity_field.center_vector,
pdfs=pdfs.center_vector, density=1.0)
getter_assignments = macroscopic_values_getter(update_rule_two_field.method, velocity=velocity_field.center_vector,
pdfs=pdfs.center_vector, density=None)
generate_sweep(ctx, 'GenMacroSetter', setter_assignments, cpu_openmp=openmp_enabled)
generate_sweep(ctx, 'GenMacroGetter', getter_assignments, cpu_openmp=openmp_enabled)
# Communication
generate_pack_info_from_kernel(ctx, 'GenPackInfo', update_rule_two_field,
cpu_vectorize_info={'instruction_set': None}, cpu_openmp=False)
generate_pack_info_from_kernel(ctx, 'GenPackInfoAAPull', update_rule_aa_odd, kind='pull',
cpu_vectorize_info={'instruction_set': None}, cpu_openmp=False)
generate_pack_info_from_kernel(ctx, 'GenPackInfoAAPush', update_rule_aa_odd, kind='push',
cpu_vectorize_info={'instruction_set': None}, cpu_openmp=False)
generate_mpidtype_info_from_kernel(ctx, 'GenMpiDtypeInfo', update_rule_two_field)
generate_mpidtype_info_from_kernel(ctx, 'GenMpiDtypeInfoAAPull', update_rule_aa_odd, kind='pull')
generate_mpidtype_info_from_kernel(ctx, 'GenMpiDtypeInfoAAPush', update_rule_aa_odd, kind='push')
additional_code = f"""
const char * infoStencil = "{stencil.name}";
const char * infoConfigName = "{ctx.config}";
const char * optimizationDict = "{str(opts)}";
"""
stencil_typedefs = {'Stencil_T': stencil,
'CommunicationStencil_T': stencil}
field_typedefs = {'PdfField_T': pdfs,
'VelocityField_T': velocity_field}
generate_info_header(ctx, "GenDefines.h", stencil_typedefs=stencil_typedefs, field_typedefs=field_typedefs,
additional_code=additional_code)
import math
import os
import operator
import waLBerla as wlb
from waLBerla.tools.sqlitedb import sequenceValuesToScalars, checkAndUpdateSchema, storeSingle
from waLBerla.tools.config import block_decomposition
from functools import reduce
import sqlite3
def prod(seq):
return reduce(operator.mul, seq, 1)
def get_block_decomposition(block_decomposition, num_processes):
bx = by = bz = 1
blocks_per_axis = int(math.log(num_processes, 2))
for i in range(blocks_per_axis):
decomposition_axis = block_decomposition[i % len(block_decomposition)]
if decomposition_axis == 'y':
by *= 2
elif decomposition_axis == 'z':
bz *= 2
elif decomposition_axis == 'x':
bx *= 2
assert (bx * by * bz) == num_processes
return bx, by, bz
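# Example (sketch): get_block_decomposition('xyz', 4) doubles along x and y and returns
# (2, 2, 1); get_block_decomposition('z', 4) doubles along z twice and returns (1, 1, 4).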
def calculate_time_steps(runtime, expected_mlups, domain_size):
cells = prod(domain_size)
time_steps_per_second = expected_mlups * 1e6 / cells
return int(time_steps_per_second * runtime)
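# Example (sketch): with an expected 100 MLUPS on a 128^3 domain,
# calculate_time_steps(10, 100, (128, 128, 128)) gives int(100e6 / 128**3 * 10) ≈ 476 steps.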
def domain_decomposition_func_z(processes, threads, block_size):
return {
'blocks': (1, 1, processes),
'cellsPerBlock': (block_size[0], block_size[1], block_size[2] * threads)
}
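# Example (sketch): 4 processes with 8 OpenMP threads each and block_size (256, 128, 128)
# yield blocks (1, 1, 4) and cellsPerBlock (256, 128, 1024).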
def domain_decomposition_func_full(processes, threads, block_size):
return {
'blocks': block_decomposition(processes),
'cellsPerBlock': (block_size[0], block_size[1], block_size[2] * threads)
}
class BenchmarkScenario:
def __init__(self, block_size=(256, 128, 128), direct_comm=True,
time_step_mode='aa', two_field_kernel_type='generated',
domain_decomposition_func=domain_decomposition_func_z,
db_file_name='uniform_grid_gen.sqlite'):
self.block_size = block_size
self.direct_comm = direct_comm
self.time_step_mode = time_step_mode
self.two_field_kernel_type = two_field_kernel_type
self.domain_decomposition_func = domain_decomposition_func
self.threads = int(os.environ['OMP_NUM_THREADS'])
self.processes = wlb.mpi.numProcesses()
self.db_file_name = db_file_name
@wlb.member_callback
def config(self, **kwargs):
time_steps_for_128_cubed = 10
time_steps = int(128**3 / prod(self.block_size) * time_steps_for_128_cubed)
time_steps = max(10, time_steps)
cfg = {
'DomainSetup': {
'periodic': (1, 1, 1),
},
'Parameters': {
'timesteps': time_steps,
'warmupSteps': 2,
'outerIterations': 4,
'vtkWriteFrequency': 0,
'remainingTimeLoggerFrequency': 0,
'omega': 1.6,
'timeStepMode': self.time_step_mode,
'twoFieldKernelType': self.two_field_kernel_type,
'directComm': self.direct_comm,
}
}
cfg['DomainSetup'].update(self.domain_decomposition_func(self.processes, self.threads, self.block_size))
return cfg
@wlb.member_callback
def results_callback(self, mlupsPerProcess, optimizations, **kwargs):
cfg = self.config()
result = {
'block_size': self.block_size,
'mlups_per_core': mlupsPerProcess / self.threads,
'threads': self.threads,
'processes': self.processes,
'time_step_mode': self.time_step_mode,
'direct_comm': self.direct_comm,
'time_steps': cfg['Parameters']['timesteps'],
'I_MPI_PIN_CELL': os.environ.get('I_MPI_PIN_CELL', ''),
'I_MPI_PIN_DOMAIN': os.environ.get('I_MPI_PIN_DOMAIN', ''),
}
optimizations = eval(optimizations)
result.update(optimizations)
result.update(kwargs)
sequenceValuesToScalars(result)
num_tries = 4
for num_try in range(num_tries):
try:
checkAndUpdateSchema(result, "runs", self.db_file_name)
storeSingle(result, "runs", self.db_file_name)
break
except sqlite3.OperationalError as e:
wlb.log_warning(f"Sqlite DB writing failed: try {num_try + 1}/4 " + str(e))
def block_size_ok(sc):
block_size = sc.config()['DomainSetup']['cellsPerBlock']
return prod(block_size) * 19 < 2 ** 31
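# (Assumed rationale) 19 PDF values per cell must remain addressable with a signed 32-bit
# index, so the per-block cell count times 19 has to stay below 2^31.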
def single_node_benchmark():
scenarios = wlb.ScenarioManager()
for block_size in [(128, 128, 128), (128, 64, 64), (64, 64, 128), (64, 128, 64), (64, 64, 64),
(1024, 64, 32), (2048, 64, 16),
(64, 32, 32), (32, 32, 32), (16, 16, 16), (256, 128, 64), (512, 128, 32)]:
for direct_comm in (True, False):
for time_step_mode in ['aa', 'aaKernelOnly', 'twoField']:
if time_step_mode == 'twoField':
for kt in ['generated', 'manualGeneric', 'manualD3Q19']:
sc = BenchmarkScenario(block_size=block_size, direct_comm=direct_comm,
time_step_mode=time_step_mode, two_field_kernel_type=kt,
domain_decomposition_func=domain_decomposition_func_z
)
if not block_size_ok(sc):
continue
scenarios.add(sc)
else:
sc = BenchmarkScenario(block_size=block_size, direct_comm=direct_comm,
domain_decomposition_func=domain_decomposition_func_z,
time_step_mode=time_step_mode)
if not block_size_ok(sc):
continue
# scenarios.add(sc)
def single_node_benchmark_small():
scenarios = wlb.ScenarioManager()
for block_size in [(128, 128, 128), (128, 64, 64), (64, 64, 128), (64, 128, 64), (64, 64, 64),
(1024, 64, 32), (2048, 64, 16), (64, 32, 32), (32, 32, 32), (16, 16, 16),
(256, 128, 64), (512, 128, 32)]:
for direct_comm in (True, False):
for time_step_mode in ['aa', 'aaKernelOnly', 'twoField']:
sc = BenchmarkScenario(block_size=block_size, direct_comm=direct_comm, time_step_mode=time_step_mode)
if not block_size_ok(sc):
continue
scenarios.add(sc)
def weak_scaling():
scenarios = wlb.ScenarioManager()
for block_size in [(128, 128, 128), (128, 64, 64), (64, 64, 128), (64, 128, 64), (64, 64, 64),
(1024, 64, 32), (2048, 64, 16), (64, 32, 32), (32, 32, 32), (16, 16, 16),
(256, 128, 64), (512, 128, 32)]:
for direct_comm in (True, False):
sc = BenchmarkScenario(block_size=block_size, direct_comm=direct_comm,
domain_decomposition_func=domain_decomposition_func_full)
if not block_size_ok(sc):
continue
scenarios.add(sc)
single_node_benchmark()
add_library(
lodepng
EXCLUDE_FROM_ALL
lodepng.cpp
lodepng.h
)
target_include_directories(lodepng PUBLIC SYSTEM "${CMAKE_CURRENT_SOURCE_DIR}")
......@@ -10,4 +10,4 @@ if (MSVC)
target_compile_options( lodepng PRIVATE /w )
else ()
target_compile_options( lodepng PRIVATE -w )
endif ()
\ No newline at end of file
endif ()
......@@ -17,7 +17,7 @@ def generate_lb_pack_info(generation_context,
namespace='lbm',
target=Target.CPU,
data_type=None,
cpu_openmp=None,
cpu_openmp=False,
**create_kernel_params):
"""Generates waLBerla MPI PackInfos for an LBM kernel, based on a given method
and streaming pattern. For in-place streaming patterns, two PackInfos are generated;
......
......@@ -160,7 +160,7 @@ def generate_selective_sweep(generation_context, class_name, selection_tree, int
def generate_pack_info_for_field(generation_context, class_name: str, field: Field,
direction_subset: Optional[Tuple[Tuple[int, int, int]]] = None,
operator=None, gl_to_inner=False,
target=Target.CPU, data_type=None, cpu_openmp=None,
target=Target.CPU, data_type=None, cpu_openmp=False,
**create_kernel_params):
"""Creates a pack info for a pystencils field assuming a pull-type stencil, packing all cell elements.
......@@ -188,7 +188,7 @@ def generate_pack_info_for_field(generation_context, class_name: str, field: Fie
def generate_pack_info_from_kernel(generation_context, class_name: str, assignments: Sequence[Assignment],
kind='pull', operator=None, target=Target.CPU, data_type=None, cpu_openmp=None,
kind='pull', operator=None, target=Target.CPU, data_type=None, cpu_openmp=False,
**create_kernel_params):
"""Generates a waLBerla GPU PackInfo from a (pull) kernel.
......@@ -241,7 +241,7 @@ def generate_pack_info_from_kernel(generation_context, class_name: str, assignme
def generate_pack_info(generation_context, class_name: str,
directions_to_pack_terms: Dict[Tuple[Tuple], Sequence[Field.Access]],
namespace='pystencils', operator=None, gl_to_inner=False,
target=Target.CPU, data_type=None, cpu_openmp=None,
target=Target.CPU, data_type=None, cpu_openmp=False,
**create_kernel_params):
"""Generates a waLBerla GPU PackInfo
......@@ -258,6 +258,9 @@ def generate_pack_info(generation_context, class_name: str,
cpu_openmp: whether the loops should use OpenMP.
**create_kernel_params: remaining keyword arguments are passed to `pystencils.create_kernel`
"""
if cpu_openmp:
raise ValueError("The packing kernels are already called inside an OpenMP parallel region. Thus "
"additionally parallelising each kernel is not supported.")
items = [(e[0], sorted(e[1], key=lambda x: str(x))) for e in directions_to_pack_terms.items()]
items = sorted(items, key=lambda e: e[0])
directions_to_pack_terms = OrderedDict(items)
......@@ -286,7 +289,7 @@ def generate_pack_info(generation_context, class_name: str,
if len(data_types) == 0:
raise ValueError("No fields to pack!")
if len(data_types) != 1:
err_detail = "\n".join(" - {} [{}]".format(f.name, f.dtype) for f in fields_accessed)
err_detail = "\n".join(f" - {f.name} [{f.dtype}]" for f in fields_accessed)
raise NotImplementedError("Fields of different data types are used - this is not supported.\n" + err_detail)
dtype = data_types.pop()
......
......@@ -44,7 +44,6 @@ public:
unpackData( receiver, stencil::inverseDir[dir], rBuffer );
}
private:
void packDataImpl(const IBlock * sender, stencil::Direction dir, mpi::SendBuffer & outBuffer) const {
const auto dataSize = size(dir, sender);
pack(dir, outBuffer.forward(dataSize), const_cast<IBlock*>(sender));
......@@ -54,6 +53,7 @@ private:
void unpack(stencil::Direction dir, unsigned char * buffer, IBlock * block) const;
uint_t size (stencil::Direction dir, const IBlock * block) const;
private:
{{fused_kernel|generate_members(parameters_to_ignore=['buffer'])|indent(4)}}
};
......
......@@ -102,7 +102,7 @@ void {{class_name}}::outer( {{- ["IBlock * block", kernel.kernel_selection_param
{
{{kernel|generate_block_data_to_field_extraction|indent(4)}}
if( layers_.size() == 0 )
if( layers_.empty() )
{
CellInterval ci;
......
......@@ -28,8 +28,12 @@ class PackinfoGenTest(unittest.TestCase):
for file_name_to_test in ('PI1.cpp', 'PI2.cpp'):
file_to_test = ctx.files[file_name_to_test]
# For packing kernels it is better not to have OpenMP in the code because the packing kernels
# themselves are launched inside an OpenMP parallel region. Still, it could be forced by setting
# openmp to True in generate_pack_info_from_kernel
if openmp:
assert '#pragma omp parallel' in file_to_test
assert '#pragma omp parallel' not in file_to_test
if da:
assert 'float ' not in file_to_test
......
......@@ -82,5 +82,3 @@ class CombinedInPlaceGpuPackInfo : public cuda::GeneratedGPUPackInfo
} // namespace lbm
} // namespace walberla
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file CombinedInPlaceCpuPackInfo.h
//! \author Markus Holzer <markus.holzer@fau.de>
//
//======================================================================================================================
#pragma once
#define IS_EVEN(x) (((x) & 1) ^ 1)
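// IS_EVEN(0) == 1 and IS_EVEN(1) == 0, so even timestep counters select the even pack info below.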
#include "communication/UniformPackInfo.h"
#include "lbm/inplace_streaming/TimestepTracker.h"
namespace walberla
{
namespace lbm
{
template< typename EvenPackInfo, typename OddPackInfo >
class CombinedInPlaceCpuPackInfo : public ::walberla::communication::UniformPackInfo
{
public:
template< typename... Args >
CombinedInPlaceCpuPackInfo(std::shared_ptr< lbm::TimestepTracker >& tracker, Args&&... args)
: tracker_(tracker), evenPackInfo_(std::forward< Args >(args)...), oddPackInfo_(std::forward< Args >(args)...)
{}
~CombinedInPlaceCpuPackInfo() override = default;
bool constantDataExchange() const override { return true; }
bool threadsafeReceiving() const override { return true; }
void unpackData(IBlock* receiver, stencil::Direction dir, mpi::RecvBuffer& buffer) override
{
if (IS_EVEN(tracker_->getCounter()))
{
return evenPackInfo_.unpackData(receiver, dir, buffer);
}
else
{
return oddPackInfo_.unpackData(receiver, dir, buffer);
}
}
void communicateLocal(const IBlock* sender, IBlock* receiver, stencil::Direction dir) override
{
if (IS_EVEN(tracker_->getCounter()))
{
return evenPackInfo_.communicateLocal(sender, receiver, dir);
}
else
{
return oddPackInfo_.communicateLocal(sender, receiver, dir);
}
}
void packDataImpl(const IBlock* sender, stencil::Direction dir, mpi::SendBuffer& outBuffer) const override
{
if (IS_EVEN(tracker_->getCounter()))
{
return evenPackInfo_.packDataImpl(sender, dir, outBuffer);
}
else
{
return oddPackInfo_.packDataImpl(sender, dir, outBuffer);
}
}
void pack(stencil::Direction dir, unsigned char* buffer, IBlock* block) const
{
if (IS_EVEN(tracker_->getCounter()))
{
evenPackInfo_.pack(dir, buffer, block);
}
else
{
oddPackInfo_.pack(dir, buffer, block);
}
}
void unpack(stencil::Direction dir, unsigned char* buffer, IBlock* block) const
{
if (IS_EVEN(tracker_->getCounter()))
{
evenPackInfo_.unpack(dir, buffer, block);
}
else
{
oddPackInfo_.unpack(dir, buffer, block);
}
}
uint_t size(stencil::Direction dir, IBlock* block) const
{
if (IS_EVEN(tracker_->getCounter()))
{
return evenPackInfo_.size(dir, block);
}
else
{
return oddPackInfo_.size(dir, block);
}
}
private:
const std::shared_ptr< lbm::TimestepTracker >& tracker_;
EvenPackInfo evenPackInfo_;
OddPackInfo oddPackInfo_;
};
} // namespace lbm
} // namespace walberla