diff --git a/apps/benchmarks/CMakeLists.txt b/apps/benchmarks/CMakeLists.txt
index f9c1d6e7a8feeb0d4bfbb6a1e55ab2946c2b8cee..41519581d10865c5a1f6c405178ab79725ba7776 100644
--- a/apps/benchmarks/CMakeLists.txt
+++ b/apps/benchmarks/CMakeLists.txt
@@ -21,6 +21,7 @@ if ( WALBERLA_BUILD_WITH_PYTHON )
    add_subdirectory( FieldCommunication )
 
    if ( WALBERLA_BUILD_WITH_CODEGEN )
+      add_subdirectory( FlowAroundSphereCodeGen )
       add_subdirectory( UniformGridGenerated )
       add_subdirectory( PhaseFieldAllenCahn )
    endif()
diff --git a/apps/benchmarks/FlowAroundSphereCodeGen/CMakeLists.txt b/apps/benchmarks/FlowAroundSphereCodeGen/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6f7a128f33735ef9365c6daca215de07f7a09ac0
--- /dev/null
+++ b/apps/benchmarks/FlowAroundSphereCodeGen/CMakeLists.txt
@@ -0,0 +1,31 @@
+waLBerla_link_files_to_builddir( "*.py" )
+
+if (WALBERLA_BUILD_WITH_CUDA)
+    waLBerla_generate_target_from_python(NAME FlowAroundSphereGenerated
+            FILE FlowAroundSphereCodeGen.py
+            OUT_FILES FlowAroundSphereCodeGen_LbSweep.cu FlowAroundSphereCodeGen_LbSweep.h
+            FlowAroundSphereCodeGen_MacroSetter.cu FlowAroundSphereCodeGen_MacroSetter.h
+            FlowAroundSphereCodeGen_UBB.cu FlowAroundSphereCodeGen_UBB.h
+            FlowAroundSphereCodeGen_NoSlip.cu FlowAroundSphereCodeGen_NoSlip.h
+            FlowAroundSphereCodeGen_Outflow.cu FlowAroundSphereCodeGen_Outflow.h
+            FlowAroundSphereCodeGen_PackInfoEven.cu FlowAroundSphereCodeGen_PackInfoEven.h
+            FlowAroundSphereCodeGen_PackInfoOdd.cu FlowAroundSphereCodeGen_PackInfoOdd.h
+            FlowAroundSphereCodeGen_InfoHeader.h)
+    waLBerla_add_executable( NAME FlowAroundSphereCodeGen FILE FlowAroundSphereCodeGen.cpp
+            DEPENDS blockforest boundary core domain_decomposition field geometry timeloop vtk FlowAroundSphereGenerated)
+    set_target_properties( FlowAroundSphereCodeGen PROPERTIES CXX_VISIBILITY_PRESET hidden)
+else ()
+    waLBerla_generate_target_from_python(NAME FlowAroundSphereGenerated
+            FILE FlowAroundSphereCodeGen.py
+            OUT_FILES FlowAroundSphereCodeGen_LbSweep.cpp FlowAroundSphereCodeGen_LbSweep.h
+            FlowAroundSphereCodeGen_MacroSetter.cpp FlowAroundSphereCodeGen_MacroSetter.h
+            FlowAroundSphereCodeGen_UBB.cpp FlowAroundSphereCodeGen_UBB.h
+            FlowAroundSphereCodeGen_NoSlip.cpp FlowAroundSphereCodeGen_NoSlip.h
+            FlowAroundSphereCodeGen_Outflow.cpp FlowAroundSphereCodeGen_Outflow.h
+            FlowAroundSphereCodeGen_PackInfoEven.cpp FlowAroundSphereCodeGen_PackInfoEven.h
+            FlowAroundSphereCodeGen_PackInfoOdd.cpp FlowAroundSphereCodeGen_PackInfoOdd.h
+            FlowAroundSphereCodeGen_InfoHeader.h)
+    waLBerla_add_executable( NAME FlowAroundSphereCodeGen FILE FlowAroundSphereCodeGen.cpp
+            DEPENDS blockforest boundary core domain_decomposition field geometry timeloop vtk FlowAroundSphereGenerated)
+    set_target_properties( FlowAroundSphereCodeGen PROPERTIES CXX_VISIBILITY_PRESET hidden)
+endif()
\ No newline at end of file
diff --git a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..68aab5882901317263c04045b69ac0a6de403adc
--- /dev/null
+++ b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp
@@ -0,0 +1,307 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file FlowAroundSphereCodeGen.cpp
+//! \author Frederik Hennig <frederik.hennig@fau.de>
+//! \author Markus Holzer <markus.holzer@fau.de>
+//
+//======================================================================================================================
+#include "blockforest/all.h"
+
+#include "core/all.h"
+
+#include "domain_decomposition/all.h"
+
+#include "field/all.h"
+
+#include "geometry/all.h"
+
+#include "lbm/inplace_streaming/TimestepTracker.h"
+#include "lbm/vtk/QCriterion.h"
+
+#include "python_coupling/CreateConfig.h"
+#include "python_coupling/PythonCallback.h"
+
+#include "timeloop/all.h"
+
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+#   include "cuda/AddGPUFieldToStorage.h"
+#   include "cuda/DeviceSelectMPI.h"
+#   include "cuda/HostFieldAllocator.h"
+#   include "cuda/NVTX.h"
+#   include "cuda/ParallelStreams.h"
+#   include "cuda/communication/GPUPackInfo.h"
+#   include "cuda/communication/UniformGPUScheme.h"
+#endif
+
+// CodeGen includes
+#include "FlowAroundSphereCodeGen_InfoHeader.h"
+
+namespace walberla
+{
+typedef lbm::FlowAroundSphereCodeGen_PackInfoEven PackInfoEven_T;
+typedef lbm::FlowAroundSphereCodeGen_PackInfoOdd PackInfoOdd_T;
+
+typedef walberla::uint8_t flag_t;
+typedef FlagField< flag_t > FlagField_T;
+
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+typedef cuda::GPUField< real_t > GPUField;
+#endif
+
+using namespace std::placeholders;
+
+auto pdfFieldAdder = [](IBlock* const block, StructuredBlockStorage* const storage) {
+   return new PdfField_T(storage->getNumberOfXCells(*block), storage->getNumberOfYCells(*block),
+                         storage->getNumberOfZCells(*block), uint_t(1), field::fzyx,
+                         make_shared< field::AllocateAligned< real_t, 64 > >());
+};
+
+auto VelocityCallback = [](const Cell& pos, const shared_ptr< StructuredBlockForest >& SbF, IBlock& block,
+                           real_t inflow_velocity) {
+   Cell globalCell;
+   CellInterval domain = SbF->getDomainCellBB();
+   real_t h_y          = domain.yMax() - domain.yMin();
+   real_t h_z          = domain.zMax() - domain.zMin();
+   SbF->transformBlockLocalToGlobalCell(globalCell, block, pos);
+
+   real_t y1 = globalCell[1] - (h_y / 2.0 + 0.5);
+   real_t z1 = globalCell[2] - (h_z / 2.0 + 0.5);
+
+   real_t u = (inflow_velocity * 16) / (h_y * h_y * h_z * h_z) * (h_y / 2.0 - y1) * (h_y / 2 + y1) * (h_z / 2 - z1) *
+              (h_z / 2 + z1);
+
+   Vector3< real_t > result(u, 0.0, 0.0);
+   return result;
+};
+
+class AlternatingBeforeFunction
+{
+ public:
+   typedef std::function< void() > BeforeFunction;
+
+   AlternatingBeforeFunction(BeforeFunction evenFunc, BeforeFunction oddFunc,
+                             std::shared_ptr< lbm::TimestepTracker >& tracker)
+      : tracker_(tracker), funcs_{ evenFunc, oddFunc } {};
+
+   void operator()() { funcs_[tracker_->getCounter()](); }
+
+ private:
+   std::shared_ptr< lbm::TimestepTracker > tracker_;
+   std::vector< BeforeFunction > funcs_;
+};
+
+class Filter
+{
+ public:
+   explicit Filter(Vector3< uint_t > numberOfCells) : numberOfCells_(numberOfCells) {}
+
+   void operator()(const IBlock& /*block*/) {}
+
+   bool operator()(const cell_idx_t x, const cell_idx_t y, const cell_idx_t z) const
+   {
+      return x >= -1 && x <= cell_idx_t(numberOfCells_[0]) && y >= -1 && y <= cell_idx_t(numberOfCells_[1]) &&
+             z >= -1 && z <= cell_idx_t(numberOfCells_[2]);
+   }
+
+ private:
+   Vector3< uint_t > numberOfCells_;
+};
+
+using FluidFilter_T = Filter;
+
+int main(int argc, char** argv)
+{
+   walberla::Environment walberlaEnv(argc, argv);
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+   cuda::selectDeviceBasedOnMpiRank();
+#endif
+
+   for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg)
+   {
+      WALBERLA_MPI_WORLD_BARRIER()
+
+      auto config = *cfg;
+      logging::configureLogging(config);
+      auto blocks = blockforest::createUniformBlockGridFromConfig(config);
+
+      // read parameters
+      Vector3< uint_t > cellsPerBlock =
+         config->getBlock("DomainSetup").getParameter< Vector3< uint_t > >("cellsPerBlock");
+      auto parameters = config->getOneBlock("Parameters");
+
+      const uint_t timesteps       = parameters.getParameter< uint_t >("timesteps", uint_c(10));
+      const real_t omega           = parameters.getParameter< real_t >("omega", real_t(1.9));
+      const real_t u_max           = parameters.getParameter< real_t >("u_max", real_t(0.05));
+      const real_t reynolds_number = parameters.getParameter< real_t >("reynolds_number", real_t(1000));
+      const uint_t diameter_sphere = parameters.getParameter< uint_t >("diameter_sphere", uint_t(5));
+
+      const double remainingTimeLoggerFrequency =
+         parameters.getParameter< double >("remainingTimeLoggerFrequency", 3.0); // in seconds
+
+      // create fields
+      BlockDataID pdfFieldID     = blocks->addStructuredBlockData< PdfField_T >(pdfFieldAdder, "PDFs");
+      BlockDataID velFieldID     = field::addToStorage< VelocityField_T >(blocks, "velocity", real_t(0), field::fzyx);
+      BlockDataID densityFieldID = field::addToStorage< ScalarField_T >(blocks, "density", real_t(0), field::fzyx);
+
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+      BlockDataID pdfFieldIDGPU = cuda::addGPUFieldToStorage< PdfField_T >(blocks, pdfFieldID, "PDFs on GPU", true);
+      BlockDataID velFieldIDGPU =
+         cuda::addGPUFieldToStorage< VelocityField_T >(blocks, velFieldID, "velocity on GPU", true);
+      BlockDataID densityFieldIDGPU =
+         cuda::addGPUFieldToStorage< ScalarField_T >(blocks, densityFieldID, "density on GPU", true);
+#endif
+
+      BlockDataID flagFieldId = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field");
+
+      // initialise all PDFs
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+      pystencils::FlowAroundSphereCodeGen_MacroSetter setterSweep(pdfFieldIDGPU, velFieldIDGPU);
+      for (auto& block : *blocks)
+         setterSweep(&block);
+      cuda::fieldCpy< PdfField_T, GPUField >(blocks, pdfFieldID, pdfFieldIDGPU);
+#else
+      pystencils::FlowAroundSphereCodeGen_MacroSetter setterSweep(pdfFieldID, velFieldID);
+      for (auto& block : *blocks)
+         setterSweep(&block);
+#endif
+      // Create communication
+
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+      // This way of using alternating pack infos is temporary and will soon be replaced
+      // by something more straight-forward
+
+      cuda::communication::UniformGPUScheme< Stencil_T > comEven(blocks, false);
+      comEven.addPackInfo(make_shared< PackInfoEven_T >(pdfFieldIDGPU));
+      auto evenComm = std::function< void() >([&]() { comEven.communicate(nullptr); });
+
+      cuda::communication::UniformGPUScheme< Stencil_T > comODD(blocks, false);
+      comODD.addPackInfo(make_shared< PackInfoOdd_T >(pdfFieldIDGPU));
+      auto oddComm = std::function< void() >([&]() { comODD.communicate(nullptr); });
+#else
+      blockforest::communication::UniformBufferedScheme< Stencil_T > evenComm(blocks);
+      evenComm.addPackInfo(make_shared< PackInfoEven_T >(pdfFieldID));
+
+      blockforest::communication::UniformBufferedScheme< Stencil_T > oddComm(blocks);
+      oddComm.addPackInfo(make_shared< PackInfoOdd_T >(pdfFieldID));
+#endif
+
+      // create and initialize boundary handling
+      const FlagUID fluidFlagUID("Fluid");
+
+      auto boundariesConfig = config->getOneBlock("Boundaries");
+
+      std::function< Vector3< real_t >(const Cell&, const shared_ptr< StructuredBlockForest >&, IBlock&) >
+         velocity_initialisation = std::bind(VelocityCallback, _1, _2, _3, u_max);
+
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+      lbm::FlowAroundSphereCodeGen_UBB ubb(blocks, pdfFieldIDGPU, velocity_initialisation);
+      lbm::FlowAroundSphereCodeGen_NoSlip noSlip(blocks, pdfFieldIDGPU);
+      lbm::FlowAroundSphereCodeGen_Outflow outflow(blocks, pdfFieldIDGPU, pdfFieldID);
+
+      lbm::FlowAroundSphereCodeGen_LbSweep lbSweep(densityFieldIDGPU, pdfFieldIDGPU, velFieldIDGPU, omega);
+#else
+      lbm::FlowAroundSphereCodeGen_UBB ubb(blocks, pdfFieldID, velocity_initialisation);
+      lbm::FlowAroundSphereCodeGen_NoSlip noSlip(blocks, pdfFieldID);
+      lbm::FlowAroundSphereCodeGen_Outflow outflow(blocks, pdfFieldID);
+
+      lbm::FlowAroundSphereCodeGen_LbSweep lbSweep(densityFieldID, pdfFieldID, velFieldID, omega);
+#endif
+
+      geometry::initBoundaryHandling< FlagField_T >(*blocks, flagFieldId, boundariesConfig);
+      geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldId, fluidFlagUID);
+
+      ubb.fillFromFlagField< FlagField_T >(blocks, flagFieldId, FlagUID("UBB"), fluidFlagUID);
+      noSlip.fillFromFlagField< FlagField_T >(blocks, flagFieldId, FlagUID("NoSlip"), fluidFlagUID);
+      outflow.fillFromFlagField< FlagField_T >(blocks, flagFieldId, FlagUID("Outflow"), fluidFlagUID);
+
+      // create time loop
+      SweepTimeloop timeloop(blocks->getBlockStorage(), timesteps);
+
+      // Timestep Tracking and Sweeps
+      auto tracker = make_shared< lbm::TimestepTracker >(0);
+
+      AlternatingBeforeFunction communication(evenComm, oddComm, tracker);
+
+      // add LBM sweep and communication to time loop
+      timeloop.add() << BeforeFunction(communication, "communication")
+                     << Sweep(noSlip.getSweep(tracker), "noSlip boundary");
+      timeloop.add() << Sweep(outflow.getSweep(tracker), "outflow boundary");
+      timeloop.add() << Sweep(ubb.getSweep(tracker), "ubb boundary");
+      timeloop.add() << BeforeFunction(tracker->getAdvancementFunction(), "Timestep Advancement")
+                     << Sweep(lbSweep.getSweep(tracker), "LB update rule");
+
+      // LBM stability check
+      timeloop.addFuncAfterTimeStep(makeSharedFunctor(field::makeStabilityChecker< PdfField_T, FlagField_T >(
+                                       config, blocks, pdfFieldID, flagFieldId, fluidFlagUID)),
+                                    "LBM stability check");
+
+      // log remaining time
+      timeloop.addFuncAfterTimeStep(
+         timing::RemainingTimeLogger(timeloop.getNrOfTimeSteps(), remainingTimeLoggerFrequency),
+         "remaining time logger");
+
+      // add VTK output to time loop
+      uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0);
+      if (vtkWriteFrequency > 0)
+      {
+         auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
+                                                         "simulation_step", false, true, true, false, 0);
+
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+         vtkOutput->addBeforeFunction([&]() {
+            cuda::fieldCpy< VelocityField_T, GPUField >(blocks, velFieldID, velFieldIDGPU);
+            cuda::fieldCpy< ScalarField_T, GPUField >(blocks, densityFieldID, densityFieldIDGPU);
+         });
+#endif
+         auto velWriter     = make_shared< field::VTKWriter< VelocityField_T > >(velFieldID, "velocity");
+         auto densityWriter = make_shared< field::VTKWriter< ScalarField_T > >(densityFieldID, "density");
+         FluidFilter_T filter(cellsPerBlock);
+
+         auto QCriterionWriter = make_shared< lbm::QCriterionVTKWriter< VelocityField_T, FluidFilter_T > >(
+            blocks, filter, velFieldID, "Q-Criterion");
+
+         vtkOutput->addCellDataWriter(velWriter);
+         vtkOutput->addCellDataWriter(densityWriter);
+         vtkOutput->addCellDataWriter(QCriterionWriter);
+
+         timeloop.addFuncAfterTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
+      }
+
+      WcTimer simTimer;
+
+      WALBERLA_LOG_INFO_ON_ROOT("Simulating flow around sphere:"
+                                "\n timesteps:               "
+                                << timesteps << "\n reynolds number:         " << reynolds_number
+                                << "\n relaxation rate:         " << omega << "\n maximum inflow velocity: " << u_max
+                                << "\n diameter_sphere:         " << diameter_sphere)
+
+      simTimer.start();
+      timeloop.run();
+      simTimer.end();
+      WALBERLA_LOG_INFO_ON_ROOT("Simulation finished")
+      auto time            = simTimer.last();
+      auto nrOfCells       = real_c(cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2]);
+      auto mlupsPerProcess = nrOfCells * real_c(timesteps) / time * 1e-6;
+      WALBERLA_LOG_RESULT_ON_ROOT("MLUPS per process " << mlupsPerProcess)
+      WALBERLA_LOG_RESULT_ON_ROOT("Time per time step " << time / real_c(timesteps))
+   }
+
+   return EXIT_SUCCESS;
+}
+
+} // namespace walberla
+
+int main(int argc, char** argv) { walberla::main(argc, argv); }
diff --git a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py
new file mode 100644
index 0000000000000000000000000000000000000000..571f3129ef00a3b4a2a258059b25263170cc73fb
--- /dev/null
+++ b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py
@@ -0,0 +1,97 @@
+from pystencils.field import fields
+
+from lbmpy.advanced_streaming.utility import get_timesteps, Timestep
+from lbmpy.macroscopic_value_kernels import macroscopic_values_setter
+from lbmpy.stencils import get_stencil
+from lbmpy.creationfunctions import create_lb_collision_rule, create_lb_method, create_lb_update_rule
+from lbmpy.boundaries import NoSlip, UBB, ExtrapolationOutflow
+
+from pystencils_walberla import CodeGeneration, generate_sweep, generate_info_header
+
+from lbmpy_walberla.additional_data_handler import UBBAdditionalDataHandler, OutflowAdditionalDataHandler
+from lbmpy_walberla import generate_boundary, generate_lb_pack_info
+from lbmpy_walberla import generate_alternating_lbm_sweep, generate_alternating_lbm_boundary
+
+import sympy as sp
+
+stencil = get_stencil("D3Q27")
+q = len(stencil)
+dim = len(stencil[0])
+streaming_pattern = 'esotwist'
+timesteps = get_timesteps(streaming_pattern)
+
+pdfs, velocity_field, density_field = fields(f"pdfs({q}), velocity({dim}), density(1) : double[{dim}D]", layout='fzyx')
+omega = sp.Symbol("omega")
+u_max = sp.Symbol("u_max")
+
+output = {
+    'density': density_field,
+    'velocity': velocity_field
+}
+
+opt = {'symbolic_field': pdfs,
+       'cse_global': False,
+       'cse_pdfs': False}
+
+method_params = {'method': 'cumulant',
+                 'stencil': stencil,
+                 'relaxation_rate': omega,
+                 'galilean_correction': True,
+                 'field_name': 'pdfs',
+                 'streaming_pattern': streaming_pattern,
+                 'output': output,
+                 'optimization': opt}
+
+collision_rule = create_lb_collision_rule(**method_params)
+lb_method = collision_rule.method
+
+# getter & setter
+setter_assignments = macroscopic_values_setter(lb_method, velocity=velocity_field.center_vector,
+                                               pdfs=pdfs, density=1,
+                                               streaming_pattern=streaming_pattern,
+                                               previous_timestep=timesteps[0])
+
+# opt = {'instruction_set': 'sse', 'assume_aligned': True, 'nontemporal': False, 'assume_inner_stride_one': True}
+
+stencil_typedefs = {'Stencil_T': stencil}
+field_typedefs = {'PdfField_T': pdfs,
+                  'VelocityField_T': velocity_field,
+                  'ScalarField_T': density_field}
+
+with CodeGeneration() as ctx:
+    if ctx.cuda:
+        target = 'gpu'
+    else:
+        target = 'cpu'
+
+    opt['target'] = target
+
+    # sweeps
+    generate_alternating_lbm_sweep(ctx, 'FlowAroundSphereCodeGen_LbSweep',
+                                   collision_rule, streaming_pattern, optimization=opt)
+    generate_sweep(ctx, 'FlowAroundSphereCodeGen_MacroSetter', setter_assignments, target=target)
+
+    # boundaries
+    ubb = UBB(lambda *args: None, dim=dim)
+    ubb_data_handler = UBBAdditionalDataHandler(stencil, ubb)
+    outflow = ExtrapolationOutflow(stencil[4], lb_method, streaming_pattern=streaming_pattern)
+    outflow_data_handler = OutflowAdditionalDataHandler(stencil, outflow, target=target)
+
+    generate_alternating_lbm_boundary(ctx, 'FlowAroundSphereCodeGen_UBB', ubb, lb_method,
+                                      target=target, streaming_pattern=streaming_pattern,
+                                      additional_data_handler=ubb_data_handler)
+
+    generate_alternating_lbm_boundary(ctx, 'FlowAroundSphereCodeGen_NoSlip', NoSlip(), lb_method,
+                                      target=target, streaming_pattern=streaming_pattern)
+
+    generate_alternating_lbm_boundary(ctx, 'FlowAroundSphereCodeGen_Outflow', outflow, lb_method,
+                                      target=target, streaming_pattern=streaming_pattern,
+                                      additional_data_handler=outflow_data_handler)
+
+    # communication
+    generate_lb_pack_info(ctx, 'FlowAroundSphereCodeGen_PackInfo', stencil, pdfs,
+                          streaming_pattern=streaming_pattern, always_generate_separate_classes=True, target=target)
+
+    # Info header containing correct template definitions for stencil and field
+    generate_info_header(ctx, 'FlowAroundSphereCodeGen_InfoHeader',
+                         stencil_typedefs=stencil_typedefs, field_typedefs=field_typedefs)
diff --git a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGenParameters.py b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGenParameters.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d046735fd3867cb83670e13090d43f0e52166d4
--- /dev/null
+++ b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGenParameters.py
@@ -0,0 +1,63 @@
+import waLBerla as wlb
+from lbmpy.relaxationrates import relaxation_rate_from_lattice_viscosity
+
+
+class Scenario:
+    def __init__(self):
+        self.timesteps = 101
+        self.vtkWriteFrequency = 2000
+
+        self.cells = (384, 128, 128)
+        self.blocks = (1, 1, 1)
+        self.periodic = (0, 0, 0)
+
+        self.diameter_sphere = min(self.cells) // 2
+        self.u_max = 0.1
+        self.reynolds_number = 1000000
+
+        kinematic_viscosity = (self.diameter_sphere * self.u_max) / self.reynolds_number
+
+        self.omega = relaxation_rate_from_lattice_viscosity(kinematic_viscosity)
+
+        self.total_cells = (self.cells[0] * self.blocks[0],
+                            self.cells[1] * self.blocks[1],
+                            self.cells[2] * self.blocks[2])
+
+    @wlb.member_callback
+    def config(self):
+        return {
+            'DomainSetup': {
+                'blocks': self.blocks,
+                'cellsPerBlock': self.cells,
+                'periodic': self.periodic,
+                'oneBlockPerProcess': True
+            },
+            'Parameters': {
+                'timesteps': self.timesteps,
+                'vtkWriteFrequency': self.vtkWriteFrequency,
+                'omega': self.omega,
+                'u_max': self.u_max,
+                'reynolds_number': self.reynolds_number,
+                'diameter_sphere': self.diameter_sphere
+            },
+            'Boundaries': {
+                'Border': [
+                    {'direction': 'N', 'walldistance': -1, 'flag': 'NoSlip'},
+                    {'direction': 'S', 'walldistance': -1, 'flag': 'NoSlip'},
+                    {'direction': 'W', 'walldistance': -1, 'flag': 'UBB'},
+                    {'direction': 'E', 'walldistance': -1, 'flag': 'Outflow'},
+                    {'direction': 'T', 'walldistance': -1, 'flag': 'NoSlip'},
+                    {'direction': 'B', 'walldistance': -1, 'flag': 'NoSlip'},
+                ],
+                'Body': [
+                    {'shape': "sphere",
+                     'midpoint': (int(0.40 * self.total_cells[0]), self.total_cells[1] // 2, self.total_cells[2] // 2),
+                     'radius': self.diameter_sphere // 2,
+                     'flag': 'NoSlip'}
+                ]
+            },
+        }
+
+
+scenarios = wlb.ScenarioManager()
+scenarios.add(Scenario())
diff --git a/python/lbmpy_walberla/__init__.py b/python/lbmpy_walberla/__init__.py
index 1a7b136a7935d13be6c47d8a101e9efc4c631f56..15de37c8112e16a21476ba3e388adc843af92956 100644
--- a/python/lbmpy_walberla/__init__.py
+++ b/python/lbmpy_walberla/__init__.py
@@ -1,5 +1,8 @@
-from .boundary import generate_boundary
+from .boundary import generate_boundary, generate_alternating_lbm_boundary
 from .walberla_lbm_generation import RefinementScaling, generate_lattice_model
 from .packinfo import generate_lb_pack_info
+from .alternating_sweeps import generate_alternating_lbm_sweep
 
-__all__ = ['generate_lattice_model', 'RefinementScaling', 'generate_boundary', 'generate_lb_pack_info']
+__all__ = ['generate_lattice_model', 'generate_alternating_lbm_sweep',
+           'RefinementScaling', 'generate_boundary', 'generate_alternating_lbm_boundary',
+           'generate_lb_pack_info']
diff --git a/python/lbmpy_walberla/additional_data_handler.py b/python/lbmpy_walberla/additional_data_handler.py
index 03553a8f41ab9339b661354f323ba19c24cb6320..ac0dcd81233a5f387eed5912a1fe2338985869d1 100644
--- a/python/lbmpy_walberla/additional_data_handler.py
+++ b/python/lbmpy_walberla/additional_data_handler.py
@@ -1,11 +1,24 @@
 from pystencils.stencil import inverse_direction
 
 from lbmpy.advanced_streaming import AccessPdfValues, numeric_offsets, numeric_index
+from lbmpy.boundaries.boundaryconditions import LbBoundary
 from lbmpy.boundaries import ExtrapolationOutflow, UBB
 
 from pystencils_walberla.additional_data_handler import AdditionalDataHandler
 
 
+def default_additional_data_handler(boundary_obj: LbBoundary, lb_method, field_name, target='cpu'):
+    if not boundary_obj.additional_data:
+        return None
+
+    if isinstance(boundary_obj, UBB):
+        return UBBAdditionalDataHandler(lb_method.stencil, boundary_obj)
+    elif isinstance(boundary_obj, ExtrapolationOutflow):
+        return OutflowAdditionalDataHandler(lb_method.stencil, boundary_obj, target=target, field_name=field_name)
+    else:
+        raise ValueError(f"No default AdditionalDataHandler available for boundary of type {boundary_obj.__class__}")
+
+
 class UBBAdditionalDataHandler(AdditionalDataHandler):
     def __init__(self, stencil, boundary_object):
         assert isinstance(boundary_object, UBB)
@@ -114,7 +127,7 @@ class OutflowAdditionalDataHandler(AdditionalDataHandler):
         if self._dim == 2:
             pos.append("0")
         pos.append(str(numeric_index(pdf_accessor.accs[inv_dir])[0]))
-        result[f'pdf'] = ', '.join(pos)
+        result['pdf'] = ', '.join(pos)
 
         pos = []
         for p, o, t in zip(position, offsets, tangential_offset):
@@ -122,6 +135,6 @@ class OutflowAdditionalDataHandler(AdditionalDataHandler):
         if self._dim == 2:
             pos.append("0")
         pos.append(str(numeric_index(pdf_accessor.accs[inv_dir])[0]))
-        result[f'pdf_nd'] = ', '.join(pos)
+        result['pdf_nd'] = ', '.join(pos)
 
         return result
diff --git a/python/lbmpy_walberla/alternating_sweeps.py b/python/lbmpy_walberla/alternating_sweeps.py
new file mode 100644
index 0000000000000000000000000000000000000000..4bafc5dc800020fea52674695146cb20e001d0e2
--- /dev/null
+++ b/python/lbmpy_walberla/alternating_sweeps.py
@@ -0,0 +1,141 @@
+import numpy as np
+from pystencils_walberla.codegen import generate_selective_sweep, get_vectorize_instruction_set
+from pystencils_walberla.kernel_selection import (
+    AbstractInterfaceArgumentMapping, AbstractConditionNode, KernelCallNode)
+from pystencils import TypedSymbol
+from lbmpy.creationfunctions import create_lb_ast
+from lbmpy.advanced_streaming import Timestep, is_inplace
+
+
+class EvenIntegerCondition(AbstractConditionNode):
+    def __init__(self, parameter_name: str,
+                 branch_true, branch_false,
+                 parameter_dtype=int):
+        self.parameter_symbol = TypedSymbol(parameter_name, parameter_dtype)
+        super(EvenIntegerCondition, self).__init__(branch_true, branch_false)
+
+    @property
+    def selection_parameters(self):
+        return {self.parameter_symbol}
+
+    @property
+    def condition_text(self):
+        return f"(({self.parameter_symbol.name} & 1) ^ 1)"
+
+
+class OddIntegerCondition(AbstractConditionNode):
+    def __init__(self, parameter_name: str,
+                 branch_true, branch_false,
+                 parameter_dtype=int):
+        self.parameter_symbol = TypedSymbol(parameter_name, parameter_dtype)
+        super(OddIntegerCondition, self).__init__(branch_true, branch_false)
+
+    @property
+    def selection_parameters(self):
+        return {self.parameter_symbol}
+
+    @property
+    def condition_text(self):
+        return f"({self.parameter_symbol.name} & 1)"
+
+
+class TimestepTrackerMapping(AbstractInterfaceArgumentMapping):
+
+    def __init__(self, low_level_arg: TypedSymbol, tracker_identifier='tracker'):
+        self.tracker_symbol = TypedSymbol(tracker_identifier, 'std::shared_ptr<lbm::TimestepTracker> &')
+        self.high_level_args = (self.tracker_symbol,)
+        self.low_level_arg = low_level_arg
+
+    @property
+    def mapping_code(self):
+        return f"{self.tracker_symbol.name}->getCounter()"
+
+    @property
+    def headers(self):
+        return {'"lbm/inplace_streaming/TimestepTracker.h"'}
+
+
+def generate_alternating_lbm_sweep(generation_context, class_name, collision_rule, streaming_pattern,
+                                   namespace='lbm', field_swaps=(), varying_parameters=(),
+                                   inner_outer_split=False, ghost_layers_to_include=0, optimization=None,
+                                   **create_ast_params):
+    """Generates an Alternating lattice Boltzmann sweep class. This is in particular meant for
+    in-place streaming patterns, but can of course also be used with two-fields patterns (why make it
+    simple if you can make it complicated?). From a collision rule, two kernel implementations are
+    generated; one for even, and one for odd timesteps. At run time, the correct one is selected
+    according to a time step counter. This counter can be managed by the `walberla::lbm::TimestepTracker`
+    class.
+
+    Args:
+        generation_context: See documentation of `pystencils_walberla.generate_sweep`
+        class_name: Name of the generated class
+        collision_rule: LbCollisionRule as returned by `lbmpy.create_lb_collision_rule`.
+        streaming_pattern: Name of the streaming pattern; see `lbmpy.advanced_streaming`
+        namespace: see documentation of `generate_sweep`
+        field_swaps: see documentation of `generate_sweep`
+        varying_parameters: see documentation of `generate_sweep`
+        inner_outer_split: see documentation of `generate_sweep`
+        ghost_layers_to_include: see documentation of `generate_sweep`
+        optimization: dictionary containing optimization parameters, c.f. `lbmpy.creationfunctions`
+        create_ast_params: Further parameters passed to `create_lb_ast`
+    """
+    optimization = default_lbm_optimization_parameters(generation_context, optimization)
+
+    target = optimization['target']
+    if not generation_context.cuda and target == 'gpu':
+        return
+
+    ast_even = create_lb_ast(collision_rule=collision_rule, streaming_pattern=streaming_pattern,
+                             timestep=Timestep.EVEN, optimization=optimization, **create_ast_params)
+    ast_even.function_name = 'even'
+    kernel_even = KernelCallNode(ast_even)
+
+    if is_inplace(streaming_pattern):
+        ast_odd = create_lb_ast(collision_rule=collision_rule, streaming_pattern=streaming_pattern,
+                                timestep=Timestep.ODD, optimization=optimization, **create_ast_params)
+        ast_odd.function_name = 'odd'
+        kernel_odd = KernelCallNode(ast_odd)
+    else:
+        kernel_odd = kernel_even
+
+    tree = EvenIntegerCondition('timestep', kernel_even, kernel_odd, np.uint8)
+    interface_mappings = [TimestepTrackerMapping(tree.parameter_symbol)]
+
+    vec_info = optimization['vectorization']
+    openmp = optimization['openmp']
+
+    generate_selective_sweep(generation_context, class_name, tree,
+                             interface_mappings=interface_mappings,
+                             target=target, namespace=namespace,
+                             field_swaps=field_swaps, varying_parameters=varying_parameters,
+                             inner_outer_split=inner_outer_split, ghost_layers_to_include=ghost_layers_to_include,
+                             cpu_vectorize_info=vec_info, cpu_openmp=openmp)
+
+
+# ---------------------------------- Internal --------------------------------------------------------------------------
+
+
+def default_lbm_optimization_parameters(generation_context, params):
+    if params is None:
+        params = dict()
+
+    params['target'] = params.get('target', 'cpu')
+    params['double_precision'] = params.get('double_precision', generation_context.double_accuracy)
+    params['openmp'] = params.get('cpu_openmp', generation_context.openmp)
+    params['vectorization'] = params.get('vectorization', {})
+
+    if isinstance(params['vectorization'], bool):
+        do_vectorization = params['vectorization']
+        params['vectorization'] = dict()
+    else:
+        do_vectorization = True
+
+    vec = params['vectorization']
+    if isinstance(vec, dict):
+        default_vec_is = get_vectorize_instruction_set(generation_context) if do_vectorization else None
+
+        vec['instruction_set'] = vec.get('instruction_set', default_vec_is)
+        vec['assume_inner_stride_one'] = vec.get('assume_inner_stride_one', True)
+        vec['assume_aligned'] = vec.get('assume_aligned', False)
+        vec['nontemporal'] = vec.get('nontemporal', False)
+    return params
diff --git a/python/lbmpy_walberla/boundary.py b/python/lbmpy_walberla/boundary.py
index b0dafb7b3b9835073101acfa33a611c71c62095d..795d8565616b8b2335f1c50ec960a170d638542b 100644
--- a/python/lbmpy_walberla/boundary.py
+++ b/python/lbmpy_walberla/boundary.py
@@ -2,6 +2,14 @@ import pystencils_walberla.boundary
 from lbmpy.boundaries.boundaryhandling import create_lattice_boltzmann_boundary_kernel
 from lbmpy.advanced_streaming import Timestep, is_inplace
 
+from pystencils_walberla.kernel_selection import KernelCallNode
+from lbmpy_walberla.alternating_sweeps import EvenIntegerCondition, OddIntegerCondition, TimestepTrackerMapping
+from lbmpy_walberla.additional_data_handler import default_additional_data_handler
+
+from pystencils.data_types import TypedSymbol
+
+import numpy as np
+
 
 def generate_boundary(generation_context,
                       class_name,
@@ -9,28 +17,77 @@ def generate_boundary(generation_context,
                       lb_method,
                       field_name='pdfs',
                       streaming_pattern='pull',
-                      always_generate_separate_classes=False,
+                      prev_timestep=Timestep.BOTH,
                       additional_data_handler=None,
+                      namespace='lbm',
                       **create_kernel_params):
-    def boundary_creation_function(field, index_field, stencil, boundary_functor, target='cpu', openmp=True, **kwargs):
+    if boundary_object.additional_data and additional_data_handler is None:
+        target = create_kernel_params.get('target', 'cpu')
+        additional_data_handler = default_additional_data_handler(boundary_object, lb_method, field_name, target=target)
+
+    def boundary_creation_function(field, index_field, stencil, boundary_functor, target='cpu', **kwargs):
+        return create_lattice_boltzmann_boundary_kernel(field, index_field, lb_method, boundary_functor,
+                                                        streaming_pattern=streaming_pattern,
+                                                        prev_timestep=prev_timestep,
+                                                        target=target,
+                                                        **kwargs)
+
+    pystencils_walberla.boundary.generate_boundary(generation_context,
+                                                   class_name,
+                                                   boundary_object,
+                                                   field_name=field_name,
+                                                   neighbor_stencil=lb_method.stencil,
+                                                   index_shape=[len(lb_method.stencil)],
+                                                   kernel_creation_function=boundary_creation_function,
+                                                   namespace=namespace,
+                                                   additional_data_handler=additional_data_handler,
+                                                   **create_kernel_params)
+
+
+def generate_alternating_lbm_boundary(generation_context,
+                                      class_name,
+                                      boundary_object,
+                                      lb_method,
+                                      field_name='pdfs',
+                                      streaming_pattern='pull',
+                                      after_collision=True,
+                                      additional_data_handler=None,
+                                      namespace='lbm',
+                                      **create_kernel_params):
+    if boundary_object.additional_data and additional_data_handler is None:
+        target = create_kernel_params.get('target', 'cpu')
+        additional_data_handler = default_additional_data_handler(boundary_object, lb_method, field_name, target=target)
+
+    timestep_param_name = 'timestep'
+    timestep_param_dtype = np.uint8
+    timestep_param = TypedSymbol(timestep_param_name, timestep_param_dtype)
+
+    def boundary_creation_function(field, index_field, stencil, boundary_functor, target='cpu', **kwargs):
         pargs = (field, index_field, lb_method, boundary_functor)
         kwargs = {'target': target, **kwargs}
-        if is_inplace(streaming_pattern) or always_generate_separate_classes:
-            return {
-                'EvenSweep': create_lattice_boltzmann_boundary_kernel(*pargs,
-                                                                      streaming_pattern=streaming_pattern,
-                                                                      prev_timestep=Timestep.EVEN,
-                                                                      **kwargs),
-                'OddSweep': create_lattice_boltzmann_boundary_kernel(*pargs,
-                                                                     streaming_pattern=streaming_pattern,
-                                                                     prev_timestep=Timestep.ODD,
-                                                                     **kwargs)
-            }
-        else:
-            return create_lattice_boltzmann_boundary_kernel(*pargs,
+        ast_even = create_lattice_boltzmann_boundary_kernel(*pargs,
                                                             streaming_pattern=streaming_pattern,
-                                                            prev_timestep=Timestep.BOTH,
+                                                            prev_timestep=Timestep.EVEN,
                                                             **kwargs)
+        ast_even.function_name = 'even'
+        kernel_even = KernelCallNode(ast_even)
+
+        if is_inplace(streaming_pattern):
+            ast_odd = create_lattice_boltzmann_boundary_kernel(*pargs,
+                                                               streaming_pattern=streaming_pattern,
+                                                               prev_timestep=Timestep.ODD,
+                                                               **kwargs)
+            ast_odd.function_name = 'odd'
+            kernel_odd = KernelCallNode(ast_odd)
+        else:
+            kernel_odd = kernel_even
+
+        if after_collision:
+            return EvenIntegerCondition(timestep_param_name, kernel_even, kernel_odd, timestep_param_dtype)
+        else:
+            return OddIntegerCondition(timestep_param_name, kernel_even, kernel_odd, timestep_param_dtype)
+
+    interface_mappings = [TimestepTrackerMapping(timestep_param)]
 
     pystencils_walberla.boundary.generate_boundary(generation_context,
                                                    class_name,
@@ -39,6 +96,7 @@ def generate_boundary(generation_context,
                                                    neighbor_stencil=lb_method.stencil,
                                                    index_shape=[len(lb_method.stencil)],
                                                    kernel_creation_function=boundary_creation_function,
-                                                   namespace='lbm',
+                                                   namespace=namespace,
                                                    additional_data_handler=additional_data_handler,
+                                                   interface_mappings=interface_mappings,
                                                    **create_kernel_params)
diff --git a/python/pystencils_walberla/__init__.py b/python/pystencils_walberla/__init__.py
index b6c310169019dcc215268c60cd90e914dfed3648..2c7892f9164feb57949aca3bfd9c976b1a1ae019 100644
--- a/python/pystencils_walberla/__init__.py
+++ b/python/pystencils_walberla/__init__.py
@@ -3,8 +3,10 @@ from .cmake_integration import CodeGeneration
 from .codegen import (
     generate_pack_info, generate_pack_info_for_field, generate_pack_info_from_kernel,
     generate_mpidtype_info_from_kernel, generate_sweep, get_vectorize_instruction_set)
+from .utility import generate_info_header
 
 __all__ = ['CodeGeneration',
            'generate_sweep', 'generate_pack_info_from_kernel', 'generate_pack_info_for_field', 'generate_pack_info',
            'generate_mpidtype_info_from_kernel', 'generate_staggered_boundary', 'generate_staggered_flux_boundary',
-           'get_vectorize_instruction_set']
+           'get_vectorize_instruction_set',
+           'generate_info_header']
diff --git a/python/pystencils_walberla/boundary.py b/python/pystencils_walberla/boundary.py
index 0d251236c0f9400bc2aea424e775aa1eb6c7e5a2..c81df201f43951031035c48771e3c891f3c36e33 100644
--- a/python/pystencils_walberla/boundary.py
+++ b/python/pystencils_walberla/boundary.py
@@ -1,4 +1,3 @@
-from collections import OrderedDict
 import numpy as np
 from jinja2 import Environment, PackageLoader, StrictUndefined
 from pystencils import Field, FieldType
@@ -7,9 +6,12 @@ from pystencils.boundaries.createindexlist import (
     boundary_index_array_coordinate_names, direction_member_name,
     numpy_data_type_for_boundary_object)
 from pystencils.data_types import TypedSymbol, create_type
-from pystencils_walberla.codegen import KernelInfo, default_create_kernel_parameters
+from pystencils_walberla.codegen import default_create_kernel_parameters
 from pystencils_walberla.jinja_filters import add_pystencils_filters_to_jinja_env
 from pystencils_walberla.additional_data_handler import AdditionalDataHandler
+from pystencils_walberla.kernel_selection import (
+    KernelFamily, AbstractKernelSelectionNode, KernelCallNode, HighLevelInterfaceSpec)
+from pystencils.astnodes import KernelFunction
 
 
 def generate_boundary(generation_context,
@@ -23,6 +25,8 @@ def generate_boundary(generation_context,
                       target='cpu',
                       namespace='pystencils',
                       additional_data_handler=None,
+                      interface_mappings=(),
+                      generate_functor=True,
                       **create_kernel_params):
 
     if boundary_object.additional_data and additional_data_handler is None:
@@ -51,37 +55,27 @@ def generate_boundary(generation_context,
     if not kernel_creation_function:
         kernel_creation_function = create_boundary_kernel
 
-    kernels = kernel_creation_function(field, index_field, neighbor_stencil, boundary_object, **create_kernel_params)
-    if isinstance(kernels, dict):
-        sweep_to_kernel_info_dict = OrderedDict()
-        dummy_kernel_info = None
-        for sweep_class, sweep_kernel in kernels.items():
-            sweep_kernel.function_name = "boundary_" + boundary_object.name + '_' + sweep_class
-            sweep_kernel.assumed_inner_stride_one = False
-            kernel_info = KernelInfo(sweep_kernel)
-            sweep_to_kernel_info_dict[sweep_class] = kernel_info
-            if dummy_kernel_info is None:
-                dummy_kernel_info = kernel_info
-            # elif not dummy_kernel_info.has_same_interface(kernel_info):
-            #     raise ValueError("Multiple boundary sweeps must have the same kernel interface!")
-        multi_sweep = True
-    else:
-        multi_sweep = False
-        kernel = kernels
+    kernel = kernel_creation_function(field, index_field, neighbor_stencil, boundary_object, **create_kernel_params)
+
+    if isinstance(kernel, KernelFunction):
         kernel.function_name = "boundary_" + boundary_object.name
-        kernel.assumed_inner_stride_one = False
-        kernel_info = KernelInfo(kernel)
-        sweep_to_kernel_info_dict = {'': kernel_info}
-        dummy_kernel_info = kernel_info
+        selection_tree = KernelCallNode(kernel)
+    elif isinstance(kernel, AbstractKernelSelectionNode):
+        selection_tree = kernel
+    else:
+        raise ValueError(f"kernel_creation_function returned wrong type: {kernel.__class__}")
+
+    kernel_family = KernelFamily(selection_tree, class_name)
+    interface_spec = HighLevelInterfaceSpec(kernel_family.kernel_selection_parameters, interface_mappings)
 
     if additional_data_handler is None:
         additional_data_handler = AdditionalDataHandler(stencil=neighbor_stencil)
 
     context = {
+        'kernel': kernel_family,
         'class_name': boundary_object.name,
-        'sweep_classes': sweep_to_kernel_info_dict,
-        'multi_sweep': multi_sweep,
-        'dummy_kernel_info': dummy_kernel_info,
+        'interface_spec': interface_spec,
+        'generate_functor': generate_functor,
         'StructName': struct_name,
         'StructDeclaration': struct_from_numpy_dtype(struct_name, index_struct_dtype),
         'dim': dim,
diff --git a/python/pystencils_walberla/cmake_integration.py b/python/pystencils_walberla/cmake_integration.py
index d8b7e615c6ef93d177fc3245661baded53252ec4..8533b21359a91fc52fa865185e5cf7242d8dad0c 100644
--- a/python/pystencils_walberla/cmake_integration.py
+++ b/python/pystencils_walberla/cmake_integration.py
@@ -64,9 +64,9 @@ def parse_json_args():
     expected_files = parsed['EXPECTED_FILES']
     cmake_vars = {}
     for key, value in parsed['CMAKE_VARS'].items():
-        if value in ("ON", "1", "YES"):
+        if value in ("ON", "1", "YES", "TRUE"):
             value = True
-        elif value in ("OFF", "0", "NO"):
+        elif value in ("OFF", "0", "NO", "FALSE"):
             value = False
         cmake_vars[key] = value
     return expected_files, cmake_vars
diff --git a/python/pystencils_walberla/codegen.py b/python/pystencils_walberla/codegen.py
index f1a84798f2af17c254febf5e4c327964db824d91..27773a7e45d059ce322cb206ab3d92fae1e340df 100644
--- a/python/pystencils_walberla/codegen.py
+++ b/python/pystencils_walberla/codegen.py
@@ -12,6 +12,10 @@ from pystencils.backends.cbackend import get_headers
 from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets
 from pystencils.stencil import inverse_direction, offset_to_direction_string
 from pystencils_walberla.jinja_filters import add_pystencils_filters_to_jinja_env
+from pystencils_walberla.kernel_selection import KernelCallNode, KernelFamily, HighLevelInterfaceSpec
+from pystencils.backends.cuda_backend import CudaSympyPrinter
+from pystencils.kernelparameters import SHAPE_DTYPE
+from pystencils.data_types import TypedSymbol
 
 __all__ = ['generate_sweep', 'generate_pack_info', 'generate_pack_info_for_field', 'generate_pack_info_from_kernel',
            'generate_mpidtype_info_from_kernel', 'default_create_kernel_parameters', 'KernelInfo',
@@ -50,49 +54,92 @@ def generate_sweep(generation_context, class_name, assignments,
     """
     create_kernel_params = default_create_kernel_parameters(generation_context, create_kernel_params)
 
-    if not generation_context.cuda and create_kernel_params['target'] == 'gpu':
+    target = create_kernel_params['target']
+    if not generation_context.cuda and target == 'gpu':
         return
 
     if isinstance(assignments, KernelFunction):
         ast = assignments
-        create_kernel_params['target'] = ast.target
+        target = ast.target
     elif not staggered:
         ast = create_kernel(assignments, **create_kernel_params)
     else:
         ast = create_staggered_kernel(assignments, **create_kernel_params)
-    ast.assumed_inner_stride_one = create_kernel_params['cpu_vectorize_info']['assume_inner_stride_one']
-    ast.nontemporal = create_kernel_params['cpu_vectorize_info']['nontemporal']
-    ast.openmp = create_kernel_params['cpu_openmp']
 
+    ast.function_name = class_name.lower()
+
+    selection_tree = KernelCallNode(ast)
+    generate_selective_sweep(generation_context, class_name, selection_tree, target=target, namespace=namespace,
+                             field_swaps=field_swaps, varying_parameters=varying_parameters,
+                             inner_outer_split=inner_outer_split, ghost_layers_to_include=ghost_layers_to_include,
+                             cpu_vectorize_info=create_kernel_params['cpu_vectorize_info'],
+                             cpu_openmp=create_kernel_params['cpu_openmp'])
+
+
+def generate_selective_sweep(generation_context, class_name, selection_tree, interface_mappings=(), target=None,
+                             namespace='pystencils', field_swaps=(), varying_parameters=(),
+                             inner_outer_split=False, ghost_layers_to_include=0,
+                             cpu_vectorize_info=None, cpu_openmp=False):
+    """Generates a selective sweep from a kernel selection tree. A kernel selection tree consolidates multiple
+    pystencils ASTs in a tree-like structure. See also module `pystencils_walberla.kernel_selection`.
+
+    Args:
+        generation_context: see documentation of `generate_sweep`
+        class_name: name of the generated sweep class
+        selection_tree: Instance of `AbstractKernelSelectionNode`, root of the selection tree
+        interface_mappings: sequence of `AbstractInterfaceArgumentMapping` instances for selection arguments of
+                            the selection tree
+        target: `None`, `'cpu'` or `'gpu'`; inferred from kernels if `None` is given.
+        namespace: see documentation of `generate_sweep`
+        field_swaps: see documentation of `generate_sweep`
+        varying_parameters: see documentation of `generate_sweep`
+        inner_outer_split: see documentation of `generate_sweep`
+        ghost_layers_to_include: see documentation of `generate_sweep`
+        cpu_vectorize_info: Dictionary containing information about CPU vectorization applied to the kernels
+        cpu_openmp: Whether or not CPU kernels use OpenMP parallelization
+    """
     def to_name(f):
         return f.name if isinstance(f, Field) else f
 
     field_swaps = tuple((to_name(e[0]), to_name(e[1])) for e in field_swaps)
     temporary_fields = tuple(e[1] for e in field_swaps)
 
-    ast.function_name = class_name.lower()
+    kernel_family = KernelFamily(selection_tree, class_name,
+                                 temporary_fields, field_swaps, varying_parameters)
+
+    if target is None:
+        target = kernel_family.get_ast_attr('target')
+    elif target != kernel_family.get_ast_attr('target'):
+        raise ValueError('Mismatch between target parameter and AST targets.')
+
+    if not generation_context.cuda and target == 'gpu':
+        return
+
+    representative_field = {p.field_name for p in kernel_family.parameters if p.is_field_parameter}
+    representative_field = sorted(representative_field)[0]
 
     env = Environment(loader=PackageLoader('pystencils_walberla'), undefined=StrictUndefined)
     add_pystencils_filters_to_jinja_env(env)
 
-    main_kernel_info = KernelInfo(ast, temporary_fields, field_swaps, varying_parameters)
-    representative_field = {p.field_name for p in main_kernel_info.parameters if p.is_field_parameter}
-    representative_field = sorted(representative_field)[0]
+    interface_spec = HighLevelInterfaceSpec(kernel_family.kernel_selection_parameters, interface_mappings)
 
     jinja_context = {
-        'kernel': main_kernel_info,
+        'kernel': kernel_family,
         'namespace': namespace,
         'class_name': class_name,
-        'target': create_kernel_params.get("target", "cpu"),
+        'target': target,
         'field': representative_field,
-        'headers': get_headers(ast),
         'ghost_layers_to_include': ghost_layers_to_include,
-        'inner_outer_split': inner_outer_split
+        'inner_outer_split': inner_outer_split,
+        'interface_spec': interface_spec,
+        'generate_functor': True,
+        'cpu_vectorize_info': cpu_vectorize_info,
+        'cpu_openmp': cpu_openmp
     }
     header = env.get_template("Sweep.tmpl.h").render(**jinja_context)
     source = env.get_template("Sweep.tmpl.cpp").render(**jinja_context)
 
-    source_extension = "cpp" if create_kernel_params.get("target", "cpu") == "cpu" else "cu"
+    source_extension = "cpp" if target == "cpu" else "cu"
     generation_context.write_file("{}.h".format(class_name), header)
     generation_context.write_file("{}.{}".format(class_name, source_extension), source)
 
@@ -327,11 +374,48 @@ class KernelInfo:
         self.varying_parameters = tuple(varying_parameters)
         self.parameters = ast.get_parameters()  # cache parameters here
 
-    def has_same_interface(self, other):
-        return self.temporary_fields == other.temporary_fields \
-            and self.field_swaps == other.field_swaps \
-            and self.varying_parameters == other.varying_parameters \
-            and self.parameters == other.parameters
+    @property
+    def fields_accessed(self):
+        return self.ast.fields_accessed
+
+    def get_ast_attr(self, name):
+        """Returns the value of an attribute of the AST managed by this KernelInfo.
+        For compatibility with KernelFamily."""
+        return self.ast.__getattribute__(name)
+
+    def generate_kernel_invocation_code(self, **kwargs):
+        ast = self.ast
+        ast_params = self.parameters
+        is_cpu = self.ast.target == 'cpu'
+        call_parameters = ", ".join([p.symbol.name for p in ast_params])
+
+        if not is_cpu:
+            stream = kwargs.get('stream', '0')
+            spatial_shape_symbols = kwargs.get('spatial_shape_symbols', ())
+
+            if not spatial_shape_symbols:
+                spatial_shape_symbols = [p.symbol for p in ast_params if p.is_field_shape]
+                spatial_shape_symbols.sort(key=lambda e: e.coordinate)
+            else:
+                spatial_shape_symbols = [TypedSymbol(s, SHAPE_DTYPE) for s in spatial_shape_symbols]
+
+            assert spatial_shape_symbols, "No shape parameters in kernel function arguments.\n"\
+                "Please only use kernels for generic field sizes!"
+
+            indexing_dict = ast.indexing.call_parameters(spatial_shape_symbols)
+            sp_printer_c = CudaSympyPrinter()
+            kernel_call_lines = [
+                "dim3 _block(int(%s), int(%s), int(%s));" % tuple(sp_printer_c.doprint(e)
+                                                                  for e in indexing_dict['block']),
+                "dim3 _grid(int(%s), int(%s), int(%s));" % tuple(sp_printer_c.doprint(e)
+                                                                 for e in indexing_dict['grid']),
+                "internal_%s::%s<<<_grid, _block, 0, %s>>>(%s);" % (ast.function_name, ast.function_name,
+                                                                    stream, call_parameters),
+            ]
+
+            return "\n".join(kernel_call_lines)
+        else:
+            return f"internal_{ast.function_name}::{ast.function_name}({call_parameters});"
 
 
 def get_vectorize_instruction_set(generation_context):
diff --git a/python/pystencils_walberla/jinja_filters.py b/python/pystencils_walberla/jinja_filters.py
index ad6d317878ed6ce650ad81c7223f6d744cba79a5..00fb7279e989838a4fdfc6f2d733596bdcfec79c 100644
--- a/python/pystencils_walberla/jinja_filters.py
+++ b/python/pystencils_walberla/jinja_filters.py
@@ -2,12 +2,9 @@ import jinja2
 import sympy as sp
 # import re
 
-from pystencils import TypedSymbol
 from pystencils.backends.cbackend import generate_c
-from pystencils.backends.cuda_backend import CudaSympyPrinter
-from pystencils.data_types import get_base_type
+from pystencils.data_types import TypedSymbol, get_base_type
 from pystencils.field import FieldType
-from pystencils.kernelparameters import SHAPE_DTYPE
 from pystencils.sympyextensions import prod
 
 temporary_fieldMemberTemplate = """
@@ -92,6 +89,24 @@ def generate_definition(kernel_info, target='cpu'):
     return result
 
 
+def generate_declarations(kernel_family, target='cpu'):
+    declarations = []
+    for ast in kernel_family.all_asts:
+        code = generate_c(ast, signature_only=True, dialect='cuda' if target == 'gpu' else 'c') + ";"
+        code = "namespace internal_%s {\n%s\n}\n" % (ast.function_name, code,)
+        declarations.append(code)
+    return "\n".join(declarations)
+
+
+def generate_definitions(kernel_family, target='cpu'):
+    definitions = []
+    for ast in kernel_family.all_asts:
+        code = generate_c(ast, dialect='cuda' if target == 'gpu' else 'c')
+        code = "namespace internal_%s {\nstatic %s\n}\n" % (ast.function_name, code)
+        definitions.append(code)
+    return "\n".join(definitions)
+
+
 def field_extraction_code(field, is_temporary, declaration_only=False,
                           no_declaration=False, is_gpu=False, update_member=False):
     """Returns code string for getting a field pointer.
@@ -178,7 +193,7 @@ def generate_refs_for_kernel_parameters(kernel_info, prefix, parameters_to_ignor
 
 
 @jinja2.contextfilter
-def generate_call(ctx, kernel_info, ghost_layers_to_include=0, cell_interval=None, stream='0',
+def generate_call(ctx, kernel, ghost_layers_to_include=0, cell_interval=None, stream='0',
                   spatial_shape_symbols=()):
     """Generates the function call to a pystencils kernel
 
@@ -199,16 +214,23 @@ def generate_call(ctx, kernel_info, ghost_layers_to_include=0, cell_interval=Non
                                may be necessary.
     """
     assert isinstance(ghost_layers_to_include, str) or ghost_layers_to_include >= 0
-    ast = kernel_info.ast
-    ast_params = kernel_info.parameters
-    is_cpu = ctx['target'] == 'cpu'
+    ast_params = kernel.parameters
+    vec_info = ctx.get('cpu_vectorize_info', None)
+    instruction_set = kernel.get_ast_attr('instruction_set')
+    if vec_info:
+        assume_inner_stride_one = vec_info['assume_inner_stride_one']
+        nontemporal = vec_info['nontemporal']
+    else:
+        assume_inner_stride_one = nontemporal = False
+    cpu_openmp = ctx.get('cpu_openmp', False)
+    kernel_ghost_layers = kernel.get_ast_attr('ghost_layers')
 
     ghost_layers_to_include = sp.sympify(ghost_layers_to_include)
-    if ast.ghost_layers is None:
+    if kernel_ghost_layers is None:
         required_ghost_layers = 0
     else:
         # ghost layer info is ((x_gl_front, x_gl_end), (y_gl_front, y_gl_end).. )
-        required_ghost_layers = max(max(ast.ghost_layers))
+        required_ghost_layers = max(max(kernel_ghost_layers))
 
     kernel_call_lines = []
 
@@ -251,15 +273,15 @@ def generate_call(ctx, kernel_info, ghost_layers_to_include=0, cell_interval=Non
                 coordinates = tuple(coordinates)
                 kernel_call_lines.append(f"{param.symbol.dtype} {param.symbol.name} = {param.field_name}->dataAt"
                                          f"({coordinates[0]}, {coordinates[1]}, {coordinates[2]}, {coordinates[3]});")
-                if ast.assumed_inner_stride_one and field.index_dimensions > 0:
+                if assume_inner_stride_one and field.index_dimensions > 0:
                     kernel_call_lines.append(f"WALBERLA_ASSERT_EQUAL({param.field_name}->layout(), field::fzyx);")
-                if ast.instruction_set and ast.assumed_inner_stride_one:
-                    if ast.nontemporal and ast.openmp and 'cachelineZero' in ast.instruction_set:
+                if instruction_set and assume_inner_stride_one:
+                    if nontemporal and cpu_openmp and 'cachelineZero' in instruction_set:
                         kernel_call_lines.append(f"WALBERLA_ASSERT_EQUAL((uintptr_t) {field.name}->dataAt(0, 0, 0, 0) %"
-                                                 f"{ast.instruction_set['cachelineSize']}, 0);")
+                                                 f"{instruction_set['cachelineSize']}, 0);")
                     else:
                         kernel_call_lines.append(f"WALBERLA_ASSERT_EQUAL((uintptr_t) {field.name}->dataAt(0, 0, 0, 0) %"
-                                                 f"{ast.instruction_set['bytes']}, 0);")
+                                                 f"{instruction_set['bytes']}, 0);")
         elif param.is_field_stride:
             casted_stride = get_field_stride(param)
             type_str = param.symbol.dtype.base_name
@@ -273,38 +295,19 @@ def generate_call(ctx, kernel_info, ghost_layers_to_include=0, cell_interval=Non
             max_value = f"{field.name}->{('x', 'y', 'z')[coord]}SizeWithGhostLayer()"
             kernel_call_lines.append(f"WALBERLA_ASSERT_GREATER_EQUAL({max_value}, {shape});")
             kernel_call_lines.append(f"const {type_str} {param.symbol.name} = {shape};")
-            if ast.assumed_inner_stride_one and field.index_dimensions > 0:
+            if assume_inner_stride_one and field.index_dimensions > 0:
                 kernel_call_lines.append(f"WALBERLA_ASSERT_EQUAL({field.name}->layout(), field::fzyx);")
-            if ast.instruction_set and ast.assumed_inner_stride_one:
-                if ast.nontemporal and ast.openmp and 'cachelineZero' in ast.instruction_set:
+            if instruction_set and assume_inner_stride_one:
+                if nontemporal and cpu_openmp and 'cachelineZero' in instruction_set:
                     kernel_call_lines.append(f"WALBERLA_ASSERT_EQUAL((uintptr_t) {field.name}->dataAt(0, 0, 0, 0) %"
-                                             f"{ast.instruction_set['cachelineSize']}, 0);")
+                                             f"{instruction_set['cachelineSize']}, 0);")
                 else:
                     kernel_call_lines.append(f"WALBERLA_ASSERT_EQUAL((uintptr_t) {field.name}->dataAt(0, 0, 0, 0) %"
-                                             f"{ast.instruction_set['bytes']}, 0);")
+                                             f"{instruction_set['bytes']}, 0);")
 
-    call_parameters = ", ".join([p.symbol.name for p in ast_params])
+    kernel_call_lines.append(kernel.generate_kernel_invocation_code(stream=stream,
+                                                                    spatial_shape_symbols=spatial_shape_symbols))
 
-    if not is_cpu:
-        if not spatial_shape_symbols:
-            spatial_shape_symbols = [p.symbol for p in ast_params if p.is_field_shape]
-            spatial_shape_symbols.sort(key=lambda e: e.coordinate)
-        else:
-            spatial_shape_symbols = [TypedSymbol(s, SHAPE_DTYPE) for s in spatial_shape_symbols]
-
-        assert spatial_shape_symbols, "No shape parameters in kernel function arguments.\n"\
-            "Please be only use kernels for generic field sizes!"
-
-        indexing_dict = ast.indexing.call_parameters(spatial_shape_symbols)
-        sp_printer_c = CudaSympyPrinter()
-        kernel_call_lines += [
-            "dim3 _block(int(%s), int(%s), int(%s));" % tuple(sp_printer_c.doprint(e) for e in indexing_dict['block']),
-            "dim3 _grid(int(%s), int(%s), int(%s));" % tuple(sp_printer_c.doprint(e) for e in indexing_dict['grid']),
-            "internal_%s::%s<<<_grid, _block, 0, %s>>>(%s);" % (ast.function_name, ast.function_name,
-                                                                stream, call_parameters),
-        ]
-    else:
-        kernel_call_lines.append("internal_%s::%s(%s);" % (ast.function_name, ast.function_name, call_parameters))
     return "\n".join(kernel_call_lines)
 
 
@@ -373,8 +376,7 @@ def generate_constructor_call_arguments(kernel_info, parameters_to_ignore=None):
 
 @jinja2.contextfilter
 def generate_members(ctx, kernel_info, parameters_to_ignore=(), only_fields=False):
-    ast = kernel_info.ast
-    fields = {f.name: f for f in ast.fields_accessed}
+    fields = {f.name: f for f in kernel_info.fields_accessed}
 
     params_to_skip = tuple(parameters_to_ignore) + tuple(kernel_info.temporary_fields)
     params_to_skip += tuple(e[1] for e in kernel_info.varying_parameters)
@@ -424,9 +426,58 @@ def nested_class_method_definition_prefix(ctx, nested_class_name):
         return outer_class + '::' + nested_class_name
 
 
+def generate_list_of_expressions(expressions, prepend=''):
+    if len(expressions) == 0:
+        return ''
+    return prepend + ", ".join(expressions)
+
+
+def type_identifier_list(nested_arg_list):
+    """
+    Filters a nested list of strings and TypedSymbols and returns a comma-separated string.
+    Strings are passed through as they are, but TypedSymbols are formatted as C-style
+    'type identifier' strings, e.g. 'uint32_t ghost_layers'.
+    """
+    result = []
+
+    def recursive_flatten(list):
+        for s in list:
+            if isinstance(s, str):
+                result.append(s)
+            elif isinstance(s, TypedSymbol):
+                result.append(f"{s.dtype} {s.name}")
+            else:
+                recursive_flatten(s)
+
+    recursive_flatten(nested_arg_list)
+    return ", ".join(result)
+
+
+def identifier_list(nested_arg_list):
+    """
+    Filters a nested list of strings and TypedSymbols and returns a comma-separated string.
+    Strings are passed through as they are, but TypedSymbols are replaced by their name.
+    """
+    result = []
+
+    def recursive_flatten(list):
+        for s in list:
+            if isinstance(s, str):
+                result.append(s)
+            elif isinstance(s, TypedSymbol):
+                result.append(s.name)
+            else:
+                recursive_flatten(s)
+
+    recursive_flatten(nested_arg_list)
+    return ", ".join(result)
+
+
 def add_pystencils_filters_to_jinja_env(jinja_env):
     jinja_env.filters['generate_definition'] = generate_definition
     jinja_env.filters['generate_declaration'] = generate_declaration
+    jinja_env.filters['generate_definitions'] = generate_definitions
+    jinja_env.filters['generate_declarations'] = generate_declarations
     jinja_env.filters['generate_members'] = generate_members
     jinja_env.filters['generate_constructor_parameters'] = generate_constructor_parameters
     jinja_env.filters['generate_constructor_initializer_list'] = generate_constructor_initializer_list
@@ -437,3 +488,6 @@ def add_pystencils_filters_to_jinja_env(jinja_env):
     jinja_env.filters['generate_refs_for_kernel_parameters'] = generate_refs_for_kernel_parameters
     jinja_env.filters['generate_destructor'] = generate_destructor
     jinja_env.filters['nested_class_method_definition_prefix'] = nested_class_method_definition_prefix
+    jinja_env.filters['type_identifier_list'] = type_identifier_list
+    jinja_env.filters['identifier_list'] = identifier_list
+    jinja_env.filters['list_of_expressions'] = generate_list_of_expressions
diff --git a/python/pystencils_walberla/kernel_selection.py b/python/pystencils_walberla/kernel_selection.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e2becd6f5efdbced366cdaed5b2d4ec536ae3e1
--- /dev/null
+++ b/python/pystencils_walberla/kernel_selection.py
@@ -0,0 +1,343 @@
+from typing import Sequence, Set
+from collections import OrderedDict
+from functools import reduce
+from jinja2.filters import do_indent
+from pystencils import TypedSymbol
+from pystencils.backends.cbackend import get_headers
+from pystencils.backends.cuda_backend import CudaSympyPrinter
+from pystencils.kernelparameters import SHAPE_DTYPE
+
+
+"""
+
+This module contains several classes and methods supporting the code generation of sweeps
+containing multiple kernels.
+
+A sweep class may be backed by multiple kernel implementations with (nearly) the same interface.
+Those might be slightly altered versions of a single kernel, and the one to be executed may
+be selected at run time according to certain criteria. An example are the even/odd alternating
+kernels required for lattice Boltzmann inplace streaming. Executing multiple kernels in sequence
+within a single sweep could also be a use case, which is however not yet implemented.
+
+## The Kernel Selection Tree
+
+The selection of the correct kernel is modelled by a tree structure, spanned by instances of
+subclasses of `AbstractKernelSelectionNode`. The code generator traverses this tree and generates
+code for invoking kernels according to its structure and nodes.
+Currently, two types of nodes exist:
+
+- Condition Nodes: Subclasses of `AbstractConditionNode` manifest as if/else statements
+    in the generated code. They model decision points and have two subtrees; one for the
+    `true` case, and one for `false`. A basic implementation is `SimpleBooleanCondition`,
+    which simply branches according to a boolean value. Each condition node requires a number
+    of selection parameters. The total of selection parameters of the entire tree are collected
+    during code generation, and must all be passed to the generated C++ functions.
+- Kernel Call Nodes: Currently, `KernelCallNode` corresponds to the invocation of a single
+    kernel. `KernelCallNode` acts as a wrapper for a single pystencils AST. When encountered
+    in the tree during code generation, the function call for this AST is inserted.
+
+
+## The Kernel Family
+
+The `KernelFamily` class is a wrapper around a kernel selection tree. It was developed as a
+generalization of the `KernelInfo` class. Its purpose is the collection and management of
+information about the tree and its kernels which is required for code generation.
+It also checks the tree's ASTs for consistency; for example by making sure that any fields
+and symbols required by multiple ASTs have the same type, dimensions, et cetera.
+
+
+## High-Level Interface
+
+Due to the tree's selection arguments, which must be passed to the methods wrapping the
+kernel calls, the generated class can not by itself be used as a sweep functor to be passed
+to the waLBerla timeloop. Instead, for all sweep types, a `get[...]Sweep` member function is
+generated. It takes any required selection arguments, and returns a lambda function which
+can be passed directly to the timeloop.
+
+Using the interface mapping system, the 'low-level' selection arguments of the kernel tree
+can be hidden behind higher-level arguments. Such argument mappings are modelled by the
+subclasses of `AbstractInterfaceArgumentMapping`. During code generation, they are organized
+in an instance of `HighLevelInterfaceSpec`, which is used to generate the high-level interface.
+
+
+"""
+
+# ---------------------------------- Selection Tree --------------------------------------------------------------------
+
+
+class AbstractKernelSelectionNode:
+
+    def collect_kernel_calls(self):
+        raise NotImplementedError()
+
+    @property
+    def selection_parameters(self) -> Set[TypedSymbol]:
+        raise NotImplementedError()
+
+    def collect_selection_parameters(self) -> Set[TypedSymbol]:
+        return self.selection_parameters
+
+    def get_selection_parameter_list(self) -> Sequence[TypedSymbol]:
+        all_params = self.collect_selection_parameters()
+        all_names = set(p.name for p in all_params)
+        if len(all_names) < len(all_params):
+            raise ValueError('There existed selection parameters of same name, but different type.')
+        return sorted(all_params, key=lambda x: x.name)
+
+    def get_code(self, **kwargs) -> str:
+        raise NotImplementedError()
+
+
+class AbstractConditionNode(AbstractKernelSelectionNode):
+    def __init__(self, branch_true, branch_false):
+        self.branch_true = branch_true
+        self.branch_false = branch_false
+
+    @property
+    def condition_text(self) -> str:
+        raise NotImplementedError()
+
+    def collect_kernel_calls(self):
+        return self.branch_true.collect_kernel_calls() | self.branch_false.collect_kernel_calls()
+
+    def collect_selection_parameters(self) -> Set[TypedSymbol]:
+        return self.selection_parameters.union(self.branch_true.collect_selection_parameters(),
+                                               self.branch_false.collect_selection_parameters())
+
+    def get_code(self, **kwargs):
+        true_branch_code = self.branch_true.get_code(**kwargs)
+        false_branch_code = self.branch_false.get_code(**kwargs)
+
+        true_branch_code = do_indent(true_branch_code, width=4, indentfirst=True)
+        false_branch_code = do_indent(false_branch_code, width=4, indentfirst=True)
+
+        code = f"if({self.condition_text}) {{\n"
+        code += true_branch_code
+        code += "\n} else {\n"
+        code += false_branch_code
+        code += "\n}"
+        return code
+
+
+class KernelCallNode(AbstractKernelSelectionNode):
+    def __init__(self, ast):
+        self.ast = ast
+        self.parameters = ast.get_parameters()  # cache parameters here
+
+    @property
+    def selection_parameters(self) -> Set[TypedSymbol]:
+        return set()
+
+    def collect_kernel_calls(self):
+        return {self}
+
+    def get_code(self, **kwargs):
+        ast = self.ast
+        ast_params = self.parameters
+        is_cpu = self.ast.target == 'cpu'
+        call_parameters = ", ".join([p.symbol.name for p in ast_params])
+
+        if not is_cpu:
+            stream = kwargs.get('stream', '0')
+            spatial_shape_symbols = kwargs.get('spatial_shape_symbols', ())
+
+            if not spatial_shape_symbols:
+                spatial_shape_symbols = [p.symbol for p in ast_params if p.is_field_shape]
+                spatial_shape_symbols.sort(key=lambda e: e.coordinate)
+            else:
+                spatial_shape_symbols = [TypedSymbol(s, SHAPE_DTYPE) for s in spatial_shape_symbols]
+
+            assert spatial_shape_symbols, "No shape parameters in kernel function arguments.\n"\
+                "Please only use kernels for generic field sizes!"
+
+            indexing_dict = ast.indexing.call_parameters(spatial_shape_symbols)
+            sp_printer_c = CudaSympyPrinter()
+            kernel_call_lines = [
+                "dim3 _block(int(%s), int(%s), int(%s));" % tuple(sp_printer_c.doprint(e)
+                                                                  for e in indexing_dict['block']),
+                "dim3 _grid(int(%s), int(%s), int(%s));" % tuple(sp_printer_c.doprint(e)
+                                                                 for e in indexing_dict['grid']),
+                "internal_%s::%s<<<_grid, _block, 0, %s>>>(%s);" % (ast.function_name, ast.function_name,
+                                                                    stream, call_parameters),
+            ]
+
+            return "\n".join(kernel_call_lines)
+        else:
+            return f"internal_{ast.function_name}::{ast.function_name}({call_parameters});"
+
+
+class SimpleBooleanCondition(AbstractConditionNode):
+    def __init__(self,
+                 parameter_name: str,
+                 branch_true: AbstractKernelSelectionNode,
+                 branch_false: AbstractKernelSelectionNode):
+        self.parameter_symbol = TypedSymbol(parameter_name, bool)
+        super(SimpleBooleanCondition, self).__init__(branch_true, branch_false)
+
+    @property
+    def selection_parameters(self) -> Set[TypedSymbol]:
+        return {self.parameter_symbol}
+
+    @property
+    def condition_text(self) -> str:
+        return self.parameter_symbol.name
+
+
+# ---------------------------------- Kernel Family ---------------------------------------------------------------------
+
+
+class KernelFamily:
+    def __init__(self, kernel_selection_tree: AbstractKernelSelectionNode,
+                 class_name: str,
+                 temporary_fields=(), field_swaps=(), varying_parameters=()):
+        self.kernel_selection_tree = kernel_selection_tree
+        self.kernel_selection_parameters = kernel_selection_tree.get_selection_parameter_list()
+        self.temporary_fields = tuple(temporary_fields)
+        self.field_swaps = tuple(field_swaps)
+        self.varying_parameters = tuple(varying_parameters)
+
+        all_kernel_calls = self.kernel_selection_tree.collect_kernel_calls()
+        all_param_lists = [k.parameters for k in all_kernel_calls]
+        asts_list = [k.ast for k in all_kernel_calls]
+        self.representative_ast = asts_list[0]
+
+        #   Eliminate duplicates
+        self.all_asts = set(asts_list)
+
+        #   Check function names for uniqueness and reformat them
+        #   using the class name
+        function_names = [ast.function_name.lower() for ast in self.all_asts]
+        unique_names = set(function_names)
+        if len(unique_names) < len(function_names):
+            raise ValueError('Function names of kernel family members must be unique!')
+
+        prefix = class_name.lower()
+        for ast in self.all_asts:
+            ast.function_name = prefix + '_' + ast.function_name
+
+        all_fields = [k.ast.fields_accessed for k in all_kernel_calls]
+
+        #   Collect function parameters and accessed fields
+        self.parameters = merge_lists_of_symbols(all_param_lists)
+        self.fields_accessed = reduce(lambda x, y: x | y, all_fields)
+
+        self._ast_attrs = dict()
+
+    def get_ast_attr(self, name):
+        """Returns the value of an attribute of the ASTs managed by this KernelFamily only
+        if it is the same in all ASTs."""
+        if name not in self._ast_attrs:
+            attr = self.representative_ast.__getattribute__(name)
+            for ast in self.all_asts:
+                if ast.__getattribute__(name) != attr:
+                    raise ValueError(f'Inconsistency in kernel family: Attribute {name} was different in {ast}!')
+            self._ast_attrs[name] = attr
+        return self._ast_attrs[name]
+
+    def get_headers(self):
+        all_headers = [get_headers(ast) for ast in self.all_asts]
+        return reduce(merge_sorted_lists, all_headers)
+
+    def generate_kernel_invocation_code(self, **kwargs):
+        return self.kernel_selection_tree.get_code(**kwargs)
+
+
+# --------------------------- High-Level Sweep Interface Specification ------------------------------------------------
+
+
+class AbstractInterfaceArgumentMapping:
+    def __init__(self, high_level_args: Sequence[TypedSymbol], low_level_arg: TypedSymbol):
+        self.high_level_args = high_level_args
+        self.low_level_arg = low_level_arg
+
+    @property
+    def mapping_code(self):
+        raise NotImplementedError()
+
+    @property
+    def headers(self):
+        return set()
+
+
+class IdentityMapping(AbstractInterfaceArgumentMapping):
+
+    def __init__(self, low_level_arg: TypedSymbol):
+        self.high_level_args = (low_level_arg,)
+        self.low_level_arg = low_level_arg
+
+    @property
+    def mapping_code(self):
+        return self.low_level_arg.name
+
+
+def _create_interface_mapping_dict(low_level_args: Sequence[TypedSymbol],
+                                   mappings: Sequence[AbstractInterfaceArgumentMapping]):
+    mapping_dict = OrderedDict()
+    for m in mappings:
+        larg = m.low_level_arg
+        if larg not in low_level_args:
+            raise ValueError(f'Low-level argument {larg} did not exist.')
+        if larg.name in mapping_dict:
+            raise ValueError(f'More than one mapping was given for low-level argument {larg}')
+        mapping_dict[larg.name] = m
+
+    for arg in low_level_args:
+        mapping_dict.setdefault(arg.name, IdentityMapping(arg))
+
+    return mapping_dict
+
+
+class HighLevelInterfaceSpec:
+    def __init__(self, low_level_args: Sequence[TypedSymbol],
+                 mappings: Sequence[AbstractInterfaceArgumentMapping]):
+        self.low_level_args = low_level_args
+        mapping_dict = _create_interface_mapping_dict(low_level_args, mappings)
+        self.mappings = mapping_dict.values()
+        high_level_args = OrderedDict()
+        self.mapping_codes = []
+        self.headers = set()
+        for larg in low_level_args:
+            m = mapping_dict[larg.name]
+            self.mapping_codes.append(m.mapping_code)
+            self.headers |= m.headers
+            for harg in m.high_level_args:
+                if high_level_args.setdefault(harg.name, harg) != harg:
+                    raise ValueError(f'High-Level Argument {harg} was given multiple times with different types.')
+
+        self.high_level_args = list(high_level_args.values())
+
+
+# ---------------------------------- Helpers --------------------------------------------------------------------------
+
+
+def merge_sorted_lists(lx, ly, sort_key=lambda x: x, identity_check_key=None):
+    if identity_check_key is None:
+        identity_check_key = sort_key
+    nx = len(lx)
+    ny = len(ly)
+
+    def recursive_merge(lx, ly, ix, iy):
+        if ix == nx:
+            return ly[iy:]
+        if iy == ny:
+            return lx[ix:]
+        x = lx[ix]
+        y = ly[iy]
+        skx = sort_key(x)
+        sky = sort_key(y)
+        if skx == sky:
+            if identity_check_key(x) == identity_check_key(y):
+                return [x] + recursive_merge(lx, ly, ix + 1, iy + 1)
+            else:
+                raise ValueError(f'Elements <{x}> and <{y}> with equal sort key where not identical!')
+        elif skx < sky:
+            return [x] + recursive_merge(lx, ly, ix + 1, iy)
+        else:
+            return [y] + recursive_merge(lx, ly, ix, iy + 1)
+    return recursive_merge(lx, ly, 0, 0)
+
+
+def merge_lists_of_symbols(lists):
+    def merger(lx, ly):
+        return merge_sorted_lists(lx, ly, sort_key=lambda x: x.symbol.name, identity_check_key=lambda x: x.symbol)
+    return reduce(merger, lists)
diff --git a/python/pystencils_walberla/templates/Boundary.tmpl.cpp b/python/pystencils_walberla/templates/Boundary.tmpl.cpp
index ef1ec3b7b860f3ac0fd4b290e1e5fe33bd7f772e..382047df7e2abaad65ac2ae2f45adde353b8266d 100644
--- a/python/pystencils_walberla/templates/Boundary.tmpl.cpp
+++ b/python/pystencils_walberla/templates/Boundary.tmpl.cpp
@@ -50,12 +50,7 @@ namespace {{namespace}} {
 #pragma diag_suppress = declared_but_not_referenced
 #endif
 
-
-{% for sweep_class, sweep_kernel in sweep_classes.items() %}
-
-{{sweep_kernel|generate_definition}}
-
-{% endfor %}
+{{kernel|generate_definitions(target)}}
 
 #ifdef __GNUC__
 #pragma GCC diagnostic pop
@@ -65,46 +60,52 @@ namespace {{namespace}} {
 #pragma pop
 #endif
 
-{% for sweep_class, sweep_kernel in sweep_classes.items() %}
 
-
-void {{sweep_class|nested_class_method_definition_prefix}}::run( IBlock * block, IndexVectors::Type type {% if target == 'gpu'%}, cudaStream_t stream {%endif%})
+void {{class_name}}::run_impl(
+   {{- ["IBlock * block", "IndexVectors::Type type",
+        kernel.kernel_selection_parameters, ["cudaStream_t stream"] if target == 'gpu' else []]
+       | type_identifier_list -}}
+)
 {
-    auto * indexVectors = block->getData<IndexVectors>(indexVectorID);
-    int64_t indexVectorSize = int64_c( indexVectors->indexVector(type).size() );
-    if( indexVectorSize == 0)
-        return;
-
-    {% if target == 'gpu' -%}
-    auto pointer = indexVectors->pointerGpu(type);
-    {% else %}
-    auto pointer = indexVectors->pointerCpu(type);
-    {% endif %}
-
-    uint8_t * _data_indexVector = reinterpret_cast<uint8_t*>(pointer);
-
-    {{sweep_kernel|generate_block_data_to_field_extraction(['indexVector', 'indexVectorSize'])|indent(4)}}
-    {{sweep_kernel|generate_refs_for_kernel_parameters(prefix='', parameters_to_ignore=['indexVectorSize'], ignore_fields=True)|indent(4) }}
-    {{sweep_kernel|generate_call(spatial_shape_symbols=['indexVectorSize'], stream='stream')|indent(4)}}
+   auto * indexVectors = block->getData<IndexVectors>(indexVectorID);
+   int64_t indexVectorSize = int64_c( indexVectors->indexVector(type).size() );
+   if( indexVectorSize == 0)
+      return;
+
+   {% if target == 'gpu' -%}
+   auto pointer = indexVectors->pointerGpu(type);
+   {% else %}
+   auto pointer = indexVectors->pointerCpu(type);
+   {% endif %}
+
+   uint8_t * _data_indexVector = reinterpret_cast<uint8_t*>(pointer);
+
+   {{kernel|generate_block_data_to_field_extraction(['indexVector', 'indexVectorSize'])|indent(4)}}
+   {{kernel|generate_refs_for_kernel_parameters(prefix='', parameters_to_ignore=['indexVectorSize'], ignore_fields=True)|indent(4) }}
+   {{kernel|generate_call(spatial_shape_symbols=['indexVectorSize'], stream='stream')|indent(4)}}
 }
 
-void {{sweep_class|nested_class_method_definition_prefix}}::operator() ( IBlock * block{% if target == 'gpu'%}, cudaStream_t stream {%endif%} )
+void {{class_name}}::run(
+   {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}}
+)
 {
-    run( block, IndexVectors::ALL{% if target == 'gpu'%}, stream {%endif%});
+   run_impl( {{- ["block", "IndexVectors::ALL", kernel.kernel_selection_parameters, ["stream"] if target == 'gpu' else []] | identifier_list -}} );
 }
 
-void {{sweep_class|nested_class_method_definition_prefix}}::inner( IBlock * block{% if target == 'gpu'%}, cudaStream_t stream {%endif%} )
+void {{class_name}}::inner(
+   {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}}
+)
 {
-    run( block, IndexVectors::INNER{% if target == 'gpu'%}, stream {%endif%} );
+   run_impl( {{- ["block", "IndexVectors::INNER", kernel.kernel_selection_parameters, ["stream"] if target == 'gpu' else []] | identifier_list -}} );
 }
 
-void {{sweep_class|nested_class_method_definition_prefix}}::outer( IBlock * block{% if target == 'gpu'%}, cudaStream_t stream {%endif%} )
+void {{class_name}}::outer(
+   {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}}
+)
 {
-    run( block, IndexVectors::OUTER{% if target == 'gpu'%}, stream {%endif%} );
+   run_impl( {{- ["block", "IndexVectors::OUTER", kernel.kernel_selection_parameters, ["stream"] if target == 'gpu' else []] | identifier_list -}} );
 }
 
-{% endfor %}
-
 } // namespace {{namespace}}
 } // namespace walberla
 
diff --git a/python/pystencils_walberla/templates/Boundary.tmpl.h b/python/pystencils_walberla/templates/Boundary.tmpl.h
index c0a39c32b0ae8c08202ae7f0641f03db0ad4e592..e22d96a796ac9e1420a4d69b5b9122081a655dbf 100644
--- a/python/pystencils_walberla/templates/Boundary.tmpl.h
+++ b/python/pystencils_walberla/templates/Boundary.tmpl.h
@@ -34,6 +34,10 @@
 #include <set>
 #include <vector>
 
+{% for header in interface_spec.headers %}
+#include {{header}}
+{% endfor %}
+
 #ifdef __GNUC__
 #define RESTRICT __restrict__
 #elif _MSC_VER
@@ -108,46 +112,55 @@ public:
         {%- endif %}
     };
 
-    {% if multi_sweep %}
-    {% for sweep_class_name, sweep_kernel_info in sweep_classes.items() %}
-    class {{sweep_class_name}}
+    {{class_name}}( const shared_ptr<StructuredBlockForest> & blocks,
+                   {{kernel|generate_constructor_parameters(['indexVector', 'indexVectorSize'])}}{{additional_data_handler.constructor_arguments}})
+        :{{additional_data_handler.initialiser_list}} {{ kernel|generate_constructor_initializer_list(['indexVector', 'indexVectorSize']) }}
     {
-        public:
-            {{sweep_class_name}} ( {{sweep_kernel_info|generate_constructor_parameters(['indexVectorSize'])}} )
-                : {{ sweep_kernel_info|generate_constructor_initializer_list(['indexVectorSize']) }} {};
-
-            void operator() ( IBlock * block {% if target == 'gpu'%}, cudaStream_t stream = 0 {%endif%});
-            void inner( IBlock * block {% if target == 'gpu'%}, cudaStream_t stream = 0 {%endif%});
-            void outer( IBlock * block {% if target == 'gpu'%}, cudaStream_t stream = 0 {%endif%});
-        
-        private:
-            void run( IBlock * block, IndexVectors::Type type{% if target == 'gpu'%}, cudaStream_t stream = 0 {%endif%});
-
-            {{sweep_kernel_info|generate_members(['indexVectorSize'])|indent(12)}}
+        auto createIdxVector = []( IBlock * const , StructuredBlockStorage * const ) { return new IndexVectors(); };
+        indexVectorID = blocks->addStructuredBlockData< IndexVectors >( createIdxVector, "IndexField_{{class_name}}");
     };
 
-    {{sweep_class_name}} get{{sweep_class_name}} ()
+    void run (
+        {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}}
+    );
+
+    {% if generate_functor -%}
+    void operator() (
+        {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}}
+    )
     {
-        return {{sweep_class_name}} ( {{sweep_kernel_info|generate_constructor_call_arguments(['indexVectorSize'])|indent(12)}} );
+        run( {{- ["block", kernel.kernel_selection_parameters, ["stream"] if target == 'gpu' else []] | identifier_list -}} );
     }
-    {% endfor %}
-    {% endif %}
+    {%- endif %}
 
-    {{class_name}}( const shared_ptr<StructuredBlockForest> & blocks,
-                   {{dummy_kernel_info|generate_constructor_parameters(['indexVector', 'indexVectorSize'])}}{{additional_data_handler.constructor_arguments}})
-        :{{additional_data_handler.initialiser_list}} {{ dummy_kernel_info|generate_constructor_initializer_list(['indexVector', 'indexVectorSize']) }}
-    {
-        auto createIdxVector = []( IBlock * const , StructuredBlockStorage * const ) { return new IndexVectors(); };
-        indexVectorID = blocks->addStructuredBlockData< IndexVectors >( createIdxVector, "IndexField_{{class_name}}");
-    };
+    void inner (
+        {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}}
+    );
+
+    void outer (
+        {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}}
+    );
 
-    {% if not multi_sweep %}
+    std::function<void (IBlock *)> getSweep( {{- [interface_spec.high_level_args, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} )
+    {
+        return [ {{- ["this", interface_spec.high_level_args, ["stream"] if target == 'gpu' else []] | identifier_list -}} ]
+               (IBlock * b)
+               { this->run( {{- [ ['b'], interface_spec.mapping_codes, ["stream"] if target == 'gpu' else [] ] | identifier_list -}} ); };
+    }
 
-    void operator() ( IBlock * block {% if target == 'gpu'%}, cudaStream_t stream = 0 {%endif%});
-    void inner( IBlock * block {% if target == 'gpu'%}, cudaStream_t stream = 0 {%endif%});
-    void outer( IBlock * block {% if target == 'gpu'%}, cudaStream_t stream = 0 {%endif%});
+    std::function<void (IBlock *)> getInnerSweep( {{- [interface_spec.high_level_args, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} )
+    {
+        return [ {{- [ ['this'], interface_spec.high_level_args, ["stream"] if target == 'gpu' else [] ] | identifier_list -}} ]
+               (IBlock * b)
+               { this->inner( {{- [ ['b'], interface_spec.mapping_codes, ["stream"] if target == 'gpu' else [] ] | identifier_list -}} ); };
+    }
 
-    {% endif %}
+    std::function<void (IBlock *)> getOuterSweep( {{- [interface_spec.high_level_args, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} )
+    {
+        return [ {{- [ ['this'], interface_spec.high_level_args, ["stream"] if target == 'gpu' else [] ] | identifier_list -}} ]
+               (IBlock * b)
+               { this->outer( {{- [ ['b'], interface_spec.mapping_codes, ["stream"] if target == 'gpu' else [] ] | identifier_list -}} ); };
+    }
 
     template<typename FlagField_T>
     void fillFromFlagField( const shared_ptr<StructuredBlockForest> & blocks, ConstBlockDataID flagFieldID,
@@ -208,14 +221,16 @@ public:
     }
 
 private:
-    {% if not multi_sweep %}
-    void run( IBlock * block, IndexVectors::Type type{% if target == 'gpu'%}, cudaStream_t stream = 0 {%endif%});
-    {% endif %}
+    void run_impl(
+        {{- ["IBlock * block", "IndexVectors::Type type",
+             kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []]
+            | type_identifier_list -}}
+   );
 
     BlockDataID indexVectorID;
     {{additional_data_handler.additional_member_variable|indent(4)}}
 public:
-    {{dummy_kernel_info|generate_members(('indexVector', 'indexVectorSize'))|indent(4)}}
+    {{kernel|generate_members(('indexVector', 'indexVectorSize'))|indent(4)}}
 };
 
 
diff --git a/python/pystencils_walberla/templates/Sweep.tmpl.cpp b/python/pystencils_walberla/templates/Sweep.tmpl.cpp
index 9711be8cde6d7a94475c8172cb858c03631f60d4..d70096e97faa2a9ff0da26fbb996b5f5b5764ac1 100644
--- a/python/pystencils_walberla/templates/Sweep.tmpl.cpp
+++ b/python/pystencils_walberla/templates/Sweep.tmpl.cpp
@@ -23,7 +23,8 @@
 #include "core/DataTypes.h"
 #include "core/Macros.h"
 #include "{{class_name}}.h"
-{% for header in headers %}
+
+{% for header in kernel.get_headers() %}
 #include {{header}}
 {% endfor %}
 
@@ -53,9 +54,9 @@ namespace walberla {
 namespace {{namespace}} {
 
 
-{{kernel|generate_definition(target)}}
+{{kernel|generate_definitions(target)}}
 
-void {{class_name}}::operator()( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream{% endif %} )
+void {{class_name}}::run( {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}} )
 {
     {{kernel|generate_block_data_to_field_extraction|indent(4)}}
     {{kernel|generate_refs_for_kernel_parameters(prefix='this->', ignore_fields=True)|indent(4) }}
@@ -64,10 +65,11 @@ void {{class_name}}::operator()( IBlock * block{%if target is equalto 'gpu'%} ,
 }
 
 
-void {{class_name}}::runOnCellInterval( const shared_ptr<StructuredBlockStorage> & blocks,
-                                        const CellInterval & globalCellInterval,
-                                        cell_idx_t ghostLayers,
-                                        IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream{% endif %} )
+void {{class_name}}::runOnCellInterval(
+    {{- ["const shared_ptr<StructuredBlockStorage> & blocks", "const CellInterval & globalCellInterval", "cell_idx_t ghostLayers", "IBlock * block",
+         kernel.kernel_selection_parameters, ["cudaStream_t stream"] if target == 'gpu' else []] 
+        | type_identifier_list -}}
+)
 {
     CellInterval ci = globalCellInterval;
     CellInterval blockBB = blocks->getBlockCellBB( *block);
@@ -84,7 +86,7 @@ void {{class_name}}::runOnCellInterval( const shared_ptr<StructuredBlockStorage>
 }
 
 {%if inner_outer_split%}
-void {{class_name}}::inner( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream{% endif %} )
+void {{class_name}}::inner( {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}} )
 {
     {{kernel|generate_block_data_to_field_extraction|indent(4)}}
 
@@ -96,7 +98,7 @@ void {{class_name}}::inner( IBlock * block{%if target is equalto 'gpu'%} , cudaS
 }
 
 
-void {{class_name}}::outer( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream {% endif %} )
+void {{class_name}}::outer( {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}} )
 {
     {{kernel|generate_block_data_to_field_extraction|indent(4)}}
 
diff --git a/python/pystencils_walberla/templates/Sweep.tmpl.h b/python/pystencils_walberla/templates/Sweep.tmpl.h
index 9f0711a1e807553b2ca9cea2fa5dc5790c9c91af..093e2332ac4e5925305701a4f0d54887f18dffe1 100644
--- a/python/pystencils_walberla/templates/Sweep.tmpl.h
+++ b/python/pystencils_walberla/templates/Sweep.tmpl.h
@@ -34,6 +34,10 @@
 #include "domain_decomposition/StructuredBlockStorage.h"
 #include <set>
 
+{% for header in interface_spec.headers %}
+#include {{header}}
+{% endfor %}
+
 #ifdef __GNUC__
 #define RESTRICT __restrict__
 #elif _MSC_VER
@@ -61,52 +65,100 @@ public:
 
     {{ kernel| generate_destructor(class_name) |indent(4) }}
 
-    void operator() ( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream = nullptr{% endif %} );
-    void runOnCellInterval(const shared_ptr<StructuredBlockStorage> & blocks,
-                           const CellInterval & globalCellInterval, cell_idx_t ghostLayers, IBlock * block
-                           {%if target is equalto 'gpu'%} , cudaStream_t stream = nullptr{% endif %});
+    void run( {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} );
+    
+    void runOnCellInterval(
+        {{- ["const shared_ptr<StructuredBlockStorage> & blocks", "const CellInterval & globalCellInterval", "cell_idx_t ghostLayers", "IBlock * block",
+             kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] 
+            | type_identifier_list -}}
+    );
+
+    {% if generate_functor %}
+    void operator() (
+        {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}}
+    )
+    {
+        run( {{- ["block", kernel.kernel_selection_parameters, ["stream"] if target == 'gpu' else []] | identifier_list -}} );
+    }
+    {% endif %}
 
+    static std::function<void (IBlock *)> getSweep(
+        {{- ["const shared_ptr<" + class_name + "> & kernel", kernel.kernel_selection_parameters] | type_identifier_list -}}                                                  
+    )
+    {
+        return [ {{- [ ['kernel'], kernel.kernel_selection_parameters ] | identifier_list -}} ] 
+               (IBlock * b) 
+               { kernel->run( {{- [ ['b'], kernel.kernel_selection_parameters] | identifier_list -}} ); };
+    }
 
+    static std::function<void (IBlock* {%- if target is equalto 'gpu' %}, cudaStream_t {% endif -%} )> getSweepOnCellInterval(
+        {{- ["const shared_ptr<" + class_name + "> & kernel", "const shared_ptr<StructuredBlockStorage> & blocks", "const CellInterval & globalCellInterval",
+             kernel.kernel_selection_parameters, 'cell_idx_t ghostLayers=1']
+            | type_identifier_list -}}
+    )
+    {
+        return [ {{- ["kernel", "blocks", "globalCellInterval", "ghostLayers", kernel.kernel_selection_parameters] | identifier_list -}} ]
+               (IBlock * b{%- if target is equalto 'gpu'%}, cudaStream_t stream = nullptr{% endif -%}) 
+               { kernel->runOnCellInterval(
+                    {{- ["blocks", "globalCellInterval", "ghostLayers", "b", kernel.kernel_selection_parameters, ["stream"] if target == 'gpu' else []]
+                        | identifier_list 
+                    -}}
+                ); };
+    }
 
-    static std::function<void (IBlock*)> getSweep(const shared_ptr<{{class_name}}> & kernel) {
-        return [kernel](IBlock * b) { (*kernel)(b); };
+    std::function<void (IBlock *)> getSweep( {{- [interface_spec.high_level_args, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} )
+    {
+        return [ {{- ["this", interface_spec.high_level_args, ["stream"] if target == 'gpu' else []] | identifier_list -}} ] 
+               (IBlock * b) 
+               { this->run( {{- [ ['b'], interface_spec.mapping_codes, ["stream"] if target == 'gpu' else [] ] | identifier_list -}} ); };
     }
 
-    static std::function<void (IBlock*{%if target is equalto 'gpu'%} , cudaStream_t {% endif %})>
-            getSweepOnCellInterval(const shared_ptr<{{class_name}}> & kernel,
-                                   const shared_ptr<StructuredBlockStorage> & blocks,
-                                   const CellInterval & globalCellInterval,
-                                   cell_idx_t ghostLayers=1 )
+    std::function<void (IBlock *)> getSweepOnCellInterval(
+        {{- ["const shared_ptr<StructuredBlockStorage> & blocks", "const CellInterval & globalCellInterval",
+             interface_spec.high_level_args, 'cell_idx_t ghostLayers=1', ["cudaStream_t stream = nullptr"] if target == 'gpu' else []]
+            | type_identifier_list -}}
+    )
     {
-        return [kernel, blocks, globalCellInterval, ghostLayers] (IBlock * b{%if target is equalto 'gpu'%} , cudaStream_t stream = nullptr{% endif %}) {
-            kernel->runOnCellInterval(blocks, globalCellInterval, ghostLayers, b{%if target is equalto 'gpu'%}, stream {% endif %});
-        };
+        return [ {{- ["this", "blocks", "globalCellInterval", "ghostLayers", interface_spec.high_level_args, ["stream"] if target == 'gpu' else []] | identifier_list -}} ]
+               (IBlock * b) 
+               { this->runOnCellInterval( {{- ["blocks", "globalCellInterval", "ghostLayers", "b", interface_spec.mapping_codes, ["stream"] if target == 'gpu' else []] | identifier_list -}} ); };
     }
 
 {% if inner_outer_split %}
+    void inner( {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} );
 
-    void inner( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream = nullptr{% endif %} );
-    void outer( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream = nullptr{% endif %} );
+    void outer( {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} );
 
-    void setOuterPriority(int priority ) {
-       {%if target is equalto 'gpu'%}
-       parallelStreams_.setStreamPriority(priority);
-       {%endif%}
+    std::function<void (IBlock *)> getInnerSweep( {{- [interface_spec.high_level_args, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} )
+    {
+        return [ {{- [ ['this'], interface_spec.high_level_args, ["stream"] if target == 'gpu' else [] ] | identifier_list -}} ] 
+               (IBlock * b) 
+               { this->inner( {{- [ ['b'], interface_spec.mapping_codes, ["stream"] if target == 'gpu' else [] ] | identifier_list -}} ); };
+    }
+
+    std::function<void (IBlock *)> getOuterSweep( {{- [interface_spec.high_level_args, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} )
+    {
+        return [ {{- [ ['this'], interface_spec.high_level_args, ["stream"] if target == 'gpu' else [] ] | identifier_list -}} ] 
+               (IBlock * b) 
+               { this->outer( {{- [ ['b'], interface_spec.mapping_codes, ["stream"] if target == 'gpu' else [] ] | identifier_list -}} ); };
+    }
+
+    void setOuterPriority(int priority) 
+    {
+        {%if target is equalto 'gpu'%}
+        parallelStreams_.setStreamPriority(priority);
+        {%endif%}
     }
     {{kernel|generate_members|indent(4)}}
 
 private:
-    {%if target is equalto 'gpu'%}
-    cuda::ParallelStreams parallelStreams_;
-    {% endif %}
+    {%if target is equalto 'gpu' -%} cuda::ParallelStreams parallelStreams_; {%- endif %}
 
     Cell outerWidth_;
     std::vector<CellInterval> layers_;
-
-{%- else -%}
+{% else %}
     {{ kernel|generate_members|indent(4) }}
-{% endif -%}
-
+{% endif %}
 };
 
 
diff --git a/python/pystencils_walberla/utility.py b/python/pystencils_walberla/utility.py
new file mode 100644
index 0000000000000000000000000000000000000000..0bdf6e9923fe4fb106ab2975109a7c31a7f0d633
--- /dev/null
+++ b/python/pystencils_walberla/utility.py
@@ -0,0 +1,91 @@
+from os import path
+from pystencils.data_types import get_base_type
+from pystencils_walberla.cmake_integration import CodeGenerationContext
+
+HEADER_EXTENSIONS = {'.h', '.hpp'}
+
+
+def generate_info_header(ctx: CodeGenerationContext,
+                         filename: str,
+                         stencil_typedefs: dict = None,
+                         field_typedefs: dict = None,
+                         additional_headers: set = None,
+                         headers_to_ignore: set = None,
+                         additional_typedefs: dict = None):
+    """Generates an info header, consolidating required information about the generated code.
+    The info header #includes all generated header files, and is thus the only header the
+    application needs to #include. It can also contain aliases for waLBerla stencil types and
+    instances of the GhostLayerField template.
+
+    Args:
+        ctx: Code Generation Context
+        filename: Name of the generated info header file
+        stencil_typedefs: dict mapping type names to stencil names or tuples
+        field_typedefs: dict mapping type names to pystencils `Field` instances
+        additional_headers: additional header files to be included
+        headers_to_ignore: headers which should not be included
+        additional_typedefs: dict mapping aliases to types.
+    """
+    stencil_typedefs = stencil_typedefs if stencil_typedefs is not None else dict()
+    field_typedefs = field_typedefs if field_typedefs is not None else dict()
+    additional_typedefs = additional_typedefs if additional_typedefs is not None else dict()
+
+    additional_headers = additional_headers if additional_headers is not None else set()
+    headers_to_ignore = headers_to_ignore if headers_to_ignore is not None else set()
+
+    headers_in_ctx = set(_filter_headers(ctx.files_written))
+
+    stencil_headers, stencil_typedefs = _stencil_inclusion_code(stencil_typedefs)
+    field_headers, field_typedefs = _field_inclusion_code(field_typedefs)
+
+    headers_to_include = stencil_headers | field_headers | headers_in_ctx | additional_headers
+    headers_to_include = sorted(headers_to_include - headers_to_ignore)
+    headers_to_include = [f'#include "{header}"' for header in headers_to_include]
+
+    typedefs = {**stencil_typedefs, **field_typedefs, **additional_typedefs}
+    typedefs = [f"using {alias} = {typename};" for alias, typename in typedefs.items()]
+
+    lines = '\n'.join(headers_to_include + [''] + typedefs) + '\n'
+
+    if path.splitext(filename)[1] not in HEADER_EXTENSIONS:
+        filename += '.h'
+
+    ctx.write_file(filename, lines)
+
+
+#   ------------------------------------- INTERNAL -------------------------------------------------------------
+
+
+def _filter_headers(filepaths):
+    for p in filepaths:
+        if path.splitext(p)[1] in HEADER_EXTENSIONS:
+            yield path.split(p)[1]
+
+
+def _stencil_inclusion_code(stencil_typedefs):
+    headers = set()
+    typedefs = dict()
+    for typename, stencil in stencil_typedefs.items():
+        if isinstance(stencil, tuple):
+            dim = len(stencil[0])
+            q = len(stencil)
+            stencil = f"D{dim}Q{q}"
+        elif not isinstance(stencil, str):
+            raise ValueError(f'Invalid stencil: Do not know what to make of {stencil}')
+
+        headers.add(f"stencil/{stencil}.h")
+        typedefs[typename] = f"walberla::stencil::{stencil}"
+
+    return headers, typedefs
+
+
+def _field_inclusion_code(field_typedefs):
+    headers = set()
+    typedefs = dict()
+    for typename, field in field_typedefs.items():
+        f_size = field.values_per_cell()
+        dtype = get_base_type(field.dtype)
+        headers.add("field/GhostLayerField.h")
+        typedefs[typename] = f"walberla::field::GhostLayerField<{dtype}, {f_size}>"
+
+    return headers, typedefs
diff --git a/src/lbm/inplace_streaming/TimestepTracker.h b/src/lbm/inplace_streaming/TimestepTracker.h
new file mode 100644
index 0000000000000000000000000000000000000000..c0ca22635fea5d9a454f99e2d4fe08a519578eef
--- /dev/null
+++ b/src/lbm/inplace_streaming/TimestepTracker.h
@@ -0,0 +1,50 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \\file TimestepTracker.h
+//! \\ingroup lbm
+//! \\author Frederik Hennig <frederik.hennig@fau.de>
+//======================================================================================================================
+
+#pragma once
+
+#include "core/DataTypes.h"
+
+namespace walberla
+{
+namespace lbm
+{
+class TimestepTracker
+{
+ private:
+   uint8_t counter_;
+
+ public:
+   TimestepTracker() : counter_(0) {}
+   TimestepTracker(uint8_t zeroth_timestep) : counter_(zeroth_timestep & 1) {}
+
+   void advance() { counter_ = (counter_ + 1) & 1; }
+
+   std::function< void() > getAdvancementFunction()
+   {
+      return [this]() { this->advance(); };
+   }
+
+   uint8_t getCounter() const { return counter_; }
+
+}; // class TimestepTracker
+
+} // namespace lbm
+} // namespace walberla
diff --git a/tests/lbm/CMakeLists.txt b/tests/lbm/CMakeLists.txt
index b67913654b775abb7fe5b9d246bf6be5bb52d7d3..2ce27af42009821fcf9e920d4582ab521104c0f8 100644
--- a/tests/lbm/CMakeLists.txt
+++ b/tests/lbm/CMakeLists.txt
@@ -124,4 +124,25 @@ waLBerla_generate_target_from_python(NAME LbmPackInfoGenerationTestCodegen FILE
 waLBerla_link_files_to_builddir( "diff_packinfos.sh" )
 waLBerla_execute_test( NAME LbmPackInfoGenerationDiffTest COMMAND bash diff_packinfos.sh )
 
+
+waLBerla_generate_target_from_python( NAME InplaceStreamingCodegen
+      FILE codegen/InplaceStreamingCodegen.py
+      OUT_FILES PullSweep.h PullSweep.cpp
+                PullNoSlip.h PullNoSlip.cpp
+                PullUBB.h PullUBB.cpp
+                PullOutflow.h PullOutflow.cpp
+                PullInit.h PullInit.cpp
+                InPlaceSweep.h InPlaceSweep.cpp
+                InPlaceNoSlip.h InPlaceNoSlip.cpp
+                InPlaceUBB.h InPlaceUBB.cpp
+                InPlaceOutflow.h InPlaceOutflow.cpp
+                InPlaceInit.h InPlaceInit.cpp
+                InplaceStreamingCodegen.h)
+
+waLBerla_compile_test( FILES codegen/InplaceStreamingCodegenTest.cpp
+                       DEPENDS InplaceStreamingCodegen )
+
+waLBerla_execute_test( NAME InplaceStreamingCodegenTest
+                       COMMAND $<TARGET_FILE:InplaceStreamingCodegenTest> ${CMAKE_CURRENT_SOURCE_DIR}/codegen/InplaceStreamingCodegen3D.prm  )
+
 endif()
diff --git a/tests/lbm/codegen/InplaceStreamingCodegen.py b/tests/lbm/codegen/InplaceStreamingCodegen.py
new file mode 100644
index 0000000000000000000000000000000000000000..7fcef0f4215805cfa40f68d6fe82f6d3cfbba1c6
--- /dev/null
+++ b/tests/lbm/codegen/InplaceStreamingCodegen.py
@@ -0,0 +1,92 @@
+from lbmpy_walberla import generate_alternating_lbm_sweep, generate_boundary, generate_alternating_lbm_boundary
+from lbmpy_walberla.additional_data_handler import OutflowAdditionalDataHandler
+from pystencils_walberla import CodeGeneration, generate_sweep, generate_info_header
+
+from lbmpy.creationfunctions import create_lb_collision_rule, create_lb_ast
+from lbmpy.macroscopic_value_kernels import macroscopic_values_setter
+from lbmpy.boundaries import NoSlip, UBB, ExtrapolationOutflow
+from lbmpy.advanced_streaming import Timestep
+
+from pystencils import Field
+from lbmpy.stencils import get_stencil
+
+#   Common Setup
+
+stencil = get_stencil('D3Q27')
+dim = len(stencil[0])
+q = len(stencil)
+target = 'cpu'
+inplace_pattern = 'aa'
+two_fields_pattern = 'pull'
+namespace = 'lbmpy'
+
+f_field = Field.create_generic('f', dim, index_shape=(q,), layout='fzyx')
+f_field_tmp = Field.create_generic('f_tmp', dim, index_shape=(q,), layout='fzyx')
+u_field = Field.create_generic('u', dim, index_shape=(dim,), layout='fzyx')
+
+output = {
+    'velocity': u_field
+}
+
+method_params = {
+    'stencil': stencil,
+    'method': 'srt',
+    'relaxation_rate': 1.5,
+    'output': output
+}
+
+optimization = {
+    'target': target,
+    'symbolic_field': f_field,
+    'symbolic_temporary_field': f_field_tmp
+}
+
+collision_rule = create_lb_collision_rule(**method_params)
+lb_method = collision_rule.method
+noslip = NoSlip()
+ubb = UBB((0.05,) + (0,) * (dim - 1))
+
+outflow_normal = (1,) + (0,) * (dim - 1)
+outflow_pull = ExtrapolationOutflow(outflow_normal, lb_method, streaming_pattern=two_fields_pattern)
+
+outflow_inplace = ExtrapolationOutflow(outflow_normal, lb_method, streaming_pattern=inplace_pattern)
+
+init_velocity = (0, ) * dim
+
+init_kernel_pull = macroscopic_values_setter(lb_method, 1, init_velocity, f_field, streaming_pattern=two_fields_pattern)
+init_kernel_inplace = macroscopic_values_setter(
+    lb_method, 1, init_velocity, f_field, streaming_pattern=inplace_pattern, previous_timestep=Timestep.ODD)
+
+stencil_typedefs = {'Stencil_T': stencil}
+field_typedefs = {'PdfField_T': f_field, 'VelocityField_T': u_field}
+
+with CodeGeneration() as ctx:
+    #   Pull-Pattern classes
+    ast_pull = create_lb_ast(collision_rule=collision_rule,
+                             streaming_pattern=two_fields_pattern, optimization=optimization)
+    generate_sweep(ctx, 'PullSweep', ast_pull, field_swaps=[(f_field, f_field_tmp)], namespace=namespace)
+
+    generate_boundary(ctx, 'PullNoSlip', noslip, lb_method,
+                      streaming_pattern=two_fields_pattern, target=target, namespace=namespace)
+    generate_boundary(ctx, 'PullUBB', ubb, lb_method, streaming_pattern=two_fields_pattern,
+                      target=target, namespace=namespace)
+    generate_boundary(ctx, 'PullOutflow', outflow_pull, lb_method,
+                      streaming_pattern=two_fields_pattern, target=target, namespace=namespace)
+
+    generate_sweep(ctx, 'PullInit', init_kernel_pull, target=target, namespace=namespace)
+
+    #   Inplace Pattern classes
+    generate_alternating_lbm_sweep(ctx, 'InPlaceSweep', collision_rule, inplace_pattern,
+                                   optimization=optimization, namespace=namespace)
+
+    generate_alternating_lbm_boundary(ctx, 'InPlaceNoSlip', noslip, lb_method, streaming_pattern=inplace_pattern,
+                                      after_collision=True, target=target, namespace=namespace)
+    generate_alternating_lbm_boundary(ctx, 'InPlaceUBB', ubb, lb_method, streaming_pattern=inplace_pattern,
+                                      after_collision=True, target=target, namespace=namespace)
+    generate_alternating_lbm_boundary(ctx, 'InPlaceOutflow', outflow_inplace, lb_method, streaming_pattern=inplace_pattern,
+                                      after_collision=True, target=target, namespace=namespace)
+
+    generate_sweep(ctx, 'InPlaceInit', init_kernel_inplace, target=target, namespace=namespace)
+
+    generate_info_header(ctx, "InplaceStreamingCodegen.h",
+                         stencil_typedefs=stencil_typedefs, field_typedefs=field_typedefs)
diff --git a/tests/lbm/codegen/InplaceStreamingCodegen2D.prm b/tests/lbm/codegen/InplaceStreamingCodegen2D.prm
new file mode 100644
index 0000000000000000000000000000000000000000..08b9584d69ea211bc4cbbac0c85b70980843ade9
--- /dev/null
+++ b/tests/lbm/codegen/InplaceStreamingCodegen2D.prm
@@ -0,0 +1,33 @@
+
+Parameters
+{
+   timesteps   50;
+   vtkWriteFrequency 0;
+}
+
+DomainSetup
+{
+   blocks        <   1,  1,  1 >;
+   cellsPerBlock <  30, 30, 1 >;
+   periodic      <  0,    0, 0 >;
+}
+
+Boundaries
+{
+	Border { direction W;
+            walldistance -1;
+            flag UBB;
+            ghostLayersToInitialize 0; }
+	Border { direction E;
+            walldistance -1;
+            flag Outflow;
+            ghostLayersToInitialize 0; }
+   Border { direction N;
+            walldistance -1;
+            flag NoSlip;
+            ghostLayersToInitialize 1; }
+   Border { direction S;
+            walldistance -1;
+            flag NoSlip;
+            ghostLayersToInitialize 1; }
+}
diff --git a/tests/lbm/codegen/InplaceStreamingCodegen3D.prm b/tests/lbm/codegen/InplaceStreamingCodegen3D.prm
new file mode 100644
index 0000000000000000000000000000000000000000..0fa8f50f1ee88c521401e345c8f88ca7dd645a5c
--- /dev/null
+++ b/tests/lbm/codegen/InplaceStreamingCodegen3D.prm
@@ -0,0 +1,45 @@
+
+Parameters
+{
+   timesteps   70;
+   vtkWriteFrequency 0;
+}
+
+DomainSetup
+{
+   blocks        <   1,  1,  1 >;
+   cellsPerBlock <  30, 30, 30 >;
+   periodic      <  0,    0, 0 >;
+}
+
+Boundaries
+{
+	Border { direction W;
+            walldistance -1;
+            flag UBB; }
+
+   Border { direction E;
+            walldistance -1;
+            flag Outflow;
+            ghostLayersToInitialize 0; }
+
+   Border { direction N;
+         walldistance -1;
+         flag NoSlip;
+         ghostLayersToInitialize 1; }
+
+   Border { direction S;
+         walldistance -1;
+         flag NoSlip;
+         ghostLayersToInitialize 1; }
+
+   Border { direction T;
+         walldistance -1;
+         flag NoSlip;
+         ghostLayersToInitialize 1; }
+
+   Border { direction B;
+         walldistance -1;
+         flag NoSlip;
+         ghostLayersToInitialize 1; }
+}
diff --git a/tests/lbm/codegen/InplaceStreamingCodegenTest.cpp b/tests/lbm/codegen/InplaceStreamingCodegenTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..718042efa2b1edb911052328a28cdb210ab65de9
--- /dev/null
+++ b/tests/lbm/codegen/InplaceStreamingCodegenTest.cpp
@@ -0,0 +1,163 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file InplaceStreamingCodegenTest.cpp
+//! \author Frederik Hennig <frederik.hennig@fau.de>
+//
+//======================================================================================================================
+
+#include "InplaceStreamingCodegen.h"
+
+#include "blockforest/all.h"
+
+#include "core/Environment.h"
+
+#include "field/all.h"
+
+#include "geometry/InitBoundaryHandling.h"
+
+#include "lbm/inplace_streaming/TimestepTracker.h"
+
+#include "timeloop/SweepTimeloop.h"
+
+#include <string>
+
+using namespace walberla::lbmpy;
+
+namespace walberla
+{
+typedef uint8_t boundaryFlag_t;
+typedef FlagField< boundaryFlag_t > FlagField_T;
+
+int main(int argc, char** argv)
+{
+   Environment walberlaEnv(argc, argv);
+   auto blocks = blockforest::createUniformBlockGridFromConfig(walberlaEnv.config());
+
+   auto params              = walberlaEnv.config()->getBlock("Parameters");
+   uint_t timesteps         = params.getParameter< uint_t >("timesteps");
+   uint_t vtkWriteFrequency = params.getParameter< uint_t >("vtkWriteFrequency");
+
+   BlockDataID pullPdfFieldID    = field::addToStorage< PdfField_T >(blocks, "f_pull", real_c(0.0), field::fzyx);
+   BlockDataID inplacePdfFieldID = field::addToStorage< PdfField_T >(blocks, "f_inplace", real_c(0.0), field::fzyx);
+
+   BlockDataID pullVelocityFieldID = field::addToStorage< VelocityField_T >(blocks, "u_pull", real_c(0.0), field::fzyx);
+   BlockDataID inplaceVelocityFieldID =
+      field::addToStorage< VelocityField_T >(blocks, "u_inplace", real_c(0.0), field::fzyx);
+
+   BlockDataID boundaryFlagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "boundaryFlagField", 1);
+
+   // Set up boundary handling
+
+   const FlagUID fluidFlagUID("Fluid");
+   const FlagUID noslipFlagUID("NoSlip");
+   const FlagUID ubbFlagUID("UBB");
+   const FlagUID outflowFlagUID("Outflow");
+
+   auto boundariesConfig = walberlaEnv.config()->getOneBlock("Boundaries");
+
+   geometry::initBoundaryHandling< FlagField_T >(*blocks, boundaryFlagFieldID, boundariesConfig);
+   geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, boundaryFlagFieldID, fluidFlagUID);
+
+   PullNoSlip pullNoSlip(blocks, pullPdfFieldID);
+   pullNoSlip.fillFromFlagField< FlagField_T >(blocks, boundaryFlagFieldID, noslipFlagUID, fluidFlagUID);
+
+   PullUBB pullUBB(blocks, pullPdfFieldID);
+   pullUBB.fillFromFlagField< FlagField_T >(blocks, boundaryFlagFieldID, ubbFlagUID, fluidFlagUID);
+
+   PullOutflow pullOutflow(blocks, pullPdfFieldID);
+   pullOutflow.fillFromFlagField< FlagField_T >(blocks, boundaryFlagFieldID, outflowFlagUID, fluidFlagUID);
+
+   InPlaceNoSlip inplaceNoSlip(blocks, inplacePdfFieldID);
+   inplaceNoSlip.fillFromFlagField< FlagField_T >(blocks, boundaryFlagFieldID, noslipFlagUID, fluidFlagUID);
+
+   InPlaceUBB inplaceUBB(blocks, inplacePdfFieldID);
+   inplaceUBB.fillFromFlagField< FlagField_T >(blocks, boundaryFlagFieldID, ubbFlagUID, fluidFlagUID);
+
+   InPlaceOutflow inplaceOutflow(blocks, inplacePdfFieldID);
+   inplaceOutflow.fillFromFlagField< FlagField_T >(blocks, boundaryFlagFieldID, outflowFlagUID, fluidFlagUID);
+
+   // Init fields
+
+   PullInit pullInit(pullPdfFieldID);
+   InPlaceInit inplaceInit(inplacePdfFieldID);
+
+   for (auto b = blocks->begin(); b != blocks->end(); b++)
+   {
+      pullInit(&(*b));
+      inplaceInit(&(*b));
+   }
+
+   // Set up sweeps
+
+   PullSweep pullLbmSweep(pullPdfFieldID, pullVelocityFieldID);
+   InPlaceSweep inplaceLbmSweep(inplacePdfFieldID, inplaceVelocityFieldID);
+
+   auto tracker = make_shared< lbm::TimestepTracker >(1);
+
+   // Pull Timeloop
+
+   SweepTimeloop pullTimeloop(blocks, timesteps);
+   pullTimeloop.add() << Sweep(pullUBB);
+   pullTimeloop.add() << Sweep(pullNoSlip);
+   pullTimeloop.add() << Sweep(pullOutflow);
+   pullTimeloop.add() << Sweep(pullLbmSweep);
+
+   if (vtkWriteFrequency > 0)
+   {
+      pullTimeloop.addFuncAfterTimeStep(field::createVTKOutput< VelocityField_T, float >(
+         pullVelocityFieldID, *blocks, "pull velocity", vtkWriteFrequency, uint_c(0), false, "pull_vtk"));
+   }
+
+   pullTimeloop.run();
+
+   // In-Place Timeloop
+
+   SweepTimeloop inplaceTimeloop(blocks, timesteps);
+   inplaceTimeloop.add() << Sweep(inplaceUBB.getSweep(tracker));
+   inplaceTimeloop.add() << Sweep(inplaceNoSlip.getSweep(tracker));
+   inplaceTimeloop.add() << Sweep(inplaceOutflow.getSweep(tracker)) << AfterFunction(tracker->getAdvancementFunction());
+   inplaceTimeloop.add() << Sweep(inplaceLbmSweep.getSweep(tracker));
+
+   if (vtkWriteFrequency > 0)
+   {
+      inplaceTimeloop.addFuncAfterTimeStep(field::createVTKOutput< VelocityField_T, float >(
+         inplaceVelocityFieldID, *blocks, "inplace velocity", vtkWriteFrequency, uint_c(0), false, "inplace_vtk"));
+   }
+
+   inplaceTimeloop.run();
+
+   // Validate results
+
+   for (auto b = blocks->begin(); b != blocks->end(); b++)
+   {
+      auto pullVelocityField    = b->getData< VelocityField_T >(pullVelocityFieldID);
+      auto inplaceVelocityField = b->getData< VelocityField_T >(inplaceVelocityFieldID);
+
+      CellInterval fieldSize = pullVelocityField->xyzSize();
+      for (Cell c : fieldSize)
+      {
+         for (uint_t i = 0; i < pullVelocityField->fSize(); i++)
+         {
+            WALBERLA_CHECK_FLOAT_EQUAL(pullVelocityField->get(c, i), inplaceVelocityField->get(c, i));
+         }
+      }
+   }
+
+   return 0;
+}
+} // namespace walberla
+
+int main(int argc, char** argv) { return walberla::main(argc, argv); }