diff --git a/apps/benchmarks/CMakeLists.txt b/apps/benchmarks/CMakeLists.txt
index 3f6b8a92eb7650d9d885e92ad1162cd6dbda5757..f418d434ca38902dc79be8f9b9a4207f9fb90de6 100644
--- a/apps/benchmarks/CMakeLists.txt
+++ b/apps/benchmarks/CMakeLists.txt
@@ -22,7 +22,6 @@ if ( WALBERLA_BUILD_WITH_PYTHON )
    add_subdirectory( FieldCommunication )
 
    if ( WALBERLA_BUILD_WITH_CODEGEN )
-      add_subdirectory( FlowAroundSphereCodeGen )
       add_subdirectory( UniformGridCPU )
       add_subdirectory( PhaseFieldAllenCahn )
       add_subdirectory( NonUniformGridCPU )
diff --git a/apps/benchmarks/FlowAroundSphereCodeGen/CMakeLists.txt b/apps/benchmarks/FlowAroundSphereCodeGen/CMakeLists.txt
deleted file mode 100644
index 17cfd93fd6f8c1efce2a82fcd6ad24dfd14c76bf..0000000000000000000000000000000000000000
--- a/apps/benchmarks/FlowAroundSphereCodeGen/CMakeLists.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-waLBerla_link_files_to_builddir( "*.py" )
-
-waLBerla_generate_target_from_python(NAME FlowAroundSphereGenerated
-        FILE FlowAroundSphereCodeGen.py
-        OUT_FILES FlowAroundSphereCodeGen_LbSweep.${CODEGEN_FILE_SUFFIX} FlowAroundSphereCodeGen_LbSweep.h
-        FlowAroundSphereCodeGen_MacroSetter.${CODEGEN_FILE_SUFFIX} FlowAroundSphereCodeGen_MacroSetter.h
-        FlowAroundSphereCodeGen_UBB.${CODEGEN_FILE_SUFFIX} FlowAroundSphereCodeGen_UBB.h
-        FlowAroundSphereCodeGen_NoSlip.${CODEGEN_FILE_SUFFIX} FlowAroundSphereCodeGen_NoSlip.h
-        FlowAroundSphereCodeGen_Outflow.${CODEGEN_FILE_SUFFIX} FlowAroundSphereCodeGen_Outflow.h
-        FlowAroundSphereCodeGen_PackInfoEven.${CODEGEN_FILE_SUFFIX} FlowAroundSphereCodeGen_PackInfoEven.h
-        FlowAroundSphereCodeGen_PackInfoOdd.${CODEGEN_FILE_SUFFIX} FlowAroundSphereCodeGen_PackInfoOdd.h
-        FlowAroundSphereCodeGen_InfoHeader.h)
-
-if (WALBERLA_BUILD_WITH_GPU_SUPPORT )
-    waLBerla_add_executable( NAME FlowAroundSphereCodeGen FILE FlowAroundSphereCodeGen.cpp
-            DEPENDS blockforest boundary core gpu domain_decomposition field geometry python_coupling timeloop vtk FlowAroundSphereGenerated)
-else ()
-    waLBerla_add_executable( NAME FlowAroundSphereCodeGen FILE FlowAroundSphereCodeGen.cpp
-            DEPENDS blockforest boundary core domain_decomposition field geometry python_coupling timeloop vtk FlowAroundSphereGenerated)
-endif (WALBERLA_BUILD_WITH_GPU_SUPPORT )
\ No newline at end of file
diff --git a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp
deleted file mode 100644
index 494424a562e25503c6b189fa5cf2d957fafca313..0000000000000000000000000000000000000000
--- a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp
+++ /dev/null
@@ -1,315 +0,0 @@
-//======================================================================================================================
-//
-//  This file is part of waLBerla. waLBerla is free software: you can
-//  redistribute it and/or modify it under the terms of the GNU General Public
-//  License as published by the Free Software Foundation, either version 3 of
-//  the License, or (at your option) any later version.
-//
-//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
-//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-//  for more details.
-//
-//  You should have received a copy of the GNU General Public License along
-//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
-//
-//! \file FlowAroundSphereCodeGen.cpp
-//! \author Frederik Hennig <frederik.hennig@fau.de>
-//! \author Markus Holzer <markus.holzer@fau.de>
-//
-//======================================================================================================================
-#include "blockforest/all.h"
-
-#include "core/all.h"
-
-#include "domain_decomposition/all.h"
-
-#include "field/all.h"
-
-#include "geometry/all.h"
-
-#include "lbm/inplace_streaming/TimestepTracker.h"
-#include "lbm/vtk/QCriterion.h"
-
-#include "python_coupling/CreateConfig.h"
-#include "python_coupling/PythonCallback.h"
-
-#include "timeloop/all.h"
-
-#if defined(WALBERLA_BUILD_WITH_CUDA)
-#   include "gpu/AddGPUFieldToStorage.h"
-#   include "gpu/DeviceSelectMPI.h"
-#   include "gpu/HostFieldAllocator.h"
-#   include "gpu/NVTX.h"
-#   include "gpu/ParallelStreams.h"
-#   include "gpu/communication/GPUPackInfo.h"
-#   include "gpu/communication/UniformGPUScheme.h"
-#endif
-
-// CodeGen includes
-#include "FlowAroundSphereCodeGen_InfoHeader.h"
-
-namespace walberla
-{
-typedef lbm::FlowAroundSphereCodeGen_PackInfoEven PackInfoEven_T;
-typedef lbm::FlowAroundSphereCodeGen_PackInfoOdd PackInfoOdd_T;
-
-typedef walberla::uint8_t flag_t;
-typedef FlagField< flag_t > FlagField_T;
-
-#if defined(WALBERLA_BUILD_WITH_CUDA)
-typedef gpu::GPUField< real_t > GPUField;
-#endif
-
-using namespace std::placeholders;
-
-auto pdfFieldAdder = [](IBlock* const block, StructuredBlockStorage* const storage) {
-   return new PdfField_T(storage->getNumberOfXCells(*block), storage->getNumberOfYCells(*block),
-                         storage->getNumberOfZCells(*block), uint_t(1), field::fzyx,
-                         make_shared< field::AllocateAligned< real_t, 64 > >());
-};
-
-auto VelocityCallback = [](const Cell& pos, const shared_ptr< StructuredBlockForest >& SbF, IBlock& block,
-                           real_t inflow_velocity, const bool constant_inflow = true) {
-   if (constant_inflow)
-   {
-      Vector3< real_t > result(inflow_velocity, real_c(0.0), real_c(0.0));
-      return result;
-   }
-   else
-   {
-      Cell globalCell;
-      CellInterval domain = SbF->getDomainCellBB();
-      auto h_y          = real_c(domain.ySize());
-      auto h_z          = real_c(domain.zSize());
-      SbF->transformBlockLocalToGlobalCell(globalCell, block, pos);
-
-      auto y1 = real_c(globalCell[1] - (h_y / 2.0 - 0.5));
-      auto z1 = real_c(globalCell[2] - (h_z / 2.0 - 0.5));
-
-      real_t u = (inflow_velocity * real_c(16.0)) / (h_y * h_y * h_z * h_z) * (h_y / real_c(2.0) - y1) *
-                 (h_y / real_c(2.0) + y1) * (h_z / real_c(2.0) - z1) * (h_z / real_c(2.0) + z1);
-
-      Vector3< real_t > result(u, 0.0, 0.0);
-      return result;
-   }
-};
-
-class AlternatingBeforeFunction
-{
- public:
-   typedef std::function< void() > BeforeFunction;
-
-   AlternatingBeforeFunction(BeforeFunction evenFunc, BeforeFunction oddFunc,
-                             std::shared_ptr< lbm::TimestepTracker >& tracker)
-      : tracker_(tracker), funcs_{ evenFunc, oddFunc } {};
-
-   void operator()() { funcs_[tracker_->getCounter()](); }
-
- private:
-   std::shared_ptr< lbm::TimestepTracker > tracker_;
-   std::vector< BeforeFunction > funcs_;
-};
-
-class Filter
-{
- public:
-   explicit Filter(Vector3< uint_t > numberOfCells) : numberOfCells_(numberOfCells) {}
-
-   void operator()(const IBlock& /*block*/) {}
-
-   bool operator()(const cell_idx_t x, const cell_idx_t y, const cell_idx_t z) const
-   {
-      return x >= -1 && x <= cell_idx_t(numberOfCells_[0]) && y >= -1 && y <= cell_idx_t(numberOfCells_[1]) &&
-             z >= -1 && z <= cell_idx_t(numberOfCells_[2]);
-   }
-
- private:
-   Vector3< uint_t > numberOfCells_;
-};
-
-using FluidFilter_T = Filter;
-
-int main(int argc, char** argv)
-{
-   walberla::Environment walberlaEnv(argc, argv);
-#if defined(WALBERLA_BUILD_WITH_CUDA)
-   gpu::selectDeviceBasedOnMpiRank();
-#endif
-
-   for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg)
-   {
-      WALBERLA_MPI_WORLD_BARRIER()
-
-      auto config = *cfg;
-      logging::configureLogging(config);
-      auto blocks = blockforest::createUniformBlockGridFromConfig(config);
-
-      // read parameters
-      Vector3< uint_t > cellsPerBlock =
-         config->getBlock("DomainSetup").getParameter< Vector3< uint_t > >("cellsPerBlock");
-      auto parameters = config->getOneBlock("Parameters");
-
-      const uint_t timesteps       = parameters.getParameter< uint_t >("timesteps", uint_c(10));
-      const real_t omega           = parameters.getParameter< real_t >("omega", real_c(1.9));
-      const real_t u_max           = parameters.getParameter< real_t >("u_max", real_c(0.05));
-      const real_t reynolds_number = parameters.getParameter< real_t >("reynolds_number", real_c(1000.0));
-      const uint_t diameter_sphere = parameters.getParameter< uint_t >("diameter_sphere", uint_t(5));
-      const bool constant_inflow = parameters.getParameter< bool >("constant_inflow", true);
-
-      const real_t remainingTimeLoggerFrequency =
-         parameters.getParameter< real_t >("remainingTimeLoggerFrequency", real_c(3.0)); // in seconds
-
-      // create fields
-      BlockDataID pdfFieldID     = blocks->addStructuredBlockData< PdfField_T >(pdfFieldAdder, "PDFs");
-      BlockDataID velFieldID     = field::addToStorage< VelocityField_T >(blocks, "velocity", real_c(0.0), field::fzyx);
-      BlockDataID densityFieldID = field::addToStorage< ScalarField_T >(blocks, "density", real_c(0.0), field::fzyx);
-
-#if defined(WALBERLA_BUILD_WITH_CUDA)
-      BlockDataID pdfFieldIDGPU = gpu::addGPUFieldToStorage< PdfField_T >(blocks, pdfFieldID, "PDFs on GPU", true);
-      BlockDataID velFieldIDGPU =
-         gpu::addGPUFieldToStorage< VelocityField_T >(blocks, velFieldID, "velocity on GPU", true);
-      BlockDataID densityFieldIDGPU =
-         gpu::addGPUFieldToStorage< ScalarField_T >(blocks, densityFieldID, "density on GPU", true);
-#endif
-
-      BlockDataID flagFieldId = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field");
-
-      // initialise all PDFs
-#if defined(WALBERLA_BUILD_WITH_CUDA)
-      pystencils::FlowAroundSphereCodeGen_MacroSetter setterSweep(pdfFieldIDGPU, velFieldIDGPU);
-      for (auto& block : *blocks)
-         setterSweep(&block);
-      gpu::fieldCpy< PdfField_T, GPUField >(blocks, pdfFieldID, pdfFieldIDGPU);
-#else
-      pystencils::FlowAroundSphereCodeGen_MacroSetter setterSweep(pdfFieldID, velFieldID);
-      for (auto& block : *blocks)
-         setterSweep(&block);
-#endif
-      // Create communication
-
-#if defined(WALBERLA_BUILD_WITH_CUDA)
-      // This way of using alternating pack infos is temporary and will soon be replaced
-      // by something more straight-forward
-
-      gpu::communication::UniformGPUScheme< Stencil_T > comEven(blocks, false);
-      comEven.addPackInfo(make_shared< PackInfoEven_T >(pdfFieldIDGPU));
-      auto evenComm = std::function< void() >([&]() { comEven.communicate(); });
-
-      gpu::communication::UniformGPUScheme< Stencil_T > comODD(blocks, false);
-      comODD.addPackInfo(make_shared< PackInfoOdd_T >(pdfFieldIDGPU));
-      auto oddComm = std::function< void() >([&]() { comODD.communicate(); });
-#else
-      blockforest::communication::UniformBufferedScheme< Stencil_T > evenComm(blocks);
-      evenComm.addPackInfo(make_shared< PackInfoEven_T >(pdfFieldID));
-
-      blockforest::communication::UniformBufferedScheme< Stencil_T > oddComm(blocks);
-      oddComm.addPackInfo(make_shared< PackInfoOdd_T >(pdfFieldID));
-#endif
-
-      // create and initialize boundary handling
-      const FlagUID fluidFlagUID("Fluid");
-
-      auto boundariesConfig = config->getOneBlock("Boundaries");
-
-      std::function< Vector3< real_t >(const Cell&, const shared_ptr< StructuredBlockForest >&, IBlock&) >
-         velocity_initialisation = std::bind(VelocityCallback, _1, _2, _3, u_max, constant_inflow);
-
-#if defined(WALBERLA_BUILD_WITH_CUDA)
-      lbm::FlowAroundSphereCodeGen_UBB ubb(blocks, pdfFieldIDGPU, velocity_initialisation);
-      lbm::FlowAroundSphereCodeGen_NoSlip noSlip(blocks, pdfFieldIDGPU);
-      lbm::FlowAroundSphereCodeGen_Outflow outflow(blocks, pdfFieldIDGPU, pdfFieldID);
-
-      lbm::FlowAroundSphereCodeGen_LbSweep lbSweep(densityFieldIDGPU, pdfFieldIDGPU, velFieldIDGPU, omega);
-#else
-      lbm::FlowAroundSphereCodeGen_UBB ubb(blocks, pdfFieldID, velocity_initialisation);
-      lbm::FlowAroundSphereCodeGen_NoSlip noSlip(blocks, pdfFieldID);
-      lbm::FlowAroundSphereCodeGen_Outflow outflow(blocks, pdfFieldID);
-
-      lbm::FlowAroundSphereCodeGen_LbSweep lbSweep(densityFieldID, pdfFieldID, velFieldID, omega);
-#endif
-
-      geometry::initBoundaryHandling< FlagField_T >(*blocks, flagFieldId, boundariesConfig);
-      geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldId, fluidFlagUID);
-
-      ubb.fillFromFlagField< FlagField_T >(blocks, flagFieldId, FlagUID("UBB"), fluidFlagUID);
-      noSlip.fillFromFlagField< FlagField_T >(blocks, flagFieldId, FlagUID("NoSlip"), fluidFlagUID);
-      outflow.fillFromFlagField< FlagField_T >(blocks, flagFieldId, FlagUID("Outflow"), fluidFlagUID);
-
-      // create time loop
-      SweepTimeloop timeloop(blocks->getBlockStorage(), timesteps);
-
-      // Timestep Tracking and Sweeps
-      auto tracker = make_shared< lbm::TimestepTracker >(0);
-
-      AlternatingBeforeFunction communication(evenComm, oddComm, tracker);
-
-      // add LBM sweep and communication to time loop
-      timeloop.add() << BeforeFunction(communication, "communication") << Sweep(ubb.getSweep(tracker), "ubb boundary");
-      timeloop.add() << Sweep(outflow.getSweep(tracker), "outflow boundary");
-      timeloop.add() << Sweep(noSlip.getSweep(tracker), "noSlip boundary");
-      timeloop.add() << BeforeFunction(tracker->getAdvancementFunction(), "Timestep Advancement")
-                     << Sweep(lbSweep.getSweep(tracker), "LB update rule");
-
-      // LBM stability check
-      timeloop.addFuncAfterTimeStep(makeSharedFunctor(field::makeStabilityChecker< PdfField_T, FlagField_T >(
-                                       config, blocks, pdfFieldID, flagFieldId, fluidFlagUID)),
-                                    "LBM stability check");
-
-      // log remaining time
-      timeloop.addFuncAfterTimeStep(
-         timing::RemainingTimeLogger(timeloop.getNrOfTimeSteps(), remainingTimeLoggerFrequency),
-         "remaining time logger");
-
-      // add VTK output to time loop
-      uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0);
-      if (vtkWriteFrequency > 0)
-      {
-         auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
-                                                         "simulation_step", false, true, true, false, 0);
-
-#if defined(WALBERLA_BUILD_WITH_CUDA)
-         vtkOutput->addBeforeFunction([&]() {
-            gpu::fieldCpy< VelocityField_T, GPUField >(blocks, velFieldID, velFieldIDGPU);
-            gpu::fieldCpy< ScalarField_T, GPUField >(blocks, densityFieldID, densityFieldIDGPU);
-         });
-#endif
-         auto velWriter     = make_shared< field::VTKWriter< VelocityField_T > >(velFieldID, "velocity");
-         auto densityWriter = make_shared< field::VTKWriter< ScalarField_T > >(densityFieldID, "density");
-         FluidFilter_T filter(cellsPerBlock);
-
-         auto QCriterionWriter = make_shared< lbm::QCriterionVTKWriter< VelocityField_T, FluidFilter_T > >(
-            blocks, filter, velFieldID, "Q-Criterion");
-
-         vtkOutput->addCellDataWriter(velWriter);
-         vtkOutput->addCellDataWriter(densityWriter);
-         vtkOutput->addCellDataWriter(QCriterionWriter);
-
-         timeloop.addFuncAfterTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
-      }
-
-      WcTimer simTimer;
-
-      WALBERLA_LOG_INFO_ON_ROOT("Simulating flow around sphere:"
-                                "\n timesteps:               "
-                                << timesteps << "\n reynolds number:         " << reynolds_number
-                                << "\n relaxation rate:         " << omega << "\n maximum inflow velocity: " << u_max
-                                << "\n diameter_sphere:         " << diameter_sphere)
-
-      simTimer.start();
-      timeloop.run();
-      simTimer.end();
-      WALBERLA_LOG_INFO_ON_ROOT("Simulation finished")
-      auto time            = real_c(simTimer.last());
-      auto nrOfCells       = real_c(cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2]);
-      auto mlupsPerProcess = nrOfCells * real_c(timesteps) / time * 1e-6;
-      WALBERLA_LOG_RESULT_ON_ROOT("MLUPS per process " << mlupsPerProcess)
-      WALBERLA_LOG_RESULT_ON_ROOT("Time per time step " << time / real_c(timesteps))
-   }
-
-   return EXIT_SUCCESS;
-}
-
-} // namespace walberla
-
-int main(int argc, char** argv) { walberla::main(argc, argv); }
diff --git a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py
deleted file mode 100644
index 7dd9d531b9730e9851e0f8cf53b7b48c4ae930a0..0000000000000000000000000000000000000000
--- a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py
+++ /dev/null
@@ -1,96 +0,0 @@
-from pystencils import Target
-from pystencils.field import fields
-from lbmpy import LBMConfig, LBMOptimisation, LBStencil, Method, Stencil
-
-from lbmpy.advanced_streaming.utility import get_timesteps
-from lbmpy.macroscopic_value_kernels import macroscopic_values_setter
-from lbmpy.creationfunctions import create_lb_collision_rule
-from lbmpy.boundaries import NoSlip, UBB, ExtrapolationOutflow
-
-from pystencils_walberla import CodeGeneration, generate_sweep, generate_info_header
-
-from lbmpy_walberla.additional_data_handler import UBBAdditionalDataHandler, OutflowAdditionalDataHandler
-from lbmpy_walberla import generate_lb_pack_info
-from lbmpy_walberla import generate_alternating_lbm_sweep, generate_alternating_lbm_boundary
-
-import sympy as sp
-
-with CodeGeneration() as ctx:
-    data_type = "float64" if ctx.double_accuracy else "float32"
-    stencil = LBStencil(Stencil.D3Q27)
-    q = stencil.Q
-    dim = stencil.D
-    streaming_pattern = 'esotwist'
-    timesteps = get_timesteps(streaming_pattern)
-
-    pdfs, velocity_field, density_field = fields(f"pdfs({q}), velocity({dim}), density(1) : {data_type}[{dim}D]",
-                                                 layout='fzyx')
-    omega = sp.Symbol("omega")
-    u_max = sp.Symbol("u_max")
-
-    output = {
-        'density': density_field,
-        'velocity': velocity_field
-    }
-
-    lbm_config = LBMConfig(stencil=stencil, method=Method.CUMULANT, compressible=True,
-                           relaxation_rate=omega, galilean_correction=True,
-                           field_name='pdfs', streaming_pattern=streaming_pattern, output=output)
-
-    lbm_optimisation = LBMOptimisation(symbolic_field=pdfs, cse_global=False, cse_pdfs=False)
-
-    collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_optimisation)
-    lb_method = collision_rule.method
-
-    # getter & setter
-    setter_assignments = macroscopic_values_setter(lb_method, velocity=velocity_field.center_vector,
-                                                   pdfs=pdfs, density=1.0,
-                                                   streaming_pattern=streaming_pattern,
-                                                   previous_timestep=timesteps[0])
-    setter_assignments = setter_assignments.new_without_unused_subexpressions()
-
-    # opt = {'instruction_set': 'sse', 'assume_aligned': True, 'nontemporal': False, 'assume_inner_stride_one': True}
-
-    stencil_typedefs = {'Stencil_T': stencil}
-    field_typedefs = {'PdfField_T': pdfs,
-                      'VelocityField_T': velocity_field,
-                      'ScalarField_T': density_field}
-
-    if ctx.cuda:
-        target = Target.GPU
-    else:
-        target = Target.CPU
-
-    # sweeps
-    generate_alternating_lbm_sweep(ctx, 'FlowAroundSphereCodeGen_LbSweep',
-                                   collision_rule, lbm_config=lbm_config, lbm_optimisation=lbm_optimisation,
-                                   target=target)
-    generate_sweep(ctx, 'FlowAroundSphereCodeGen_MacroSetter', setter_assignments, target=target)
-
-    # boundaries
-    ubb = UBB(lambda *args: None, dim=dim, data_type=data_type)
-    ubb_data_handler = UBBAdditionalDataHandler(stencil, ubb)
-    outflow = ExtrapolationOutflow(stencil[4], lb_method, streaming_pattern=streaming_pattern, data_type=data_type)
-    outflow_data_handler = OutflowAdditionalDataHandler(stencil, outflow, target=target)
-
-    generate_alternating_lbm_boundary(ctx, 'FlowAroundSphereCodeGen_UBB', ubb, lb_method,
-                                      target=target, streaming_pattern=streaming_pattern,
-                                      additional_data_handler=ubb_data_handler,
-                                      layout='fzyx')
-
-    generate_alternating_lbm_boundary(ctx, 'FlowAroundSphereCodeGen_NoSlip', NoSlip(), lb_method,
-                                      target=target, streaming_pattern=streaming_pattern,
-                                      layout='fzyx')
-
-    generate_alternating_lbm_boundary(ctx, 'FlowAroundSphereCodeGen_Outflow', outflow, lb_method,
-                                      target=target, streaming_pattern=streaming_pattern,
-                                      additional_data_handler=outflow_data_handler,
-                                      layout='fzyx')
-
-    # communication
-    generate_lb_pack_info(ctx, 'FlowAroundSphereCodeGen_PackInfo', stencil, pdfs,
-                          streaming_pattern=streaming_pattern, always_generate_separate_classes=True, target=target)
-
-    # Info header containing correct template definitions for stencil and field
-    generate_info_header(ctx, 'FlowAroundSphereCodeGen_InfoHeader',
-                         stencil_typedefs=stencil_typedefs, field_typedefs=field_typedefs)
diff --git a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGenParameters.py b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGenParameters.py
deleted file mode 100644
index 673c10e4d7a2a04117d2cb3a25ab1999d94311bd..0000000000000000000000000000000000000000
--- a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGenParameters.py
+++ /dev/null
@@ -1,66 +0,0 @@
-import waLBerla as wlb
-from lbmpy.relaxationrates import relaxation_rate_from_lattice_viscosity
-
-
-class Scenario:
-    def __init__(self):
-        self.timesteps = 10
-        self.vtkWriteFrequency = 100
-
-        self.cells = (64, 32, 32)
-        self.blocks = (1, 1, 1)
-        self.periodic = (0, 0, 0)
-
-        self.constant_inflow = True
-
-        self.diameter_sphere = min(self.cells) // 2
-        self.u_max = 0.1
-        self.reynolds_number = 1000000
-
-        kinematic_viscosity = (self.diameter_sphere * self.u_max) / self.reynolds_number
-
-        self.omega = relaxation_rate_from_lattice_viscosity(kinematic_viscosity)
-
-        self.total_cells = (self.cells[0] * self.blocks[0],
-                            self.cells[1] * self.blocks[1],
-                            self.cells[2] * self.blocks[2])
-
-    @wlb.member_callback
-    def config(self):
-        return {
-            'DomainSetup': {
-                'blocks': self.blocks,
-                'cellsPerBlock': self.cells,
-                'periodic': self.periodic,
-                'oneBlockPerProcess': True
-            },
-            'Parameters': {
-                'timesteps': self.timesteps,
-                'vtkWriteFrequency': self.vtkWriteFrequency,
-                'omega': self.omega,
-                'u_max': self.u_max,
-                'reynolds_number': self.reynolds_number,
-                'diameter_sphere': self.diameter_sphere,
-                'constant_inflow': self.constant_inflow
-            },
-            'Boundaries': {
-                'Border': [
-                    {'direction': 'N', 'walldistance': -1, 'flag': 'NoSlip'},
-                    {'direction': 'S', 'walldistance': -1, 'flag': 'NoSlip'},
-                    {'direction': 'W', 'walldistance': -1, 'flag': 'UBB'},
-                    {'direction': 'E', 'walldistance': -1, 'flag': 'Outflow'},
-                    {'direction': 'T', 'walldistance': -1, 'flag': 'NoSlip'},
-                    {'direction': 'B', 'walldistance': -1, 'flag': 'NoSlip'},
-                ],
-                'Body': [
-                    {'shape': "sphere",
-                     'midpoint': (int(0.40 * self.total_cells[0]), self.total_cells[1] // 2, self.total_cells[2] // 2),
-                     'radius': self.diameter_sphere // 2,
-                     'flag': 'NoSlip'}
-                ],
-            },
-        }
-
-
-scenarios = wlb.ScenarioManager()
-scenarios.add(Scenario())
diff --git a/apps/showcases/CMakeLists.txt b/apps/showcases/CMakeLists.txt
index 1442bac94b94111f1b290edf4ed86a9b51b46419..f601fdd24ebb1454ae518db8405be992f9f0be57 100644
--- a/apps/showcases/CMakeLists.txt
+++ b/apps/showcases/CMakeLists.txt
@@ -10,6 +10,7 @@ add_subdirectory( PegIntoSphereBed )
 if ( WALBERLA_BUILD_WITH_CODEGEN)
 
    add_subdirectory( Antidunes )
+   add_subdirectory( FlowAroundSphere )
 
    if (WALBERLA_BUILD_WITH_PYTHON)
       add_subdirectory( PhaseFieldAllenCahn )
diff --git a/apps/showcases/FlowAroundSphere/CMakeLists.txt b/apps/showcases/FlowAroundSphere/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f11856924f68a541dffeb1a89ce136d5f10ce52d
--- /dev/null
+++ b/apps/showcases/FlowAroundSphere/CMakeLists.txt
@@ -0,0 +1,31 @@
+waLBerla_link_files_to_builddir( *.py )
+waLBerla_link_files_to_builddir( *.prm )
+waLBerla_link_files_to_builddir( *.png )
+waLBerla_link_files_to_builddir( *.obj )
+waLBerla_link_files_to_builddir( *.stl )
+waLBerla_link_files_to_builddir( *.mtl )
+
+waLBerla_generate_target_from_python(NAME FlowAroundSphereCodeGen
+        FILE FlowAroundSphere.py
+        OUT_FILES FlowAroundSphereSweepCollection.h FlowAroundSphereSweepCollection.${CODEGEN_FILE_SUFFIX}
+        FlowAroundSphereStorageSpecification.h FlowAroundSphereStorageSpecification.${CODEGEN_FILE_SUFFIX}
+        FreeSlip.h FreeSlip.${CODEGEN_FILE_SUFFIX}
+        NoSlip.h NoSlip.${CODEGEN_FILE_SUFFIX}
+        Obstacle.h Obstacle.${CODEGEN_FILE_SUFFIX}
+        UBB.h UBB.${CODEGEN_FILE_SUFFIX}
+        Outflow.h Outflow.${CODEGEN_FILE_SUFFIX}
+        FixedDensity.h FixedDensity.${CODEGEN_FILE_SUFFIX}
+        FlowAroundSphereBoundaryCollection.h
+        FlowAroundSphereInfoHeader.h
+        FlowAroundSphereStaticDefines.h)
+
+if (WALBERLA_BUILD_WITH_CUDA OR WALBERLA_BUILD_WITH_HIP)
+    waLBerla_add_executable ( NAME FlowAroundSphere
+            FILES FlowAroundSphere.cpp Sphere.cpp Evaluation.cpp GridGeneration.h Types.h
+            DEPENDS blockforest boundary core field gpu lbm_generated stencil timeloop vtk FlowAroundSphereCodeGen )
+else()
+    waLBerla_add_executable ( NAME FlowAroundSphere
+            FILES FlowAroundSphere.cpp Sphere.cpp Evaluation.cpp GridGeneration.h Types.h
+            DEPENDS blockforest boundary core field lbm_generated stencil timeloop vtk FlowAroundSphereCodeGen )
+
+endif(WALBERLA_BUILD_WITH_CUDA OR WALBERLA_BUILD_WITH_HIP)
diff --git a/apps/showcases/FlowAroundSphere/Evaluation.cpp b/apps/showcases/FlowAroundSphere/Evaluation.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0b5eab1c93e1d63f05f95524fcdfe9471705216b
--- /dev/null
+++ b/apps/showcases/FlowAroundSphere/Evaluation.cpp
@@ -0,0 +1,181 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file Evaluation.cpp
+//! \author Markus Holzer <markus.holzer@fau.de>
+//
+//======================================================================================================================
+#include "Evaluation.h"
+
+
+namespace walberla
+{
+
+void Evaluation::operator()()
+{
+   if (checkFrequency_ == uint_c(0)) return;
+   ++coarseExecutionCounter_;
+   if (rampUpTime_ > coarseExecutionCounter_) return;
+
+   if ((coarseExecutionCounter_ - uint_c(1)) % checkFrequency_ != 0) return;
+
+   WALBERLA_CHECK_NOT_NULLPTR(blocks_)
+   WALBERLA_ROOT_SECTION()
+   {
+      for(auto it = dragResults.begin(); it != dragResults.end(); ++it){
+         coefficients_[0].push_back(it->cDRealArea);
+         coefficients_[1].push_back(it->cLRealArea);
+         coefficients_[2].push_back(it->cDDiscreteArea);
+         coefficients_[3].push_back(it->cLDiscreteArea);
+      }
+
+      if (coefficients_[0].size() > setup_.nbrOfEvaluationPointsForCoefficientExtremas){
+         for (uint_t i = uint_c(0); i < uint_c(4); ++i)
+            coefficients_[i].pop_front();
+      }
+
+      for (uint_t i = uint_c(0); i < uint_c(4); ++i){
+         coefficientExtremas_[i] = std::make_pair(*(coefficients_[i].begin()), *(coefficients_[i].begin()));
+         for (auto v = coefficients_[i].begin(); v != coefficients_[i].end(); ++v){
+            coefficientExtremas_[i].first  = std::min(coefficientExtremas_[i].first, *v);
+            coefficientExtremas_[i].second = std::max(coefficientExtremas_[i].second, *v);
+         }
+      }
+
+      std::ostringstream oss;
+
+      if (logToStream_ and !dragResults.empty()){
+         WALBERLA_LOG_RESULT_ON_ROOT(
+            "force acting on sphere (in dimensionless lattice units of the coarsest grid - evaluated in time step "
+            << coarseExecutionCounter_ - uint_c(1) << "):\n   " << force_ << oss.str()
+            << "\ndrag and lift coefficients (including extrema of last " << (coefficients_[0].size() * checkFrequency_)
+            << " time steps):"
+               "\n   \"real\" area:"
+               "\n      c_D: "
+            << dragResults.back().cDRealArea << " (min = " << coefficientExtremas_[0].first << ", max = " << coefficientExtremas_[0].second
+            << ")"
+            << "\n      c_L: " << dragResults.back().cLRealArea << " (min = " << coefficientExtremas_[1].first
+            << ", max = " << coefficientExtremas_[1].second << ")"
+            << "\n   discrete area:"
+               "\n      c_D: "
+            << dragResults.back().cDDiscreteArea << " (min = " << coefficientExtremas_[2].first
+            << ", max = " << coefficientExtremas_[2].second << ")"
+            << "\n      c_L: " << dragResults.back().cLDiscreteArea << " (min = " << coefficientExtremas_[3].first
+            << ", max = " << coefficientExtremas_[3].second << ")")
+      }
+
+      if (logToFile_ and !dragResults.empty()){
+         std::ofstream outfile( dragFilename_.c_str(), std::ios_base::app );
+
+         for(auto it = dragResults.begin(); it != dragResults.end(); ++it){
+            outfile << it->timestep << ",";
+            outfile << it->Fx << "," << it->Fy << "," << it->Fz << ",";
+            outfile << it->cDRealArea << ",";
+            outfile << it->cLRealArea << ",";
+            outfile << it->cDDiscreteArea << ",";
+            outfile << it->cLDiscreteArea;
+            outfile << "\n";
+         }
+         outfile.close();
+      }
+      dragResults.clear();
+   }
+}
+
+void Evaluation::resetForce()
+{
+   if (!initialized_) refresh();
+}
+
+void Evaluation::forceCalculation(const uint_t level)
+{
+   if (rampUpTime_ > coarseExecutionCounter_) return;
+
+   if(level == maxLevel_){
+      for (auto b : finestBlocks_){
+         force_ += Vector3<double>(boundaryCollection_.ObstacleObject->getForce(b));
+      }
+
+      mpi::reduceInplace(force_, mpi::SUM);
+      WALBERLA_ROOT_SECTION(){
+         const double meanU2 = double_c(meanVelocity) * double_c(meanVelocity);
+
+         const double cDRealArea = (double_c(8.0) * force_[0]) / (meanU2 * double_c(surfaceAreaSphere));
+         const double cLRealArea = (double_c(8.0) * force_[1]) / (meanU2 * double_c(surfaceAreaSphere));
+         const double cDDiscreteArea = (double_c(8.0) * force_[0]) / (meanU2 * double_c(AD_));
+         const double cLDiscreteArea = (double_c(8.0) * force_[1]) / (meanU2 * double_c(AL_));
+
+         DragCoefficient DC(fineExecutionCounter_, force_, cDRealArea, cLRealArea, cDDiscreteArea, cLDiscreteArea);
+         dragResults.push_back(DC);
+
+         fineExecutionCounter_++;
+      }
+   }
+
+   force_[0] = double_c(0.0);
+   force_[1] = double_c(0.0);
+   force_[2] = double_c(0.0);
+}
+
+void Evaluation::refresh()
+{
+   WALBERLA_CHECK_NOT_NULLPTR(blocks_)
+   const uint_t finestLevel = blocks_->getDepth();
+
+   double AD(double_c(0));
+   double AL(double_c(0));
+   for (auto block = blocks_->begin(); block != blocks_->end(); ++block){
+      const uint_t blockLevel = blocks_->getLevel(*block);
+      const FlagField_T* const flagField = block->template getData< FlagField_T >(ids_.flagField);
+
+      const auto fluid    = flagField->getFlag(setup_.fluidUID);
+      const auto obstacle = flagField->getFlag(setup_.obstacleUID);
+      const double area           = double_c(4.0);
+
+      auto xyzSize = flagField->xyzSize();
+      for (cell_idx_t z = xyzSize.zMin(); z <= xyzSize.zMax(); ++z){
+         for (cell_idx_t y = xyzSize.yMin(); y <= xyzSize.yMax(); ++y){
+            for (cell_idx_t x = xyzSize.xMin(); x <= xyzSize.xMax(); ++x){
+               if (flagField->isFlagSet(x, y, z, fluid)){
+                  for (auto it = Stencil_T::beginNoCenter(); it != Stencil_T::end(); ++it){
+                     const cell_idx_t nx = x + cell_idx_c(it.cx());
+                     const cell_idx_t ny = y + cell_idx_c(it.cy());
+                     const cell_idx_t nz = z + cell_idx_c(it.cz());
+
+                     if (flagField->isFlagSet(nx, ny, nz, obstacle)){
+                        WALBERLA_CHECK(blockLevel == finestLevel, "The sphere must be completely located on the finest level")
+                        if (it.cx() == 1 && it.cy() == 0 && it.cz() == 0) { AD += area; }
+                        else if (it.cx() == 0 && it.cz() == 0){
+                           if (it.cy() == 1){
+                              AL += area;
+                           }
+                        }
+                     }
+                  }
+               }
+            }
+         }
+      }
+   }
+   mpi::reduceInplace(AD, mpi::SUM);
+   mpi::reduceInplace(AL, mpi::SUM);
+
+   WALBERLA_ROOT_SECTION(){
+      AD_ = AD;
+      AL_ = AL;
+   }
+   initialized_ = true;
+}
+}
\ No newline at end of file
diff --git a/apps/showcases/FlowAroundSphere/Evaluation.h b/apps/showcases/FlowAroundSphere/Evaluation.h
new file mode 100644
index 0000000000000000000000000000000000000000..659ab258a94876d47cf3d54c1fe052416555da68
--- /dev/null
+++ b/apps/showcases/FlowAroundSphere/Evaluation.h
@@ -0,0 +1,164 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file Evaluation.cpp
+//! \author Markus Holzer <markus.holzer@fau.de>
+//
+//======================================================================================================================
+# pragma once
+
+#include "core/config/Config.h"
+#include "core/math/Constants.h"
+#include "core/Filesystem.h"
+#include "core/math/Sample.h"
+
+#include "lbm_generated/field/PdfField.h"
+
+#include "Types.h"
+#include "Setup.h"
+#include "FlowAroundSphereInfoHeader.h"
+
+#include <deque>
+#include <fstream>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+using namespace walberla;
+
+using FlagField_T          = FlagField< uint8_t >;
+using BoundaryCollection_T = lbm::FlowAroundSphereBoundaryCollection< FlagField_T >;
+
+namespace walberla
+{
+
+struct DragCoefficient {
+   uint_t timestep;
+   double Fx;
+   double Fy;
+   double Fz;
+   double cDRealArea;
+   double cLRealArea;
+   double cDDiscreteArea;
+   double cLDiscreteArea;
+   DragCoefficient(uint_t t, Vector3<double> f, double cdR, double clR, double cdD, double clD) : timestep(t), Fx(f[0]), Fy(f[1]), Fz(f[2]), cDRealArea(cdR), cLRealArea(clR), cDDiscreteArea(cdD), cLDiscreteArea(clD) {}
+};
+
+class Evaluation
+{
+ public:
+   Evaluation(std::shared_ptr< StructuredBlockForest >& blocks, const uint_t checkFrequency, const uint_t rampUpTime,
+              BoundaryCollection_T & boundaryCollection,
+              const IDs& ids, const Setup& setup,
+              const bool logToStream = true, const bool logToFile = true)
+      : blocks_(blocks), checkFrequency_(checkFrequency), rampUpTime_(rampUpTime),
+        boundaryCollection_(boundaryCollection), ids_(ids), setup_(setup),
+        logToStream_(logToStream), logToFile_(logToFile)
+   {
+      WALBERLA_ASSERT_NOT_NULLPTR(blocks)
+      maxLevel_ = blocks->getDepth();
+      blocks->getBlocks(finestBlocks_, maxLevel_);
+
+      coefficients_.resize(uint_c(4));
+      coefficientExtremas_.resize(uint_c(4));
+
+      const double factor = setup_.dxC / setup_.dxF;
+      diameterSphere      = double_c(2.0) * (double_c(setup_.sphereRadius) * factor);
+      surfaceAreaSphere   = math::pi * diameterSphere * diameterSphere;
+      meanVelocity        = setup_.inflowVelocity; // (double_c(4.0) * setup_.inflowVelocity) / double_c(9.0);
+
+      dragFilename_ = std::string("dragSphereRe_") + std::to_string(uint_t(setup_.reynoldsNumber)) + std::string("_meshLevels_") +  std::to_string(uint_t(setup_.refinementLevels + 1)) + std::string(".csv");
+
+      WALBERLA_ROOT_SECTION(){
+         if (logToFile_){
+            filesystem::path dataFile( dragFilename_.c_str() );
+            if( filesystem::exists( dataFile ) )
+               std::remove( dataFile.string().c_str() );
+
+            std::ofstream outfile( dragFilename_.c_str() );
+            outfile << "SEP=," << "\n";
+
+            setup_.writeParameterHeader(outfile);
+
+            outfile << "timestep," << "Fx," << "Fy," << "Fz," << "cDRealArea," << "cLRealArea," << "cDDiscreteArea," << "cLDiscreteArea" << "\n";
+            outfile.close();
+         }
+      }
+
+      refresh();
+
+      WALBERLA_LOG_INFO_ON_ROOT(
+         "Evaluation initialised:"
+            "\n   + Sphere real area:                      " << surfaceAreaSphere
+         << "\n   + Sphere real diameter:                  " << diameterSphere
+         << "\n   + Sphere discrete area drag coefficient: " << AD_
+         << "\n   + Sphere discrete area lift coefficient: " << AL_
+      )
+
+   }
+
+   void operator()();
+   void forceCalculation(const uint_t level); // for calculating the force
+   void resetForce();
+
+   std::function<void (const uint_t)> forceCalculationFunctor()
+   {
+      return [this](uint_t level) { forceCalculation(level); };
+   }
+
+   std::function<void()> resetForceFunctor()
+   {
+      return [this]() { resetForce(); };
+   }
+
+   void refresh();
+
+ protected:
+   bool initialized_{false};
+
+   std::shared_ptr< StructuredBlockForest > blocks_;
+   uint_t maxLevel_;
+   std::vector<Block *> finestBlocks_;
+
+   uint_t coarseExecutionCounter_{ uint_c(0) };
+   uint_t fineExecutionCounter_{ uint_c(0) };
+   uint_t checkFrequency_;
+   uint_t rampUpTime_;
+
+   BoundaryCollection_T & boundaryCollection_;
+
+   IDs ids_;
+   Setup setup_;
+
+   double diameterSphere;
+   double surfaceAreaSphere;
+   double meanVelocity;
+
+   Vector3< double > force_;
+   double AD_;
+   double AL_;
+   std::vector<DragCoefficient> dragResults;
+
+   std::vector< std::deque< double > > coefficients_;
+   std::vector< std::pair< double, double > > coefficientExtremas_;
+
+   bool logToStream_;
+   bool logToFile_;
+   std::string dragFilename_;
+
+}; // class Evaluation
+
+}
\ No newline at end of file
diff --git a/apps/showcases/FlowAroundSphere/FlowAroundSphere.cpp b/apps/showcases/FlowAroundSphere/FlowAroundSphere.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e8f5865173d0a63b8986db79cf5d82fd7183b7dd
--- /dev/null
+++ b/apps/showcases/FlowAroundSphere/FlowAroundSphere.cpp
@@ -0,0 +1,556 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file FlowAroundSphere.cpp
+//! \author Markus Holzer <markus.holzer@fau.de>
+//
+//======================================================================================================================
+
+#include "blockforest/StructuredBlockForest.h"
+#include "blockforest/communication/NonUniformBufferedScheme.h"
+
+#include "core/Abort.h"
+#include "core/DataTypes.h"
+#include "core/SharedFunctor.h"
+#include "core/debug/CheckFunctions.h"
+#include "core/logging/Logging.h"
+#include "core/math/Vector3.h"
+#include "core/Environment.h"
+#include "core/mpi/MPIManager.h"
+#include "core/MemoryUsage.h"
+#include "core/mpi/Reduce.h"
+#include "core/timing/RemainingTimeLogger.h"
+#include "core/timing/TimingPool.h"
+
+#include "field/AddToStorage.h"
+#include "field/CellCounter.h"
+#include "field/FlagField.h"
+#include "field/StabilityChecker.h"
+#include "field/adaptors/AdaptorCreators.h"
+#include "field/iterators/FieldIterator.h"
+#include "field/vtk/VTKWriter.h"
+#include "field/vtk/FlagFieldCellFilter.h"
+
+#include "geometry/InitBoundaryHandling.h"
+
+#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+#   include "gpu/AddGPUFieldToStorage.h"
+#   include "gpu/DeviceSelectMPI.h"
+#   include "gpu/ErrorChecking.h"
+#   include "gpu/FieldCopy.h"
+#   include "gpu/HostFieldAllocator.h"
+#   include "gpu/ParallelStreams.h"
+#   include "gpu/communication/NonUniformGPUScheme.h"
+#   include "gpu/communication/UniformGPUScheme.h"
+#endif
+
+#include "lbm_generated/communication/NonuniformGeneratedPdfPackInfo.h"
+#include "lbm_generated/communication/UniformGeneratedPdfPackInfo.h"
+#include "lbm_generated/evaluation/PerformanceEvaluation.h"
+#include "lbm_generated/field/AddToStorage.h"
+#include "lbm_generated/field/PdfField.h"
+#include "lbm_generated/refinement/BasicRecursiveTimeStep.h"
+#include "lbm_generated/refinement/RefinementScaling.h"
+
+#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+#   include "lbm_generated/gpu/AddToStorage.h"
+#   include "lbm_generated/gpu/BasicRecursiveTimeStepGPU.h"
+#   include "lbm_generated/gpu/GPUPdfField.h"
+#   include "lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.h"
+#   include "lbm_generated/gpu/UniformGeneratedGPUPdfPackInfo.h"
+#endif
+
+#include "timeloop/SweepTimeloop.h"
+#include "vtk/VTKOutput.h"
+
+#include <cstdlib>
+#include <iostream>
+#include <memory>
+
+#include "Evaluation.h"
+#include "GridGeneration.h"
+#include "Setup.h"
+#include "Sphere.h"
+#include "Types.h"
+
+#include "FlowAroundSphereStaticDefines.h"
+
+using namespace walberla;
+
+using BoundaryCollection_T = lbm::FlowAroundSphereBoundaryCollection< FlagField_T >;
+using SweepCollection_T = lbm::FlowAroundSphereSweepCollection;
+
+#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+using GPUPdfField_T = lbm_generated::GPUPdfField< StorageSpecification_T >;
+using gpu::communication::NonUniformGPUScheme;
+using gpu::communication::UniformGPUScheme;
+
+using lbm_generated::NonuniformGeneratedGPUPdfPackInfo;
+using lbm_generated::UniformGeneratedGPUPdfPackInfo;
+#else
+using PdfField_T = lbm_generated::PdfField< StorageSpecification_T >;
+using blockforest::communication::NonUniformBufferedScheme;
+using blockforest::communication::UniformBufferedScheme;
+
+using lbm_generated::NonuniformGeneratedPdfPackInfo;
+using lbm_generated::UniformGeneratedPdfPackInfo;
+#endif
+
+int main(int argc, char** argv)
+{
+   Environment env( argc, argv );
+   mpi::MPIManager::instance()->useWorldComm();
+#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+   gpu::selectDeviceBasedOnMpiRank();
+#endif
+
+   auto config = env.config();
+
+   ///////////////////////
+   /// PARAMETER INPUT ///
+   ///////////////////////
+
+   // read general simulation parameters
+   auto parameters       = config->getOneBlock("Parameters");
+   auto domainParameters = config->getOneBlock("DomainSetup");
+   auto boundariesConfig = config->getBlock("Boundaries");
+   auto loggingParameters= config->getOneBlock("Logging");
+   Setup setup(parameters, domainParameters, loggingParameters, boundariesConfig, infoMap);
+
+   bool writeSetupForestAndReturn              = loggingParameters.getParameter< bool >("writeSetupForestAndReturn", false);
+   if (uint_c(MPIManager::instance()->numProcesses()) > 1) writeSetupForestAndReturn = false;
+
+   WALBERLA_LOG_INFO_ON_ROOT("Diameter of the Sphere is resolved with " << setup.resolutionSphere << " lattice cells.")
+   Sphere Sphere( setup );
+
+   ///////////////////////////
+   /// CREATE BLOCK FOREST ///
+   ///////////////////////////
+
+   shared_ptr< BlockForest > bfs;
+   createBlockForest(bfs, setup);
+
+   if (writeSetupForestAndReturn && mpi::MPIManager::instance()->numProcesses() == 1)
+   {
+      WALBERLA_LOG_INFO_ON_ROOT("BlockForest has been created and writen to file. Returning program")
+
+      const uint_t nBlocks = bfs->getNumberOfBlocks();
+      const uint_t nCells = nBlocks * (setup.cellsPerBlock[0] * setup.cellsPerBlock[1] * setup.cellsPerBlock[2]);
+      const memory_t totalMemory = memory_t(nCells) * setup.memoryPerCell;
+
+      WALBERLA_LOG_INFO_ON_ROOT( "Benchmark run data:"
+                                "\n- simulation parameters:"   << std::setprecision(16) <<
+                                "\n   + collision model:     " << infoMap["collisionOperator"] <<
+                                "\n   + stencil:             " << infoMap["stencil"] <<
+                                "\n   + streaming:           " << infoMap["streamingPattern"] <<
+                                "\n   + compressible:        " << ( StorageSpecification_T::compressible ? "yes" : "no" ) <<
+                                "\n   + mesh levels:         " << setup.refinementLevels + uint_c(1) <<
+                                "\n   + resolution:          " << setup.dxC << " - on the coarsest grid" <<
+                                "\n   + resolution:          " << setup.dxF << " - on the finest grid" <<
+                                "\n- domain properties:      "
+                                "\n   + # blocks:            " << nBlocks <<
+                                "\n   + # cells:             " << uint_c(real_c(nCells) / real_c(1e6)) << " [1e6]"
+                                "\n   + # cells sphere D:    " << setup.resolutionSphere <<
+                                "\n   + total memory:        " << totalMemory / 1e9 << " [GB]" <<
+                                "\n- simulation properties:  "
+                                "\n   + sphere pos.(x):      " << setup.sphereXPosition * setup.dxC << " [m]" <<
+                                "\n   + sphere pos.(y):      " << setup.sphereYPosition * setup.dxC << " [m]" <<
+                                "\n   + sphere pos.(z):      " << setup.sphereZPosition * setup.dxC << " [m]" <<
+                                "\n   + sphere radius:       " << setup.sphereRadius * setup.dxC << " [m]" <<
+                                "\n   + kin. viscosity:      " << setup.viscosity * setup.dxC * setup.dxC / setup.dt << " [m^2/s] (on the coarsest grid)" <<
+                                "\n   + viscosity LB:        " << setup.viscosity  << " [dx*dx/dt] (on the coarsest grid)" <<
+                                "\n   + omega:               " << setup.omega << " (on the coarsest grid)" <<
+                                "\n   + rho:                 " << setup.rho << " [kg/m^3]" <<
+                                "\n   + inflow velocity:     " << setup.referenceVelocity << " [m/s]" <<
+                                "\n   + lattice velocity:    " << setup.inflowVelocity << " [dx/dt]" <<
+                                "\n   + Reynolds number:     " << setup.reynoldsNumber <<
+                                "\n   + Mach number:         " << setup.machNumber <<
+                                "\n   + dx (coarsest grid):  " << setup.dxC << " [m]" <<
+                                "\n   + dt (coarsest grid):  " << setup.dt << " [s]"
+                                "\n   + #time steps:         " << setup.timesteps << " (on the coarsest grid, " << ( real_c(1.0) / setup.dt ) << " for 1s of real time)" <<
+                                "\n   + simulation time:     " << ( real_c( setup.timesteps ) * setup.dt ) << " [s]" )
+
+      logging::Logging::printFooterOnStream();
+      return EXIT_SUCCESS;
+   }
+
+   auto blocks = std::make_shared< StructuredBlockForest >(bfs, setup.cellsPerBlock[0], setup.cellsPerBlock[1], setup.cellsPerBlock[2]);
+   blocks->createCellBoundingBoxes();
+
+   ////////////////////////////////////
+   /// CREATE AND INITIALIZE FIELDS ///
+   ////////////////////////////////////
+
+   // create fields
+   const StorageSpecification_T StorageSpec = StorageSpecification_T();
+   IDs ids;
+
+#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+   auto PDFAllocator = make_shared< gpu::HostFieldAllocator< PdfField_T::value_type > >();
+   auto allocator = make_shared< gpu::HostFieldAllocator< VelocityField_T::value_type > >();
+   ids.pdfField = lbm_generated::addPdfFieldToStorage(blocks, "pdfs", StorageSpec, uint_c(2), field::fzyx, PDFAllocator);
+   ids.velocityField = field::addToStorage< VelocityField_T >(blocks, "velocity", real_c(0.0), field::fzyx, uint_c(2), allocator);
+   ids.densityField = field::addToStorage< ScalarField_T >(blocks, "density", real_c(1.0), field::fzyx, uint_c(2), allocator);
+   ids.omegaField = field::addToStorage< ScalarField_T >(blocks, "omega", real_c(0.0), field::fzyx, uint_c(2), allocator);
+   ids.flagField = field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field", uint_c(3));
+
+   ids.pdfFieldGPU = lbm_generated::addGPUPdfFieldToStorage< PdfField_T >(blocks, ids.pdfField, StorageSpec, "pdfs on GPU", true);
+   ids.velocityFieldGPU = gpu::addGPUFieldToStorage< VelocityField_T >(blocks, ids.velocityField, "velocity on GPU", true);
+   ids.densityFieldGPU = gpu::addGPUFieldToStorage< ScalarField_T >(blocks, ids.densityField, "density on GPU", true);
+   ids.omegaFieldGPU = gpu::addGPUFieldToStorage< ScalarField_T >(blocks, ids.omegaField, "omega on GPU", true);
+
+   WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+   WALBERLA_GPU_CHECK(gpuPeekAtLastError())
+#else
+   ids.pdfField = lbm_generated::addPdfFieldToStorage(blocks, "pdfs", StorageSpec, uint_c(2), field::fzyx);
+   ids.velocityField = field::addToStorage< VelocityField_T >(blocks, "vel", VelocityField_T::value_type(0.0), field::fzyx, uint_c(2));
+   ids.densityField = field::addToStorage< ScalarField_T >(blocks, "density", ScalarField_T::value_type(1.0), field::fzyx, uint_c(2));
+   ids.omegaField = field::addToStorage< ScalarField_T >(blocks, "omega", ScalarField_T::value_type(0.0), field::fzyx, uint_c(2));
+   ids.flagField = field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field", uint_c(3));
+#endif
+
+   auto spongeLayer     = config->getOneBlock("SpongeLayer");
+   const bool deactivateSpongeLayer = spongeLayer.getParameter< bool >("deactivateSpongeLayer");
+   const real_t targetOmega         = spongeLayer.getParameter< real_t >("targetOmega");
+   const real_t spongeStart         = spongeLayer.getParameter< real_t >("spongeStart");
+
+   const real_t omegaDiff   = setup.omega - targetOmega;
+   const real_t spongeWidth = real_c(blocks->getDomain().xMax()) - spongeStart;
+   const real_t spongeMid   = spongeStart + (spongeWidth / real_c(2.0));
+
+   if(!deactivateSpongeLayer)
+      WALBERLA_LOG_WARNING_ON_ROOT("Using a sponge layer at the Outlet. The sponge layer starts at " << spongeStart << " [m] and targets a relaxation rate of " << targetOmega << " at the outlet" )
+
+   for (auto& block : *blocks)
+   {
+      Block& b = dynamic_cast< Block& >(block);
+      const uint_t level = blocks->getLevel(b);
+
+      auto * omegaField = b.getData< ScalarField_T > ( ids.omegaField );
+      for( auto it = omegaField->beginWithGhostLayer(0); it != omegaField->end(); ++it ){
+         if(deactivateSpongeLayer){
+            omegaField->get(it.cell()) = ScalarField_T::value_type(lbm_generated::relaxationRateScaling(setup.omega, level));
+         }
+         else{
+            Cell globalCell;
+            blocks->transformBlockLocalToGlobalCell(globalCell, block, it.cell());
+            Vector3<real_t> cellCenter = blocks->getCellCenter(globalCell, level);
+            const real_t factor = real_c(0.5) + real_c(0.5) * std::tanh(real_c(2.0) * (cellCenter[0] - spongeMid) / spongeWidth);
+            omegaField->get(it.cell()) = ScalarField_T::value_type(lbm_generated::relaxationRateScaling(setup.omega - (factor * omegaDiff), level));
+         }
+      }
+   }
+
+#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+   gpu::fieldCpy< gpu::GPUField< ScalarField_T::value_type >, ScalarField_T>(blocks, ids.omegaFieldGPU, ids.omegaField);
+#endif
+
+   WALBERLA_MPI_BARRIER()
+
+   const Cell innerOuterSplit =
+      Cell(parameters.getParameter< Vector3< cell_idx_t > >("innerOuterSplit", Vector3< cell_idx_t >(1, 1, 1)));
+#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+   SweepCollection_T sweepCollection(blocks, ids.pdfFieldGPU, ids.omegaFieldGPU, ids.densityFieldGPU, ids.velocityFieldGPU, innerOuterSplit);
+#else
+   SweepCollection_T sweepCollection(blocks, ids.pdfField, ids.omegaField, ids.densityField, ids.velocityField, innerOuterSplit);
+#endif
+
+#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+   WALBERLA_LOG_INFO_ON_ROOT("Setting up communication...")
+   const bool gpuEnabledMPI       = parameters.getParameter< bool >("gpuEnabledMPI", false);
+
+   auto nonUniformCommunication = std::make_shared< NonUniformGPUScheme< CommunicationStencil_T > >(blocks, gpuEnabledMPI);
+   auto nonUniformPackInfo      = lbm_generated::setupNonuniformGPUPdfCommunication< GPUPdfField_T >(blocks, ids.pdfFieldGPU);
+   nonUniformCommunication->addPackInfo(nonUniformPackInfo);
+#else
+   WALBERLA_LOG_INFO_ON_ROOT("Setting up communication...")
+   auto nonUniformCommunication = std::make_shared< NonUniformBufferedScheme< CommunicationStencil_T > >(blocks);
+   auto nonUniformPackInfo      = lbm_generated::setupNonuniformPdfCommunication< PdfField_T >(blocks, ids.pdfField);
+   nonUniformCommunication->addPackInfo(nonUniformPackInfo);
+
+#endif
+   WALBERLA_MPI_BARRIER()
+   WALBERLA_LOG_INFO_ON_ROOT("Setting up communication done")
+
+   /////////////////////////
+   /// BOUNDARY HANDLING ///
+   /////////////////////////
+   WALBERLA_LOG_INFO_ON_ROOT("Start BOUNDARY HANDLING")
+   // create and initialize boundary handling
+   Sphere.setupBoundary(blocks, ids.flagField);
+   geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, ids.flagField, setup.fluidUID, cell_idx_c(0));
+
+   if(parameters.getParameter< bool >("initialiseWithInletVelocity"))
+   {
+      for (auto& block : *blocks)
+      {
+         auto * flagField = block.getData< FlagField_T > ( ids.flagField );
+         auto * velField = block.getData< VelocityField_T > ( ids.velocityField );
+         // auto domainFlag = flagField->getFlag(fluidFlagUID);
+
+         for( auto it = flagField->beginWithGhostLayer(2); it != flagField->end(); ++it )
+         {
+            // if (!isFlagSet(it, domainFlag))
+            //  continue;
+
+            velField->get(it.cell(), 0) = VelocityField_T::value_type(setup.inflowVelocity);
+         }
+      }
+   }
+
+#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+   gpu::fieldCpy< gpu::GPUField< VelocityField_T::value_type >, VelocityField_T>(blocks, ids.velocityFieldGPU, ids.velocityField);
+#endif
+
+   for (auto& block : *blocks)
+   {
+      sweepCollection.initialise(&block, cell_idx_c(1));
+   }
+
+#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+   gpu::fieldCpy< PdfField_T, gpu::GPUField< PdfField_T::value_type > >(blocks, ids.pdfField, ids.pdfFieldGPU);
+   sweepCollection.initialiseBlockPointer();
+#endif
+
+   std::function< VelocityField_T::value_type (const Cell&, const Cell&, const shared_ptr< StructuredBlockForest >&, IBlock&) >
+      wallDistanceFunctor = wallDistance(Sphere);
+
+   const real_t omegaFinestLevel = lbm_generated::relaxationRateScaling(setup.omega, setup.refinementLevels);
+
+#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+   BoundaryCollection_T boundaryCollection(blocks, ids.flagField, ids.pdfFieldGPU, setup.fluidUID, omegaFinestLevel, setup.inflowVelocity, wallDistanceFunctor, ids.pdfField);
+   WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+   WALBERLA_GPU_CHECK(gpuPeekAtLastError())
+#else
+   BoundaryCollection_T boundaryCollection(blocks, ids.flagField, ids.pdfField, setup.fluidUID, omegaFinestLevel, setup.inflowVelocity, wallDistanceFunctor);
+#endif
+   WALBERLA_MPI_BARRIER()
+   WALBERLA_LOG_INFO_ON_ROOT("BOUNDARY HANDLING done")
+
+   //////////////////////////////////
+   /// SET UP SWEEPS AND TIMELOOP ///
+   //////////////////////////////////
+   WALBERLA_LOG_INFO_ON_ROOT("Start SWEEPS AND TIMELOOP")
+   // flow evaluation
+   auto EvaluationParameters      = config->getOneBlock("Evaluation");
+   const uint_t evaluationCheckFrequency       = EvaluationParameters.getParameter< uint_t >("evaluationCheckFrequency");
+   const uint_t rampUpTime                     = EvaluationParameters.getParameter< uint_t >("rampUpTime");
+   const bool evaluationLogToStream            = EvaluationParameters.getParameter< bool >("logToStream");
+   const bool evaluationLogToFile              = EvaluationParameters.getParameter< bool >("logToFile");
+
+   shared_ptr< Evaluation > evaluation( new Evaluation( blocks, evaluationCheckFrequency, rampUpTime, boundaryCollection,
+                                                        ids, setup, evaluationLogToStream, evaluationLogToFile));
+
+   // create time loop
+   SweepTimeloop timeloop(blocks->getBlockStorage(), setup.timesteps);
+#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+   int streamHighPriority = 0;
+   int streamLowPriority  = 0;
+   WALBERLA_GPU_CHECK(gpuDeviceGetStreamPriorityRange(&streamLowPriority, &streamHighPriority))
+   sweepCollection.setOuterPriority(streamHighPriority);
+   auto defaultStream = gpu::StreamRAII::newPriorityStream(streamLowPriority);
+
+   lbm_generated::BasicRecursiveTimeStepGPU< GPUPdfField_T, SweepCollection_T, BoundaryCollection_T >
+      LBMMeshRefinement(blocks, ids.pdfFieldGPU, sweepCollection, boundaryCollection, nonUniformCommunication, nonUniformPackInfo);
+
+   LBMMeshRefinement.addRefinementToTimeLoop(timeloop, uint_c(0));
+   LBMMeshRefinement.addPostBoundaryHandlingBlockFunction(evaluation->forceCalculationFunctor());
+#else
+   std::shared_ptr< lbm_generated::BasicRecursiveTimeStep< PdfField_T, SweepCollection_T, BoundaryCollection_T > >
+      LBMRefinement;
+
+   LBMRefinement = std::make_shared<
+   lbm_generated::BasicRecursiveTimeStep< PdfField_T, SweepCollection_T, BoundaryCollection_T > >(
+   blocks, ids.pdfField, sweepCollection, boundaryCollection, nonUniformCommunication, nonUniformPackInfo);
+   LBMRefinement->addPostBoundaryHandlingBlockFunction(evaluation->forceCalculationFunctor());
+   LBMRefinement->addRefinementToTimeLoop(timeloop);
+#endif
+   //////////////////
+   /// VTK OUTPUT ///
+   //////////////////
+   WALBERLA_LOG_INFO_ON_ROOT("SWEEPS AND TIMELOOP done")
+
+   auto VTKWriter           = config->getOneBlock("VTKWriter");
+   const uint_t vtkWriteFrequency       = VTKWriter.getParameter< uint_t >("vtkWriteFrequency", 0);
+   const uint_t vtkGhostLayers          = VTKWriter.getParameter< uint_t >("vtkGhostLayers", 0);
+   const bool amrFileFormat             = VTKWriter.getParameter< bool >("amrFileFormat", false);
+   const bool oneFilePerProcess         = VTKWriter.getParameter< bool >("oneFilePerProcess", false);
+   const real_t samplingResolution      = VTKWriter.getParameter< real_t >("samplingResolution", real_c(-1.0));
+   const uint_t initialWriteCallsToSkip = VTKWriter.getParameter< uint_t >("initialWriteCallsToSkip", uint_c(0.0));
+   const real_t sliceThickness          = VTKWriter.getParameter< real_t >("sliceThickness", real_c(1.0));
+
+   auto finalDomain = blocks->getDomain();
+   if (vtkWriteFrequency > 0)
+   {
+      const std::string vtkFolder = "VTKSphereRE_" + std::to_string(uint64_c(setup.reynoldsNumber));
+      auto vtkOutput =
+         vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, vtkGhostLayers, false, vtkFolder,
+                                        "simulation_step", false, true, true, false, 0, amrFileFormat, oneFilePerProcess);
+
+      vtkOutput->setInitialWriteCallsToSkip(initialWriteCallsToSkip);
+
+      vtkOutput->addBeforeFunction([&]() {
+         for (auto& block : *blocks)
+         {
+            sweepCollection.calculateMacroscopicParameters(&block);
+         }
+
+#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+         gpu::fieldCpy< VelocityField_T, gpu::GPUField< VelocityField_T::value_type > >(blocks, ids.velocityField, ids.velocityFieldGPU);
+         gpu::fieldCpy< ScalarField_T, gpu::GPUField< ScalarField_T::value_type > >(blocks, ids.densityField, ids.densityFieldGPU);
+#endif
+      });
+
+      vtkOutput->setSamplingResolution(samplingResolution);
+
+      field::FlagFieldCellFilter<FlagField_T> fluidFilter( ids.flagField );
+      fluidFilter.addFlag( setup.obstacleUID );
+      vtkOutput->addCellExclusionFilter(fluidFilter);
+
+
+      if (VTKWriter.getParameter< bool >("writeOnlySlice", true)){
+         const AABB sliceXY(finalDomain.xMin(), finalDomain.yMin(), finalDomain.center()[2] - sliceThickness * setup.dxF,
+                            finalDomain.xMax(), finalDomain.yMax(), finalDomain.center()[2] + sliceThickness * setup.dxF);
+         vtkOutput->addCellInclusionFilter(vtk::AABBCellFilter(sliceXY));
+
+         if (VTKWriter.getParameter< bool >("writeXZSlice", false)){
+            const AABB sliceXZ(finalDomain.xMin(), finalDomain.center()[1] - setup.dxF, finalDomain.zMin(),
+                               finalDomain.xMax(), finalDomain.center()[1] + setup.dxF, finalDomain.zMax());
+            vtkOutput->addCellInclusionFilter(vtk::AABBCellFilter(sliceXZ));
+         }
+      }
+
+      if (VTKWriter.getParameter< bool >("velocity"))
+      {
+         auto velWriter = make_shared< field::VTKWriter< VelocityField_T, float32 > >(ids.velocityField, "velocity");
+         vtkOutput->addCellDataWriter(velWriter);
+      }
+      if (VTKWriter.getParameter< bool >("density"))
+      {
+         auto densityWriter = make_shared< field::VTKWriter< ScalarField_T, float32 > >(ids.densityField, "density");
+         vtkOutput->addCellDataWriter(densityWriter);
+      }
+      if (VTKWriter.getParameter< bool >("omega"))
+      {
+         auto omegaWriter = make_shared< field::VTKWriter< ScalarField_T, float32 > >(ids.omegaField, "omega");
+         vtkOutput->addCellDataWriter(omegaWriter);
+      }
+      if (VTKWriter.getParameter< bool >("flag"))
+      {
+         auto flagWriter = make_shared< field::VTKWriter< FlagField_T > >(ids.flagField, "flag");
+         vtkOutput->addCellDataWriter(flagWriter);
+      }
+      timeloop.addFuncAfterTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
+   }
+
+   // log remaining time
+   const real_t remainingTimeLoggerFrequency =
+      loggingParameters.getParameter< real_t >("remainingTimeLoggerFrequency", 3.0); // in seconds
+   if (uint_c(remainingTimeLoggerFrequency) > 0)
+   {
+      timeloop.addFuncAfterTimeStep(
+         timing::RemainingTimeLogger(timeloop.getNrOfTimeSteps(), remainingTimeLoggerFrequency),
+         "remaining time logger");
+   }
+
+   // LBM stability check
+   auto CheckerParameters      = config->getOneBlock("StabilityChecker");
+   const uint_t checkFrequency = CheckerParameters.getParameter< uint_t >("checkFrequency", uint_t(0));
+   if (checkFrequency > 0)
+   {
+      auto checkFunction = [](PdfField_T::value_type value) {  return value < math::abs(PdfField_T::value_type(10)); };
+      timeloop.addFuncAfterTimeStep(
+         makeSharedFunctor(field::makeStabilityChecker< PdfField_T, FlagField_T >(
+            config, blocks, ids.pdfField, ids.flagField, setup.fluidUID, checkFunction)),
+         "Stability check");
+   }
+
+   timeloop.addFuncBeforeTimeStep( SharedFunctor< Evaluation >(evaluation), "evaluation" );
+
+   WALBERLA_MPI_BARRIER()
+#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+   WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+   WALBERLA_GPU_CHECK(gpuPeekAtLastError())
+#endif
+   // WALBERLA_LOG_INFO_ON_ROOT("Execute single timestep to fully complete the preprocessing done")
+
+   //////////////////////
+   /// RUN SIMULATION ///
+   //////////////////////
+   const lbm_generated::PerformanceEvaluation< FlagField_T > performance(blocks, ids.flagField, setup.fluidUID);
+   field::CellCounter< FlagField_T > fluidCells(blocks, ids.flagField, setup.fluidUID);
+   fluidCells();
+
+   WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << blocks->getNumberOfBlocks())
+   for (uint_t level = 0; level <= setup.refinementLevels; level++)
+   {
+      WALBERLA_LOG_INFO_ON_ROOT("Level " << level << " Blocks: " << blocks->getNumberOfBlocks(level))
+   }
+
+   WALBERLA_LOG_INFO_ON_ROOT( "Benchmark run data:"
+                             "\n- simulation parameters:"   << std::setprecision(16) <<
+                             "\n   + collision model:     " << infoMap["collisionOperator"] <<
+                             "\n   + stencil:             " << infoMap["stencil"] <<
+                             "\n   + streaming:           " << infoMap["streamingPattern"] <<
+                             "\n   + compressible:        " << ( StorageSpecification_T::compressible ? "yes" : "no" ) <<
+                             "\n   + mesh levels:         " << setup.refinementLevels + uint_c(1) <<
+                             "\n   + resolution:          " << setup.dxC << " - on the coarsest grid" <<
+                             "\n   + resolution:          " << setup.dxF << " - on the finest grid" <<
+                             "\n- simulation properties:  "
+                             "\n   + sphere pos.(x):      " << setup.sphereXPosition * setup.dxC << " [m]" <<
+                             "\n   + sphere pos.(y):      " << setup.sphereYPosition * setup.dxC << " [m]" <<
+                             "\n   + sphere pos.(z):      " << setup.sphereZPosition * setup.dxC << " [m]" <<
+                             "\n   + sphere radius:       " << setup.sphereRadius * setup.dxC << " [m]" <<
+                             "\n   + kin. viscosity:      " << setup.viscosity * setup.dxC * setup.dxC / setup.dt << " [m^2/s] (on the coarsest grid)" <<
+                             "\n   + viscosity LB:        " << setup.viscosity  << " [dx*dx/dt] (on the coarsest grid)" <<
+                             "\n   + omega:               " << setup.omega << " (on the coarsest grid)" <<
+                             "\n   + rho:                 " << setup.rho << " [kg/m^3]" <<
+                             "\n   + inflow velocity:     " << setup.referenceVelocity << " [m/s]" <<
+                             "\n   + lattice velocity:    " << setup.inflowVelocity << " [dx/dt]" <<
+                             "\n   + Reynolds number:     " << setup.reynoldsNumber <<
+                             "\n   + Mach number:         " << setup.machNumber <<
+                             "\n   + dt (coarsest grid):  " << setup.dt << " [s]"
+                             "\n   + #time steps:         " << timeloop.getNrOfTimeSteps() << " (on the coarsest grid, " << ( real_c(1.0) / setup.dt ) << " for 1s of real time)" <<
+                             "\n   + simulation time:     " << ( real_c( timeloop.getNrOfTimeSteps() ) * setup.dt ) << " [s]" )
+
+   WALBERLA_LOG_INFO_ON_ROOT("Starting Simulation")
+   WcTimingPool timeloopTiming;
+   WcTimer simTimer;
+
+   WALBERLA_MPI_BARRIER()
+#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+   WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+   WALBERLA_GPU_CHECK(gpuPeekAtLastError())
+#endif
+
+   simTimer.start();
+   timeloop.run(timeloopTiming);
+#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
+   WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+   WALBERLA_GPU_CHECK(gpuPeekAtLastError())
+#endif
+   WALBERLA_MPI_BARRIER()
+   simTimer.end();
+   WALBERLA_LOG_INFO_ON_ROOT("Simulation finished")
+   double time = double_c(simTimer.max());
+   WALBERLA_MPI_SECTION() { walberla::mpi::reduceInplace(time, walberla::mpi::MAX); }
+   performance.logResultOnRoot(setup.timesteps, time);
+
+   const auto reducedTimeloopTiming = timeloopTiming.getReduced();
+   WALBERLA_LOG_RESULT_ON_ROOT("Time loop timing:\n" << *reducedTimeloopTiming)
+
+   printResidentMemoryStatistics();
+   return EXIT_SUCCESS;
+}
\ No newline at end of file
diff --git a/apps/showcases/FlowAroundSphere/FlowAroundSphere.prm b/apps/showcases/FlowAroundSphere/FlowAroundSphere.prm
new file mode 100644
index 0000000000000000000000000000000000000000..b7a8ecaa60f12c5f9fa57f30465670c66df49f34
--- /dev/null
+++ b/apps/showcases/FlowAroundSphere/FlowAroundSphere.prm
@@ -0,0 +1,86 @@
+Parameters
+{
+    reynoldsNumber 1000000;
+    diameterSphere 1.0;
+    sphereXPosition 12;
+
+    referenceVelocity 1.0;
+    latticeVelocity 0.05;
+    initialiseWithInletVelocity true;
+
+    coarseMeshSize 0.0625;
+    timesteps 60001;
+
+    processMemoryLimit 512; // MiB
+    innerOuterSplit <1, 1, 1>;
+
+    // GPU related Parameters, only used if GPU is enabled
+    gpuEnabledMPI true;
+}
+
+//! [domainSetup]
+DomainSetup
+{
+    cellsPerBlock < 64, 64, 64 >;
+    domainSize    < 40, 20, 20 >;
+    periodic    < false, false, false >;
+    refinementLevels 5;
+    numberProcesses 1; // This is for load balancing, overwritten if more than one proc is used
+    blockForestFilestem flowAroundSphereBlockForest;
+}
+//! [domainSetup]
+
+Boundaries
+{
+    sphere Obstacle;
+    inflow UBB;
+    outflow FixedDensity;
+    walls FreeSlip;
+}
+
+SpongeLayer
+{
+    deactivateSpongeLayer false;
+    targetOmega 1.9;
+    spongeStart 36;
+}
+
+StabilityChecker
+{
+    checkFrequency 0;
+    streamOutput   false;
+    vtkOutput      true;
+}
+
+VTKWriter
+{
+    vtkWriteFrequency 5000;
+    vtkGhostLayers 0;
+    velocity true;
+    density true;
+    flag false;
+    omega false;
+    writeOnlySlice true;
+    sliceThickness 1;
+    writeXZSlice false;
+    amrFileFormat true;
+    oneFilePerProcess false;
+    samplingResolution -1;
+    initialWriteCallsToSkip 0;
+}
+
+Logging
+{
+    logLevel info;  // info progress detail tracing
+    writeSetupForestAndReturn true; // When only one process is used the decomposition is writen and the program terminates
+    readSetupFromFile false;
+    remainingTimeLoggerFrequency 60; // in seconds
+}
+
+Evaluation
+{
+    evaluationCheckFrequency 500;
+    rampUpTime 0;
+    logToStream true;
+    logToFile true;
+}
diff --git a/apps/showcases/FlowAroundSphere/FlowAroundSphere.py b/apps/showcases/FlowAroundSphere/FlowAroundSphere.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac15244c1b9c9bca9e7021193d1b76a00780f244
--- /dev/null
+++ b/apps/showcases/FlowAroundSphere/FlowAroundSphere.py
@@ -0,0 +1,115 @@
+import sympy as sp
+from pystencils import Target
+
+from pystencils.field import fields
+from pystencils.simp import insert_aliases, insert_constants
+
+from lbmpy import LBStencil, LBMConfig, LBMOptimisation
+from lbmpy.boundaries.boundaryconditions import (ExtrapolationOutflow, UBB, QuadraticBounceBack,
+                                                 FreeSlip, NoSlip, FixedDensity)
+from lbmpy.creationfunctions import create_lb_collision_rule
+from lbmpy.enums import Method, Stencil, SubgridScaleModel
+
+from pystencils_walberla import CodeGeneration, generate_info_header
+from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator
+
+info_header = """
+#pragma once
+#include <map>
+std::map<std::string, std::string> infoMap{{{{"stencil", "{stencil}"}},
+                                           {{"streamingPattern", "{streaming_pattern}"}}, 
+                                           {{"collisionOperator", "{collision_operator}"}}}};
+"""
+
+omega = sp.symbols("omega")
+inlet_velocity = sp.symbols("u_x")
+max_threads = 256
+
+with CodeGeneration() as ctx:
+    target = Target.GPU if ctx.gpu else Target.CPU
+    sweep_params = {'block_size': (128, 1, 1)} if ctx.gpu else {}
+
+    # The application is meant to be compiled with double precision. For single precision, the pdf_dtype can be switched
+    # to float32. In this way the calculations are still performed in double precision while the application can profit
+    # from enhanced performance due to the lower precision of the PDF field
+    dtype = 'float64' if ctx.double_accuracy else 'float32'
+    pdf_dtype = 'float64'
+
+    stencil = LBStencil(Stencil.D3Q27)
+    q = stencil.Q
+    dim = stencil.D
+
+    streaming_pattern = 'esopull'
+
+    pdfs = fields(f"pdfs_src({stencil.Q}): {pdf_dtype}[3D]", layout='fzyx')
+    velocity_field, density_field = fields(f"velocity({dim}), density(1) : {dtype}[{dim}D]", layout='fzyx')
+    omega_field = fields(f"rr(1) : {dtype}[{dim}D]", layout='fzyx')
+
+    macroscopic_fields = {'density': density_field, 'velocity': velocity_field}
+
+    method_enum = Method.CUMULANT
+    lbm_config = LBMConfig(
+        method=method_enum,
+        stencil=stencil,
+        relaxation_rate=omega_field.center,
+        compressible=True,
+        # subgrid_scale_model=SubgridScaleModel.QR,
+        fourth_order_correction=0.01 if method_enum == Method.CUMULANT and stencil.Q == 27 else False,
+        field_name='pdfs',
+        streaming_pattern=streaming_pattern,
+    )
+
+    lbm_opt = LBMOptimisation(cse_global=False, cse_pdfs=False, field_layout="fzyx",
+                              symbolic_field=pdfs)
+
+    collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt)
+    if lbm_config.method == Method.CUMULANT:
+        collision_rule = insert_constants(collision_rule)
+        collision_rule = insert_aliases(collision_rule)
+    lb_method = collision_rule.method
+
+    freeslip = lbm_boundary_generator("FreeSlip", flag_uid="FreeSlip", boundary_object=FreeSlip(stencil),
+                                      field_data_type=pdf_dtype)
+    no_slip = lbm_boundary_generator(class_name='NoSlip', flag_uid='NoSlip',
+                                     boundary_object=NoSlip(), field_data_type=pdf_dtype)
+
+    quadratic_bounce_back = QuadraticBounceBack(omega, calculate_force_on_boundary=True, data_type=dtype)
+    no_slip_interpolated = lbm_boundary_generator(class_name='Obstacle', flag_uid='Obstacle',
+                                                  boundary_object=quadratic_bounce_back,
+                                                  field_data_type=pdf_dtype)
+
+    ubb = lbm_boundary_generator(class_name='UBB', flag_uid='UBB',
+                                 boundary_object=UBB((inlet_velocity, 0.0, 0.0), density=1.0, data_type=dtype),
+                                 field_data_type=pdf_dtype)
+
+    outflow_boundary = ExtrapolationOutflow(stencil[4], lb_method, data_type=pdf_dtype)
+    outflow = lbm_boundary_generator(class_name='Outflow', flag_uid='Outflow',
+                                     boundary_object=outflow_boundary,
+                                     field_data_type=pdf_dtype)
+
+    fixed_density = lbm_boundary_generator(class_name='FixedDensity', flag_uid='FixedDensity',
+                                           boundary_object=FixedDensity(1.0),
+                                           field_data_type=pdf_dtype)
+
+    generate_lbm_package(ctx, name="FlowAroundSphere", collision_rule=collision_rule,
+                         lbm_config=lbm_config, lbm_optimisation=lbm_opt,
+                         nonuniform=True, boundaries=[freeslip, no_slip, no_slip_interpolated,
+                                                      ubb, outflow, fixed_density],
+                         macroscopic_fields=macroscopic_fields, gpu_indexing_params=sweep_params,
+                         target=target, data_type=dtype, pdfs_data_type=pdf_dtype,
+                         max_threads=max_threads)
+
+    field_typedefs = {'VelocityField_T': velocity_field,
+                      'ScalarField_T': density_field}
+
+    # Info header containing correct template definitions for stencil and field
+    generate_info_header(ctx, 'FlowAroundSphereInfoHeader',
+                         field_typedefs=field_typedefs)
+
+    infoHeaderParams = {
+        'stencil': stencil.name.lower(),
+        'streaming_pattern': streaming_pattern,
+        'collision_operator': lbm_config.method.name.lower(),
+    }
+
+    ctx.write_file("FlowAroundSphereStaticDefines.h", info_header.format(**infoHeaderParams))
diff --git a/apps/showcases/FlowAroundSphere/GridGeneration.h b/apps/showcases/FlowAroundSphere/GridGeneration.h
new file mode 100644
index 0000000000000000000000000000000000000000..2758fda0866ff4c102f9caea86ece6a22674a7d6
--- /dev/null
+++ b/apps/showcases/FlowAroundSphere/GridGeneration.h
@@ -0,0 +1,145 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file GridGeneration.h
+//! \author Markus Holzer <markus.holzer@fau.de>
+//
+//======================================================================================================================
+#pragma once
+
+#include "blockforest/Initialization.h"
+#include "blockforest/SetupBlock.h"
+#include "blockforest/SetupBlockForest.h"
+#include "blockforest/loadbalancing/StaticCurve.h"
+
+#include "core/Environment.h"
+#include "core/logging/Initialization.h"
+#include "core/timing/RemainingTimeLogger.h"
+#include "core/timing/TimingPool.h"
+
+#include <string>
+#include <fstream>
+
+#include "Types.h"
+#include "Setup.h"
+#include "Sphere.h"
+
+using namespace walberla;
+
+uint_t blockCells(const Setup& setup, const bool withGhostLayers){
+   uint_t cells;
+   if(!withGhostLayers){
+      cells = setup.cellsPerBlock[0] * setup.cellsPerBlock[1] * setup.cellsPerBlock[2];
+   }
+   else{
+      cells = (setup.cellsPerBlock[0] + 2 * setup.numGhostLayers) *
+              (setup.cellsPerBlock[1] + 2 * setup.numGhostLayers) *
+              (setup.cellsPerBlock[2] + 2 * setup.numGhostLayers);
+   }
+   return cells;
+}
+
+
+void createSetupBlockForest(SetupBlockForest& setupBfs, const Setup& setup, const bool useMPIManager=false)
+{
+   WALBERLA_LOG_INFO_ON_ROOT("Generating SetupBlockForest...")
+
+   uint_t numProcesses = setup.numProcesses;
+   const std::string blockForestFilestem = setup.blockForestFilestem;
+
+   if(useMPIManager) {numProcesses = uint_c(mpi::MPIManager::instance()->numProcesses());}
+
+   Sphere Sphere( setup );
+   SphereRefinementSelection SphereRefinementSelection( Sphere, setup.refinementLevels );
+   SphereBlockExclusion SphereBlockExclusion( Sphere );
+
+   setupBfs.addRefinementSelectionFunction(std::function<void(SetupBlockForest &)>(SphereRefinementSelection));
+   setupBfs.addBlockExclusionFunction(SphereBlockExclusion);
+
+   const AABB domain(real_t(0.0), real_t(0.0), real_t(0.0), setup.domainSize[0], setup.domainSize[1], setup.domainSize[2]);
+   setupBfs.addWorkloadMemorySUIDAssignmentFunction(blockforest::uniformWorkloadAndMemoryAssignment);
+   setupBfs.init(domain, setup.rootBlocks[0], setup.rootBlocks[1], setup.rootBlocks[2],
+                         setup.periodic[0], setup.periodic[1], setup.periodic[2]);
+   setupBfs.balanceLoad(blockforest::StaticLevelwiseCurveBalanceWeighted(), numProcesses);
+
+   WALBERLA_LOG_INFO_ON_ROOT("===========================  BLOCK FOREST STATISTICS ============================");
+   WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << setupBfs.getNumberOfBlocks())
+   for (uint_t level = 0; level <= setup.refinementLevels; level++){
+      const uint_t numberOfBlocks = setupBfs.getNumberOfBlocks(level);
+      WALBERLA_LOG_INFO_ON_ROOT("Level " << level << " Blocks: " << numberOfBlocks)
+   }
+
+   const real_t avgBlocksPerProc = real_c(setupBfs.getNumberOfBlocks()) / real_c(setupBfs.getNumberOfProcesses());
+   WALBERLA_LOG_INFO_ON_ROOT("Average blocks per process: " << avgBlocksPerProc);
+
+   const uint_t cellsPerBlock        = blockCells(setup, false);
+   const uint_t totalNumberCells     = setupBfs.getNumberOfBlocks() * cellsPerBlock;
+   const real_t averageCellsPerGPU   = avgBlocksPerProc * real_c(cellsPerBlock);
+   const uint_t cellsPerBlockGL      = blockCells(setup, true);
+   const uint_t totalNumberCellsGL   = setupBfs.getNumberOfBlocks() * cellsPerBlockGL;
+   const real_t averageCellsPerGPUGL = avgBlocksPerProc * real_c(cellsPerBlockGL);
+
+   const uint_t PDFsPerCell            = StorageSpecification_T::inplace ? Stencil_T::Q : 2 * Stencil_T::Q;
+   const uint_t valuesPerCell          = (PDFsPerCell + VelocityField_T::F_SIZE + ScalarField_T::F_SIZE);
+   const uint_t sizePerValue           = sizeof(StorageSpecification_T::value_type);
+   const double expectedMemory         = double_c(totalNumberCells * valuesPerCell * sizePerValue) * 1e-9;
+   const double expectedMemoryPerGPU   = double_c(averageCellsPerGPU * valuesPerCell * sizePerValue) * 1e-9;
+   const double expectedMemoryGL       = double_c(totalNumberCellsGL * valuesPerCell * sizePerValue) * 1e-9;
+   const double expectedMemoryPerGPUGL = double_c(averageCellsPerGPUGL * valuesPerCell * sizePerValue) * 1e-9;
+
+   WALBERLA_LOG_INFO_ON_ROOT( "Total number of cells will be " << totalNumberCells << " fluid cells (in total on all levels)")
+   WALBERLA_LOG_INFO_ON_ROOT( "Expected total memory demand will be " << expectedMemory << " GB")
+   WALBERLA_LOG_INFO_ON_ROOT( "Average memory demand per GPU will be " << expectedMemoryPerGPU << " GB")
+   WALBERLA_LOG_INFO_ON_ROOT( "Expected total memory demand (with GL) will be " << expectedMemoryGL << " GB")
+   WALBERLA_LOG_INFO_ON_ROOT( "Average memory demand per GPU (with GL) will be " << expectedMemoryPerGPUGL << " GB")
+
+   WALBERLA_LOG_INFO_ON_ROOT("=================================================================================")
+
+   if(mpi::MPIManager::instance()->numProcesses() > 1)
+      return;
+
+   if(setup.writeSetupForestAndReturn){
+      std::ostringstream oss;
+      oss << blockForestFilestem << ".bfs";
+      setupBfs.saveToFile(oss.str().c_str());
+
+      setupBfs.writeVTKOutput(blockForestFilestem);
+   }
+}
+
+void createBlockForest(shared_ptr< BlockForest >& bfs, const Setup& setup){
+   if (mpi::MPIManager::instance()->numProcesses() > 1){
+      // Load structured block forest from file
+      std::ostringstream oss;
+      oss << setup.blockForestFilestem << ".bfs";
+      const std::string setupBlockForestFilepath = oss.str();
+      std::ifstream infile(setupBlockForestFilepath.c_str());
+      if(!infile.good()){
+         WALBERLA_LOG_WARNING_ON_ROOT("Blockforest was not created beforehand and thus needs to be created on the fly. For large simulation runs this can be a severe problem!")
+         SetupBlockForest setupBfs;
+         createSetupBlockForest(setupBfs, setup, true);
+         bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()), setupBfs);
+      }
+      else{
+         bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()),
+                                               setupBlockForestFilepath.c_str(), false);
+      }
+   }
+   else{
+      SetupBlockForest setupBfs;
+      createSetupBlockForest(setupBfs, setup);
+      bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()), setupBfs);
+   }
+}
\ No newline at end of file
diff --git a/apps/showcases/FlowAroundSphere/Setup.h b/apps/showcases/FlowAroundSphere/Setup.h
new file mode 100644
index 0000000000000000000000000000000000000000..c33049e96f5d5c5fc925e07c607ab67e29e3a84a
--- /dev/null
+++ b/apps/showcases/FlowAroundSphere/Setup.h
@@ -0,0 +1,160 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file Setup.h
+//! \author Markus Holzer <markus.holzer@fau.de>
+//
+//======================================================================================================================
+#pragma once
+
+#include "core/config/Config.h"
+#include "core/DataTypes.h"
+#include "core/math/Vector3.h"
+
+#include <string>
+
+namespace walberla
+{
+struct Setup
+{
+
+   Setup(const Config::BlockHandle & parameters, const Config::BlockHandle & domainParameters,
+         const Config::BlockHandle & logging, const Config::BlockHandle & boundaries,
+         std::map<std::string, std::string>& infoMap)
+   {
+      blockForestFilestem         = domainParameters.getParameter< std::string >("blockForestFilestem", "blockforest");
+      numProcesses                = domainParameters.getParameter< uint_t >("numberProcesses");
+      const Vector3< real_t > ds  = domainParameters.getParameter< Vector3< real_t > >("domainSize");
+      const real_t coarseMeshSize = parameters.getParameter< real_t >("coarseMeshSize");
+      cellsPerBlock               = domainParameters.getParameter< Vector3< uint_t > >("cellsPerBlock");
+
+      rootBlocks[0] = uint_c(std::ceil( (ds[0] / coarseMeshSize) / real_c(cellsPerBlock[0])));
+      rootBlocks[1] = uint_c(std::ceil( (ds[1] / coarseMeshSize) / real_c(cellsPerBlock[1])));
+      rootBlocks[2] = uint_c(std::ceil( (ds[2] / coarseMeshSize) / real_c(cellsPerBlock[2])));
+
+      cells      = Vector3<uint_t>(rootBlocks[0] * cellsPerBlock[0], rootBlocks[1] * cellsPerBlock[1], rootBlocks[2] * cellsPerBlock[2]);
+      domainSize = Vector3<real_t>(cells) * coarseMeshSize;
+
+      periodic = domainParameters.getParameter< Vector3< bool > >("periodic");
+      refinementLevels = domainParameters.getParameter< uint_t >("refinementLevels");
+
+      sphereDiameter = parameters.getParameter< real_t >("diameterSphere") / coarseMeshSize;
+      sphereRadius   = sphereDiameter / real_c(2.0);
+
+      sphereXPosition = parameters.getParameter< real_t >("SphereXPosition") / coarseMeshSize;
+      sphereYPosition = real_c(cells[1]) / real_c(2.0);
+      sphereZPosition = real_c(cells[2]) / real_c(2.0);
+
+      reynoldsNumber    = parameters.getParameter< real_t >("reynoldsNumber");
+      referenceVelocity = parameters.getParameter< real_t >("referenceVelocity");
+      inflowVelocity    = parameters.getParameter< real_t >("latticeVelocity");
+
+      const real_t speedOfSound = real_c(real_c(1.0) / std::sqrt( real_c(3.0) ));
+      machNumber = inflowVelocity / speedOfSound;
+      viscosity  = real_c((inflowVelocity * sphereDiameter) / reynoldsNumber);
+      omega      = real_c(real_c(1.0) / (real_c(3.0) * viscosity + real_c(0.5)));
+
+      rho = real_c(1.0);
+      dxC = coarseMeshSize;
+      dxF = real_c(coarseMeshSize) / real_c( 1 << refinementLevels );
+      dt = inflowVelocity / referenceVelocity * coarseMeshSize;
+
+      resolutionSphere = parameters.getParameter< real_t >("diameterSphere") / dxF;
+
+      timesteps = parameters.getParameter< uint_t >("timesteps");
+      numGhostLayers = uint_c(2);
+      nbrOfEvaluationPointsForCoefficientExtremas = 100;
+
+      valuesPerCell = (Stencil_T::Q + VelocityField_T::F_SIZE + uint_c(2) * ScalarField_T::F_SIZE);
+      memoryPerCell = memory_t(valuesPerCell * sizeof(PdfField_T::value_type) + uint_c(1));
+      processMemoryLimit = parameters.getParameter< memory_t >( "processMemoryLimit", memory_t( 512 ) ) * memory_t( 1024 * 1024 );
+
+      stencil          = infoMap["stencil"];
+      streamingPattern = infoMap["streamingPattern"];
+      collisionModel   = infoMap["collisionOperator"];
+
+      fluidUID    = FlagUID("Fluid");
+      obstacleUID = FlagUID(boundaries.getParameter< std::string >("sphere"));
+      inflowUID   = FlagUID(boundaries.getParameter< std::string >("inflow"));
+      outflowUID  = FlagUID(boundaries.getParameter< std::string >("outflow"));
+      wallUID     = FlagUID(boundaries.getParameter< std::string >("walls"));
+
+      writeSetupForestAndReturn = logging.getParameter< bool >("writeSetupForestAndReturn", false);
+
+   }
+
+   void writeParameterHeader(std::ofstream& file)
+   {
+      file << "ReynoldsNumber" << "," << "machNumber" << "," << "coarseMeshSize" << "," << "fineMeshSize" << "," << "resolutionSphere" << ",";
+      file << "refinementLevels" << "," << "stencil" << "," << "streamingPattern" << "," << "collisionOperator" << "," << "omega-coarse" << "\n";
+
+      file << reynoldsNumber << "," << machNumber << "," << dxC << "," << dxF << "," << resolutionSphere << ",";
+      file << refinementLevels << "," << stencil << "," << streamingPattern << "," << collisionModel << "," << omega << "\n";
+   }
+
+   std::string blockForestFilestem;
+   uint_t numProcesses;
+
+   Vector3<uint_t> rootBlocks;
+   Vector3<uint_t> cells;
+   Vector3<real_t> domainSize;
+
+   Vector3< bool > periodic;
+   uint_t refinementLevels;
+
+   Vector3< uint_t > cellsPerBlock;
+   uint_t numGhostLayers;
+
+   real_t sphereXPosition;
+   real_t sphereYPosition;
+   real_t sphereZPosition;
+   real_t sphereRadius;
+   real_t sphereDiameter;
+   real_t resolutionSphere;
+
+   uint_t nbrOfEvaluationPointsForCoefficientExtremas;
+
+   real_t machNumber;
+   real_t reynoldsNumber;
+
+   real_t viscosity;
+   real_t omega;
+   real_t rho;
+   real_t inflowVelocity;
+   real_t referenceVelocity;
+   real_t dxC;
+   real_t dxF;
+   real_t dt;
+
+   uint_t timesteps;
+
+   uint_t valuesPerCell;
+   memory_t memoryPerCell;
+   memory_t processMemoryLimit;
+
+   std::string stencil;
+   std::string streamingPattern;
+   std::string collisionModel;
+
+   FlagUID fluidUID;
+   FlagUID obstacleUID;
+   FlagUID inflowUID;
+   FlagUID outflowUID;
+   FlagUID wallUID;
+
+   bool writeSetupForestAndReturn;
+};
+
+}
\ No newline at end of file
diff --git a/apps/showcases/FlowAroundSphere/Sphere.cpp b/apps/showcases/FlowAroundSphere/Sphere.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7e31c3d3eb97ca19f8eb2d2c8ac5b3911da438f2
--- /dev/null
+++ b/apps/showcases/FlowAroundSphere/Sphere.cpp
@@ -0,0 +1,193 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file Sphere.cpp
+//! \author Markus Holzer <markus.holzer@fau.de>
+//
+//======================================================================================================================
+
+#include "Sphere.h"
+
+namespace walberla
+{
+
+bool Sphere::contains(const Vector3< real_t >& point) const
+{
+   return (point - center_).sqrLength() <= radius2_;
+}
+
+bool Sphere::contains(const AABB& aabb) const
+{
+   Vector3< real_t > p[8];
+   p[0].set(aabb.xMin(), aabb.yMin(), aabb.zMin());
+   p[1].set(aabb.xMax(), aabb.yMin(), aabb.zMin());
+   p[2].set(aabb.xMin(), aabb.yMax(), aabb.zMin());
+   p[3].set(aabb.xMax(), aabb.yMax(), aabb.zMin());
+   p[4].set(aabb.xMin(), aabb.yMin(), aabb.zMax());
+   p[5].set(aabb.xMax(), aabb.yMin(), aabb.zMax());
+   p[6].set(aabb.xMin(), aabb.yMax(), aabb.zMax());
+   p[7].set(aabb.xMax(), aabb.yMax(), aabb.zMax());
+   return contains(p[0]) && contains(p[1]) && contains(p[2]) && contains(p[3]) && contains(p[4]) && contains(p[5]) &&
+          contains(p[6]) && contains(p[7]);
+}
+
+real_t Sphere::delta(const Vector3< real_t >& fluid, const Vector3< real_t >& boundary) const
+{
+   WALBERLA_ASSERT(!contains(fluid))
+   WALBERLA_ASSERT(contains(boundary))
+
+   // http://devmag.org.za/2009/04/17/basic-collision-detection-in-2d-part-2/
+   const Vector3< real_t > f = fluid - center_;
+   const Vector3< real_t > d = (boundary - center_) - f;
+
+   const real_t dDotd = d[0] * d[0] + d[1] * d[1] + d[2] * d[2];
+   const real_t fDotf = f[0] * f[0] + f[1] * f[1] + f[2] * f[2];
+   const real_t dDotf = d[0] * f[0] + d[1] * f[1] + d[2] * f[2];
+
+   const real_t b = real_c(2.0) * dDotf;
+   const real_t c = fDotf - radius2_;
+
+   const real_t bb4ac = b * b - (real_c(4.0) * dDotd * c);
+   WALBERLA_CHECK_GREATER_EQUAL(bb4ac, real_c(0.0))
+
+   const real_t sqrtbb4ac = std::sqrt(bb4ac);
+   const real_t alpha = std::min((-b + sqrtbb4ac) / (real_c(2.0) * dDotd), (-b - sqrtbb4ac) / (real_c(2.0) * dDotd));
+
+   WALBERLA_CHECK_GREATER_EQUAL(alpha, real_c(0.0))
+   WALBERLA_CHECK_LESS_EQUAL(alpha, real_c(1.0))
+
+   return alpha;
+}
+
+void Sphere::setupBoundary(const std::shared_ptr< StructuredBlockForest >& sbfs, const BlockDataID flagFieldID)
+{
+
+   for (auto bIt = sbfs->begin(); bIt != sbfs->end(); ++bIt)
+   {
+      auto flagField       = bIt->getData< FlagField_T >(flagFieldID);
+      const FlagField_T::flag_t inflowFlag  = flagField->registerFlag(setup_.inflowUID);
+      const FlagField_T::flag_t outflowFlag = flagField->registerFlag(setup_.outflowUID);
+      const FlagField_T::flag_t wallFlag    = flagField->registerFlag(setup_.wallUID);
+
+      const cell_idx_t gls = cell_idx_c(flagField->nrOfGhostLayers()) - cell_idx_c(1);
+
+      CellInterval blockBB(-1, -1, -1,
+                           cell_idx_c(setup_.cellsPerBlock[0]), cell_idx_c(setup_.cellsPerBlock[1]), cell_idx_c(setup_.cellsPerBlock[2]));
+
+      // inflow WEST
+      if(sbfs->atDomainXMinBorder(*bIt)){
+         CellInterval west(blockBB.xMin() - gls, blockBB.yMin() - gls, blockBB.zMin() - gls, blockBB.xMin(),
+                           blockBB.yMax() + gls, blockBB.zMax() + gls);
+         setBoundaryFromCellInterval(west, inflowFlag, flagField);
+      }
+
+      // outflow EAST
+      if(sbfs->atDomainXMaxBorder(*bIt)){
+         CellInterval east(blockBB.xMax(), blockBB.yMin() - gls, blockBB.zMin() - gls, blockBB.xMax() + gls,
+                           blockBB.yMax() + gls, blockBB.zMax() + gls);
+         setBoundaryFromCellInterval(east, outflowFlag, flagField);
+      }
+
+      // SOUTH
+      if(sbfs->atDomainYMinBorder(*bIt))
+      {
+         CellInterval south(blockBB.xMin() - gls, blockBB.yMin() - gls, blockBB.zMin() - gls, blockBB.xMax() + gls,
+                            blockBB.yMin(), blockBB.zMax() + gls);
+         setBoundaryFromCellInterval(south, wallFlag, flagField);
+      }
+
+      // NORTH
+      if(sbfs->atDomainYMaxBorder(*bIt)){
+         CellInterval north( blockBB.xMin() - gls, blockBB.yMax(), blockBB.zMin() - gls, blockBB.xMax() + gls,
+                             blockBB.yMax() + gls, blockBB.zMax() + gls );
+         setBoundaryFromCellInterval(north, wallFlag, flagField);
+      }
+
+      // BOTTOM
+      if(sbfs->atDomainZMinBorder(*bIt)){
+         CellInterval bottom(blockBB.xMin() - gls, blockBB.yMin() - gls, blockBB.zMin() - gls, blockBB.xMax() + gls,
+                             blockBB.yMax() + gls, blockBB.zMin());
+         setBoundaryFromCellInterval(bottom, wallFlag, flagField);
+      }
+
+      // TOP
+      if(sbfs->atDomainZMaxBorder(*bIt)){
+         CellInterval top(blockBB.xMin() - gls, blockBB.yMin() - gls, blockBB.zMax(), blockBB.xMax() + gls,
+                          blockBB.yMax() + gls, blockBB.zMax() + gls);
+         setBoundaryFromCellInterval(top, wallFlag, flagField);
+      }
+   }
+
+   checkConsistency(sbfs, flagFieldID);
+   setupSphereBoundary(sbfs, flagFieldID);
+}
+
+void Sphere::setBoundaryFromCellInterval(CellInterval& cells, const FlagField_T::flag_t flag, FlagField_T* flagField)
+{
+
+   for (auto cell = cells.begin(); cell != cells.end(); ++cell){
+      if(flagField->get(cell->x(), cell->y(), cell->z()) == FlagField_T::flag_t(0))
+         flagField->addFlag(cell->x(), cell->y(), cell->z(), flag);
+   }
+}
+
+void Sphere::checkConsistency(const std::shared_ptr< StructuredBlockForest >& sbfs, const BlockDataID flagFieldID){
+   const uint_t depth = sbfs->getDepth();
+   for (auto bIt = sbfs->begin(); bIt != sbfs->end(); ++bIt)
+   {
+      Block& b             = dynamic_cast< Block& >(*bIt);
+      auto flagField       = b.getData< FlagField_T >(flagFieldID);
+      if (sbfs->getLevel(b) < depth){
+         for( auto it = flagField->beginWithGhostLayer(1); it != flagField->end(); ++it ){
+            Vector3< real_t > cellCenter = sbfs->getBlockLocalCellCenter( b, it.cell() );
+            sbfs->mapToPeriodicDomain(cellCenter);
+            WALBERLA_CHECK(!contains(cellCenter), "The sphere must be completely located on the finest level")
+         }
+      }
+   }
+}
+
+void Sphere::setupSphereBoundary(const std::shared_ptr< StructuredBlockForest >& sbfs, const BlockDataID flagFieldID){
+   const uint_t depth = sbfs->getDepth();
+   for (auto bIt = sbfs->begin(); bIt != sbfs->end(); ++bIt)
+   {
+      Block& b             = dynamic_cast< Block& >(*bIt);
+      auto flagField       = b.getData< FlagField_T >(flagFieldID);
+      uint8_t obstacleFlag = flagField->registerFlag(setup_.obstacleUID);
+
+      if (sbfs->getLevel(b) == depth){
+         for( auto it = flagField->beginWithGhostLayer(1); it != flagField->end(); ++it ){
+            Vector3< real_t > cellCenter = sbfs->getBlockLocalCellCenter( b, it.cell() );
+            sbfs->mapToPeriodicDomain(cellCenter);
+            if (contains(cellCenter)) { flagField->addFlag(it.x(), it.y(), it.z(), obstacleFlag); }
+         }
+      }
+   }
+}
+
+real_t wallDistance::operator()(const Cell& fluidCell, const Cell& boundaryCell,
+                                const shared_ptr< StructuredBlockForest >& SbF, IBlock& block) const
+{
+   Vector3< real_t > boundary = SbF->getBlockLocalCellCenter( block, boundaryCell );
+   Vector3< real_t > fluid = SbF->getBlockLocalCellCenter( block, fluidCell );
+   SbF->mapToPeriodicDomain(boundary);
+   SbF->mapToPeriodicDomain(fluid);
+
+   WALBERLA_CHECK(!sphere_.contains(fluid), "fluid cell is in contained in sphere (" << fluid[0] << ", " << fluid[1] << ", " << fluid[2] << "). The block local cell is " << fluidCell)
+   WALBERLA_CHECK(sphere_.contains(boundary), "boundary cell is not in contained in sphere (" << boundary[0] << ", " << boundary[1] << ", " << boundary[2] << "). The block local cell is " << boundaryCell)
+
+   return sphere_.delta( fluid, boundary );
+}
+} // namespace walberla
\ No newline at end of file
diff --git a/apps/showcases/FlowAroundSphere/Sphere.h b/apps/showcases/FlowAroundSphere/Sphere.h
new file mode 100644
index 0000000000000000000000000000000000000000..1ebb802a94561a0b35c0e7b226428be338dd2076
--- /dev/null
+++ b/apps/showcases/FlowAroundSphere/Sphere.h
@@ -0,0 +1,177 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file Sphere.h
+//! \author Markus Holzer <markus.holzer@fau.de>
+//
+//======================================================================================================================
+#pragma once
+
+#include "blockforest/SetupBlock.h"
+#include "blockforest/SetupBlockForest.h"
+#include "blockforest/StructuredBlockForest.h"
+
+#include "domain_decomposition/IBlock.h"
+
+#include "field/FlagUID.h"
+
+#include "core/DataTypes.h"
+#include "core/math/AABB.h"
+#include "core/math/Vector3.h"
+#include "core/cell/Cell.h"
+
+#include "stencil/D3Q7.h"
+#include "stencil/D3Q27.h"
+
+#include "Types.h"
+#include "Setup.h"
+
+namespace walberla
+{
+
+class Sphere
+{
+ public:
+   Sphere(const Setup& setup) : setup_(setup)
+   {
+      const real_t px = setup_.sphereXPosition * setup_.dxC;
+      const real_t py = setup_.sphereYPosition * setup_.dxC;
+      const real_t pz = setup_.sphereZPosition * setup_.dxC;
+
+      center_  = Vector3< real_t >(px, py, pz);
+      radius_  = setup_.sphereRadius * setup_.dxC;
+      radius2_ = radius_ * radius_;
+   }
+
+   bool operator()(const Vector3< real_t >& point) const { return contains(point); }
+
+   bool contains(const Vector3< real_t >& point) const;
+   bool contains(const AABB& aabb) const;
+
+   real_t delta(const Vector3< real_t >& fluid, const Vector3< real_t >& boundary) const;
+   Setup getSetup(){ return setup_; }
+   void setupBoundary(const std::shared_ptr< StructuredBlockForest >& sbfs, const BlockDataID flagFieldID);
+   void checkConsistency(const std::shared_ptr< StructuredBlockForest >& sbfs, const BlockDataID flagFieldID);
+   void setupSphereBoundary(const std::shared_ptr< StructuredBlockForest >& sbfs, const BlockDataID flagFieldID);
+   void setBoundaryFromCellInterval(CellInterval& cells, const FlagField_T::flag_t flag, FlagField_T* flagField);
+
+ private:
+   Setup setup_;
+   Vector3< real_t > center_;
+   real_t radius_;
+   real_t radius2_;
+
+}; // class Sphere
+
+class SphereRefinementSelection
+{
+ public:
+   SphereRefinementSelection(const Sphere& sphere, const uint_t level)
+      : sphere_(sphere), level_(level)
+   {
+      auto setup = sphere_.getSetup();
+      const real_t px = setup.sphereXPosition * setup.dxC;
+      const real_t py = setup.sphereYPosition * setup.dxC;
+      const real_t pz = setup.sphereZPosition * setup.dxC;
+
+      center_  = Vector3< real_t >(px, py, pz);
+      const real_t sphereRadius = setup.sphereRadius * setup.dxC;
+      const real_t bufferDistance = setup.dxF;
+      const real_t d = sphereRadius + bufferDistance;
+      radius2_ = d * d;
+
+      sphereBoundingBox1_ = AABB(center_[0], center_[1] - d, center_[2] - d,
+                                center_[0] + (real_c(2.5) * sphereRadius), center_[1] + d, center_[2] + d);
+
+      sphereBoundingBox2_ = AABB(center_[0], center_[1] - d, center_[2] - d,
+                                 center_[0] + (real_c(5) * sphereRadius), center_[1] + d, center_[2] + d);
+   }
+
+   bool contains(const Vector3< real_t >& point) const
+   {
+      return (point - center_).sqrLength() <= radius2_;
+   }
+
+   bool intersects(const AABB& aabb) const
+   {
+      Vector3< real_t > p[8];
+      p[0].set(aabb.xMin(), aabb.yMin(), aabb.zMin());
+      p[1].set(aabb.xMax(), aabb.yMin(), aabb.zMin());
+      p[2].set(aabb.xMin(), aabb.yMax(), aabb.zMin());
+      p[3].set(aabb.xMax(), aabb.yMax(), aabb.zMin());
+      p[4].set(aabb.xMin(), aabb.yMin(), aabb.zMax());
+      p[5].set(aabb.xMax(), aabb.yMin(), aabb.zMax());
+      p[6].set(aabb.xMin(), aabb.yMax(), aabb.zMax());
+      p[7].set(aabb.xMax(), aabb.yMax(), aabb.zMax());
+      return contains(p[0]) || contains(p[1]) || contains(p[2]) || contains(p[3]) || contains(p[4]) || contains(p[5]) ||
+             contains(p[6]) || contains(p[7]);
+   }
+
+   void operator()(SetupBlockForest& forest)
+   {
+      if(level_ == 0)
+         return;
+      for (auto block = forest.begin(); block != forest.end(); ++block)
+      {
+         const AABB& aabb = block->getAABB();
+
+         if (block->getLevel() < level_ && (intersects(aabb) || sphereBoundingBox1_.intersects(aabb)) )
+            block->setMarker(true);
+
+         if (block->getLevel() < (level_ - 1) && (intersects(aabb) || sphereBoundingBox2_.intersects(aabb)) )
+            block->setMarker(true);
+      }
+   }
+
+ private:
+   Sphere sphere_;
+   Vector3< real_t > center_;
+   uint_t level_;
+   AABB sphereBoundingBox1_;
+   AABB sphereBoundingBox2_;
+   real_t radius2_;
+
+}; // class SphereRefinementSelection
+
+class SphereBlockExclusion
+{
+ public:
+   SphereBlockExclusion(const Sphere& sphere) : sphere_(sphere) {}
+
+   bool operator()(const blockforest::SetupBlock& block)
+   {
+      const AABB aabb = block.getAABB();
+      return static_cast< bool >(sphere_.contains(aabb));
+   }
+
+ private:
+   Sphere sphere_;
+
+}; // class SphereBlockExclusion
+
+
+class wallDistance
+{
+ public:
+   wallDistance(const Sphere& sphere) : sphere_(sphere) {}
+
+   real_t operator()(const Cell& fluidCell, const Cell& boundaryCell, const shared_ptr< StructuredBlockForest >& SbF,
+                     IBlock& block) const;
+
+ private:
+   Sphere sphere_;
+}; // class wallDistance
+
+} // namespace walberla
\ No newline at end of file
diff --git a/apps/showcases/FlowAroundSphere/Types.h b/apps/showcases/FlowAroundSphere/Types.h
new file mode 100644
index 0000000000000000000000000000000000000000..fbf11d7d7bf2a08539354a293035a0aa401c7c68
--- /dev/null
+++ b/apps/showcases/FlowAroundSphere/Types.h
@@ -0,0 +1,47 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file Types.h
+//! \author Markus Holzer <markus.holzer@fau.de>
+//
+//======================================================================================================================
+# pragma once
+
+#include "domain_decomposition/BlockDataID.h"
+#include "lbm_generated/field/PdfField.h"
+#include "FlowAroundSphereInfoHeader.h"
+
+using namespace walberla;
+
+using StorageSpecification_T = lbm::FlowAroundSphereStorageSpecification;
+using Stencil_T              = StorageSpecification_T::Stencil;
+using CommunicationStencil_T = StorageSpecification_T::CommunicationStencil;
+
+using PdfField_T           = lbm_generated::PdfField< StorageSpecification_T >;
+using FlagField_T          = FlagField< uint8_t >;
+
+struct IDs
+{
+   BlockDataID pdfField;
+   BlockDataID velocityField;
+   BlockDataID densityField;
+   BlockDataID omegaField;
+   BlockDataID flagField;
+
+   BlockDataID pdfFieldGPU;
+   BlockDataID velocityFieldGPU;
+   BlockDataID densityFieldGPU;
+   BlockDataID omegaFieldGPU;
+};
diff --git a/src/lbm_generated/refinement/BasicRecursiveTimeStep.h b/src/lbm_generated/refinement/BasicRecursiveTimeStep.h
index 7c6fdc828efd635abda7775bc46e078240d0f4b6..7096a5407b0da8ecb7066dd8a7661a918073bd1f 100644
--- a/src/lbm_generated/refinement/BasicRecursiveTimeStep.h
+++ b/src/lbm_generated/refinement/BasicRecursiveTimeStep.h
@@ -31,6 +31,7 @@
 namespace walberla {
 
 using blockforest::communication::NonUniformBufferedScheme;
+using BlockFunction = std::function<void (const uint_t)>; // parameters: block, level
 
 namespace lbm_generated {
 
@@ -73,11 +74,14 @@ class BasicRecursiveTimeStep
    void operator() () { timestep(0); };
    void addRefinementToTimeLoop(SweepTimeloop & timeloop, uint_t level=0);
 
+   inline void addPostBoundaryHandlingBlockFunction( const BlockFunction & function );
+
  private:
    void timestep(uint_t level);
    void ghostLayerPropagation(Block * block);
-   std::function<void()> executeStreamCollideOnLevel(uint_t level, bool withGhostLayerPropagation=false);
-   std::function<void()> executeBoundaryHandlingOnLevel(uint_t level);
+   std::function< void() > executeStreamCollideOnLevel(uint_t level, bool withGhostLayerPropagation=false);
+   std::function< void() > executeBoundaryHandlingOnLevel(uint_t level);
+   std::function< void() > executePostBoundaryBlockFunctions(uint_t level);
 
    std::shared_ptr< StructuredBlockForest > sbfs_;
    uint_t maxLevel_;
@@ -89,6 +93,8 @@ class BasicRecursiveTimeStep
 
    SweepCollection_T & sweepCollection_;
    BoundaryCollection_T & boundaryCollection_;
+
+   std::vector< BlockFunction >  globalPostBoundaryHandlingBlockFunctions_;
 };
 
 } // namespace lbm_generated
diff --git a/src/lbm_generated/refinement/BasicRecursiveTimeStep.impl.h b/src/lbm_generated/refinement/BasicRecursiveTimeStep.impl.h
index 3abb8a911d68072211d4660a4ee68b24ada22979..2888628cef57c774964702f300c6af6d084f4a76 100644
--- a/src/lbm_generated/refinement/BasicRecursiveTimeStep.impl.h
+++ b/src/lbm_generated/refinement/BasicRecursiveTimeStep.impl.h
@@ -111,6 +111,7 @@ void BasicRecursiveTimeStep< PdfField_T, SweepCollection_T, BoundaryCollection_T
 
    // 1.5 Boundary Handling and Coalescence Preparation
    timeloop.addFuncBeforeTimeStep(executeBoundaryHandlingOnLevel(level), "Refinement Cycle: boundary handling on level " + std::to_string(level));
+   timeloop.addFuncBeforeTimeStep(executePostBoundaryBlockFunctions(level), "Refinement Cycle: post boundary handling block functions on level " + std::to_string(level));
 
    // 1.6 Fine to Coarse Communication, receiving end
    if(level < maxLevel_){
@@ -134,10 +135,12 @@ void BasicRecursiveTimeStep< PdfField_T, SweepCollection_T, BoundaryCollection_T
 
    // 2.5 Boundary Handling and Coalescence Preparation
    timeloop.addFuncBeforeTimeStep(executeBoundaryHandlingOnLevel(level), "Refinement Cycle: boundary handling on level " + std::to_string(level));
+   timeloop.addFuncBeforeTimeStep(executePostBoundaryBlockFunctions(level), "Refinement Cycle: post boundary handling block functions on level " + std::to_string(level));
 
    // 2.6 Fine to Coarse Communication, receiving end
-   if(level < maxLevel_)
+   if(level < maxLevel_){
       timeloop.addFuncBeforeTimeStep(commScheme_->communicateFineToCoarseFunctor(level + 1), "Refinement Cycle: communicate fine to coarse on level " + std::to_string(level + 1));
+   }
 
 }
 
@@ -176,6 +179,16 @@ std::function<void()>  BasicRecursiveTimeStep< PdfField_T, SweepCollection_T, Bo
    };
 }
 
+template< typename PdfField_T, typename SweepCollection_T, typename BoundaryCollection_T >
+std::function<void()>  BasicRecursiveTimeStep< PdfField_T, SweepCollection_T, BoundaryCollection_T >::executePostBoundaryBlockFunctions(uint_t level)
+{
+   return [level, this]() {
+      for( const auto& func : globalPostBoundaryHandlingBlockFunctions_ ){
+         func(level);
+      }
+   };
+}
+
 
 template< typename PdfField_T, typename SweepCollection_T, typename BoundaryCollection_T >
 void BasicRecursiveTimeStep< PdfField_T, SweepCollection_T, BoundaryCollection_T >::ghostLayerPropagation(Block * block)
@@ -193,73 +206,11 @@ void BasicRecursiveTimeStep< PdfField_T, SweepCollection_T, BoundaryCollection_T
    }
 }
 
-// Refinement Timestep from post collision state:
-//template< typename PdfField_T, typename LbSweep_T >
-//void BasicRecursiveTimeStep< PdfField_T, LbSweep_T >::timestep(uint_t level)
-//{
-//   std::vector<Block *> blocks;
-//   sbfs_->getBlocks(blocks, level);
-//
-//   uint_t maxLevel = sbfs_->getDepth();
-//
-//   // 1.1 Equal-Level Communication
-//   commScheme_->communicateEqualLevel(level);
-//
-//   // 1.2 Coarse to Fine Communication
-//   if(level < maxLevel){
-//      commScheme_->communicateCoarseToFine(level + 1);
-//   }
-//
-//   // 1.3 Boundary Handling and
-//   // 1.4 Prepare Coalescence (which happens during the recursive descent)
-//   for(auto b : blocks){
-//      boundaryFunctor_(b);
-//      if(level != maxLevel) pdfFieldPackInfo_->prepareCoalescence(b);
-//   }
-//
-//   // 1.5 Recursive Descent
-//   if(level < maxLevel){
-//      timestep(level + 1);
-//   }
-//
-//   // 1.6 First Collision and ghost-layer propagation
-//   for(auto b: blocks){
-//      if(level != 0) ghostLayerPropagation(b);  // GL-Propagation first without swapping arrays...
-//      sweepCollection_.streamCollide(b);                // then Stream-Collide on interior, and swap arrays
-//   }
-//
-//   // Stop here if on coarsest level.
-//   // Otherwise, continue to second subcycle.
-//   if(level == 0) return;
-//
-//   // 2.1 Equal-Level Communication
-//   commScheme_->communicateEqualLevel(level);
-//
-//   // 2.2 Coarse to Fine Communication
-//   if(level < maxLevel){
-//      commScheme_->communicateCoarseToFine(level + 1);
-//   }
-//
-//   // 2.3 Boundary Handling and
-//   // 2.4 Prepare Coalescence (which happens during the recursive descent)
-//   for(auto b : blocks){
-//      boundaryFunctor_(b);
-//      if(level != maxLevel) pdfFieldPackInfo_->prepareCoalescence(b);
-//   }
-//
-//   // 2.5 Recursive Descent
-//   if(level < maxLevel){
-//      timestep(level + 1);
-//   }
-//
-//   // 2.6 Fine to Coarse Communication
-//   commScheme_->communicateFineToCoarse(level);
-//
-//   // 2.7 Second Collision
-//   for(auto b: blocks){
-//      sweepCollection_.streamCollide(b);
-//   }
-//}
+template< typename PdfField_T, typename SweepCollection_T, typename BoundaryCollection_T >
+inline void BasicRecursiveTimeStep< PdfField_T, SweepCollection_T, BoundaryCollection_T >::addPostBoundaryHandlingBlockFunction( const BlockFunction & function )
+{
+   globalPostBoundaryHandlingBlockFunctions_.emplace_back( function );
+}
 
 } // namespace lbm_generated
 } // namespace walberla