Compare revisions

efcd425a · c8978129 · 3cecbd2d · 5596bbaf · 24f6b569 · 5226f674
--- a/.gitignore
+++ b/.gitignore
@@ -10,6 +10,7 @@ qrc_*

 # CLion indexing
 *.uuid
+.fleet


 # Generated files
@@ -32,11 +33,13 @@ qrc_*
 # Visual Studio Code
 /.vscode

+# Zed
+/.cache*
+
 # CLion
 *.idea
 *.clion*

-
 # QtCreator
 CMakeLists.txt.user.*


--- a/apps/benchmarks/CMakeLists.txt
+++ b/apps/benchmarks/CMakeLists.txt
@@ -22,7 +22,6 @@ if ( WALBERLA_BUILD_WITH_PYTHON )
   add_subdirectory( FieldCommunication )

   if ( WALBERLA_BUILD_WITH_CODEGEN )
-      add_subdirectory( FlowAroundSphereCodeGen )
      add_subdirectory( UniformGridCPU )
      add_subdirectory( PhaseFieldAllenCahn )
      add_subdirectory( NonUniformGridCPU )

--- a/apps/benchmarks/CouetteFlow/CouetteFlow.cpp
+++ b/apps/benchmarks/CouetteFlow/CouetteFlow.cpp
@@ -988,7 +988,7 @@ int main( int argc, char **argv )
   }   

   logging::Logging::printHeaderOnStream();
-   //WALBERLA_ROOT_SECTION() { logging::Logging::instance()->setLogLevel( logging::Logging::PROGRESS ); }
+   WALBERLA_ROOT_SECTION() { logging::Logging::instance()->setLogLevel( logging::Logging::INFO ); }

 #ifdef _OPENMP
   if( std::getenv( "OMP_NUM_THREADS" ) == nullptr )

--- a/apps/benchmarks/FlowAroundSphereCodeGen/CMakeLists.txt
+++ b/apps/benchmarks/FlowAroundSphereCodeGen/CMakeLists.txt
-waLBerla_link_files_to_builddir( "*.py" )
-
-waLBerla_generate_target_from_python(NAME FlowAroundSphereGenerated
-        FILE FlowAroundSphereCodeGen.py
-        OUT_FILES FlowAroundSphereCodeGen_LbSweep.${CODEGEN_FILE_SUFFIX} FlowAroundSphereCodeGen_LbSweep.h
-        FlowAroundSphereCodeGen_MacroSetter.${CODEGEN_FILE_SUFFIX} FlowAroundSphereCodeGen_MacroSetter.h
-        FlowAroundSphereCodeGen_UBB.${CODEGEN_FILE_SUFFIX} FlowAroundSphereCodeGen_UBB.h
-        FlowAroundSphereCodeGen_NoSlip.${CODEGEN_FILE_SUFFIX} FlowAroundSphereCodeGen_NoSlip.h
-        FlowAroundSphereCodeGen_Outflow.${CODEGEN_FILE_SUFFIX} FlowAroundSphereCodeGen_Outflow.h
-        FlowAroundSphereCodeGen_PackInfoEven.${CODEGEN_FILE_SUFFIX} FlowAroundSphereCodeGen_PackInfoEven.h
-        FlowAroundSphereCodeGen_PackInfoOdd.${CODEGEN_FILE_SUFFIX} FlowAroundSphereCodeGen_PackInfoOdd.h
-        FlowAroundSphereCodeGen_InfoHeader.h)
-
-if (WALBERLA_BUILD_WITH_GPU_SUPPORT )
-    waLBerla_add_executable( NAME FlowAroundSphereCodeGen FILE FlowAroundSphereCodeGen.cpp
-            DEPENDS blockforest boundary core gpu domain_decomposition field geometry python_coupling timeloop vtk FlowAroundSphereGenerated)
-else ()
-    waLBerla_add_executable( NAME FlowAroundSphereCodeGen FILE FlowAroundSphereCodeGen.cpp
-            DEPENDS blockforest boundary core domain_decomposition field geometry python_coupling timeloop vtk FlowAroundSphereGenerated)
-endif (WALBERLA_BUILD_WITH_GPU_SUPPORT )
\ No newline at end of file
--- a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp
+++ b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp
-//======================================================================================================================
-//
-//  This file is part of waLBerla. waLBerla is free software: you can
-//  redistribute it and/or modify it under the terms of the GNU General Public
-//  License as published by the Free Software Foundation, either version 3 of
-//  the License, or (at your option) any later version.
-//
-//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
-//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-//  for more details.
-//
-//  You should have received a copy of the GNU General Public License along
-//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
-//
-//! \file FlowAroundSphereCodeGen.cpp
-//! \author Frederik Hennig <frederik.hennig@fau.de>
-//! \author Markus Holzer <markus.holzer@fau.de>
-//
-//======================================================================================================================
-#include "blockforest/all.h"
-
-#include "core/all.h"
-
-#include "domain_decomposition/all.h"
-
-#include "field/all.h"
-
-#include "geometry/all.h"
-
-#include "lbm/inplace_streaming/TimestepTracker.h"
-#include "lbm/vtk/QCriterion.h"
-
-#include "python_coupling/CreateConfig.h"
-#include "python_coupling/PythonCallback.h"
-
-#include "timeloop/all.h"
-
-#if defined(WALBERLA_BUILD_WITH_CUDA)
-#   include "gpu/AddGPUFieldToStorage.h"
-#   include "gpu/DeviceSelectMPI.h"
-#   include "gpu/HostFieldAllocator.h"
-#   include "gpu/NVTX.h"
-#   include "gpu/ParallelStreams.h"
-#   include "gpu/communication/GPUPackInfo.h"
-#   include "gpu/communication/UniformGPUScheme.h"
-#endif
-
-// CodeGen includes
-#include "FlowAroundSphereCodeGen_InfoHeader.h"
-
-namespace walberla
-{
-typedef lbm::FlowAroundSphereCodeGen_PackInfoEven PackInfoEven_T;
-typedef lbm::FlowAroundSphereCodeGen_PackInfoOdd PackInfoOdd_T;
-
-typedef walberla::uint8_t flag_t;
-typedef FlagField< flag_t > FlagField_T;
-
-#if defined(WALBERLA_BUILD_WITH_CUDA)
-typedef gpu::GPUField< real_t > GPUField;
-#endif
-
-using namespace std::placeholders;
-
-auto pdfFieldAdder = [](IBlock* const block, StructuredBlockStorage* const storage) {
-   return new PdfField_T(storage->getNumberOfXCells(*block), storage->getNumberOfYCells(*block),
-                         storage->getNumberOfZCells(*block), uint_t(1), field::fzyx,
-                         make_shared< field::AllocateAligned< real_t, 64 > >());
-};
-
-auto VelocityCallback = [](const Cell& pos, const shared_ptr< StructuredBlockForest >& SbF, IBlock& block,
-                           real_t inflow_velocity, const bool constant_inflow = true) {
-   if (constant_inflow)
-   {
-      Vector3< real_t > result(inflow_velocity, real_c(0.0), real_c(0.0));
-      return result;
-   }
-   else
-   {
-      Cell globalCell;
-      CellInterval domain = SbF->getDomainCellBB();
-      auto h_y          = real_c(domain.ySize());
-      auto h_z          = real_c(domain.zSize());
-      SbF->transformBlockLocalToGlobalCell(globalCell, block, pos);
-
-      auto y1 = real_c(globalCell[1] - (h_y / 2.0 - 0.5));
-      auto z1 = real_c(globalCell[2] - (h_z / 2.0 - 0.5));
-
-      real_t u = (inflow_velocity * real_c(16.0)) / (h_y * h_y * h_z * h_z) * (h_y / real_c(2.0) - y1) *
-                 (h_y / real_c(2.0) + y1) * (h_z / real_c(2.0) - z1) * (h_z / real_c(2.0) + z1);
-
-      Vector3< real_t > result(u, 0.0, 0.0);
-      return result;
-   }
-};
-
-class AlternatingBeforeFunction
-{
- public:
-   typedef std::function< void() > BeforeFunction;
-
-   AlternatingBeforeFunction(BeforeFunction evenFunc, BeforeFunction oddFunc,
-                             std::shared_ptr< lbm::TimestepTracker >& tracker)
-      : tracker_(tracker), funcs_{ evenFunc, oddFunc } {};
-
-   void operator()() { funcs_[tracker_->getCounter()](); }
-
- private:
-   std::shared_ptr< lbm::TimestepTracker > tracker_;
-   std::vector< BeforeFunction > funcs_;
-};
-
-class Filter
-{
- public:
-   explicit Filter(Vector3< uint_t > numberOfCells) : numberOfCells_(numberOfCells) {}
-
-   void operator()(const IBlock& /*block*/) {}
-
-   bool operator()(const cell_idx_t x, const cell_idx_t y, const cell_idx_t z) const
-   {
-      return x >= -1 && x <= cell_idx_t(numberOfCells_[0]) && y >= -1 && y <= cell_idx_t(numberOfCells_[1]) &&
-             z >= -1 && z <= cell_idx_t(numberOfCells_[2]);
-   }
-
- private:
-   Vector3< uint_t > numberOfCells_;
-};
-
-using FluidFilter_T = Filter;
-
-int main(int argc, char** argv)
-{
-   walberla::Environment walberlaEnv(argc, argv);
-#if defined(WALBERLA_BUILD_WITH_CUDA)
-   gpu::selectDeviceBasedOnMpiRank();
-#endif
-
-   for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg)
-   {
-      WALBERLA_MPI_WORLD_BARRIER()
-
-      auto config = *cfg;
-      logging::configureLogging(config);
-      auto blocks = blockforest::createUniformBlockGridFromConfig(config);
-
-      // read parameters
-      Vector3< uint_t > cellsPerBlock =
-         config->getBlock("DomainSetup").getParameter< Vector3< uint_t > >("cellsPerBlock");
-      auto parameters = config->getOneBlock("Parameters");
-
-      const uint_t timesteps       = parameters.getParameter< uint_t >("timesteps", uint_c(10));
-      const real_t omega           = parameters.getParameter< real_t >("omega", real_c(1.9));
-      const real_t u_max           = parameters.getParameter< real_t >("u_max", real_c(0.05));
-      const real_t reynolds_number = parameters.getParameter< real_t >("reynolds_number", real_c(1000.0));
-      const uint_t diameter_sphere = parameters.getParameter< uint_t >("diameter_sphere", uint_t(5));
-      const bool constant_inflow = parameters.getParameter< bool >("constant_inflow", true);
-
-      const real_t remainingTimeLoggerFrequency =
-         parameters.getParameter< real_t >("remainingTimeLoggerFrequency", real_c(3.0)); // in seconds
-
-      // create fields
-      BlockDataID pdfFieldID     = blocks->addStructuredBlockData< PdfField_T >(pdfFieldAdder, "PDFs");
-      BlockDataID velFieldID     = field::addToStorage< VelocityField_T >(blocks, "velocity", real_c(0.0), field::fzyx);
-      BlockDataID densityFieldID = field::addToStorage< ScalarField_T >(blocks, "density", real_c(0.0), field::fzyx);
-
-#if defined(WALBERLA_BUILD_WITH_CUDA)
-      BlockDataID pdfFieldIDGPU = gpu::addGPUFieldToStorage< PdfField_T >(blocks, pdfFieldID, "PDFs on GPU", true);
-      BlockDataID velFieldIDGPU =
-         gpu::addGPUFieldToStorage< VelocityField_T >(blocks, velFieldID, "velocity on GPU", true);
-      BlockDataID densityFieldIDGPU =
-         gpu::addGPUFieldToStorage< ScalarField_T >(blocks, densityFieldID, "density on GPU", true);
-#endif
-
-      BlockDataID flagFieldId = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field");
-
-      // initialise all PDFs
-#if defined(WALBERLA_BUILD_WITH_CUDA)
-      pystencils::FlowAroundSphereCodeGen_MacroSetter setterSweep(pdfFieldIDGPU, velFieldIDGPU);
-      for (auto& block : *blocks)
-         setterSweep(&block);
-      gpu::fieldCpy< PdfField_T, GPUField >(blocks, pdfFieldID, pdfFieldIDGPU);
-#else
-      pystencils::FlowAroundSphereCodeGen_MacroSetter setterSweep(pdfFieldID, velFieldID);
-      for (auto& block : *blocks)
-         setterSweep(&block);
-#endif
-      // Create communication
-
-#if defined(WALBERLA_BUILD_WITH_CUDA)
-      // This way of using alternating pack infos is temporary and will soon be replaced
-      // by something more straight-forward
-
-      gpu::communication::UniformGPUScheme< Stencil_T > comEven(blocks, false);
-      comEven.addPackInfo(make_shared< PackInfoEven_T >(pdfFieldIDGPU));
-      auto evenComm = std::function< void() >([&]() { comEven.communicate(); });
-
-      gpu::communication::UniformGPUScheme< Stencil_T > comODD(blocks, false);
-      comODD.addPackInfo(make_shared< PackInfoOdd_T >(pdfFieldIDGPU));
-      auto oddComm = std::function< void() >([&]() { comODD.communicate(); });
-#else
-      blockforest::communication::UniformBufferedScheme< Stencil_T > evenComm(blocks);
-      evenComm.addPackInfo(make_shared< PackInfoEven_T >(pdfFieldID));
-
-      blockforest::communication::UniformBufferedScheme< Stencil_T > oddComm(blocks);
-      oddComm.addPackInfo(make_shared< PackInfoOdd_T >(pdfFieldID));
-#endif
-
-      // create and initialize boundary handling
-      const FlagUID fluidFlagUID("Fluid");
-
-      auto boundariesConfig = config->getOneBlock("Boundaries");
-
-      std::function< Vector3< real_t >(const Cell&, const shared_ptr< StructuredBlockForest >&, IBlock&) >
-         velocity_initialisation = std::bind(VelocityCallback, _1, _2, _3, u_max, constant_inflow);
-
-#if defined(WALBERLA_BUILD_WITH_CUDA)
-      lbm::FlowAroundSphereCodeGen_UBB ubb(blocks, pdfFieldIDGPU, velocity_initialisation);
-      lbm::FlowAroundSphereCodeGen_NoSlip noSlip(blocks, pdfFieldIDGPU);
-      lbm::FlowAroundSphereCodeGen_Outflow outflow(blocks, pdfFieldIDGPU, pdfFieldID);
-
-      lbm::FlowAroundSphereCodeGen_LbSweep lbSweep(densityFieldIDGPU, pdfFieldIDGPU, velFieldIDGPU, omega);
-#else
-      lbm::FlowAroundSphereCodeGen_UBB ubb(blocks, pdfFieldID, velocity_initialisation);
-      lbm::FlowAroundSphereCodeGen_NoSlip noSlip(blocks, pdfFieldID);
-      lbm::FlowAroundSphereCodeGen_Outflow outflow(blocks, pdfFieldID);
-
-      lbm::FlowAroundSphereCodeGen_LbSweep lbSweep(densityFieldID, pdfFieldID, velFieldID, omega);
-#endif
-
-      geometry::initBoundaryHandling< FlagField_T >(*blocks, flagFieldId, boundariesConfig);
-      geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldId, fluidFlagUID);
-
-      ubb.fillFromFlagField< FlagField_T >(blocks, flagFieldId, FlagUID("UBB"), fluidFlagUID);
-      noSlip.fillFromFlagField< FlagField_T >(blocks, flagFieldId, FlagUID("NoSlip"), fluidFlagUID);
-      outflow.fillFromFlagField< FlagField_T >(blocks, flagFieldId, FlagUID("Outflow"), fluidFlagUID);
-
-      // create time loop
-      SweepTimeloop timeloop(blocks->getBlockStorage(), timesteps);
-
-      // Timestep Tracking and Sweeps
-      auto tracker = make_shared< lbm::TimestepTracker >(0);
-
-      AlternatingBeforeFunction communication(evenComm, oddComm, tracker);
-
-      // add LBM sweep and communication to time loop
-      timeloop.add() << BeforeFunction(communication, "communication") << Sweep(ubb.getSweep(tracker), "ubb boundary");
-      timeloop.add() << Sweep(outflow.getSweep(tracker), "outflow boundary");
-      timeloop.add() << Sweep(noSlip.getSweep(tracker), "noSlip boundary");
-      timeloop.add() << BeforeFunction(tracker->getAdvancementFunction(), "Timestep Advancement")
-                     << Sweep(lbSweep.getSweep(tracker), "LB update rule");
-
-      // LBM stability check
-      timeloop.addFuncAfterTimeStep(makeSharedFunctor(field::makeStabilityChecker< PdfField_T, FlagField_T >(
-                                       config, blocks, pdfFieldID, flagFieldId, fluidFlagUID)),
-                                    "LBM stability check");
-
-      // log remaining time
-      timeloop.addFuncAfterTimeStep(
-         timing::RemainingTimeLogger(timeloop.getNrOfTimeSteps(), remainingTimeLoggerFrequency),
-         "remaining time logger");
-
-      // add VTK output to time loop
-      uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0);
-      if (vtkWriteFrequency > 0)
-      {
-         auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
-                                                         "simulation_step", false, true, true, false, 0);
-
-#if defined(WALBERLA_BUILD_WITH_CUDA)
-         vtkOutput->addBeforeFunction([&]() {
-            gpu::fieldCpy< VelocityField_T, GPUField >(blocks, velFieldID, velFieldIDGPU);
-            gpu::fieldCpy< ScalarField_T, GPUField >(blocks, densityFieldID, densityFieldIDGPU);
-         });
-#endif
-         auto velWriter     = make_shared< field::VTKWriter< VelocityField_T > >(velFieldID, "velocity");
-         auto densityWriter = make_shared< field::VTKWriter< ScalarField_T > >(densityFieldID, "density");
-         FluidFilter_T filter(cellsPerBlock);
-
-         auto QCriterionWriter = make_shared< lbm::QCriterionVTKWriter< VelocityField_T, FluidFilter_T > >(
-            blocks, filter, velFieldID, "Q-Criterion");
-
-         vtkOutput->addCellDataWriter(velWriter);
-         vtkOutput->addCellDataWriter(densityWriter);
-         vtkOutput->addCellDataWriter(QCriterionWriter);
-
-         timeloop.addFuncAfterTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
-      }
-
-      WcTimer simTimer;
-
-      WALBERLA_LOG_INFO_ON_ROOT("Simulating flow around sphere:"
-                                "\n timesteps:               "
-                                << timesteps << "\n reynolds number:         " << reynolds_number
-                                << "\n relaxation rate:         " << omega << "\n maximum inflow velocity: " << u_max
-                                << "\n diameter_sphere:         " << diameter_sphere)
-
-      simTimer.start();
-      timeloop.run();
-      simTimer.end();
-      WALBERLA_LOG_INFO_ON_ROOT("Simulation finished")
-      auto time            = real_c(simTimer.last());
-      auto nrOfCells       = real_c(cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2]);
-      auto mlupsPerProcess = nrOfCells * real_c(timesteps) / time * 1e-6;
-      WALBERLA_LOG_RESULT_ON_ROOT("MLUPS per process " << mlupsPerProcess)
-      WALBERLA_LOG_RESULT_ON_ROOT("Time per time step " << time / real_c(timesteps))
-   }
-
-   return EXIT_SUCCESS;
-}
-
-} // namespace walberla
-
-int main(int argc, char** argv) { walberla::main(argc, argv); }
--- a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py
+++ b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py
-from pystencils import Target
-from pystencils.field import fields
-from lbmpy import LBMConfig, LBMOptimisation, LBStencil, Method, Stencil
-
-from lbmpy.advanced_streaming.utility import get_timesteps
-from lbmpy.macroscopic_value_kernels import macroscopic_values_setter
-from lbmpy.creationfunctions import create_lb_collision_rule
-from lbmpy.boundaries import NoSlip, UBB, ExtrapolationOutflow
-
-from pystencils_walberla import CodeGeneration, generate_sweep, generate_info_header
-
-from lbmpy_walberla.additional_data_handler import UBBAdditionalDataHandler, OutflowAdditionalDataHandler
-from lbmpy_walberla import generate_lb_pack_info
-from lbmpy_walberla import generate_alternating_lbm_sweep, generate_alternating_lbm_boundary
-
-import sympy as sp
-
-with CodeGeneration() as ctx:
-    data_type = "float64" if ctx.double_accuracy else "float32"
-    stencil = LBStencil(Stencil.D3Q27)
-    q = stencil.Q
-    dim = stencil.D
-    streaming_pattern = 'esotwist'
-    timesteps = get_timesteps(streaming_pattern)
-
-    pdfs, velocity_field, density_field = fields(f"pdfs({q}), velocity({dim}), density(1) : {data_type}[{dim}D]",
-                                                 layout='fzyx')
-    omega = sp.Symbol("omega")
-    u_max = sp.Symbol("u_max")
-
-    output = {
-        'density': density_field,
-        'velocity': velocity_field
-    }
-
-    lbm_config = LBMConfig(stencil=stencil, method=Method.CUMULANT, compressible=True,
-                           relaxation_rate=omega, galilean_correction=True,
-                           field_name='pdfs', streaming_pattern=streaming_pattern, output=output)
-
-    lbm_optimisation = LBMOptimisation(symbolic_field=pdfs, cse_global=False, cse_pdfs=False)
-
-    collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_optimisation)
-    lb_method = collision_rule.method
-
-    # getter & setter
-    setter_assignments = macroscopic_values_setter(lb_method, velocity=velocity_field.center_vector,
-                                                   pdfs=pdfs, density=1.0,
-                                                   streaming_pattern=streaming_pattern,
-                                                   previous_timestep=timesteps[0])
-    setter_assignments = setter_assignments.new_without_unused_subexpressions()
-
-    # opt = {'instruction_set': 'sse', 'assume_aligned': True, 'nontemporal': False, 'assume_inner_stride_one': True}
-
-    stencil_typedefs = {'Stencil_T': stencil}
-    field_typedefs = {'PdfField_T': pdfs,
-                      'VelocityField_T': velocity_field,
-                      'ScalarField_T': density_field}
-
-    if ctx.cuda:
-        target = Target.GPU
-    else:
-        target = Target.CPU
-
-    # sweeps
-    generate_alternating_lbm_sweep(ctx, 'FlowAroundSphereCodeGen_LbSweep',
-                                   collision_rule, lbm_config=lbm_config, lbm_optimisation=lbm_optimisation,
-                                   target=target)
-    generate_sweep(ctx, 'FlowAroundSphereCodeGen_MacroSetter', setter_assignments, target=target)
-
-    # boundaries
-    ubb = UBB(lambda *args: None, dim=dim, data_type=data_type)
-    ubb_data_handler = UBBAdditionalDataHandler(stencil, ubb)
-    outflow = ExtrapolationOutflow(stencil[4], lb_method, streaming_pattern=streaming_pattern, data_type=data_type)
-    outflow_data_handler = OutflowAdditionalDataHandler(stencil, outflow, target=target)
-
-    generate_alternating_lbm_boundary(ctx, 'FlowAroundSphereCodeGen_UBB', ubb, lb_method,
-                                      target=target, streaming_pattern=streaming_pattern,
-                                      additional_data_handler=ubb_data_handler,
-                                      layout='fzyx')
-
-    generate_alternating_lbm_boundary(ctx, 'FlowAroundSphereCodeGen_NoSlip', NoSlip(), lb_method,
-                                      target=target, streaming_pattern=streaming_pattern,
-                                      layout='fzyx')
-
-    generate_alternating_lbm_boundary(ctx, 'FlowAroundSphereCodeGen_Outflow', outflow, lb_method,
-                                      target=target, streaming_pattern=streaming_pattern,
-                                      additional_data_handler=outflow_data_handler,
-                                      layout='fzyx')
-
-    # communication
-    generate_lb_pack_info(ctx, 'FlowAroundSphereCodeGen_PackInfo', stencil, pdfs,
-                          streaming_pattern=streaming_pattern, always_generate_separate_classes=True, target=target)
-
-    # Info header containing correct template definitions for stencil and field
-    generate_info_header(ctx, 'FlowAroundSphereCodeGen_InfoHeader',
-                         stencil_typedefs=stencil_typedefs, field_typedefs=field_typedefs)
--- a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGenParameters.py
+++ b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGenParameters.py
-import waLBerla as wlb
-from lbmpy.relaxationrates import relaxation_rate_from_lattice_viscosity
-
-
-class Scenario:
-    def __init__(self):
-        self.timesteps = 10
-        self.vtkWriteFrequency = 100
-
-        self.cells = (64, 32, 32)
-        self.blocks = (1, 1, 1)
-        self.periodic = (0, 0, 0)
-
-        self.constant_inflow = True
-
-        self.diameter_sphere = min(self.cells) // 2
-        self.u_max = 0.1
-        self.reynolds_number = 1000000
-
-        kinematic_viscosity = (self.diameter_sphere * self.u_max) / self.reynolds_number
-
-        self.omega = relaxation_rate_from_lattice_viscosity(kinematic_viscosity)
-
-        self.total_cells = (self.cells[0] * self.blocks[0],
-                            self.cells[1] * self.blocks[1],
-                            self.cells[2] * self.blocks[2])
-
-    @wlb.member_callback
-    def config(self):
-        return {
-            'DomainSetup': {
-                'blocks': self.blocks,
-                'cellsPerBlock': self.cells,
-                'periodic': self.periodic,
-                'oneBlockPerProcess': True
-            },
-            'Parameters': {
-                'timesteps': self.timesteps,
-                'vtkWriteFrequency': self.vtkWriteFrequency,
-                'omega': self.omega,
-                'u_max': self.u_max,
-                'reynolds_number': self.reynolds_number,
-                'diameter_sphere': self.diameter_sphere,
-                'constant_inflow': self.constant_inflow
-            },
-            'Boundaries': {
-                'Border': [
-                    {'direction': 'N', 'walldistance': -1, 'flag': 'NoSlip'},
-                    {'direction': 'S', 'walldistance': -1, 'flag': 'NoSlip'},
-                    {'direction': 'W', 'walldistance': -1, 'flag': 'UBB'},
-                    {'direction': 'E', 'walldistance': -1, 'flag': 'Outflow'},
-                    {'direction': 'T', 'walldistance': -1, 'flag': 'NoSlip'},
-                    {'direction': 'B', 'walldistance': -1, 'flag': 'NoSlip'},
-                ],
-                'Body': [
-                    {'shape': "sphere",
-                     'midpoint': (int(0.40 * self.total_cells[0]), self.total_cells[1] // 2, self.total_cells[2] // 2),
-                     'radius': self.diameter_sphere // 2,
-                     'flag': 'NoSlip'}
-                ],
-            },
-        }
-
-
-scenarios = wlb.ScenarioManager()
-scenarios.add(Scenario())
--- a/apps/benchmarks/NonUniformGridCPU/CMakeLists.txt
+++ b/apps/benchmarks/NonUniformGridCPU/CMakeLists.txt
@@ -11,11 +11,6 @@ waLBerla_generate_target_from_python(NAME NonUniformGridCPUGenerated
        NonUniformGridCPUBoundaryCollection.h
        NonUniformGridCPUInfoHeader.h)

-waLBerla_add_executable( NAME NonUniformGridGenerator
-                         FILES NonUniformGridGenerator.cpp LdcSetup.h
-                         DEPENDS blockforest core field python_coupling )
-
-
 waLBerla_add_executable( NAME NonUniformGridCPU
-                         FILES NonUniformGridCPU.cpp LdcSetup.h
+                         FILES NonUniformGridCPU.cpp LdcSetup.h GridGeneration.h
                         DEPENDS blockforest boundary core domain_decomposition field geometry lbm_generated python_coupling timeloop vtk NonUniformGridCPUGenerated )
--- a/apps/benchmarks/NonUniformGridCPU/GridGeneration.h
+++ b/apps/benchmarks/NonUniformGridCPU/GridGeneration.h
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file GridGeneration.h
+//! \author Markus Holzer <markus.holzer@fau.de>
+//
+//======================================================================================================================
+#pragma once
+
+#include "blockforest/Initialization.h"
+#include "blockforest/SetupBlock.h"
+#include "blockforest/SetupBlockForest.h"
+#include "blockforest/loadbalancing/StaticCurve.h"
+
+#include "core/Environment.h"
+#include "core/logging/Initialization.h"
+#include "core/timing/RemainingTimeLogger.h"
+#include "core/timing/TimingPool.h"
+
+#include <string>
+
+#include "LdcSetup.h"
+#include "NonUniformGridCPUInfoHeader.h"
+
+using StorageSpecification_T = lbm::NonUniformGridCPUStorageSpecification;
+using Stencil_T              = StorageSpecification_T::Stencil;
+
+using namespace walberla;
+
+void createSetupBlockForest(SetupBlockForest& setupBfs,
+                            const Config::BlockHandle& domainSetup, const Config::BlockHandle& blockForestSetup,
+                            const bool useMPIManager=false)
+{
+   WALBERLA_LOG_INFO_ON_ROOT("Generating SetupBlockForest...")
+
+   Vector3<real_t> domainSize = domainSetup.getParameter<Vector3<real_t> >("domainSize");
+   Vector3< uint_t > cellsPerBlock = domainSetup.getParameter< Vector3< uint_t > >("cellsPerBlock");
+   Vector3<uint_t> rootBlocks = domainSetup.getParameter<Vector3<uint_t> >("rootBlocks");
+   Vector3<bool> periodic = domainSetup.getParameter<Vector3<bool> >("periodic");
+
+   const uint_t refinementDepth = blockForestSetup.getParameter< uint_t >("refinementDepth", uint_c(1));
+   uint_t numProcesses = blockForestSetup.getParameter< uint_t >( "numProcesses");
+   const std::string blockForestFilestem = blockForestSetup.getParameter< std::string > ("blockForestFilestem", "blockforest");
+   const bool writeVtk = blockForestSetup.getParameter< bool >("writeVtk", false);
+   const bool outputStatistics = blockForestSetup.getParameter< bool >("outputStatistics", false);
+
+   if(useMPIManager)
+      numProcesses = uint_c(mpi::MPIManager::instance()->numProcesses());
+
+   const LDC ldc(refinementDepth);
+
+   auto refSelection = ldc.refinementSelector();
+   setupBfs.addRefinementSelectionFunction(std::function<void(SetupBlockForest &)>(refSelection));
+   const AABB domain(real_t(0.0), real_t(0.0), real_t(0.0), domainSize[0], domainSize[1], domainSize[2]);
+   setupBfs.addWorkloadMemorySUIDAssignmentFunction(blockforest::uniformWorkloadAndMemoryAssignment);
+   setupBfs.init(domain, rootBlocks[0], rootBlocks[1], rootBlocks[2], periodic[0], periodic[1], periodic[2]);
+   setupBfs.balanceLoad(blockforest::StaticLevelwiseCurveBalanceWeighted(), numProcesses);
+
+   if(mpi::MPIManager::instance()->numProcesses() > 1)
+      return;
+
+   {
+      std::ostringstream oss;
+      oss << blockForestFilestem << ".bfs";
+      setupBfs.saveToFile(oss.str().c_str());
+   }
+
+   if(writeVtk){
+      setupBfs.writeVTKOutput(blockForestFilestem);
+   }
+
+   if(outputStatistics){
+      WALBERLA_LOG_INFO_ON_ROOT("===========================  BLOCK FOREST STATISTICS ============================");
+      WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << setupBfs.getNumberOfBlocks())
+      for (uint_t level = 0; level <= refinementDepth; level++)
+      {
+         const uint_t numberOfBlocks = setupBfs.getNumberOfBlocks(level);
+         WALBERLA_LOG_INFO_ON_ROOT("Level " << level << " Blocks: " << numberOfBlocks)
+      }
+
+      const real_t avgBlocksPerProc = real_c(setupBfs.getNumberOfBlocks()) / real_c(setupBfs.getNumberOfProcesses());
+      WALBERLA_LOG_INFO_ON_ROOT("Average blocks per process: " << avgBlocksPerProc);
+
+      const uint_t totalNumberCells = setupBfs.getNumberOfBlocks() * cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2];
+      const real_t averageCellsPerGPU = avgBlocksPerProc * real_c(cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2]);
+
+      const uint_t PDFsPerCell = StorageSpecification_T::inplace ? Stencil_T::Q : 2 * Stencil_T::Q;
+      const uint_t valuesPerCell = (PDFsPerCell + VelocityField_T::F_SIZE + ScalarField_T::F_SIZE);
+      const uint_t sizePerValue = sizeof(StorageSpecification_T::value_type);
+      const double expectedMemory = double_c(totalNumberCells * valuesPerCell * sizePerValue) * 1e-9;
+      const double expectedMemoryPerGPU = double_c(averageCellsPerGPU * valuesPerCell * sizePerValue) * 1e-9;
+
+      WALBERLA_LOG_INFO_ON_ROOT( "Total number of cells will be " << totalNumberCells << " fluid cells (in total on all levels)")
+      WALBERLA_LOG_INFO_ON_ROOT( "Expected total memory demand will be " << expectedMemory << " GB")
+      WALBERLA_LOG_INFO_ON_ROOT( "Average memory demand per GPU will be " << expectedMemoryPerGPU << " GB")
+
+      WALBERLA_LOG_INFO_ON_ROOT("=================================================================================");
+   }
+}
+
+void createBlockForest(shared_ptr< BlockForest >& bfs,
+                       const Config::BlockHandle& domainSetup, const Config::BlockHandle& blockForestSetup)
+{
+   if (mpi::MPIManager::instance()->numProcesses() > 1)
+   {
+      const std::string blockForestFilestem =
+         blockForestSetup.getParameter< std::string >("blockForestFilestem", "blockforest");
+      // Load structured block forest from file
+      std::ostringstream oss;
+      oss << blockForestFilestem << ".bfs";
+      const std::string setupBlockForestFilepath = oss.str();
+      std::ifstream infile(setupBlockForestFilepath.c_str());
+      if(!infile.good())
+      {
+         WALBERLA_LOG_WARNING_ON_ROOT("Blockforest was not created beforehand and thus needs to be created on the fly. For large simulation runs this can be a severe problem!")
+         SetupBlockForest setupBfs;
+         createSetupBlockForest(setupBfs, domainSetup, blockForestSetup, true);
+         bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()), setupBfs);
+      }
+      else
+      {
+         bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()),
+                                               setupBlockForestFilepath.c_str(), false);
+      }
+   }
+   else
+   {
+      SetupBlockForest setupBfs;
+      createSetupBlockForest(setupBfs, domainSetup, blockForestSetup);
+      bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()), setupBfs);
+   }
+}
\ No newline at end of file
--- a/apps/benchmarks/NonUniformGridCPU/LdcSetup.h
+++ b/apps/benchmarks/NonUniformGridCPU/LdcSetup.h
@@ -48,14 +48,8 @@ class LDCRefinement
   {
      const AABB & domain = forest.getDomain();

-      const real_t xSize = ( domain.xSize() / real_t(12) ) * real_c( 0.99 );
-      const real_t ySize = ( domain.ySize() / real_t(12) ) * real_c( 0.99 );
-
-      const AABB leftCorner( domain.xMin(), domain.yMin(), domain.zMin(),
-                            domain.xMin() + xSize, domain.yMin() + ySize, domain.zMax() );
-
-      const AABB rightCorner( domain.xMax() - xSize, domain.yMin(), domain.zMin(),
-                             domain.xMax(), domain.yMin() + ySize, domain.zMax() );
+      const AABB leftCorner( 0, domain.yMax() -1, 0, 1, domain.yMax() , domain.zMax() );
+      const AABB rightCorner( domain.xMax() - 1, domain.yMax() -1, 0, domain.xMax(), domain.yMax() , domain.zMax() );

      for(auto & block : forest)
      {

--- a/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.cpp
+++ b/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.cpp
@@ -38,6 +38,7 @@

 #include <cmath>

+#include "GridGeneration.h"
 #include "LdcSetup.h"
 #include "NonUniformGridCPUInfoHeader.h"
 #include "lbm_generated/communication/NonuniformGeneratedPdfPackInfo.h"
@@ -77,23 +78,25 @@ int main(int argc, char** argv)

      auto config = *cfg;
      logging::configureLogging(config);
-
+      auto domainSetup      = config->getOneBlock("DomainSetup");
      auto blockForestSetup = config->getOneBlock("SetupBlockForest");
+      const bool writeSetupForestAndReturn = blockForestSetup.getParameter< bool >("writeSetupForestAndReturn", true);
+
      const std::string blockForestFilestem =
         blockForestSetup.getParameter< std::string >("blockForestFilestem", "blockforest");
      const uint_t refinementDepth = blockForestSetup.getParameter< uint_t >("refinementDepth", uint_c(1));

-      auto domainSetup                = config->getOneBlock("DomainSetup");
      Vector3< uint_t > cellsPerBlock = domainSetup.getParameter< Vector3< uint_t > >("cellsPerBlock");

-      // Load structured block forest from file
-      std::ostringstream oss;
-      oss << blockForestFilestem << ".bfs";
-      const std::string setupBlockForestFilepath = oss.str();
+      shared_ptr< BlockForest > bfs;
+      createBlockForest(bfs, domainSetup, blockForestSetup);
+
+      if (writeSetupForestAndReturn && mpi::MPIManager::instance()->numProcesses() == 1)
+      {
+         WALBERLA_LOG_INFO_ON_ROOT("BlockForest has been created and writen to file. Returning program")
+         return EXIT_SUCCESS;
+      }

-      WALBERLA_LOG_INFO_ON_ROOT("Creating structured block forest...")
-      auto bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()),
-                                                 setupBlockForestFilepath.c_str(), false);
      auto blocks =
         std::make_shared< StructuredBlockForest >(bfs, cellsPerBlock[0], cellsPerBlock[1], cellsPerBlock[2]);
      blocks->createCellBoundingBoxes();
@@ -132,7 +135,7 @@ int main(int argc, char** argv)
      SweepCollection_T sweepCollection(blocks, pdfFieldID, densityFieldID, velFieldID, omega, innerOuterSplit);
      for (auto& block : *blocks)
      {
-         sweepCollection.initialise(&block, 2);
+         sweepCollection.initialise(&block, cell_idx_c(1));
      }
      WALBERLA_MPI_BARRIER()
      WALBERLA_LOG_INFO_ON_ROOT("Initialisation done")
@@ -173,6 +176,8 @@ int main(int argc, char** argv)
      const uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0);
      const bool useVTKAMRWriter = parameters.getParameter< bool >("useVTKAMRWriter", false);
      const bool oneFilePerProcess = parameters.getParameter< bool >("oneFilePerProcess", false);
+
+      auto finalDomain = blocks->getDomain();
      if (vtkWriteFrequency > 0)
      {
         auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
@@ -180,6 +185,12 @@ int main(int argc, char** argv)
         auto velWriter = make_shared< field::VTKWriter< VelocityField_T, float32 > >(velFieldID, "vel");
         vtkOutput->addCellDataWriter(velWriter);

+         if (parameters.getParameter< bool >("writeOnlySlice", true)){
+            const AABB sliceXY(finalDomain.xMin(), finalDomain.yMin(), finalDomain.center()[2] - blocks->dz(refinementDepth),
+                               finalDomain.xMax(), finalDomain.yMax(), finalDomain.center()[2] + blocks->dz(refinementDepth));
+            vtkOutput->addCellInclusionFilter(vtk::AABBCellFilter(sliceXY));
+         }
+
         vtkOutput->addBeforeFunction([&]() {
            for (auto& block : *blocks)
               sweepCollection.calculateMacroscopicParameters(&block);
@@ -236,6 +247,8 @@ int main(int argc, char** argv)
               pythonCallbackResults.data().exposeValue("numProcesses", performance.processes());
               pythonCallbackResults.data().exposeValue("numThreads", performance.threads());
               pythonCallbackResults.data().exposeValue("numCores", performance.cores());
+               pythonCallbackResults.data().exposeValue("numberOfCells", performance.numberOfCells());
+               pythonCallbackResults.data().exposeValue("numberOfFluidCells", performance.numberOfFluidCells());
               pythonCallbackResults.data().exposeValue("mlups", performance.mlups(timesteps, time));
               pythonCallbackResults.data().exposeValue("mlupsPerCore", performance.mlupsPerCore(timesteps, time));
               pythonCallbackResults.data().exposeValue("mlupsPerProcess",

--- a/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.py
+++ b/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.py
@@ -23,17 +23,22 @@ const bool infoCsePdfs = {cse_pdfs};

 with CodeGeneration() as ctx:
    field_type = "float64" if ctx.double_accuracy else "float32"
+    cpu_vec = {"instruction_set": None}

-    streaming_pattern = 'aa'
+    streaming_pattern = 'pull'
    timesteps = get_timesteps(streaming_pattern)
-    stencil = LBStencil(Stencil.D3Q19)
+    stencil = LBStencil(Stencil.D3Q27)
+    method_enum = Method.CUMULANT
+    fourth_order_correction = 0.01 if method_enum == Method.CUMULANT and stencil.Q == 27 else False
+    collision_setup = "cumulant-K17" if fourth_order_correction else method_enum.name.lower()

    assert stencil.D == 3, "This application supports only three-dimensional stencils"
    pdfs, pdfs_tmp = ps.fields(f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {field_type}[3D]", layout='fzyx')
    density_field, velocity_field = ps.fields(f"density, velocity(3) : {field_type}[3D]", layout='fzyx')
    macroscopic_fields = {'density': density_field, 'velocity': velocity_field}

-    lbm_config = LBMConfig(stencil=stencil, method=Method.SRT, relaxation_rate=omega, compressible=True,
+    lbm_config = LBMConfig(stencil=stencil, method=method_enum, relaxation_rate=omega, compressible=True,
+                           fourth_order_correction=fourth_order_correction,
                           streaming_pattern=streaming_pattern)
    lbm_opt = LBMOptimisation(cse_global=False, field_layout="fzyx")

@@ -50,12 +55,12 @@ with CodeGeneration() as ctx:
                         lbm_config=lbm_config, lbm_optimisation=lbm_opt,
                         nonuniform=True, boundaries=[no_slip, ubb],
                         macroscopic_fields=macroscopic_fields,
-                         target=ps.Target.CPU)
+                         target=ps.Target.CPU, cpu_vectorize_info=cpu_vec,)

    infoHeaderParams = {
        'stencil': stencil.name.lower(),
        'streaming_pattern': streaming_pattern,
-        'collision_setup': lbm_config.method.name.lower(),
+        'collision_setup': collision_setup,
        'cse_global': int(lbm_opt.cse_global),
        'cse_pdfs': int(lbm_opt.cse_pdfs),
    }

--- a/apps/benchmarks/NonUniformGridCPU/NonUniformGridGenerator.cpp
+++ b/apps/benchmarks/NonUniformGridCPU/NonUniformGridGenerator.cpp
-//======================================================================================================================
-//
-//  This file is part of waLBerla. waLBerla is free software: you can
-//  redistribute it and/or modify it under the terms of the GNU General Public
-//  License as published by the Free Software Foundation, either version 3 of
-//  the License, or (at your option) any later version.
-//
-//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
-//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-//  for more details.
-//
-//  You should have received a copy of the GNU General Public License along
-//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
-//
-//! \file NonUniformGridGenerator.cpp
-//! \author Frederik Hennig <frederik.hennig@fau.de>
-//
-//======================================================================================================================
-
-#include "blockforest/Initialization.h"
-#include "blockforest/SetupBlock.h"
-#include "blockforest/SetupBlockForest.h"
-#include "blockforest/loadbalancing/StaticCurve.h"
-
-#include "core/all.h"
-
-#include "python_coupling/CreateConfig.h"
-
-#include <string>
-
-#include "LdcSetup.h"
-
-using namespace walberla;
-
-
-int main(int argc, char ** argv){
-   const mpi::Environment env(argc, argv);
-   mpi::MPIManager::instance()->useWorldComm();
-
-   if(mpi::MPIManager::instance()->numProcesses() > 1){
-      WALBERLA_ABORT("Commandment: Thou shalt not run thy grid generator with more than one process.");
-   }
-
-   for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg)
-   {
-      auto config = *cfg;
-      auto domainSetup = config->getOneBlock("DomainSetup");
-
-      Vector3<real_t> domainSize = domainSetup.getParameter<Vector3<real_t> >("domainSize");
-      Vector3<uint_t> rootBlocks = domainSetup.getParameter<Vector3<uint_t> >("rootBlocks");
-      Vector3<bool> periodic = domainSetup.getParameter<Vector3<bool> >("periodic");
-
-      auto blockForestSetup = config->getOneBlock("SetupBlockForest");
-      const uint_t refinementDepth = blockForestSetup.getParameter< uint_t >("refinementDepth", uint_c(1));
-      const uint_t numProcesses = blockForestSetup.getParameter< uint_t >( "numProcesses");
-      const std::string blockForestFilestem = blockForestSetup.getParameter< std::string > ("blockForestFilestem", "blockforest");
-      const bool writeVtk = blockForestSetup.getParameter< bool >("writeVtk", false);
-      const bool outputStatistics = blockForestSetup.getParameter< bool >("outputStatistics", false);
-
-      const LDC ldc(refinementDepth);
-      SetupBlockForest setupBfs;
-
-      auto refSelection = ldc.refinementSelector();
-      setupBfs.addRefinementSelectionFunction(std::function<void(SetupBlockForest &)>(refSelection));
-      const AABB domain(real_t(0.0), real_t(0.0), real_t(0.0), domainSize[0], domainSize[1], domainSize[2]);
-      setupBfs.addWorkloadMemorySUIDAssignmentFunction(blockforest::uniformWorkloadAndMemoryAssignment);
-      setupBfs.init(domain, rootBlocks[0], rootBlocks[1], rootBlocks[2], periodic[0], periodic[1], periodic[2]);
-      setupBfs.balanceLoad(blockforest::StaticLevelwiseCurveBalanceWeighted(), numProcesses);
-
-      {
-         std::ostringstream oss;
-         oss << blockForestFilestem << ".bfs";
-         setupBfs.saveToFile(oss.str().c_str());
-      }
-
-      if(writeVtk){
-         setupBfs.writeVTKOutput(blockForestFilestem);
-      }
-
-      if(outputStatistics){
-         WALBERLA_LOG_INFO_ON_ROOT("===========================  BLOCK FOREST STATISTICS ============================");
-         WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << setupBfs.getNumberOfBlocks())
-         for (uint_t level = 0; level <= refinementDepth; level++)
-         {
-            const uint_t numberOfBlocks = setupBfs.getNumberOfBlocks(level);
-            WALBERLA_LOG_INFO_ON_ROOT("Level " << level << " Blocks: " << numberOfBlocks)
-         }
-
-         const uint_t avgBlocksPerProc = setupBfs.getNumberOfBlocks() / setupBfs.getNumberOfProcesses();
-         WALBERLA_LOG_INFO_ON_ROOT("Average blocks per process: " << avgBlocksPerProc);
-         WALBERLA_LOG_INFO_ON_ROOT("=================================================================================");
-      }
-
-
-      WALBERLA_LOG_INFO_ON_ROOT("Ending program")
-   }
-}
--- a/apps/benchmarks/NonUniformGridCPU/simulation_setup/benchmark_configs.py
+++ b/apps/benchmarks/NonUniformGridCPU/simulation_setup/benchmark_configs.py
 import waLBerla as wlb
+from waLBerla.tools.config import block_decomposition
 from waLBerla.tools.sqlitedb import sequenceValuesToScalars, checkAndUpdateSchema, storeSingle
 import sqlite3
 import os
 import sys

+try:
+    import machinestate as ms
+except ImportError:
+    ms = None
+
 DB_FILE = os.environ.get('DB_FILE', "cpu_benchmark.sqlite3")
+BENCHMARK = int(os.environ.get('BENCHMARK', 0))
+
+WeakX = int(os.environ.get('WeakX', 128))
+WeakY = int(os.environ.get('WeakY', 128))
+WeakZ = int(os.environ.get('WeakZ', 128))
+
+StrongX = int(os.environ.get('StrongX', 128))
+StrongY = int(os.environ.get('StrongY', 128))
+StrongZ = int(os.environ.get('StrongZ', 128))


 class Scenario:
@@ -18,7 +33,8 @@ class Scenario:
                 vtk_write_frequency=0,
                 logger_frequency=0,
                 blockforest_filestem="blockforest",
-                 write_setup_vtk=False):
+                 write_setup_vtk=True,
+                 db_file_name=None):

        self.domain_size = domain_size
        self.root_blocks = root_blocks
@@ -34,6 +50,8 @@ class Scenario:
        self.vtk_write_frequency = vtk_write_frequency
        self.logger_frequency = logger_frequency

+        self.db_file_name = DB_FILE if db_file_name is None else db_file_name
+
        self.config_dict = self.config(print_dict=False)

    @wlb.member_callback
@@ -51,7 +69,8 @@ class Scenario:
                'numProcesses': self.num_processes,
                'blockForestFilestem': self.bfs_filestem,
                'writeVtk': self.write_setup_vtk,
-                'outputStatistics': False
+                'outputStatistics': True,
+                'writeSetupForestAndReturn': True,
            },
            'Parameters': {
                'omega': 1.95,
@@ -59,14 +78,15 @@ class Scenario:
                'remainingTimeLoggerFrequency': self.logger_frequency,
                'vtkWriteFrequency': self.vtk_write_frequency,
                'useVTKAMRWriter': True,
-                'oneFilePerProcess': False
+                'oneFilePerProcess': False,
+                'writeOnlySlice': False
            },
            'Logging': {
                'logLevel': "info",
            }
        }

-        if (print_dict):
+        if print_dict:
            wlb.log_info_on_root("Scenario:\n" + pformat(config_dict))

        return config_dict
@@ -82,6 +102,15 @@ class Scenario:
        data['compile_flags'] = wlb.build_info.compiler_flags
        data['walberla_version'] = wlb.build_info.version
        data['build_machine'] = wlb.build_info.build_machine
+
+        if ms:
+            state = ms.MachineState(extended=False, anonymous=True)
+            state.generate()                        # generate subclasses
+            state.update()                          # read information
+            data["MachineState"] = str(state.get())
+        else:
+            print("MachineState module is not available. MachineState was not saved")
+
        sequenceValuesToScalars(data)

        result = data
@@ -92,52 +121,109 @@ class Scenario:
        table_name = table_name.replace("-", "_")
        for num_try in range(num_tries):
            try:
-                checkAndUpdateSchema(result, table_name, DB_FILE)
-                storeSingle(result, table_name, DB_FILE)
+                checkAndUpdateSchema(result, table_name, self.db_file_name)
+                storeSingle(result, table_name, self.db_file_name)
                break
            except sqlite3.OperationalError as e:
                wlb.log_warning(f"Sqlite DB writing failed: try {num_try + 1}/{num_tries}  {str(e)}")


-def validation_run():
-    """Run with full periodic shear flow or boundary scenario (ldc) to check if the code works"""
-    wlb.log_info_on_root("Validation run")
+def weak_scaling_ldc(num_proc, uniform=False):
+    wlb.log_info_on_root("Running weak scaling benchmark...")

-    domain_size = (96, 96, 96)
-    cells_per_block = (32, 32, 32)
+    # This benchmark must run from 16 processes onwards
+    if wlb.mpi.numProcesses() > 1:
+        num_proc = wlb.mpi.numProcesses()
+
+    if uniform:
+        factor = 3 * num_proc
+        name = "uniform"
+    else:
+        if num_proc % 16 != 0:
+            raise RuntimeError("Number of processes must be dividable by 16")
+        factor = int(num_proc // 16)
+        name = "nonuniform"
+
+    cells_per_block = (WeakX, WeakY, WeakZ)
+    domain_size = (cells_per_block[0] * 3, cells_per_block[1] * 3, cells_per_block[2] * factor)

    root_blocks = tuple([d // c for d, c in zip(domain_size, cells_per_block)])

    scenarios = wlb.ScenarioManager()
-    scenario = Scenario(domain_size=domain_size,
+    scenario = Scenario(blockforest_filestem=f"blockforest_{name}_{num_proc}",
+                        domain_size=domain_size,
                        root_blocks=root_blocks,
-                        num_processes=1,
-                        refinement_depth=1,
+                        num_processes=num_proc,
                        cells_per_block=cells_per_block,
-                        timesteps=201,
-                        vtk_write_frequency=100,
-                        logger_frequency=5,
-                        write_setup_vtk=True)
+                        refinement_depth=0 if uniform else 3,
+                        timesteps=10,
+                        db_file_name=f"weakScalingCPU{name}LDC.sqlite3")
+    scenarios.add(scenario)
+
+
+def strong_scaling_ldc(num_proc, uniform=False):
+    wlb.log_info_on_root("Running strong scaling benchmark...")
+
+    # This benchmark must run from 64 GPUs onwards
+    if wlb.mpi.numProcesses() > 1:
+        num_proc = wlb.mpi.numProcesses()
+
+    if num_proc % 64 != 0:
+        raise RuntimeError("Number of processes must be dividable by 64")
+
+    cells_per_block = (StrongX, StrongY, StrongZ)
+
+    if uniform:
+        domain_size = (cells_per_block[0] * 2, cells_per_block[1] * 2, cells_per_block[2] * 16)
+        name = "uniform"
+    else:
+        factor = int(num_proc / 64)
+        blocks64 = block_decomposition(factor)
+        cells_per_block = tuple([int(c / b) for c, b in zip(cells_per_block, reversed(blocks64))])
+        domain_size = (cells_per_block[0] * 3, cells_per_block[1] * 3, cells_per_block[2] * factor)
+        name = "nonuniform"
+
+    root_blocks = tuple([d // c for d, c in zip(domain_size, cells_per_block)])
+
+    scenarios = wlb.ScenarioManager()
+    scenario = Scenario(blockforest_filestem=f"blockforest_{name}_{num_proc}",
+                        domain_size=domain_size,
+                        root_blocks=root_blocks,
+                        num_processes=num_proc,
+                        cells_per_block=cells_per_block,
+                        refinement_depth=0 if uniform else 3,
+                        timesteps=10,
+                        db_file_name=f"strongScalingCPU{name}LDC.sqlite3")
    scenarios.add(scenario)


-def scaling():
-    wlb.log_info_on_root("Running scaling benchmark...")
+def validation_run():
+    """Run with full periodic shear flow or boundary scenario (ldc) to check if the code works"""
+    wlb.log_info_on_root("Validation run")

-    numProc = wlb.mpi.numProcesses()
+    domain_size = (96, 96, 32)
+    cells_per_block = (32, 32, 32)

-    domain_size = (256, 256, 128 * numProc)
-    cells_per_block = (64, 64, 64)
    root_blocks = tuple([d // c for d, c in zip(domain_size, cells_per_block)])

    scenarios = wlb.ScenarioManager()
    scenario = Scenario(domain_size=domain_size,
                        root_blocks=root_blocks,
+                        num_processes=1,
+                        refinement_depth=3,
                        cells_per_block=cells_per_block,
-                        refinement_depth=2,
-                        timesteps=10)
+                        timesteps=1001,
+                        vtk_write_frequency=100,
+                        logger_frequency=5,
+                        write_setup_vtk=True)
    scenarios.add(scenario)


-validation_run()
-# scaling()
+if BENCHMARK == 0:
+    validation_run()
+elif BENCHMARK == 1:
+    weak_scaling_ldc(1, False)
+elif BENCHMARK == 2:
+    strong_scaling_ldc(1, False)
+else:
+    print(f"Invalid benchmark case {BENCHMARK}")
--- a/apps/benchmarks/NonUniformGridGPU/CMakeLists.txt
+++ b/apps/benchmarks/NonUniformGridGPU/CMakeLists.txt
@@ -11,5 +11,5 @@ waLBerla_generate_target_from_python(NAME NonUniformGridGPUGenerated
        NonUniformGridGPUBoundaryCollection.h
        NonUniformGridGPUInfoHeader.h)
 waLBerla_add_executable( NAME NonUniformGridGPU
-                         FILES NonUniformGridGPU.cpp LdcSetup.h
+                         FILES NonUniformGridGPU.cpp LdcSetup.h GridGeneration.h
                         DEPENDS blockforest boundary core gpu domain_decomposition field geometry lbm_generated python_coupling timeloop vtk NonUniformGridGPUGenerated )
\ No newline at end of file
--- a/apps/benchmarks/NonUniformGridGPU/GridGeneration.h
+++ b/apps/benchmarks/NonUniformGridGPU/GridGeneration.h
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file GridGeneration.h
+//! \author Markus Holzer <markus.holzer@fau.de>
+//
+//======================================================================================================================
+#pragma once
+
+#include "blockforest/Initialization.h"
+#include "blockforest/SetupBlock.h"
+#include "blockforest/SetupBlockForest.h"
+#include "blockforest/loadbalancing/StaticCurve.h"
+
+#include "core/Environment.h"
+#include "core/logging/Initialization.h"
+#include "core/timing/RemainingTimeLogger.h"
+#include "core/timing/TimingPool.h"
+
+#include <string>
+
+#include "LdcSetup.h"
+#include "NonUniformGridGPUInfoHeader.h"
+
+using StorageSpecification_T = lbm::NonUniformGridGPUStorageSpecification;
+using Stencil_T              = StorageSpecification_T::Stencil;
+
+using namespace walberla;
+
+void createSetupBlockForest(SetupBlockForest& setupBfs,
+                            const Config::BlockHandle& domainSetup, const Config::BlockHandle& blockForestSetup,
+                            const bool useMPIManager=false)
+{
+   WALBERLA_LOG_INFO_ON_ROOT("Generating SetupBlockForest...")
+
+   Vector3<real_t> domainSize = domainSetup.getParameter<Vector3<real_t> >("domainSize");
+   Vector3< uint_t > cellsPerBlock = domainSetup.getParameter< Vector3< uint_t > >("cellsPerBlock");
+   Vector3<uint_t> rootBlocks = domainSetup.getParameter<Vector3<uint_t> >("rootBlocks");
+   Vector3<bool> periodic = domainSetup.getParameter<Vector3<bool> >("periodic");
+
+   const uint_t refinementDepth = blockForestSetup.getParameter< uint_t >("refinementDepth", uint_c(1));
+   uint_t numProcesses = blockForestSetup.getParameter< uint_t >( "numProcesses");
+   const std::string blockForestFilestem = blockForestSetup.getParameter< std::string > ("blockForestFilestem", "blockforest");
+   const bool writeVtk = blockForestSetup.getParameter< bool >("writeVtk", false);
+   const bool outputStatistics = blockForestSetup.getParameter< bool >("outputStatistics", false);
+
+   if(useMPIManager)
+      numProcesses = uint_c(mpi::MPIManager::instance()->numProcesses());
+
+   const LDC ldc(refinementDepth);
+
+   auto refSelection = ldc.refinementSelector();
+   setupBfs.addRefinementSelectionFunction(std::function<void(SetupBlockForest &)>(refSelection));
+   const AABB domain(real_t(0.0), real_t(0.0), real_t(0.0), domainSize[0], domainSize[1], domainSize[2]);
+   setupBfs.addWorkloadMemorySUIDAssignmentFunction(blockforest::uniformWorkloadAndMemoryAssignment);
+   setupBfs.init(domain, rootBlocks[0], rootBlocks[1], rootBlocks[2], periodic[0], periodic[1], periodic[2]);
+   setupBfs.balanceLoad(blockforest::StaticLevelwiseCurveBalanceWeighted(), numProcesses);
+
+   if(mpi::MPIManager::instance()->numProcesses() > 1)
+      return;
+
+   {
+      std::ostringstream oss;
+      oss << blockForestFilestem << ".bfs";
+      setupBfs.saveToFile(oss.str().c_str());
+   }
+
+   if(writeVtk){
+      setupBfs.writeVTKOutput(blockForestFilestem);
+   }
+
+   if(outputStatistics){
+      WALBERLA_LOG_INFO_ON_ROOT("===========================  BLOCK FOREST STATISTICS ============================");
+      WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << setupBfs.getNumberOfBlocks())
+      for (uint_t level = 0; level <= refinementDepth; level++){
+         const uint_t numberOfBlocks = setupBfs.getNumberOfBlocks(level);
+         WALBERLA_LOG_INFO_ON_ROOT("Level " << level << " Blocks: " << numberOfBlocks)
+      }
+
+      const real_t avgBlocksPerProc = real_c(setupBfs.getNumberOfBlocks()) / real_c(setupBfs.getNumberOfProcesses());
+      WALBERLA_LOG_INFO_ON_ROOT("Average blocks per process: " << avgBlocksPerProc);
+
+      const uint_t totalNumberCells = setupBfs.getNumberOfBlocks() * cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2];
+      const real_t averageCellsPerGPU = avgBlocksPerProc * real_c(cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2]);
+
+      const uint_t PDFsPerCell = StorageSpecification_T::inplace ? Stencil_T::Q : 2 * Stencil_T::Q;
+      const uint_t valuesPerCell = (PDFsPerCell + VelocityField_T::F_SIZE + ScalarField_T::F_SIZE);
+      const uint_t sizePerValue = sizeof(StorageSpecification_T::value_type);
+      const double expectedMemory = double_c(totalNumberCells * valuesPerCell * sizePerValue) * 1e-9;
+      const double expectedMemoryPerGPU = double_c(averageCellsPerGPU * valuesPerCell * sizePerValue) * 1e-9;
+
+      WALBERLA_LOG_INFO_ON_ROOT( "Total number of cells will be " << totalNumberCells << " fluid cells (in total on all levels)")
+      WALBERLA_LOG_INFO_ON_ROOT( "Expected total memory demand will be " << expectedMemory << " GB")
+      WALBERLA_LOG_INFO_ON_ROOT( "Average memory demand per GPU will be " << expectedMemoryPerGPU << " GB")
+
+      WALBERLA_LOG_INFO_ON_ROOT("=================================================================================");
+   }
+}
+
+void createBlockForest(shared_ptr< BlockForest >& bfs,
+                       const Config::BlockHandle& domainSetup, const Config::BlockHandle& blockForestSetup)
+{
+   if (mpi::MPIManager::instance()->numProcesses() > 1){
+      const std::string blockForestFilestem =
+         blockForestSetup.getParameter< std::string >("blockForestFilestem", "blockforest");
+      // Load structured block forest from file
+      std::ostringstream oss;
+      oss << blockForestFilestem << ".bfs";
+      const std::string setupBlockForestFilepath = oss.str();
+      std::ifstream infile(setupBlockForestFilepath.c_str());
+      if(!infile.good()){
+         WALBERLA_LOG_WARNING_ON_ROOT("Blockforest was not created beforehand and thus needs to be created on the fly. For large simulation runs this can be a severe problem!")
+         SetupBlockForest setupBfs;
+         createSetupBlockForest(setupBfs, domainSetup, blockForestSetup, true);
+         bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()), setupBfs);
+      }
+      else{
+         bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()),
+                                               setupBlockForestFilepath.c_str(), false);
+      }
+   }
+   else{
+      SetupBlockForest setupBfs;
+      createSetupBlockForest(setupBfs, domainSetup, blockForestSetup);
+      bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()), setupBfs);
+   }
+}
\ No newline at end of file
--- a/apps/benchmarks/NonUniformGridGPU/LdcSetup.h
+++ b/apps/benchmarks/NonUniformGridGPU/LdcSetup.h
@@ -31,7 +31,9 @@
 #include "field/FlagUID.h"

 using namespace walberla;
+
 using RefinementSelectionFunctor = SetupBlockForest::RefinementSelectionFunction;
+
 using FlagField_T          = FlagField< uint8_t >;

 class LDCRefinement
@@ -46,14 +48,8 @@ class LDCRefinement
   {
      const AABB & domain = forest.getDomain();

-      const real_t xSize = ( domain.xSize() / real_t(12) ) * real_c( 0.99 );
-      const real_t ySize = ( domain.ySize() / real_t(12) ) * real_c( 0.99 );
-
-      const AABB leftCorner( domain.xMin(), domain.yMin(), domain.zMin(),
-                             domain.xMin() + xSize, domain.yMin() + ySize, domain.zMax() );
-
-      const AABB rightCorner( domain.xMax() - xSize, domain.yMin(), domain.zMin(),
-                              domain.xMax(), domain.yMin() + ySize, domain.zMax() );
+      const AABB leftCorner( 0, domain.yMax() -1, 0, 1, domain.yMax() , domain.zMax() );
+      const AABB rightCorner( domain.xMax() - 1, domain.yMax() -1, 0, domain.xMax(), domain.yMax() , domain.zMax() );

      for(auto & block : forest)
      {
@@ -99,8 +95,7 @@ class LDC
            Cell globalCell(localCell);
            sbfs.transformBlockLocalToGlobalCell(globalCell, b);
            if (globalCell.y() >= cell_idx_c(sbfs.getNumberOfYCells(level))) { flagField->addFlag(localCell, ubbFlag); }
-            else if (globalCell.z() < 0 || globalCell.y() < 0 || globalCell.x() < 0 ||
-                     globalCell.x() >= cell_idx_c(sbfs.getNumberOfXCells(level)) || globalCell.z() >= cell_idx_c(sbfs.getNumberOfZCells(level)))
+            else if (globalCell.y() < 0 || globalCell.x() < 0 || globalCell.x() >= cell_idx_c(sbfs.getNumberOfXCells(level)))
            {
               flagField->addFlag(localCell, noslipFlag);
            }

--- a/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp
+++ b/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp
--- a/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.py
+++ b/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.py
@@ -7,7 +7,7 @@ from pystencils.typing import TypedSymbol
 from lbmpy.advanced_streaming.utility import get_timesteps
 from lbmpy.boundaries import NoSlip, UBB
 from lbmpy.creationfunctions import create_lb_method, create_lb_collision_rule
-from lbmpy import LBMConfig, LBMOptimisation, Stencil, Method, LBStencil
+from lbmpy import LBMConfig, LBMOptimisation, Stencil, Method, LBStencil, SubgridScaleModel

 from pystencils_walberla import CodeGeneration, generate_info_header
 from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator
@@ -30,20 +30,24 @@ const char * infoCollisionSetup = "{collision_setup}";
 const bool infoCseGlobal = {cse_global};
 const bool infoCsePdfs = {cse_pdfs};
 """
- 
+
 with CodeGeneration() as ctx:
    field_type = "float64" if ctx.double_accuracy else "float32"

-    streaming_pattern = 'pull'
+    streaming_pattern = 'esotwist'
    timesteps = get_timesteps(streaming_pattern)
    stencil = LBStencil(Stencil.D3Q19)
+    method_enum = Method.TRT
+    fourth_order_correction = 0.01 if method_enum == Method.CUMULANT and stencil.Q == 27 else False
+    collision_setup = "cumulant-K17" if fourth_order_correction else method_enum.name.lower()

    assert stencil.D == 3, "This application supports only three-dimensional stencils"
    pdfs, pdfs_tmp = ps.fields(f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {field_type}[3D]", layout='fzyx')
    density_field, velocity_field = ps.fields(f"density, velocity(3) : {field_type}[3D]", layout='fzyx')
    macroscopic_fields = {'density': density_field, 'velocity': velocity_field}

-    lbm_config = LBMConfig(stencil=stencil, method=Method.SRT, relaxation_rate=omega,
+    lbm_config = LBMConfig(stencil=stencil, method=method_enum, relaxation_rate=omega, compressible=True,
+                           fourth_order_correction=fourth_order_correction,
                           streaming_pattern=streaming_pattern)
    lbm_opt = LBMOptimisation(cse_global=False, field_layout='fzyx')

@@ -66,7 +70,7 @@ with CodeGeneration() as ctx:
    infoHeaderParams = {
        'stencil': stencil.name.lower(),
        'streaming_pattern': streaming_pattern,
-        'collision_setup': lbm_config.method.name.lower(),
+        'collision_setup': collision_setup,
        'cse_global': int(lbm_opt.cse_global),
        'cse_pdfs': int(lbm_opt.cse_pdfs),
    }

--- a/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py
+++ b/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py
 import waLBerla as wlb
+from waLBerla.tools.config import block_decomposition
+from waLBerla.tools.sqlitedb import sequenceValuesToScalars, checkAndUpdateSchema, storeSingle
+import sqlite3
+import os
+import sys
+
+try:
+    import machinestate as ms
+except ImportError:
+    ms = None
+
+DB_FILE = os.environ.get('DB_FILE', "gpu_benchmark.sqlite3")
+BENCHMARK = int(os.environ.get('BENCHMARK', 0))
+
+WeakX = int(os.environ.get('WeakX', 128))
+WeakY = int(os.environ.get('WeakY', 128))
+WeakZ = int(os.environ.get('WeakZ', 128))
+
+StrongX = int(os.environ.get('StrongX', 128))
+StrongY = int(os.environ.get('StrongY', 128))
+StrongZ = int(os.environ.get('StrongZ', 128))


 class Scenario:
-    def __init__(self, domain_size=(64, 64, 64), root_blocks=(2, 2, 2),
-                 cells_per_block=(32, 32, 32), refinement_depth=0):
+    def __init__(self,
+                 domain_size=(64, 64, 64),
+                 root_blocks=(2, 2, 2),
+                 num_processes=1,
+                 refinement_depth=0,
+                 cells_per_block=(32, 32, 32),
+                 timesteps=101,
+                 gpu_enabled_mpi=False,
+                 vtk_write_frequency=0,
+                 logger_frequency=30,
+                 blockforest_filestem="blockforest",
+                 write_setup_vtk=True,
+                 async_communication=False,
+                 db_file_name=None):

        self.domain_size = domain_size
        self.root_blocks = root_blocks
        self.cells_per_block = cells_per_block
+        self.periodic = (0, 0, 1)
+
        self.refinement_depth = refinement_depth
+        self.num_processes = num_processes
+        self.bfs_filestem = blockforest_filestem
+        self.write_setup_vtk = write_setup_vtk
+
+        self.timesteps = timesteps
+        self.gpu_enabled_mpi = gpu_enabled_mpi
+        self.vtk_write_frequency = vtk_write_frequency
+        self.logger_frequency = logger_frequency

-        self.periodic = (0, 0, 0)
+        self.async_communication = async_communication
+        self.db_file_name = DB_FILE if db_file_name is None else db_file_name

        self.config_dict = self.config(print_dict=False)

@@ -22,39 +66,80 @@ class Scenario:
                'domainSize': self.domain_size,
                'rootBlocks': self.root_blocks,
                'cellsPerBlock': self.cells_per_block,
-                'periodic': self.periodic
+                'periodic': self.periodic,
+            },
+            'SetupBlockForest': {
+                'refinementDepth': self.refinement_depth,
+                'numProcesses': self.num_processes,
+                'blockForestFilestem': self.bfs_filestem,
+                'writeVtk': self.write_setup_vtk,
+                'outputStatistics': True,
+                'writeSetupForestAndReturn': True,
            },
            'Parameters': {
                'omega': 1.95,
-                'timesteps': 30001,
-
-                'refinementDepth': self.refinement_depth,
-                'writeSetupForestAndReturn': False,
-                'numProcesses': 1,
-
-                'cudaEnabledMPI': False,
-                'benchmarkKernelOnly': False,
-
-                'remainingTimeLoggerFrequency': 3,
-
-                'vtkWriteFrequency': 10000,
+                'timesteps': self.timesteps,
+                'remainingTimeLoggerFrequency': self.logger_frequency,
+                'vtkWriteFrequency': self.vtk_write_frequency,
+                'useVTKAMRWriter': True,
+                'oneFilePerProcess': False,
+                'writeOnlySlice': False,
+                'gpuEnabledMPI': self.gpu_enabled_mpi,
+                'gpuBlockSize': (128, 1, 1),
+                'asyncCommunication': self.async_communication
            },
            'Logging': {
                'logLevel': "info",
            }
        }

-        if print_dict and config_dict["Parameters"]["writeSetupForestAndReturn"] is False:
+        if print_dict:
            wlb.log_info_on_root("Scenario:\n" + pformat(config_dict))
+
        return config_dict

+    @wlb.member_callback
+    def results_callback(self, **kwargs):
+        data = {}
+        data.update(self.config_dict['Parameters'])
+        data.update(self.config_dict['DomainSetup'])
+        data.update(kwargs)
+
+        data['executable'] = sys.argv[0]
+        data['compile_flags'] = wlb.build_info.compiler_flags
+        data['walberla_version'] = wlb.build_info.version
+        data['build_machine'] = wlb.build_info.build_machine
+
+        if ms:
+            state = ms.MachineState(extended=False, anonymous=True)
+            state.generate()                        # generate subclasses
+            state.update()                          # read information
+            data["MachineState"] = str(state.get())
+        else:
+            print("MachineState module is not available. MachineState was not saved")
+
+        sequenceValuesToScalars(data)
+        result = data
+        sequenceValuesToScalars(result)
+        num_tries = 4
+        # check multiple times e.g. may fail when multiple benchmark processes are running
+        table_name = f"runs"
+        table_name = table_name.replace("-", "_")
+        for num_try in range(num_tries):
+            try:
+                checkAndUpdateSchema(result, table_name, self.db_file_name)
+                storeSingle(result, table_name, self.db_file_name)
+                break
+            except sqlite3.OperationalError as e:
+                wlb.log_warning(f"Sqlite DB writing failed: try {num_try + 1}/{num_tries}  {str(e)}")
+

 def validation_run():
    """Run with full periodic shear flow or boundary scenario (ldc) to check if the code works"""
    wlb.log_info_on_root("Validation run")

-    domain_size = (96, 96, 96)
-    cells_per_block = (32, 32, 32)
+    domain_size = (192, 192, 64)
+    cells_per_block = (64, 64, 64)

    root_blocks = tuple([d // c for d, c in zip(domain_size, cells_per_block)])

@@ -62,8 +147,96 @@ def validation_run():
    scenario = Scenario(domain_size=domain_size,
                        root_blocks=root_blocks,
                        cells_per_block=cells_per_block,
-                        refinement_depth=1)
+                        timesteps=0,
+                        vtk_write_frequency=0,
+                        refinement_depth=3,
+                        gpu_enabled_mpi=False,
+                        async_communication=False)
    scenarios.add(scenario)


-validation_run()
+def weak_scaling_ldc(num_proc, gpu_enabled_mpi=False, uniform=True):
+    wlb.log_info_on_root("Running weak scaling benchmark...")
+
+    # This benchmark must run from 16 GPUs onwards
+    if wlb.mpi.numProcesses() > 1:
+        num_proc = wlb.mpi.numProcesses()
+
+    if uniform:
+        factor = 3 * num_proc
+        name = "uniform"
+    else:
+        if num_proc % 16 != 0:
+            raise RuntimeError("Number of processes must be dividable by 16")
+        factor = int(num_proc // 16)
+        name = "nonuniform"
+
+    cells_per_block = (WeakX, WeakY, WeakZ)
+    domain_size = (cells_per_block[0] * 3, cells_per_block[1] * 3, cells_per_block[2] * factor)
+
+    root_blocks = tuple([d // c for d, c in zip(domain_size, cells_per_block)])
+
+    scenarios = wlb.ScenarioManager()
+    for async_communication in [False, True]:
+        scenario = Scenario(blockforest_filestem=f"blockforest_{name}_{num_proc}",
+                            domain_size=domain_size,
+                            root_blocks=root_blocks,
+                            num_processes=num_proc,
+                            cells_per_block=cells_per_block,
+                            refinement_depth=0 if uniform else 3,
+                            timesteps=10,
+                            gpu_enabled_mpi=gpu_enabled_mpi,
+                            async_communication=async_communication,
+                            db_file_name=f"weakScalingGPU{name}LDC.sqlite3")
+        scenarios.add(scenario)
+
+
+def strong_scaling_ldc(num_proc, gpu_enabled_mpi=False, uniform=True):
+    wlb.log_info_on_root("Running strong scaling benchmark...")
+
+    # This benchmark must run from 64 GPUs onwards
+    if wlb.mpi.numProcesses() > 1:
+        num_proc = wlb.mpi.numProcesses()
+
+    if num_proc % 64 != 0:
+        raise RuntimeError("Number of processes must be dividable by 64")
+
+    cells_per_block = (StrongX, StrongY, StrongZ)
+
+    if uniform:
+        domain_size = (cells_per_block[0] * 2, cells_per_block[1] * 2, cells_per_block[2] * 16)
+        name = "uniform"
+    else:
+        factor = int(num_proc / 64)
+        blocks64 = block_decomposition(factor)
+        cells_per_block = tuple([int(c / b) for c, b in zip(cells_per_block, reversed(blocks64))])
+        domain_size = (cells_per_block[0] * 3, cells_per_block[1] * 3, cells_per_block[2] * factor)
+        name = "nonuniform"
+
+    root_blocks = tuple([d // c for d, c in zip(domain_size, cells_per_block)])
+
+    scenarios = wlb.ScenarioManager()
+    for async_communication in [False, True]:
+        scenario = Scenario(blockforest_filestem=f"blockforest_{name}_{num_proc}",
+                            domain_size=domain_size,
+                            root_blocks=root_blocks,
+                            num_processes=num_proc,
+                            cells_per_block=cells_per_block,
+                            refinement_depth=0 if uniform else 3,
+                            timesteps=10,
+                            gpu_enabled_mpi=gpu_enabled_mpi,
+                            async_communication=async_communication,
+                            db_file_name=f"strongScalingGPU{name}LDC.sqlite3")
+        scenarios.add(scenario)
+
+
+if BENCHMARK == 0:
+    validation_run()
+elif BENCHMARK == 1:
+    weak_scaling_ldc(1, True, False)
+elif BENCHMARK == 2:
+    strong_scaling_ldc(1, True, False)
+else:
+    print(f"Invalid benchmark case {BENCHMARK}")
+
+
No results found