Compare revisions

Philipp Suffa · ed94aqyc · Philipp Suffa · Philipp Suffa · Philipp Suffa · Philipp Suffa
--- a/.gitignore
+++ b/.gitignore
@@ -73,3 +73,4 @@ CMakeDefs.h
 /moduleStatistics.json
 /walberla-config.cmake
 cmake-build-*
+/apps/Sparse/
--- a/apps/benchmarks/CMakeLists.txt
+++ b/apps/benchmarks/CMakeLists.txt
@@ -25,6 +25,8 @@ if ( WALBERLA_BUILD_WITH_PYTHON )
      add_subdirectory( FlowAroundSphereCodeGen )
      add_subdirectory( UniformGridCPU )
      add_subdirectory( PhaseFieldAllenCahn )
+      add_subdirectory( ListLBM )
+
   endif()

   if ( WALBERLA_BUILD_WITH_CODEGEN AND WALBERLA_BUILD_WITH_CUDA )

--- a/apps/benchmarks/ListLBM/CMakeLists.txt
+++ b/apps/benchmarks/ListLBM/CMakeLists.txt
+add_subdirectory( LagoonSparse )
+add_subdirectory( HybridBenchmark )
+
+
--- a/apps/benchmarks/ListLBM/HybridBenchmark/Artery.obj
+++ b/apps/benchmarks/ListLBM/HybridBenchmark/Artery.obj
--- a/apps/benchmarks/ListLBM/HybridBenchmark/CMakeLists.txt
+++ b/apps/benchmarks/ListLBM/HybridBenchmark/CMakeLists.txt
+waLBerla_link_files_to_builddir( *.prm )
+waLBerla_link_files_to_builddir( *.png )
+waLBerla_link_files_to_builddir( *.obj )
+waLBerla_link_files_to_builddir( *.stl )
+waLBerla_link_files_to_builddir( *.mtl )
+waLBerla_link_files_to_builddir( input.py )
+
+
+if ( WALBERLA_BUILD_WITH_OPENMESH AND WALBERLA_BUILD_WITH_CODEGEN )
+
+    waLBerla_link_files_to_builddir( *.obj )
+    waLBerla_link_files_to_builddir( *.stl )
+    waLBerla_link_files_to_builddir( *.mtl )
+
+    if( WALBERLA_BUILD_WITH_CUDA)
+        waLBerla_generate_target_from_python(NAME HybridBenchmarkCodeGen
+                FILE HybridBenchmark.py
+                OUT_FILES SparseLBSweep.cu SparseLBSweep.h
+                ListLBMList.cpp ListLBMList.h
+                SparseMacroSetter.cpp SparseMacroSetter.h
+                SparseUBB.cu SparseUBB.h
+                SparsePressure.cu SparsePressure.h
+                SparseNoSlip.cu SparseNoSlip.h
+                SparsePackInfoEven.cu SparsePackInfoEven.h
+                SparsePackInfoOdd.cu SparsePackInfoOdd.h
+                SparseLBMInfoHeader.h
+                DenseLBSweep.cu DenseLBSweep.h
+                DenseMacroSetter.cpp DenseMacroSetter.h
+                DenseMacroGetter.cpp DenseMacroGetter.h
+                DenseUBB.cu DenseUBB.h
+                DensePressure.cu DensePressure.h
+                DenseNoSlip.cu DenseNoSlip.h
+                DensePackInfoEven.cu DensePackInfoEven.h
+                DensePackInfoOdd.cu DensePackInfoOdd.h
+                DenseLBMInfoHeader.h)
+
+        waLBerla_add_executable ( NAME HybridBenchmark
+                FILES HybridBenchmark.cpp SetupHybridCommunication.h InitSpherePacking.h ReadParticleBoundaiesFromFile.h
+                DEPENDS blockforest boundary core cuda field lbm mesh stencil timeloop vtk python_coupling HybridBenchmarkCodeGen)
+    else()
+        waLBerla_generate_target_from_python(NAME HybridBenchmarkCodeGen
+                FILE HybridBenchmark.py
+                OUT_FILES SparseLBSweep.cpp SparseLBSweep.h
+                ListLBMList.cpp ListLBMList.h
+                SparseMacroSetter.cpp SparseMacroSetter.h
+                SparseUBB.cpp SparseUBB.h
+                SparsePressure.cpp SparsePressure.h
+                SparseNoSlip.cpp SparseNoSlip.h
+                SparsePackInfoEven.cpp SparsePackInfoEven.h
+                SparsePackInfoOdd.cpp SparsePackInfoOdd.h
+                SparseLBMInfoHeader.h
+                DenseLBSweep.cpp DenseLBSweep.h
+                DenseMacroSetter.cpp DenseMacroSetter.h
+                DenseMacroGetter.cpp DenseMacroGetter.h
+                DenseUBB.cpp DenseUBB.h
+                DensePressure.cpp DensePressure.h
+                DenseNoSlip.cpp DenseNoSlip.h
+                DensePackInfoEven.cpp DensePackInfoEven.h
+                DensePackInfoOdd.cpp DensePackInfoOdd.h
+                DenseLBMInfoHeader.h)
+
+        waLBerla_add_executable ( NAME HybridBenchmark
+                FILES HybridBenchmark.cpp SetupHybridCommunication.h InitSpherePacking.h ReadParticleBoundaiesFromFile.h
+                DEPENDS blockforest boundary core field lbm mesh stencil timeloop vtk python_coupling HybridBenchmarkCodeGen)
+    endif()
+endif()
--- a/apps/benchmarks/ListLBM/HybridBenchmark/HybridBenchmark.cpp
+++ b/apps/benchmarks/ListLBM/HybridBenchmark/HybridBenchmark.cpp
--- a/apps/benchmarks/ListLBM/HybridBenchmark/HybridBenchmark.py
+++ b/apps/benchmarks/ListLBM/HybridBenchmark/HybridBenchmark.py
+import sympy as sp
+import numpy as np
+from dataclasses import replace
+
+from pystencils import Field, FieldType, TypedSymbol, Target
+from pystencils.field import fields
+
+from lbmpy import Stencil, LBStencil, Method, LBMConfig, LBMOptimisation
+from lbmpy.boundaries.boundaryconditions import FixedDensity, UBB, NoSlip
+from lbmpy.creationfunctions import create_lb_method, create_lb_collision_rule
+from lbmpy.sparse import create_macroscopic_value_setter_sparse, create_lb_update_rule_sparse
+from lbmpy.advanced_streaming import is_inplace, Timestep
+from lbmpy.macroscopic_value_kernels import macroscopic_values_setter, macroscopic_values_getter
+
+from pystencils_walberla import CodeGeneration, generate_sweep, generate_info_header
+from lbmpy_walberla.sparse import generate_list_class, generate_sparse_sweep, generate_sparse_boundary, generate_sparse_pack_info, generate_alternating_sparse_lbm_sweep, generate_alternating_sparse_boundary, generate_alternating_sparse_pack_info
+from lbmpy_walberla import generate_alternating_lbm_sweep, generate_lb_pack_info, generate_alternating_lbm_boundary
+
+with CodeGeneration() as ctx:
+
+    dtype = 'float64' if ctx.double_accuracy else 'float32'
+
+    stencil = LBStencil(Stencil.D3Q19)
+    q = stencil.Q
+    dim = stencil.D
+
+    omega = sp.symbols("omega")
+    inlet_velocity = sp.symbols("u_x")
+
+    pdfs, pdfs_tmp = fields(f"pdf_field({q}), pdf_field_tmp({q}): {dtype}[1D]", layout='fzyx')
+    pdfs.field_type = FieldType.CUSTOM
+    pdfs_tmp.field_type = FieldType.CUSTOM
+    index_list = Field.create_generic("idx", spatial_dimensions=1, index_dimensions=1, dtype=np.uint32)
+
+    lbm_config = LBMConfig(
+        method=Method.SRT,
+        stencil=stencil,
+        relaxation_rate=omega,
+        streaming_pattern='pull'
+    )
+
+    lbm_opt = LBMOptimisation(
+        cse_global=True,
+        cse_pdfs=True,
+    )
+
+    if not is_inplace(lbm_config.streaming_pattern):
+        field_swaps=[(pdfs, pdfs_tmp)]
+    else:
+        field_swaps=[]
+
+    if ctx.cuda:
+        target = Target.GPU
+        vp = [('int32_t', 'cudaBlockSize0'),
+              ('int32_t', 'cudaBlockSize1'),
+              ('int32_t', 'cudaBlockSize2')]
+
+        sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32),
+                            TypedSymbol("cudaBlockSize1", np.int32),
+                            TypedSymbol("cudaBlockSize2", np.int32))
+
+        sweep_params = {'block_size': sweep_block_size}
+
+    else:
+        target = Target.CPU
+        sweep_params = {}
+        vp = ()
+
+    method = create_lb_method(lbm_config=lbm_config)
+    collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt)
+    inner_outer_split = True
+
+
+    ########################################## Sparse kernels ###################################################
+    generated_list_class_name = "ListLBMList"
+    stencil_typedefs = {'Stencil_T': stencil}
+    list_template = f"using List_T = walberla::lbmpy::{generated_list_class_name};"
+    generate_list_class(ctx, generated_list_class_name, index_list, pdfs, stencil, target=target)
+
+    sparse_setter_eqs = create_macroscopic_value_setter_sparse(method, pdfs, 1.0, (0.0, 0.0, 0.0))
+    generate_sparse_sweep(ctx, 'SparseMacroSetter', sparse_setter_eqs, stencil=stencil, target=Target.CPU)
+
+    generate_alternating_sparse_lbm_sweep(ctx, 'SparseLBSweep', collision_rule, lbm_config, pdfs, index_list, dst=pdfs_tmp, field_swaps=field_swaps, target=target, inner_outer_split=inner_outer_split, varying_parameters=vp, gpu_indexing_params=sweep_params)
+
+    ubb = UBB((inlet_velocity, 0, 0))
+    generate_alternating_sparse_boundary(ctx, 'SparseUBB', ubb, method, streaming_pattern=lbm_config.streaming_pattern, target=target)
+    fixed_density = FixedDensity(sp.Symbol("density"))
+    generate_alternating_sparse_boundary(ctx, 'SparsePressure', fixed_density, method, streaming_pattern=lbm_config.streaming_pattern, target=target)
+    generate_alternating_sparse_boundary(ctx, 'SparseNoSlip', NoSlip(), method, streaming_pattern=lbm_config.streaming_pattern, target=target)
+
+    generate_alternating_sparse_pack_info(ctx, 'SparsePackInfo', stencil, lbm_config.streaming_pattern, target=target)
+
+    generate_info_header(ctx, 'SparseLBMInfoHeader', stencil_typedefs=stencil_typedefs, additional_code=list_template)
+
+
+
+    ########################################## Dense kernels ###################################################
+
+    pdfs, pdfs_tmp, = fields(f"pdfs({q}), pdfs_tmp({q}): double[3D]",  layout='fzyx')
+    velocity_field, density_field = fields(f"velocity({stencil.D}), density(1): double[3D]", layout='fzyx')
+
+
+    setter_assignments = macroscopic_values_setter(method, density=1.0, velocity=(0.0, 0.0, 0.0), pdfs=pdfs, streaming_pattern=lbm_config.streaming_pattern)
+    generate_sweep(ctx, 'DenseMacroSetter', setter_assignments, target=Target.CPU)
+
+    getter_assignments = macroscopic_values_getter(method, density=density_field, velocity=velocity_field.center_vector,
+                                                   pdfs=pdfs,
+                                                   streaming_pattern=lbm_config.streaming_pattern,
+                                                   previous_timestep=Timestep.BOTH)
+    generate_sweep(ctx, 'DenseMacroGetter', getter_assignments, target=Target.CPU)
+
+
+
+    if not is_inplace(lbm_config.streaming_pattern):
+        field_swaps=[(pdfs, pdfs_tmp)]
+    else:
+        field_swaps=[]
+
+
+    lbm_opt = replace(lbm_opt, symbolic_field=pdfs, symbolic_temporary_field=pdfs_tmp)
+
+    generate_alternating_lbm_sweep(ctx, 'DenseLBSweep', collision_rule, lbm_config=lbm_config,
+                                   lbm_optimisation=lbm_opt, target=target,
+                                   varying_parameters=vp, gpu_indexing_params=sweep_params,
+                                   inner_outer_split=inner_outer_split, field_swaps=field_swaps)
+
+    generate_alternating_lbm_boundary(ctx, 'DenseUBB', ubb, method, field_name=pdfs.name, streaming_pattern=lbm_config.streaming_pattern, target=target)
+
+    generate_alternating_lbm_boundary(ctx, 'DensePressure', fixed_density, method, field_name=pdfs.name, streaming_pattern=lbm_config.streaming_pattern, target=target)
+
+    generate_alternating_lbm_boundary(ctx, 'DenseNoSlip', NoSlip(), method, field_name=pdfs.name, streaming_pattern=lbm_config.streaming_pattern, target=target)
+
+    generate_lb_pack_info(ctx, 'DensePackInfo', stencil, pdfs, streaming_pattern=lbm_config.streaming_pattern, target=target, always_generate_separate_classes=True)
+
+    field_typedefs = {'PdfField_T': pdfs, 'VelocityField_T': velocity_field, 'ScalarField_T': density_field}
+    generate_info_header(ctx, 'DenseLBMInfoHeader', stencil_typedefs=stencil_typedefs, field_typedefs=field_typedefs)
--- a/apps/benchmarks/ListLBM/HybridBenchmark/InitSpherePacking.h
+++ b/apps/benchmarks/ListLBM/HybridBenchmark/InitSpherePacking.h
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file InitializerFunctions.h
+//! \author Markus Holzer <markus.holzer@fau.de>
+//
+//======================================================================================================================
+
+#include "core/Environment.h"
+#include "core/logging/Logging.h"
+#include "core/math/Constants.h"
+
+#include "field/FlagField.h"
+
+#include <cmath>
+#pragma once
+
+namespace walberla
+{
+using FlagField_T  = FlagField< uint8_t >;
+
+void InitSpherePacking(const shared_ptr< StructuredBlockStorage >& blocks, BlockDataID flagFieldID,
+                       const field::FlagUID boundaryFlagUID, const real_t Radius, const real_t Shift, const Vector3<real_t> fillIn)
+{
+   // Check that too small spheres are not used
+   if(Radius < 1e-10)
+      return;
+
+   const real_t Diameter = 2 * Radius + Shift;
+   const real_t RadiusSquared = Radius * Radius;
+   const real_t SphereDistanceX = (Diameter / 3) * std::sqrt(6);
+
+   const cell_idx_t MinR = cell_idx_c(floor(Radius));
+   const cell_idx_t MaxR = cell_idx_c(ceil(Radius));
+
+   const uint_t Nx = uint_c(ceil((blocks->getDomainCellBB().xMax() / SphereDistanceX)));
+   const uint_t Ny = uint_c(ceil(blocks->getDomainCellBB().yMax() / Diameter)) + 1;
+   const uint_t Nz = uint_c(ceil(blocks->getDomainCellBB().zMax() / Diameter)) + 1;
+
+   for (auto& block : *blocks)
+   {
+      auto flagField    = block.template getData< FlagField_T >(flagFieldID);
+      auto boundaryFlag = flagField->getFlag(boundaryFlagUID);
+      CellInterval BlockBB = blocks->getBlockCellBB( block );
+      BlockBB.expand(1);
+      for (uint_t i = 0; i < Nx; ++i){
+         for (uint_t j = 0; j < Ny; ++j){
+            for (uint_t k = 0; k < Nz; ++k){
+
+               const real_t Offset = (i % 2 == 0) ? 0.0 : Radius;
+               Cell point(cell_idx_c(real_c(i) * SphereDistanceX), cell_idx_c(real_c(j) * Diameter + Offset), cell_idx_c(real_c(k) * Diameter + Offset));
+               CellInterval SphereBB(point.x() - MinR, point.y() - MinR, point.z() - MinR, point.x() + MaxR, point.y() + MaxR, point.z() + MaxR);
+
+               if(BlockBB.overlaps(SphereBB))
+               {
+                  SphereBB.intersect(BlockBB);
+                  Cell localCell;
+                  Cell localPoint;
+
+                  for(auto it = SphereBB.begin(); it != SphereBB.end(); ++it)
+                  {
+                     if(it->x() > (real_c(blocks->getDomainCellBB().xSize()) * fillIn[0]) || it->y() > (real_c(blocks->getDomainCellBB().ySize()) * fillIn[1]) || it->z() > (real_c(blocks->getDomainCellBB().zSize()) * fillIn[2])) continue;
+                     blocks->transformGlobalToBlockLocalCell(localCell, block, Cell(it->x(), it->y(), it->z()));
+                     blocks->transformGlobalToBlockLocalCell(localPoint, block, point);
+                     real_t Ri = (localCell[0] - localPoint.x()) * (localCell[0] - localPoint.x()) +
+                                 (localCell[1] - localPoint.y()) * (localCell[1] - localPoint.y()) +
+                                 (localCell[2] - localPoint.z()) * (localCell[2] - localPoint.z());
+
+                     if(Ri < RadiusSquared)
+                     {
+                        addFlag(flagField->get(localCell), boundaryFlag);
+                     }
+                  }
+               }
+            }
+         }
+      }
+   }
+}
+
+
+} // namespace walberla
--- a/apps/benchmarks/ListLBM/HybridBenchmark/ReadParticleBoundaiesFromFile.h
+++ b/apps/benchmarks/ListLBM/HybridBenchmark/ReadParticleBoundaiesFromFile.h
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file   ReadParticleBoundaiesFromFile.h
+//! \author Philipp Suffa philipp.suffa@fau.de
+//
+//======================================================================================================================
+
+#pragma once
+#include "blockforest/StructuredBlockForest.h"
+#include "core/logging/Logging.h"
+#include "field/FlagField.h"
+
+#include <algorithm>
+#include <core/mpi/Broadcast.h>
+#include <core/mpi/MPITextFile.h>
+#include <core/mpi/Reduce.h>
+#include <functional>
+#include <iterator>
+
+namespace walberla
+{
+
+void initSpheresFromFile(const std::string& filename, weak_ptr<StructuredBlockForest> blockForest, const BlockDataID flagFieldID, const field::FlagUID boundaryFlagUID, const real_t dx)
+{
+   using namespace walberla;
+   auto forest = blockForest.lock();
+
+   std::string textFile;
+
+   WALBERLA_ROOT_SECTION()
+   {
+      std::ifstream t(filename.c_str());
+      if (!t) { WALBERLA_ABORT("Invalid input file " << filename << "\n"); }
+      std::stringstream buffer;
+      buffer << t.rdbuf();
+      textFile = buffer.str();
+   }
+   walberla::mpi::broadcastObject(textFile);
+
+   std::istringstream fileIss(textFile);
+   std::string line;
+
+   // first line contains generation domain sizes
+   std::getline(fileIss, line);
+   Vector3< real_t > generationDomainSize_SI(0_r);
+   std::istringstream firstLine(line);
+   firstLine >> generationDomainSize_SI[0] >> generationDomainSize_SI[1] >> generationDomainSize_SI[2];
+   WALBERLA_LOG_DEVEL_VAR_ON_ROOT(generationDomainSize_SI)
+
+   std::vector<std::pair<Cell, cell_idx_t>> particleInfo;
+   while (std::getline(fileIss, line))
+   {
+      std::istringstream iss(line);
+      uint_t particleUID;
+      Vector3< real_t > particlePos;
+      real_t particleRadius;
+      iss >> particleUID >> particlePos[0] >> particlePos[1] >> particlePos[2] >> particleRadius;
+      Cell midPoint(cell_idx_c(particlePos[0] / dx), cell_idx_c(particlePos[1] / dx), cell_idx_c(particlePos[2] / dx));
+      cell_idx_t radiusInCells = cell_idx_c(particleRadius / dx);
+      std::pair particle(midPoint, radiusInCells);
+      particleInfo.push_back(particle);
+   }
+
+   for (auto &block : *forest) {
+      CellInterval BlockBB = forest->getBlockCellBB( block );
+      BlockBB.expand(1);
+      auto flagField    = block.template getData< FlagField_T >(flagFieldID);
+      auto boundaryFlag = flagField->getFlag(boundaryFlagUID);
+
+      for (auto particle : particleInfo)
+      {
+         CellInterval SphereBB(particle.first.x() - particle.second, particle.first.y() - particle.second,
+                               particle.first.z() - particle.second, particle.first.x() + particle.second,
+                               particle.first.y() + particle.second, particle.first.z() + particle.second);
+         if (BlockBB.overlaps(SphereBB)) {
+            SphereBB.intersect(BlockBB);
+            Cell localCell;
+            Cell localPoint;
+            for(auto it = SphereBB.begin(); it != SphereBB.end(); ++it) {
+               forest->transformGlobalToBlockLocalCell(localCell, block, Cell(it->x(), it->y(), it->z()));
+               forest->transformGlobalToBlockLocalCell(localPoint, block, particle.first);
+               real_t Ri = (localCell[0] - localPoint.x()) * (localCell[0] - localPoint.x()) +
+                           (localCell[1] - localPoint.y()) * (localCell[1] - localPoint.y()) +
+                           (localCell[2] - localPoint.z()) * (localCell[2] - localPoint.z());
+
+               if(Ri < particle.second * particle.second)
+               {
+                  addFlag(flagField->get(localCell), boundaryFlag);
+               }
+            }
+         }
+      }
+   }
+}
+
+} // namespace walberla
--- a/apps/benchmarks/ListLBM/HybridBenchmark/SetupHybridCommunication.h
+++ b/apps/benchmarks/ListLBM/HybridBenchmark/SetupHybridCommunication.h
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file SetupHybridCommunication.h
+//! \author Philipp Suffa <philipp.suffa@fau.de>
+//
+//======================================================================================================================
+#pragma once
+
+#include "blockforest/StructuredBlockForest.h"
+
+#include "lbm/list/CellDir.h"
+
+#include "core/DataTypes.h"
+#include "core/Macros.h"
+#include "core/cell/Cell.h"
+#include "core/debug/CheckFunctions.h"
+#include "core/math/Vector3.h"
+#include "core/logging/Logging.h"
+
+#include "core/mpi/all.h"
+
+#include "field/FlagField.h"
+
+namespace walberla {
+
+
+template<typename FlagField_T, typename Stencil_T>
+class SetupHybridCommunication
+{
+ public:
+   SetupHybridCommunication( weak_ptr<StructuredBlockForest> blockForest, const BlockDataID flagFieldID, const FlagUID fluidFlagUID)
+      : blockForest_(blockForest), flagFieldID_(flagFieldID),  fluidFlagUID_(fluidFlagUID)
+   {
+      auto forest = blockForest_.lock();
+      WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to execute communication for a block storage object that doesn't exist anymore" )
+
+      WALBERLA_LOG_PROGRESS( "Setting up list communication" )
+
+      std::map< uint_t, uint_t > numBlocksToSend;
+
+      blockSize_ = Vector3<cell_idx_t>( cell_idx_c(forest->getNumberOfXCellsPerBlock()), cell_idx_c(forest->getNumberOfYCellsPerBlock()), cell_idx_c(forest->getNumberOfZCellsPerBlock()) );
+
+      for( auto senderIt = forest->begin(); senderIt != forest->end(); ++senderIt )
+      {
+         blockforest::Block & sender = dynamic_cast<blockforest::Block &>( *senderIt );
+
+         for (size_t f = 1; f < Stencil_T::Size; ++f)
+         {
+            auto neighborhood = sender.getNeighborhoodSection( blockforest::getBlockNeighborhoodSectionIndex( (stencil::Direction) f ) );
+            WALBERLA_ASSERT_LESS_EQUAL( neighborhood.size(), size_t( 1 ) )
+            if( neighborhood.empty() )
+               continue;
+            auto * receiver = neighborhood.front();
+            numBlocksToSend[receiver->getProcess()] += uint_t(1);
+         }
+      }
+
+      mpi::BufferSystem bufferSystem( mpi::MPIManager::instance()->comm() );
+
+      for( auto it = numBlocksToSend.begin(); it != numBlocksToSend.end(); ++it )
+      {
+         WALBERLA_LOG_DETAIL( "Packing information for " << it->second << " blocks to send to process " << it->first );
+         bufferSystem.sendBuffer( it->first ) << it->second;
+      }
+
+      for( auto senderIt = forest->begin(); senderIt != forest->end(); ++senderIt )
+      {
+         blockforest::Block & sender = dynamic_cast<blockforest::Block &>( *senderIt );
+         auto *flagField = sender.getData<FlagField_T>(flagFieldID_);
+         auto fluidFlag = flagField->getFlag( fluidFlagUID_ );
+
+         for (size_t sendDir = 1; sendDir < Stencil_T::Size; ++sendDir)
+         {
+            auto neighborhood = sender.getNeighborhoodSection( blockforest::getBlockNeighborhoodSectionIndex( (stencil::Direction) sendDir ) );
+            WALBERLA_ASSERT_LESS_EQUAL( neighborhood.size(), size_t( 1 ) )
+            if( neighborhood.empty() )
+               continue;
+            auto * receiver = neighborhood.front();
+            receiver->getId().toBuffer( bufferSystem.sendBuffer( receiver->getProcess() ) );
+            bufferSystem.sendBuffer( receiver->getProcess() ) << stencil::inverseDir[sendDir];
+            WALBERLA_LOG_DETAIL( "Packing information for block " << receiver->getId().getID() << " in direction " << stencil::dirToString[stencil::inverseDir[f]] );
+
+            auto flagIt = flagField->beginSliceBeforeGhostLayerXYZ(Stencil_T::dir[sendDir]);
+            std::vector< Cell > isBoundary;
+            while( flagIt != flagField->end() )
+            {
+               //get send information
+               Cell cell(flagIt.x(), flagIt.y(), flagIt.z());
+               if (!flagField->isFlagSet(cell, fluidFlag)) {
+                  isBoundary.push_back(cell);
+               }
+               ++flagIt;
+            }
+
+            std::sort( isBoundary.begin(), isBoundary.end() );
+            bufferSystem.sendBuffer( receiver->getProcess() ) << isBoundary.size();
+            for (auto boundaryCell : isBoundary) {
+               bufferSystem.sendBuffer( receiver->getProcess() ) << mapToNeighbor( boundaryCell, (stencil::Direction) sendDir );
+            }
+         }
+      }
+
+      bufferSystem.setReceiverInfoFromSendBufferState( false, false );
+      WALBERLA_LOG_PROGRESS( "MPI exchange of structure data" )
+      bufferSystem.sendAll();
+      WALBERLA_LOG_PROGRESS( "MPI exchange of structure data finished" )
+
+      for( auto recvBufferIt = bufferSystem.begin(); recvBufferIt != bufferSystem.end(); ++recvBufferIt )
+      {
+         recvBufferIt.buffer().clear();
+         continue;
+      }
+   }
+
+
+   Cell mapToNeighbor( Cell cell, const stencil::Direction dir )
+   {
+      switch( stencil::cx[dir] )
+      {
+      case -1: cell.x() += blockSize_[0]; break;
+      case  1: cell.x() -= blockSize_[0]; break;
+      default: ;
+      }
+
+      switch( stencil::cy[dir] )
+      {
+      case -1: cell.y() += blockSize_[1]; break;
+      case  1: cell.y() -= blockSize_[1]; break;
+      default: ;
+      }
+
+      switch( stencil::cz[dir] )
+      {
+      case -1: cell.z() += blockSize_[2]; break;
+      case  1: cell.z() -= blockSize_[2]; break;
+      default: ;
+      }
+
+      return cell;
+   }
+
+ protected:
+   weak_ptr<StructuredBlockForest> blockForest_;
+   const BlockDataID flagFieldID_;
+   const FlagUID fluidFlagUID_;
+   Vector3< cell_idx_t > blockSize_;
+};
+
+} // namespace walberla
\ No newline at end of file
--- a/apps/benchmarks/ListLBM/HybridBenchmark/input.py
+++ b/apps/benchmarks/ListLBM/HybridBenchmark/input.py
+import os
+from math import prod
+import sys
+import sqlite3
+
+import numpy as np
+
+import waLBerla as wlb
+from waLBerla.tools.config import block_decomposition
+from waLBerla.tools.sqlitedb import sequenceValuesToScalars, checkAndUpdateSchema, storeSingle
+
+DB_FILE = os.environ.get('DB_FILE', "ListLBMBenchmark.sqlite3")
+
+class Scenario:
+    def __init__(self, cells_per_block=(64, 64, 20),
+                 timesteps=1000, time_step_strategy="noOverlap", omega=0.8, cuda_enabled_mpi=True,
+                 inner_outer_split=(1, 1, 1), vtk_write_frequency=0, inflow_velocity=(0.01,0,0),
+                 porosity=0.5, porositySwitch=0.8, geometry_setup="randomNoslip",
+                 spheres_radius=9, sphere_shift = 10, sphere_fill = (1.0, 1.0, 1.0), mesh_file="None", run_boundaries=True):
+
+        self.timesteps = timesteps
+        self.vtkWriteFrequency = vtk_write_frequency
+
+        self.inflow_velocity = inflow_velocity
+
+        self.cells_per_block = cells_per_block
+        self.porositySwitch = porositySwitch
+        self.porosity = porosity
+
+        self.inner_outer_split = inner_outer_split
+        self.time_step_strategy = time_step_strategy
+        self.cuda_enabled_mpi = cuda_enabled_mpi
+        self.run_boundaries = run_boundaries
+
+        self.omega = omega
+
+        self.geometry_setup = geometry_setup
+
+        self.spheres_radius = spheres_radius
+        self.sphere_shift = sphere_shift
+        self.sphere_fill = sphere_fill
+
+        self.mesh_file = mesh_file
+
+        self.config_dict = self.config()
+
+
+
+    @wlb.member_callback
+    def config(self):
+        return {
+            'DomainSetup': {
+                'cellsPerBlock': self.cells_per_block,
+                'weakScaling': True,
+                'geometrySetup': self.geometry_setup,
+                'meshFile': self.mesh_file
+        },
+            'Parameters': {
+                'timesteps': self.timesteps,
+                'omega': self.omega,
+                'initialVelocity':self.inflow_velocity,
+                'timeStepStrategy': self.time_step_strategy,
+                'innerOuterSplit': self.inner_outer_split,
+                'cudaEnabledMPI': self.cuda_enabled_mpi,
+                'vtkWriteFrequency': self.vtkWriteFrequency,
+                'porositySwitch': self.porositySwitch,
+                'porosity': self.porosity,
+                'runBoundaries': self.run_boundaries,
+                'remainingTimeLoggerFrequency': 10,
+                'SpheresRadius': self.spheres_radius,
+                'SphereShift': self.sphere_shift,
+                'SphereFillDomainRatio':self.sphere_fill,
+
+
+        },
+            'Boundaries': {
+                'Border': [
+                    {'direction': 'N', 'walldistance': -1, 'flag': 'NoSlip'},
+                    {'direction': 'S', 'walldistance': -1, 'flag': 'NoSlip'},
+                    {'direction': 'W', 'walldistance': -1, 'flag': 'UBB'},
+                    {'direction': 'E', 'walldistance': -1, 'flag': 'PressureOutflow'},
+                    {'direction': 'T', 'walldistance': -1, 'flag': 'NoSlip'},
+                    {'direction': 'B', 'walldistance': -1, 'flag': 'NoSlip'},
+                ]
+            }
+        }
+
+
+def porosity_benchmark():
+    wlb.log_info_on_root("Running different porosities")
+    scenarios = wlb.ScenarioManager()
+    porosities = [0.1 * i for i in range(10+1)]
+    for porosity in porosities:
+        scenario = Scenario(porosity=porosity, geometry_setup="randomNoslip", inflow_velocity=(0,0,0), run_boundaries=False)
+        scenarios.add(scenario)
+
+
+def emptyChannel():
+    scenarios = wlb.ScenarioManager()
+    scenario = Scenario(porosity=1.0, vtk_write_frequency=0, geometry_setup="randomNoslip", cells_per_block=(64, 64, 64), time_step_strategy="kernelOnly", run_boundaries=False, porositySwitch=1.1)
+    scenarios.add(scenario)
+
+def randomNoslip():
+    scenarios = wlb.ScenarioManager()
+    scenario = Scenario(porosity=0.9, vtk_write_frequency=50, geometry_setup="randomNoslip", inflow_velocity=(0,0,0))
+    scenarios.add(scenario)
+
+def spheres():
+    scenarios = wlb.ScenarioManager()
+    spheres_radius = 7
+    sphere_shift = 3
+    sphere_fill = (0.55, 1.0, 1.0)
+    scenario = Scenario(vtk_write_frequency=50, geometry_setup="spheres", spheres_radius=spheres_radius,
+                        sphere_shift=sphere_shift, sphere_fill=sphere_fill, porositySwitch=0.75, cells_per_block=(20, 20, 20), timesteps=1000)
+    scenarios.add(scenario)
+
+def Artery():
+    scenarios = wlb.ScenarioManager()
+    mesh_file = "Artery.obj"
+    scenario = Scenario(vtk_write_frequency=1000, geometry_setup="artery", mesh_file=mesh_file, timesteps=1000, omega=1.9, cells_per_block=(10, 10, 10), porositySwitch=0.5)
+    scenarios.add(scenario)
+
+def particleBed():
+    scenarios = wlb.ScenarioManager()
+    mesh_file = "Artery.obj"
+    scenario = Scenario(geometry_setup="particleBed", vtk_write_frequency=100, timesteps=1000, omega=1.9, cells_per_block=(50, 100, 50), porositySwitch=0.8)
+    scenarios.add(scenario)
+
+#randomNoslip()
+#spheres()
+#Artery()
+#particleBed()
+emptyChannel()
\ No newline at end of file
--- a/apps/benchmarks/ListLBM/LagoonSparse/CMakeLists.txt
+++ b/apps/benchmarks/ListLBM/LagoonSparse/CMakeLists.txt
+waLBerla_link_files_to_builddir( *.prm )
+waLBerla_link_files_to_builddir( *.png )
+
+if ( WALBERLA_BUILD_WITH_OPENMESH AND WALBERLA_BUILD_WITH_CODEGEN )
+
+    waLBerla_link_files_to_builddir( *.obj )
+    waLBerla_link_files_to_builddir( *.stl )
+    waLBerla_link_files_to_builddir( *.mtl )
+
+    if( WALBERLA_BUILD_WITH_CUDA)
+        waLBerla_generate_target_from_python(NAME LagoonSparseCodeGen
+                FILE LagoonSparse.py
+                OUT_FILES LBSweep.cu LBSweep.h
+                ListLBMList.cpp ListLBMList.h
+                MacroSetter.cpp MacroSetter.h
+                UBB.cu UBB.h
+                Pressure.cu Pressure.h
+                NoSlip.cu NoSlip.h
+                PackInfoEven.cu PackInfoEven.h
+                PackInfoOdd.cu PackInfoOdd.h
+                ListLBMInfoHeader.h)
+
+        waLBerla_add_executable ( NAME LagoonSparse
+                FILES LagoonSparse.cpp
+                DEPENDS blockforest boundary core cuda field lbm mesh stencil timeloop vtk LagoonSparseCodeGen)
+    else()
+        waLBerla_generate_target_from_python(NAME LagoonSparseCodeGen
+                FILE LagoonSparse.py
+                OUT_FILES LBSweep.cpp LBSweep.h
+                ListLBMList.cpp ListLBMList.h
+                MacroSetter.cpp MacroSetter.h
+                UBB.cpp UBB.h
+                Pressure.cpp Pressure.h
+                NoSlip.cpp NoSlip.h
+                PackInfoEven.cpp PackInfoEven.h
+                PackInfoOdd.cpp PackInfoOdd.h
+                ListLBMInfoHeader.h)
+
+        waLBerla_add_executable ( NAME LagoonSparse
+                FILES LagoonSparse.cpp
+                DEPENDS blockforest boundary core field lbm mesh stencil timeloop vtk LagoonSparseCodeGen)
+    endif()
+endif()
--- a/apps/benchmarks/ListLBM/LagoonSparse/Lagoon.mtl
+++ b/apps/benchmarks/ListLBM/LagoonSparse/Lagoon.mtl
+# Blender MTL File: 'None'
+# Material Count: 1
+
+newmtl None
+Ns 500
+Ka 0.8 0.8 0.8
+Kd 0.8 0.8 0.8
+Ks 0.8 0.8 0.8
+d 1
+illum 2
--- a/apps/benchmarks/ListLBM/LagoonSparse/Lagoon.obj
+++ b/apps/benchmarks/ListLBM/LagoonSparse/Lagoon.obj
--- a/apps/benchmarks/ListLBM/LagoonSparse/LagoonSparse.cpp
+++ b/apps/benchmarks/ListLBM/LagoonSparse/LagoonSparse.cpp
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file ListLBM.cpp
+//! \author Philipp Suffa
+//
+//======================================================================================================================
+
+#include "blockforest/communication/UniformBufferedScheme.h"
+#include "blockforest/Initialization.h"
+
+#include "core/Environment.h"
+#include "core/SharedFunctor.h"
+#include "core/timing/RemainingTimeLogger.h"
+#include "core/MemoryUsage.h"
+
+#include "field/AddToStorage.h"
+#include "field/StabilityChecker.h"
+#include "field/vtk/VTKWriter.h"
+
+#include "geometry/InitBoundaryHandling.h"
+
+#include "lbm/communication/PdfFieldPackInfo.h"
+#include "lbm/PerformanceEvaluation.h"
+#include "lbm/list/AddToStorage.h"
+#include "lbm/list/ListVTK.h"
+
+#include "mesh/blockforest/BlockExclusion.h"
+#include "mesh/blockforest/BlockForestInitialization.h"
+#include "mesh/boundary/BoundaryInfo.h"
+#include "mesh/boundary/BoundaryLocation.h"
+#include "mesh/boundary/BoundaryLocationFunction.h"
+#include "mesh/boundary/BoundarySetup.h"
+#include "mesh/boundary/BoundaryUIDFaceDataSource.h"
+#include "mesh/boundary/ColorToBoundaryMapper.h"
+
+#include "timeloop/SweepTimeloop.h"
+
+#include "mesh_common/DistanceComputations.h"
+#include "mesh_common/DistanceFunction.h"
+#include "mesh_common/MatrixVectorOperations.h"
+#include "mesh_common/MeshIO.h"
+#include "mesh_common/MeshOperations.h"
+#include "mesh_common/TriangleMeshes.h"
+#include "mesh_common/distance_octree/DistanceOctree.h"
+#include "mesh_common/vtk/CommonDataSources.h"
+#include "mesh_common/vtk/VTKMeshWriter.h"
+
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+#   include "cuda/AddGPUFieldToStorage.h"
+#   include "cuda/DeviceSelectMPI.h"
+#   include "cuda/HostFieldAllocator.h"
+#   include "cuda/NVTX.h"
+#   include "cuda/ParallelStreams.h"
+#   include "cuda/communication/UniformGPUScheme.h"
+#   include "cuda/lbm/CombinedInPlaceGpuPackInfo.h"
+#else
+#   include "lbm/communication/CombinedInPlaceCpuPackInfo.h"
+#endif
+
+#include "ListLBMInfoHeader.h"
+#include <iostream>
+#include <fstream>
+
+using namespace walberla;
+using PackInfoEven_T = lbmpy::PackInfoEven;
+using PackInfoOdd_T = lbmpy::PackInfoOdd;
+
+uint_t numGhostLayers = uint_t(1);
+
+using flag_t = walberla::uint8_t;
+using FlagField_T = FlagField<flag_t>;
+
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+using GPUField = cuda::GPUField< real_t >;
+#endif
+
+template<typename MeshType>
+void vertexToFaceColor(MeshType &mesh, const typename MeshType::Color &defaultColor) {
+   WALBERLA_CHECK(mesh.has_vertex_colors())
+   mesh.request_face_colors();
+
+   for (auto faceIt = mesh.faces_begin(); faceIt != mesh.faces_end(); ++faceIt) {
+      typename MeshType::Color vertexColor;
+
+      bool useVertexColor = true;
+
+      auto vertexIt = mesh.fv_iter(*faceIt);
+      WALBERLA_ASSERT(vertexIt.is_valid())
+
+      vertexColor = mesh.color(*vertexIt);
+
+      ++vertexIt;
+      while (vertexIt.is_valid() && useVertexColor) {
+         if (vertexColor != mesh.color(*vertexIt)) useVertexColor = false;
+         ++vertexIt;
+      }
+
+      mesh.set_color(*faceIt, useVertexColor ? vertexColor : defaultColor);
+   }
+}
+
+int main(int argc, char **argv)
+{
+   walberla::Environment walberlaEnv(argc, argv);
+
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+   cuda::selectDeviceBasedOnMpiRank();
+   WALBERLA_CUDA_CHECK(cudaPeekAtLastError())
+#endif
+
+   mpi::MPIManager::instance()->useWorldComm();
+
+   ///////////////////////
+   /// PARAMETER INPUT ///
+   ///////////////////////
+   auto parameters = walberlaEnv.config()->getOneBlock("Parameters");
+   real_t reference_velocity = parameters.getParameter< real_t >("reference_velocity", real_c(0.9));
+   real_t viscosity = parameters.getParameter< real_t >("viscosity", real_c(0.9));
+   const Vector3< real_t > initialVelocity = parameters.getParameter< Vector3< real_t > >("initialVelocity", Vector3< real_t >());
+   const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(10));
+   Vector3< int > InnerOuterSplit = parameters.getParameter< Vector3< int > >("innerOuterSplit", Vector3< int >(1, 1, 1));
+   const bool weak_scaling = parameters.getParameter< bool >("weakScaling", false); // weak or strong scaling
+   const real_t remainingTimeLoggerFrequency = parameters.getParameter< real_t >("remainingTimeLoggerFrequency", 3.0); // in seconds
+   auto loggingParameters         = walberlaEnv.config()->getOneBlock("Logging");
+   const bool WriteDistanceOctree = loggingParameters.getParameter< bool >("WriteDistanceOctree", false);
+   auto domainParameters = walberlaEnv.config()->getOneBlock("DomainSetup");
+   std::string meshFile  = domainParameters.getParameter< std::string >("meshFile");
+   const Vector3< bool > periodicity = domainParameters.getParameter< Vector3< bool > >("periodic", Vector3< bool >(false));
+
+
+   Vector3< uint_t > cellsPerBlock;
+   Vector3< uint_t > blocksPerDimension;
+   uint_t nrOfProcesses = uint_c(MPIManager::instance()->numProcesses());
+
+   if (!domainParameters.isDefined("blocks"))
+   {
+      if (weak_scaling)
+      {
+         Vector3< uint_t > cells = domainParameters.getParameter< Vector3< uint_t > >("cellsPerBlock");
+         blockforest::calculateCellDistribution(cells, nrOfProcesses, blocksPerDimension, cellsPerBlock);
+         cellsPerBlock = cells;
+      }
+      else
+      {
+         Vector3< uint_t > cells = domainParameters.getParameter< Vector3< uint_t > >("cellsPerBlock");
+         blockforest::calculateCellDistribution(cells, nrOfProcesses, blocksPerDimension, cellsPerBlock);
+      }
+   }
+   else
+   {
+      cellsPerBlock      = domainParameters.getParameter< Vector3< uint_t > >("cellsPerBlock");
+      blocksPerDimension = domainParameters.getParameter< Vector3< uint_t > >("blocks");
+   }
+
+   WALBERLA_LOG_DEVEL_VAR_ON_ROOT(cellsPerBlock)
+   WALBERLA_LOG_DEVEL_VAR_ON_ROOT(blocksPerDimension)
+
+   const Vector3< uint_t > TotalCells(cellsPerBlock[0] * blocksPerDimension[0],
+                                      cellsPerBlock[1] * blocksPerDimension[1],
+                                      cellsPerBlock[2] * blocksPerDimension[2]);
+   const real_t scalingFactor = 0.03;// (32.0 / real_c(TotalCells.min())) * 0.1;
+
+   const Vector3< real_t > dx(scalingFactor, scalingFactor, scalingFactor);
+
+   ////////////////////
+   /// PROCESS MESH ///
+   ////////////////////
+
+   WALBERLA_LOG_INFO_ON_ROOT("Using mesh from " << meshFile << ".")
+
+   // read in mesh with vertex colors on a single process and broadcast it
+   auto mesh = make_shared< mesh::TriangleMesh >();
+   mesh->request_vertex_colors();
+   mesh::readAndBroadcast(meshFile, *mesh);
+
+   vertexToFaceColor(*mesh, mesh::TriangleMesh::Color(255, 255, 255));
+   auto triDist = make_shared< mesh::TriangleDistance< mesh::TriangleMesh > >(mesh);
+   auto distanceOctree = make_shared< mesh::DistanceOctree< mesh::TriangleMesh > >(triDist);
+   if (WriteDistanceOctree) { distanceOctree->writeVTKOutput("distanceOctree"); }
+
+   const uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", uint_t(0));
+
+   ///////////////////////////
+   /// CREATE BLOCK FOREST ///
+   ///////////////////////////
+
+   auto aabb = computeAABB(*mesh);
+   mesh::ComplexGeometryStructuredBlockforestCreator bfc(aabb, dx, mesh::makeExcludeMeshInterior(distanceOctree, dx[0]));
+
+   bfc.setPeriodicity(periodicity);
+   auto blocks = bfc.createStructuredBlockForest(cellsPerBlock, blocksPerDimension);
+   WALBERLA_LOG_INFO_ON_ROOT("Created Blockforest")
+   BlockDataID flagFieldId = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field");
+
+   uint_t cells_y          = blocks->getYSize() * cellsPerBlock[1];
+   real_t maximum_veloctiy = initialVelocity[0];
+
+   real_t Cu = reference_velocity / maximum_veloctiy;
+   real_t Ct = dx.min() / Cu;
+
+   real_t viscosityLattice = viscosity * Ct / (dx.min() * dx.min());
+   real_t omega            = real_c(1.0 / (3.0 * viscosityLattice + 0.5));
+
+   real_t reynolds_number = (real_t(cells_y) * maximum_veloctiy) / viscosityLattice;
+
+   /////////////////////////
+   /// BOUNDARY HANDLING ///
+   /////////////////////////
+
+   // create and initialize boundary handling
+   const FlagUID fluidFlagUID("Fluid");
+   const FlagUID noslipFlagUID("NoSlip");
+   static walberla::BoundaryUID wallFlagUID("NoSlip");
+   const FlagUID inflowUID("UBB");
+   const FlagUID PressureOutflowUID("PressureOutflow");
+
+   auto boundariesConfig = walberlaEnv.config()->getOneBlock("Boundaries");
+
+   // set NoSlip UID to boundaries that we colored
+   mesh::ColorToBoundaryMapper< mesh::TriangleMesh > colorToBoundaryMapper((mesh::BoundaryInfo(wallFlagUID)));
+   colorToBoundaryMapper.set(mesh::TriangleMesh::Color(255, 255, 255), mesh::BoundaryInfo(wallFlagUID));
+
+   // mark boundaries
+   auto boundaryLocations = colorToBoundaryMapper.addBoundaryInfoToMesh(*mesh);
+
+   // write mesh info to file
+   if(vtkWriteFrequency > 0) {
+      mesh::VTKMeshWriter< mesh::TriangleMesh > meshWriter(mesh, "meshBoundaries", 1);
+      meshWriter.addDataSource(make_shared< mesh::BoundaryUIDFaceDataSource< mesh::TriangleMesh > >(boundaryLocations));
+      meshWriter.addDataSource(make_shared< mesh::ColorFaceDataSource< mesh::TriangleMesh > >());
+      meshWriter.addDataSource(make_shared< mesh::ColorVertexDataSource< mesh::TriangleMesh > >());
+      meshWriter();
+   }
+
+   // voxelize mesh
+   mesh::BoundarySetup boundarySetup(blocks, makeMeshDistanceFunction(distanceOctree), numGhostLayers);
+
+   geometry::initBoundaryHandling<FlagField_T>(*blocks, flagFieldId, boundariesConfig);
+   boundarySetup.setFlag<FlagField_T>(flagFieldId, FlagUID("NoSlip"), mesh::BoundarySetup::INSIDE);
+   geometry::setNonBoundaryCellsToDomain<FlagField_T>(*blocks, flagFieldId, fluidFlagUID);
+
+   for (uint_t i = 0; i < 3; ++i) {
+      if (int_c(cellsPerBlock[i]) <= InnerOuterSplit[i] * 2) {
+         WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock")
+      }
+   }
+
+   ////////////////////////////////////
+   /// CREATE AND INITIALIZE FIELDS ///
+   ////////////////////////////////////
+
+   // create fields
+   WALBERLA_LOG_INFO_ON_ROOT("Running Simulation with indirect addressing")
+   BlockDataID pdfListId   = lbm::addListToStorage< List_T >(blocks, "LBM list (FIdx)", InnerOuterSplit);
+   WALBERLA_LOG_INFO_ON_ROOT("Start initialisation of the linked-list structure")
+
+   WcTimer lbmTimer;
+   for (auto& block : *blocks)
+   {
+      auto* lbmList = block.getData< List_T >(pdfListId);
+      WALBERLA_CHECK_NOT_NULLPTR(lbmList)
+      lbmList->fillFromFlagField< FlagField_T >(block, flagFieldId, fluidFlagUID);
+   }
+
+
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+   const Vector3< int32_t > gpuBlockSize = parameters.getParameter< Vector3< int32_t > >("gpuBlockSize", Vector3< int32_t >(128, 1, 1));
+   lbmpy::LBSweep kernel(pdfListId, omega, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2]);
+#else
+   lbmpy::LBSweep kernel(pdfListId, omega);
+#endif
+
+
+   auto tracker = make_shared<lbm::TimestepTracker>(0);
+   lbmpy::MacroSetter setterSweep(pdfListId);
+   lbmpy::UBB ubb(blocks, pdfListId, initialVelocity[0]);
+   lbmpy::Pressure pressureOutflow(blocks, pdfListId, 1.0);
+   //lbm::NoSlip noSlip(blocks, pdfListId);
+
+   ubb.fillFromFlagField< FlagField_T >(blocks, flagFieldId, inflowUID, fluidFlagUID);
+   pressureOutflow.fillFromFlagField< FlagField_T >(blocks, flagFieldId, PressureOutflowUID, fluidFlagUID);
+   //noSlip.fillFromFlagField< FlagField_T >(blocks, flagFieldId, noslipFlagUID, fluidFlagUID);
+   for (auto& block : *blocks)
+   {
+      setterSweep(&block);
+   }
+   lbmpy::ListCommunicationSetup(pdfListId, blocks);
+   lbmTimer.end();
+   WALBERLA_LOG_INFO_ON_ROOT("Initialisation of the list structures needed " << lbmTimer.last() << " s")
+
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+
+   const bool cudaEnabledMPI = parameters.getParameter< bool >("cudaEnabledMPI", true);
+   auto packInfo = make_shared<lbm::CombinedInPlaceGpuPackInfo<PackInfoEven_T, PackInfoOdd_T> >(tracker, pdfListId);
+   cuda::communication::UniformGPUScheme< Stencil_T > comm(blocks, cudaEnabledMPI);
+   comm.addPackInfo(packInfo);
+   auto communicate = std::function< void() >([&]() { comm.communicate(nullptr); });
+   auto start_communicate = std::function< void() >([&]() { comm.startCommunication(nullptr); });
+   auto wait_communicate = std::function< void() >([&]() { comm.wait(nullptr); });
+   WALBERLA_LOG_INFO_ON_ROOT("Finished setting up communication and start first communication")
+
+   // TODO: Data for List LBM is synced at first communication. Should be fixed ...
+   comm.communicate();
+   WALBERLA_LOG_INFO_ON_ROOT("Finished first communication")
+#else
+   auto packInfo = make_shared<lbm::CombinedInPlaceCpuPackInfo<PackInfoEven_T, PackInfoOdd_T> >(tracker, pdfListId);
+   blockforest::communication::UniformBufferedScheme< Stencil_T > comm(blocks);
+   comm.addPackInfo(packInfo);
+   auto communicate = std::function< void() >([&]() { comm.communicate(); });
+   auto start_communicate = std::function< void() >([&]() { comm.startCommunication(); });
+   auto wait_communicate = std::function< void() >([&]() { comm.wait(); });
+#endif
+
+
+
+   //////////////////////////////////
+   /// SET UP SWEEPS AND TIMELOOP ///
+   //////////////////////////////////
+
+   const std::string timeStepStrategy = parameters.getParameter<std::string>("timeStepStrategy", "noOverlap");
+   const bool runBoundaries = parameters.getParameter<bool>("runBoundaries", true);
+
+   auto normalTimeStep = [&]() {
+      communicate();
+      for (auto& block : *blocks)
+      {
+         if (runBoundaries) {
+            ubb(&block, tracker->getCounter());
+            pressureOutflow(&block, tracker->getCounter());
+         }
+         kernel(&block, tracker->getCounterPlusOne());
+      }
+      tracker->advance();
+   };
+
+   auto simpleOverlapTimeStep = [&]() {
+      start_communicate();
+      for (auto& block : *blocks)
+      {
+         if (runBoundaries) {
+            ubb(&block, tracker->getCounter());
+            pressureOutflow(&block, tracker->getCounter());
+         }
+         kernel.inner(&block, tracker->getCounterPlusOne());
+      }
+      wait_communicate();
+      for (auto& block : *blocks)
+      {
+         kernel.outer(&block, tracker->getCounterPlusOne());
+      }
+      tracker->advance();
+   };
+
+   auto kernelOnlyFunc = [&]() {
+      tracker->advance();
+      for (auto& block : *blocks)
+         kernel(&block, tracker->getCounter());
+   };
+
+
+   std::function< void() > timeStep;
+   if (timeStepStrategy == "noOverlap")
+      timeStep = normalTimeStep;
+   else if (timeStepStrategy == "Overlap")
+      timeStep = simpleOverlapTimeStep;
+   else if (timeStepStrategy == "kernelOnly")
+   {
+      WALBERLA_LOG_INFO_ON_ROOT("Running only compute kernel without boundary - this makes only sense for benchmarking!")
+      communicate();
+      timeStep = kernelOnlyFunc;
+   }
+   else
+   {
+      WALBERLA_ABORT_NO_DEBUG_INFO("Invalid value for 'timeStepStrategy'. Allowed values are 'noOverlap', "
+                                   "'simpleOverlap', 'kernelOnly'")
+   }
+
+   SweepTimeloop timeloop(blocks->getBlockStorage(), timesteps);
+   timeloop.add() << BeforeFunction(timeStep, "Timestep") << Sweep([](IBlock*) {}, "Dummy");
+
+
+   //////////////////////////////////
+   ///       VTK AND STUFF        ///
+   //////////////////////////////////
+
+
+   // log remaining time
+   timeloop.addFuncAfterTimeStep(
+      timing::RemainingTimeLogger(timeloop.getNrOfTimeSteps(), remainingTimeLoggerFrequency),
+      "remaining time logger");
+
+   //////////////////
+   /// VTK OUTPUT ///
+   //////////////////
+
+   if (vtkWriteFrequency > 0)
+   {
+      auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtkList", vtkWriteFrequency, 0, false, "vtk_out",
+                                                      "simulation_step", false, true, true, false, 0);
+
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+      vtkOutput->addBeforeFunction([&]() {
+         for (auto& block : *blocks)
+         {
+            List_T* lbmList = block.getData< List_T >(pdfListId);
+            lbmList->copyPDFSToCPU();
+         }
+      });
+#endif
+
+      vtkOutput->addCellInclusionFilter(lbm::ListFluidFilter< List_T >(pdfListId));
+      auto velWriter = make_shared< lbm::ListVelocityVTKWriter< List_T, float > >(pdfListId, tracker, "velocity");
+      auto densityWriter = make_shared< lbm::ListDensityVTKWriter< List_T, float > >(pdfListId, "density");
+      vtkOutput->addCellDataWriter(velWriter);
+      vtkOutput->addCellDataWriter(densityWriter);
+      timeloop.addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
+   }
+
+   //////////////////////
+   /// RUN SIMULATION ///
+   //////////////////////
+   lbm::PerformanceEvaluation<FlagField_T> performance(blocks, flagFieldId, fluidFlagUID);
+
+   WALBERLA_LOG_INFO_ON_ROOT("Simulating ListLBM:"
+                             "\n timesteps:                  "
+                             << timesteps << "\n simulation time in seconds: " << real_t(timesteps) * Ct
+                             << "\n reynolds number:            " << reynolds_number
+                             << "\n relaxation rate:            " << omega
+                             << "\n maximum inflow velocity:    " << maximum_veloctiy)
+
+   int warmupSteps = parameters.getParameter<int>("warmupSteps", 2);
+   int outerIterations = parameters.getParameter<int>("outerIterations", 1);
+   for (int i = 0; i < warmupSteps; ++i)
+      timeloop.singleStep();
+
+   for (int outerIteration = 0; outerIteration < outerIterations; ++outerIteration) {
+      timeloop.setCurrentTimeStepToZero();
+
+      WcTimingPool timeloopTiming;
+      WcTimer simTimer;
+
+      WALBERLA_MPI_WORLD_BARRIER()
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+      WALBERLA_CUDA_CHECK(cudaPeekAtLastError())
+      cudaDeviceSynchronize();
+#endif
+
+      simTimer.start();
+      timeloop.run(timeloopTiming);
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+      WALBERLA_CUDA_CHECK(cudaPeekAtLastError())
+      cudaDeviceSynchronize();
+#endif
+      simTimer.end();
+
+      WALBERLA_LOG_INFO_ON_ROOT("Simulation finished")
+      real_t time = simTimer.max();
+      WALBERLA_MPI_SECTION() { walberla::mpi::reduceInplace(time, walberla::mpi::MAX); }
+      performance.logResultOnRoot(timesteps, time);
+
+      const auto reducedTimeloopTiming = timeloopTiming.getReduced();
+      WALBERLA_LOG_RESULT_ON_ROOT("Time loop timing:\n" << *reducedTimeloopTiming)
+   }
+   //printResidentMemoryStatistics();
+   return EXIT_SUCCESS;
+}
--- a/apps/benchmarks/ListLBM/LagoonSparse/LagoonSparse.prm
+++ b/apps/benchmarks/ListLBM/LagoonSparse/LagoonSparse.prm
+
+Parameters 
+{
+	reference_velocity  0.9;
+    viscosity  0.015;
+	initialVelocity < 0.01, 0, 0 >;
+	timesteps       50;
+
+    timeStepStrategy noOverlap;
+    innerOuterSplit     < 1, 1, 1>;
+    weakScaling true;
+    cudaEnabledMPI true;
+
+    runBoundaries true;
+
+	remainingTimeLoggerFrequency 10; // in seconds
+	vtkWriteFrequency 1;
+}
+
+//! [domainSetup]
+DomainSetup
+{
+   meshFile         Lagoon.obj;
+   cellsPerBlock    < 64, 64, 10 >;
+   periodic         <  0,  0,  0 >;
+}
+//! [domainSetup]
+
+Boundaries
+{
+    Border { direction N;    walldistance -1;  flag NoSlip; }
+    Border { direction S;    walldistance -1;  flag NoSlip; }
+    Border { direction W;    walldistance -1;  flag UBB; }
+    Border { direction E;    walldistance -1;  flag PressureOutflow; }
+    Border { direction T;    walldistance -1;  flag NoSlip; }
+    Border { direction B;    walldistance -1;  flag NoSlip; }
+}
+
+Logging
+{
+    WriteDistanceOctree false;
+}
\ No newline at end of file
--- a/apps/benchmarks/ListLBM/LagoonSparse/LagoonSparse.py
+++ b/apps/benchmarks/ListLBM/LagoonSparse/LagoonSparse.py
+import sympy as sp
+import numpy as np
+from pystencils import Field, FieldType, TypedSymbol, Target
+
+from pystencils.field import fields
+
+from lbmpy import Stencil, LBStencil, Method, LBMConfig, LBMOptimisation
+from lbmpy.boundaries.boundaryconditions import FixedDensity, UBB, NoSlip
+from lbmpy.creationfunctions import create_lb_method, create_lb_collision_rule
+from lbmpy.sparse import create_macroscopic_value_setter_sparse, create_lb_update_rule_sparse
+from lbmpy.advanced_streaming import is_inplace
+
+from pystencils_walberla import CodeGeneration, generate_sweep, generate_info_header
+from lbmpy_walberla.sparse import generate_list_class, generate_sparse_sweep, generate_sparse_boundary, generate_sparse_pack_info, generate_alternating_sparse_lbm_sweep, generate_alternating_sparse_boundary, generate_alternating_sparse_pack_info
+
+with CodeGeneration() as ctx:
+
+    dtype = 'float64' if ctx.double_accuracy else 'float32'
+
+    stencil = LBStencil(Stencil.D3Q19)
+    q = stencil.Q
+    dim = stencil.D
+
+    omega = sp.symbols("omega")
+    inlet_velocity = sp.symbols("u_x")
+
+    pdfs, pdfs_tmp = fields(f"pdf_field({q}), pdf_field_tmp({q}): {dtype}[1D]", layout='fzyx')
+    pdfs.field_type = FieldType.CUSTOM
+    pdfs_tmp.field_type = FieldType.CUSTOM
+    index_list = Field.create_generic("idx", spatial_dimensions=1, index_dimensions=1, dtype=np.uint32)
+
+    lbm_config = LBMConfig(
+        method=Method.SRT,
+        stencil=stencil,
+        relaxation_rate=omega,
+        streaming_pattern='aa'
+    )
+
+    lbm_opt = LBMOptimisation(
+        cse_global=False,
+        cse_pdfs=False,
+    )
+
+    if not is_inplace(lbm_config.streaming_pattern):
+        field_swaps=[(pdfs, pdfs_tmp)]
+    else:
+        field_swaps=[]
+
+    if ctx.cuda:
+        target = Target.GPU
+        vp = [('int32_t', 'cudaBlockSize0'),
+              ('int32_t', 'cudaBlockSize1'),
+              ('int32_t', 'cudaBlockSize2')]
+
+        sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32),
+                            TypedSymbol("cudaBlockSize1", np.int32),
+                            TypedSymbol("cudaBlockSize2", np.int32))
+
+        sweep_params = {'block_size': sweep_block_size}
+
+    else:
+        target = Target.CPU
+        sweep_params = {}
+        vp = ()
+
+    method = create_lb_method(lbm_config=lbm_config)
+    collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt)
+
+    generated_list_class_name = "ListLBMList"
+    stencil_typedefs = {'Stencil_T': stencil}
+    list_template = f"using List_T = walberla::lbmpy::{generated_list_class_name};"
+    generate_list_class(ctx, generated_list_class_name, index_list, pdfs, stencil, target=target)
+
+    sparse_setter_eqs = create_macroscopic_value_setter_sparse(method, pdfs, 1.0, (0.0, 0.0, 0.0))
+    generate_sparse_sweep(ctx, 'MacroSetter', sparse_setter_eqs, stencil=stencil, target=Target.CPU)
+
+    generate_alternating_sparse_lbm_sweep(ctx, 'LBSweep', collision_rule, lbm_config, pdfs, index_list, dst=pdfs_tmp, field_swaps=field_swaps, target=target, inner_outer_split=False, varying_parameters=vp, gpu_indexing_params=sweep_params)
+
+    ubb = UBB((inlet_velocity, 0, 0))
+    generate_alternating_sparse_boundary(ctx, 'UBB', ubb, method, streaming_pattern=lbm_config.streaming_pattern, target=target)
+    fixed_density = FixedDensity(sp.Symbol("density"))
+    generate_alternating_sparse_boundary(ctx, 'Pressure', fixed_density, method, streaming_pattern=lbm_config.streaming_pattern, target=target)
+    generate_alternating_sparse_boundary(ctx, 'NoSlip', NoSlip(), method, streaming_pattern=lbm_config.streaming_pattern, target=target)
+
+    generate_alternating_sparse_pack_info(ctx, 'PackInfo', stencil, lbm_config.streaming_pattern, target=target)
+
+    generate_info_header(ctx, 'ListLBMInfoHeader', stencil_typedefs=stencil_typedefs, additional_code=list_template)
--- a/pybind11 @ f7b49961
+++ b/pybind11 @ f7b49961
-Subproject commit f7b499615e14d70ab098a20deb0cdb3889998a1a
--- a/python/lbmpy_walberla/packinfo.py
+++ b/python/lbmpy_walberla/packinfo.py
@@ -81,7 +81,6 @@ def generate_lb_pack_info(generation_context,
                d = stencil.index(streaming_dir)
                fa = write_accesses[d]
                spec[(comm_dir,)].add(fa)
-
        if t == Timestep.EVEN:
            class_name_suffix = 'Even'
        elif t == Timestep.ODD:

--- a/python/lbmpy_walberla/sparse/__init__.py
+++ b/python/lbmpy_walberla/sparse/__init__.py
+from .codegen import generate_sparse_sweep, generate_sparse_pack_info, generate_alternating_sparse_lbm_sweep, generate_alternating_sparse_pack_info
+from .boundary import generate_sparse_boundary, generate_alternating_sparse_boundary
+from .list_codegen import generate_list_class
+
+__all__ = ['generate_sparse_sweep', 'generate_sparse_boundary', 'generate_sparse_pack_info', 'generate_list_class', 'generate_alternating_sparse_lbm_sweep', 'generate_alternating_sparse_boundary', 'generate_alternating_sparse_pack_info']
No results found