Compare revisions
Showing 2377 additions and 67 deletions
waLBerla_link_files_to_builddir( "*.prm" )
waLBerla_link_files_to_builddir( "*.py" )
waLBerla_generate_target_from_python(NAME TaylorGreenVortexGenerated
FILE TaylorGreenVortex.py
OUT_FILES TaylorGreenVortexStorageSpecification.h TaylorGreenVortexStorageSpecification.cpp
TaylorGreenVortexSweepCollection.h TaylorGreenVortexSweepCollection.cpp
NoSlip.h NoSlip.cpp
UBB.h UBB.cpp
TaylorGreenVortexBoundaryCollection.h
TaylorGreenVortexHeader.h)
waLBerla_add_executable ( NAME TaylorGreenVortex
FILES TaylorGreenVortex.cpp
DEPENDS TaylorGreenVortexGenerated blockforest core field geometry lbm_generated timeloop )
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file TaylorGreenVortex.cpp
//! \author Markus Holzer <markus.holzer@fau.de>
//! \brief TaylorGreenVortex
//
//======================================================================================================================
#include "blockforest/Initialization.h"
#include "blockforest/SetupBlockForest.h"
#include "blockforest/communication/UniformBufferedScheme.h"
#include "blockforest/loadbalancing/StaticCurve.h"
#include "core/DataTypes.h"
#include "core/Environment.h"
#include "core/MemoryUsage.h"
#include "core/SharedFunctor.h"
#include "core/debug/TestSubsystem.h"
#include "core/logging/Initialization.h"
#include "core/math/Vector3.h"
#include "core/timing/RemainingTimeLogger.h"
#include "field/AddToStorage.h"
#include "field/FlagField.h"
#include "field/GhostLayerField.h"
#include "field/vtk/VTKWriter.h"
#include "geometry/InitBoundaryHandling.h"
#include "timeloop/SweepTimeloop.h"
#include "lbm_generated/communication/UniformGeneratedPdfPackInfo.h"
#include "lbm_generated/evaluation/PerformanceEvaluation.h"
#include "lbm_generated/field/AddToStorage.h"
#include "lbm_generated/field/PdfField.h"
// include the generated header file. It includes all generated classes
#include "TaylorGreenVortexHeader.h"
#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG )
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wunused-variable"
# pragma GCC diagnostic ignored "-Wignored-qualifiers"
#endif
using namespace walberla;
using namespace std::placeholders;
using StorageSpecification_T = lbm::TaylorGreenVortexStorageSpecification;
using Stencil_T = StorageSpecification_T::Stencil;
using CommunicationStencil_T = StorageSpecification_T::CommunicationStencil;
using PdfField_T = lbm_generated::PdfField< StorageSpecification_T >;
using PackInfo_T = lbm_generated::UniformGeneratedPdfPackInfo< PdfField_T >;
using SweepCollection_T = lbm::TaylorGreenVortexSweepCollection;
using VectorField_T = GhostLayerField< real_t, StorageSpecification_T::Stencil::D >;
using ScalarField_T = GhostLayerField< real_t, 1 >;
using flag_t = walberla::uint8_t;
using FlagField_T = FlagField< flag_t >;
using BoundaryCollection_T = lbm::TaylorGreenVortexBoundaryCollection< FlagField_T >;
using blockforest::communication::UniformBufferedScheme;
struct IDs
{
BlockDataID pdfField;
BlockDataID velocityField;
BlockDataID densityField;
BlockDataID avgVelField;
BlockDataID avgVelSqrField;
BlockDataID avgPressureField;
BlockDataID flagField;
BlockDataID pdfFieldGPU;
BlockDataID velocityFieldGPU;
BlockDataID densityFieldGPU;
};
struct Setup
{
Vector3< uint_t > blocks;
Vector3< uint_t > cellsPerBlock;
Vector3< uint_t > cells;
Vector3< bool > periodic;
uint_t numGhostLayers;
};
class AccuracyEvaluation
{
public:
AccuracyEvaluation(std::shared_ptr< StructuredBlockForest >& blocks, const IDs& ids,
SweepCollection_T& sweepCollection, const real_t maxLatticeVelocity, const uint_t logFrequency,
const bool logToStream, const bool logToFile)
: blocks_(blocks), ids_(ids), sweepCollection_(sweepCollection), maxLatticeVelocity_(maxLatticeVelocity),
executionCounter_(uint_c(0)), filename_("TaylorGreenVortex.txt"), normalizationFactor_(real_c(1.0))
{
logFrequency_ = logToStream ? logFrequency : uint_c(0);
plotFrequency_ = logToFile ? logFrequency : uint_c(0);
}
void setNormalizationFactor(const real_t f) { normalizationFactor_ = f; }
void setFilename(const std::string& filename) { filename_ = filename; }
real_t L1() const { return L1_; }
real_t L2() const { return L2_; }
real_t Lmax() const { return Lmax_; }
void operator()()
{
if (logFrequency_ == uint_c(0) && (plotFrequency_ == uint_c(0) || filename_.empty())) return;
++executionCounter_;
const bool plot = (plotFrequency_ != uint_c(0) && (executionCounter_ - uint_c(1)) % plotFrequency_ == uint_c(0) &&
!filename_.empty());
const bool log = (logFrequency_ != uint_c(0) && (executionCounter_ - uint_c(1)) % logFrequency_ == uint_c(0));
if (!log && !plot) return;
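// Volume-weighted error norms against the analytic profile, accumulated
// cell-by-cell below and reduced over all processes afterwards:
//   L1   = sum( |u - u_exact|   * dV / V )
//   L2   = sqrt( sum( |u - u_exact|^2 * dV / V ) )
//   Lmax = max( |u - u_exact| )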
const auto& domainAABB = blocks_->getDomain();
real_t _L1(real_c(0.0));
real_t _L2(real_c(0.0));
real_t _Lmax(real_c(0.0));
for (auto block = blocks_->begin(); block != blocks_->end(); ++block)
{
sweepCollection_.calculateMacroscopicParameters(block.get());
const VectorField_T* velocityField = block->template getData< const VectorField_T >(ids_.velocityField);
const auto level = blocks_->getLevel(*block);
const real_t volumeFraction =
(blocks_->dx(level) * blocks_->dy(level) * blocks_->dz(level)) / domainAABB.volume();
for (auto it = velocityField->beginXYZ(); it != velocityField->end(); ++it)
{
Vector3< real_t > center = blocks_->getBlockLocalCellCenter(*block, Cell(it.x(), it.y(), it.z()));
Vector3< real_t > exactVelocity(
real_c(maxLatticeVelocity_ * (center[1] - real_c(domainAABB.yMin())) / real_c(domainAABB.ySize())),
real_c(0.0), real_c(0.0));
Vector3< real_t > velocity(it.getF(0), it.getF(1), it.getF(2));
const auto error = velocity - exactVelocity;
const real_t diff = error.length();
_L1 += diff * volumeFraction;
_L2 += diff * diff * volumeFraction;
_Lmax = std::max(_Lmax, diff);
}
}
mpi::reduceInplace(_L1, mpi::SUM);
mpi::reduceInplace(_L2, mpi::SUM);
mpi::reduceInplace(_Lmax, mpi::MAX);
_L2 = std::sqrt(_L2);
L1_ = _L1;
L2_ = _L2;
Lmax_ = _Lmax;
WALBERLA_ROOT_SECTION()
{
if (plot && executionCounter_ == uint_t(1))
{
std::ofstream file(filename_.c_str());
file << "# accuracy evaluation"
<< "# step [1], L1 [2], L2 [3], Lmax [4]" << std::endl;
file.close();
}
if (log)
{
WALBERLA_LOG_INFO("Evaluation of accuracy:"
<< "\n - L1: " << L1_ << "\n - L2: " << L2_ << "\n - Lmax: " << Lmax_);
}
if (plot)
{
std::ofstream file(filename_.c_str(), std::ofstream::out | std::ofstream::app);
file << (executionCounter_ - uint_t(1)) << " " << L1_ << " " << L2_ << " " << Lmax_ << std::endl;
file.close();
}
}
}
private:
std::shared_ptr< StructuredBlockForest > blocks_;
IDs ids_;
SweepCollection_T& sweepCollection_;
real_t maxLatticeVelocity_;
uint_t executionCounter_;
uint_t plotFrequency_;
uint_t logFrequency_;
std::string filename_;
real_t normalizationFactor_;
real_t L1_;
real_t L2_;
real_t Lmax_;
}; // class AccuracyEvaluation
namespace
{
void workloadMemoryAndSUIDAssignment(SetupBlockForest& forest, const memory_t memoryPerBlock)
{
for (auto block = forest.begin(); block != forest.end(); ++block)
{
block->setWorkload(numeric_cast< workload_t >(uint_t(1) << block->getLevel()));
block->setMemory(memoryPerBlock);
}
}
shared_ptr< SetupBlockForest > createSetupBlockForest(const Setup& setup, uint_t numberOfProcesses, const memory_t memoryPerCell)
{
shared_ptr< SetupBlockForest > forest = make_shared< SetupBlockForest >();
const memory_t memoryPerBlock =
numeric_cast< memory_t >((setup.cellsPerBlock[0] + uint_t(2) * setup.numGhostLayers) *
(setup.cellsPerBlock[1] + uint_t(2) * setup.numGhostLayers) *
(setup.cellsPerBlock[2] + uint_t(2) * setup.numGhostLayers)) *
memoryPerCell;
forest->addWorkloadMemorySUIDAssignmentFunction(
std::bind(workloadMemoryAndSUIDAssignment, std::placeholders::_1, memoryPerBlock));
forest->init(
AABB(real_c(0), real_c(0), real_c(0), real_c(setup.cells[0]), real_c(setup.cells[1]), real_c(setup.cells[2])),
setup.blocks[0], setup.blocks[1], setup.blocks[2], setup.periodic[0], setup.periodic[1], setup.periodic[2]);
MPIManager::instance()->useWorldComm();
forest->balanceLoad(blockforest::StaticLevelwiseCurveBalanceWeighted(), numberOfProcesses);
// WALBERLA_LOG_INFO_ON_ROOT("SetupBlockForest created successfully:\n" << *forest);
return forest;
}
shared_ptr< blockforest::StructuredBlockForest > createStructuredBlockForest(const Setup& setup, const memory_t memoryPerCell)
{
// WALBERLA_LOG_INFO_ON_ROOT("Creating the block structure ...");
shared_ptr< SetupBlockForest > sforest = createSetupBlockForest(setup, uint_c(MPIManager::instance()->numProcesses()), memoryPerCell);
auto bf = std::make_shared< blockforest::BlockForest >(uint_c(MPIManager::instance()->rank()), *sforest, false);
auto sbf = std::make_shared< blockforest::StructuredBlockForest >(bf, setup.cellsPerBlock[0], setup.cellsPerBlock[1],
setup.cellsPerBlock[2]);
sbf->createCellBoundingBoxes();
return sbf;
}
} // namespace
class StreamCollide
{
public:
StreamCollide(const shared_ptr< StructuredBlockForest > & blocks, const IDs& ids, const real_t omega) : ids_(ids)
{
auto pdfsID = ids.pdfField;
_nBlocks = blocks->getNumberOfBlocks();
blocks->getBlocks(blockVector, 0);
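// blockVector collects only the level-0 blocks; this sweep and swap() operate
// exclusively on level 0. The loop below rescales the relaxation rate per
// refinement level (the time step halves on each finer level):
//   omega_level = omega / ( 2^level * (1 - omega/2) + omega/2 )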
for (uint_t level = 0; level < blocks->getNumberOfLevels(); level++)
{
const double level_scale_factor = double(uint_t(1) << level);
const double one = double(1.0);
const double half = double(0.5);
omegaVector.push_back( double(omega / (level_scale_factor * (-omega * half + one) + omega * half)) );
}
BlockDataID flagID = field::addFlagFieldToStorage< FlagField< uint8_t > >(blocks, "counter flags", uint_c(1));
for( auto it = blocks->begin(); it != blocks->end(); ++it )
{
Block * block = dynamic_cast< Block * >( it.get() );
auto * flagField = block->getData< FlagField< uint8_t > > ( flagID );
const uint8_t baseFlag = flagField->registerFlag(FlagUID("base"));
const uint8_t xFlagP = flagField->registerFlag(FlagUID("xP"));
const uint8_t xFlagM = flagField->registerFlag(FlagUID("xM"));
const uint8_t yFlagP = flagField->registerFlag(FlagUID("yP"));
const uint8_t yFlagM = flagField->registerFlag(FlagUID("yM"));
const uint8_t zFlagP = flagField->registerFlag(FlagUID("zP"));
const uint8_t zFlagM = flagField->registerFlag(FlagUID("zM"));
const uint8_t streamFlag = flagField->registerFlag(FlagUID("stream"));
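// registerFlag() assigns consecutive bits, so the layout is: bit 0 = base,
// bits 1-6 = cell on a block face towards +x,-x,+y,-y,+z,-z with an equally
// sized local neighbor, bit 7 = stream (ghost layer shadowing a coarser or
// missing neighbor). streamCollideEven() reads bits 1-6 to redirect pull
// accesses across block faces.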
for (auto dir = stencil::D3Q27::beginNoCenter(); dir != stencil::D3Q27::end(); ++dir) {
const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex(*dir);
if (block->getNeighborhoodSectionSize(neighborIdx) == uint_t(0)) {
CellInterval ci;
flagField->getSliceBeforeGhostLayer(*dir, ci);
for (auto cell: ci){flagField->addFlag(cell, baseFlag);}
continue;
}
if (block->neighborhoodSectionHasEquallySizedBlock(neighborIdx) && block->neighborExistsLocally( neighborIdx, uint_t(0) )) {
CellInterval ci;
flagField->getSliceBeforeGhostLayer(*dir, ci);
for (auto cell: ci){
if(stencil::cx[*dir] == 1) {flagField->addFlag(cell, xFlagP);}
if(stencil::cx[*dir] == -1) {flagField->addFlag(cell, xFlagM);}
if(stencil::cy[*dir] == 1) {flagField->addFlag(cell, yFlagP);}
if(stencil::cy[*dir] == -1) {flagField->addFlag(cell, yFlagM);}
if(stencil::cz[*dir] == 1) {flagField->addFlag(cell, zFlagP);}
if(stencil::cz[*dir] == -1) {flagField->addFlag(cell, zFlagM);}
}
continue;
}
// Propagate on ghost layers shadowing coarse or no blocks
if(block->neighborhoodSectionHasLargerBlock(neighborIdx)){
CellInterval ci;
flagField->getGhostRegion(*dir, ci, 1);
for (auto cell: ci){flagField->addFlag(cell, streamFlag);}
continue;
}
CellInterval ci;
flagField->getSliceBeforeGhostLayer(*dir, ci);
for (auto cell: ci){flagField->addFlag(cell, baseFlag);}
}
}
for( auto it = blocks->begin(); it != blocks->end(); ++it )
{
auto* local = dynamic_cast< Block* >(it.get());
auto pdfs_tmp = local->getData< PdfField_T >(ids_.pdfField)->cloneUninitialized();
cache_pdfs_[local] = pdfs_tmp;
}
flagPointers.resize(blocks->getNumberOfLevels());
pdfsPointers.resize(blocks->getNumberOfLevels());
pdfs_tmpPointers.resize(blocks->getNumberOfLevels());
for( auto it = blocks->begin(); it != blocks->end(); ++it )
{
auto* local = dynamic_cast< Block* >(it.get());
const uint_t level = local->getLevel();
for (cell_idx_t ctr_2 = -1; ctr_2 < 2; ++ctr_2){
for (cell_idx_t ctr_1 = -1; ctr_1 < 2; ++ctr_1){
for (cell_idx_t ctr_0 = -1; ctr_0 < 2; ++ctr_0){
auto dir = stencil::vectorToDirection(ctr_0, ctr_1, ctr_2);
if(dir == stencil::C)
{
auto flag = local->getData< field::GhostLayerField<uint8_t, 1> >(flagID);
flagPointers[level].emplace_back(flag->dataAt(0, 0, 0, 0));
auto pdfs = local->getData< field::GhostLayerField<double, 27> >(pdfsID);
pdfsPointers[level].emplace_back(pdfs->dataAt(0, 0, 0, 0));
auto pdfs_tmp = cache_pdfs_[local];
pdfs_tmpPointers[level].emplace_back(pdfs_tmp->dataAt(0, 0, 0, 0));
_size_flag_0 = int64_t(int64_c(flag->xSize()));
_size_flag_1 = int64_t(int64_c(flag->ySize()));
_size_flag_2 = int64_t(int64_c(flag->zSize()));
_stride_flag_0 = int64_t(flag->xStride());
_stride_flag_1 = int64_t(flag->yStride());
_stride_flag_2 = int64_t(flag->zStride());
_stride_flag_3 = int64_t(1 * int64_t(flag->fStride()));
_stride_pdfs_0 = int64_t(pdfs->xStride());
_stride_pdfs_1 = int64_t(pdfs->yStride());
_stride_pdfs_2 = int64_t(pdfs->zStride());
_stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride()));
_stride_pdfs_tmp_0 = int64_t(pdfs_tmp->xStride());
_stride_pdfs_tmp_1 = int64_t(pdfs_tmp->yStride());
_stride_pdfs_tmp_2 = int64_t(pdfs_tmp->zStride());
_stride_pdfs_tmp_3 = int64_t(1 * int64_t(pdfs_tmp->fStride()));
}
else{
const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex(dir);
if (local->neighborhoodSectionHasEquallySizedBlock(neighborIdx) && local->neighborExistsLocally( neighborIdx, uint_t(0) )) {
const BlockID& neighborId = local->getNeighborId(neighborIdx, uint_t(0));
auto neighbor = dynamic_cast< Block* >(blocks->getBlock(neighborId));
auto flag = neighbor->getData< field::GhostLayerField<uint8_t, 1> >(flagID);
auto pdfs = neighbor->getData< field::GhostLayerField<double, 27> >(pdfsID);
auto pdfs_tmp = cache_pdfs_[neighbor];
const cell_idx_t x = (stencil::cx[dir] == -1) ? cell_idx_c(pdfs->xSize() - 1) : 0;
const cell_idx_t y = (stencil::cy[dir] == -1) ? cell_idx_c(pdfs->ySize() - 1) : 0;
const cell_idx_t z = (stencil::cz[dir] == -1) ? cell_idx_c(pdfs->zSize() - 1) : 0;
flagPointers[level].emplace_back(flag->dataAt(x, y, z, 0));
pdfsPointers[level].emplace_back(pdfs->dataAt(x, y, z, 0));
pdfs_tmpPointers[level].emplace_back(pdfs_tmp->dataAt(x, y, z, 0));
}
else{
auto flag = local->getData< field::GhostLayerField<uint8_t, 1> >(flagID);
flagPointers[level].emplace_back(flag->dataAt(0, 0, 0, 0));
auto pdfs = local->getData< field::GhostLayerField<double, 27> >(pdfsID);
pdfsPointers[level].emplace_back(pdfs->dataAt(0, 0, 0, 0));
pdfs_tmpPointers[level].emplace_back(cache_pdfs_[local]->dataAt(0, 0, 0, 0));
}
}
}
}
}
}
}
void streamCollideEven()
{
uint8_t ** RESTRICT const _data_flag_dp = flagPointers[0].data();
double ** RESTRICT const _data_pdfs_dp = pdfsPointers[0].data();
double ** RESTRICT _data_pdfs_tmp_dp = pdfs_tmpPointers[0].data();
double omega = omegaVector[0];
const double xi_0 = ((1.0) / (omega*-0.25 + 2.0));
const double rr_0 = xi_0*(omega*-2.0 + 4.0);
for(uint_t index = 0; index < _nBlocks; ++index)
{
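// Each block owns 27 pointer-table entries, filled in the constructor in
// z-major order, i.e. entry (z+1)*9 + (y+1)*3 + (x+1); 27*index + 13 is the
// block's own (center) field. Only level 0 is processed by this kernel.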
const uint8_t * RESTRICT _data_flag = ((uint8_t * RESTRICT const)(_data_flag_dp[27*index + 13]));
const double * RESTRICT _data_pdfs = ((double * RESTRICT const)(_data_pdfs_dp[27*index + 13]));
double * RESTRICT _data_pdfs_tmp = ((double * RESTRICT )(_data_pdfs_tmp_dp[27*index + 13]));
for (int64_t ctr_2 = 0; ctr_2 < _size_flag_2; ctr_2 += 1)
{
for (int64_t ctr_1 = 0; ctr_1 < _size_flag_1; ctr_1 += 1)
{
for (int64_t ctr_0 = 0; ctr_0 < _size_flag_0; ctr_0 += 1)
{
const uint_t index_C = uint_c(27) * index + uint_c( (0 + 1) * 9 + (0 + 1) * 3 + 0 + 1 );
const uint8_t flag = _data_flag[_stride_flag_0*ctr_0 + _stride_flag_1*ctr_1 + _stride_flag_2*ctr_2];
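// For interior cells every index_* below resolves to the center entry (13).
// On a block face, the corresponding flag bit switches the term to the
// neighboring block's table entry, whose pointer was pre-offset in the
// constructor to the adjacent border layer, so the pull access reads the
// neighbor's interior instead of a ghost layer.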
const uint64_t index_N = ((uint64_t)(index*27 + uint64_c( (1 + ((flag) >> (4) & 1) * (-1))*3 + 1*9 + 1 )));
const uint64_t index_S = ((uint64_t)(index*27 + (1 + ((flag) >> (3) & 1) * (1))*3 + 1*9 + 1));
const uint64_t index_W = ((uint64_t)(index*27 + 1*3 + 1*9 + 1 + ((flag) >> (1) & 1) * (1)));
const uint64_t index_E = ((uint64_t)(index*27 + 1*3 + 1*9 + 1 + ((flag) >> (2) & 1) * (-1)));
const uint64_t index_T = ((uint64_t)(index*27 + (1 + ((flag) >> (6) & 1) * (-1))*9 + 1*3 + 1));
const uint64_t index_B = ((uint64_t)(index*27 + (1 + ((flag) >> (5) & 1) * (1))*9 + 1*3 + 1));
const uint64_t index_NW = ((uint64_t)(index*27 + (1 + ((flag) >> (4) & 1) * (-1))*3 + 1*9 + 1 + ((flag) >> (1) & 1) * (1)));
const uint64_t index_NE = ((uint64_t)(index*27 + (1 + ((flag) >> (4) & 1) * (-1))*3 + 1*9 + 1 + ((flag) >> (2) & 1) * (-1)));
const uint64_t index_SW = ((uint64_t)(index*27 + (1 + ((flag) >> (3) & 1) * (1))*3 + 1*9 + 1 + ((flag) >> (1) & 1) * (1)));
const uint64_t index_SE = ((uint64_t)(index*27 + (1 + ((flag) >> (3) & 1) * (1))*3 + 1*9 + 1 + ((flag) >> (2) & 1) * (-1)));
const uint64_t index_TN = ((uint64_t)(index*27 + (1 + ((flag) >> (4) & 1) * (-1))*3 + (1 + ((flag) >> (6) & 1) * (-1))*9 + 1));
const uint64_t index_TS = ((uint64_t)(index*27 + (1 + ((flag) >> (3) & 1) * (1))*3 + (1 + ((flag) >> (6) & 1) * (-1))*9 + 1));
const uint64_t index_TW = ((uint64_t)(index*27 + (1 + ((flag) >> (6) & 1) * (-1))*9 + 1*3 + 1 + ((flag) >> (1) & 1) * (1)));
const uint64_t index_TE = ((uint64_t)(index*27 + (1 + ((flag) >> (6) & 1) * (-1))*9 + 1*3 + 1 + ((flag) >> (2) & 1) * (-1)));
const uint64_t index_BN = ((uint64_t)(index*27 + (1 + ((flag) >> (4) & 1) * (-1))*3 + (1 + ((flag) >> (5) & 1) * (1))*9 + 1));
const uint64_t index_BS = ((uint64_t)(index*27 + (1 + ((flag) >> (3) & 1) * (1))*3 + (1 + ((flag) >> (5) & 1) * (1))*9 + 1));
const uint64_t index_BW = ((uint64_t)(index*27 + (1 + ((flag) >> (5) & 1) * (1))*9 + 1*3 + 1 + ((flag) >> (1) & 1) * (1)));
const uint64_t index_BE = ((uint64_t)(index*27 + (1 + ((flag) >> (5) & 1) * (1))*9 + 1*3 + 1 + ((flag) >> (2) & 1) * (-1)));
const uint64_t index_TNE = ((uint64_t)(index*27 + (1 + ((flag) >> (4) & 1) * (-1))*3 + (1 + ((flag) >> (6) & 1) * (-1))*9 + 1 + ((flag) >> (2) & 1) * (-1)));
const uint64_t index_TNW = ((uint64_t)(index*27 + (1 + ((flag) >> (4) & 1) * (-1))*3 + (1 + ((flag) >> (6) & 1) * (-1))*9 + 1 + ((flag) >> (1) & 1) * (1)));
const uint64_t index_TSE = ((uint64_t)(index*27 + (1 + ((flag) >> (3) & 1) * (1))*3 + (1 + ((flag) >> (6) & 1) * (-1))*9 + 1 + ((flag) >> (2) & 1) * (-1)));
const uint64_t index_TSW = ((uint64_t)(index*27 + (1 + ((flag) >> (3) & 1) * (1))*3 + (1 + ((flag) >> (6) & 1) * (-1))*9 + 1 + ((flag) >> (1) & 1) * (1)));
const uint64_t index_BNE = ((uint64_t)(index*27 + (1 + ((flag) >> (4) & 1) * (-1))*3 + (1 + ((flag) >> (5) & 1) * (1))*9 + 1 + ((flag) >> (2) & 1) * (-1)));
const uint64_t index_BNW = ((uint64_t)(index*27 + (1 + ((flag) >> (4) & 1) * (-1))*3 + (1 + ((flag) >> (5) & 1) * (1))*9 + 1 + ((flag) >> (1) & 1) * (1)));
const uint64_t index_BSE = ((uint64_t)(index*27 + (1 + ((flag) >> (3) & 1) * (1))*3 + (1 + ((flag) >> (5) & 1) * (1))*9 + 1 + ((flag) >> (2) & 1) * (-1)));
const uint64_t index_BSW = ((uint64_t)(index*27 + (1 + ((flag) >> (3) & 1) * (1))*3 + (1 + ((flag) >> (5) & 1) * (1))*9 + 1 + ((flag) >> (1) & 1) * (1)));
const double * RESTRICT _data_N = _data_pdfs_dp[index_N];
const double * RESTRICT _data_S = ((double * RESTRICT const)(_data_pdfs_dp[index_S]));
const double * RESTRICT _data_W = ((double * RESTRICT const)(_data_pdfs_dp[index_W]));
const double * RESTRICT _data_E = ((double * RESTRICT const)(_data_pdfs_dp[index_E]));
const double * RESTRICT _data_T = ((double * RESTRICT const)(_data_pdfs_dp[index_T]));
const double * RESTRICT _data_B = ((double * RESTRICT const)(_data_pdfs_dp[index_B]));
const double * RESTRICT _data_NW = ((double * RESTRICT const)(_data_pdfs_dp[index_NW]));
const double * RESTRICT _data_NE = ((double * RESTRICT const)(_data_pdfs_dp[index_NE]));
const double * RESTRICT _data_SW = ((double * RESTRICT const)(_data_pdfs_dp[index_SW]));
const double * RESTRICT _data_SE = ((double * RESTRICT const)(_data_pdfs_dp[index_SE]));
const double * RESTRICT _data_TN = ((double * RESTRICT const)(_data_pdfs_dp[index_TN]));
const double * RESTRICT _data_TS = ((double * RESTRICT const)(_data_pdfs_dp[index_TS]));
const double * RESTRICT _data_TW = ((double * RESTRICT const)(_data_pdfs_dp[index_TW]));
const double * RESTRICT _data_TE = ((double * RESTRICT const)(_data_pdfs_dp[index_TE]));
const double * RESTRICT _data_BN = ((double * RESTRICT const)(_data_pdfs_dp[index_BN]));
const double * RESTRICT _data_BS = ((double * RESTRICT const)(_data_pdfs_dp[index_BS]));
const double * RESTRICT _data_BW = ((double * RESTRICT const)(_data_pdfs_dp[index_BW]));
const double * RESTRICT _data_BE = ((double * RESTRICT const)(_data_pdfs_dp[index_BE]));
const double * RESTRICT _data_TNE = ((double * RESTRICT const)(_data_pdfs_dp[index_TNE]));
const double * RESTRICT _data_TNW = ((double * RESTRICT const)(_data_pdfs_dp[index_TNW]));
const double * RESTRICT _data_TSE = ((double * RESTRICT const)(_data_pdfs_dp[index_TSE]));
const double * RESTRICT _data_TSW = ((double * RESTRICT const)(_data_pdfs_dp[index_TSW]));
const double * RESTRICT _data_BNE = ((double * RESTRICT const)(_data_pdfs_dp[index_BNE]));
const double * RESTRICT _data_BNW = ((double * RESTRICT const)(_data_pdfs_dp[index_BNW]));
const double * RESTRICT _data_BSE = ((double * RESTRICT const)(_data_pdfs_dp[index_BSE]));
const double * RESTRICT _data_BSW = ((double * RESTRICT const)(_data_pdfs_dp[index_BSW]));
const double f_0 = _data_pdfs[ctr_0*_stride_pdfs_0 + ctr_1*_stride_pdfs_1 + ctr_2*_stride_pdfs_2 + 0*_stride_pdfs_3];
const double f_1 = _data_N[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*(ctr_1 + ((flag) >> (4) & 1) * (1 - ctr_1) - 1) + _stride_pdfs_2*ctr_2 + _stride_pdfs_3];
const double f_2 = _data_S[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*(ctr_1 + ((flag) >> (3) & 1) * (-ctr_1 - 1) + 1) + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3];
const double f_3 = _data_W[_stride_pdfs_0*(ctr_0 + ((flag) >> (1) & 1) * (-ctr_0 - 1) + 1) + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3];
const double f_4 = _data_E[_stride_pdfs_0*(ctr_0 + ((flag) >> (2) & 1) * (1 - ctr_0) - 1) + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3];
const double f_5 = _data_T[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*(ctr_2 + ((flag) >> (6) & 1) * (1 - ctr_2) - 1) + 5*_stride_pdfs_3];
const double f_6 = _data_B[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*(ctr_2 + ((flag) >> (5) & 1) * (-ctr_2 - 1) + 1) + 6*_stride_pdfs_3];
const double f_7 = _data_NW[_stride_pdfs_0*(ctr_0 + ((flag) >> (1) & 1) * (-ctr_0 - 1) + 1) + _stride_pdfs_1*(ctr_1 + ((flag) >> (4) & 1) * (1 - ctr_1) - 1) + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3];
const double f_8 = _data_NE[_stride_pdfs_0*(ctr_0 + ((flag) >> (2) & 1) * (1 - ctr_0) - 1) + _stride_pdfs_1*(ctr_1 + ((flag) >> (4) & 1) * (1 - ctr_1) - 1) + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3];
const double f_9 = _data_SW[_stride_pdfs_0*(ctr_0 + ((flag) >> (1) & 1) * (-ctr_0 - 1) + 1) + _stride_pdfs_1*(ctr_1 + ((flag) >> (3) & 1) * (-ctr_1 - 1) + 1) + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3];
const double f_10 = _data_SE[_stride_pdfs_0*(ctr_0 + ((flag) >> (2) & 1) * (1 - ctr_0) - 1) + _stride_pdfs_1*(ctr_1 + ((flag) >> (3) & 1) * (-ctr_1 - 1) + 1) + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3];
const double f_11 = _data_TN[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*(ctr_1 + ((flag) >> (4) & 1) * (1 - ctr_1) - 1) + _stride_pdfs_2*(ctr_2 + ((flag) >> (6) & 1) * (1 - ctr_2) - 1) + 11*_stride_pdfs_3];
const double f_12 = _data_TS[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*(ctr_1 + ((flag) >> (3) & 1) * (-ctr_1 - 1) + 1) + _stride_pdfs_2*(ctr_2 + ((flag) >> (6) & 1) * (1 - ctr_2) - 1) + 12*_stride_pdfs_3];
const double f_13 = _data_TW[_stride_pdfs_0*(ctr_0 + ((flag) >> (1) & 1) * (-ctr_0 - 1) + 1) + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*(ctr_2 + ((flag) >> (6) & 1) * (1 - ctr_2) - 1) + 13*_stride_pdfs_3];
const double f_14 = _data_TE[_stride_pdfs_0*(ctr_0 + ((flag) >> (2) & 1) * (1 - ctr_0) - 1) + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*(ctr_2 + ((flag) >> (6) & 1) * (1 - ctr_2) - 1) + 14*_stride_pdfs_3];
const double f_15 = _data_BN[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*(ctr_1 + ((flag) >> (4) & 1) * (1 - ctr_1) - 1) + _stride_pdfs_2*(ctr_2 + ((flag) >> (5) & 1) * (-ctr_2 - 1) + 1) + 15*_stride_pdfs_3];
const double f_16 = _data_BS[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*(ctr_1 + ((flag) >> (3) & 1) * (-ctr_1 - 1) + 1) + _stride_pdfs_2*(ctr_2 + ((flag) >> (5) & 1) * (-ctr_2 - 1) + 1) + 16*_stride_pdfs_3];
const double f_17 = _data_BW[_stride_pdfs_0*(ctr_0 + ((flag) >> (1) & 1) * (-ctr_0 - 1) + 1) + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*(ctr_2 + ((flag) >> (5) & 1) * (-ctr_2 - 1) + 1) + 17*_stride_pdfs_3];
const double f_18 = _data_BE[_stride_pdfs_0*(ctr_0 + ((flag) >> (2) & 1) * (1 - ctr_0) - 1) + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*(ctr_2 + ((flag) >> (5) & 1) * (-ctr_2 - 1) + 1) + 18*_stride_pdfs_3];
const double f_19 = _data_TNE[_stride_pdfs_0*(ctr_0 + ((flag) >> (2) & 1) * (1 - ctr_0) - 1) + _stride_pdfs_1*(ctr_1 + ((flag) >> (4) & 1) * (1 - ctr_1) - 1) + _stride_pdfs_2*(ctr_2 + ((flag) >> (6) & 1) * (1 - ctr_2) - 1) + 19*_stride_pdfs_3];
const double f_20 = _data_TNW[_stride_pdfs_0*(ctr_0 + ((flag) >> (1) & 1) * (-ctr_0 - 1) + 1) + _stride_pdfs_1*(ctr_1 + ((flag) >> (4) & 1) * (1 - ctr_1) - 1) + _stride_pdfs_2*(ctr_2 + ((flag) >> (6) & 1) * (1 - ctr_2) - 1) + 20*_stride_pdfs_3];
const double f_21 = _data_TSE[_stride_pdfs_0*(ctr_0 + ((flag) >> (2) & 1) * (1 - ctr_0) - 1) + _stride_pdfs_1*(ctr_1 + ((flag) >> (3) & 1) * (-ctr_1 - 1) + 1) + _stride_pdfs_2*(ctr_2 + ((flag) >> (6) & 1) * (1 - ctr_2) - 1) + 21*_stride_pdfs_3];
const double f_22 = _data_TSW[_stride_pdfs_0*(ctr_0 + ((flag) >> (1) & 1) * (-ctr_0 - 1) + 1) + _stride_pdfs_1*(ctr_1 + ((flag) >> (3) & 1) * (-ctr_1 - 1) + 1) + _stride_pdfs_2*(ctr_2 + ((flag) >> (6) & 1) * (1 - ctr_2) - 1) + 22*_stride_pdfs_3];
const double f_23 = _data_BNE[_stride_pdfs_0*(ctr_0 + ((flag) >> (2) & 1) * (1 - ctr_0) - 1) + _stride_pdfs_1*(ctr_1 + ((flag) >> (4) & 1) * (1 - ctr_1) - 1) + _stride_pdfs_2*(ctr_2 + ((flag) >> (5) & 1) * (-ctr_2 - 1) + 1) + 23*_stride_pdfs_3];
const double f_24 = _data_BNW[_stride_pdfs_0*(ctr_0 + ((flag) >> (1) & 1) * (-ctr_0 - 1) + 1) + _stride_pdfs_1*(ctr_1 + ((flag) >> (4) & 1) * (1 - ctr_1) - 1) + _stride_pdfs_2*(ctr_2 + ((flag) >> (5) & 1) * (-ctr_2 - 1) + 1) + 24*_stride_pdfs_3];
const double f_25 = _data_BSE[_stride_pdfs_0*(ctr_0 + ((flag) >> (2) & 1) * (1 - ctr_0) - 1) + _stride_pdfs_1*(ctr_1 + ((flag) >> (3) & 1) * (-ctr_1 - 1) + 1) + _stride_pdfs_2*(ctr_2 + ((flag) >> (5) & 1) * (-ctr_2 - 1) + 1) + 25*_stride_pdfs_3];
const double f_26 = _data_BSW[_stride_pdfs_0*(ctr_0 + ((flag) >> (1) & 1) * (-ctr_0 - 1) + 1) + _stride_pdfs_1*(ctr_1 + ((flag) >> (3) & 1) * (-ctr_1 - 1) + 1) + _stride_pdfs_2*(ctr_2 + ((flag) >> (5) & 1) * (-ctr_2 - 1) + 1) + 26*_stride_pdfs_3];
const double vel0Term = f_10 + f_14 + f_18 + f_19 + f_21 + f_23 + f_25 + f_4 + f_8;
const double vel1Term = f_1 + f_11 + f_15 + f_20 + f_24 + f_7;
const double vel2Term = f_12 + f_13 + f_22 + f_5;
const double delta_rho = f_0 + f_16 + f_17 + f_2 + f_26 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;
const double rho = delta_rho + 1.0;
const double xi_1 = ((1.0) / (rho));
const double u_0 = xi_1*(-f_13 - f_17 - f_20 - f_22 - f_24 - f_26 - f_3 - f_7 - f_9 + vel0Term);
const double u_1 = xi_1*(-f_10 - f_12 - f_16 + f_19 - f_2 - f_21 - f_22 + f_23 - f_25 - f_26 + f_8 - f_9 + vel1Term);
const double u_2 = xi_1*(f_11 + f_14 - f_15 - f_16 - f_17 - f_18 + f_19 + f_20 + f_21 - f_23 - f_24 - f_25 - f_26 - f_6 + vel2Term);
const double u0Mu1 = u_0 - u_1;
const double u0Pu1 = u_0 + u_1;
const double u1Pu2 = u_1 + u_2;
const double u1Mu2 = u_1 - u_2;
const double u0Mu2 = u_0 - u_2;
const double u0Pu2 = u_0 + u_2;
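// Common part of all equilibrium terms below:
//   f_eq_common = delta_rho - 1.5 * rho * (u_0^2 + u_1^2 + u_2^2)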
const double f_eq_common = delta_rho + rho*-1.5*(u_0*u_0) + rho*-1.5*(u_1*u_1) + rho*-1.5*(u_2*u_2);
const double r_0 = f_0 + omega*(-f_0 + f_eq_common*0.29629629629629628);
const double r_1 = f_1 + omega*(f_1*-0.5 + f_2*-0.5 + f_eq_common*0.07407407407407407 + rho*0.33333333333333331*(u_1*u_1)) + rr_0*(f_1*-0.5 + f_2*0.5 + rho*u_1*0.22222222222222221);
const double r_2 = f_2 + omega*(f_1*-0.5 + f_2*-0.5 + f_eq_common*0.07407407407407407 + rho*0.33333333333333331*(u_1*u_1)) + rr_0*(f_1*0.5 + f_2*-0.5 + rho*u_1*-0.22222222222222221);
const double r_3 = f_3 + omega*(f_3*-0.5 + f_4*-0.5 + f_eq_common*0.07407407407407407 + rho*0.33333333333333331*(u_0*u_0)) + rr_0*(f_3*-0.5 + f_4*0.5 + rho*u_0*-0.22222222222222221);
const double r_4 = f_4 + omega*(f_3*-0.5 + f_4*-0.5 + f_eq_common*0.07407407407407407 + rho*0.33333333333333331*(u_0*u_0)) + rr_0*(f_3*0.5 + f_4*-0.5 + rho*u_0*0.22222222222222221);
const double r_5 = f_5 + omega*(f_5*-0.5 + f_6*-0.5 + f_eq_common*0.07407407407407407 + rho*0.33333333333333331*(u_2*u_2)) + rr_0*(f_5*-0.5 + f_6*0.5 + rho*u_2*0.22222222222222221);
const double r_6 = f_6 + omega*(f_5*-0.5 + f_6*-0.5 + f_eq_common*0.07407407407407407 + rho*0.33333333333333331*(u_2*u_2)) + rr_0*(f_5*0.5 + f_6*-0.5 + rho*u_2*-0.22222222222222221);
const double r_7 = f_7 + omega*(f_10*-0.5 + f_7*-0.5 + f_eq_common*0.018518518518518517 + rho*0.083333333333333329*(u0Mu1*u0Mu1)) + rr_0*(f_10*0.5 + f_7*-0.5 + rho*u0Mu1*-0.055555555555555552);
const double r_8 = f_8 + omega*(f_8*-0.5 + f_9*-0.5 + f_eq_common*0.018518518518518517 + rho*0.083333333333333329*(u0Pu1*u0Pu1)) + rr_0*(f_8*-0.5 + f_9*0.5 + rho*u0Pu1*0.055555555555555552);
const double r_9 = f_9 + omega*(f_8*-0.5 + f_9*-0.5 + f_eq_common*0.018518518518518517 + rho*0.083333333333333329*(u0Pu1*u0Pu1)) + rr_0*(f_8*0.5 + f_9*-0.5 + rho*u0Pu1*-0.055555555555555552);
const double r_10 = f_10 + omega*(f_10*-0.5 + f_7*-0.5 + f_eq_common*0.018518518518518517 + rho*0.083333333333333329*(u0Mu1*u0Mu1)) + rr_0*(f_10*-0.5 + f_7*0.5 + rho*u0Mu1*0.055555555555555552);
const double r_11 = f_11 + omega*(f_11*-0.5 + f_16*-0.5 + f_eq_common*0.018518518518518517 + rho*0.083333333333333329*(u1Pu2*u1Pu2)) + rr_0*(f_11*-0.5 + f_16*0.5 + rho*u1Pu2*0.055555555555555552);
const double r_12 = f_12 + omega*(f_12*-0.5 + f_15*-0.5 + f_eq_common*0.018518518518518517 + rho*0.083333333333333329*(u1Mu2*u1Mu2)) + rr_0*(f_12*-0.5 + f_15*0.5 + rho*u1Mu2*-0.055555555555555552);
const double r_13 = f_13 + omega*(f_13*-0.5 + f_18*-0.5 + f_eq_common*0.018518518518518517 + rho*0.083333333333333329*(u0Mu2*u0Mu2)) + rr_0*(f_13*-0.5 + f_18*0.5 + rho*u0Mu2*-0.055555555555555552);
const double r_14 = f_14 + omega*(f_14*-0.5 + f_17*-0.5 + f_eq_common*0.018518518518518517 + rho*0.083333333333333329*(u0Pu2*u0Pu2)) + rr_0*(f_14*-0.5 + f_17*0.5 + rho*u0Pu2*0.055555555555555552);
const double r_15 = f_15 + omega*(f_12*-0.5 + f_15*-0.5 + f_eq_common*0.018518518518518517 + rho*0.083333333333333329*(u1Mu2*u1Mu2)) + rr_0*(f_12*0.5 + f_15*-0.5 + rho*u1Mu2*0.055555555555555552);
const double r_16 = f_16 + omega*(f_11*-0.5 + f_16*-0.5 + f_eq_common*0.018518518518518517 + rho*0.083333333333333329*(u1Pu2*u1Pu2)) + rr_0*(f_11*0.5 + f_16*-0.5 + rho*u1Pu2*-0.055555555555555552);
const double r_17 = f_17 + omega*(f_14*-0.5 + f_17*-0.5 + f_eq_common*0.018518518518518517 + rho*0.083333333333333329*(u0Pu2*u0Pu2)) + rr_0*(f_14*0.5 + f_17*-0.5 + rho*u0Pu2*-0.055555555555555552);
const double r_18 = f_18 + omega*(f_13*-0.5 + f_18*-0.5 + f_eq_common*0.018518518518518517 + rho*0.083333333333333329*(u0Mu2*u0Mu2)) + rr_0*(f_13*0.5 + f_18*-0.5 + rho*u0Mu2*0.055555555555555552);
const double r_19 = f_19 + omega*(delta_rho*-0.013888888888888888 + f_19*-0.5 + f_26*-0.5 + f_eq_common*0.018518518518518517 + rho*(0.020833333333333332*(u0Pu1*u0Pu1) + 0.020833333333333332*(u0Pu2*u0Pu2) + 0.020833333333333332*(u1Pu2*u1Pu2))) + rr_0*(f_19*-0.5 + f_26*0.5 + rho*(u0Pu1*0.013888888888888888 + u_2*0.013888888888888888));
const double r_20 = f_20 + omega*(delta_rho*-0.013888888888888888 + f_20*-0.5 + f_25*-0.5 + f_eq_common*0.018518518518518517 + rho*(0.020833333333333332*(u0Mu1*u0Mu1) + 0.020833333333333332*(u0Mu2*u0Mu2) + 0.020833333333333332*(u1Pu2*u1Pu2))) + rr_0*(f_20*-0.5 + f_25*0.5 + rho*(u0Mu1*-0.013888888888888888 + u_2*0.013888888888888888));
const double r_21 = f_21 + omega*(delta_rho*-0.013888888888888888 + f_21*-0.5 + f_24*-0.5 + f_eq_common*0.018518518518518517 + rho*(0.020833333333333332*(u0Mu1*u0Mu1) + 0.020833333333333332*(u0Pu2*u0Pu2) + 0.020833333333333332*(u1Mu2*u1Mu2))) + rr_0*(f_21*-0.5 + f_24*0.5 + rho*(u0Mu1*0.013888888888888888 + u_2*0.013888888888888888));
const double r_22 = f_22 + omega*(delta_rho*-0.013888888888888888 + f_22*-0.5 + f_23*-0.5 + f_eq_common*0.018518518518518517 + rho*(0.020833333333333332*(u0Mu2*u0Mu2) + 0.020833333333333332*(u0Pu1*u0Pu1) + 0.020833333333333332*(u1Mu2*u1Mu2))) + rr_0*(f_22*-0.5 + f_23*0.5 + rho*(u0Pu1*-0.013888888888888888 + u_2*0.013888888888888888));
const double r_23 = f_23 + omega*(delta_rho*-0.013888888888888888 + f_22*-0.5 + f_23*-0.5 + f_eq_common*0.018518518518518517 + rho*(0.020833333333333332*(u0Mu2*u0Mu2) + 0.020833333333333332*(u0Pu1*u0Pu1) + 0.020833333333333332*(u1Mu2*u1Mu2))) + rr_0*(f_22*0.5 + f_23*-0.5 + rho*(u0Pu1*0.013888888888888888 + u_2*-0.013888888888888888));
const double r_24 = f_24 + omega*(delta_rho*-0.013888888888888888 + f_21*-0.5 + f_24*-0.5 + f_eq_common*0.018518518518518517 + rho*(0.020833333333333332*(u0Mu1*u0Mu1) + 0.020833333333333332*(u0Pu2*u0Pu2) + 0.020833333333333332*(u1Mu2*u1Mu2))) + rr_0*(f_21*0.5 + f_24*-0.5 + rho*(u0Mu1*-0.013888888888888888 + u_2*-0.013888888888888888));
const double r_25 = f_25 + omega*(delta_rho*-0.013888888888888888 + f_20*-0.5 + f_25*-0.5 + f_eq_common*0.018518518518518517 + rho*(0.020833333333333332*(u0Mu1*u0Mu1) + 0.020833333333333332*(u0Mu2*u0Mu2) + 0.020833333333333332*(u1Pu2*u1Pu2))) + rr_0*(f_20*0.5 + f_25*-0.5 + rho*(u0Mu1*0.013888888888888888 + u_2*-0.013888888888888888));
const double r_26 = f_26 + omega*(delta_rho*-0.013888888888888888 + f_19*-0.5 + f_26*-0.5 + f_eq_common*0.018518518518518517 + rho*(0.020833333333333332*(u0Pu1*u0Pu1) + 0.020833333333333332*(u0Pu2*u0Pu2) + 0.020833333333333332*(u1Pu2*u1Pu2))) + rr_0*(f_19*0.5 + f_26*-0.5 + rho*(u0Pu1*-0.013888888888888888 + u_2*-0.013888888888888888));
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 0*_stride_pdfs_tmp_3] = r_0;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 1*_stride_pdfs_tmp_3] = r_1;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 2*_stride_pdfs_tmp_3] = r_2;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 3*_stride_pdfs_tmp_3] = r_3;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 4*_stride_pdfs_tmp_3] = r_4;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 5*_stride_pdfs_tmp_3] = r_5;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 6*_stride_pdfs_tmp_3] = r_6;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 7*_stride_pdfs_tmp_3] = r_7;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 8*_stride_pdfs_tmp_3] = r_8;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 9*_stride_pdfs_tmp_3] = r_9;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 10*_stride_pdfs_tmp_3] = r_10;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 11*_stride_pdfs_tmp_3] = r_11;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 12*_stride_pdfs_tmp_3] = r_12;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 13*_stride_pdfs_tmp_3] = r_13;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 14*_stride_pdfs_tmp_3] = r_14;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 15*_stride_pdfs_tmp_3] = r_15;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 16*_stride_pdfs_tmp_3] = r_16;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 17*_stride_pdfs_tmp_3] = r_17;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 18*_stride_pdfs_tmp_3] = r_18;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 19*_stride_pdfs_tmp_3] = r_19;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 20*_stride_pdfs_tmp_3] = r_20;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 21*_stride_pdfs_tmp_3] = r_21;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 22*_stride_pdfs_tmp_3] = r_22;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 23*_stride_pdfs_tmp_3] = r_23;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 24*_stride_pdfs_tmp_3] = r_24;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 25*_stride_pdfs_tmp_3] = r_25;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 26*_stride_pdfs_tmp_3] = r_26;
}
}
}
}
}
void swap()
{
for( auto block: blockVector )
{
PdfField_T * srcField = block->getData< PdfField_T >( ids_.pdfField );
auto dstField = cache_pdfs_[block];
srcField->swapDataPointers(dstField);
}
std::swap(pdfsPointers[0], pdfs_tmpPointers[0]);
}
private:
IDs ids_;
std::vector<Block*> blockVector;
std::vector<real_t> omegaVector;
std::vector<std::vector<uint8_t *>> flagPointers;
std::vector<std::vector<double *>> pdfsPointers;
std::vector<std::vector<double *>> pdfs_tmpPointers;
std::unordered_map<IBlock*, field::GhostLayerField<double, 27> *> cache_pdfs_;
int64_t _size_flag_0;
int64_t _size_flag_1;
int64_t _size_flag_2;
uint_t _nBlocks;
int64_t _stride_pdfs_0;
int64_t _stride_pdfs_1;
int64_t _stride_pdfs_2;
int64_t _stride_pdfs_3;
int64_t _stride_flag_0;
int64_t _stride_flag_1;
int64_t _stride_flag_2;
int64_t _stride_flag_3;
int64_t _stride_pdfs_tmp_0;
int64_t _stride_pdfs_tmp_1;
int64_t _stride_pdfs_tmp_2;
int64_t _stride_pdfs_tmp_3;
};
class Timestep
{
public:
Timestep(std::shared_ptr< StructuredBlockForest >& blocks, const IDs& ids,
StreamCollide & streamCollide, std::shared_ptr< PackInfo_T >& packInfo, UniformBufferedScheme<CommunicationStencil_T>& communication)
: blocks_(blocks), ids_(ids), streamCollide_(streamCollide), packInfo_(packInfo), communication_(communication)
{
}
void commLocal(const uint_t level)
{
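// Process-local ghost-layer exchange: for every equally sized neighbor on the
// given level, copy the slice before the ghost layer straight into that
// neighbor's ghost region, restricted per direction to the PDFs that stream
// across the shared face. This assumes the neighbor block exists locally.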
for( auto it = blocks_->begin(); it != blocks_->end(); ++it )
{
if(blocks_->getLevel(*it.get()) != level)
continue;
auto* sender = dynamic_cast< Block* >(it.get());
for( auto dir = Stencil_T::beginNoCenter(); dir != Stencil_T::end(); ++dir )
{
const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex(*dir);
if (!(sender->neighborhoodSectionHasEquallySizedBlock(neighborIdx)))
continue;
const BlockID & receiverId = sender->getNeighborId( neighborIdx, uint_t(0) );
auto receiver = dynamic_cast< Block * >( blocks_->getBlock(receiverId) );
const PdfField_T * srcField = sender->getData< PdfField_T >( ids_.pdfField );
PdfField_T * dstField = receiver->getData< PdfField_T >( ids_.pdfField );
CellInterval srcRegion;
CellInterval dstRegion;
cell_idx_t gls = 1;
srcField->getSliceBeforeGhostLayer(*dir, srcRegion, gls, false);
dstField->getGhostRegion(stencil::inverseDir[*dir], dstRegion, gls, false);
WALBERLA_ASSERT_EQUAL(srcRegion.xSize(), dstRegion.xSize())
WALBERLA_ASSERT_EQUAL(srcRegion.ySize(), dstRegion.ySize())
WALBERLA_ASSERT_EQUAL(srcRegion.zSize(), dstRegion.zSize())
//WALBERLA_LOG_DEVEL_VAR(srcRegion)
//WALBERLA_LOG_DEVEL_VAR(dstRegion)
auto srcIter = srcRegion.begin();
auto dstIter = dstRegion.begin();
while (srcIter != srcRegion.end())
{
for( uint_t f = 0; f < Stencil_T::d_per_d_length[*dir]; ++f )
{
dstField->get(*dstIter, Stencil_T::idx[ Stencil_T::d_per_d[*dir][f] ]) = srcField->get(*srcIter, Stencil_T::idx[ Stencil_T::d_per_d[*dir][f] ]);
}
++srcIter;
++dstIter;
}
WALBERLA_ASSERT( srcIter == srcRegion.end() )
WALBERLA_ASSERT( dstIter == dstRegion.end() )
}
}
}
void swap()
{
streamCollide_.swap();
}
void timestep()
{
// for( auto it = blocks_->begin(); it != blocks_->end(); ++it )
// {
// auto block = dynamic_cast< Block* >(it.get());
// commLocal(0);
// streamCollide_.streamCollide(block);
// // streamCollide_(block);
// }
// commLocal(0);
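// Note: both branches below currently invoke the even kernel followed by a
// pointer swap; a dedicated odd-timestep kernel is not implemented in this
// revision.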
if ((timestepCounter_ & 1) == 0)
{
streamCollide_.streamCollideEven();
swap();
}
else
{
streamCollide_.streamCollideEven();
swap();
}
timestepCounter_ = (timestepCounter_ + 1) & 1;
}
void operator()(){ timestep(); };
private:
std::shared_ptr< StructuredBlockForest > blocks_;
IDs ids_;
StreamCollide & streamCollide_;
std::shared_ptr< PackInfo_T > packInfo_;
UniformBufferedScheme<CommunicationStencil_T> communication_;
uint8_t timestepCounter_{0};
};
int main(int argc, char** argv)
{
mpi::Environment env(argc, argv);
shared_ptr< Config > config = make_shared< Config >();
config->readParameterFile(argv[1]);
logging::configureLogging(config);
// read parameters
auto parameters = config->getOneBlock("Parameters");
auto loggingParameters = config->getOneBlock("Logging");
auto domainParameters = config->getOneBlock("DomainSetup");
// auto EvaluationParameters = config->getOneBlock("Evaluation");
const real_t omega = parameters.getParameter< real_t >("omega");
const real_t maxLatticeVelocity = parameters.getParameter< real_t >("maxLatticeVelocity");
const uint_t timesteps = parameters.getParameter< uint_t >("timesteps") + uint_c(1);
Setup setup;
setup.blocks = domainParameters.getParameter< Vector3< uint_t > >("blocks");
setup.cellsPerBlock = domainParameters.getParameter< Vector3< uint_t > >("cellsPerBlock");
setup.cells = Vector3< uint_t >(setup.blocks[0] * setup.cellsPerBlock[0], setup.blocks[1] * setup.cellsPerBlock[1],
setup.blocks[2] * setup.cellsPerBlock[2]);
setup.periodic = domainParameters.getParameter< Vector3< bool > >("periodic");
setup.numGhostLayers = uint_c(1);
const uint_t valuesPerCell = (uint_c(2) * Stencil_T::Q + VectorField_T::F_SIZE + uint_c(2) * ScalarField_T::F_SIZE);
const uint_t sizePerValue = sizeof(PdfField_T::value_type);
const memory_t memoryPerCell = memory_t(valuesPerCell * sizePerValue + uint_c(1));
bool writeSetupForestAndReturn = loggingParameters.getParameter< bool >("writeSetupForestAndReturn", false);
if (uint_c(MPIManager::instance()->numProcesses()) > 1) writeSetupForestAndReturn = false;
if (writeSetupForestAndReturn)
{
std::string sbffile = "TaylorGreenVortex.bfs";
std::ostringstream infoString;
infoString << "You have selected the option of just creating the block structure (= domain decomposition) and "
"saving the result to file\n"
"by specifying the output file name \'"
<< sbffile << "\' AND also specifying \'saveToFile\'.\n";
if (MPIManager::instance()->numProcesses() > 1)
WALBERLA_ABORT(infoString.str() << "In this mode you need to start " << argv[0] << " with just one process!")
WALBERLA_LOG_INFO_ON_ROOT(infoString.str() << "Creating the block structure ...")
const uint_t numberProcesses = domainParameters.getParameter< uint_t >("numberProcesses");
shared_ptr< SetupBlockForest > sforest = createSetupBlockForest(setup, numberProcesses, memoryPerCell);
sforest->writeVTKOutput("domain_decomposition");
sforest->saveToFile(sbffile.c_str());
logging::Logging::printFooterOnStream();
return EXIT_SUCCESS;
}
auto blocks = createStructuredBlockForest(setup, memoryPerCell);
IDs ids;
const StorageSpecification_T StorageSpec = StorageSpecification_T();
ids.pdfField = lbm_generated::addPdfFieldToStorage(blocks, "pdfs", StorageSpec, setup.numGhostLayers, field::fzyx);
// ids.pdfFieldTmp = lbm_generated::addPdfFieldToStorage(blocks, "pdfs tmp", StorageSpec, setup.numGhostLayers, field::fzyx);
ids.velocityField = field::addToStorage< VectorField_T >(blocks, "vel", real_c(0.0), field::fzyx, setup.numGhostLayers);
ids.densityField = field::addToStorage< ScalarField_T >(blocks, "density", real_c(1.0), field::fzyx, setup.numGhostLayers);
ids.flagField = field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field", uint_c(3));
const cell_idx_t domainHalf = cell_idx_c(blocks->getDomain().yMax() / real_c(2.0));
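// Initialise a plane shear profile: +maxLatticeVelocity in the upper half of
// the domain (y above yMax/2), -maxLatticeVelocity in the lower half.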
for (auto& block : *blocks)
{
auto velocityField = block.getData<VectorField_T>(ids.velocityField);
for (cell_idx_t ctr_2 = 0; ctr_2 < cell_idx_c(velocityField->zSize()); ++ctr_2)
{
for(cell_idx_t ctr_1 = 0; ctr_1 < cell_idx_c(velocityField->ySize()); ++ctr_1)
{
for (cell_idx_t ctr_0 = 0; ctr_0 < cell_idx_c(velocityField->xSize()); ++ctr_0)
{
Cell domainCell(ctr_0, ctr_1, ctr_2);
blocks->transformBlockLocalToGlobalCell(domainCell, block);
if((domainCell.y() + real_c(0.5)) > domainHalf)
{
velocityField->get(ctr_0, ctr_1, ctr_2, 0) = maxLatticeVelocity;
}
else
{
velocityField->get(ctr_0, ctr_1, ctr_2, 0) = -maxLatticeVelocity;
}
}
}
}
}
SweepCollection_T sweepCollection(blocks, ids.pdfField, ids.densityField, ids.velocityField, omega);
for (auto& block : *blocks)
{
sweepCollection.initialise(&block, cell_idx_c(setup.numGhostLayers - uint_c(1)));
}
for (auto& block : *blocks)
{
if(blocks->getLevel(block) == 0)
{
sweepCollection.streamCollide(&block);
}
}
sweepCollection.initialiseBlockPointer();
//
// sweepCollection.swap(0);
const FlagUID fluidFlagUID("Fluid");
auto boundariesConfig = config->getBlock("Boundaries");
geometry::initBoundaryHandling< FlagField_T >(*blocks, ids.flagField, boundariesConfig);
geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, ids.flagField, fluidFlagUID, cell_idx_c(0));
// BoundaryCollection_T boundaryCollection(blocks, ids.flagField, ids.pdfField, fluidFlagUID, maxLatticeVelocity);
WALBERLA_LOG_INFO_ON_ROOT("Setting up communication...")
std::shared_ptr< PackInfo_T > packInfo = std::make_shared<lbm_generated::UniformGeneratedPdfPackInfo< PdfField_T >>(ids.pdfField);
UniformBufferedScheme< Stencil_T > communication(blocks);
communication.addPackInfo(packInfo);
SweepTimeloop timeloop(blocks->getBlockStorage(), timesteps);
auto VTKWriter = config->getOneBlock("VTKWriter");
const uint_t vtkWriteFrequency = VTKWriter.getParameter< uint_t >("vtkWriteFrequency");
const bool writeVelocity = VTKWriter.getParameter< bool >("velocity");
const bool writeDensity = VTKWriter.getParameter< bool >("density");
// const bool writeAverageFields = VTKWriter.getParameter< bool >("averageFields", false);
const bool writeFlag = VTKWriter.getParameter< bool >("flag");
// const bool writeOnlySlice = VTKWriter.getParameter< bool >("writeOnlySlice", true);
const bool amrFileFormat = VTKWriter.getParameter< bool >("amrFileFormat", false);
const bool oneFilePerProcess = VTKWriter.getParameter< bool >("oneFilePerProcess", false);
if (vtkWriteFrequency > 0)
{
auto vtkOutput =
vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_TaylorGreenVortex", "simulation_step",
false, true, true, false, 0, amrFileFormat, oneFilePerProcess);
vtkOutput->addBeforeFunction([&]() {
for (auto& block : *blocks)
{
sweepCollection.calculateMacroscopicParameters(&block);
}
});
if (writeVelocity)
{
auto velWriter = make_shared< field::VTKWriter< VectorField_T, float32 > >(ids.velocityField, "velocity");
vtkOutput->addCellDataWriter(velWriter);
}
if (writeDensity)
{
auto densityWriter = make_shared< field::VTKWriter< ScalarField_T, float32 > >(ids.densityField, "density");
vtkOutput->addCellDataWriter(densityWriter);
}
if (writeFlag)
{
auto flagWriter = make_shared< field::VTKWriter< FlagField_T > >(ids.flagField, "flag");
vtkOutput->addCellDataWriter(flagWriter);
}
timeloop.addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
}
StreamCollide streamCollide(blocks, ids, omega);
Timestep timeloopFunction(blocks, ids, streamCollide, packInfo, communication);
const bool generatedVersion = false;
if(generatedVersion){
timeloop.addFuncBeforeTimeStep(sweepCollection.blockStreamCollideFck(0), "Refinement Cycle");
}
else{
timeloop.addFuncBeforeTimeStep(timeloopFunction, "Refinement Cycle");
}
const real_t remainingTimeLoggerFrequency = loggingParameters.getParameter< real_t >("remainingTimeLoggerFrequency", 3.0); // in seconds
if (uint_c(remainingTimeLoggerFrequency) > 0)
{
timeloop.addFuncAfterTimeStep(timing::RemainingTimeLogger(timeloop.getNrOfTimeSteps(), remainingTimeLoggerFrequency), "remaining time logger");
}
// const uint_t evaluationCheckFrequency = EvaluationParameters.getParameter< uint_t >("evaluationCheckFrequency");
// const bool evaluationLogToStream = EvaluationParameters.getParameter< bool >("logToStream");
// const bool evaluationLogToFile = EvaluationParameters.getParameter< bool >("logToFile");
// const std::string evaluationFilename = EvaluationParameters.getParameter< std::string >("filename");
// std::shared_ptr< AccuracyEvaluation > evaluation = std::make_shared< AccuracyEvaluation >(blocks, ids, sweepCollection, maxLatticeVelocity, evaluationCheckFrequency, evaluationLogToStream, evaluationLogToFile);
// timeloop.addFuncBeforeTimeStep(SharedFunctor< AccuracyEvaluation >(evaluation), "evaluation");
//////////////////////
/// RUN SIMULATION ///
//////////////////////
const lbm_generated::PerformanceEvaluation< FlagField_T > performance(blocks, ids.flagField, fluidFlagUID);
field::CellCounter< FlagField_T > fluidCells(blocks, ids.flagField, fluidFlagUID);
fluidCells();
WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << blocks->getNumberOfBlocks())
WALBERLA_LOG_INFO_ON_ROOT("Starting Simulation")
WcTimingPool timeloopTiming;
WcTimer simTimer;
WALBERLA_MPI_BARRIER()
#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
WALBERLA_GPU_CHECK(gpuPeekAtLastError())
#endif
simTimer.start();
timeloop.run(timeloopTiming);
#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
WALBERLA_GPU_CHECK(gpuPeekAtLastError())
#endif
WALBERLA_MPI_BARRIER()
simTimer.end();
WALBERLA_LOG_INFO_ON_ROOT("Simulation finished")
real_t time = simTimer.max();
WALBERLA_MPI_SECTION() { walberla::mpi::reduceInplace(time, walberla::mpi::MAX); }
performance.logResultOnRoot(timesteps, time);
const auto reducedTimeloopTiming = timeloopTiming.getReduced();
WALBERLA_LOG_RESULT_ON_ROOT("Time loop timing:\n" << *reducedTimeloopTiming)
printResidentMemoryStatistics();
return EXIT_SUCCESS;
}
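// Parameter file for the TaylorGreenVortex app; main() reads it as the first
// command-line argument, e.g. ./TaylorGreenVortex TaylorGreenVortex.prm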
Parameters
{
omega 1.4;
timesteps 100;
maxLatticeVelocity 0.01;
}
DomainSetup
{
blocks < 4, 4, 4 >;
cellsPerBlock < 8, 8, 8 >;
periodic < 1, 1, 1 >;
numberProcesses 1; // This is for load balancing, overwritten if more than one proc is used
}
Boundaries
{
Border { direction S; walldistance -1; flag NoSlip; }
Border { direction N; walldistance -1; flag UBB; }
}
VTKWriter
{
vtkWriteFrequency 50;
velocity true;
density true;
averageFields false;
flag false;
writeOnlySlice false;
amrFileFormat false;
oneFilePerProcess false;
}
Logging
{
logLevel info; // info progress detail tracing
writeSetupForestAndReturn false;
remainingTimeLoggerFrequency 3; // in seconds
}
Evaluation
{
evaluationCheckFrequency 100;
logToStream true;
logToFile true;
filename Channel.txt;
}
import sympy as sp
from pystencils import Target
from pystencils import fields
from lbmpy.advanced_streaming.utility import get_timesteps
from lbmpy.boundaries import NoSlip, UBB
from lbmpy.creationfunctions import create_lb_method, create_lb_collision_rule
from lbmpy import LBMConfig, LBMOptimisation, Stencil, Method, LBStencil
from pystencils_walberla import CodeGeneration, generate_info_header
from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator
import warnings
warnings.filterwarnings("ignore")
with CodeGeneration() as ctx:
target = Target.CPU # Target.GPU if ctx.cuda else Target.CPU
data_type = "float64" if ctx.double_accuracy else "float32"
pdf_dtype = "float64"
streaming_pattern = 'pull'
timesteps = get_timesteps(streaming_pattern)
omega = sp.symbols("omega")
stencil = LBStencil(Stencil.D3Q27)
dim = stencil.D
pdfs, pdfs_tmp = fields(f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {pdf_dtype}[3D]", layout='fzyx')
velocity_field, density_field = fields(f"velocity({dim}), density(1) : {data_type}[{dim}D]", layout='fzyx')
macroscopic_fields = {'density': density_field, 'velocity': velocity_field}
lbm_config = LBMConfig(stencil=stencil, method=Method.SRT, relaxation_rate=omega,
streaming_pattern=streaming_pattern, compressible=True)
lbm_opt = LBMOptimisation(cse_global=False, field_layout='fzyx')
method = create_lb_method(lbm_config=lbm_config)
collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt)
no_slip = lbm_boundary_generator(class_name='NoSlip', flag_uid='NoSlip',
boundary_object=NoSlip())
ubb = lbm_boundary_generator(class_name='UBB', flag_uid='UBB',
boundary_object=UBB([sp.Symbol("u_x"), 0, 0], data_type=data_type))
generate_lbm_package(ctx, name="TaylorGreenVortex",
collision_rule=collision_rule,
lbm_config=lbm_config, lbm_optimisation=lbm_opt,
nonuniform=False, boundaries=[no_slip, ubb],
macroscopic_fields=macroscopic_fields, data_type=data_type)
generate_info_header(ctx, 'TaylorGreenVortexHeader')
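
On the C++ side, the generated package is then consumed through the info header. A hypothetical sketch follows; the <name><Suffix> class names assume the usual generate_lbm_package / generate_info_header naming convention and should be checked against the actually generated files:

// Hypothetical consumer sketch; class names assume the usual
// <name><Suffix> convention of generate_lbm_package / generate_info_header.
#include "field/FlagField.h"
#include "TaylorGreenVortexHeader.h"  // aggregated typedefs written by generate_info_header

using FlagField_T            = walberla::FlagField< walberla::uint8_t >;
using StorageSpecification_T = walberla::lbm::TaylorGreenVortexStorageSpecification;
using SweepCollection_T      = walberla::lbm::TaylorGreenVortexSweepCollection;
using BoundaryCollection_T   = walberla::lbm::TaylorGreenVortexBoundaryCollection< FlagField_T >;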
@@ -6,6 +6,11 @@ import waLBerla as wlb
from waLBerla.tools.config import block_decomposition
from waLBerla.tools.sqlitedb import sequenceValuesToScalars
try:
import machinestate as ms
except ImportError:
ms = None
def num_time_steps(block_size, time_steps_for_256_block=50):
# Number of time steps run for a workload of 256^3 cells per process
@@ -137,6 +142,14 @@ class Scenario:
data['walberla_version'] = wlb.build_info.version
data['build_machine'] = wlb.build_info.build_machine
if ms:
state = ms.MachineState(extended=False, anonymous=True)
state.generate() # generate subclasses
state.update() # read information
data["MachineState"] = str(state.get())
else:
print("MachineState module is not available. MachineState was not saved")
sequenceValuesToScalars(data)
        csv_file = "thermocapillary_benchmark.csv"
......
@@ -7,8 +7,10 @@ import sympy as sp
from jinja2 import Environment, PackageLoader, StrictUndefined
from pystencils import Assignment, CreateKernelConfig, create_kernel, Field, FieldType, fields, Target
from pystencils.astnodes import LoopOverCoordinate
from pystencils.integer_functions import int_div
from pystencils.stencil import offset_to_direction_string
from pystencils.typing import TypedSymbol
from pystencils.typing import TypedSymbol, BasicType, PointerType, FieldPointerSymbol
from pystencils.stencil import inverse_direction
from pystencils.bit_masks import flag_cond
@@ -18,7 +20,7 @@ from lbmpy.enums import Stencil
from lbmpy.stencils import LBStencil
from pystencils_walberla.cmake_integration import CodeGenerationContext
from pystencils_walberla.kernel_selection import KernelFamily, KernelCallNode, SwitchNode
from pystencils_walberla.kernel_selection import KernelFamily, KernelCallNode, SwitchNode, AbortNode
from pystencils_walberla.jinja_filters import add_pystencils_filters_to_jinja_env
from pystencils_walberla.utility import config_from_context
@@ -101,6 +103,23 @@ class PackingKernelsCodegen:
self.accessors = [get_accessor(streaming_pattern, t) for t in get_timesteps(streaming_pattern)]
self.mask_field = fields(f'mask : uint32 [{self.dim}D]', layout=src_field.layout)
self.block_wise = True
        if not self.inplace or self.config.target != Target.GPU:
            self.block_wise = False
self.index = TypedSymbol("index", dtype=BasicType(np.int64))
self.index_shape = TypedSymbol("_size_0", dtype=BasicType(np.int64))
self.src_ptr_type = PointerType(self.src_field.dtype, const=True, restrict=True, double_pointer=True)
self.src_ptr = FieldPointerSymbol(self.src_field.name, self.src_field.dtype, const=True)
self.dst_ptr_type = PointerType(self.dst_field.dtype, const=False, restrict=True, double_pointer=True)
self.dst_ptr = FieldPointerSymbol(self.dst_field.name, self.dst_field.dtype, const=False)
self.data_src = TypedSymbol(f"_data_{self.src_field.name}_dp", dtype=self.src_ptr_type)
self.data_dst = TypedSymbol(f"_data_{self.dst_field.name}_dp", dtype=self.dst_ptr_type)
self.f = sp.IndexedBase(self.data_src, shape=self.index_shape)
self.d = sp.IndexedBase(self.data_dst, shape=self.index_shape)
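
The symbols above set up a block-wise calling convention: instead of one kernel launch per block, the generated kernel receives an array holding one base pointer per block of a level and selects its block via `index`. A plain C++ sketch of that access pattern (illustration only, not the literal generated code):

#include <cstdint>

// Sketch of the indirection set up above: _data_<field>_dp holds one base
// pointer per block on the current level, _size_0 is the number of blocks,
// and `index` picks the block (cf. Assignment(src_ptr, f[index]) below).
void blockWiseSweep(const double* const* dataSrcDp, double* const* dataDstDp, std::int64_t numBlocks)
{
    for (std::int64_t index = 0; index < numBlocks; ++index)
    {
        const double* src = dataSrcDp[index];  // pdfs base pointer of block `index`
        double* dst       = dataDstDp[index];  // pdfs_tmp base pointer of block `index`
        (void) src;
        (void) dst;  // per-cell stream/collide body omitted
    }
}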
def create_uniform_kernel_families(self, kernels_dict=None):
kernels = dict() if kernels_dict is None else kernels_dict
@@ -115,6 +134,8 @@ class PackingKernelsCodegen:
def create_nonuniform_kernel_families(self, kernels_dict=None):
kernels = dict() if kernels_dict is None else kernels_dict
kernels['localCopyRedistribute'] = self.get_local_copy_redistribute_kernel_family()
kernels['localPartialCoalescence'] = self.get_local_copy_partial_coalescence_kernel_family()
kernels['unpackRedistribute'] = self.get_unpack_redistribute_kernel_family()
kernels['packPartialCoalescence'] = self.get_pack_partial_coalescence_kernel_family()
kernels['zeroCoalescenceRegion'] = self.get_zero_coalescence_region_kernel_family()
@@ -231,7 +252,10 @@ class PackingKernelsCodegen:
dir_string = offset_to_direction_string(comm_dir)
streaming_dirs = self.get_streaming_dirs(comm_dir)
src, dst = self._stream_out_accs(timestep)
assignments = []
assignments = list()
if self.block_wise:
assignments.append(Assignment(self.src_ptr, self.f[self.index]))
assignments.append(Assignment(self.dst_ptr, self.d[self.index]))
dir_indices = sorted(self.stencil.index(d) for d in streaming_dirs)
if len(dir_indices) == 0:
return None
@@ -283,15 +307,59 @@ class PackingKernelsCodegen:
return create_kernel(assignments, config=config)
def get_unpack_redistribute_kernel_family(self):
return self._construct_directionwise_kernel_family(self.get_unpack_redistribute_ast)
return self._construct_directionwise_kernel_family(self.get_unpack_redistribute_ast,
exclude_time_step=Timestep.EVEN)
def get_local_copy_redistribute_ast(self, comm_dir, timestep):
# TODO
raise NotImplementedError()
assert not all(d == 0 for d in comm_dir)
ctr = [LoopOverCoordinate.get_loop_counter_symbol(i) for i in range(self.stencil.D)]
dir_string = offset_to_direction_string(comm_dir)
streaming_dirs = self.get_streaming_dirs(inverse_direction(comm_dir))
dir_indices = sorted(self.stencil.index(d) for d in streaming_dirs)
if len(dir_indices) == 0:
return None
        # for inplace streaming the dst (fine grid) must always be in the odd state
dst_timestep = Timestep.ODD if self.inplace else Timestep.BOTH
_, dst = self._stream_out_accs(dst_timestep)
src, _ = self._stream_out_accs(timestep)
src_abs = self.src_field.new_field_with_different_name(self.src_field.name)
src_abs.field_type = FieldType.CUSTOM
orthos = self.orthogonal_principals(comm_dir)
sub_dirs = self.contained_principals(comm_dir)
orthogonal_combinations = self.linear_combinations(orthos)
subdir_combinations = self.linear_combinations_nozero(sub_dirs)
second_gl_dirs = [o + s for o, s in product(orthogonal_combinations, subdir_combinations)]
negative_dir_correction = np.array([(1 if d == -1 else 0) for d in comm_dir])
assignments = []
for offset in orthogonal_combinations:
o = offset + negative_dir_correction
for d in range(self.values_per_cell):
field_acc = dst[d].get_shifted(*o)
src_access = [int_div(ctr[i], 2) + o for i, o in enumerate(src[d].offsets)]
assignments.append(Assignment(field_acc, src_abs.absolute_access(src_access, (d, ))))
for offset in second_gl_dirs:
o = offset + negative_dir_correction
for d in dir_indices:
field_acc = dst[d].get_shifted(*o)
src_access = [int_div(ctr[i], 2) + o for i, o in enumerate(src[d].offsets)]
assignments.append(Assignment(field_acc, src_abs.absolute_access(src_access, (d, ))))
function_name = f'localCopyRedistribute_{dir_string}' + timestep_suffix(timestep)
iteration_slice = tuple(slice(None, None, 2) for _ in range(self.dim))
config = CreateKernelConfig(function_name=function_name, iteration_slice=iteration_slice,
data_type=self.data_type, ghost_layers=0, allow_double_writes=True,
cpu_openmp=self.config.cpu_openmp, target=self.config.target)
return create_kernel(assignments, config=config)
def get_local_copy_redistribute_kernel_family(self):
# TODO
raise NotImplementedError()
return self._construct_directionwise_kernel_family(self.get_local_copy_redistribute_ast)
# --------------------------- Pack / Unpack / LocalCopy Fine to Coarse ---------------------------------------------
@@ -322,7 +390,8 @@ class PackingKernelsCodegen:
return ast
def get_pack_partial_coalescence_kernel_family(self):
return self._construct_directionwise_kernel_family(self.get_pack_partial_coalescence_ast)
return self._construct_directionwise_kernel_family(self.get_pack_partial_coalescence_ast,
exclude_time_step=Timestep.ODD)
def get_unpack_coalescence_ast(self, comm_dir, timestep):
config = replace(self.config, ghost_layers=0)
@@ -370,12 +439,53 @@ class PackingKernelsCodegen:
def get_zero_coalescence_region_kernel_family(self):
return self._construct_directionwise_kernel_family(self.get_zero_coalescence_region_ast)
# TODO
def get_local_copy_partial_coalescence_ast(self, comm_dir, timestep):
raise NotImplementedError()
assert not all(d == 0 for d in comm_dir)
ctr = [LoopOverCoordinate.get_loop_counter_symbol(i) for i in range(self.stencil.D)]
dir_string = offset_to_direction_string(comm_dir)
streaming_dirs = self.get_streaming_dirs(comm_dir)
dir_indices = sorted(self.stencil.index(d) for d in streaming_dirs)
if len(dir_indices) == 0:
return None
buffer = sp.symbols(f"b_:{self.values_per_cell}")
        # for inplace streaming the src (fine grid) must always be in the even state
src_timestep = Timestep.ODD if self.inplace else Timestep.BOTH
src, _ = self._stream_in_accs(src_timestep)
_, dst = self._stream_in_accs(timestep.next())
mask = self.mask_field
dst_abs = self.dst_field.new_field_with_different_name(self.dst_field.name)
dst_abs.field_type = FieldType.CUSTOM
coalescence_factor = sp.Rational(1, 2 ** self.dim)
offsets = list(product(*((0, 1) for _ in comm_dir)))
assignments = []
for i, d in enumerate(dir_indices):
acc = 0
for o in offsets:
acc += flag_cond(d, mask[o], src[d].get_shifted(*o))
assignments.append(Assignment(buffer[i], acc))
for i, d in enumerate(dir_indices):
index = dst[d].index
dst_access = [int_div(ctr[i], 2) + o for i, o in enumerate(dst[d].offsets)]
assignments.append(Assignment(dst_abs.absolute_access(dst_access, index),
dst_abs.absolute_access(dst_access, index) + coalescence_factor * buffer[i]))
iteration_slice = tuple(slice(None, None, 2) for _ in range(self.dim))
config = replace(self.config, iteration_slice=iteration_slice, ghost_layers=0)
ast = create_kernel(assignments, config=config)
ast.function_name = f'localPartialCoalescence_{dir_string}' + timestep_suffix(timestep)
return ast
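
Schematically, the assignments above implement the partial coalescence as a masked average over the 2^d fine-grid children of each coarse cell x_c (d = self.dim, hence coalescence_factor = 1/2^d):

    b_q = \sum_{s \in \{0,1\}^d} \chi_q(2 x_c + s) \, f_q^{\mathrm{fine}}(2 x_c + s), \qquad
    f_q^{\mathrm{coarse}}(x_c) \leftarrow f_q^{\mathrm{coarse}}(x_c) + 2^{-d} \, b_q

where q runs over dir_indices and \chi_q is the per-cell flag read through flag_cond from the mask field.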
def get_local_copy_partial_coalescence_kernel_family(self):
raise NotImplementedError()
return self._construct_directionwise_kernel_family(self.get_local_copy_partial_coalescence_ast)
# ------------------------------------------ Utility ---------------------------------------------------------------
@@ -425,7 +535,7 @@ class PackingKernelsCodegen:
# --------------------------- Private Members ----------------------------------------------------------------------
def _construct_directionwise_kernel_family(self, create_ast_callback):
def _construct_directionwise_kernel_family(self, create_ast_callback, exclude_time_step=None):
subtrees = []
direction_symbol = TypedSymbol('dir', dtype='stencil::Direction')
for t in get_timesteps(self.streaming_pattern):
@@ -439,7 +549,10 @@ class PackingKernelsCodegen:
continue
kernel_call = KernelCallNode(ast)
cases_dict[f"stencil::{dir_string}"] = kernel_call
subtrees.append(SwitchNode(direction_symbol, cases_dict))
if exclude_time_step is not None and t == exclude_time_step:
                subtrees.append(AbortNode("This function cannot be called! Please contact the waLBerla team"))
else:
subtrees.append(SwitchNode(direction_symbol, cases_dict))
if not self.inplace:
tree = subtrees[0]
......
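
With exclude_time_step set, the selection subtree for that time step collapses to an abort. Assuming AbortNode expands to a WALBERLA_ABORT call, the generated dispatch looks roughly like the following sketch for exclude_time_step == Timestep.ODD (function name, signature, and kernel call are illustrative only):

#include "core/Abort.h"
#include "stencil/Directions.h"

// Rough shape of the generated selection tree (sketch, not generated output):
// even time steps dispatch over the communication direction, the excluded
// odd time step aborts immediately.
void packPartialCoalescence(walberla::stencil::Direction dir, unsigned char timestep)
{
    if ((timestep & 1) == 0)
    {
        switch (dir)
        {
        case walberla::stencil::N:
            // kernel_packPartialCoalescence_N(...);  // hypothetical kernel call
            break;
        default:
            break;
        }
    }
    else
    {
        WALBERLA_ABORT("This function cannot be called! Please contact the waLBerla team")
    }
}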
@@ -113,8 +113,8 @@ def generate_lbm_storage_specification(generation_context: CodeGenerationContext
'kernels': kernels,
'direction_sizes': cg.get_direction_sizes(),
'src_field': cg.src_field,
'dst_field': cg.dst_field
'dst_field': cg.dst_field,
'block_wise': cg.block_wise
}
if nonuniform:
jinja_context['mask_field'] = cg.mask_field
......
from dataclasses import replace
from typing import Dict
from jinja2 import Environment, PackageLoader, StrictUndefined
import sympy as sp
import numpy as np
from pystencils import Target, create_kernel
from pystencils import Target, create_kernel, Assignment
from pystencils.bit_masks import flag_cond
from pystencils.config import CreateKernelConfig
from pystencils.field import Field
from pystencils.field import Field, fields
from pystencils.simp import add_subexpressions_for_field_reads
from pystencils.typing import BasicType, PointerType, FieldPointerSymbol, TypedSymbol, CastFunc
from lbmpy.advanced_streaming import is_inplace, get_accessor, Timestep
from lbmpy.creationfunctions import LbmCollisionRule, LBMConfig, LBMOptimisation
@@ -17,8 +21,8 @@ from lbmpy.updatekernels import create_lbm_kernel, create_stream_only_kernel
from pystencils_walberla.kernel_selection import KernelCallNode, KernelFamily
from pystencils_walberla.utility import config_from_context
from pystencils_walberla import generate_sweep_collection
from lbmpy_walberla.utility import create_pdf_field
from pystencils_walberla.jinja_filters import add_pystencils_filters_to_jinja_env
from lbmpy_walberla.utility import create_pdf_field, timestep_suffix
from .alternating_sweeps import EvenIntegerCondition
from .function_generator import kernel_family_function_generator
@@ -28,7 +32,7 @@ def generate_lbm_sweep_collection(ctx, class_name: str, collision_rule: LbmColli
lbm_config: LBMConfig, lbm_optimisation: LBMOptimisation,
refinement_scaling=None, macroscopic_fields: Dict[str, Field] = None,
target=Target.CPU, data_type=None, cpu_openmp=None, cpu_vectorize_info=None,
max_threads=None,
max_threads=None, set_pre_collision_pdfs=True,
**create_kernel_params):
config = config_from_context(ctx, target=target, data_type=data_type,
@@ -59,7 +63,6 @@ def generate_lbm_sweep_collection(ctx, class_name: str, collision_rule: LbmColli
field_layout=lbm_optimisation.field_layout)
config = replace(config, ghost_layers=0)
function_generators = []
def family(name):
@@ -68,6 +71,21 @@ def generate_lbm_sweep_collection(ctx, class_name: str, collision_rule: LbmColli
def generator(name, kernel_family):
return kernel_family_function_generator(name, kernel_family, namespace='lbm', max_threads=max_threads)
all_fields = collision_rule.bound_fields.union(collision_rule.free_fields)
all_fields.update({src_field, dst_field})
all_fields = list(sorted(all_fields, key=lambda e: str(e)))
swap_pairs = [(x, y) for x in all_fields for y in all_fields if y.name == f"{x.name}_tmp"]
for swap_pair in swap_pairs:
all_fields.remove(swap_pair[1])
bw_stream_collide = None
bw_stream = None
if is_inplace(streaming_pattern):
bw_stream_collide = block_wise_stream_collide(class_name, collision_rule, lbm_config,
src_field, dst_field, config)
bw_stream = block_wise_stream(class_name, collision_rule, lbm_config, src_field, dst_field, config)
function_generators.append(generator('streamCollide', family("streamCollide")))
function_generators.append(generator('collide', family("collide")))
function_generators.append(generator('stream', family("stream")))
@@ -76,7 +94,7 @@ def generate_lbm_sweep_collection(ctx, class_name: str, collision_rule: LbmColli
config_unoptimized = replace(config, cpu_vectorize_info=None, cpu_prepend_optimizations=[], cpu_blocking=None)
setter_family = get_setter_family(class_name, lb_method, src_field, streaming_pattern, macroscopic_fields,
config_unoptimized)
config_unoptimized, set_pre_collision_pdfs)
setter_generator = kernel_family_function_generator('initialise', setter_family,
namespace='lbm', max_threads=max_threads)
function_generators.append(setter_generator)
@@ -87,7 +105,66 @@ def generate_lbm_sweep_collection(ctx, class_name: str, collision_rule: LbmColli
namespace='lbm', max_threads=max_threads)
function_generators.append(getter_generator)
generate_sweep_collection(ctx, class_name, function_generators, refinement_scaling)
contexts_function_generators = list()
for fct in function_generators:
contexts_function_generators.append(fct())
namespaces = set([context['namespace'] for context in contexts_function_generators])
assert len(namespaces) == 1, "All function_generators must output the same namespace!"
namespace = namespaces.pop()
headers = set()
for context in contexts_function_generators:
for header in context['interface_spec'].headers:
headers.add(header)
for header in context['kernel'].get_headers():
headers.add(header)
kernel_list = list()
for context in contexts_function_generators:
kernel_list.append(context['kernel'])
kernels = list()
for context in contexts_function_generators:
kernels.append({
'kernel': context['kernel'],
'function_name': context['function_name'],
'ghost_layers_to_include': 'ghost_layers',
'field': context['field'],
'max_threads': context['max_threads']
})
target = kernels[0]['kernel'].target
jinja_context = {
'block_stream_collide': bw_stream_collide,
'block_stream': bw_stream,
'pdf_field': src_field,
'all_fields': all_fields,
'swap_pairs': swap_pairs,
'kernel_list': kernel_list,
'kernels': kernels,
'namespace': namespace,
'class_name': class_name,
'headers': headers,
'target': target.name.lower(),
'is_gpu': target == Target.GPU,
'parameter_scaling': refinement_scaling,
'stencil_name': lbm_config.stencil.name,
'D': lbm_config.stencil.D,
'Q': lbm_config.stencil.Q,
'inplace': is_inplace(lbm_config.streaming_pattern)
}
env = Environment(loader=PackageLoader('lbmpy_walberla'), undefined=StrictUndefined)
add_pystencils_filters_to_jinja_env(env)
header = env.get_template("LBMSweepCollection.tmpl.h").render(**jinja_context)
source = env.get_template("LBMSweepCollection.tmpl.cpp").render(**jinja_context)
source_extension = "cu" if target == Target.GPU and ctx.cuda else "cpp"
ctx.write_file(f"{class_name}.h", header)
ctx.write_file(f"{class_name}.{source_extension}", source)
class RefinementScaling:
@@ -167,14 +244,15 @@ def lbm_kernel_family(class_name, kernel_name,
return family
def get_setter_family(class_name, lb_method, pdfs, streaming_pattern, macroscopic_fields, config: CreateKernelConfig):
def get_setter_family(class_name, lb_method, pdfs, streaming_pattern, macroscopic_fields,
config: CreateKernelConfig, set_pre_collision_pdfs):
dim = lb_method.stencil.D
density = macroscopic_fields.get('density', 1.0)
velocity = macroscopic_fields.get('velocity', [0.0] * dim)
default_dtype = config.data_type.default_factory()
get_timestep = {"field_name": pdfs.name, "function": "getTimestep"}
get_timestep = {"field_name": pdfs.name, "function": "getTimestepPlusOne"}
temporary_fields = ()
field_swaps = ()
@@ -184,7 +262,8 @@ def get_setter_family(class_name, lb_method, pdfs, streaming_pattern, macroscopi
timestep_suffix = str(timestep)
setter = macroscopic_values_setter(lb_method,
density=density, velocity=velocity, pdfs=pdfs,
streaming_pattern=streaming_pattern, previous_timestep=timestep)
streaming_pattern=streaming_pattern, previous_timestep=timestep,
set_pre_collision_pdfs=set_pre_collision_pdfs)
if default_dtype != pdfs.dtype:
setter = add_subexpressions_for_field_reads(setter, data_type=default_dtype)
@@ -198,7 +277,8 @@ def get_setter_family(class_name, lb_method, pdfs, streaming_pattern, macroscopi
timestep = Timestep.BOTH
setter = macroscopic_values_setter(lb_method,
density=density, velocity=velocity, pdfs=pdfs,
streaming_pattern=streaming_pattern, previous_timestep=timestep)
streaming_pattern=streaming_pattern, previous_timestep=timestep,
set_pre_collision_pdfs=set_pre_collision_pdfs)
setter_ast = create_kernel(setter, config=config)
setter_ast.function_name = 'kernel_initialise'
@@ -249,3 +329,127 @@ def get_getter_family(class_name, lb_method, pdfs, streaming_pattern, macroscopi
family = KernelFamily(node, class_name, temporary_fields=temporary_fields, field_swaps=field_swaps)
return family
def block_wise_stream_collide(class_name, collision_rule, lbm_config, src_field, dst_field, config):
if not is_inplace(lbm_config.streaming_pattern):
ast, all_fields = create_block_wise_ast(collision_rule, src_field, dst_field,
lbm_config, Timestep.BOTH, config, False)
tree = KernelCallNode(ast)
else:
ast_even, all_fields = create_block_wise_ast(collision_rule, src_field, dst_field,
lbm_config, Timestep.EVEN, config, False)
even_call = KernelCallNode(ast_even)
ast_odd, _ = create_block_wise_ast(collision_rule, src_field, dst_field,
lbm_config, Timestep.ODD, config, False)
odd_call = KernelCallNode(ast_odd)
tree = EvenIntegerCondition('timestep', even_call, odd_call, parameter_dtype=np.uint8)
family = KernelFamily(tree, class_name)
indexed_to_field_name = dict()
for field in all_fields:
indexed_to_field_name[field.name] = f"_data_{field.name}_dp"
context = {
'kernel': family,
'all_fields': all_fields,
'namespace': 'lbm',
'function_name': 'blockStreamCollide',
'indexed_to_field_name': indexed_to_field_name,
'max_threads': None
}
return context
def block_wise_stream(class_name, collision_rule, lbm_config, src_field, dst_field, config):
if not is_inplace(lbm_config.streaming_pattern):
ast, all_fields = create_block_wise_ast(collision_rule, src_field, dst_field,
lbm_config, Timestep.BOTH, config, True)
tree = KernelCallNode(ast)
else:
ast_even, all_fields = create_block_wise_ast(collision_rule, src_field, dst_field,
lbm_config, Timestep.EVEN, config, True)
even_call = KernelCallNode(ast_even)
ast_odd, _ = create_block_wise_ast(collision_rule, src_field, dst_field,
lbm_config, Timestep.ODD, config, True)
odd_call = KernelCallNode(ast_odd)
tree = EvenIntegerCondition('timestep', even_call, odd_call, parameter_dtype=np.uint8)
family = KernelFamily(tree, class_name)
indexed_to_field_name = dict()
for field in all_fields:
indexed_to_field_name[field.name] = f"_data_{field.name}_dp"
context = {
'kernel': family,
'all_fields': all_fields,
'namespace': 'lbm',
'function_name': 'blockStream',
'indexed_to_field_name': indexed_to_field_name,
'max_threads': None
}
return context
def create_block_wise_ast(collision_rule, src_field, dst_field, lbm_config, timestep, config, stream_only):
stencil = lbm_config.stencil
streaming_pattern = lbm_config.streaming_pattern
default_dtype = config.data_type.default_factory()
config = replace(config, gpu_indexing_params={})
accessor = get_accessor(streaming_pattern, timestep)
if stream_only:
update_rule = create_stream_only_kernel(stencil, src_field, dst_field, accessor)
else:
update_rule = create_lbm_kernel(collision_rule, src_field, dst_field, accessor, data_type=default_dtype)
bound_fields = update_rule.bound_fields
free_fields = update_rule.free_fields
all_fields = list(bound_fields.union(free_fields))
all_fields.sort(key=lambda field: field.name)
index = TypedSymbol("index", dtype=BasicType(np.int64))
index_shape = TypedSymbol("_size_0", dtype=BasicType(np.int64))
ass = list()
for field in all_fields:
const = True if field in free_fields else False
ptr_type = PointerType(field.dtype, const=const, restrict=True, double_pointer=True)
ptr = FieldPointerSymbol(field.name, field.dtype, const=const)
f = sp.IndexedBase(TypedSymbol(f"_data_{field.name}_dp", dtype=ptr_type), shape=index_shape)
ass.append(Assignment(ptr, f[index]))
update_rule = ass + update_rule.all_assignments
ast = create_kernel(update_rule, config=config)
base_name = "kernel_BlockStream" if stream_only else "kernel_BlockStreamCollide"
ast.function_name = base_name + timestep_suffix(timestep)
ast.assumed_inner_stride_one = config.cpu_vectorize_info['assume_inner_stride_one']
return ast, all_fields
# def subexpressions_for_field_lbm_writes(ac, lattice_stencil, src_field):
# field_writes = set()
#
# for assignment in ac.all_assignments:
# if hasattr(assignment, 'lhs') and hasattr(assignment, 'rhs'):
# field_writes.update(assignment.lhs.atoms(Field.Access))
#
# field_writes = {f for f in field_writes if f.field.name == src_field.name}
# field_writes = list(field_writes)
# lbm_reads.sort(key=lambda field_read: lattice_stencil.index(field_read.offsets))
#
# substitutions = {fa: sp.Symbol(f"p_{i}") for i, fa in enumerate(lbm_reads)}
# for i, fa in enumerate(other_reads):
# substitutions[fa] = sp.Symbol(f"aleph_{i}")
# return (ac.new_with_substitutions(substitutions, add_substitutions_as_subexpressions=True,
# substitute_on_lhs=False, sort_topologically=False), lbm_reads)
@@ -41,12 +41,12 @@ class {{class_name}}
enum Type { ALL = 0, INNER = 1, OUTER = 2 };
{{class_name}}( {{- ["const shared_ptr<StructuredBlockForest> & blocks", "BlockDataID flagID_", "BlockDataID pdfsID_", "FlagUID domainUID_", [kernel_list|generate_constructor_parameters(['indexVector', 'indexVectorSize', 'pdfs'])], additional_constructor_arguments] | type_identifier_list -}} )
{{class_name}}( {{- ["const shared_ptr<StructuredBlockForest> & blocks", "BlockDataID flagID_", "BlockDataID pdfsID_", "FlagUID domainUID_", [kernel_list|generate_constructor_parameters(['indexVector', 'indexVectorSize', 'forceVector', 'forceVectorSize', 'pdfs'])], additional_constructor_arguments] | type_identifier_list -}} )
: blocks_(blocks), flagID(flagID_), pdfsID(pdfsID_), domainUID(domainUID_)
{
{% for object_name, boundary_class, kernel, additional_data_handler in zip(object_names, boundary_classes, kernel_list, additional_data_handlers) -%}
{{object_name}} = std::make_shared< {{boundary_class}} >({{- ["blocks", "pdfsID", [kernel|generate_function_collection_call(['indexVector', 'indexVectorSize', 'pdfs', 'timestep', 'gpuStream'], use_field_ids=True)], additional_data_handler.constructor_argument_name] | type_identifier_list -}});
{{object_name}} = std::make_shared< {{boundary_class}} >({{- ["blocks", "pdfsID", [kernel|generate_function_collection_call(['indexVector', 'indexVectorSize', 'forceVector', 'forceVectorSize', 'pdfs', 'timestep', 'gpuStream'], use_field_ids=True)], additional_data_handler.constructor_argument_name] | type_identifier_list -}});
{% endfor %}
{% for object_name, flag_uid in zip(object_names, flag_uids) -%}
......
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \\file {{class_name}}.cpp
//! \\author pystencils
//======================================================================================================================
#include "{{class_name}}.h"
{% if target is equalto 'cpu' -%}
#define FUNC_PREFIX
{%- elif target is equalto 'gpu' -%}
#define FUNC_PREFIX __global__
{%- endif %}
#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG )
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wunused-variable"
# pragma GCC diagnostic ignored "-Wignored-qualifiers"
#endif
#if ( defined WALBERLA_CXX_COMPILER_IS_INTEL )
#pragma warning push
#pragma warning( disable : 1599 )
#endif
#ifdef __CUDACC__
#pragma push
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
#pragma nv_diag_suppress 191
#else
#pragma diag_suppress 191
#endif
#endif
using namespace std;
namespace walberla {
namespace {{namespace}} {
{%if block_stream_collide -%}
{{block_stream_collide['kernel']|generate_definitions(target, block_stream_collide['max_threads'])}}
{{block_stream['kernel']|generate_definitions(target, block_stream['max_threads'])}}
{%- endif %}
{% for kernel in kernels %}
{{kernel['kernel']|generate_definitions(target, kernel['max_threads'])}}
{% endfor %}
{%if block_stream_collide -%}
void {{class_name}}::{{block_stream_collide['function_name']}}({{- ["uint_t level", "uint8_t timestep", ["gpuStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}})
{
{%if target is equalto 'gpu' -%}
dim3 _grid = grid_[level];
dim3 _block = block_[level];
{%- for field in block_stream_collide['all_fields'] %}
{{field.dtype.c_name}} ** {{block_stream_collide['indexed_to_field_name'][field.name]}} = {{field.name}}PointersGPU[level];
{%- endfor %}
{%- for swap_pair in swap_pairs %}
{{swap_pair[1].dtype.c_name}} ** {{block_stream_collide['indexed_to_field_name'][swap_pair[1].name]}} = {{swap_pair[1].name}}PointersGPU[level];
{%- endfor %}
{% else %}
{%- for field in block_stream_collide['all_fields'] %}
{{field.dtype.c_name}} ** {{block_stream_collide['indexed_to_field_name'][field.name]}} = {{field.name}}Pointers[level].data();
{%- endfor %}
{%- for swap_pair in swap_pairs %}
{{swap_pair[1].dtype.c_name}} ** {{block_stream_collide['indexed_to_field_name'][swap_pair[1].name]}} = {{swap_pair[1].name}}Pointers[level].data();
{%- endfor %}
{%- endif %}
const int64_t _size_0 = size_0[level];
int64_t _size_{{block_stream_collide['all_fields'][0].name}}_0 = size_1;
int64_t _size_{{block_stream_collide['all_fields'][0].name}}_1 = size_2;
int64_t _size_{{block_stream_collide['all_fields'][0].name}}_2 = size_3;
{{block_stream_collide['kernel']|generate_field_strides()|indent(3)}}
{{block_stream_collide['kernel']|generate_refs_for_kernel_parameters(prefix="this->", parameters_to_ignore=["_size_0"], ignore_fields=True, parameter_registration=parameter_scaling, level_known=True)|indent(3)}}
{{block_stream_collide['kernel']|generate_call(stream='stream', plain_kernel_call=True)|indent(3)}}
{%- for swap_pair in swap_pairs %}
{%if target is equalto 'gpu' -%}
std::swap({{swap_pair[0].name}}PointersGPU[level], {{swap_pair[1].name}}PointersGPU[level]);
{% else %}
std::swap({{swap_pair[0].name}}Pointers[level], {{swap_pair[1].name}}Pointers[level]);
{%- endif %}
{%- endfor %}
}
void {{class_name}}::ghostLayerPropagation({{- ["uint_t level", "uint8_t timestep", ["gpuStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}})
{
{{block_stream['kernel']|generate_field_strides()|indent(3)}}
{%if target is equalto 'gpu' -%}
auto parallelSection_ = parallelStreams_.parallelSection( stream );
for (auto it = glPropagationPDFs[level].begin(); it != glPropagationPDFs[level].end(); it++){
if(it->second.empty()){ continue;}
int64_t _size_0 = int64_c(it->second.size());
int64_t _size_{{pdf_field.name}}_0 = std::get<0>(it->first);
int64_t _size_{{pdf_field.name}}_1 = std::get<1>(it->first);
int64_t _size_{{pdf_field.name}}_2 = std::get<2>(it->first);
double ** _data_{{pdf_field.name}}_dp = glPropagationPDFsGPU[level][it->first];
dim3 _grid = glPropagationGrid_[level][it->first];
dim3 _block = glPropagationBlock_[level][it->first];
parallelSection_.run([&]( auto s ) {
{{block_stream['kernel']|generate_call(stream='s', plain_kernel_call=True)|indent(9)}}
});
}
{% else %}
for (auto it = glPropagationPDFs[level].begin(); it != glPropagationPDFs[level].end(); it++){
if(it->second.empty()){ continue;}
int64_t _size_0 = int64_c(it->second.size());
int64_t _size_{{pdf_field.name}}_0 = std::get<0>(it->first);
int64_t _size_{{pdf_field.name}}_1 = std::get<1>(it->first);
int64_t _size_{{pdf_field.name}}_2 = std::get<2>(it->first);
double ** _data_{{pdf_field.name}}_dp = it->second.data();
{{block_stream['kernel']|generate_call(stream='s', plain_kernel_call=True)|indent(6)}}
}
{%- endif %}
}
{%- endif %}
{% for kernel in kernels %}
void {{class_name}}::{{kernel['function_name']}}( {{kernel['kernel']|generate_plain_parameter_list(ghost_layers=True)}} )
{
{{kernel['kernel']|generate_call(ghost_layers_to_include=kernel['ghost_layers_to_include'], stream='stream')|indent(3)}}
}
void {{class_name}}::{{kernel['function_name']}}CellInterval( {{kernel['kernel']|generate_plain_parameter_list(cell_interval='ci')}})
{
{{kernel['kernel']|generate_call(stream='stream', cell_interval='ci')|indent(3)}}
}
{% endfor %}
} // namespace {{namespace}}
} // namespace walberla
#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG )
# pragma GCC diagnostic pop
#endif
#if ( defined WALBERLA_CXX_COMPILER_IS_INTEL )
#pragma warning pop
#endif
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \\file {{class_name}}.h
//! \\author pystencils
//======================================================================================================================
#pragma once
#include "blockforest/StructuredBlockForest.h"
#include "blockforest/BlockID.h"
#include "blockforest/Block.h"
#include "core/DataTypes.h"
#include "core/logging/Logging.h"
#include "core/Macros.h"
#include "field/AddToStorage.h"
#include "field/FlagField.h"
#include "field/iterators/FieldIterator.h"
{% if target is equalto 'gpu' -%}
#include "gpu/AddGPUFieldToStorage.h"
#include "gpu/GPUField.h"
#include "gpu/ParallelStreams.h"
{%- endif %}
#include "domain_decomposition/BlockDataID.h"
#include "domain_decomposition/IBlock.h"
#include "domain_decomposition/StructuredBlockStorage.h"
#include "field/SwapableCompare.h"
#include "field/GhostLayerField.h"
#include "stencil/Directions.h"
#include "stencil/{{stencil_name}}.h"
#include <set>
#include <cmath>
{% for header in headers %}
#include {{header}}
{% endfor %}
using namespace std::placeholders;
#ifdef __GNUC__
#define RESTRICT __restrict__
#elif _MSC_VER
#define RESTRICT __restrict
#else
#define RESTRICT
#endif
#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG )
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wunused-parameter"
# pragma GCC diagnostic ignored "-Wreorder"
#endif
namespace walberla {
namespace {{namespace}} {
class {{class_name}}
{
public:
enum Type { ALL = 0, INNER = 1, OUTER = 2 };
using sizeTuple = std::tuple<int64_t, int64_t, int64_t>;
{{class_name}}(const shared_ptr< StructuredBlockForest > & blocks, {{kernel_list|generate_constructor_parameters}}, const Cell & outerWidth=Cell(1, 1, 1))
: blocks_(blocks), {{ kernel_list|generate_constructor_initializer_list(parameter_registration=parameter_scaling) }}, outerWidth_(outerWidth)
{
{{kernel_list|generate_constructor(parameter_registration=parameter_scaling) |indent(6)}}
validInnerOuterSplit_ = true;
for (auto& iBlock : *blocks)
{
if (int_c(blocks->getNumberOfXCells(iBlock)) <= outerWidth_[0] * 2 ||
int_c(blocks->getNumberOfYCells(iBlock)) <= outerWidth_[1] * 2 ||
int_c(blocks->getNumberOfZCells(iBlock)) <= outerWidth_[2] * 2)
validInnerOuterSplit_ = false;
}
}
void initialiseBlockPointer()
{
{%if block_stream_collide -%}
size_0.resize(blocks_->getNumberOfLevels());
{%- for field in block_stream_collide['all_fields'] %}
{{field.name}}Pointers.resize(blocks_->getNumberOfLevels());
{%if target is equalto 'gpu' -%} {{field.name}}PointersGPU.resize(blocks_->getNumberOfLevels()); {% endif %}
{%- endfor %}
{%if target is equalto 'gpu' -%} block_.resize(blocks_->getNumberOfLevels()); {% endif %}
{%if target is equalto 'gpu' -%} grid_.resize(blocks_->getNumberOfLevels()); {% endif %}
glPropagationPDFs.resize(blocks_->getNumberOfLevels());
{%if target is equalto 'gpu' -%} glPropagationPDFsGPU.resize(blocks_->getNumberOfLevels()); {% endif %}
{%if target is equalto 'gpu' -%} glPropagationBlock_.resize(blocks_->getNumberOfLevels()); {% endif %}
{%if target is equalto 'gpu' -%} glPropagationGrid_.resize(blocks_->getNumberOfLevels()); {% endif %}
for( auto it = blocks_->begin(); it != blocks_->end(); ++it )
{
auto* local = dynamic_cast< Block* >(it.get());
{%- for field in block_stream_collide['all_fields'] %}
auto {{field.name}} = local->getData< {{field | field_type(is_gpu=is_gpu)}} >({{field.name}}ID);
{%- endfor %}
size_1 = int64_c({{block_stream_collide['all_fields'][0].name}}->xSize());
size_2 = int64_c({{block_stream_collide['all_fields'][0].name}}->ySize());
size_3 = int64_c({{block_stream_collide['all_fields'][0].name}}->zSize());
{%- for field in block_stream_collide['all_fields'] %}
stride_{{field.name}}_0 = int64_c({{field.name}}->xStride());
stride_{{field.name}}_1 = int64_c({{field.name}}->yStride());
stride_{{field.name}}_2 = int64_c({{field.name}}->zStride());
stride_{{field.name}}_3 = int64_c(1 * int64_c({{field.name}}->fStride()));
{%- endfor %}
break;
}
for( auto it = blocks_->begin(); it != blocks_->end(); ++it )
{
auto* local = dynamic_cast< Block* >(it.get());
const uint_t level = local->getLevel();
{%- for field in block_stream_collide['all_fields'] %}
auto {{field.name}} = local->getData< {{field | field_type(is_gpu=is_gpu)}} >({{field.name}}ID);
{{field.name}}Pointers[level].emplace_back({{field.name}}->dataAt(0, 0, 0, 0));
{%- endfor %}
for(auto dir = stencil::{{stencil_name}}::beginNoCenter(); dir != stencil::{{stencil_name}}::end(); ++dir){
uint_t nSecIdx = blockforest::getBlockNeighborhoodSectionIndex(*dir);
// Propagate on ghost layers shadowing coarse or no blocks
if(local->neighborhoodSectionHasLargerBlock(nSecIdx)){
CellInterval ci;
{{pdf_field.name}}->getGhostRegion(*dir, ci, 1);
sizeTuple dirTuple = std::make_tuple(int64_c(ci.xSize()), int64_c(ci.ySize()), int64_c(ci.zSize()));
glPropagationPDFs[level][dirTuple].push_back({{pdf_field.name}}->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0));
}
}
}
for (uint_t level = 0; level < blocks_->getNumberOfLevels(); level++) {
size_0[level] = int64_c({{pdf_field.name}}Pointers[level].size());
{%if target is equalto 'gpu' -%}
int64_t indexingX = size_1 * size_0[level];
int64_t indexingY = size_2;
int64_t indexingZ = size_3;
int64_t cudaBlockSize0 = 128;
int64_t cudaBlockSize1 = 1;
int64_t cudaBlockSize2 = 1;
        // Launch configuration: clamp the per-level extents to the generated
        // CUDA block (cudaBlockSize0/1/2) and the hardware limits (1024, 1024, 64),
        // then derive the grid as the ceiling division of extent by block size.
        const int64_t bx = indexingX < cudaBlockSize0 ? indexingX : cudaBlockSize0;
        const int64_t byCap = cudaBlockSize1 * (cudaBlockSize0 / bx);
        const int64_t by = indexingY < byCap ? indexingY : byCap;
        const int64_t bzCap = cudaBlockSize2 * ((cudaBlockSize0 * cudaBlockSize1) / (bx * by));
        const int64_t bz = indexingZ < bzCap ? indexingZ : bzCap;
        const int64_t blockX = bx < 1024 ? bx : 1024;
        const int64_t blockY = by < 1024 ? by : 1024;
        const int64_t blockZ = bz < 64 ? bz : 64;
        block_[level] = dim3((unsigned int) blockX, (unsigned int) blockY, (unsigned int) blockZ);
        grid_[level]  = dim3((unsigned int) ((indexingX + blockX - 1) / blockX),
                             (unsigned int) ((indexingY + blockY - 1) / blockY),
                             (unsigned int) ((indexingZ + blockZ - 1) / blockZ));
{%- for field in block_stream_collide['all_fields'] %}
WALBERLA_GPU_CHECK(gpuMalloc( (void**)&{{field.name}}PointersGPU[level], sizeof(double* ) * {{field.name}}Pointers[level].size() ));
WALBERLA_GPU_CHECK(gpuMemcpy( {{field.name}}PointersGPU[level], &{{field.name}}Pointers[level][0], sizeof(double *) * {{field.name}}Pointers[level].size(), gpuMemcpyHostToDevice ));
{%- endfor %}
for (auto it = glPropagationPDFs[level].begin(); it != glPropagationPDFs[level].end(); it++){
if(it->second.empty()){ continue;}
indexingX = std::get<0>(it->first) * int64_c(it->second.size());
indexingY = std::get<1>(it->first);
indexingZ = std::get<2>(it->first);
cudaBlockSize0 = 32;
cudaBlockSize1 = 1;
cudaBlockSize2 = 1;
glPropagationBlock_[level][it->first] = dim3((unsigned int)((1024 < ((indexingX < cudaBlockSize0) ? indexingX : cudaBlockSize0)) ? 1024 : ((indexingX < cudaBlockSize0) ? indexingX : cudaBlockSize0)), (unsigned int)((1024 < ((indexingY < cudaBlockSize1*((int64_t)(cudaBlockSize0) / (int64_t)(((indexingX < cudaBlockSize0) ? indexingX : cudaBlockSize0)))) ? indexingY : cudaBlockSize1*((int64_t)(cudaBlockSize0) / (int64_t)(((indexingX < cudaBlockSize0) ? indexingX : cudaBlockSize0))))) ? 1024 : ((indexingY < cudaBlockSize1*((int64_t)(cudaBlockSize0) / (int64_t)(((indexingX < cudaBlockSize0) ? indexingX : cudaBlockSize0)))) ? indexingY : cudaBlockSize1*((int64_t)(cudaBlockSize0) / (int64_t)(((indexingX < cudaBlockSize0) ? indexingX : cudaBlockSize0))))), (unsigned int)((64 < ((indexingZ < cudaBlockSize2*((int64_t)(cudaBlockSize0*cudaBlockSize1) / (int64_t)(((indexingX < cudaBlockSize0) ? indexingX : cudaBlockSize0)*((indexingY < cudaBlockSize1*((int64_t)(cudaBlockSize0) / (int64_t)(((indexingX < cudaBlockSize0) ? indexingX : cudaBlockSize0)))) ? indexingY : cudaBlockSize1*((int64_t)(cudaBlockSize0) / (int64_t)(((indexingX < cudaBlockSize0) ? indexingX : cudaBlockSize0))))))) ? indexingZ : cudaBlockSize2*((int64_t)(cudaBlockSize0*cudaBlockSize1) / (int64_t)(((indexingX < cudaBlockSize0) ? indexingX : cudaBlockSize0)*((indexingY < cudaBlockSize1*((int64_t)(cudaBlockSize0) / (int64_t)(((indexingX < cudaBlockSize0) ? indexingX : cudaBlockSize0)))) ? indexingY : cudaBlockSize1*((int64_t)(cudaBlockSize0) / (int64_t)(((indexingX < cudaBlockSize0) ? indexingX : cudaBlockSize0)))))))) ? 64 : ((indexingZ < cudaBlockSize2*((int64_t)(cudaBlockSize0*cudaBlockSize1) / (int64_t)(((indexingX < cudaBlockSize0) ? indexingX : cudaBlockSize0)*((indexingY < cudaBlockSize1*((int64_t)(cudaBlockSize0) / (int64_t)(((indexingX < cudaBlockSize0) ? indexingX : cudaBlockSize0)))) ? indexingY : cudaBlockSize1*((int64_t)(cudaBlockSize0) / (int64_t)(((indexingX < cudaBlockSize0) ? indexingX : cudaBlockSize0))))))) ? indexingZ : cudaBlockSize2*((int64_t)(cudaBlockSize0*cudaBlockSize1) / (int64_t)(((indexingX < cudaBlockSize0) ? indexingX : cudaBlockSize0)*((indexingY < cudaBlockSize1*((int64_t)(cudaBlockSize0) / (int64_t)(((indexingX < cudaBlockSize0) ? indexingX : cudaBlockSize0)))) ? indexingY : cudaBlockSize1*((int64_t)(cudaBlockSize0) / (int64_t)(((indexingX < cudaBlockSize0) ? indexingX : cudaBlockSize0)))))))));
{
const int64_t maxBlockX = (indexingX < cudaBlockSize0) ? indexingX : cudaBlockSize0;
const int64_t maxBlockY = (indexingY < cudaBlockSize1 * (cudaBlockSize0 / maxBlockX)) ? indexingY : cudaBlockSize1 * (cudaBlockSize0 / maxBlockX);
const int64_t maxBlockZ = (indexingZ < cudaBlockSize2 * ((cudaBlockSize0 * cudaBlockSize1) / (maxBlockX * maxBlockY))) ? indexingZ : cudaBlockSize2 * ((cudaBlockSize0 * cudaBlockSize1) / (maxBlockX * maxBlockY));
const int64_t blockX = (maxBlockX < 1024) ? maxBlockX : 1024;
const int64_t blockY = (maxBlockY < 1024) ? maxBlockY : 1024;
const int64_t blockZ = (maxBlockZ < 64) ? maxBlockZ : 64;
glPropagationGrid_[level][it->first] = dim3((unsigned int)(indexingX % blockX == 0 ? indexingX / blockX : indexingX / blockX + 1),
(unsigned int)(indexingY % blockY == 0 ? indexingY / blockY : indexingY / blockY + 1),
(unsigned int)(indexingZ % blockZ == 0 ? indexingZ / blockZ : indexingZ / blockZ + 1));
}
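// Worked example for the launch-grid computation above: with indexingX = 200 fused
// cells and cudaBlockSize0 = 128, blockX = min(1024, min(200, 128)) = 128 and the
// grid gets ceil(200 / 128) = 2 thread blocks in x.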
WALBERLA_GPU_CHECK(gpuMalloc( (void**)&glPropagationPDFsGPU[level][it->first], sizeof(double* ) * it->second.size() ));
WALBERLA_GPU_CHECK(gpuMemcpy( glPropagationPDFsGPU[level][it->first], &it->second[0], sizeof(double* ) * it->second.size(), gpuMemcpyHostToDevice ));
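// The host-side table of per-block PDF pointers is mirrored to the device here, so a
// single fused kernel launch can later propagate the ghost layers of all equally
// sized regions at once.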
}
{% endif %}
}
{% endif %}
};
~{{class_name}}() {
{%if target is equalto 'gpu' and block_stream_collide -%}
for (uint_t level = 0; level < blocks_->getNumberOfLevels(); level++)
{
{%- for field in block_stream_collide['all_fields'] %}
if(!{{field.name}}Pointers[level].empty()){
WALBERLA_GPU_CHECK(gpuFree({{field.name}}PointersGPU[level]));
}
{%- endfor %}
for (auto it = glPropagationPDFs[level].begin(); it != glPropagationPDFs[level].end(); it++){
if(it->second.empty()){ continue;}
WALBERLA_GPU_CHECK(gpuFree(glPropagationPDFsGPU[level][it->first]));
}
}
{%- endif %}
}
/*************************************************************************************
* Internal function definitions with raw pointers
*************************************************************************************/
{%if block_stream_collide -%}
void blockStreamCollide({{- ["uint_t level", "uint8_t timestep", ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}});
void ghostLayerPropagation({{- ["uint_t level", "uint8_t timestep", ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}});
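// Reading of the interface (inferred from the definitions in the source template):
// blockStreamCollide fuses the stream-and-collide sweep over all blocks of one
// refinement level into a single call; ghostLayerPropagation afterwards moves the
// PDFs that crossed block-local ghost layers.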
{%- endif %}
{%- for kernel in kernels %}
static void {{kernel['function_name']}} ({{kernel['kernel']|generate_plain_parameter_list(ghost_layers=0, stream="nullptr")}});
static void {{kernel['function_name']}}CellInterval ({{kernel['kernel']|generate_plain_parameter_list(cell_interval='ci', stream="nullptr")}});
{% endfor %}
/*************************************************************************************
* Function definitions for external usage
*************************************************************************************/
std::function<void ()> blockStreamCollideFck({{- ["uint_t level", "uint8_t timestep", ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} )
{
{%if block_stream_collide -%}
return [{{- ["this", "level", "timestep", ["stream"] if target == 'gpu' else []] | type_identifier_list -}}](){
blockStreamCollide({{- ["level", "timestep", ["stream"] if target == 'gpu' else []] | type_identifier_list -}});
};
{% else %}
WALBERLA_ABORT("Block-wise stream collide is only generated when in-place streaming patterns are used")
{%- endif %}
}
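// Usage sketch (timeloop registration is hypothetical, not part of the generated
// interface): the returned functor binds level and timestep once, so a timeloop only
// sees a plain void() callable, e.g.
//   timeloop.addFuncBeforeTimeStep(sweeps.blockStreamCollideFck(level, timestep), "LBM");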
void streamCollideOverBlocks({{- ["uint_t level", "uint8_t timestep", ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} )
{
{%if block_stream_collide -%}
blockStreamCollide({{- ["level", "timestep", ["stream"] if target == 'gpu' else []] | type_identifier_list -}});
{% else %}
WALBERLA_ABORT("Block-wise stream collide is only generated when in-place streaming patterns are used")
{%- endif %}
}
{%- for kernel in kernels %}
std::function<void (IBlock *)> {{kernel['function_name']}}()
{
return [{{- ["this", ] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}({{- ["block", ] | type_identifier_list -}}); };
}
std::function<void (IBlock *)> {{kernel['function_name']}}({{- ["Type type", ] | type_identifier_list -}})
{
if (!validInnerOuterSplit_ && type != Type::ALL)
WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller, increase cellsPerBlock or avoid communication hiding")
switch (type)
{
case Type::INNER:
return [{{- ["this", ] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Inner({{- ["block", ] | type_identifier_list -}}); };
case Type::OUTER:
return [{{- ["this", ] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Outer({{- ["block", ] | type_identifier_list -}}); };
default:
return [{{- ["this", ] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}({{- ["block", ] | type_identifier_list -}}); };
}
}
std::function<void (IBlock *)> {{kernel['function_name']}}({{- ["Type type", "const cell_idx_t ghost_layers"] | type_identifier_list -}})
{
if (!validInnerOuterSplit_ && type != Type::ALL)
WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller, increase cellsPerBlock or avoid communication hiding")
switch (type)
{
case Type::INNER:
return [{{- ["this", ] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Inner({{- ["block", ] | type_identifier_list -}}); };
case Type::OUTER:
return [{{- ["this", ] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Outer({{- ["block", ] | type_identifier_list -}}); };
default:
return [{{- ["this", "ghost_layers"] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}({{- ["block", "ghost_layers"] | type_identifier_list -}}); };
}
}
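// The Type::INNER / Type::OUTER variants enable communication hiding: the block
// interior is swept while ghost-layer communication is still in flight, and the
// outer shell is swept afterwards; Type::ALL falls back to the full-block kernel.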
{% if target is equalto 'gpu' -%}
std::function<void (IBlock *)> {{kernel['function_name']}}({{- ["Type type", "const cell_idx_t ghost_layers", "gpuStream_t gpuStream"] | type_identifier_list -}})
{
if (!validInnerOuterSplit_ && type != Type::ALL)
WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller, increase cellsPerBlock or avoid communication hiding")
switch (type)
{
case Type::INNER:
return [{{- ["this", "gpuStream"] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Inner({{- ["block", "gpuStream"] | type_identifier_list -}}); };
case Type::OUTER:
return [{{- ["this", "gpuStream"] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Outer({{- ["block", "gpuStream"] | type_identifier_list -}}); };
default:
return [{{- ["this", "ghost_layers", "gpuStream"] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}({{- ["block", "ghost_layers", "gpuStream"] | type_identifier_list -}}); };
}
}
std::function<void (IBlock *)> {{kernel['function_name']}}({{- ["Type type", "gpuStream_t gpuStream"] | type_identifier_list -}})
{
if (!validInnerOuterSplit_ && type != Type::ALL)
WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller, increase cellsPerBlock or avoid communication hiding")
switch (type)
{
case Type::INNER:
return [{{- ["this", "gpuStream"] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Inner({{- ["block", "gpuStream"] | type_identifier_list -}}); };
case Type::OUTER:
return [{{- ["this", "gpuStream"] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Outer({{- ["block", "gpuStream"] | type_identifier_list -}}); };
default:
return [{{- ["this", "gpuStream"] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}({{- ["block", "cell_idx_c(0)", "gpuStream"] | type_identifier_list -}}); };
}
}
{%- endif %}
void {{kernel['function_name']}}({{- ["IBlock * block",] | type_identifier_list -}})
{
const cell_idx_t ghost_layers = 0;
{% if target is equalto 'gpu' -%}
gpuStream_t gpuStream = nullptr;
{%- endif %}
{{kernel['kernel']|generate_block_data_to_field_extraction|indent(6)}}
{{kernel['kernel']|generate_refs_for_kernel_parameters(prefix='this->', ignore_fields=True, parameter_registration=parameter_scaling)|indent(6)}}
{{kernel['kernel']|generate_timestep_advancements|indent(6)}}
{{kernel['function_name']}}({{kernel['kernel']|generate_function_collection_call(ghost_layers='ghost_layers')}});
{{kernel['kernel']|generate_swaps|indent(6)}}
}
void {{kernel['function_name']}}({{- ["IBlock * block", "const cell_idx_t ghost_layers"] | type_identifier_list -}})
{
{% if target is equalto 'gpu' -%}
gpuStream_t gpuStream = nullptr;
{%- endif %}
{{kernel['kernel']|generate_block_data_to_field_extraction|indent(6)}}
{{kernel['kernel']|generate_refs_for_kernel_parameters(prefix='this->', ignore_fields=True, parameter_registration=parameter_scaling)|indent(6)}}
{{kernel['kernel']|generate_timestep_advancements|indent(6)}}
{{kernel['function_name']}}({{kernel['kernel']|generate_function_collection_call(ghost_layers='ghost_layers')}});
{{kernel['kernel']|generate_swaps|indent(6)}}
}
{% if target is equalto 'gpu' -%}
void {{kernel['function_name']}}({{- ["IBlock * block", "const cell_idx_t ghost_layers", "gpuStream_t gpuStream"] | type_identifier_list -}})
{
{{kernel['kernel']|generate_block_data_to_field_extraction|indent(6)}}
{{kernel['kernel']|generate_refs_for_kernel_parameters(prefix='this->', ignore_fields=True, parameter_registration=parameter_scaling)|indent(6)}}
{{kernel['kernel']|generate_timestep_advancements|indent(6)}}
{{kernel['function_name']}}({{kernel['kernel']|generate_function_collection_call(ghost_layers='ghost_layers')}});
{{kernel['kernel']|generate_swaps|indent(6)}}
}
{%- endif %}
void {{kernel['function_name']}}CellInterval({{- ["IBlock * block", "const CellInterval & ci", ["gpuStream_t gpuStream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}})
{
{{kernel['kernel']|generate_block_data_to_field_extraction|indent(6)}}
{{kernel['kernel']|generate_refs_for_kernel_parameters(prefix='this->', ignore_fields=True, parameter_registration=parameter_scaling)|indent(6)}}
{{kernel['kernel']|generate_timestep_advancements|indent(6)}}
{{kernel['function_name']}}CellInterval({{kernel['kernel']|generate_function_collection_call(cell_interval='ci')}});
{{kernel['kernel']|generate_swaps|indent(6)}}
}
void {{kernel['function_name']}}Inner({{- ["IBlock * block", ["gpuStream_t gpuStream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}})
{
{{kernel['kernel']|generate_block_data_to_field_extraction|indent(6)}}
{{kernel['kernel']|generate_refs_for_kernel_parameters(prefix='this->', ignore_fields=True, parameter_registration=parameter_scaling)|indent(6)}}
{{kernel['kernel']|generate_timestep_advancements(advance=False)|indent(6)}}
CellInterval inner = {{kernel['field']}}->xyzSize();
inner.expand(Cell(-outerWidth_[0], -outerWidth_[1], -outerWidth_[2]));
{{kernel['function_name']}}CellInterval({{kernel['kernel']|generate_function_collection_call(cell_interval='inner')}});
}
void {{kernel['function_name']}}Outer({{- ["IBlock * block", ["gpuStream_t gpuStream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}})
{
{{kernel['kernel']|generate_block_data_to_field_extraction|indent(6)}}
{{kernel['kernel']|generate_refs_for_kernel_parameters(prefix='this->', ignore_fields=True, parameter_registration=parameter_scaling)|indent(6)}}
{{kernel['kernel']|generate_timestep_advancements|indent(6)}}
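// Lazily build the six outer slabs once: T and B keep the full x-y extent, N and S
// are shrunk in z, and E and W are shrunk in both y and z, so the slabs tile the
// outer shell without overlap.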
if( layers_.empty() )
{
CellInterval ci;
{{kernel['field']}}->getSliceBeforeGhostLayer(stencil::T, ci, outerWidth_[2], false);
layers_.push_back(ci);
{{kernel['field']}}->getSliceBeforeGhostLayer(stencil::B, ci, outerWidth_[2], false);
layers_.push_back(ci);
{{kernel['field']}}->getSliceBeforeGhostLayer(stencil::N, ci, outerWidth_[1], false);
ci.expand(Cell(0, 0, -outerWidth_[2]));
layers_.push_back(ci);
{{kernel['field']}}->getSliceBeforeGhostLayer(stencil::S, ci, outerWidth_[1], false);
ci.expand(Cell(0, 0, -outerWidth_[2]));
layers_.push_back(ci);
{{kernel['field']}}->getSliceBeforeGhostLayer(stencil::E, ci, outerWidth_[0], false);
ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2]));
layers_.push_back(ci);
{{kernel['field']}}->getSliceBeforeGhostLayer(stencil::W, ci, outerWidth_[0], false);
ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2]));
layers_.push_back(ci);
}
{%if target is equalto 'gpu'%}
{
auto parallelSection_ = parallelStreams_.parallelSection( gpuStream );
for( auto & ci: layers_ )
{
parallelSection_.run([&]( auto s ) {
{{kernel['function_name']}}CellInterval({{kernel['kernel']|generate_function_collection_call(cell_interval='ci')}});
});
}
}
{% else %}
for( auto & ci: layers_ )
{
{{kernel['function_name']}}CellInterval({{kernel['kernel']|generate_function_collection_call(cell_interval='ci')}});
}
{% endif %}
{{kernel['kernel']|generate_swaps|indent(9)}}
}
{% endfor %}
{%if target is equalto 'gpu'%}
void setOuterPriority(int priority)
{
parallelStreams_.setStreamPriority(priority);
}
{%endif%}
private:
shared_ptr< StructuredBlockForest > blocks_;
{{kernel_list|generate_members(parameter_registration=parameter_scaling)|indent(4)}}
Cell outerWidth_;
std::vector<CellInterval> layers_;
bool validInnerOuterSplit_;
{%if target is equalto 'gpu' -%}
gpu::ParallelStreams parallelStreams_;
{%- endif %}
{%if block_stream_collide -%}
std::vector<int64_t> size_0;
int64_t size_1;
int64_t size_2;
int64_t size_3;
{%- for field in block_stream_collide['all_fields'] %}
int64_t stride_{{field.name}}_0;
int64_t stride_{{field.name}}_1;
int64_t stride_{{field.name}}_2;
int64_t stride_{{field.name}}_3;
{% endfor %}
{%- for field in block_stream_collide['all_fields'] %}
std::vector<std::vector<{{field.dtype.c_name}} *>> {{field.name}}Pointers;
{% endfor -%}
std::vector<std::map<sizeTuple, std::vector<double *>>> glPropagationPDFs;
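// glPropagationPDFs maps, per refinement level, the size tuple of a ghost-layer
// interval to the PDF pointers of all regions sharing that extent, so equally sized
// regions can be fused into a single propagation kernel.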
{%if target is equalto 'gpu' -%}
{%- for field in block_stream_collide['all_fields'] %}
std::vector<{{field.dtype.c_name}} **> {{field.name}}PointersGPU;
{% endfor -%}
std::vector<dim3> block_;
std::vector<dim3> grid_;
std::vector<std::map<sizeTuple, double **>> glPropagationPDFsGPU;
std::vector<std::map<sizeTuple, dim3>> glPropagationBlock_;
std::vector<std::map<sizeTuple, dim3>> glPropagationGrid_;
{%- endif %}
{%- endif %}
};
} // namespace {{namespace}}
} // namespace walberla
#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG )
# pragma GCC diagnostic pop
#endif
@@ -19,12 +19,45 @@
#include "{{class_name}}.h"
{% if target is equalto 'cpu' -%}
#define FUNC_PREFIX
{%- elif target is equalto 'gpu' -%}
#define FUNC_PREFIX __global__
#include "gpu/GPUWrapper.h"
#include "gpu/GPUField.h"
{%- endif %}
#ifdef __GNUC__
#define RESTRICT __restrict__
#elif _MSC_VER
#define RESTRICT __restrict
#else
#define RESTRICT
#endif
#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG )
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wfloat-equal"
# pragma GCC diagnostic ignored "-Wshadow"
# pragma GCC diagnostic ignored "-Wconversion"
# pragma GCC diagnostic ignored "-Wunused-variable"
# pragma GCC diagnostic ignored "-Wignored-qualifiers"
#endif
#if ( defined WALBERLA_CXX_COMPILER_IS_INTEL )
#pragma warning push
#pragma warning( disable : 1599 )
#endif
#ifdef __CUDACC__
#pragma push
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
#pragma nv_diag_suppress 191
#else
#pragma diag_suppress 191
#endif
#endif
namespace walberla {
@@ -42,6 +75,8 @@ namespace {{namespace}} {
{{ kernels['localCopyDirection'] | generate_definitions }}
{% if nonuniform -%}
{{ kernels['localCopyRedistribute'] | generate_definitions }}
{{ kernels['localPartialCoalescence'] | generate_definitions }}
{{ kernels['unpackRedistribute'] | generate_definitions }}
{{ kernels['packPartialCoalescence'] | generate_definitions }}
{{ kernels['zeroCoalescenceRegion'] | generate_definitions }}
@@ -89,7 +124,7 @@ namespace {{namespace}} {
WALBERLA_ASSERT_EQUAL(srcInterval.zSize(), dstInterval.zSize())
{{kernels['localCopyAll']
| generate_call(cell_interval={src_field : 'srcInterval', dst_field : 'dstInterval'}, stream='stream')
| generate_call(cell_interval={src_field.name : 'srcInterval', dst_field.name : 'dstInterval'}, stream='stream')
| indent(6) }}
}
@@ -115,6 +150,56 @@ namespace {{namespace}} {
{{kernels['unpackDirection'] | generate_call(cell_interval='ci', stream='stream') | indent(6) }}
}
void {{class_name}}::PackKernels::localCopyDirection(
{{- [src_field.dtype.c_name + "** _data_" + src_field.name + "_dp", dst_field.dtype.c_name + "** _data_" + dst_field.name + "_dp",
kernels['localCopyDirection'].kernel_selection_parameters,
["gpuStream_t stream"] if is_gpu else [], "std::array<int64_t, 4>& _sizes", "std::array<int64_t, 4>& _strides"]
| type_identifier_list -}}
) const {
{% if block_wise -%}
{% if target is equalto 'gpu' -%}
const int64_t indexingX = _sizes[0] * _sizes[1];
const int64_t indexingY = _sizes[2];
const int64_t indexingZ = _sizes[3];
const int64_t cudaBlockSize0 = 128;
const int64_t cudaBlockSize1 = 1;
const int64_t cudaBlockSize2 = 1;
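// Launch configuration (block-size defaults as generated): the x dimension folds the
// number of fused communication intervals (_sizes[0]) into the cell count, so one
// launch covers every block pair communicating in this direction.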
const int64_t _size_0 = _sizes[0];
const int64_t _size_{{src_field.name}}_0 = _sizes[1];
const int64_t _size_{{src_field.name}}_1 = _sizes[2];
const int64_t _size_{{src_field.name}}_2 = _sizes[3];
const int64_t _size_{{dst_field.name}}_0 = _sizes[1];
const int64_t _size_{{dst_field.name}}_1 = _sizes[2];
const int64_t _size_{{dst_field.name}}_2 = _sizes[3];
const int64_t _stride_{{src_field.name}}_0 = _strides[0];
const int64_t _stride_{{src_field.name}}_1 = _strides[1];
const int64_t _stride_{{src_field.name}}_2 = _strides[2];
const int64_t _stride_{{src_field.name}}_3 = _strides[3];
const int64_t _stride_{{dst_field.name}}_0 = _strides[0];
const int64_t _stride_{{dst_field.name}}_1 = _strides[1];
const int64_t _stride_{{dst_field.name}}_2 = _strides[2];
const int64_t _stride_{{dst_field.name}}_3 = _strides[3];
const int64_t maxBlockX = (indexingX < cudaBlockSize0) ? indexingX : cudaBlockSize0;
const int64_t maxBlockY = (indexingY < cudaBlockSize1 * (cudaBlockSize0 / maxBlockX)) ? indexingY : cudaBlockSize1 * (cudaBlockSize0 / maxBlockX);
const int64_t maxBlockZ = (indexingZ < cudaBlockSize2 * ((cudaBlockSize0 * cudaBlockSize1) / (maxBlockX * maxBlockY))) ? indexingZ : cudaBlockSize2 * ((cudaBlockSize0 * cudaBlockSize1) / (maxBlockX * maxBlockY));
const int64_t blockX = (maxBlockX < 1024) ? maxBlockX : 1024;
const int64_t blockY = (maxBlockY < 1024) ? maxBlockY : 1024;
const int64_t blockZ = (maxBlockZ < 64) ? maxBlockZ : 64;
const dim3 _block = dim3((unsigned int)blockX, (unsigned int)blockY, (unsigned int)blockZ);
const dim3 _grid = dim3((unsigned int)(indexingX % blockX == 0 ? indexingX / blockX : indexingX / blockX + 1),
(unsigned int)(indexingY % blockY == 0 ? indexingY / blockY : indexingY / blockY + 1),
(unsigned int)(indexingZ % blockZ == 0 ? indexingZ / blockZ : indexingZ / blockZ + 1));
{%- endif %}
{{kernels['localCopyDirection']
| generate_call(plain_kernel_call=True, stream='stream')
| indent(6) }}
{%else%}
WALBERLA_ABORT("Block-wise local communication is not implemented")
{%- endif %}
}
void {{class_name}}::PackKernels::localCopyDirection(
{{- [ "PdfField_T * " + src_field.name, "CellInterval & srcInterval",
"PdfField_T * " + dst_field.name, "CellInterval & dstInterval",
@@ -123,16 +208,42 @@ namespace {{namespace}} {
| type_identifier_list -}}
) const
{
{% if not block_wise -%}
WALBERLA_ASSERT_EQUAL(srcInterval.xSize(), dstInterval.xSize())
WALBERLA_ASSERT_EQUAL(srcInterval.ySize(), dstInterval.ySize())
WALBERLA_ASSERT_EQUAL(srcInterval.zSize(), dstInterval.zSize())
{{kernels['localCopyDirection']
| generate_call(cell_interval={src_field : 'srcInterval', dst_field : 'dstInterval'}, stream='stream')
| generate_call(cell_interval={src_field.name : 'srcInterval', dst_field.name : 'dstInterval'}, stream='stream')
| indent(6) }}
{%else%}
WALBERLA_ABORT("Local communication is only implemented block-wise")
{%- endif %}
}
{% if nonuniform -%}
void {{class_name}}::PackKernels::localCopyRedistribute(
{{- [ "PdfField_T * " + src_field.name, "CellInterval & srcInterval",
"PdfField_T * " + dst_field.name, "CellInterval & dstInterval", kernels['localCopyRedistribute'].kernel_selection_parameters,
["gpuStream_t stream"] if is_gpu else []]
| type_identifier_list -}}
) const
{
{{kernels['localCopyRedistribute'] | generate_call(cell_interval={src_field.name : 'srcInterval', dst_field.name : 'dstInterval'}, stream='stream') | indent(6) }}
}
void {{class_name}}::PackKernels::localPartialCoalescence(
{{- [ "PdfField_T * " + src_field.name, "MaskField_T * " + mask_field.name, "CellInterval & srcInterval",
"PdfField_T * " + dst_field.name, "CellInterval & dstInterval", kernels['localPartialCoalescence'].kernel_selection_parameters,
["gpuStream_t stream"] if is_gpu else []]
| type_identifier_list -}}
) const
{
{{kernels['localPartialCoalescence'] | generate_call(cell_interval={src_field.name : 'srcInterval', mask_field.name : 'srcInterval', dst_field.name : 'dstInterval'}, stream='stream') | indent(6) }}
}
void {{class_name}}::PackKernels::unpackRedistribute(
{{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci",
"unsigned char * inBuffer", kernels['unpackDirection'].kernel_selection_parameters,
......
@@ -30,21 +30,13 @@
#include "stencil/{{stencil_name}}.h"
#include "stencil/Directions.h"
{% if target is equalto 'cpu' -%}
#define FUNC_PREFIX
{%- elif target is equalto 'gpu' -%}
#define FUNC_PREFIX __global__
{% if target is equalto 'gpu' -%}
#include "gpu/GPUWrapper.h"
#include "gpu/GPUField.h"
{%- endif %}
#ifdef __GNUC__
#define RESTRICT __restrict__
#elif _MSC_VER
#define RESTRICT __restrict
#else
#define RESTRICT
#endif
#include <array>
#if defined WALBERLA_CXX_COMPILER_IS_GNU || defined WALBERLA_CXX_COMPILER_IS_CLANG
#pragma GCC diagnostic push
@@ -178,6 +170,13 @@ class {{class_name}}
/** Copies data between two blocks on the same process.
* PDFs whose streaming direction is aligned with dir are copied from the sending interval onto the receiving interval.
* */
void localCopyDirection(
{{- [src_field.dtype.c_name + "** _data_" + src_field.name + "_dp", dst_field.dtype.c_name + "** _data_" + dst_field.name + "_dp",
kernels['localCopyDirection'].kernel_selection_parameters,
["gpuStream_t stream"] if is_gpu else [], "std::array<int64_t, 4>& _sizes", "std::array<int64_t, 4>& _strides"]
| type_identifier_list -}}
) const;
void localCopyDirection(
{{- [ "PdfField_T * " + src_field.name, "CellInterval & srcInterval",
"PdfField_T * " + dst_field.name, "CellInterval & dstInterval",
@@ -186,6 +185,7 @@ class {{class_name}}
| type_identifier_list -}}
) const;
/**
* Returns the number of bytes that will be packed from / unpacked to the cell interval
* when using packDirection / unpackDirection
@@ -209,6 +209,26 @@ class {{class_name}}
{% if nonuniform -%}
/**
* Local variant of unpackRedistribute: copies populations from the coarse block and uniformly redistributes them onto the fine grid.
* */
void localCopyRedistribute(
{{- [ "PdfField_T * " + src_field.name, "CellInterval & srcInterval",
"PdfField_T * " + dst_field.name, "CellInterval & dstInterval", kernels['localCopyRedistribute'].kernel_selection_parameters,
["gpuStream_t stream = nullptr"] if is_gpu else []]
| type_identifier_list -}}
) const;
/**
* Local variant of packPartialCoalescence: coalesces masked fine-grid populations onto the coarse block.
* */
void localPartialCoalescence(
{{- [ "PdfField_T * " + src_field.name, "MaskField_T * " + mask_field.name, "CellInterval & srcInterval",
"PdfField_T * " + dst_field.name, "CellInterval & dstInterval", kernels['localPartialCoalescence'].kernel_selection_parameters,
["gpuStream_t stream = nullptr"] if is_gpu else []]
| type_identifier_list -}}
) const;
/**
* Unpacks and uniformly redistributes populations coming from a coarse block onto the fine grid.
* */
......
@@ -3,7 +3,6 @@ from typing import Callable, List, Dict
from pystencils import Target, Field
from lbmpy.creationfunctions import LbmCollisionRule, LBMConfig, LBMOptimisation
from lbmpy.relaxationrates import get_shear_relaxation_rate
from pystencils_walberla.cmake_integration import CodeGenerationContext
@@ -20,7 +19,7 @@ def generate_lbm_package(ctx: CodeGenerationContext, name: str,
target: Target = Target.CPU,
data_type=None, pdfs_data_type=None,
cpu_openmp=None, cpu_vectorize_info=None,
max_threads=None,
max_threads=None, set_pre_collision_pdfs=True,
**kernel_parameters):
if macroscopic_fields is None:
@@ -35,7 +34,7 @@ def generate_lbm_package(ctx: CodeGenerationContext, name: str,
cpu_openmp=cpu_openmp)
if nonuniform:
omega = get_shear_relaxation_rate(method)
omega = lbm_config.relaxation_rate
refinement_scaling = RefinementScaling()
refinement_scaling.add_standard_relaxation_rate_scaling(omega)
else:
@@ -47,14 +46,15 @@ def generate_lbm_package(ctx: CodeGenerationContext, name: str,
macroscopic_fields=macroscopic_fields,
target=target, data_type=data_type,
cpu_openmp=cpu_openmp, cpu_vectorize_info=cpu_vectorize_info,
max_threads=max_threads,
max_threads=max_threads, set_pre_collision_pdfs=set_pre_collision_pdfs,
**kernel_parameters)
spatial_shape = None
if lbm_optimisation.symbolic_field and lbm_optimisation.symbolic_field.has_fixed_shape:
spatial_shape = lbm_optimisation.symbolic_field.spatial_shape + (lbm_config.stencil.Q, )
generate_boundary_collection(ctx, f'{name}BoundaryCollection', boundary_generators=boundaries,
lb_method=method, field_name='pdfs', spatial_shape=spatial_shape,
streaming_pattern=lbm_config.streaming_pattern,
target=target, layout=lbm_optimisation.field_layout)
if boundaries is not None:
generate_boundary_collection(ctx, f'{name}BoundaryCollection', boundary_generators=boundaries,
lb_method=method, field_name='pdfs', spatial_shape=spatial_shape,
streaming_pattern=lbm_config.streaming_pattern,
target=target, layout=lbm_optimisation.field_layout)
from .boundary import generate_staggered_boundary, generate_staggered_flux_boundary
from .boundary import generate_boundary, generate_staggered_boundary, generate_staggered_flux_boundary
from .cmake_integration import CodeGeneration, ManualCodeGenerationContext
from .function_generator import function_generator
@@ -8,7 +8,7 @@ from .pack_info import (generate_pack_info, generate_pack_info_for_field,
generate_pack_info_from_kernel, generate_mpidtype_info_from_kernel)
from .utility import generate_info_header, get_vectorize_instruction_set, config_from_context
__all__ = ['generate_staggered_boundary', 'generate_staggered_flux_boundary',
__all__ = ['generate_boundary', 'generate_staggered_boundary', 'generate_staggered_flux_boundary',
'CodeGeneration', 'ManualCodeGenerationContext',
'function_generator',
'generate_sweep', 'generate_selective_sweep', 'generate_sweep_collection',
......
@@ -67,8 +67,15 @@ def generate_boundary(generation_context,
if not kernel_creation_function:
kernel_creation_function = create_boundary_kernel
kernel = kernel_creation_function(field, index_field, neighbor_stencil, boundary_object,
target=target, **create_kernel_params)
if boundary_object.calculate_force_on_boundary:
force_vector_type = np.dtype([(f"F_{i}", np.float64) for i in range(dim)], align=True)
force_vector = Field('forceVector', FieldType.INDEXED, force_vector_type, layout=[0],
shape=(TypedSymbol("forceVectorSize", create_type("int32")), 1), strides=(1, 1))
kernel = kernel_creation_function(field, index_field, neighbor_stencil, boundary_object,
target=target, force_vector=force_vector, **create_kernel_params)
else:
kernel = kernel_creation_function(field, index_field, neighbor_stencil, boundary_object,
target=target, **create_kernel_params)
if isinstance(kernel, KernelFunction):
kernel.function_name = f"boundary_{boundary_object.name}"
@@ -103,7 +110,8 @@
'additional_data_handler': additional_data_handler,
'dtype': "double" if is_float else "float",
'layout': layout,
'index_shape': index_shape
'index_shape': index_shape,
'calculate_force': boundary_object.calculate_force_on_boundary
}
env = Environment(loader=PackageLoader('pystencils_walberla'), undefined=StrictUndefined)
......
@@ -9,7 +9,7 @@ import sympy as sp
from pystencils import Target, Backend
from pystencils.backends.cbackend import generate_c
from pystencils.typing import TypedSymbol, get_base_type
from pystencils.typing import TypedSymbol, get_base_type, PointerType
from pystencils.field import FieldType
from pystencils.sympyextensions import prod
@@ -226,17 +226,23 @@ def generate_block_data_to_field_extraction(ctx, kernel_info, parameters_to_igno
def generate_refs_for_kernel_parameters(kernel_info, prefix, parameters_to_ignore=(), ignore_fields=False,
parameter_registration=None):
parameter_registration=None, level_known=False):
pointer_symbols = {p.symbol.name for p in kernel_info.parameters
if not p.is_field_parameter and isinstance(p.symbol.dtype, PointerType)}
symbols = {p.field_name for p in kernel_info.parameters if p.is_field_pointer and not ignore_fields}
symbols.update(p.symbol.name for p in kernel_info.parameters if not p.is_field_parameter)
symbols.difference_update(parameters_to_ignore)
if ignore_fields:
symbols.difference_update(pointer_symbols)
type_information = {p.symbol.name: p.symbol.dtype for p in kernel_info.parameters if not p.is_field_parameter}
result = []
registered_parameters = [] if not parameter_registration else parameter_registration.scaling_info
for s in symbols:
if s in registered_parameters:
dtype = type_information[s].c_name
result.append("const uint_t level = block->getBlockStorage().getLevel(*block);")
if not level_known:
result.append("const uint_t level = block->getBlockStorage().getLevel(*block);")
result.append(f"{dtype} & {s} = {s}Vector[level];")
else:
result.append(f"auto & {s} = {prefix}{s}_;")
@@ -245,7 +251,7 @@ def generate_refs_for_kernel_parameters(kernel_info, prefix, parameters_to_ignor
@jinja2_context_decorator
def generate_call(ctx, kernel, ghost_layers_to_include=0, cell_interval=None, stream='0',
spatial_shape_symbols=()):
spatial_shape_symbols=(), plain_kernel_call=False):
"""Generates the function call to a pystencils kernel
Args:
@@ -265,6 +271,20 @@ def generate_call(ctx, kernel, ghost_layers_to_include=0, cell_interval=None, st
parameters - however in special cases like boundary conditions a manual specification
may be necessary.
"""
ast_params = kernel.parameters
if len(spatial_shape_symbols) == 0:
for param in ast_params:
if param.is_field_parameter and FieldType.is_indexed(param.fields[0]):
continue
if param.is_field_pointer:
field = param.fields[0]
if field.has_fixed_shape:
spatial_shape_symbols = field.spatial_shape
if plain_kernel_call:
return kernel.generate_kernel_invocation_code(plain_kernel_call=True, stream=stream,
spatial_shape_symbols=spatial_shape_symbols)
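# plain_kernel_call bypasses the cell-interval and ghost-layer handling entirely; the
# caller is then responsible for having all kernel arguments (and, for GPU targets,
# _grid and _block) defined in the surrounding scope.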
assert isinstance(ghost_layers_to_include, str) or ghost_layers_to_include >= 0
ast_params = kernel.parameters
vec_info = ctx.get('cpu_vectorize_info', None)
@@ -296,7 +316,7 @@ def generate_call(ctx, kernel, ghost_layers_to_include=0, cell_interval=None, st
if isinstance(cell_interval, str):
return cell_interval
elif isinstance(cell_interval, dict):
return cell_interval[field_object]
return cell_interval[field_object.name]
else:
return None
@@ -591,7 +611,8 @@ def generate_members(ctx, kernel_infos, parameters_to_ignore=None, only_fields=F
original_field_name = field_name[:-len('_tmp')]
f_size = get_field_fsize(f)
field_type = make_field_type(get_base_type(f.dtype), f_size, is_gpu)
result.append(temporary_fieldMemberTemplate.format(type=field_type, original_field_name=original_field_name))
result.append(temporary_fieldMemberTemplate.format(type=field_type,
original_field_name=original_field_name))
for kernel_info in kernel_infos:
if hasattr(kernel_info, 'varying_parameters'):
@@ -734,6 +755,16 @@ def generate_constructor(ctx, kernel_infos, parameter_registration):
return "\n".join(result)
@jinja2_context_decorator
def generate_field_strides(ctx, kernel_info):
result = []
for param in kernel_info.parameters:
if param.is_field_stride:
type_str = param.symbol.dtype.c_name
result.append(f"const {type_str} {param.symbol.name} = {param.symbol.name[1:]};")
return "\n".join(result)
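# Sketch of the emitted line (field name hypothetical): a stride parameter named
# `_stride_pdfs_0` becomes
#     const int64_t _stride_pdfs_0 = stride_pdfs_0;
# (dtype taken from the kernel parameter) — the leading underscore is stripped so the
# value is read from the corresponding member variable.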
def generate_list_of_expressions(expressions, prepend=''):
if len(expressions) == 0:
return ''
@@ -806,3 +837,4 @@ def add_pystencils_filters_to_jinja_env(jinja_env):
jinja_env.filters['identifier_list'] = identifier_list
jinja_env.filters['list_of_expressions'] = generate_list_of_expressions
jinja_env.filters['field_type'] = field_type
jinja_env.filters['generate_field_strides'] = generate_field_strides
@@ -32,13 +32,20 @@ class KernelInfo:
all_headers = [list(get_headers(self.ast))]
return reduce(merge_sorted_lists, all_headers)
def generate_kernel_invocation_code(self, **kwargs):
def generate_kernel_invocation_code(self, plain_kernel_call=False, **kwargs):
ast = self.ast
ast_params = self.parameters
fnc_name = ast.function_name
is_cpu = self.ast.target == Target.CPU
call_parameters = ", ".join([p.symbol.name for p in ast_params])
if plain_kernel_call:
if is_cpu:
return f"internal_{fnc_name}::{fnc_name}({call_parameters});"
else:
stream = kwargs.get('stream', '0')
return f"internal_{fnc_name}::{fnc_name}<<<_grid, _block, 0, {stream}>>>({call_parameters});"
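# For GPU targets the plain call is emitted verbatim, e.g. (names illustrative):
#     internal_kernel::kernel<<<_grid, _block, 0, stream>>>(...);
# which relies on _grid and _block being declared by the calling code.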
if not is_cpu:
stream = kwargs.get('stream', '0')
spatial_shape_symbols = kwargs.get('spatial_shape_symbols', ())
......
@@ -157,6 +157,21 @@ class SwitchNode(AbstractKernelSelectionNode):
return switch_code
class AbortNode(AbstractKernelSelectionNode):
def __init__(self, message):
self.message = message
@property
def selection_parameters(self):
return set()
def collect_kernel_calls(self):
return set()
def get_code(self, **kwargs):
return f'WALBERLA_ABORT("{self.message}")'
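# AbortNode lets a kernel selection tree terminate a branch with a runtime error
# instead of a kernel call, e.g. as the fallback case of a SwitchNode.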
class KernelCallNode(AbstractKernelSelectionNode):
def __init__(self, ast):
self.ast = ast
@@ -169,13 +184,20 @@ class KernelCallNode(AbstractKernelSelectionNode):
def collect_kernel_calls(self):
return {self}
def get_code(self, **kwargs):
def get_code(self, plain_kernel_call=False, **kwargs):
ast = self.ast
ast_params = self.parameters
fnc_name = ast.function_name
is_cpu = self.ast.target == Target.CPU
call_parameters = ", ".join([p.symbol.name for p in ast_params])
if plain_kernel_call:
if is_cpu:
return f"internal_{fnc_name}::{fnc_name}({call_parameters});"
else:
stream = kwargs.get('stream', '0')
return f"internal_{fnc_name}::{fnc_name}<<<_grid, _block, 0, {stream}>>>({call_parameters});"
if not is_cpu:
stream = kwargs.get('stream', '0')
spatial_shape_symbols = kwargs.get('spatial_shape_symbols', ())
......
@@ -82,10 +82,23 @@ void {{class_name}}::run_impl(
uint8_t * _data_indexVector = reinterpret_cast<uint8_t*>(pointer);
{{kernel|generate_block_data_to_field_extraction(['indexVector', 'indexVectorSize'])|indent(4)}}
{% if calculate_force -%}
auto * forceVector = block->getData<ForceVector>(forceVectorID);
WALBERLA_ASSERT_EQUAL(indexVectorSize, int32_c( forceVector->forceVector().size() ))
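// When force computation is enabled, the kernel receives the per-link force buffer in
// addition to the boundary index vector; the two must have the same length, one entry
// per boundary link.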
{% if target == 'gpu' -%}
auto forcePointer = forceVector->pointerGpu();
int32_t forceVectorSize = int32_c( forceVector->forceVector().size() );
{% else %}
auto forcePointer = forceVector->pointerCpu();
{% endif %}
uint8_t * _data_forceVector = reinterpret_cast<uint8_t*>(forcePointer);
{%- endif %}
{{kernel|generate_block_data_to_field_extraction(['indexVector', 'indexVectorSize', 'forceVector', 'forceVectorSize'])|indent(4)}}
{{kernel|generate_timestep_advancements|indent(4)}}
{{kernel|generate_refs_for_kernel_parameters(prefix='', parameters_to_ignore=['indexVectorSize'], ignore_fields=True)|indent(4) }}
{{kernel|generate_call(spatial_shape_symbols=['indexVectorSize'], stream='stream')|indent(4)}}
{{kernel|generate_refs_for_kernel_parameters(prefix='', parameters_to_ignore=['indexVectorSize', 'forceVectorSize'], ignore_fields=True)|indent(4) }}
{{kernel|generate_call(spatial_shape_symbols=['indexVectorSize', 'forceVectorSize'], stream='stream')|indent(4)}}
}
void {{class_name}}::run(
......