diff --git a/apps/benchmarks/NonUniformGridGPU/GridGeneration.h b/apps/benchmarks/NonUniformGridGPU/GridGeneration.h
index 618bee66e5ac50ea0e97069e15ab3d53ae7db873..a37f4be4b5fe7440f08116ab25ab30dd3e0b8111 100644
--- a/apps/benchmarks/NonUniformGridGPU/GridGeneration.h
+++ b/apps/benchmarks/NonUniformGridGPU/GridGeneration.h
@@ -40,7 +40,8 @@ using Stencil_T              = StorageSpecification_T::Stencil;
 using namespace walberla;
 
 void createSetupBlockForest(SetupBlockForest& setupBfs,
-                            const Config::BlockHandle& domainSetup, const Config::BlockHandle& blockForestSetup)
+                            const Config::BlockHandle& domainSetup, const Config::BlockHandle& blockForestSetup,
+                            const bool useMPIManager = false)
 {
    WALBERLA_LOG_INFO_ON_ROOT("Generating SetupBlockForest...")
 
@@ -50,11 +51,14 @@ void createSetupBlockForest(SetupBlockForest& setupBfs,
    Vector3<bool> periodic = domainSetup.getParameter<Vector3<bool> >("periodic");
 
    const uint_t refinementDepth = blockForestSetup.getParameter< uint_t >("refinementDepth", uint_c(1));
-   const uint_t numProcesses = blockForestSetup.getParameter< uint_t >( "numProcesses");
+   uint_t numProcesses = blockForestSetup.getParameter< uint_t >( "numProcesses");
    const std::string blockForestFilestem = blockForestSetup.getParameter< std::string > ("blockForestFilestem", "blockforest");
    const bool writeVtk = blockForestSetup.getParameter< bool >("writeVtk", false);
    const bool outputStatistics = blockForestSetup.getParameter< bool >("outputStatistics", false);
 
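+   // if requested, determine the number of target processes from the active MPI communicator
+   // instead of the config file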
+   if(useMPIManager)
+      numProcesses = uint_c(mpi::MPIManager::instance()->numProcesses());
+
    const LDC ldc(refinementDepth);
 
    auto refSelection = ldc.refinementSelector();
@@ -64,6 +68,9 @@ void createSetupBlockForest(SetupBlockForest& setupBfs,
    setupBfs.init(domain, rootBlocks[0], rootBlocks[1], rootBlocks[2], periodic[0], periodic[1], periodic[2]);
    setupBfs.balanceLoad(blockforest::StaticLevelwiseCurveBalanceWeighted(), numProcesses);
 
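+   // with more than one MPI process the setup forest is only used in memory,
+   // so writing it to file and logging its statistics is skipped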
+   if(mpi::MPIManager::instance()->numProcesses() > 1)
+      return;
+
    {
       std::ostringstream oss;
       oss << blockForestFilestem << ".bfs";
@@ -101,4 +108,36 @@ void createSetupBlockForest(SetupBlockForest& setupBfs,
 
       WALBERLA_LOG_INFO_ON_ROOT("=================================================================================");
    }
+}
+
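+// Creates the distributed BlockForest. With more than one MPI process, it is loaded from a
+// previously written .bfs file or, as a fallback, generated on the fly. With a single process,
+// the setup forest is generated directly and written to file.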
+void createBlockForest(shared_ptr< BlockForest >& bfs,
+                       const Config::BlockHandle& domainSetup, const Config::BlockHandle& blockForestSetup)
+{
+   if (mpi::MPIManager::instance()->numProcesses() > 1)
+   {
+      const std::string blockForestFilestem =
+         blockForestSetup.getParameter< std::string >("blockForestFilestem", "blockforest");
+      // Load structured block forest from file
+      std::ostringstream oss;
+      oss << blockForestFilestem << ".bfs";
+      const std::string setupBlockForestFilepath = oss.str();
+      if(!std::filesystem::exists(setupBlockForestFilepath.c_str()))
+      {
+         WALBERLA_LOG_WARNING_ON_ROOT("BlockForest was not created beforehand and is therefore created on the fly. For large simulation runs this can become a severe bottleneck!")
+         SetupBlockForest setupBfs;
+         createSetupBlockForest(setupBfs, domainSetup, blockForestSetup, true);
+         bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()), setupBfs);
+      }
+      else
+      {
+         bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()),
+                                               setupBlockForestFilepath.c_str(), false);
+      }
+   }
+   else
+   {
+      SetupBlockForest setupBfs;
+      createSetupBlockForest(setupBfs, domainSetup, blockForestSetup);
+      bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()), setupBfs);
+   }
 }
\ No newline at end of file
diff --git a/apps/benchmarks/NonUniformGridGPU/LdcSetup.h b/apps/benchmarks/NonUniformGridGPU/LdcSetup.h
index 6fe5a508965e61e6bb22cc9cef82a4520096b4ee..2548dd62ef84fa5b397fa0b98525117f1ac4376f 100644
--- a/apps/benchmarks/NonUniformGridGPU/LdcSetup.h
+++ b/apps/benchmarks/NonUniformGridGPU/LdcSetup.h
@@ -46,20 +46,32 @@ class LDCRefinement
 
    void operator()(SetupBlockForest& forest) const
    {
-      const AABB & domain = forest.getDomain();
-
-      const AABB leftCorner( 0, domain.yMax() -1, 0, 1, domain.yMax() , domain.zMax() );
-      const AABB rightCorner( domain.xMax() - 1, domain.yMax() -1, 0, domain.xMax(), domain.yMax() , domain.zMax() );
 
       for(auto & block : forest)
       {
-         auto & aabb = block.getAABB();
-         if( leftCorner.intersects( aabb ) || rightCorner.intersects( aabb ) )
+         if( forest.atDomainYMaxBorder(block) )
          {
             if( block.getLevel() < refinementDepth_)
                block.setMarker( true );
          }
       }
    }
 };
 
diff --git a/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp b/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp
index aed74d7523cfdc5f630dc5e8ae7e30c07973324f..f6d36fff4ade177a44875777659710ef1f58e09f 100644
--- a/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp
+++ b/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp
@@ -93,6 +93,7 @@ int main(int argc, char** argv)
       logging::configureLogging(config);
       auto domainSetup      = config->getOneBlock("DomainSetup");
       auto blockForestSetup = config->getOneBlock("SetupBlockForest");
+      const bool writeSetupForestAndReturn = blockForestSetup.getParameter< bool >("writeSetupForestAndReturn", true);
 
       Vector3< uint_t > cellsPerBlock = domainSetup.getParameter< Vector3< uint_t > >("cellsPerBlock");
       // Reading parameters
@@ -103,33 +104,12 @@ int main(int argc, char** argv)
       const bool gpuEnabledMPI    = parameters.getParameter< bool >("gpuEnabledMPI", false);
 
       shared_ptr< BlockForest > bfs;
+      createBlockForest(bfs, domainSetup, blockForestSetup);
 
-      if (mpi::MPIManager::instance()->numProcesses() > 1)
+      if (writeSetupForestAndReturn && mpi::MPIManager::instance()->numProcesses() == 1)
       {
-         const std::string blockForestFilestem =
-            blockForestSetup.getParameter< std::string >("blockForestFilestem", "blockforest");
-         // Load structured block forest from file
-         std::ostringstream oss;
-         oss << blockForestFilestem << ".bfs";
-         const std::string setupBlockForestFilepath = oss.str();
-
-         bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()),
-                                               setupBlockForestFilepath.c_str(), false);
-      }
-      else
-      {
-         SetupBlockForest setupBfs;
-         createSetupBlockForest(setupBfs, domainSetup, blockForestSetup);
-         const bool writeSetupForestAndReturn =
-            blockForestSetup.getParameter< bool >("writeSetupForestAndReturn", true);
-
-         if (writeSetupForestAndReturn)
-         {
-            WALBERLA_LOG_INFO_ON_ROOT("BlockForest has been created and writen to file. Returning program")
-            return EXIT_SUCCESS;
-         }
-
-         bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()), setupBfs);
+         WALBERLA_LOG_INFO_ON_ROOT("BlockForest has been created and written to file. Exiting program")
+         return EXIT_SUCCESS;
       }
 
       auto blocks =
@@ -211,7 +191,7 @@ int main(int argc, char** argv)
          LBMMeshRefinement(blocks, pdfFieldGpuID, sweepCollection, boundaryCollection, communication, packInfo);
       SweepTimeloop timeLoop(blocks->getBlockStorage(), timesteps);
 
-      // LBMMeshRefinement.test(5);
+      LBMMeshRefinement.test(1);
       // return EXIT_SUCCESS;
 
       LBMMeshRefinement.addRefinementToTimeLoop(timeLoop);
diff --git a/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.py b/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.py
index 89d5e448c3dedd07ab09307961efe576dfb15942..5c74463c90cae3e0b2dc5c26d0014ddd0ec1012f 100644
--- a/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.py
+++ b/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.py
@@ -30,13 +30,13 @@ const char * infoCollisionSetup = "{collision_setup}";
 const bool infoCseGlobal = {cse_global};
 const bool infoCsePdfs = {cse_pdfs};
 """
- 
+
 with CodeGeneration() as ctx:
     field_type = "float64" if ctx.double_accuracy else "float32"
 
-    streaming_pattern = 'aa'
+    streaming_pattern = 'esopull'
     timesteps = get_timesteps(streaming_pattern)
-    stencil = LBStencil(Stencil.D3Q19)
+    stencil = LBStencil(Stencil.D3Q27)
 
     assert stencil.D == 3, "This application supports only three-dimensional stencils"
     pdfs, pdfs_tmp = ps.fields(f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {field_type}[3D]", layout='fzyx')
diff --git a/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py
index d17a69f56f0374eb7a63106cfa9ba8b11a013e68..3ecd2b8b08c14069d40cb3d4042fb69f7c96f227 100644
--- a/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py
+++ b/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py
@@ -22,7 +22,7 @@ class Scenario:
                  timesteps=101,
                  gpu_enabled_mpi=False,
                  vtk_write_frequency=0,
-                 logger_frequency=0,
+                 logger_frequency=30,
                  blockforest_filestem="blockforest",
                  write_setup_vtk=True):
 
@@ -66,7 +66,7 @@ class Scenario:
                 'timesteps': self.timesteps,
                 'remainingTimeLoggerFrequency': self.logger_frequency,
                 'vtkWriteFrequency': self.vtk_write_frequency,
-                'useVTKAMRWriter': True,
+                'useVTKAMRWriter': False,
                 'oneFilePerProcess': False,
                 'gpuEnabledMPI': self.gpu_enabled_mpi,
             },
@@ -120,8 +120,8 @@ def validation_run():
     """Run with full periodic shear flow or boundary scenario (ldc) to check if the code works"""
     wlb.log_info_on_root("Validation run")
 
-    domain_size = (96, 96, 96)
-    cells_per_block = (32, 32, 32)
+    domain_size = (48, 48, 48)
+    cells_per_block = (16, 16, 16)
 
     root_blocks = tuple([d // c for d, c in zip(domain_size, cells_per_block)])
 
@@ -129,6 +129,8 @@ def validation_run():
     scenario = Scenario(domain_size=domain_size,
                         root_blocks=root_blocks,
                         cells_per_block=cells_per_block,
+                        timesteps=10001,
+                        vtk_write_frequency=2500,
                         refinement_depth=1,
                         gpu_enabled_mpi=False)
     scenarios.add(scenario)
@@ -149,7 +151,7 @@ def scaling(num_proc, gpu_enabled_mpi=False, uniform=True):
         factor = int(num_proc // 4)
         name = "nonuniform"
 
-    cells_per_block = (184, 184, 184)
+    cells_per_block = (136, 136, 136)
     domain_size = (cells_per_block[0] * 3, cells_per_block[1] * 3, cells_per_block[2] * factor)
 
     root_blocks = tuple([d // c for d, c in zip(domain_size, cells_per_block)])
@@ -166,5 +168,5 @@ def scaling(num_proc, gpu_enabled_mpi=False, uniform=True):
     scenarios.add(scenario)
 
 
-# validation_run()
-scaling(4, True, False)
+validation_run()
+# scaling(4, False, False)
diff --git a/python/lbmpy_walberla/packing_kernels.py b/python/lbmpy_walberla/packing_kernels.py
index 8a8728031cd4eb56ba4366e25ef6fd85b1f2e1b5..c977e3b30f7862a5133399602ef34ae1931547a4 100644
--- a/python/lbmpy_walberla/packing_kernels.py
+++ b/python/lbmpy_walberla/packing_kernels.py
@@ -7,6 +7,8 @@ import sympy as sp
 from jinja2 import Environment, PackageLoader, StrictUndefined
 
 from pystencils import Assignment, CreateKernelConfig, create_kernel, Field, FieldType, fields, Target
+from pystencils.astnodes import LoopOverCoordinate
+from pystencils.integer_functions import int_div
 from pystencils.stencil import offset_to_direction_string
 from pystencils.typing import TypedSymbol
 from pystencils.stencil import inverse_direction
@@ -18,7 +20,7 @@ from lbmpy.enums import Stencil
 from lbmpy.stencils import LBStencil
 
 from pystencils_walberla.cmake_integration import CodeGenerationContext
-from pystencils_walberla.kernel_selection import KernelFamily, KernelCallNode, SwitchNode
+from pystencils_walberla.kernel_selection import KernelFamily, KernelCallNode, SwitchNode, AbortNode
 from pystencils_walberla.jinja_filters import add_pystencils_filters_to_jinja_env
 from pystencils_walberla.utility import config_from_context
 
@@ -115,6 +117,8 @@ class PackingKernelsCodegen:
 
     def create_nonuniform_kernel_families(self, kernels_dict=None):
         kernels = dict() if kernels_dict is None else kernels_dict
+        kernels['localCopyRedistribute'] = self.get_local_copy_redistribute_kernel_family()
+        kernels['localPartialCoalescence'] = self.get_local_copy_partial_coalescence_kernel_family()
         kernels['unpackRedistribute'] = self.get_unpack_redistribute_kernel_family()
         kernels['packPartialCoalescence'] = self.get_pack_partial_coalescence_kernel_family()
         kernels['zeroCoalescenceRegion'] = self.get_zero_coalescence_region_kernel_family()
@@ -283,15 +287,59 @@ class PackingKernelsCodegen:
         return create_kernel(assignments, config=config)
 
     def get_unpack_redistribute_kernel_family(self):
-        return self._construct_directionwise_kernel_family(self.get_unpack_redistribute_ast)
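+        # with in-place streaming, redistribution always writes the fine field in its odd state,
+        # so the EVEN variant of this kernel family must never be selected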
+        return self._construct_directionwise_kernel_family(self.get_unpack_redistribute_ast,
+                                                           exclude_time_step=Timestep.EVEN)
 
     def get_local_copy_redistribute_ast(self, comm_dir, timestep):
-        #   TODO
-        raise NotImplementedError()
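+        # Builds a kernel that copies populations from a coarse block directly into the ghost layers
+        # of a fine block: the loop strides the fine interval by two, writes each 2^d block of fine
+        # cells and reads the corresponding coarse cell via an absolute access at half the loop counter.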
+        assert not all(d == 0 for d in comm_dir)
+        ctr = [LoopOverCoordinate.get_loop_counter_symbol(i) for i in range(self.stencil.D)]
+
+        dir_string = offset_to_direction_string(comm_dir)
+        streaming_dirs = self.get_streaming_dirs(inverse_direction(comm_dir))
+        dir_indices = sorted(self.stencil.index(d) for d in streaming_dirs)
+        if len(dir_indices) == 0:
+            return None
+
+        # for inplace streaming the dst (fine grid) must always be on odd state
+        dst_timestep = Timestep.ODD if self.inplace else Timestep.BOTH
+
+        _, dst = self._stream_out_accs(dst_timestep)
+        src, _ = self._stream_out_accs(timestep)
+
+        src_abs = self.src_field.new_field_with_different_name(self.src_field.name)
+        src_abs.field_type = FieldType.CUSTOM
+
+        orthos = self.orthogonal_principals(comm_dir)
+        sub_dirs = self.contained_principals(comm_dir)
+        orthogonal_combinations = self.linear_combinations(orthos)
+        subdir_combinations = self.linear_combinations_nozero(sub_dirs)
+        second_gl_dirs = [o + s for o, s in product(orthogonal_combinations, subdir_combinations)]
+        negative_dir_correction = np.array([(1 if d == -1 else 0) for d in comm_dir])
+        assignments = []
+        for offset in orthogonal_combinations:
+            o = offset + negative_dir_correction
+            for d in range(self.values_per_cell):
+                field_acc = dst[d].get_shifted(*o)
+                src_access = [int_div(ctr[i], 2) + off for i, off in enumerate(src[d].offsets)]
+                assignments.append(Assignment(field_acc, src_abs.absolute_access(src_access, (d, ))))
+
+        for offset in second_gl_dirs:
+            o = offset + negative_dir_correction
+            for d in dir_indices:
+                field_acc = dst[d].get_shifted(*o)
+                src_access = [int_div(ctr[i], 2) + off for i, off in enumerate(src[d].offsets)]
+                assignments.append(Assignment(field_acc, src_abs.absolute_access(src_access, (d, ))))
+
+        function_name = f'localCopyRedistribute_{dir_string}' + timestep_suffix(timestep)
+        iteration_slice = tuple(slice(None, None, 2) for _ in range(self.dim))
+        config = CreateKernelConfig(function_name=function_name, iteration_slice=iteration_slice,
+                                    data_type=self.data_type, ghost_layers=0, allow_double_writes=True,
+                                    cpu_openmp=self.config.cpu_openmp, target=self.config.target)
+
+        return create_kernel(assignments, config=config)
 
     def get_local_copy_redistribute_kernel_family(self):
-        #   TODO
-        raise NotImplementedError()
+        return self._construct_directionwise_kernel_family(self.get_local_copy_redistribute_ast)
 
     # --------------------------- Pack / Unpack / LocalCopy Fine to Coarse ---------------------------------------------
 
@@ -322,7 +370,8 @@ class PackingKernelsCodegen:
         return ast
 
     def get_pack_partial_coalescence_kernel_family(self):
-        return self._construct_directionwise_kernel_family(self.get_pack_partial_coalescence_ast)
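+        # with in-place streaming, coalescence packing always reads the fine field in its even state,
+        # so the ODD variant of this kernel family must never be selected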
+        return self._construct_directionwise_kernel_family(self.get_pack_partial_coalescence_ast,
+                                                           exclude_time_step=Timestep.ODD)
 
     def get_unpack_coalescence_ast(self, comm_dir, timestep):
         config = replace(self.config, ghost_layers=0)
@@ -370,12 +419,53 @@ class PackingKernelsCodegen:
     def get_zero_coalescence_region_kernel_family(self):
         return self._construct_directionwise_kernel_family(self.get_zero_coalescence_region_ast)
 
-    #   TODO
     def get_local_copy_partial_coalescence_ast(self, comm_dir, timestep):
-        raise NotImplementedError()
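+        # Builds a kernel that coalesces populations from a fine block directly onto a coarse block:
+        # the populations of each 2^d block of fine cells, filtered by the coalescence mask, are summed,
+        # scaled by 1/2^d and added onto the corresponding coarse cell via an absolute access.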
+        assert not all(d == 0 for d in comm_dir)
+        ctr = [LoopOverCoordinate.get_loop_counter_symbol(i) for i in range(self.stencil.D)]
+
+        dir_string = offset_to_direction_string(comm_dir)
+        streaming_dirs = self.get_streaming_dirs(comm_dir)
+        dir_indices = sorted(self.stencil.index(d) for d in streaming_dirs)
+
+        if len(dir_indices) == 0:
+            return None
+        buffer = sp.symbols(f"b_:{self.values_per_cell}")
+
+        # for inplace streaming the src (fine grid) must always be on even state
+        src_timestep = Timestep.ODD if self.inplace else Timestep.BOTH
+
+        src, _ = self._stream_in_accs(src_timestep)
+        _, dst = self._stream_in_accs(timestep.next())
+        mask = self.mask_field
+
+        dst_abs = self.dst_field.new_field_with_different_name(self.dst_field.name)
+        dst_abs.field_type = FieldType.CUSTOM
+
+        coalescence_factor = sp.Rational(1, 2 ** self.dim)
+
+        offsets = list(product(*((0, 1) for _ in comm_dir)))
+        assignments = []
+        for i, d in enumerate(dir_indices):
+            acc = 0
+            for o in offsets:
+                acc += flag_cond(d, mask[o], src[d].get_shifted(*o))
+            assignments.append(Assignment(buffer[i], acc))
+
+        for i, d in enumerate(dir_indices):
+            index = dst[d].index
+            dst_access = [int_div(ctr[j], 2) + off for j, off in enumerate(dst[d].offsets)]
+            assignments.append(Assignment(dst_abs.absolute_access(dst_access, index),
+                                          dst_abs.absolute_access(dst_access, index) + coalescence_factor * buffer[i]))
+
+        iteration_slice = tuple(slice(None, None, 2) for _ in range(self.dim))
+        config = replace(self.config, iteration_slice=iteration_slice, ghost_layers=0)
+
+        ast = create_kernel(assignments, config=config)
+        ast.function_name = f'localPartialCoalescence_{dir_string}' + timestep_suffix(timestep)
+        return ast
 
     def get_local_copy_partial_coalescence_kernel_family(self):
-        raise NotImplementedError()
+        return self._construct_directionwise_kernel_family(self.get_local_copy_partial_coalescence_ast)
 
     # ------------------------------------------ Utility ---------------------------------------------------------------
 
@@ -425,7 +515,7 @@ class PackingKernelsCodegen:
 
     # --------------------------- Private Members ----------------------------------------------------------------------
 
-    def _construct_directionwise_kernel_family(self, create_ast_callback):
+    def _construct_directionwise_kernel_family(self, create_ast_callback, exclude_time_step=None):
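+        # exclude_time_step: for in-place streaming patterns, some kernels are only valid on one timestep
+        # parity; the excluded timestep's subtree is replaced by an AbortNode instead of a direction switch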
         subtrees = []
         direction_symbol = TypedSymbol('dir', dtype='stencil::Direction')
         for t in get_timesteps(self.streaming_pattern):
@@ -439,7 +529,10 @@ class PackingKernelsCodegen:
                     continue
                 kernel_call = KernelCallNode(ast)
                 cases_dict[f"stencil::{dir_string}"] = kernel_call
-            subtrees.append(SwitchNode(direction_symbol, cases_dict))
+            if exclude_time_step is not None and t == exclude_time_step:
+                subtrees.append(AbortNode("This function cannot be called! Please contact the waLBerla team"))
+            else:
+                subtrees.append(SwitchNode(direction_symbol, cases_dict))
 
         if not self.inplace:
             tree = subtrees[0]
diff --git a/python/lbmpy_walberla/templates/LbmStorageSpecification.tmpl.cpp b/python/lbmpy_walberla/templates/LbmStorageSpecification.tmpl.cpp
index 92106cddf0248c93707e6881b600b6a702d27985..caafd77c2e9506784e0451333d86b0c322420306 100644
--- a/python/lbmpy_walberla/templates/LbmStorageSpecification.tmpl.cpp
+++ b/python/lbmpy_walberla/templates/LbmStorageSpecification.tmpl.cpp
@@ -42,6 +42,8 @@ namespace {{namespace}} {
    {{ kernels['localCopyDirection'] | generate_definitions }}
 
    {% if nonuniform -%}
+   {{ kernels['localCopyRedistribute']    | generate_definitions }}
+   {{ kernels['localPartialCoalescence']    | generate_definitions }}
    {{ kernels['unpackRedistribute']    | generate_definitions }}
    {{ kernels['packPartialCoalescence']    | generate_definitions }}
    {{ kernels['zeroCoalescenceRegion']    | generate_definitions }}
@@ -89,7 +91,7 @@ namespace {{namespace}} {
       WALBERLA_ASSERT_EQUAL(srcInterval.zSize(), dstInterval.zSize())
 
       {{kernels['localCopyAll']
-               | generate_call(cell_interval={src_field : 'srcInterval', dst_field : 'dstInterval'}, stream='stream')
+               | generate_call(cell_interval={src_field.name : 'srcInterval', dst_field.name : 'dstInterval'}, stream='stream')
                | indent(6) }}
    }
 
@@ -128,11 +130,31 @@ namespace {{namespace}} {
       WALBERLA_ASSERT_EQUAL(srcInterval.zSize(), dstInterval.zSize())
 
       {{kernels['localCopyDirection']
-               | generate_call(cell_interval={src_field : 'srcInterval', dst_field : 'dstInterval'}, stream='stream')
+               | generate_call(cell_interval={src_field.name : 'srcInterval', dst_field.name : 'dstInterval'}, stream='stream')
                | indent(6) }}
    }
 
    {% if nonuniform -%}
+   void {{class_name}}::PackKernels::localCopyRedistribute(
+      {{- [  "PdfField_T * " + src_field.name, "CellInterval & srcInterval",
+             "PdfField_T * " + dst_field.name, "CellInterval & dstInterval", kernels['localCopyRedistribute'].kernel_selection_parameters,
+             ["gpuStream_t stream"] if is_gpu else []]
+          | type_identifier_list -}}
+   ) const
+   {
+      {{kernels['localCopyRedistribute'] | generate_call(cell_interval={src_field.name : 'srcInterval', dst_field.name : 'dstInterval'}, stream='stream') | indent(6) }}
+   }
+
+   void {{class_name}}::PackKernels::localPartialCoalescence(
+      {{- [  "PdfField_T * " + src_field.name, "MaskField_T * " + mask_field.name, "CellInterval & srcInterval",
+             "PdfField_T * " + dst_field.name, "CellInterval & dstInterval", kernels['localPartialCoalescence'].kernel_selection_parameters,
+             ["gpuStream_t stream"] if is_gpu else []]
+          | type_identifier_list -}}
+   ) const
+   {
+      {{kernels['localPartialCoalescence'] | generate_call(cell_interval={src_field.name : 'srcInterval', mask_field.name : 'srcInterval', dst_field.name : 'dstInterval'}, stream='stream') | indent(6) }}
+   }
+
    void {{class_name}}::PackKernels::unpackRedistribute(
       {{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci",
              "unsigned char * inBuffer", kernels['unpackDirection'].kernel_selection_parameters,
diff --git a/python/lbmpy_walberla/templates/LbmStorageSpecification.tmpl.h b/python/lbmpy_walberla/templates/LbmStorageSpecification.tmpl.h
index ad40a55eef10c08deb66e6266753a828be9f3452..5ca6f66595d296a071eb2e71f0f603c71f6ac640 100644
--- a/python/lbmpy_walberla/templates/LbmStorageSpecification.tmpl.h
+++ b/python/lbmpy_walberla/templates/LbmStorageSpecification.tmpl.h
@@ -209,6 +209,26 @@ class {{class_name}}
 
       {% if nonuniform -%}
 
+      /**
+       * Copies populations from a coarse block and uniformly redistributes them onto the fine grid,
+       * without an intermediate buffer.
+       * */
+      void localCopyRedistribute(
+         {{- [  "PdfField_T * " + src_field.name, "CellInterval & srcInterval",
+                "PdfField_T * " + dst_field.name, "CellInterval & dstInterval", kernels['localCopyRedistribute'].kernel_selection_parameters,
+                ["gpuStream_t stream = nullptr"] if is_gpu else []]
+             | type_identifier_list -}}
+      ) const;
+
+      /**
+       * Partially coalesces populations from a fine block onto a coarse block,
+       * without an intermediate buffer.
+       * */
+      void localPartialCoalescence(
+         {{- [  "PdfField_T * " + src_field.name, "MaskField_T * " + mask_field.name, "CellInterval & srcInterval",
+                "PdfField_T * " + dst_field.name, "CellInterval & dstInterval", kernels['localPartialCoalescence'].kernel_selection_parameters,
+                ["gpuStream_t stream = nullptr"] if is_gpu else []]
+             | type_identifier_list -}}
+      ) const;
+
       /**
        * Unpacks and uniformly redistributes populations coming from a coarse block onto the fine grid.
        * */
diff --git a/python/pystencils_walberla/jinja_filters.py b/python/pystencils_walberla/jinja_filters.py
index 6d05bf8ffd51808821311bf09f0db263570983b9..66384f790e9ad390effd5d72d4de57c6750082a2 100644
--- a/python/pystencils_walberla/jinja_filters.py
+++ b/python/pystencils_walberla/jinja_filters.py
@@ -296,7 +296,7 @@ def generate_call(ctx, kernel, ghost_layers_to_include=0, cell_interval=None, st
         if isinstance(cell_interval, str):
             return cell_interval
         elif isinstance(cell_interval, dict):
-            return cell_interval[field_object]
+            return cell_interval[field_object.name]
         else:
             return None
 
diff --git a/python/pystencils_walberla/kernel_selection.py b/python/pystencils_walberla/kernel_selection.py
index ad8a99867e0970b102409823f6a17258983bae2b..a41f7b93595a9484ae6424f6c85e528ebc2ce4c0 100644
--- a/python/pystencils_walberla/kernel_selection.py
+++ b/python/pystencils_walberla/kernel_selection.py
@@ -157,6 +157,21 @@ class SwitchNode(AbstractKernelSelectionNode):
         return switch_code
 
 
+class AbortNode(AbstractKernelSelectionNode):
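+    """Selection tree leaf that unconditionally aborts at runtime. Used for kernel
+    variants that must never be selected, e.g. an excluded timestep parity."""
+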
+    def __init__(self, message):
+        self.message = message
+
+    @property
+    def selection_parameters(self):
+        return set()
+
+    def collect_kernel_calls(self):
+        return set()
+
+    def get_code(self, **kwargs):
+        return f'WALBERLA_ABORT("{self.message}")'
+
+
 class KernelCallNode(AbstractKernelSelectionNode):
     def __init__(self, ast):
         self.ast = ast
diff --git a/src/gpu/communication/NonUniformGPUScheme.h b/src/gpu/communication/NonUniformGPUScheme.h
index b872be1d0c80e3537971b49434d64033373a1822..f0d8c53b6fae443ee6e1244161b7e37f77e75179 100644
--- a/src/gpu/communication/NonUniformGPUScheme.h
+++ b/src/gpu/communication/NonUniformGPUScheme.h
@@ -111,7 +111,6 @@ class NonUniformGPUScheme
 
    std::vector< std::vector< mpi::GenericBufferSystem< CpuBuffer_T, CpuBuffer_T > > > bufferSystemCPU_;
    std::vector< std::vector< mpi::GenericBufferSystem< GpuBuffer_T, GpuBuffer_T > > > bufferSystemGPU_;
-   std::vector< std::vector< GpuBuffer_T > > localBuffer_;
    GpuBuffer_T adaptiveGPUBuffer;
 
    std::vector< shared_ptr< GeneratedNonUniformGPUPackInfo > > packInfos_;
@@ -174,7 +173,6 @@ void NonUniformGPUScheme< Stencil >::init()
 {
    bufferSystemCPU_.resize(3);
    bufferSystemGPU_.resize(3);
-   localBuffer_.resize(3);
    headers_.resize(3);
 
    communicationInProgress_.resize(3);
@@ -201,7 +199,6 @@ void NonUniformGPUScheme< Stencil >::refresh()
    {
       bufferSystemCPU_[i].clear();
       bufferSystemGPU_[i].clear();
-      localBuffer_[i].clear();
       headers_[i].clear();
       headers_[i].resize(size_t(levels + uint_t(1)));
 
@@ -210,7 +207,6 @@ void NonUniformGPUScheme< Stencil >::refresh()
          headers_[i][j].clear();
          bufferSystemCPU_[i].emplace_back(mpi::MPIManager::instance()->comm(), baseTag_ + int_c(i * levels + j));
          bufferSystemGPU_[i].emplace_back(mpi::MPIManager::instance()->comm(), baseTag_ + int_c(i * levels + j));
-         localBuffer_[i].emplace_back();
       }
 
       communicationInProgress_[i].resize(size_t(levels + uint_t(1)), false);
@@ -370,10 +366,7 @@ void NonUniformGPUScheme< Stencil >::startCommunicationEqualLevel(const uint_t i
       }
    }
    // wait for packing to finish
-   for (uint_t i = 0; i < Stencil::Q; ++i)
-   {
-      WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[i]))
-   }
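+   // some of the kernels above are launched on the default stream, so the whole device
+   // is synchronized instead of only the per-direction streams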
+   WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
 
 
    if (sendFromGPU_)
@@ -433,13 +426,9 @@ void NonUniformGPUScheme< Stencil >::startCommunicationCoarseToFine(const uint_t
             if( coarseBlock->neighborExistsLocally( neighborIdx, n ) )
             {
                auto fineReceiverBlock = dynamic_cast< Block * >( forest->getBlock( fineReceiverId ) );
-               GpuBuffer_T& gpuDataBuffer = localBuffer_[COARSE_TO_FINE][index];
-
                for (auto& pi : packInfos_)
                {
-                  WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur())
-                  WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.remainingSize(), pi->sizeCoarseToFineSend(coarseBlock, fineReceiverId, *dir))
-                  pi->communicateLocalCoarseToFine(coarseBlock, fineReceiverBlock, *dir, gpuDataBuffer, nullptr);
+                  pi->communicateLocalCoarseToFine(coarseBlock, fineReceiverBlock, *dir, nullptr);
                }
             }
             else
@@ -469,15 +458,11 @@ void NonUniformGPUScheme< Stencil >::startCommunicationCoarseToFine(const uint_t
             }
          }
       }
-      localBuffer_[COARSE_TO_FINE][index].clear();
    }
 
 
    // wait for packing to finish
-   for (uint_t i = 0; i < Stencil::Q; ++i)
-   {
-      WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[i]))
-   }
+   WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
 
    if (sendFromGPU_)
       bufferSystemGPU_[COARSE_TO_FINE][index].sendAll();
@@ -536,13 +521,9 @@ void NonUniformGPUScheme< Stencil >::startCommunicationFineToCoarse(const uint_t
          if( fineBlock->neighborExistsLocally( neighborIdx, uint_t(0) ) )
          {
             auto coarseReceiverBlock = dynamic_cast< Block * >( forest->getBlock( coarseReceiverId ) );
-            GpuBuffer_T& gpuDataBuffer = localBuffer_[FINE_TO_COARSE][index];
-
             for (auto& pi : packInfos_)
             {
-               WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur())
-               WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.allocSize() - gpuDataBuffer.size(), pi->sizeFineToCoarseSend(fineBlock, *dir))
-               pi->communicateLocalFineToCoarse(fineBlock, coarseReceiverBlock, *dir, gpuDataBuffer, nullptr);
+               pi->communicateLocalFineToCoarse(fineBlock, coarseReceiverBlock, *dir, nullptr);
             }
          }
          else
@@ -571,13 +552,9 @@ void NonUniformGPUScheme< Stencil >::startCommunicationFineToCoarse(const uint_t
             }
          }
       }
-      localBuffer_[FINE_TO_COARSE][index].clear();
    }
    // wait for packing to finish
-   for (uint_t i = 0; i < Stencil::Q; ++i)
-   {
-      WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[i]))
-   }
+   WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
 
    if (sendFromGPU_)
       bufferSystemGPU_[FINE_TO_COARSE][index].sendAll();
@@ -644,10 +621,7 @@ void NonUniformGPUScheme< Stencil >::waitCommunicateEqualLevel(const uint_t leve
          }
       }
    }
-   for (uint_t i = 0; i < Stencil::Q; ++i)
-   {
-      WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[i]))
-   }
+   WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
    communicationInProgress_[EQUAL_LEVEL][level] = false;
 }
 
@@ -673,7 +647,7 @@ void NonUniformGPUScheme< Stencil >::waitCommunicateCoarseToFine(const uint_t fi
                GpuBuffer_T &gpuDataBuffer = recvInfo.buffer();
                WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur())
                pi->unpackDataCoarseToFine(fineReceiver, header.senderId, stencil::inverseDir[header.dir],
-                                          gpuDataBuffer, streams_[0]);
+                                          gpuDataBuffer, nullptr);
             }
          }
       }
@@ -697,17 +671,14 @@ void NonUniformGPUScheme< Stencil >::waitCommunicateCoarseToFine(const uint_t fi
                WALBERLA_ASSERT_NOT_NULLPTR(cpuDataPtr)
                WALBERLA_ASSERT_NOT_NULLPTR(gpuDataPtr)
 
-               WALBERLA_GPU_CHECK(gpuMemcpyAsync(gpuDataPtr, cpuDataPtr, size, gpuMemcpyHostToDevice, streams_[0]))
-               pi->unpackDataCoarseToFine(fineReceiver, header.senderId, stencil::inverseDir[header.dir], adaptiveGPUBuffer, streams_[0]);
+               WALBERLA_GPU_CHECK(gpuMemcpyAsync(gpuDataPtr, cpuDataPtr, size, gpuMemcpyHostToDevice, nullptr))
+               pi->unpackDataCoarseToFine(fineReceiver, header.senderId, stencil::inverseDir[header.dir], adaptiveGPUBuffer, nullptr);
             }
          }
       }
    }
 
-   for (uint_t i = 0; i < Stencil::Q; ++i)
-   {
-      WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[i]))
-   }
+   WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
    communicationInProgress_[COARSE_TO_FINE][fineLevel] = false;
 }
 
@@ -766,10 +737,7 @@ void NonUniformGPUScheme< Stencil >::waitCommunicateFineToCoarse(const uint_t fi
          }
       }
    }
-   for (uint_t i = 0; i < Stencil::Q; ++i)
-   {
-      WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[i]))
-   }
+   WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
    communicationInProgress_[FINE_TO_COARSE][fineLevel] = false;
 }
 
@@ -788,9 +756,6 @@ void NonUniformGPUScheme< Stencil >::setupCommunication()
 
    std::vector< std::vector< shared_ptr< mpi::BufferSystem > > > headerExchangeBs;
 
-   std::vector< std::vector< mpi::MPISize > > localBufferSize;
-
-   localBufferSize.resize(3);
    senderInfo.resize(3);
    receiverInfo.resize(3);
 
@@ -809,10 +774,6 @@ void NonUniformGPUScheme< Stencil >::setupCommunication()
       headerExchangeBs[EQUAL_LEVEL].push_back(make_shared< mpi::BufferSystem >(mpi::MPIManager::instance()->comm(), 123));
       headerExchangeBs[COARSE_TO_FINE].push_back(make_shared< mpi::BufferSystem >(mpi::MPIManager::instance()->comm(), 123));
       headerExchangeBs[FINE_TO_COARSE].push_back(make_shared< mpi::BufferSystem >(mpi::MPIManager::instance()->comm(), 123));
-
-      localBufferSize[EQUAL_LEVEL].push_back(mpi::MPISize(0));
-      localBufferSize[COARSE_TO_FINE].push_back(mpi::MPISize(0));
-      localBufferSize[FINE_TO_COARSE].push_back(mpi::MPISize(0));
    }
 
    for (auto& iBlock : *forest)
@@ -868,8 +829,6 @@ void NonUniformGPUScheme< Stencil >::setupCommunication()
 
                if( block->neighborExistsLocally( neighborIdx, n ) )
                {
-                  for (auto& pi : packInfos_)
-                     localBufferSize[COARSE_TO_FINE][fineLevel] += mpi::MPISize(pi->sizeCoarseToFineSend(block, receiverId, *dir));
                   continue;
                }
 
@@ -896,8 +855,6 @@ void NonUniformGPUScheme< Stencil >::setupCommunication()
 
             if( block->neighborExistsLocally( neighborIdx, uint_t(0) ) )
             {
-               for (auto& pi : packInfos_)
-                  localBufferSize[FINE_TO_COARSE][level] += mpi::MPISize(pi->sizeFineToCoarseSend(block, *dir));
                continue;
             }
 
@@ -939,8 +896,6 @@ void NonUniformGPUScheme< Stencil >::setupCommunication()
             bufferSystemCPU_[i][j].sendBuffer(it.first).resize(size_t(it.second));
             bufferSystemGPU_[i][j].sendBuffer(it.first).resize(size_t(it.second));
          }
-         if (localBufferSize[i][j] > 0)
-            localBuffer_[i][j].resize(size_t(localBufferSize[i][j]));
       }
    }
    forestModificationStamp_      = forest->getBlockForest().getModificationStamp();
diff --git a/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.h b/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.h
index 96c514fcc4369084273c098ac9bf4ad21310ae29..c1595e4ed6332fee6e0b885c6e455d3e0ffb99d0 100644
--- a/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.h
+++ b/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.h
@@ -52,6 +52,12 @@ class NonuniformGPUPackingKernelsWrapper
    void localCopyDirection(PdfField_T* srcField, CellInterval srcInterval, PdfField_T* dstField,
                            CellInterval dstInterval, Direction dir, gpuStream_t stream) const               = 0;
 
+   void localCopyRedistribute(PdfField_T* srcField, CellInterval srcInterval, PdfField_T* dstField,
+                              CellInterval dstInterval, Direction dir, gpuStream_t stream) const = 0;
+
+   void localPartialCoalescence(PdfField_T* srcField, PartialCoalescenceMaskFieldGPU* maskField, CellInterval srcInterval,
+                                PdfField_T* dstField, CellInterval dstInterval, Direction dir, gpuStream_t stream) const = 0;
+
    void unpackRedistribute(PdfField_T* dstField, CellInterval ci, unsigned char* inBuffer,
                            stencil::Direction dir, gpuStream_t stream ) const = 0;
 
@@ -108,6 +114,18 @@ class NonuniformGPUPackingKernelsWrapper< PdfField_T, false >
       kernels_.localCopyDirection(srcField, srcInterval, dstField, dstInterval, dir, stream);
    }
 
+   void localCopyRedistribute(PdfField_T* srcField, CellInterval srcInterval, PdfField_T* dstField,
+                              CellInterval dstInterval, Direction dir, gpuStream_t stream) const
+   {
+      kernels_.localCopyRedistribute(srcField, srcInterval, dstField, dstInterval, dir, stream);
+   }
+
+   void localPartialCoalescence(PdfField_T* srcField, PartialCoalescenceMaskFieldGPU* maskField, CellInterval srcInterval,
+                                PdfField_T* dstField, CellInterval dstInterval, Direction dir, gpuStream_t stream) const
+   {
+      kernels_.localPartialCoalescence(srcField, maskField, srcInterval, dstField, dstInterval, dir, stream);
+   }
+
    void unpackRedistribute(PdfField_T* dstField, CellInterval ci, unsigned char* inBuffer,
                            stencil::Direction dir, gpuStream_t stream = nullptr) const
    {
@@ -192,10 +210,34 @@ class NonuniformGPUPackingKernelsWrapper< PdfField_T, true >
       kernels_.localCopyDirection(srcField, srcInterval, dstField, dstInterval, dir, timestep, stream);
    }
 
+   void localCopyRedistribute(PdfField_T* srcField, CellInterval srcInterval, PdfField_T* dstField,
+                              CellInterval dstInterval, Direction dir, gpuStream_t stream) const
+   {
+      uint8_t timestep = srcField->getTimestep();
+      WALBERLA_ASSERT((dstField->getTimestep() & 1), "When the coarse to fine step is executed, the fine field must "
+                                                     "be on an odd timestep, while the coarse source field can be "
+                                                     "on either an even or an odd timestep.")
+      kernels_.localCopyRedistribute(srcField, srcInterval, dstField, dstInterval, dir, timestep, stream);
+   }
+
+   void localPartialCoalescence(PdfField_T* srcField, PartialCoalescenceMaskFieldGPU* maskField, CellInterval srcInterval,
+                                PdfField_T* dstField, CellInterval dstInterval, Direction dir, gpuStream_t stream) const
+   {
+      uint8_t timestep = dstField->getTimestep();
+      WALBERLA_ASSERT(!(srcField->getTimestep() & 1), "When the fine to coarse step is executed, the fine field must "
+                                                      "be on an even timestep, while the coarse destination field "
+                                                      "can be on either an even or an odd timestep.")
+
+      kernels_.localPartialCoalescence(srcField, maskField, srcInterval, dstField, dstInterval, dir, timestep, stream);
+   }
+
    void unpackRedistribute(PdfField_T* dstField, CellInterval ci, unsigned char* inBuffer,
                            stencil::Direction dir, gpuStream_t stream = nullptr) const
    {
       uint8_t timestep = dstField->getTimestep();
+      WALBERLA_ASSERT((dstField->getTimestep() & 1), "When the coarse to fine step is executed, the fine field must "
+                                                     "be on an odd timestep, while the coarse source field can be "
+                                                     "on either an even or an odd timestep.")
       kernels_.unpackRedistribute(dstField, ci, inBuffer, dir, timestep, stream);
    }
 
@@ -203,6 +245,10 @@ class NonuniformGPUPackingKernelsWrapper< PdfField_T, true >
                                unsigned char* outBuffer, Direction dir, gpuStream_t stream = nullptr) const
    {
       uint8_t timestep = srcField->getTimestep();
+      WALBERLA_ASSERT(!(srcField->getTimestep() & 1), "When the fine to coarse step is executed, the fine field must "
+                                                      "be on an even timestep, while the coarse destination field "
+                                                      "can be on either an even or an odd timestep.")
+
       kernels_.packPartialCoalescence(srcField, maskField, ci, outBuffer, dir, timestep, stream);
    }
 
diff --git a/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.impl.h b/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.impl.h
index 987cebe9b2bfd343ed0277ed3faefef4dddaa753..3fe29511c3758a1543b02e5c4b28c971e939621a 100644
--- a/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.impl.h
+++ b/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.impl.h
@@ -168,21 +168,13 @@ void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::communicateLocalCoarseToFi
       Direction const unpackDir      = dstIntervals[index].first;
       CellInterval dstInterval = dstIntervals[index].second;
 
-      uint_t packSize      = kernels_.size(srcInterval);
-
 #ifndef NDEBUG
       Direction const packDir        = srcIntervals[index].first;
       WALBERLA_ASSERT_EQUAL(packDir, stencil::inverseDir[unpackDir])
       uint_t unpackSize = kernels_.redistributeSize(dstInterval);
-      WALBERLA_ASSERT_EQUAL(packSize, unpackSize)
+      WALBERLA_ASSERT_EQUAL(kernels_.size(srcInterval), unpackSize)
 #endif
-
-      // TODO: This is a dirty workaround. Code-generate direct redistribution!
-      unsigned char *buffer;
-      WALBERLA_GPU_CHECK( gpuMalloc( &buffer, packSize))
-      kernels_.packAll(srcField, srcInterval, buffer, stream);
-      kernels_.unpackRedistribute(dstField, dstInterval, buffer, unpackDir, stream);
-      WALBERLA_GPU_CHECK(gpuFree(buffer))
+      kernels_.localCopyRedistribute(srcField, srcInterval, dstField, dstInterval, unpackDir, stream);
    }
 }
 
@@ -190,6 +182,9 @@ template< typename PdfField_T>
 void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::communicateLocalCoarseToFine(
    const Block* coarseSender, Block* fineReceiver, stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream)
 {
+   // WARNING: This overload packs into an intermediate GPU buffer array.
+   // Prefer the buffer-free overload of communicateLocalCoarseToFine whenever possible.
+
    auto srcField = const_cast< Block* >(coarseSender)->getData< PdfField_T >(pdfFieldID_);
    auto dstField = fineReceiver->getData< PdfField_T >(pdfFieldID_);
 
@@ -269,22 +264,16 @@ void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::communicateLocalFineToCoar
 
    CellInterval srcInterval;
    srcField->getGhostRegion(dir, srcInterval, 2);
-   uint_t packSize = kernels_.partialCoalescenceSize(srcInterval, dir);
 
    CellInterval dstInterval = getCoarseBlockCoalescenceInterval(coarseReceiver, fineSender->getId(),
                                                                 invDir, dstField);
 
 #ifndef NDEBUG
    uint_t unpackSize = kernels_.size(dstInterval, invDir);
-   WALBERLA_ASSERT_EQUAL(packSize, unpackSize)
+   WALBERLA_ASSERT_EQUAL(kernels_.partialCoalescenceSize(srcInterval, dir), unpackSize)
 #endif
 
-   // TODO: This is a dirty workaround. Code-generate direct redistribution!
-   unsigned char *buffer;
-   WALBERLA_GPU_CHECK( gpuMalloc( &buffer, packSize))
-   kernels_.packPartialCoalescence(srcField, maskField, srcInterval, buffer, dir, stream);
-   kernels_.unpackCoalescence(dstField, dstInterval, buffer, invDir, stream);
-   WALBERLA_GPU_CHECK(gpuFree(buffer))
+   kernels_.localPartialCoalescence(srcField, maskField, srcInterval, dstField, dstInterval, dir, stream);
 }
 
 
@@ -311,6 +300,19 @@ void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::communicateLocalFineToCoar
    WALBERLA_ASSERT_EQUAL(packSize, unpackSize)
 #endif
 
    auto bufferPtr = buffer.advanceNoResize(packSize);
    kernels_.packPartialCoalescence(srcField, maskField, srcInterval, bufferPtr, dir, stream);
    kernels_.unpackCoalescence(dstField, dstInterval, bufferPtr, invDir, stream);