diff --git a/lib/walberla/experimental/communication/UniformGpuFieldPackInfoBase.hpp b/lib/walberla/experimental/communication/UniformGpuFieldPackInfoBase.hpp
index bdf4b46e71ac66b15fbe211d1db74d3cc2a42ad5..acff9cb81f59786a699a624a8f2bbe2e56da5eb0 100644
--- a/lib/walberla/experimental/communication/UniformGpuFieldPackInfoBase.hpp
+++ b/lib/walberla/experimental/communication/UniformGpuFieldPackInfoBase.hpp
@@ -63,7 +63,8 @@ class UniformGpuFieldPackInfoBase : public gpu::GeneratedGPUPackInfo
       CellInterval ci;
       field->getGhostRegion(dir, ci, sliceWidth_, false);
       std::span< value_type > buffer{ (value_type*) rawBuffer, this->size(dir, block) };
-      impl().doUnpack(field, buffer, dir, ci, stream);
+      stencil::Direction commDir{ stencil::inverseDir[ dir ] };
+      impl().doUnpack(field, buffer, commDir, ci, stream);
    }
 
    void communicateLocal  ( stencil::Direction dir, const IBlock *sender, IBlock *receiver, gpuStream_t stream ) override {
@@ -83,13 +84,14 @@ class UniformGpuFieldPackInfoBase : public gpu::GeneratedGPUPackInfo
    uint_t size(stencil::Direction dir, IBlock* block) override
    {
       using Field_T = typename Impl::Field_T;
+      using value_type = typename Field_T::value_type;
 
       const Field_T * field = block->getData< Field_T >(fieldId_);
       CellInterval ci;
-      field->getGhostRegion(dir, ci, 1, false);
+      field->getGhostRegion(dir, ci, sliceWidth_, false);
 
       uint_t elementsPerCell{ impl().elementsPerCell(dir) };
-      return elementsPerCell * ci.numCells();
+      return elementsPerCell * ci.numCells() * sizeof( value_type );
    }
 
  protected:
diff --git a/src/walberla/codegen/api.py b/src/walberla/codegen/api.py
index 284b31766546141768791ca7d544ba29151d4238..b5fa9d78009062165fcf7bd48b246be02ec37680 100644
--- a/src/walberla/codegen/api.py
+++ b/src/walberla/codegen/api.py
@@ -113,6 +113,11 @@ class CellInterval(_PlainCppClass):
 class Direction(_PlainCppClass):
     _type = cpptype("walberla::stencil::Direction", "stencil/Directions.h")
 
+    @staticmethod
+    def from_offset(offset: tuple[int, int, int]) -> str:
+        from pystencils.stencil import offset_to_direction_string
+        return f"walberla::stencil::Direction::{offset_to_direction_string(offset)}"
+
 
 class BlockDataID(_PlainCppClass):
     _type = cpptype("walberla::BlockDataID", "domain_decomposition/BlockDataID.h")
diff --git a/src/walberla/codegen/communication/pack_infos.py b/src/walberla/codegen/communication/pack_infos.py
index f262526eef3e13296ec948b6614f990ac8bbd2d6..a3284f386328c1950226dba46f5793ac2f87fbe6 100644
--- a/src/walberla/codegen/communication/pack_infos.py
+++ b/src/walberla/codegen/communication/pack_infos.py
@@ -22,7 +22,7 @@ from pystencilssfg.ir.call_tree import SfgCallTreeNode
 from pystencilssfg.ir.postprocessing import PostProcessingContext, SfgDeferredNode
 from pystencilssfg.lang import SfgKernelParamVar, AugExpr, strip_ptr_ref
 from pystencilssfg.lang.cpp import std
-from pystencilssfg.lang.gpu import CudaAPI, HipAPI
+from pystencilssfg.lang.gpu import CudaAPI, HipAPI, ProvidesGpuRuntimeAPI
 
 from ..api import GpuFieldPtr, Direction, CellInterval, uint_t
 from ..build_config import get_build_config
@@ -123,7 +123,7 @@ class GpuPdfFieldPackInfo(CustomGenerator):
             cfg=build_config.get_pystencils_config(),
         )
 
-        # GpuAPI: type[ProvidesGpuRuntimeAPI]
+        GpuAPI: type[ProvidesGpuRuntimeAPI]
         match pkc.cfg.get_target():
             case Target.CUDA:
                 GpuAPI = CudaAPI
@@ -171,7 +171,7 @@ class GpuPdfFieldPackInfo(CustomGenerator):
                     sfg.switch(dir)
                     .cases(
                         {
-                            f"walberla::stencil::Direction::{offset_to_direction_string(comm_dir)}": sfg.gpu_invoke(
+                            Direction.from_offset(comm_dir): sfg.gpu_invoke(
                                 pack_kernels[comm_dir], stream=stream
                             )
                             for comm_dir in comm_dirs
@@ -189,7 +189,7 @@ class GpuPdfFieldPackInfo(CustomGenerator):
                     sfg.switch(dir)
                     .cases(
                         {
-                            f"walberla::stencil::Direction::{offset_to_direction_string(comm_dir)}": sfg.gpu_invoke(
+                            Direction.from_offset(comm_dir): sfg.gpu_invoke(
                                 unpack_kernels[comm_dir], stream=stream
                             )
                             for comm_dir in comm_dirs
@@ -226,7 +226,12 @@ class GpuPdfFieldPackInfo(CustomGenerator):
                 .inline()
                 .const()
                 .params(dir)
-                .returns(uint_t)(f"return {self._field.index_shape[0]};"),  # FIXME: Wrong!
+                .returns(uint_t)(
+                    sfg.switch(dir, autobreak=False).cases({
+                        Direction.from_offset(comm_dir): f"return {len(elems)};"
+                        for comm_dir, elems in self._communication_sets.items()
+                    }).default("return 0;")
+                ),
             )
         )
 
diff --git a/tests/BasicLbmScenarios/CMakeLists.txt b/tests/BasicLbmScenarios/CMakeLists.txt
index 58dfde5a5c2de2ce2901c0efd94b4eefcf11104e..e2a87497f6cbd3f855f0def1a9a726f791d6cde2 100644
--- a/tests/BasicLbmScenarios/CMakeLists.txt
+++ b/tests/BasicLbmScenarios/CMakeLists.txt
@@ -11,7 +11,9 @@ target_link_libraries( TestBasicLbmScenariosCPU PRIVATE walberla::core walberla:
 add_dependencies( SfgTests TestBasicLbmScenariosCPU )
 
 foreach( TestID ${TestIDs} )
-    add_test( NAME "BasicLbmScenarios - CPU - ${TestID}" COMMAND TestBasicLbmScenariosCPU ${TestID} )
+    foreach( NumProcs 1 2 4 )  # run every scenario on 1, 2 and 4 MPI ranks
+        add_test( NAME "BasicLbmScenarios - CPU - ${NumProcs} Processes - ${TestID}" COMMAND mpiexec -n ${NumProcs} $<TARGET_FILE:TestBasicLbmScenariosCPU> ${TestID} )
+    endforeach()
 endforeach()
 
 
@@ -39,6 +41,8 @@ if( $CACHE{WALBERLA_BUILD_WITH_HIP} )
     add_dependencies( SfgTests TestBasicLbmScenariosHIP )
 
     foreach( TestID ${TestIDs} )
-        add_test( NAME "BasicLbmScenarios - HIP - ${TestID}" COMMAND TestBasicLbmScenariosHIP ${TestID} )
+        foreach( NumProcs 1 2 4 )  # run every scenario on 1, 2 and 4 MPI ranks
+            add_test( NAME "BasicLbmScenarios - HIP - ${NumProcs} Processes - ${TestID}" COMMAND mpiexec -n ${NumProcs} $<TARGET_FILE:TestBasicLbmScenariosHIP> ${TestID} )
+        endforeach()
     endforeach()
 endif()
diff --git a/tests/BasicLbmScenarios/TestBasicLbmScenarios.cpp b/tests/BasicLbmScenarios/TestBasicLbmScenarios.cpp
index 974e5f28b914698aef91aa6c73604447616be0c9..e25e33a73ad607ca21deaade1b934a7cd4f898e1 100644
--- a/tests/BasicLbmScenarios/TestBasicLbmScenarios.cpp
+++ b/tests/BasicLbmScenarios/TestBasicLbmScenarios.cpp
@@ -25,8 +25,12 @@ using TestFunction = std::function< void(mpi::Environment&) >;
  */
 void fullyPeriodic(mpi::Environment& env)
 {
-   SimDomain dom{ SimDomainBuilder{
-      .blocks = { 1, 1, 1 }, .cellsPerBlock = { 32, 32, 32 }, .periodic = { true, true, true } }
+   uint_t numProcesses = uint_c(MPIManager::instance()->numProcesses());
+   Vector3< uint_t > numBlocks{ math::getFactors3D(numProcesses) };
+
+   SimDomain dom{ SimDomainBuilder{ .blocks        = { numBlocks[0], numBlocks[1], numBlocks[2] },
+                                    .cellsPerBlock = { 16, 16, 16 },
+                                    .periodic      = { true, true, true } }
                      .build() };
 
    const Vector3< real_t > force{ 0.005, 0., 0. };
@@ -63,9 +67,12 @@ void fullyPeriodic(mpi::Environment& env)
 void mirroredHalfChannel(mpi::Environment& env)
 {
    size_t zCells{ 64 };
+   uint_t numProcesses = uint_c(MPIManager::instance()->numProcesses());
+   std::vector< uint_t > numBlocksXY{ math::getFactors(numProcesses, 2u) };
 
-   SimDomain dom{ SimDomainBuilder{
-      .blocks = { 1, 1, 1 }, .cellsPerBlock = { 4, 4, zCells }, .periodic = { true, true, false } }
+   SimDomain dom{ SimDomainBuilder{ .blocks        = { numBlocksXY[0], numBlocksXY[1], 1 },
+                                    .cellsPerBlock = { 4, 4, zCells },
+                                    .periodic      = { true, true, false } }
                      .build() };
 
    /* Hagen-Poiseuille-law in lattice units */
@@ -144,8 +151,10 @@ void mirroredHalfChannel(mpi::Environment& env)
  */
 void freeSlipPipe(mpi::Environment& env)
 {
+   uint_t numProcesses = uint_c(MPIManager::instance()->numProcesses());
+
    SimDomain dom{ SimDomainBuilder{
-      .blocks = { 1, 1, 1 }, .cellsPerBlock = { 4, 32, 32 }, .periodic = { true, false, false } }
+      .blocks = { numProcesses, 1, 1 }, .cellsPerBlock = { 4, 32, 32 }, .periodic = { true, false, false } }
                      .build() };
 
    const FlagUID fluidFlagUid{ "Fluid" };
@@ -265,7 +274,7 @@ int main(int argc, char** argv)
 
    if (auto entry = BasicLbmScenarios::TESTS.find(testId); entry != BasicLbmScenarios::TESTS.end())
    {
-      std::get< BasicLbmScenarios::TestFunction >(*entry)(env);
+      std::get< BasicLbmScenarios::TestFunction > (*entry)(env);
       return EXIT_SUCCESS;
    }