From 2aa8cafb19a37f9e8b4b2f7d223eb872969c0042 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Tue, 8 Apr 2025 17:19:02 +0200
Subject: [PATCH] Introduce generator for GPU PDF-Field Pack Infos.

Squashed commit of the following:

commit fd5ccd1442a27fdb209f38dfaa7e2e36e770d8cf
Author: Frederik Hennig <frederik.hennig@fau.de>
Date:   Tue Apr 8 17:14:38 2025 +0200

    attempt fix ci

commit 2c3b8677008cf964fe2b3fb64b1ca14d84f7070c
Author: Frederik Hennig <frederik.hennig@fau.de>
Date:   Tue Apr 8 17:00:14 2025 +0200

    attempt to fix, 2

commit 30e3dbc6cca2b02bb2f87e4ba438ac5e0d8635fa
Author: Frederik Hennig <frederik.hennig@fau.de>
Date:   Tue Apr 8 16:57:31 2025 +0200

    attempt to fix mpiexec in CI

commit 8daef09dfd1e39b462a7c682ff29f68ac338df62
Author: Frederik Hennig <frederik.hennig@fau.de>
Date:   Tue Apr 8 16:20:58 2025 +0200

    output test logs on failure in CI

commit 2eb890ae6d5ca30a4b9374eebe7d005be3890ce1
Author: Frederik Hennig <frederik.hennig@fau.de>
Date:   Tue Apr 8 15:16:34 2025 +0200

    fix and test gpu packinfo packet size and direction sets.

commit 4800dcb0f80da8fe315b943c6f447a6e34d9f16a
Author: Frederik Hennig <frederik.hennig@fau.de>
Date:   Tue Apr 8 12:40:18 2025 +0200

    fix compiler error

commit a171e6f4fcd9b2dd1dc1566565b149d6edbb6f7d
Merge: de494f9 7effd7c
Author: Frederik Hennig <frederik.hennig@fau.de>
Date:   Tue Apr 8 12:11:51 2025 +0200

    Merge branch 'master' into fhennig/gpu-packinfo

commit de494f9be033fff98f582c7c56d213af92f9b9c7
Author: Frederik Hennig <frederik.hennig@fau.de>
Date:   Mon Apr 7 17:17:05 2025 +0200

    add device memtag

commit 683531548402af740852cc7e90bd2c20cf1be3b5
Author: Frederik Hennig <frederik.hennig@fau.de>
Date:   Mon Apr 7 13:19:17 2025 +0200

    update gpu pdf field pack info: restrict to streaming PDFs

commit 9be1458f80f82c89c6c3e8214713f71e075163bf
Author: Frederik Hennig <frederik.hennig@fau.de>
Date:   Thu Apr 3 09:52:30 2025 +0200

    fix GPU comm scheme

commit ad769fc122b426dfcfde64f8918617860725bf08
Author: Frederik Hennig <frederik.hennig@fau.de>
Date:   Wed Apr 2 22:32:16 2025 +0200

    finished generated packinfo implementation

commit 68b60f662ceea3481760dab5abb767067e5c7662
Author: Frederik Hennig <frederik.hennig@fau.de>
Date:   Wed Apr 2 16:57:11 2025 +0200

    pack info codegen WIP

commit 2998cdf7114d1b3c504af1d39bb5652836280db4
Author: Frederik Hennig <frederik.hennig@fau.de>
Date:   Wed Apr 2 15:25:07 2025 +0200

    gpu pack info base class and impl concept
---
 .gitlab-ci.yml                                |   4 +-
 lib/CMakeLists.txt                            |   1 +
 .../UniformGpuFieldPackInfoBase.hpp           | 107 +++++++
 .../experimental/memory/MemoryTags.hpp        |   7 +
 src/walberla/codegen/api.py                   |  24 +-
 .../codegen/communication/__init__.py         |   3 +
 .../codegen/communication/pack_infos.py       | 289 ++++++++++++++++++
 tests/BasicLbmScenarios/CMakeLists.txt        |   8 +-
 tests/BasicLbmScenarios/LbmAlgorithms.py      |  14 +-
 tests/BasicLbmScenarios/PackInfo.py           |  12 +
 tests/BasicLbmScenarios/SimDomain.hpp         |  18 +-
 .../TestBasicLbmScenarios.cpp                 |  21 +-
 tests/CMakeLists.txt                          |   5 +
 13 files changed, 480 insertions(+), 33 deletions(-)
 create mode 100644 lib/walberla/experimental/communication/UniformGpuFieldPackInfoBase.hpp
 create mode 100644 src/walberla/codegen/communication/__init__.py
 create mode 100644 src/walberla/codegen/communication/pack_infos.py
 create mode 100644 tests/BasicLbmScenarios/PackInfo.py

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index bdaaf9e..37e7a77 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -35,8 +35,10 @@ typecheck:
     - cd build/${cmakePresetName}
     - cmake --build . --target SfgTests
     - cmake --build . --target UserManualExamples
+  variables:
+    TESTSUITE_MPIEXEC_FLAGS: --oversubscribe;--allow-run-as-root
   script:
-    - ctest
+    - ctest --output-on-failure
 
 .clang-19:
   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-19:latest
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 4a50799..71cbd39 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -7,6 +7,7 @@ target_sources( walberla_experimental
     walberla/experimental/sweep/SparseIndexList.hpp
     walberla/experimental/lbm/GenericHbbBoundary.hpp
     walberla/experimental/lbm/IrregularFreeSlip.hpp
+    walberla/experimental/communication/UniformGpuFieldPackInfoBase.hpp
 )
 
 target_link_libraries(
diff --git a/lib/walberla/experimental/communication/UniformGpuFieldPackInfoBase.hpp b/lib/walberla/experimental/communication/UniformGpuFieldPackInfoBase.hpp
new file mode 100644
index 0000000..acff9cb
--- /dev/null
+++ b/lib/walberla/experimental/communication/UniformGpuFieldPackInfoBase.hpp
@@ -0,0 +1,107 @@
+#pragma once
+
+#if defined(WALBERLA_BUILD_WITH_CUDA) || defined(WALBERLA_BUILD_WITH_HIP)
+
+#include "core/all.h"
+
+#include "gpu/communication/GeneratedGPUPackInfo.h"
+
+#include <concepts>
+#include <span>
+
+namespace walberla::experimental::communication
+{
+
+namespace detail
+{
+/// Interface expected from the `Impl` type of UniformGpuFieldPackInfoBase; fields are handed over as pointers.
+template< typename T >
+concept UniformGpuFieldPackInfoImpl = requires(T impl,                                              //
+                                               typename T::Field_T* field,                          //
+                                               std::span< typename T::Field_T::value_type > buffer, //
+                                               stencil::Direction dir,                              //
+                                               CellInterval& ci,                                    //
+                                               gpuStream_t stream                                   //
+) {
+   typename T::Field_T;
+   { impl.doPack(field, buffer, dir, ci, stream) } -> std::same_as< void >;
+
+   { impl.doUnpack(field, buffer, dir, ci, stream) } -> std::same_as< void >;
+
+   { impl.doLocalCopy(field, ci, field, ci, dir, stream) } -> std::same_as< void >;
+
+   { impl.elementsPerCell(dir) } -> std::same_as< uint_t >;
+};
+} // namespace detail
+
+/// CRTP base implementing gpu::GeneratedGPUPackInfo by forwarding to doPack/doUnpack/doLocalCopy of `Impl`.
+template< typename Impl >
+class UniformGpuFieldPackInfoBase : public gpu::GeneratedGPUPackInfo
+{
+ public:
+   // static_assert( detail::UniformGpuFieldPackInfoImpl< Impl >, "Impl does not satisfy constraints.");
+
+   UniformGpuFieldPackInfoBase(BlockDataID fieldId, uint_t sliceWidth = 1)
+      : fieldId_{ fieldId }, sliceWidth_{ sliceWidth }
+   {}
+
+   /// Packs the slice before the ghost layer in direction `dir` into `rawBuffer` using `Impl::doPack`.
+   void pack(stencil::Direction dir, unsigned char* rawBuffer, IBlock* block, gpuStream_t stream) override
+   {
+      using Field_T    = typename Impl::Field_T;
+      using value_type = typename Field_T::value_type;
+      Field_T* field = block->getData< Field_T >(fieldId_);
+      CellInterval ci;
+      field->getSliceBeforeGhostLayer(dir, ci, sliceWidth_, false);
+      const uint_t numElements = this->size(dir, block) / sizeof(value_type); // size() counts bytes
+      std::span< value_type > buffer{ reinterpret_cast< value_type* >(rawBuffer), numElements };
+      impl().doPack(field, buffer, dir, ci, stream);
+   }
+
+   /// Unpacks `rawBuffer` into the ghost region in direction `dir`; `Impl::doUnpack` gets the inverse direction.
+   void unpack(stencil::Direction dir, unsigned char* rawBuffer, IBlock* block, gpuStream_t stream) override
+   {
+      using Field_T    = typename Impl::Field_T;
+      using value_type = typename Field_T::value_type;
+      Field_T* field = block->getData< Field_T >(fieldId_);
+      CellInterval ci;
+      field->getGhostRegion(dir, ci, sliceWidth_, false);
+      const uint_t numElements = this->size(dir, block) / sizeof(value_type); // size() counts bytes
+      std::span< value_type > buffer{ reinterpret_cast< value_type* >(rawBuffer), numElements };
+      impl().doUnpack(field, buffer, stencil::inverseDir[dir], ci, stream);
+   }
+
+   /// Copies the sender's slice before the ghost layer in `dir` into the receiver's opposite ghost region.
+   void communicateLocal(stencil::Direction dir, const IBlock* sender, IBlock* receiver, gpuStream_t stream) override {
+      using Field_T = typename Impl::Field_T;
+      Field_T* srcField = const_cast< IBlock* >(sender)->getData< Field_T >(fieldId_);
+      Field_T* dstField = receiver->getData< Field_T >(fieldId_);
+      CellInterval srcRegion;
+      CellInterval dstRegion;
+      srcField->getSliceBeforeGhostLayer(dir, srcRegion, sliceWidth_, false);
+      dstField->getGhostRegion(stencil::inverseDir[dir], dstRegion, sliceWidth_, false);
+      impl().doLocalCopy(srcField, srcRegion, dstField, dstRegion, dir, stream);
+   }
+
+   /// Returns the size in bytes of the data exchanged in direction `dir`.
+   uint_t size(stencil::Direction dir, IBlock* block) override
+   {
+      using Field_T    = typename Impl::Field_T;
+      using value_type = typename Field_T::value_type;
+      const Field_T* field = block->getData< Field_T >(fieldId_);
+      CellInterval ci;
+      field->getGhostRegion(dir, ci, sliceWidth_, false);
+      return impl().elementsPerCell(dir) * ci.numCells() * sizeof(value_type);
+   }
+
+ protected:
+   BlockDataID fieldId_;
+   uint_t sliceWidth_;
+
+ private:
+   Impl& impl() { return static_cast< Impl& >(*this); }
+};
+
+} // namespace walberla::experimental::communication
+
+#endif
diff --git a/lib/walberla/experimental/memory/MemoryTags.hpp b/lib/walberla/experimental/memory/MemoryTags.hpp
index 902d739..0a26a34 100644
--- a/lib/walberla/experimental/memory/MemoryTags.hpp
+++ b/lib/walberla/experimental/memory/MemoryTags.hpp
@@ -38,6 +38,13 @@ struct unified : public _mem_tag
 {};
 inline unified unified_v;
 
+/**
+ * @brief Memory tag indicating GPU device memory.
+ */
+struct device : public _mem_tag
+{};
+inline device device_v;
+
 } // namespace memtag
 
 template< typename T >
diff --git a/src/walberla/codegen/api.py b/src/walberla/codegen/api.py
index 766c6fc..b5fa9d7 100644
--- a/src/walberla/codegen/api.py
+++ b/src/walberla/codegen/api.py
@@ -113,6 +113,11 @@ class CellInterval(_PlainCppClass):
 class Direction(_PlainCppClass):
     _type = cpptype("walberla::stencil::Direction", "stencil/Directions.h")
 
+    @staticmethod
+    def from_offset(offset: tuple[int, int, int]) -> str:  # qualified C++ name of the direction
+        from pystencils.stencil import offset_to_direction_string
+        return f"walberla::stencil::Direction::{offset_to_direction_string(offset)}"
+
 
 class BlockDataID(_PlainCppClass):
     _type = cpptype("walberla::BlockDataID", "domain_decomposition/BlockDataID.h")
@@ -264,7 +269,7 @@ class GhostLayerFieldPtr(GenericWalberlaField):
     )
 
     @staticmethod
-    def create(field: Field):
+    def create(field: Field, const: bool = False):
         if field.index_dimensions > 1:
             raise ValueError(
                 "Cannot map fields with more than one index dimension to field::GhostLayerField."
@@ -279,15 +284,11 @@ class GhostLayerFieldPtr(GenericWalberlaField):
 
         fsize = field.index_shape[0] if field.index_shape else 1
 
-        return GhostLayerFieldPtr(element_type, fsize).var(field.name)
+        return GhostLayerFieldPtr(element_type, fsize, const=const).var(field.name)
 
-    def __init__(
-        self,
-        element_type: UserTypeSpec,
-        fsize: int,
-    ):
+    def __init__(self, element_type: UserTypeSpec, fsize: int, const: bool = False):
         element_type = create_type(element_type)
-        field_type = self._template(element_type=element_type, fsize=fsize)
+        field_type = self._template(element_type=element_type, fsize=fsize, const=const)
 
         super().__init__(element_type, field_type, ptr=True)
 
@@ -299,7 +300,7 @@ class GpuFieldPtr(GenericWalberlaField):
     )
 
     @staticmethod
-    def create(field: Field):
+    def create(field: Field, const: bool = False):
         if field.index_dimensions > 1:
             raise ValueError(
                 "Cannot map fields with more than one index dimension to gpu::GpuField."
@@ -314,15 +315,16 @@ class GpuFieldPtr(GenericWalberlaField):
 
         fsize = field.index_shape[0] if field.index_shape else 1
 
-        return GpuFieldPtr(element_type, fsize).var(field.name)
+        return GpuFieldPtr(element_type, fsize, const=const).var(field.name)
 
     def __init__(
         self,
         element_type: UserTypeSpec,
         fsize: int,
+        const: bool = False,
     ):
         element_type = create_type(element_type)
-        field_type = self._template(element_type=element_type)
+        field_type = self._template(element_type=element_type, const=const)
 
         super().__init__(element_type, field_type, ptr=True)
 
diff --git a/src/walberla/codegen/communication/__init__.py b/src/walberla/codegen/communication/__init__.py
new file mode 100644
index 0000000..9a51492
--- /dev/null
+++ b/src/walberla/codegen/communication/__init__.py
@@ -0,0 +1,3 @@
+from .pack_infos import GpuPdfFieldPackInfo
+
+__all__ = ["GpuPdfFieldPackInfo"]
diff --git a/src/walberla/codegen/communication/pack_infos.py b/src/walberla/codegen/communication/pack_infos.py
new file mode 100644
index 0000000..a3284f3
--- /dev/null
+++ b/src/walberla/codegen/communication/pack_infos.py
@@ -0,0 +1,289 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+from pystencils import (
+    Field,
+    FieldType,
+    Assignment,
+    CreateKernelConfig,
+    DynamicType,
+    Target,
+)
+from pystencils.stencil import offset_to_direction_string
+from pystencils.codegen.properties import FieldBasePtr
+from lbmpy import Stencil, LBStencil
+
+from pystencilssfg import SfgComposer
+from pystencilssfg.composer.basic_composer import KernelsAdder
+from pystencilssfg.composer.custom import CustomGenerator
+from pystencilssfg.ir import SfgKernelHandle, SfgEmptyNode
+from pystencilssfg.ir.call_tree import SfgCallTreeNode
+from pystencilssfg.ir.postprocessing import PostProcessingContext, SfgDeferredNode
+from pystencilssfg.lang import SfgKernelParamVar, AugExpr, strip_ptr_ref
+from pystencilssfg.lang.cpp import std
+from pystencilssfg.lang.gpu import CudaAPI, HipAPI, ProvidesGpuRuntimeAPI
+
+from ..api import GpuFieldPtr, Direction, CellInterval, uint_t
+from ..build_config import get_build_config
+
+
+@dataclass
+class PackingKernelsContext:
+    sfg: SfgComposer  # composer handed to the generator
+    kns: KernelsAdder  # kernel namespace the packing kernels are created in
+    cfg: CreateKernelConfig  # pystencils configuration used for every kernel
+
+
+@dataclass
+class CaptureBufferPointer(SfgDeferredNode):
+    sfg: SfgComposer  # composer used to emit the initialization
+    buffer_name: str  # name of the buffer field whose base pointer is looked up
+    buffer_span: std.span  # span whose `.data()` provides the pointer value
+
+    def expand(self, ppc: PostProcessingContext) -> SfgCallTreeNode:
+        for param in ppc.live_variables:
+            if (
+                isinstance(param, SfgKernelParamVar)
+                and param.wrapped.fields[0].name == self.buffer_name
+                and param.wrapped.get_properties(FieldBasePtr)
+            ):
+                return self.sfg.init(param)(  # initialize the parameter from the span
+                    AugExpr.format("{}.data()", self.buffer_span)
+                )
+        # No live base-pointer parameter of the buffer field was found: emit nothing
+        return SfgEmptyNode()
+
+
+class GpuPdfFieldPackInfo(CustomGenerator):
+    """Pack Info for lattice Boltzmann GPU PDF fields.
+
+    Generate a ghost layer exchange pack info for communicating lattice Boltzmann populations
+    streaming across a block boundary,
+    for use with `gpu::GPUField` and `gpu::communication::UniformGPUScheme`.
+
+    For a given velocity set, this pack info will only communicate those populations f_i
+    from block A to a neighbor block B which are being advected, by the streaming step,
+    from a cell in A to an adjacent cell in B.
+
+    .. note::
+        For the time being, this pack info is restricted to the *pull* streaming pattern.
+
+    Args:
+        name: Name of the generated pack info class
+        stencil: Velocity set of the lattice Boltzmann method
+        field: Symbolic representation of the PDF field
+    """
+
+    def __init__(self, name: str, stencil: LBStencil, field: Field):
+        if field.index_dimensions > 1:
+            raise ValueError(
+                "GpuFieldPackInfo currently does not support higher-order tensor fields"
+            )
+
+        if isinstance(field.dtype, DynamicType):
+            raise ValueError(
+                "Cannot generate GpuFieldPackInfo for a dynamically-typed field"
+            )
+
+        self._name = name
+        self._stencil = stencil
+        #   Communication directions are drawn from the full D3Q27 (3D) or D2Q9 (2D) stencil
+        full_stencil_id = Stencil.D3Q27 if self._stencil.D == 3 else Stencil.D2Q9
+        self._full_stencil = LBStencil(full_stencil_id)
+
+        #   Map storing the set of communicated populations for each communication direction
+        self._communication_sets: dict[tuple[int, int, int], list[int]] = dict()
+        for comm_dir in self._full_stencil:
+            if indices := self._get_streaming_indices(comm_dir):
+                self._communication_sets[comm_dir] = indices
+
+        self._field = field
+        self._dtype = field.dtype
+        self._src_field = Field.new_field_with_different_name(
+            self._field, f"{self._field.name}_src"
+        )
+        self._dst_field = Field.new_field_with_different_name(
+            self._field, f"{self._field.name}_dst"
+        )
+
+    def generate(self, sfg: SfgComposer) -> None:
+        """Emit the pack info class together with its pack, unpack and local-copy kernels."""
+        base_class = f"walberla::experimental::communication::UniformGpuFieldPackInfoBase< {self._name } >"
+        sfg.include(
+            "walberla/experimental/communication/UniformGpuFieldPackInfoBase.hpp"
+        )
+
+        build_config = get_build_config(sfg)
+
+        pkc = PackingKernelsContext(
+            sfg,
+            kns=sfg.kernel_namespace(f"{self._name}_kernels"),
+            cfg=build_config.get_pystencils_config(),
+        )
+
+        GpuAPI: type[ProvidesGpuRuntimeAPI]
+        match pkc.cfg.get_target():
+            case Target.CUDA:
+                GpuAPI = CudaAPI
+            case Target.HIP:
+                GpuAPI = HipAPI
+            case other:
+                raise ValueError(
+                    f"Invalid target for generating GpuFieldPackInfo: {other}"
+                )
+
+        pack_kernels: dict[tuple[int, int, int], SfgKernelHandle] = dict()
+        unpack_kernels: dict[tuple[int, int, int], SfgKernelHandle] = dict()
+        local_copy_kernels: dict[tuple[int, int, int], SfgKernelHandle] = dict()
+
+        comm_dirs = self._communication_sets.keys()
+        #   The zero direction never occurs in comm_dirs: its communication set is empty
+        for comm_dir in comm_dirs:
+            pack_kernels[comm_dir] = self._do_pack(pkc, comm_dir)
+            unpack_kernels[comm_dir] = self._do_unpack(pkc, comm_dir)
+            local_copy_kernels[comm_dir] = self._do_local_copy(pkc, comm_dir)
+
+        src_gpu_field = GpuFieldPtr.create(self._src_field)
+        gpu_field_type = strip_ptr_ref(src_gpu_field.get_dtype())
+        dst_gpu_field = GpuFieldPtr.create(self._dst_field)
+        buffer_span = std.span(self._dtype).var("buffer")
+        dir = Direction().var("dir")
+        src_interval = CellInterval(const=True, ref=True).var("srcInterval")
+        dst_interval = CellInterval(const=True, ref=True).var("dstInterval")
+
+        stream = GpuAPI.stream_t().var("stream")
+
+        sfg.klass(self._name, bases=[f"public {base_class}"])(
+            sfg.public(
+                f"using Base = {base_class};",
+                "using Base::Base;",
+                f"using Field_T = {gpu_field_type.c_string()};",
+                sfg.method("doPack").params(
+                    src_gpu_field, buffer_span, dir, src_interval, stream
+                )(
+                    sfg.map_field(
+                        self._src_field, src_gpu_field.with_cell_interval(src_interval)
+                    ),
+                    CaptureBufferPointer(sfg, "buffer", buffer_span),
+                    sfg.switch(dir)
+                    .cases(
+                        {
+                            Direction.from_offset(comm_dir): sfg.gpu_invoke(
+                                pack_kernels[comm_dir], stream=stream
+                            )
+                            for comm_dir in comm_dirs
+                        }
+                    )
+                    .default("/* unreachable */"),
+                ),
+                sfg.method("doUnpack").params(
+                    dst_gpu_field, buffer_span, dir, dst_interval, stream
+                )(
+                    sfg.map_field(
+                        self._dst_field, dst_gpu_field.with_cell_interval(dst_interval)
+                    ),
+                    CaptureBufferPointer(sfg, "buffer", buffer_span),
+                    sfg.switch(dir)
+                    .cases(
+                        {
+                            Direction.from_offset(comm_dir): sfg.gpu_invoke(
+                                unpack_kernels[comm_dir], stream=stream
+                            )
+                            for comm_dir in comm_dirs
+                        }
+                    )
+                    .default("/* unreachable */"),
+                ),
+                sfg.method("doLocalCopy").params(
+                    src_gpu_field,
+                    src_interval,
+                    dst_gpu_field,
+                    dst_interval,
+                    dir,
+                    stream,
+                )(
+                    sfg.map_field(
+                        self._src_field, src_gpu_field.with_cell_interval(src_interval)
+                    ),
+                    sfg.map_field(
+                        self._dst_field, dst_gpu_field.with_cell_interval(dst_interval)
+                    ),
+                    sfg.switch(dir)
+                    .cases(
+                        {
+                            Direction.from_offset(comm_dir): sfg.gpu_invoke(
+                                local_copy_kernels[comm_dir], stream=stream
+                            )
+                            for comm_dir in comm_dirs
+                        }
+                    )
+                    .default("/* unreachable */"),
+                ),
+                sfg.method("elementsPerCell")
+                .inline()
+                .const()
+                .params(dir)
+                .returns(uint_t)(
+                    sfg.switch(dir, autobreak=False).cases({
+                        Direction.from_offset(comm_dir): f"return {len(elems)};"
+                        for comm_dir, elems in self._communication_sets.items()
+                    }).default("return 0;")
+                ),
+            )
+        )
+
+    def _pack_accesses(self, comm_dir: tuple[int, int, int]) -> list[Field.Access]:
+        return [self._src_field.center(i) for i in self._communication_sets[comm_dir]]
+
+    def _unpack_accesses(self, comm_dir: tuple[int, int, int]) -> list[Field.Access]:
+        return [self._dst_field.center(i) for i in self._communication_sets[comm_dir]]
+
+    def _get_streaming_indices(self, comm_dir) -> list[int]:
+        """Sorted stencil indices of the directions in `_extend_dir(comm_dir)`; empty for zero."""
+        if all(d == 0 for d in comm_dir):
+            return []
+
+        from lbmpy.advanced_streaming.communication import _extend_dir
+
+        directions = set(_extend_dir(comm_dir)) & set(self._stencil)
+        return sorted(self._stencil.index(d) for d in directions)
+
+    def _do_pack(
+        self, pkc: PackingKernelsContext, comm_dir: tuple[int, int, int]
+    ) -> SfgKernelHandle:
+        """Create the kernel writing the populations of `comm_dir` to the buffer."""
+        pack_accs = self._pack_accesses(comm_dir)
+        buffer = self._buffer(len(pack_accs))
+        asms = [Assignment(buffer(i), acc) for i, acc in enumerate(pack_accs)]
+        dir_str = offset_to_direction_string(comm_dir)
+        return pkc.kns.create(asms, f"pack{dir_str}", pkc.cfg)
+
+    def _do_unpack(
+        self, pkc: PackingKernelsContext, comm_dir: tuple[int, int, int]
+    ) -> SfgKernelHandle:
+        """Create the kernel reading the populations of `comm_dir` from the buffer."""
+        unpack_accs = self._unpack_accesses(comm_dir)
+        buffer = self._buffer(len(unpack_accs))
+        asms = [Assignment(acc, buffer(i)) for i, acc in enumerate(unpack_accs)]
+        dir_str = offset_to_direction_string(comm_dir)
+        return pkc.kns.create(asms, f"unpack{dir_str}", pkc.cfg)
+
+    def _do_local_copy(
+        self, pkc: PackingKernelsContext, comm_dir: tuple[int, int, int]
+    ) -> SfgKernelHandle:
+        """Create the kernel copying the populations of `comm_dir` from source to destination."""
+        pack_accs = self._pack_accesses(comm_dir)
+        unpack_accs = self._unpack_accesses(comm_dir)
+        asms = [Assignment(dst, src) for dst, src in zip(unpack_accs, pack_accs)]
+        dir_str = offset_to_direction_string(comm_dir)
+        return pkc.kns.create(asms, f"localCopy{dir_str}", pkc.cfg)
+
+    def _buffer(self, num_elems: int):
+        return Field.create_generic(
+            "buffer",
+            1,
+            field_type=FieldType.BUFFER,
+            dtype=self._field.dtype,
+            index_shape=(num_elems,),
+        )
diff --git a/tests/BasicLbmScenarios/CMakeLists.txt b/tests/BasicLbmScenarios/CMakeLists.txt
index 58dfde5..c128ff8 100644
--- a/tests/BasicLbmScenarios/CMakeLists.txt
+++ b/tests/BasicLbmScenarios/CMakeLists.txt
@@ -11,7 +11,9 @@ target_link_libraries( TestBasicLbmScenariosCPU PRIVATE walberla::core walberla:
 add_dependencies( SfgTests TestBasicLbmScenariosCPU )
 
 foreach( TestID ${TestIDs} )
-    add_test( NAME "BasicLbmScenarios - CPU - ${TestID}" COMMAND TestBasicLbmScenariosCPU ${TestID} )
+foreach( NumProcs 1 2 4 )  # run every scenario with 1, 2 and 4 MPI processes
+    add_test( NAME "BasicLbmScenarios - CPU - ${NumProcs} Processes - ${TestID}" COMMAND mpiexec -n ${NumProcs} ${_SFG_TESTSUITE_MPIEXEC_FLAGS} TestBasicLbmScenariosCPU ${TestID} )
+endforeach()
 endforeach()
 
 
@@ -39,6 +41,8 @@ if( $CACHE{WALBERLA_BUILD_WITH_HIP} )
     add_dependencies( SfgTests TestBasicLbmScenariosHIP )
 
     foreach( TestID ${TestIDs} )
-        add_test( NAME "BasicLbmScenarios - HIP - ${TestID}" COMMAND TestBasicLbmScenariosHIP ${TestID} )
+    foreach( NumProcs 1 2 4 )  # run every scenario with 1, 2 and 4 MPI processes
+        add_test( NAME "BasicLbmScenarios - HIP - ${NumProcs} Processes - ${TestID}" COMMAND mpiexec -n ${NumProcs} ${_SFG_TESTSUITE_MPIEXEC_FLAGS} TestBasicLbmScenariosHIP ${TestID} )
+    endforeach()
     endforeach()
 endif()
diff --git a/tests/BasicLbmScenarios/LbmAlgorithms.py b/tests/BasicLbmScenarios/LbmAlgorithms.py
index eefe063..c3faa28 100644
--- a/tests/BasicLbmScenarios/LbmAlgorithms.py
+++ b/tests/BasicLbmScenarios/LbmAlgorithms.py
@@ -16,6 +16,7 @@ from lbmpy.macroscopic_value_kernels import macroscopic_values_setter
 
 from walberla.codegen import Sweep, get_build_config
 from walberla.codegen.boundaries import NoSlip, FreeSlip
+from walberla.codegen.communication import GpuPdfFieldPackInfo
 
 from walberla.codegen.build_config import DEBUG_MOCK_CMAKE
 
@@ -31,17 +32,19 @@ with SourceFileGenerator(keep_unknown_argv=True) as sfg:
 
     match args.target:
         case "cpu":
-            build_config.override.target = Target.CurrentCPU
+            target = Target.CurrentCPU
             sfg.code("#define LBM_SCENARIOS_CPU_BUILD true")
         case "hip":
-            build_config.override.target = Target.HIP
+            target = Target.HIP
             sfg.code("#define LBM_SCENARIOS_GPU_BUILD true")
         case "cuda":
-            build_config.override.target = Target.CUDA
+            target = Target.CUDA
             sfg.code("#define LBM_SCENARIOS_GPU_BUILD true")
         case _:
             raise ValueError(f"Unexpected target id: {args.target}")
 
+    build_config.override.target = target
+
     sfg.namespace("BasicLbmScenarios::gen")
 
     stencil = LBStencil(Stencil.D3Q19)
@@ -117,3 +120,8 @@ with SourceFileGenerator(keep_unknown_argv=True) as sfg:
             wall_orientation=FreeSlip.IRREGULAR,
         )
         sfg.generate(irreg_freeslip)
+
+    if build_config.override.target.is_gpu():
+        with sfg.namespace("comm"):
+            pack_info = GpuPdfFieldPackInfo("GpuPdfsPackInfo", stencil, f)
+            sfg.generate(pack_info)
diff --git a/tests/BasicLbmScenarios/PackInfo.py b/tests/BasicLbmScenarios/PackInfo.py
new file mode 100644
index 0000000..df3ccdd
--- /dev/null
+++ b/tests/BasicLbmScenarios/PackInfo.py
@@ -0,0 +1,12 @@
+import pystencils as ps
+from lbmpy import Stencil, LBStencil
+from pystencilssfg import SourceFileGenerator
+from walberla.codegen.communication import GpuPdfFieldPackInfo
+from walberla.codegen.build_config import DEBUG_MOCK_CMAKE
+
+DEBUG_MOCK_CMAKE.use_hip_default()  # NOTE(review): presumably mocks a HIP build - confirm
+
+with SourceFileGenerator() as sfg:
+    stencil = LBStencil(Stencil.D3Q19)
+    field = ps.fields(f"f({stencil.Q}): double[{stencil.D}D]")  # index size Q, D spatial dims
+    sfg.generate(GpuPdfFieldPackInfo("PackInfo", stencil, field))
diff --git a/tests/BasicLbmScenarios/SimDomain.hpp b/tests/BasicLbmScenarios/SimDomain.hpp
index 4417699..105e5d7 100644
--- a/tests/BasicLbmScenarios/SimDomain.hpp
+++ b/tests/BasicLbmScenarios/SimDomain.hpp
@@ -45,7 +45,7 @@ using CommonGpuField = gpu::GPUField< PdfField_T::value_type >;
 
 using GpuCommScheme   = gpu::communication::UniformGPUScheme< gen::LbStencil >;
 // using GpuPdfsPackInfo = gpu::communication::MemcpyPackInfo< CommonGpuField >;
-using GpuPdfsPackInfo = gpu::communication::GPUPackInfo< CommonGpuField >;
+using GpuPdfsPackInfo = gen::comm::GpuPdfsPackInfo;
 #endif
 
 struct SimDomain
@@ -70,7 +70,7 @@ struct SimDomain
       const BlockDataID uId;
    } gpuFields;
 
-   // GpuCommScheme commGpu;
+   std::unique_ptr< GpuCommScheme > commGpu;
 
    void initFromFields(const Vector3< real_t > force)
    {
@@ -121,7 +121,7 @@ struct SimDomain
    void syncGhostLayers()
    {
       // WALBERLA_GPU_CHECK(gpuPeekAtLastError());
-      commCpu();
+      (*commGpu)();
    }
 
    void fields2host()
@@ -242,15 +242,13 @@ struct SimDomainBuilder
       const BlockDataID rhoIdGpu  = gpu::addGPUFieldToStorage< ScalarField_T >(sbfs, rhoId, "rho_gpu");
       const BlockDataID uIdGpu    = gpu::addGPUFieldToStorage< VectorField_T >(sbfs, uId, "u_gpu");
 
-      // GpuCommScheme commGpu{ sbfs };
-      // auto gpuPdfsPackInfo = std::make_shared< GpuPdfsPackInfo >(pdfsIdGpu);
-      // commGpu.addPackInfo(gpuPdfsPackInfo);
+      auto commGpu = std::make_unique< GpuCommScheme >( sbfs );
       auto gpuPdfsPackInfo = std::make_shared< GpuPdfsPackInfo >(pdfsIdGpu);
-      commCpu.addPackInfo(gpuPdfsPackInfo);
+      commGpu->addPackInfo(gpuPdfsPackInfo);
+      // commCpu.addPackInfo(gpuPdfsPackInfo);
 #endif
 
-      return
-      {
+      return {
          .blocks = sbfs, //
          .cpuFields = { //
             .pdfsId = pdfsId,
@@ -265,7 +263,7 @@ struct SimDomainBuilder
             .rhoId = rhoIdGpu,
             .uId = uIdGpu
          },
-         // .commGpu = commGpu
+         .commGpu = std::move(commGpu)
 #endif
       };
    }
diff --git a/tests/BasicLbmScenarios/TestBasicLbmScenarios.cpp b/tests/BasicLbmScenarios/TestBasicLbmScenarios.cpp
index 974e5f2..e25e33a 100644
--- a/tests/BasicLbmScenarios/TestBasicLbmScenarios.cpp
+++ b/tests/BasicLbmScenarios/TestBasicLbmScenarios.cpp
@@ -25,8 +25,12 @@ using TestFunction = std::function< void(mpi::Environment&) >;
  */
 void fullyPeriodic(mpi::Environment& env)
 {
-   SimDomain dom{ SimDomainBuilder{
-      .blocks = { 1, 1, 1 }, .cellsPerBlock = { 32, 32, 32 }, .periodic = { true, true, true } }
+   uint_t numProcesses = uint_c(MPIManager::instance()->numProcesses());
+   Vector3< uint_t > numBlocks{ math::getFactors3D(numProcesses) };
+
+   SimDomain dom{ SimDomainBuilder{ .blocks        = { numBlocks[0], numBlocks[1], numBlocks[2] },
+                                    .cellsPerBlock = { 16, 16, 16 },
+                                    .periodic      = { true, true, true } }
                      .build() };
 
    const Vector3< real_t > force{ 0.005, 0., 0. };
@@ -63,9 +67,12 @@ void fullyPeriodic(mpi::Environment& env)
 void mirroredHalfChannel(mpi::Environment& env)
 {
    size_t zCells{ 64 };
+   uint_t numProcesses = uint_c(MPIManager::instance()->numProcesses());
+   std::vector< uint_t > numBlocksXY{ math::getFactors(numProcesses, 2u) };
 
-   SimDomain dom{ SimDomainBuilder{
-      .blocks = { 1, 1, 1 }, .cellsPerBlock = { 4, 4, zCells }, .periodic = { true, true, false } }
+   SimDomain dom{ SimDomainBuilder{ .blocks        = { numBlocksXY[0], numBlocksXY[1], 1 },
+                                    .cellsPerBlock = { 4, 4, zCells },
+                                    .periodic      = { true, true, false } }
                      .build() };
 
    /* Hagen-Poiseuille-law in lattice units */
@@ -144,8 +151,10 @@ void mirroredHalfChannel(mpi::Environment& env)
  */
 void freeSlipPipe(mpi::Environment& env)
 {
+   uint_t numProcesses = uint_c(MPIManager::instance()->numProcesses());
+
    SimDomain dom{ SimDomainBuilder{
-      .blocks = { 1, 1, 1 }, .cellsPerBlock = { 4, 32, 32 }, .periodic = { true, false, false } }
+      .blocks = { numProcesses, 1, 1 }, .cellsPerBlock = { 4, 32, 32 }, .periodic = { true, false, false } }
                      .build() };
 
    const FlagUID fluidFlagUid{ "Fluid" };
@@ -265,7 +274,7 @@ int main(int argc, char** argv)
 
    if (auto entry = BasicLbmScenarios::TESTS.find(testId); entry != BasicLbmScenarios::TESTS.end())
    {
-      std::get< BasicLbmScenarios::TestFunction >(*entry)(env);
+      std::get< BasicLbmScenarios::TestFunction > (*entry)(env);
       return EXIT_SUCCESS;
    }
 
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index e27ee17..2738c29 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -17,6 +17,11 @@ set(WALBERLA_BUILD_TUTORIALS OFF CACHE BOOL "")
 set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED)
 
+set(
+    _SFG_TESTSUITE_MPIEXEC_FLAGS
+    $ENV{TESTSUITE_MPIEXEC_FLAGS}
+)
+
 include(FetchContent)
 
 FetchContent_Declare(
-- 
GitLab