diff --git a/lib/walberla/experimental/communication/UniformGpuFieldPackInfoBase.hpp b/lib/walberla/experimental/communication/UniformGpuFieldPackInfoBase.hpp
index 803455e449f5d8673de519883ebcf128fbdd0c41..bdf4b46e71ac66b15fbe211d1db74d3cc2a42ad5 100644
--- a/lib/walberla/experimental/communication/UniformGpuFieldPackInfoBase.hpp
+++ b/lib/walberla/experimental/communication/UniformGpuFieldPackInfoBase.hpp
@@ -1,6 +1,6 @@
 #pragma once
 
-// #if defined(WALBERLA_BUILD_WITH_CUDA) || defined(WALBERLA_BUILD_WITH_HIP)
+#if defined(WALBERLA_BUILD_WITH_CUDA) || defined(WALBERLA_BUILD_WITH_HIP)
 
 #include "core/all.h"
 
@@ -28,45 +28,63 @@ concept UniformGpuFieldPackInfoImpl = requires(T impl,
 
    { impl.doUnpack(field, buffer, dir, ci, stream) } -> std::same_as< void >;
 
+   { impl.doLocalCopy(field, ci, field, ci, dir, stream) } -> std::same_as< void >;
+
    { impl.elementsPerCell(dir) } -> std::same_as< uint_t >;
-}
+};
 } // namespace detail
 
-template< detail::UniformGpuFieldPackInfoImpl Impl >
+template< typename Impl >
 class UniformGpuFieldPackInfoBase : public gpu::GeneratedGPUPackInfo
 {
  public:
-   UniformGpuFieldPackInfoBase(BlockDataID fieldId, uint_t sliceWidth = 0)
+   // static_assert( detail::UniformGpuFieldPackInfoImpl< Impl >, "Impl does not satisfy constraints.");
+
+   UniformGpuFieldPackInfoBase(BlockDataID fieldId, uint_t sliceWidth = 1)
       : fieldId_{ fieldId }, sliceWidth_{ sliceWidth }
    {}
 
-   void pack(stencil::Direction dir, unsigned char* bufferPtr, IBlock* block, gpuStream_t stream) override
+   void pack(stencil::Direction dir, unsigned char* rawBuffer, IBlock* block, gpuStream_t stream) override
    {
       using Field_T    = typename Impl::Field_T;
       using value_type = typename Field_T::value_type;
-      Field_T& field   = *block->getData< Field_T >(fieldId_);
+      Field_T * field   = block->getData< Field_T >(fieldId_);
       CellInterval ci;
       field->getSliceBeforeGhostLayer(dir, ci, sliceWidth_, false);
-      std::span< value_type > buffer{ static_cast< value_type* >(bufferPtr), this->size(dir, block) };
-      impl().doPack(field, buffer, dir, stream);
+      std::span< value_type > buffer{ reinterpret_cast< value_type* >(rawBuffer), this->size(dir, block) }; // typed view over the raw byte buffer
+      impl().doPack(field, buffer, dir, ci, stream); // ci = slice before the ghost layer in dir
    }
 
-   void unpack(stencil::Direction dir, unsigned char* buffer, IBlock* block, gpuStream_t stream) override
+   void unpack(stencil::Direction dir, unsigned char* rawBuffer, IBlock* block, gpuStream_t stream) override
    {
       using Field_T    = typename Impl::Field_T;
       using value_type = typename Field_T::value_type;
-      Field_T& field   = *block->getData< Field_T >(fieldId_);
+      Field_T * field   = block->getData< Field_T >(fieldId_);
       CellInterval ci;
       field->getGhostRegion(dir, ci, sliceWidth_, false);
-      std::span< value_type > buffer{ static_cast< value_type* >(bufferPtr), this->size(dir, block) };
-      impl().doUnpack(field, buffer, dir, stream);
+      std::span< value_type > buffer{ reinterpret_cast< value_type* >(rawBuffer), this->size(dir, block) }; // typed view over the raw byte buffer
+      impl().doUnpack(field, buffer, dir, ci, stream); // ci = ghost region in dir
+   }
+
+   void communicateLocal  ( stencil::Direction dir, const IBlock *sender, IBlock *receiver, gpuStream_t stream ) override {
+      using Field_T    = typename Impl::Field_T;
+      // Copies field data directly from the sender block to the receiver block; no buffer is involved.
+      Field_T * srcField = const_cast< IBlock * >(sender)->getData< Field_T >(fieldId_);
+      Field_T * dstField = receiver->getData< Field_T >(fieldId_);
+      // Source: sender's slice before the ghost layer in dir. Destination: receiver's ghost region in the inverse direction.
+      CellInterval srcRegion;
+      CellInterval dstRegion;
+      srcField->getSliceBeforeGhostLayer(dir, srcRegion, sliceWidth_, false);
+      dstField->getGhostRegion(stencil::inverseDir[dir], dstRegion, sliceWidth_, false);
+      // The copy itself is delegated to the implementation's doLocalCopy.
+      impl().doLocalCopy(srcField, srcRegion, dstField, dstRegion, dir, stream);
    }
 
    uint_t size(stencil::Direction dir, IBlock* block) override
    {
       using Field_T = typename Impl::Field_T;
 
-      auto field = block->getData< Field_T >(fieldId_);
+      const Field_T * field = block->getData< Field_T >(fieldId_);
       CellInterval ci;
       field->getGhostRegion(dir, ci, 1, false);
 
@@ -84,4 +102,4 @@ class UniformGpuFieldPackInfoBase : public gpu::GeneratedGPUPackInfo
 
 } // namespace walberla::experimental::communication
 
-// #endif
+#endif
diff --git a/src/walberla/codegen/api.py b/src/walberla/codegen/api.py
index 766c6fc48653f74d6398c9c71235f82f2f2620ae..284b31766546141768791ca7d544ba29151d4238 100644
--- a/src/walberla/codegen/api.py
+++ b/src/walberla/codegen/api.py
@@ -264,7 +264,7 @@ class GhostLayerFieldPtr(GenericWalberlaField):
     )
 
     @staticmethod
-    def create(field: Field):
+    def create(field: Field, const: bool = False):
         if field.index_dimensions > 1:
             raise ValueError(
                 "Cannot map fields with more than one index dimension to field::GhostLayerField."
@@ -279,15 +279,11 @@ class GhostLayerFieldPtr(GenericWalberlaField):
 
         fsize = field.index_shape[0] if field.index_shape else 1
 
-        return GhostLayerFieldPtr(element_type, fsize).var(field.name)
+        return GhostLayerFieldPtr(element_type, fsize, const=const).var(field.name)
 
-    def __init__(
-        self,
-        element_type: UserTypeSpec,
-        fsize: int,
-    ):
+    def __init__(self, element_type: UserTypeSpec, fsize: int, const: bool = False):
         element_type = create_type(element_type)
-        field_type = self._template(element_type=element_type, fsize=fsize)
+        field_type = self._template(element_type=element_type, fsize=fsize, const=const)
 
         super().__init__(element_type, field_type, ptr=True)
 
@@ -299,7 +295,7 @@ class GpuFieldPtr(GenericWalberlaField):
     )
 
     @staticmethod
-    def create(field: Field):
+    def create(field: Field, const: bool = False):
         if field.index_dimensions > 1:
             raise ValueError(
                 "Cannot map fields with more than one index dimension to gpu::GpuField."
@@ -314,15 +310,16 @@ class GpuFieldPtr(GenericWalberlaField):
 
         fsize = field.index_shape[0] if field.index_shape else 1
 
-        return GpuFieldPtr(element_type, fsize).var(field.name)
+        return GpuFieldPtr(element_type, fsize, const=const).var(field.name)
 
     def __init__(
         self,
         element_type: UserTypeSpec,
         fsize: int,
+        const: bool = False,
     ):
         element_type = create_type(element_type)
-        field_type = self._template(element_type=element_type)
+        field_type = self._template(element_type=element_type, const=const)
 
         super().__init__(element_type, field_type, ptr=True)
 
diff --git a/src/walberla/codegen/communication/pack_infos.py b/src/walberla/codegen/communication/pack_infos.py
index 9989c9ca025fe52c9d78b741e30aac045cc310ca..229bc9dc8cbf5d08a74bc20718140e1bbe25d3b6 100644
--- a/src/walberla/codegen/communication/pack_infos.py
+++ b/src/walberla/codegen/communication/pack_infos.py
@@ -11,16 +11,20 @@ from pystencils import (
     Target,
 )
 from pystencils.stencil import offset_to_direction_string
+from pystencils.codegen.properties import FieldBasePtr
 from lbmpy import LBStencil
 
 from pystencilssfg import SfgComposer
 from pystencilssfg.composer.basic_composer import KernelsAdder
 from pystencilssfg.composer.custom import CustomGenerator
-from pystencilssfg.ir import SfgKernelHandle
+from pystencilssfg.ir import SfgKernelHandle, SfgEmptyNode
+from pystencilssfg.ir.call_tree import SfgCallTreeNode
+from pystencilssfg.ir.postprocessing import PostProcessingContext, SfgDeferredNode
+from pystencilssfg.lang import SfgKernelParamVar, AugExpr, strip_ptr_ref
 from pystencilssfg.lang.cpp import std
 from pystencilssfg.lang.gpu import CudaAPI, HipAPI
 
-from ..api import GpuFieldPtr, Direction, CellInterval
+from ..api import GpuFieldPtr, Direction, CellInterval, uint_t
 from ..build_config import get_build_config
 
 
@@ -31,6 +35,26 @@ class PackingKernelsContext:
     cfg: CreateKernelConfig
 
 
+@dataclass
+class CaptureBufferPointer(SfgDeferredNode):
+    sfg: SfgComposer
+    buffer_name: str
+    buffer_span: std.span
+    # Binds the buffer's base-pointer kernel parameter to `buffer_span.data()`, if live.
+    def expand(self, ppc: PostProcessingContext) -> SfgCallTreeNode:
+        for param in ppc.live_variables:
+            if (
+                isinstance(param, SfgKernelParamVar)
+                and param.wrapped.fields[0].name == self.buffer_name
+                and param.wrapped.get_properties(FieldBasePtr)
+            ):
+                return self.sfg.init(param)(
+                    AugExpr.format("{}.data()", self.buffer_span)
+                )
+        # No matching live parameter was found: emit nothing.
+        return SfgEmptyNode()
+
+
 class GpuFieldPackInfo(CustomGenerator):
     def __init__(self, name: str, stencil: LBStencil, field: Field):
         if field.index_dimensions > 1:
@@ -47,9 +71,18 @@ class GpuFieldPackInfo(CustomGenerator):
         self._stencil = stencil
         self._field = field
         self._dtype = field.dtype
+        self._src_field = Field.new_field_with_different_name(
+            self._field, f"{self._field.name}_src"
+        )
+        self._dst_field = Field.new_field_with_different_name(
+            self._field, f"{self._field.name}_dst"
+        )
 
     def generate(self, sfg: SfgComposer) -> None:
-        base_class = f"walberla::experimental::communication::UniformGpuFieldPackInfoImpl< {self._name } >"
+        base_class = f"walberla::experimental::communication::UniformGpuFieldPackInfoBase< {self._name } >"
+        sfg.include(
+            "walberla/experimental/communication/UniformGpuFieldPackInfoBase.hpp"
+        )
 
         build_config = get_build_config(sfg)
 
@@ -72,51 +105,105 @@ class GpuFieldPackInfo(CustomGenerator):
 
         pack_kernels: dict[tuple[int, int, int], SfgKernelHandle] = dict()
         unpack_kernels: dict[tuple[int, int, int], SfgKernelHandle] = dict()
+        local_copy_kernels: dict[tuple[int, int, int], SfgKernelHandle] = dict()
+
+        comm_dirs = [c for c in self._stencil if not all(x == 0 for x in c)]
 
-        for comm_dir in self._stencil:
-            pack_kernels[comm_dir] = self._do_pack(pkc, comm_dir)
-            unpack_kernels[comm_dir] = self._do_unpack(pkc, comm_dir)
+        for comm_dir in comm_dirs:
+            if not all(c == 0 for c in comm_dir):
+                pack_kernels[comm_dir] = self._do_pack(pkc, comm_dir)
+                unpack_kernels[comm_dir] = self._do_unpack(pkc, comm_dir)
+                local_copy_kernels[comm_dir] = self._do_local_copy(pkc, comm_dir)
 
-        gpu_field = GpuFieldPtr.create(self._field)
+        src_gpu_field = GpuFieldPtr.create(self._src_field)
+        gpu_field_type = strip_ptr_ref(src_gpu_field.get_dtype())
+        dst_gpu_field = GpuFieldPtr.create(self._dst_field)
         buffer_span = std.span(self._dtype).var("buffer")
         dir = Direction().var("dir")
-        ci = CellInterval().var("ci")
-        stream = GpuAPI.stream_t().var("stream")
+        src_interval = CellInterval(const=True, ref=True).var("srcInterval")
+        dst_interval = CellInterval(const=True, ref=True).var("dstInterval")
 
-        common_buffer = self._buffer(1)
+        stream = GpuAPI.stream_t().var("stream")
 
         sfg.klass(self._name, bases=[f"public {base_class}"])(
             sfg.public(
-                f"using Field_T = {gpu_field.get_dtype().c_string()};",
-                sfg.method("doPack").params(gpu_field, buffer_span, dir, ci, stream)(
-                    sfg.map_field(self._field, gpu_field),
-                    sfg.map_field(common_buffer, buffer_span),
-                    sfg.switch(dir).cases(
+                f"using Base = {base_class};",
+                "using Base::Base;",
+                f"using Field_T = {gpu_field_type.c_string()};",
+                sfg.method("doPack").params(
+                    src_gpu_field, buffer_span, dir, src_interval, stream
+                )(
+                    sfg.map_field(
+                        self._src_field, src_gpu_field.with_cell_interval(src_interval)
+                    ),
+                    CaptureBufferPointer(sfg, "buffer", buffer_span),
+                    sfg.switch(dir)
+                    .cases(
                         {
                             f"walberla::stencil::Direction::{offset_to_direction_string(comm_dir)}": sfg.gpu_invoke(
                                 pack_kernels[comm_dir], stream=stream
                             )
-                            for comm_dir in self._stencil
+                            for comm_dir in comm_dirs
                         }
-                    ),
+                    )
+                    .default("/* unreachable */"),
                 ),
-                sfg.method("doUnpack").params(gpu_field, buffer_span, dir, ci, stream)(
-                    sfg.map_field(self._field, gpu_field),
-                    sfg.map_field(common_buffer, buffer_span),
-                    sfg.switch(dir).cases(
+                sfg.method("doUnpack").params(
+                    src_gpu_field, buffer_span, dir, dst_interval, stream
+                )(
+                    sfg.map_field(
+                        self._dst_field, src_gpu_field.with_cell_interval(dst_interval)
+                    ),
+                    CaptureBufferPointer(sfg, "buffer", buffer_span),
+                    sfg.switch(dir)
+                    .cases(
                         {
                             f"walberla::stencil::Direction::{offset_to_direction_string(comm_dir)}": sfg.gpu_invoke(
                                 unpack_kernels[comm_dir], stream=stream
                             )
-                            for comm_dir in self._stencil
+                            for comm_dir in comm_dirs
                         }
+                    )
+                    .default("/* unreachable */"),
+                ),
+                sfg.method("doLocalCopy").params(
+                    src_gpu_field,
+                    src_interval,
+                    dst_gpu_field,
+                    dst_interval,
+                    dir,
+                    stream,
+                )(
+                    sfg.map_field(
+                        self._src_field, src_gpu_field.with_cell_interval(src_interval)
+                    ),
+                    sfg.map_field(
+                        self._dst_field, dst_gpu_field.with_cell_interval(dst_interval)
                     ),
+                    sfg.switch(dir)
+                    .cases(
+                        {
+                            f"walberla::stencil::Direction::{offset_to_direction_string(comm_dir)}": sfg.gpu_invoke(
+                                local_copy_kernels[comm_dir], stream=stream
+                            )
+                            for comm_dir in comm_dirs
+                        }
+                    )
+                    .default("/* unreachable */"),
                 ),
+                sfg.method("elementsPerCell")
+                .inline()
+                .const()
+                .params(dir)
+                .returns(uint_t)(f"return {self._field.index_shape[0]};"),
             )
         )
 
-    def _pack_accesses(self, comm_dir: tuple[int, int, int]):
-        return [self._field.center(i) for i in range(self._field.index_shape[0])]
+    def _pack_accesses(self, comm_dir: tuple[int, int, int]) -> list[Field.Access]:
+        return [self._src_field.center(i) for i in range(self._field.index_shape[0])]
+
+    def _unpack_accesses(self, comm_dir: tuple[int, int, int]) -> list[Field.Access]:
+        return [self._dst_field.center(i) for i in range(self._field.index_shape[0])]
 
     def _do_pack(
         self, pkc: PackingKernelsContext, comm_dir: tuple[int, int, int]
@@ -130,12 +217,22 @@ class GpuFieldPackInfo(CustomGenerator):
     def _do_unpack(
         self, pkc: PackingKernelsContext, comm_dir: tuple[int, int, int]
     ) -> SfgKernelHandle:
-        pack_accs = self._pack_accesses(comm_dir)
-        buffer = self._buffer(len(pack_accs))
-        asms = [Assignment(acc, buffer(i)) for i, acc in enumerate(pack_accs)]
+        unpack_accs = self._unpack_accesses(comm_dir)
+        buffer = self._buffer(len(unpack_accs))
+        asms = [Assignment(acc, buffer(i)) for i, acc in enumerate(unpack_accs)]
         dir_str = offset_to_direction_string(comm_dir)
         return pkc.kns.create(asms, f"unpack{dir_str}", pkc.cfg)
 
+    def _do_local_copy(
+        self, pkc: PackingKernelsContext, comm_dir: tuple[int, int, int]
+    ) -> SfgKernelHandle:
+        pack_accs = self._pack_accesses(comm_dir)
+        unpack_accs = self._unpack_accesses(comm_dir)
+        # Assign each source component to the matching destination component.
+        asms = [Assignment(dst, src) for dst, src in zip(unpack_accs, pack_accs)]
+        dir_str = offset_to_direction_string(comm_dir)
+        return pkc.kns.create(asms, f"localCopy{dir_str}", pkc.cfg)
+
     def _buffer(self, num_elems: int):
         return Field.create_generic(
             "buffer",
diff --git a/tests/BasicLbmScenarios/LbmAlgorithms.py b/tests/BasicLbmScenarios/LbmAlgorithms.py
index eefe063997ded8cbdb183b1244336479f8514e8e..9658792ce11f159b680eb62e5514d03f53a1a655 100644
--- a/tests/BasicLbmScenarios/LbmAlgorithms.py
+++ b/tests/BasicLbmScenarios/LbmAlgorithms.py
@@ -16,6 +16,7 @@ from lbmpy.macroscopic_value_kernels import macroscopic_values_setter
 
 from walberla.codegen import Sweep, get_build_config
 from walberla.codegen.boundaries import NoSlip, FreeSlip
+from walberla.codegen.communication import GpuFieldPackInfo
 
 from walberla.codegen.build_config import DEBUG_MOCK_CMAKE
 
@@ -31,17 +32,19 @@ with SourceFileGenerator(keep_unknown_argv=True) as sfg:
 
     match args.target:
         case "cpu":
-            build_config.override.target = Target.CurrentCPU
+            target = Target.CurrentCPU
             sfg.code("#define LBM_SCENARIOS_CPU_BUILD true")
         case "hip":
-            build_config.override.target = Target.HIP
+            target = Target.HIP
             sfg.code("#define LBM_SCENARIOS_GPU_BUILD true")
         case "cuda":
-            build_config.override.target = Target.CUDA
+            target = Target.CUDA
             sfg.code("#define LBM_SCENARIOS_GPU_BUILD true")
         case _:
             raise ValueError(f"Unexpected target id: {args.target}")
 
+    build_config.override.target = target
+
     sfg.namespace("BasicLbmScenarios::gen")
 
     stencil = LBStencil(Stencil.D3Q19)
@@ -117,3 +120,8 @@ with SourceFileGenerator(keep_unknown_argv=True) as sfg:
             wall_orientation=FreeSlip.IRREGULAR,
         )
         sfg.generate(irreg_freeslip)
+
+    if build_config.override.target.is_gpu():
+        with sfg.namespace("comm"):
+            pack_info = GpuFieldPackInfo("GpuPdfsPackInfo", stencil, f)
+            sfg.generate(pack_info)
diff --git a/tests/BasicLbmScenarios/SimDomain.hpp b/tests/BasicLbmScenarios/SimDomain.hpp
index 4417699a8d2eec630f684ce46755fee0fecd45dd..508adb9d38aa80e6ae1e71be9f6bab82b20fc56c 100644
--- a/tests/BasicLbmScenarios/SimDomain.hpp
+++ b/tests/BasicLbmScenarios/SimDomain.hpp
@@ -45,7 +45,7 @@ using CommonGpuField = gpu::GPUField< PdfField_T::value_type >;
 
 using GpuCommScheme   = gpu::communication::UniformGPUScheme< gen::LbStencil >;
 // using GpuPdfsPackInfo = gpu::communication::MemcpyPackInfo< CommonGpuField >;
-using GpuPdfsPackInfo = gpu::communication::GPUPackInfo< CommonGpuField >;
+using GpuPdfsPackInfo = gen::comm::GpuPdfsPackInfo;
 #endif
 
 struct SimDomain
@@ -70,7 +70,7 @@ struct SimDomain
       const BlockDataID uId;
    } gpuFields;
 
-   // GpuCommScheme commGpu;
+   GpuCommScheme commGpu;
 
    void initFromFields(const Vector3< real_t > force)
    {
@@ -121,7 +121,7 @@ struct SimDomain
    void syncGhostLayers()
    {
       // WALBERLA_GPU_CHECK(gpuPeekAtLastError());
-      commCpu();
+      commGpu();
    }
 
    void fields2host()
@@ -242,11 +242,10 @@ struct SimDomainBuilder
       const BlockDataID rhoIdGpu  = gpu::addGPUFieldToStorage< ScalarField_T >(sbfs, rhoId, "rho_gpu");
       const BlockDataID uIdGpu    = gpu::addGPUFieldToStorage< VectorField_T >(sbfs, uId, "u_gpu");
 
-      // GpuCommScheme commGpu{ sbfs };
-      // auto gpuPdfsPackInfo = std::make_shared< GpuPdfsPackInfo >(pdfsIdGpu);
-      // commGpu.addPackInfo(gpuPdfsPackInfo);
+      GpuCommScheme commGpu{ sbfs };
       auto gpuPdfsPackInfo = std::make_shared< GpuPdfsPackInfo >(pdfsIdGpu);
-      commCpu.addPackInfo(gpuPdfsPackInfo);
+      commGpu.addPackInfo(gpuPdfsPackInfo);
+      // commCpu.addPackInfo(gpuPdfsPackInfo);
 #endif
 
       return
@@ -265,7 +264,7 @@ struct SimDomainBuilder
             .rhoId = rhoIdGpu,
             .uId = uIdGpu
          },
-         // .commGpu = commGpu
+         .commGpu = commGpu
 #endif
       };
    }