diff --git a/src/lbmpy/advanced_streaming/communication.py b/src/lbmpy/advanced_streaming/communication.py
index 412247bb409ad98973a085b37de4da5c3c737fae..c02b6e2229d3f1a3da0e81cfdd70a6e5c5ff5e87 100644
--- a/src/lbmpy/advanced_streaming/communication.py
+++ b/src/lbmpy/advanced_streaming/communication.py
@@ -1,8 +1,18 @@
 import itertools
 from pystencils import CreateKernelConfig, Field, Assignment, AssignmentCollection
-from pystencils.slicing import shift_slice, get_slice_before_ghost_layer, normalize_slice
-from lbmpy.advanced_streaming.utility import is_inplace, get_accessor, numeric_index, \
-    Timestep, get_timesteps, numeric_offsets
+from pystencils.slicing import (
+    shift_slice,
+    get_slice_before_ghost_layer,
+    normalize_slice,
+)
+from lbmpy.advanced_streaming.utility import (
+    is_inplace,
+    get_accessor,
+    numeric_index,
+    Timestep,
+    get_timesteps,
+    numeric_offsets,
+)
 from pystencils.datahandling import SerialDataHandling
 from pystencils.enums import Target
 from itertools import chain
@@ -10,22 +20,28 @@ from itertools import chain
 
 class LBMPeriodicityHandling:
 
-    def __init__(self, stencil, data_handling, pdf_field_name,
-                 streaming_pattern='pull', ghost_layers=1,
-                 cupy_direct_copy=True):
+    def __init__(
+        self,
+        stencil,
+        data_handling,
+        pdf_field_name,
+        streaming_pattern="pull",
+        ghost_layers=1,
+        cupy_direct_copy=True,
+    ):
         """
-            Periodicity Handling for Lattice Boltzmann Streaming.
-
-            **On the usage with cuda:**
-            - cupy allows the copying of sliced arrays within device memory using the numpy syntax,
-            e.g. `dst[:,0] = src[:,-1]`. In this implementation, this is the default for periodicity
-            handling. Alternatively, if you set `cupy_direct_copy=False`, GPU kernels are generated and
-            compiled. The compiled kernels are almost twice as fast in execution as cupy array copying,
-            but especially for large stencils like D3Q27, their compilation can take up to 20 seconds.
-            Choose your weapon depending on your use case.
+        Periodicity Handling for Lattice Boltzmann Streaming.
+
+        **On the usage with cuda:**
+        - cupy allows the copying of sliced arrays within device memory using the numpy syntax,
+        e.g. `dst[:,0] = src[:,-1]`. In this implementation, this is the default for periodicity
+        handling. Alternatively, if you set `cupy_direct_copy=False`, GPU kernels are generated and
+        compiled. The compiled kernels are almost twice as fast in execution as cupy array copying,
+        but especially for large stencils like D3Q27, their compilation can take up to 20 seconds.
+        Choose your weapon depending on your use case.
         """
         if not isinstance(data_handling, SerialDataHandling):
-            raise ValueError('Only serial data handling is supported!')
+            raise ValueError("Only serial data handling is supported!")
 
         self.stencil = stencil
         self.dim = stencil.D
@@ -56,12 +72,16 @@ class LBMPeriodicityHandling:
         self.comm_slices = []
         timesteps = get_timesteps(streaming_pattern)
         for timestep in timesteps:
-            slices_per_comm_dir = get_communication_slices(stencil=stencil,
-                                                           comm_stencil=copy_directions,
-                                                           streaming_pattern=streaming_pattern,
-                                                           prev_timestep=timestep,
-                                                           ghost_layers=ghost_layers)
-            self.comm_slices.append(list(chain.from_iterable(v for k, v in slices_per_comm_dir.items())))
+            slices_per_comm_dir = get_communication_slices(
+                stencil=stencil,
+                comm_stencil=copy_directions,
+                streaming_pattern=streaming_pattern,
+                prev_timestep=timestep,
+                ghost_layers=ghost_layers,
+            )
+            self.comm_slices.append(
+                list(chain.from_iterable(v for k, v in slices_per_comm_dir.items()))
+            )
 
         if self.target == Target.GPU and not cupy_direct_copy:
             self.device_copy_kernels = list()
@@ -81,11 +101,11 @@ class LBMPeriodicityHandling:
             arr[dst] = arr[src]
 
     def _compile_copy_kernels(self, timestep):
+        assert self.target == Target.GPU
         pdf_field = self.dh.fields[self.pdf_field_name]
         kernels = []
         for src, dst in self.comm_slices[timestep.idx]:
-            kernels.append(
-                periodic_pdf_copy_kernel(pdf_field, src, dst, target=self.target))
+            kernels.append(periodic_pdf_gpu_copy_kernel(pdf_field, src, dst))
         return kernels
 
     def _periodicity_handling_gpu(self, prev_timestep):
@@ -100,7 +120,12 @@ class LBMPeriodicityHandling:
 
 
 def get_communication_slices(
-        stencil, comm_stencil=None, streaming_pattern='pull', prev_timestep=Timestep.BOTH, ghost_layers=1):
+    stencil,
+    comm_stencil=None,
+    streaming_pattern="pull",
+    prev_timestep=Timestep.BOTH,
+    ghost_layers=1,
+):
     """
     Return the source and destination slices for periodicity handling or communication between blocks.
 
@@ -116,7 +141,9 @@ def get_communication_slices(
     if comm_stencil is None:
         comm_stencil = itertools.product(*([-1, 0, 1] for _ in range(stencil.D)))
 
-    pdfs = Field.create_generic('pdfs', spatial_dimensions=len(stencil[0]), index_shape=(stencil.Q,))
+    pdfs = Field.create_generic(
+        "pdfs", spatial_dimensions=len(stencil[0]), index_shape=(stencil.Q,)
+    )
     write_accesses = get_accessor(streaming_pattern, prev_timestep).write(pdfs, stencil)
     slices_per_comm_direction = dict()
 
@@ -130,7 +157,9 @@ def get_communication_slices(
             d = stencil.index(streaming_dir)
             write_index = numeric_index(write_accesses[d])[0]
 
-            origin_slice = get_slice_before_ghost_layer(comm_dir, ghost_layers=ghost_layers, thickness=1)
+            origin_slice = get_slice_before_ghost_layer(
+                comm_dir, ghost_layers=ghost_layers, thickness=1
+            )
             src_slice = _fix_length_one_slices(origin_slice)
 
             write_offsets = numeric_offsets(write_accesses[d])
@@ -138,13 +167,15 @@ def get_communication_slices(
 
             # TODO: this is just a hotfix. _trim_slice_in_direction breaks FreeSlip BC with adjacent periodic side
             if streaming_pattern != "pull":
-                src_slice = shift_slice(_trim_slice_in_direction(src_slice, tangential_dir), write_offsets)
+                src_slice = shift_slice(
+                    _trim_slice_in_direction(src_slice, tangential_dir), write_offsets
+                )
 
             neighbour_transform = _get_neighbour_transform(comm_dir, ghost_layers)
             dst_slice = shift_slice(src_slice, neighbour_transform)
 
-            src_slice = src_slice + (write_index, )
-            dst_slice = dst_slice + (write_index, )
+            src_slice = src_slice + (write_index,)
+            dst_slice = dst_slice + (write_index,)
 
             slices_for_dir.append((src_slice, dst_slice))
 
@@ -152,10 +183,10 @@ def get_communication_slices(
     return slices_per_comm_direction
 
 
-def periodic_pdf_copy_kernel(pdf_field, src_slice, dst_slice,
-                             domain_size=None, target=Target.GPU):
-    """Copies a rectangular array slice onto another non-overlapping array slice"""
-    from pystencils.gpucuda.kernelcreation import create_cuda_kernel
+def periodic_pdf_gpu_copy_kernel(pdf_field, src_slice, dst_slice, domain_size=None):
+    """Generate a GPU kernel which copies all values from one slice of a field
+    to another non-overlapping slice."""
+    from pystencils import create_kernel
 
     pdf_idx = src_slice[-1]
     assert isinstance(pdf_idx, int), "PDF index needs to be an integer constant"
@@ -176,18 +207,28 @@ def periodic_pdf_copy_kernel(pdf_field, src_slice, dst_slice,
     def _stop(s):
         return s.stop if isinstance(s, slice) else s
 
-    offset = [_start(s1) - _start(s2) for s1, s2 in zip(normalized_from_slice, normalized_to_slice)]
-    assert offset == [_stop(s1) - _stop(s2) for s1, s2 in zip(normalized_from_slice, normalized_to_slice)], \
-        "Slices have to have same size"
-
-    copy_eq = AssignmentCollection(main_assignments=[Assignment(pdf_field(pdf_idx), pdf_field[tuple(offset)](pdf_idx))])
-    config = CreateKernelConfig(iteration_slice=dst_slice, skip_independence_check=True)
-    ast = create_cuda_kernel(copy_eq, config=config)
-    if target == Target.GPU:
-        from pystencils.gpucuda import make_python_function
-        return make_python_function(ast)
-    else:
-        raise ValueError('Invalid target:', target)
+    offset = [
+        _start(s1) - _start(s2)
+        for s1, s2 in zip(normalized_from_slice, normalized_to_slice)
+    ]
+    assert offset == [
+        _stop(s1) - _stop(s2)
+        for s1, s2 in zip(normalized_from_slice, normalized_to_slice)
+    ], "Slices have to have same size"
+
+    copy_eq = AssignmentCollection(
+        main_assignments=[
+            Assignment(pdf_field(pdf_idx), pdf_field[tuple(offset)](pdf_idx))
+        ]
+    )
+    config = CreateKernelConfig(
+        iteration_slice=dst_slice,
+        skip_independence_check=True,
+        target=Target.GPU,
+    )
+
+    ast = create_kernel(copy_eq, config=config)
+    return ast.compile()
 
 
 def _extend_dir(direction):
@@ -196,10 +237,10 @@ def _extend_dir(direction):
     elif direction[0] == 0:
         for d in [-1, 0, 1]:
             for rest in _extend_dir(direction[1:]):
-                yield (d, ) + rest
+                yield (d,) + rest
     else:
         for rest in _extend_dir(direction[1:]):
-            yield (direction[0], ) + rest
+            yield (direction[0],) + rest
 
 
 def _get_neighbour_transform(direction, ghost_layers):
diff --git a/src/lbmpy/moment_transforms/rawmomenttransforms.py b/src/lbmpy/moment_transforms/rawmomenttransforms.py
index 11013841d0cda6cec0f57bdf5da40cbfd2fc302d..63952347c1075aab1bdfe473ecd92679d466f986 100644
--- a/src/lbmpy/moment_transforms/rawmomenttransforms.py
+++ b/src/lbmpy/moment_transforms/rawmomenttransforms.py
@@ -172,7 +172,7 @@ class PdfsToMomentsByMatrixTransform(AbstractRawMomentTransform):
 
     #   ----------------------------- Private Members -----------------------------
 
-    @ property
+    @property
     def _default_simplification(self):
         forward_simp = SimplificationStrategy()
         # forward_simp.add(substitute_moments_in_conserved_quantity_equations)
@@ -218,7 +218,7 @@ class PdfsToMomentsByChimeraTransform(AbstractRawMomentTransform):
                                                                                 self.moment_polynomials)
         self.poly_to_mono_matrix = self.mono_to_poly_matrix.inv()
 
-    @ property
+    @property
     def absorbs_conserved_quantity_equations(self):
         return True
 
@@ -414,7 +414,7 @@ class PdfsToMomentsByChimeraTransform(AbstractRawMomentTransform):
 
     #   ----------------------------- Private Members -----------------------------
 
-    @ property
+    @property
     def _default_simplification(self):
         from lbmpy.methods.momentbased.momentbasedsimplifications import (
             substitute_moments_in_conserved_quantity_equations,
diff --git a/tests/advanced_streaming/test_communication.py b/tests/advanced_streaming/test_communication.py
index dddeeea69b1957691acd89bc0d8652f99d628673..9328e8963e42dac8b681647e5f72d575d20049ef 100644
--- a/tests/advanced_streaming/test_communication.py
+++ b/tests/advanced_streaming/test_communication.py
@@ -5,36 +5,52 @@ import numpy as np
 from lbmpy.stencils import LBStencil
 from pystencils.slicing import get_slice_before_ghost_layer, get_ghost_region_slice
 from lbmpy.creationfunctions import create_lb_update_rule, LBMConfig, LBMOptimisation
-from lbmpy.advanced_streaming.communication import get_communication_slices, _fix_length_one_slices, \
-    LBMPeriodicityHandling
+from lbmpy.advanced_streaming.communication import (
+    get_communication_slices,
+    _fix_length_one_slices,
+    LBMPeriodicityHandling,
+    periodic_pdf_gpu_copy_kernel,
+)
 from lbmpy.advanced_streaming.utility import streaming_patterns, Timestep
 from lbmpy.enums import Stencil
 
 import pytest
 
 
-@pytest.mark.parametrize('stencil', [Stencil.D2Q9, Stencil.D3Q15, Stencil.D3Q19, Stencil.D3Q27])
-@pytest.mark.parametrize('streaming_pattern', streaming_patterns)
-@pytest.mark.parametrize('timestep', [Timestep.EVEN, Timestep.ODD])
+@pytest.mark.parametrize(
+    "stencil", [Stencil.D2Q9, Stencil.D3Q15, Stencil.D3Q19, Stencil.D3Q27]
+)
+@pytest.mark.parametrize("streaming_pattern", streaming_patterns)
+@pytest.mark.parametrize("timestep", [Timestep.EVEN, Timestep.ODD])
 def test_slices_not_empty(stencil, streaming_pattern, timestep):
     stencil = LBStencil(stencil)
     arr = np.zeros((4,) * stencil.D + (stencil.Q,))
-    slices = get_communication_slices(stencil, streaming_pattern=streaming_pattern, prev_timestep=timestep,
-                                      ghost_layers=1)
+    slices = get_communication_slices(
+        stencil,
+        streaming_pattern=streaming_pattern,
+        prev_timestep=timestep,
+        ghost_layers=1,
+    )
     for _, slices_list in slices.items():
         for src, dst in slices_list:
             assert all(s != 0 for s in arr[src].shape)
             assert all(s != 0 for s in arr[dst].shape)
 
 
-@pytest.mark.parametrize('stencil', [Stencil.D2Q9, Stencil.D3Q15, Stencil.D3Q19, Stencil.D3Q27])
-@pytest.mark.parametrize('streaming_pattern', streaming_patterns)
-@pytest.mark.parametrize('timestep', [Timestep.EVEN, Timestep.ODD])
+@pytest.mark.parametrize(
+    "stencil", [Stencil.D2Q9, Stencil.D3Q15, Stencil.D3Q19, Stencil.D3Q27]
+)
+@pytest.mark.parametrize("streaming_pattern", streaming_patterns)
+@pytest.mark.parametrize("timestep", [Timestep.EVEN, Timestep.ODD])
 def test_src_dst_same_shape(stencil, streaming_pattern, timestep):
     stencil = LBStencil(stencil)
     arr = np.zeros((4,) * stencil.D + (stencil.Q,))
-    slices = get_communication_slices(stencil, streaming_pattern=streaming_pattern, prev_timestep=timestep,
-                                      ghost_layers=1)
+    slices = get_communication_slices(
+        stencil,
+        streaming_pattern=streaming_pattern,
+        prev_timestep=timestep,
+        ghost_layers=1,
+    )
     for _, slices_list in slices.items():
         for src, dst in slices_list:
             src_shape = arr[src].shape
@@ -42,12 +58,15 @@ def test_src_dst_same_shape(stencil, streaming_pattern, timestep):
             assert src_shape == dst_shape
 
 
-@pytest.mark.parametrize('stencil', [Stencil.D2Q9, Stencil.D3Q15, Stencil.D3Q19, Stencil.D3Q27])
+@pytest.mark.parametrize(
+    "stencil", [Stencil.D2Q9, Stencil.D3Q15, Stencil.D3Q19, Stencil.D3Q27]
+)
 def test_pull_communication_slices(stencil):
     stencil = LBStencil(stencil)
 
     slices = get_communication_slices(
-        stencil, streaming_pattern='pull', prev_timestep=Timestep.BOTH, ghost_layers=1)
+        stencil, streaming_pattern="pull", prev_timestep=Timestep.BOTH, ghost_layers=1
+    )
     for i, d in enumerate(stencil):
         if i == 0:
             continue
@@ -58,21 +77,115 @@ def test_pull_communication_slices(stencil):
                 dst = s[1][:-1]
                 break
 
-        inner_slice = _fix_length_one_slices(get_slice_before_ghost_layer(d, ghost_layers=1))
+        inner_slice = _fix_length_one_slices(
+            get_slice_before_ghost_layer(d, ghost_layers=1)
+        )
         inv_dir = (-e for e in d)
-        gl_slice = _fix_length_one_slices(get_ghost_region_slice(inv_dir, ghost_layers=1))
+        gl_slice = _fix_length_one_slices(
+            get_ghost_region_slice(inv_dir, ghost_layers=1)
+        )
         assert src == inner_slice
         assert dst == gl_slice
 
 
-@pytest.mark.parametrize('stencil_name', [Stencil.D2Q9, Stencil.D3Q15, Stencil.D3Q19, Stencil.D3Q27])
+@pytest.mark.parametrize("direction", LBStencil(Stencil.D3Q27).stencil_entries)
+@pytest.mark.parametrize("pull", [False, True])
+def test_gpu_comm_kernels(direction: tuple, pull: bool):
+    pytest.importorskip("cupy")
+
+    stencil = LBStencil(Stencil.D3Q27)
+    inv_dir = stencil[stencil.inverse_index(direction)]
+    target = ps.Target.GPU
+
+    domain_size = (4,) * stencil.D
+
+    dh: ps.datahandling.SerialDataHandling = ps.create_data_handling(
+        domain_size,
+        periodicity=(True,) * stencil.D,
+        parallel=False,
+        default_target=target,
+    )
+
+    field = dh.add_array("field", values_per_cell=2)
+
+    if pull:
+        dst_slice = get_ghost_region_slice(inv_dir)
+        src_slice = get_slice_before_ghost_layer(direction)
+    else:
+        dst_slice = get_slice_before_ghost_layer(direction)
+        src_slice = get_ghost_region_slice(inv_dir)
+
+    src_slice += (1,)
+    dst_slice += (1,)
+
+    kernel = periodic_pdf_gpu_copy_kernel(field, src_slice, dst_slice)
+
+    dh.cpu_arrays[field.name][src_slice] = 42.0
+    dh.all_to_gpu()
+
+    dh.run_kernel(kernel)
+
+    dh.all_to_cpu()
+    np.testing.assert_equal(dh.cpu_arrays[field.name][dst_slice], 42.0)
+
+
+@pytest.mark.parametrize("stencil", [Stencil.D2Q9, Stencil.D3Q19])
+@pytest.mark.parametrize("streaming_pattern", streaming_patterns)
+def test_direct_copy_and_kernels_equivalence(stencil: Stencil, streaming_pattern: str):
+    pytest.importorskip("cupy")
+
+    target = ps.Target.GPU
+    stencil = LBStencil(stencil)
+    domain_size = (4,) * stencil.D
+
+    dh: ps.datahandling.SerialDataHandling = ps.create_data_handling(
+        domain_size,
+        periodicity=(True,) * stencil.D,
+        parallel=False,
+        default_target=target,
+    )
+
+    pdfs_a = dh.add_array("pdfs_a", values_per_cell=stencil.Q)
+    pdfs_b = dh.add_array("pdfs_b", values_per_cell=stencil.Q)
+
+    dh.fill(pdfs_a.name, 0.0, ghost_layers=True)
+    dh.fill(pdfs_b.name, 0.0, ghost_layers=True)
+
+    for q in range(stencil.Q):
+        sl = ps.make_slice[:4, :4, q] if stencil.D == 2 else ps.make_slice[:4, :4, :4, q]
+        dh.cpu_arrays[pdfs_a.name][sl] = q
+        dh.cpu_arrays[pdfs_b.name][sl] = q
+
+    dh.all_to_gpu()
+
+    direct_copy = LBMPeriodicityHandling(stencil, dh, pdfs_a.name, streaming_pattern, cupy_direct_copy=True)
+    kernels_copy = LBMPeriodicityHandling(stencil, dh, pdfs_b.name, streaming_pattern, cupy_direct_copy=False)
+
+    direct_copy(Timestep.EVEN)
+    kernels_copy(Timestep.EVEN)
+
+    dh.all_to_cpu()
+
+    np.testing.assert_equal(
+        dh.cpu_arrays[pdfs_a.name],
+        dh.cpu_arrays[pdfs_b.name]
+    )
+
+
+@pytest.mark.parametrize(
+    "stencil_name", [Stencil.D2Q9, Stencil.D3Q15, Stencil.D3Q19, Stencil.D3Q27]
+)
 def test_optimised_and_full_communication_equivalence(stencil_name):
     target = ps.Target.CPU
     stencil = LBStencil(stencil_name)
-    domain_size = (4, ) * stencil.D
+    domain_size = (4,) * stencil.D
 
-    dh = ps.create_data_handling(domain_size, periodicity=(True, ) * stencil.D,
-                                 parallel=False, default_target=target)
+    dh = ps.create_data_handling(
+        domain_size,
+        periodicity=(True,) * stencil.D,
+        parallel=False,
+        default_target=target,
+    )
 
     pdf = dh.add_array("pdf", values_per_cell=len(stencil), dtype=np.int64)
     dh.fill("pdf", 0, ghost_layers=True)
@@ -82,9 +195,9 @@ def test_optimised_and_full_communication_equivalence(stencil_name):
     gl = dh.ghost_layers_of_field("pdf")
 
     num = 0
-    for idx, x in np.ndenumerate(dh.cpu_arrays['pdf']):
-        dh.cpu_arrays['pdf'][idx] = num
-        dh.cpu_arrays['pdf_tmp'][idx] = num
+    for idx, x in np.ndenumerate(dh.cpu_arrays["pdf"]):
+        dh.cpu_arrays["pdf"][idx] = num
+        dh.cpu_arrays["pdf_tmp"][idx] = num
         num += 1
 
     lbm_config = LBMConfig(stencil=stencil, kernel_type="stream_pull_only")
@@ -95,21 +208,27 @@ def test_optimised_and_full_communication_equivalence(stencil_name):
     ast = ps.create_kernel(ac, config=config)
     stream = ast.compile()
 
-    full_communication = dh.synchronization_function(pdf.name, target=dh.default_target, optimization={"openmp": True})
+    full_communication = dh.synchronization_function(
+        pdf.name, target=dh.default_target, optimization={"openmp": True}
+    )
     full_communication()
 
     dh.run_kernel(stream)
     dh.swap("pdf", "pdf_tmp")
-    pdf_full_communication = np.copy(dh.cpu_arrays['pdf'])
+    pdf_full_communication = np.copy(dh.cpu_arrays["pdf"])
 
     num = 0
-    for idx, x in np.ndenumerate(dh.cpu_arrays['pdf']):
-        dh.cpu_arrays['pdf'][idx] = num
-        dh.cpu_arrays['pdf_tmp'][idx] = num
+    for idx, x in np.ndenumerate(dh.cpu_arrays["pdf"]):
+        dh.cpu_arrays["pdf"][idx] = num
+        dh.cpu_arrays["pdf_tmp"][idx] = num
         num += 1
 
-    optimised_communication = LBMPeriodicityHandling(stencil=stencil, data_handling=dh, pdf_field_name=pdf.name,
-                                                     streaming_pattern='pull')
+    optimised_communication = LBMPeriodicityHandling(
+        stencil=stencil,
+        data_handling=dh,
+        pdf_field_name=pdf.name,
+        streaming_pattern="pull",
+    )
     optimised_communication()
     dh.run_kernel(stream)
     dh.swap("pdf", "pdf_tmp")
@@ -119,9 +238,14 @@ def test_optimised_and_full_communication_equivalence(stencil_name):
             for j in range(gl, domain_size[1]):
                 for k in range(gl, domain_size[2]):
                     for f in range(len(stencil)):
-                        assert dh.cpu_arrays['pdf'][i, j, k, f] == pdf_full_communication[i, j, k, f], print(f)
+                        assert (
+                            dh.cpu_arrays["pdf"][i, j, k, f]
+                            == pdf_full_communication[i, j, k, f]
+                        ), print(f)
     else:
         for i in range(gl, domain_size[0]):
             for j in range(gl, domain_size[1]):
                 for f in range(len(stencil)):
-                    assert dh.cpu_arrays['pdf'][i, j, f] == pdf_full_communication[i, j, f]
+                    assert (
+                        dh.cpu_arrays["pdf"][i, j, f] == pdf_full_communication[i, j, f]
+                    )