Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Commits on Source (13)
Showing 774 additions and 361 deletions
......@@ -29,7 +29,7 @@ stages:
tests-and-coverage:
stage: pretest
extends: .every-commit
image: i10git.cs.fau.de:5005/pycodegen/pycodegen/full
image: i10git.cs.fau.de:5005/pycodegen/pycodegen/full:cupy12.3
script:
# - pip install sympy --upgrade
- export NUM_CORES=$(nproc --all)
......
......@@ -11,3 +11,4 @@ Contributors:
- Rudolf Weeber <weeber@icp.uni-stuttgart.de>
- Christian Godenschwager <christian.godenschwager@fau.de>
- Jan Hönig <jan.hoenig@fau.de>
- Philipp Suffa <philipp.suffa@fau.de>
------------------------ Important ---------------------------------
lbmpy is under the following GNU AGPLv3 license.
This license holds for the sources of lbmpy itself as well
as for all kernels generated with lbmpy i.e.
the output of lbmpy is also protected by the GNU AGPLv3 license.
----------------------------------------------------------------------
GNU AFFERO GENERAL PUBLIC LICENSE
Version 3, 19 November 2007
......
import itertools
from pystencils import CreateKernelConfig, Field, Assignment, AssignmentCollection
from pystencils.slicing import shift_slice, get_slice_before_ghost_layer, normalize_slice
from lbmpy.advanced_streaming.utility import is_inplace, get_accessor, numeric_index, \
Timestep, get_timesteps, numeric_offsets
from pystencils.slicing import (
shift_slice,
get_slice_before_ghost_layer,
normalize_slice,
)
from lbmpy.advanced_streaming.utility import (
is_inplace,
get_accessor,
numeric_index,
Timestep,
get_timesteps,
numeric_offsets,
)
from pystencils.datahandling import SerialDataHandling
from pystencils.enums import Target
from itertools import chain
......@@ -10,22 +20,28 @@ from itertools import chain
class LBMPeriodicityHandling:
def __init__(self, stencil, data_handling, pdf_field_name,
streaming_pattern='pull', ghost_layers=1,
cupy_direct_copy=True):
def __init__(
self,
stencil,
data_handling,
pdf_field_name,
streaming_pattern="pull",
ghost_layers=1,
cupy_direct_copy=True,
):
"""
Periodicity Handling for Lattice Boltzmann Streaming.
**On the usage with cuda:**
- cupy allows the copying of sliced arrays within device memory using the numpy syntax,
e.g. `dst[:,0] = src[:,-1]`. In this implementation, this is the default for periodicity
handling. Alternatively, if you set `cupy_direct_copy=False`, GPU kernels are generated and
compiled. The compiled kernels are almost twice as fast in execution as cupy array copying,
but especially for large stencils like D3Q27, their compilation can take up to 20 seconds.
Choose your weapon depending on your use case.
Periodicity Handling for Lattice Boltzmann Streaming.
**On the usage with cuda:**
- cupy allows the copying of sliced arrays within device memory using the numpy syntax,
e.g. `dst[:,0] = src[:,-1]`. In this implementation, this is the default for periodicity
handling. Alternatively, if you set `cupy_direct_copy=False`, GPU kernels are generated and
compiled. The compiled kernels are almost twice as fast in execution as cupy array copying,
but especially for large stencils like D3Q27, their compilation can take up to 20 seconds.
Choose your weapon depending on your use case.
"""
if not isinstance(data_handling, SerialDataHandling):
raise ValueError('Only serial data handling is supported!')
raise ValueError("Only serial data handling is supported!")
self.stencil = stencil
self.dim = stencil.D
......@@ -56,12 +72,16 @@ class LBMPeriodicityHandling:
self.comm_slices = []
timesteps = get_timesteps(streaming_pattern)
for timestep in timesteps:
slices_per_comm_dir = get_communication_slices(stencil=stencil,
comm_stencil=copy_directions,
streaming_pattern=streaming_pattern,
prev_timestep=timestep,
ghost_layers=ghost_layers)
self.comm_slices.append(list(chain.from_iterable(v for k, v in slices_per_comm_dir.items())))
slices_per_comm_dir = get_communication_slices(
stencil=stencil,
comm_stencil=copy_directions,
streaming_pattern=streaming_pattern,
prev_timestep=timestep,
ghost_layers=ghost_layers,
)
self.comm_slices.append(
list(chain.from_iterable(v for k, v in slices_per_comm_dir.items()))
)
if self.target == Target.GPU and not cupy_direct_copy:
self.device_copy_kernels = list()
......@@ -81,11 +101,11 @@ class LBMPeriodicityHandling:
arr[dst] = arr[src]
def _compile_copy_kernels(self, timestep):
assert self.target == Target.GPU
pdf_field = self.dh.fields[self.pdf_field_name]
kernels = []
for src, dst in self.comm_slices[timestep.idx]:
kernels.append(
periodic_pdf_copy_kernel(pdf_field, src, dst, target=self.target))
kernels.append(periodic_pdf_gpu_copy_kernel(pdf_field, src, dst))
return kernels
def _periodicity_handling_gpu(self, prev_timestep):
......@@ -100,7 +120,12 @@ class LBMPeriodicityHandling:
def get_communication_slices(
stencil, comm_stencil=None, streaming_pattern='pull', prev_timestep=Timestep.BOTH, ghost_layers=1):
stencil,
comm_stencil=None,
streaming_pattern="pull",
prev_timestep=Timestep.BOTH,
ghost_layers=1,
):
"""
Return the source and destination slices for periodicity handling or communication between blocks.
......@@ -116,7 +141,9 @@ def get_communication_slices(
if comm_stencil is None:
comm_stencil = itertools.product(*([-1, 0, 1] for _ in range(stencil.D)))
pdfs = Field.create_generic('pdfs', spatial_dimensions=len(stencil[0]), index_shape=(stencil.Q,))
pdfs = Field.create_generic(
"pdfs", spatial_dimensions=len(stencil[0]), index_shape=(stencil.Q,)
)
write_accesses = get_accessor(streaming_pattern, prev_timestep).write(pdfs, stencil)
slices_per_comm_direction = dict()
......@@ -130,7 +157,9 @@ def get_communication_slices(
d = stencil.index(streaming_dir)
write_index = numeric_index(write_accesses[d])[0]
origin_slice = get_slice_before_ghost_layer(comm_dir, ghost_layers=ghost_layers, thickness=1)
origin_slice = get_slice_before_ghost_layer(
comm_dir, ghost_layers=ghost_layers, thickness=1
)
src_slice = _fix_length_one_slices(origin_slice)
write_offsets = numeric_offsets(write_accesses[d])
......@@ -138,13 +167,15 @@ def get_communication_slices(
# TODO: this is just a hotfix. _trim_slice_in_direction breaks FreeSlip BC with adjacent periodic side
if streaming_pattern != "pull":
src_slice = shift_slice(_trim_slice_in_direction(src_slice, tangential_dir), write_offsets)
src_slice = shift_slice(
_trim_slice_in_direction(src_slice, tangential_dir), write_offsets
)
neighbour_transform = _get_neighbour_transform(comm_dir, ghost_layers)
dst_slice = shift_slice(src_slice, neighbour_transform)
src_slice = src_slice + (write_index, )
dst_slice = dst_slice + (write_index, )
src_slice = src_slice + (write_index,)
dst_slice = dst_slice + (write_index,)
slices_for_dir.append((src_slice, dst_slice))
......@@ -152,10 +183,10 @@ def get_communication_slices(
return slices_per_comm_direction
def periodic_pdf_copy_kernel(pdf_field, src_slice, dst_slice,
domain_size=None, target=Target.GPU):
"""Copies a rectangular array slice onto another non-overlapping array slice"""
from pystencils.gpucuda.kernelcreation import create_cuda_kernel
def periodic_pdf_gpu_copy_kernel(pdf_field, src_slice, dst_slice, domain_size=None):
"""Generate a GPU kernel which copies all values from one slice of a field
to another non-overlapping slice."""
from pystencils import create_kernel
pdf_idx = src_slice[-1]
assert isinstance(pdf_idx, int), "PDF index needs to be an integer constant"
......@@ -176,18 +207,28 @@ def periodic_pdf_copy_kernel(pdf_field, src_slice, dst_slice,
def _stop(s):
return s.stop if isinstance(s, slice) else s
offset = [_start(s1) - _start(s2) for s1, s2 in zip(normalized_from_slice, normalized_to_slice)]
assert offset == [_stop(s1) - _stop(s2) for s1, s2 in zip(normalized_from_slice, normalized_to_slice)], \
"Slices have to have same size"
copy_eq = AssignmentCollection(main_assignments=[Assignment(pdf_field(pdf_idx), pdf_field[tuple(offset)](pdf_idx))])
config = CreateKernelConfig(iteration_slice=dst_slice, skip_independence_check=True)
ast = create_cuda_kernel(copy_eq, config=config)
if target == Target.GPU:
from pystencils.gpucuda import make_python_function
return make_python_function(ast)
else:
raise ValueError('Invalid target:', target)
offset = [
_start(s1) - _start(s2)
for s1, s2 in zip(normalized_from_slice, normalized_to_slice)
]
assert offset == [
_stop(s1) - _stop(s2)
for s1, s2 in zip(normalized_from_slice, normalized_to_slice)
], "Slices have to have same size"
copy_eq = AssignmentCollection(
main_assignments=[
Assignment(pdf_field(pdf_idx), pdf_field[tuple(offset)](pdf_idx))
]
)
config = CreateKernelConfig(
iteration_slice=dst_slice,
skip_independence_check=True,
target=Target.GPU,
)
ast = create_kernel(copy_eq, config=config)
return ast.compile()
def _extend_dir(direction):
......@@ -196,10 +237,10 @@ def _extend_dir(direction):
elif direction[0] == 0:
for d in [-1, 0, 1]:
for rest in _extend_dir(direction[1:]):
yield (d, ) + rest
yield (d,) + rest
else:
for rest in _extend_dir(direction[1:]):
yield (direction[0], ) + rest
yield (direction[0],) + rest
def _get_neighbour_transform(direction, ghost_layers):
......
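For context on the cupy_direct_copy switch documented above, a minimal sketch of how the two copy paths are meant to be used (assuming a D3Q19 stencil and a device-resident cupy array pdf_arr; only helpers defined in this module are called):

from itertools import chain
from pystencils import Field
from lbmpy.stencils import LBStencil
from lbmpy.enums import Stencil
from lbmpy.advanced_streaming.utility import Timestep
from lbmpy.advanced_streaming.communication import (
    get_communication_slices,
    periodic_pdf_gpu_copy_kernel,
)

stencil = LBStencil(Stencil.D3Q19)
slices = get_communication_slices(
    stencil, streaming_pattern="pull", prev_timestep=Timestep.BOTH, ghost_layers=1
)
slice_pairs = list(chain.from_iterable(slices.values()))

# Path 1 (cupy_direct_copy=True): plain sliced assignment on the cupy array,
# no code generation involved:
#     for src, dst in slice_pairs:
#         pdf_arr[dst] = pdf_arr[src]

# Path 2 (cupy_direct_copy=False): generate and compile one GPU copy kernel per
# slice pair; faster per call, but compilation is costly for large stencils.
pdfs = Field.create_generic("pdfs", spatial_dimensions=stencil.D, index_shape=(stencil.Q,))
copy_kernels = [periodic_pdf_gpu_copy_kernel(pdfs, src, dst) for src, dst in slice_pairs]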
......@@ -157,7 +157,7 @@ class MuskerLaw(ImplicitWallFunctionModel):
def law(u_p, y_p):
arctan = sp.Float(5.424) * sp.atan(sp.Float(0.119760479041916168) * y_p - sp.Float(0.488023952095808383))
logarithm = (sp.Float(0.434) * sp.log((y_p + sp.Float(10.6)) ** sp.Float(9.6)
/ (y_p ** 2 - sp.Float(8.15) * y_p + sp.Float(86)) ** 2, 10))
/ (y_p ** 2 - sp.Float(8.15) * y_p + sp.Float(86)) ** 2))
return (arctan + logarithm - sp.Float(3.50727901936264842)) - u_p
u_plus = velocity_symbol / self.u_tau[0]
......
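For reference, the wall law implemented above is Musker's implicit profile. With the code's constants (0.119760479... = 2/16.7 and 0.488023952... = 8.15/16.7) and using that 0.434 ≈ 1/ln 10, so the new natural-log expression equals the base-10 form, it reads:

\[
u^{+} = 5.424\,\arctan\!\left(\frac{2\,y^{+} - 8.15}{16.7}\right)
      + \log_{10}\frac{\left(y^{+} + 10.6\right)^{9.6}}{\left((y^{+})^{2} - 8.15\,y^{+} + 86\right)^{2}}
      - 3.5073
\]

with u^+ = u / u_tau as set up in the surrounding code.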
......@@ -67,7 +67,8 @@ from lbmpy.enums import Stencil, Method, ForceModel, CollisionSpace, SubgridScal
import lbmpy.forcemodels as forcemodels
from lbmpy.fieldaccess import CollideOnlyInplaceAccessor, PdfFieldAccessor, PeriodicTwoFieldsAccessor
from lbmpy.fluctuatinglb import add_fluctuations_to_collision_rule
from lbmpy.partially_saturated_cells import add_psm_to_collision_rule, PSMConfig
from lbmpy.partially_saturated_cells import (replace_by_psm_collision_rule, PSMConfig,
add_psm_solid_collision_to_collision_rule)
from lbmpy.non_newtonian_models import add_cassons_model, CassonsParameters
from lbmpy.methods import (create_mrt_orthogonal, create_mrt_raw, create_central_moment,
create_srt, create_trt, create_trt_kbc)
......@@ -376,8 +377,8 @@ class LBMConfig:
if not self.compressible and self.method in (Method.MONOMIAL_CUMULANT, Method.CUMULANT):
raise ValueError("Incompressible cumulant-based methods are not supported (yet).")
if self.zero_centered and (self.entropic or self.fluctuating):
raise ValueError("Entropic and fluctuating methods can only be created with `zero_centered=False`.")
if self.zero_centered and self.entropic:
raise ValueError("Entropic methods can only be created with `zero_centered=False`.")
# Check or infer delta-equilibrium
if self.delta_equilibrium is not None:
......@@ -468,7 +469,7 @@ class LBMConfig:
}
if self.psm_config is not None and self.psm_config.fraction_field is not None:
self.force = [(1.0 - self.psm_config.fraction_field.center) * f for f in self.force]
self.force = [(1.0 - self.psm_config.fraction_field_symbol) * f for f in self.force]
if isinstance(self.force_model, str):
new_force_model = ForceModel[self.force_model.upper()]
......@@ -684,11 +685,6 @@ def create_lb_collision_rule(lb_method=None, lbm_config=None, lbm_optimisation=N
else:
collision_rule = lb_method.get_collision_rule(pre_simplification=pre_simplification)
if lbm_config.psm_config is not None:
if lbm_config.psm_config.fraction_field is None or lbm_config.psm_config.object_velocity_field is None:
raise ValueError("Specify a fraction and object velocity field in the PSM Config")
collision_rule = add_psm_to_collision_rule(collision_rule, lbm_config.psm_config)
if lbm_config.galilean_correction:
from lbmpy.methods.cumulantbased import add_galilean_correction
collision_rule = add_galilean_correction(collision_rule)
......@@ -706,6 +702,11 @@ def create_lb_collision_rule(lb_method=None, lbm_config=None, lbm_optimisation=N
bulk_relaxation_rate=lbm_config.relaxation_rates[1],
limiter=cumulant_limiter)
if lbm_config.psm_config is not None:
if lbm_config.psm_config.fraction_field is None or lbm_config.psm_config.object_velocity_field is None:
raise ValueError("Specify a fraction and object velocity field in the PSM Config")
collision_rule = replace_by_psm_collision_rule(collision_rule, lbm_config.psm_config)
if lbm_config.entropic:
if lbm_config.subgrid_scale_model or lbm_config.cassons:
raise ValueError("Choose either entropic, subgrid-scale or cassons")
......@@ -783,7 +784,7 @@ def create_lb_method(lbm_config=None, **params):
if lbm_config.psm_config is None:
fraction_field = None
else:
fraction_field = lbm_config.psm_config.fraction_field
fraction_field = lbm_config.psm_config.fraction_field_symbol
common_params = {
'compressible': lbm_config.compressible,
......@@ -869,49 +870,36 @@ def create_lb_method(lbm_config=None, **params):
def create_psm_update_rule(lbm_config, lbm_optimisation):
node_collection = []
# Use regular lb update rule for no overlapping particles
config_without_psm = copy.deepcopy(lbm_config)
config_without_psm.psm_config = None
# TODO: the force is still multiplied by (1.0 - self.psm_config.fraction_field.center)
# (should not harm if memory bound since self.psm_config.fraction_field.center should always be 0.0)
if lbm_config.psm_config is None:
raise ValueError("Specify a PSM Config in the LBM Config, when creating a psm update rule")
config_without_particles = copy.deepcopy(lbm_config)
config_without_particles.psm_config.max_particles_per_cell = 0
lb_update_rule = create_lb_update_rule(
lbm_config=config_without_psm, lbm_optimisation=lbm_optimisation
)
node_collection.append(
Conditional(
lbm_config.psm_config.fraction_field.center(0) <= 0.0,
Block(lb_update_rule.all_assignments),
)
)
lbm_config=config_without_particles, lbm_optimisation=lbm_optimisation)
node_collection = lb_update_rule.all_assignments
# Only one particle, i.e., no individual_fraction_field is provided
if lbm_config.psm_config.individual_fraction_field is None:
assert lbm_config.psm_config.MaxParticlesPerCell == 1
assert lbm_config.psm_config.max_particles_per_cell == 1
fraction_field = lbm_config.psm_config.fraction_field
else:
fraction_field = lbm_config.psm_config.individual_fraction_field
for p in range(lbm_config.psm_config.max_particles_per_cell):
psm_solid_collision = add_psm_solid_collision_to_collision_rule(lb_update_rule, lbm_config, p)
psm_update_rule = create_lb_update_rule(
lbm_config=lbm_config, lbm_optimisation=lbm_optimisation
)
collision_rule=psm_solid_collision, lbm_config=lbm_config, lbm_optimisation=lbm_optimisation)
node_collection.append(
Conditional(
lbm_config.psm_config.fraction_field.center(0) > 0.0,
fraction_field.center(p) > 0.0,
Block(psm_update_rule.all_assignments),
)
)
else:
for p in range(lbm_config.psm_config.MaxParticlesPerCell):
# Add psm update rule for p overlapping particles
config_with_p_particles = copy.deepcopy(lbm_config)
config_with_p_particles.psm_config.MaxParticlesPerCell = p + 1
psm_update_rule = create_lb_update_rule(
lbm_config=config_with_p_particles, lbm_optimisation=lbm_optimisation
)
node_collection.append(
Conditional(
lbm_config.psm_config.individual_fraction_field.center(p) > 0.0,
Block(psm_update_rule.all_assignments),
)
)
return NodeCollection(node_collection)
......
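A minimal usage sketch for the reworked create_psm_update_rule; the field descriptors, the SRT method choice and the config values below are illustrative assumptions, not taken from the diff:

import pystencils as ps
from lbmpy.creationfunctions import LBMConfig, LBMOptimisation, create_psm_update_rule
from lbmpy.enums import Stencil, Method
from lbmpy.stencils import LBStencil
from lbmpy.partially_saturated_cells import PSMConfig

stencil = LBStencil(Stencil.D2Q9)
# B: solid volume fraction per cell, u_obj: object velocity per cell and dimension
frac = ps.fields("B: double[2D]")
u_obj = ps.fields(f"u_obj({stencil.D}): double[2D]")

psm_config = PSMConfig(fraction_field=frac, object_velocity_field=u_obj,
                       solid_collision=1, max_particles_per_cell=1)
lbm_config = LBMConfig(stencil=stencil, method=Method.SRT,
                       relaxation_rates=[1.8], psm_config=psm_config)

# Yields a NodeCollection: the plain LB update plus one conditional solid-collision
# block per potentially overlapping particle, guarded by fraction_field(p) > 0.
update_rule = create_psm_update_rule(lbm_config, LBMOptimisation())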
......@@ -68,7 +68,7 @@ class LbmWeightInfo(CustomCodeNode):
weights = [f"(({self.weights_symbol.dtype.c_name})({str(w.evalf(17))}))" for w in lb_method.weights]
weights = ", ".join(weights)
w_sym = self.weights_symbol
code = f"const {self.weights_symbol.dtype.c_name} {w_sym.name} [] = {{{ weights }}};\n"
code = f"const {self.weights_symbol.dtype.c_name} {w_sym.name} [] = {{{weights}}};\n"
super(LbmWeightInfo, self).__init__(code, symbols_read=set(), symbols_defined={w_sym})
def weight_of_direction(self, dir_idx, lb_method=None):
......
......@@ -19,9 +19,7 @@ def add_fluctuations_to_collision_rule(collision_rule, temperature=None, amplitu
""""""
if not (temperature and not amplitudes) or (temperature and amplitudes):
raise ValueError("Fluctuating LBM: Pass either 'temperature' or 'amplitudes'.")
if collision_rule.method.conserved_quantity_computation.zero_centered_pdfs:
raise ValueError("The fluctuating LBM is not implemented for zero-centered PDF storage.")
method = collision_rule.method
if not amplitudes:
amplitudes = fluctuation_amplitude_from_temperature(method, temperature, c_s_sq)
......@@ -44,9 +42,7 @@ def fluctuation_amplitude_from_temperature(method, temperature, c_s_sq=sp.Symbol
"""Produces amplitude equations according to (2.60) and (3.54) in Schiller08"""
normalization_factors = sp.matrix_multiply_elementwise(method.moment_matrix, method.moment_matrix) * \
sp.Matrix(method.weights)
density = method.zeroth_order_equilibrium_moment_symbol
if method.conserved_quantity_computation.zero_centered_pdfs:
density += 1
density = method._cqc.density_symbol
mu = temperature * density / c_s_sq
return [sp.sqrt(mu * norm * (1 - (1 - rr) ** 2))
for norm, rr in zip(normalization_factors, method.relaxation_rates)]
......
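Written out, the amplitude relation implemented in fluctuation_amplitude_from_temperature (Schiller08, Eqs. (2.60) and (3.54)) is, with moment matrix M, lattice weights w_i, relaxation rates omega_q, and the density symbol now taken directly from the conserved-quantity computation:

\[
n_q = \sum_i M_{qi}^{2}\, w_i, \qquad
\mu = \frac{T\,\rho}{c_s^{2}}, \qquad
A_q = \sqrt{\mu\, n_q \left(1 - (1-\omega_q)^{2}\right)}
\]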
......@@ -26,7 +26,14 @@ def get_field_accesses(lb_method, pdfs, streaming_pattern, previous_timestep, pr
return field_accesses
def pdf_initialization_assignments(lb_method, density, velocity, pdfs,
def get_individual_or_common_fraction_field(psm_config):
if psm_config.individual_fraction_field is not None:
return psm_config.individual_fraction_field
else:
return psm_config.fraction_field
def pdf_initialization_assignments(lb_method, density, velocity, pdfs, psm_config=None,
streaming_pattern='pull', previous_timestep=Timestep.BOTH,
set_pre_collision_pdfs=False):
"""Assignments to initialize the pdf field with equilibrium"""
......@@ -42,10 +49,35 @@ def pdf_initialization_assignments(lb_method, density, velocity, pdfs,
setter_eqs = lb_method.get_equilibrium(conserved_quantity_equations=inp_eqs)
setter_eqs = setter_eqs.new_with_substitutions({sym: field_accesses[i]
for i, sym in enumerate(lb_method.post_collision_pdf_symbols)})
if lb_method.fraction_field is not None:
if psm_config is None:
raise ValueError("If setting up LBM with PSM, please specify a PSM config in the macroscopic setter")
else:
for equ in setter_eqs:
if equ.lhs in lb_method.first_order_equilibrium_moment_symbols:
pos = lb_method.first_order_equilibrium_moment_symbols.index(equ.lhs)
new_rhs = 0
if isinstance(equ.rhs, sp.core.Add):
for summand in equ.rhs.args:
if summand in velocity:
new_rhs += (1.0 - psm_config.fraction_field.center) * summand
else:
new_rhs += summand.subs(lb_method.fraction_field, psm_config.fraction_field.center)
else:
new_rhs += (1.0 - psm_config.fraction_field.center) * equ.rhs
fraction_field = get_individual_or_common_fraction_field(psm_config)
for p in range(psm_config.max_particles_per_cell):
new_rhs += psm_config.object_velocity_field(p * lb_method.dim + pos) * fraction_field.center(p)
setter_eqs.subexpressions.remove(equ)
setter_eqs.subexpressions.append(Assignment(equ.lhs, new_rhs))
return setter_eqs
def macroscopic_values_getter(lb_method, density, velocity, pdfs,
def macroscopic_values_getter(lb_method, density, velocity, pdfs, psm_config=None,
streaming_pattern='pull', previous_timestep=Timestep.BOTH,
use_pre_collision_pdfs=False):
......@@ -58,7 +90,28 @@ def macroscopic_values_getter(lb_method, density, velocity, pdfs,
output_spec['velocity'] = velocity
if density is not None:
output_spec['density'] = density
return cqc.output_equations_from_pdfs(field_accesses, output_spec)
getter_equ = cqc.output_equations_from_pdfs(field_accesses, output_spec)
if lb_method.fraction_field is not None:
if psm_config.fraction_field is None:
raise ValueError("If setting up LBM with PSM, please specify a PSM config in the macroscopic getter")
else:
if lb_method.force_model is not None:
for equ in getter_equ:
if equ.lhs in lb_method.force_model.symbolic_force_vector:
new_rhs = equ.rhs.subs(lb_method.fraction_field, psm_config.fraction_field.center)
getter_equ.subexpressions.remove(equ)
getter_equ.subexpressions.append(Assignment(equ.lhs, new_rhs))
for i, equ in enumerate(getter_equ.main_assignments[-lb_method.dim:]):
new_rhs = (1.0 - psm_config.fraction_field.center) * equ.rhs
fraction_field = get_individual_or_common_fraction_field(psm_config)
for p in range(psm_config.max_particles_per_cell):
new_rhs += psm_config.object_velocity_field(p * lb_method.dim + i) * fraction_field.center(p)
getter_equ.main_assignments.remove(equ)
getter_equ.main_assignments.append(Assignment(equ.lhs, new_rhs))
getter_equ.topological_sort()
return getter_equ
macroscopic_values_setter = pdf_initialization_assignments
......
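The PSM branches added to the macroscopic setter and getter above amount to the usual partially-saturated-cells velocity weighting. As a transcription of the code, with B the cell's total fraction, B_p the fraction of overlapping object p (the common fraction field is used when no individual field is given), and u^obj_p the object's velocity:

\[
u_\alpha = (1 - B)\, u^{\mathrm{fluid}}_\alpha + \sum_{p} B_p\, u^{\mathrm{obj}}_{p,\alpha}
\]

The setter applies this weighting to the first-order equilibrium moments, the getter to the velocity output assignments.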
......@@ -201,19 +201,23 @@ def create_from_equilibrium(stencil, equilibrium, conserved_quantity_computation
if cspace.collision_space == CollisionSpace.POPULATIONS:
return MomentBasedLbMethod(stencil, equilibrium, mom_to_rr_dict, conserved_quantity_computation=cqc,
force_model=force_model, zero_centered=zero_centered, fraction_field=fraction_field,
force_model=force_model, zero_centered=zero_centered,
fraction_field=fraction_field,
moment_transform_class=None)
elif cspace.collision_space == CollisionSpace.RAW_MOMENTS:
return MomentBasedLbMethod(stencil, equilibrium, mom_to_rr_dict, conserved_quantity_computation=cqc,
force_model=force_model, zero_centered=zero_centered, fraction_field=fraction_field,
force_model=force_model, zero_centered=zero_centered,
fraction_field=fraction_field,
moment_transform_class=cspace.raw_moment_transform_class)
elif cspace.collision_space == CollisionSpace.CENTRAL_MOMENTS:
return CentralMomentBasedLbMethod(stencil, equilibrium, mom_to_rr_dict, conserved_quantity_computation=cqc,
force_model=force_model, zero_centered=zero_centered,
fraction_field=fraction_field,
central_moment_transform_class=cspace.central_moment_transform_class)
elif cspace.collision_space == CollisionSpace.CUMULANTS:
return CumulantBasedLbMethod(stencil, equilibrium, mom_to_rr_dict, conserved_quantity_computation=cqc,
force_model=force_model, zero_centered=zero_centered,
fraction_field=fraction_field,
central_moment_transform_class=cspace.central_moment_transform_class,
cumulant_transform_class=cspace.cumulant_transform_class)
......@@ -334,7 +338,7 @@ def create_mrt_raw(stencil, relaxation_rates, continuous_equilibrium=True, conse
def create_central_moment(stencil, relaxation_rates, nested_moments=None,
continuous_equilibrium=True, conserved_moments=True, fraction_field=None, **kwargs):
continuous_equilibrium=True, conserved_moments=True, **kwargs):
r"""
Creates moment based LB method where the collision takes place in the central moment space.
......@@ -348,7 +352,6 @@ def create_central_moment(stencil, relaxation_rates, nested_moments=None,
continuous_equilibrium: determines if the discrete or continuous maxwellian equilibrium is
used to compute the equilibrium moments.
conserved_moments: If lower order moments are conserved or not.
fraction_field: fraction field for the PSM method
Returns:
:class:`lbmpy.methods.momentbased.CentralMomentBasedLbMethod` instance
"""
......@@ -371,8 +374,8 @@ def create_central_moment(stencil, relaxation_rates, nested_moments=None,
nested_moments = cascaded_moment_sets_literature(stencil)
rr_dict = _get_relaxation_info_dict(relaxation_rates, nested_moments, stencil.D, conserved_moments)
if fraction_field is not None:
relaxation_rates_modifier = (1.0 - fraction_field.center)
if 'fraction_field' in kwargs and kwargs['fraction_field'] is not None:
relaxation_rates_modifier = (1.0 - kwargs['fraction_field'])
rr_dict = _get_relaxation_info_dict(relaxation_rates, nested_moments, stencil.D,
relaxation_rates_modifier=relaxation_rates_modifier)
......@@ -527,7 +530,7 @@ def create_mrt_orthogonal(stencil, relaxation_rates, continuous_equilibrium=True
# ----------------------------------------- Cumulant method creators ---------------------------------------------------
def create_cumulant(stencil, relaxation_rates, cumulant_groups, conserved_moments=True, fraction_field=None, **kwargs):
def create_cumulant(stencil, relaxation_rates, cumulant_groups, conserved_moments=True, **kwargs):
r"""Creates a cumulant-based lattice Boltzmann method.
Args:
......@@ -547,8 +550,8 @@ def create_cumulant(stencil, relaxation_rates, cumulant_groups, conserved_moment
"""
cumulant_to_rr_dict = _get_relaxation_info_dict(relaxation_rates, cumulant_groups, stencil.D, conserved_moments)
if fraction_field is not None:
relaxation_rates_modifier = (1.0 - fraction_field.center)
if 'fraction_field' in kwargs and kwargs['fraction_field'] is not None:
relaxation_rates_modifier = (1.0 - kwargs['fraction_field'])
cumulant_to_rr_dict = _get_relaxation_info_dict(relaxation_rates, cumulant_groups, stencil.D,
relaxation_rates_modifier=relaxation_rates_modifier)
......
......@@ -47,7 +47,7 @@ class CumulantBasedLbMethod(AbstractLbMethod):
def __init__(self, stencil, equilibrium, relaxation_dict,
conserved_quantity_computation=None,
force_model=None, zero_centered=False,
force_model=None, zero_centered=False, fraction_field=None,
central_moment_transform_class=BinomialChimeraTransform,
cumulant_transform_class=CentralMomentsToCumulantsByGeneratingFunc):
assert isinstance(conserved_quantity_computation,
......@@ -63,6 +63,7 @@ class CumulantBasedLbMethod(AbstractLbMethod):
self._cqc = conserved_quantity_computation
self._force_model = force_model
self._zero_centered = zero_centered
self._fraction_field = fraction_field
self._weights = None
self._cumulant_transform_class = cumulant_transform_class
self._central_moment_transform_class = central_moment_transform_class
......@@ -72,6 +73,10 @@ class CumulantBasedLbMethod(AbstractLbMethod):
"""Force model employed by this method."""
return self._force_model
@property
def fraction_field(self):
return self._fraction_field
@property
def relaxation_info_dict(self):
"""Dictionary mapping this method's cumulants to their relaxation rates and equilibrium values.
......
......@@ -55,7 +55,7 @@ class CentralMomentBasedLbMethod(AbstractLbMethod):
def __init__(self, stencil, equilibrium, relaxation_dict,
conserved_quantity_computation=None,
force_model=None, zero_centered=False,
force_model=None, zero_centered=False, fraction_field=None,
central_moment_transform_class=BinomialChimeraTransform):
assert isinstance(conserved_quantity_computation, AbstractConservedQuantityComputation)
super(CentralMomentBasedLbMethod, self).__init__(stencil)
......@@ -65,6 +65,7 @@ class CentralMomentBasedLbMethod(AbstractLbMethod):
self._cqc = conserved_quantity_computation
self._force_model = force_model
self._zero_centered = zero_centered
self._fraction_field = fraction_field
self._weights = None
self._central_moment_transform_class = central_moment_transform_class
......@@ -73,6 +74,10 @@ class CentralMomentBasedLbMethod(AbstractLbMethod):
"""Force model employed by this method."""
return self._force_model
@property
def fraction_field(self):
return self._fraction_field
@property
def relaxation_info_dict(self):
"""Dictionary mapping this method's moments to their relaxation rates and equilibrium values.
......
......@@ -48,7 +48,7 @@ class MomentBasedLbMethod(AbstractLbMethod):
self._cqc = conserved_quantity_computation
self._force_model = force_model
self._zero_centered = zero_centered
self.fraction_field = fraction_field
self._fraction_field = fraction_field
self._weights = None
self._moment_transform_class = moment_transform_class
......@@ -57,6 +57,10 @@ class MomentBasedLbMethod(AbstractLbMethod):
"""Force model employed by this method."""
return self._force_model
@property
def fraction_field(self):
return self._fraction_field
@property
def relaxation_info_dict(self):
"""Dictionary mapping this method's moments to their relaxation rates and equilibrium values.
......@@ -176,8 +180,8 @@ class MomentBasedLbMethod(AbstractLbMethod):
def get_collision_rule(self, conserved_quantity_equations: AssignmentCollection = None,
pre_simplification: bool = True) -> LbmCollisionRule:
if self.fraction_field is not None:
relaxation_rates_modifier = (1.0 - self.fraction_field.center)
if self._fraction_field is not None:
relaxation_rates_modifier = (1.0 - self._fraction_field)
rr_sub_expressions, d = self._generate_symbolic_relaxation_matrix(
relaxation_rates_modifier=relaxation_rates_modifier)
else:
......
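The relaxation-rate modifier used here ties in with the solid collision terms added elsewhere in this changeset: scaling the fluid relaxation by the fluid fraction gives a Noble-Torczynski style combined update (a reading of the code, with B the cell's object fraction):

\[
f_i' = f_i + (1 - B)\,\Omega^{F}_i + \sum_p B_p\,\Omega^{S,p}_i
\]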
......@@ -172,7 +172,7 @@ class PdfsToMomentsByMatrixTransform(AbstractRawMomentTransform):
# ----------------------------- Private Members -----------------------------
@ property
@property
def _default_simplification(self):
forward_simp = SimplificationStrategy()
# forward_simp.add(substitute_moments_in_conserved_quantity_equations)
......@@ -218,7 +218,7 @@ class PdfsToMomentsByChimeraTransform(AbstractRawMomentTransform):
self.moment_polynomials)
self.poly_to_mono_matrix = self.mono_to_poly_matrix.inv()
@ property
@property
def absorbs_conserved_quantity_equations(self):
return True
......@@ -414,7 +414,7 @@ class PdfsToMomentsByChimeraTransform(AbstractRawMomentTransform):
# ----------------------------- Private Members -----------------------------
@ property
@property
def _default_simplification(self):
from lbmpy.methods.momentbased.momentbasedsimplifications import (
substitute_moments_in_conserved_quantity_equations,
......
import sympy as sp
from dataclasses import dataclass
from lbmpy.enums import Method
from lbmpy.methods.abstractlbmethod import LbmCollisionRule
from pystencils import Assignment, AssignmentCollection
from pystencils.field import Field
......@@ -13,103 +14,156 @@ class PSMConfig:
Fraction field for PSM
"""
fraction_field_symbol = sp.Symbol('B')
"""
Fraction field symbol used for simplification
"""
object_velocity_field: Field = None
"""
Object velocity field for PSM
"""
SC: int = 1
solid_collision: int = 1
"""
Solid collision option for PSM
"""
MaxParticlesPerCell: int = 1
max_particles_per_cell: int = 1
"""
Maximum number of particles overlapping with a cell
"""
individual_fraction_field: Field = None
"""
Fraction field for each overlapping particle in PSM
Fraction field for each overlapping object / particle in PSM
"""
particle_force_field: Field = None
object_force_field: Field = None
"""
Force field for each overlapping particle in PSM
Force field for each overlapping object / particle in PSM
"""
def add_psm_to_collision_rule(collision_rule, psm_config):
def get_psm_solid_collision_term(collision_rule, psm_config, particle_per_cell_counter):
if psm_config.individual_fraction_field is None:
psm_config.individual_fraction_field = psm_config.fraction_field
fraction_field = psm_config.fraction_field
else:
fraction_field = psm_config.individual_fraction_field
method = collision_rule.method
pre_collision_pdf_symbols = method.pre_collision_pdf_symbols
stencil = method.stencil
# Get equilibrium from object velocity for solid collision
forces_rhs = [0] * psm_config.MaxParticlesPerCell * stencil.D
solid_collisions = [0] * stencil.Q
for p in range(psm_config.MaxParticlesPerCell):
equilibrium_fluid = method.get_equilibrium_terms()
equilibrium_solid = []
for eq in equilibrium_fluid:
eq_sol = eq
for i in range(stencil.D):
eq_sol = eq_sol.subs(sp.Symbol("u_" + str(i)),
psm_config.object_velocity_field.center(p * stencil.D + i), )
equilibrium_solid.append(eq_sol)
# Build solid collision
for i, (eqFluid, eqSolid, f, offset) in enumerate(
zip(equilibrium_fluid, equilibrium_solid, pre_collision_pdf_symbols, stencil)):
inverse_direction_index = stencil.stencil_entries.index(stencil.inverse_stencil_entries[i])
if psm_config.SC == 1:
solid_collision = psm_config.individual_fraction_field.center(p) * (
(
pre_collision_pdf_symbols[inverse_direction_index]
- equilibrium_fluid[inverse_direction_index]
)
- (f - eqSolid)
)
elif psm_config.SC == 2:
# TODO get relaxation rate vector from method and use the right relaxation rate [i]
solid_collision = psm_config.individual_fraction_field.center(p) * (
(eqSolid - f) + (1.0 - method.relaxation_rates[0]) * (f - eqFluid)
)
elif psm_config.SC == 3:
solid_collision = psm_config.individual_fraction_field.center(p) * (
(
pre_collision_pdf_symbols[inverse_direction_index]
- equilibrium_solid[inverse_direction_index]
)
- (f - eqSolid)
)
else:
raise ValueError("Only SC=1, SC=2 and SC=3 are supported.")
solid_collisions[i] += solid_collision
for j in range(stencil.D):
forces_rhs[p * stencil.D + j] -= solid_collision * int(offset[j])
# Add solid collision to main assignments of collision rule
equilibrium_fluid = method.get_equilibrium_terms()
equilibrium_solid = []
# get equilibrium form object velocity
for eq in equilibrium_fluid:
eq_sol = eq
for i in range(stencil.D):
eq_sol = eq_sol.subs(sp.Symbol("u_" + str(i)),
psm_config.object_velocity_field.center(particle_per_cell_counter * stencil.D + i), )
equilibrium_solid.append(eq_sol)
# Build solid collision
for i, (eqFluid, eqSolid, f, offset) in enumerate(
zip(equilibrium_fluid, equilibrium_solid, pre_collision_pdf_symbols, stencil)):
inverse_direction_index = stencil.stencil_entries.index(stencil.inverse_stencil_entries[i])
if psm_config.solid_collision == 1:
solid_collision = (fraction_field.center(particle_per_cell_counter)
* ((pre_collision_pdf_symbols[inverse_direction_index]
- equilibrium_fluid[inverse_direction_index]) - (f - eqSolid)))
elif psm_config.solid_collision == 2:
# TODO get relaxation rate vector from method and use the right relaxation rate [i]
solid_collision = (fraction_field.center(particle_per_cell_counter)
* ((eqSolid - f) + (1.0 - method.relaxation_rates[0]) * (f - eqFluid)))
elif psm_config.solid_collision == 3:
solid_collision = (fraction_field.center(particle_per_cell_counter)
* ((pre_collision_pdf_symbols[inverse_direction_index]
- equilibrium_solid[inverse_direction_index]) - (f - eqSolid)))
else:
raise ValueError("Only sc=1, sc=2 and sc=3 are supported.")
solid_collisions[i] += solid_collision
return solid_collisions
def get_psm_force_from_solid_collision(solid_collisions, stencil, object_force_field, particle_per_cell_counter):
force_assignments = []
for d in range(stencil.D):
forces_rhs = 0
for sc, offset in zip(solid_collisions, stencil):
forces_rhs -= sc * int(offset[d])
force_assignments.append(Assignment(
object_force_field.center(particle_per_cell_counter * stencil.D + d), forces_rhs
))
return AssignmentCollection(force_assignments)
def replace_fraction_symbol_with_field(assignments, fraction_field_symbol, fraction_field_access):
new_assignments = []
for ass in assignments:
rhs = ass.rhs.subs(fraction_field_symbol, fraction_field_access.center(0))
new_assignments.append(Assignment(ass.lhs, rhs))
return new_assignments
def add_psm_solid_collision_to_collision_rule(collision_rule, lbm_config, particle_per_cell_counter):
method = collision_rule.method
solid_collisions = get_psm_solid_collision_term(collision_rule, lbm_config.psm_config, particle_per_cell_counter)
post_collision_pdf_symbols = method.post_collision_pdf_symbols
assignments = []
for sc, post in zip(solid_collisions, post_collision_pdf_symbols):
assignments.append(Assignment(post, post + sc))
if lbm_config.psm_config.object_force_field is not None:
assignments += get_psm_force_from_solid_collision(solid_collisions, method.stencil,
lbm_config.psm_config.object_force_field,
particle_per_cell_counter)
# exchanging rho with zeroth order moment symbol
if lbm_config.method in (Method.CENTRAL_MOMENT, Method.MONOMIAL_CUMULANT, Method.CUMULANT):
new_assignments = []
zeroth_moment_symbol = 'm_00' if lbm_config.stencil.D == 2 else 'm_000'
for ass in assignments:
new_assignments.append(ass.subs(sp.Symbol('rho'), sp.Symbol(zeroth_moment_symbol)))
assignments = new_assignments
collision_assignments = AssignmentCollection(assignments)
ac = LbmCollisionRule(method, collision_assignments, [],
collision_rule.simplification_hints)
return ac
def replace_by_psm_collision_rule(collision_rule, psm_config):
method = collision_rule.method
collision_assignments = []
for main, sc in zip(collision_rule.main_assignments, solid_collisions):
collision_assignments.append(Assignment(main.lhs, main.rhs + sc))
# Add hydrodynamic force calculations to collision assignments if two-way coupling is used
# (i.e., force field is not None)
if psm_config.particle_force_field is not None:
for p in range(psm_config.MaxParticlesPerCell):
for i in range(stencil.D):
collision_assignments.append(
Assignment(
psm_config.particle_force_field.center(p * stencil.D + i),
forces_rhs[p * stencil.D + i],
)
)
solid_collisions = [0] * psm_config.max_particles_per_cell
for p in range(psm_config.max_particles_per_cell):
solid_collisions[p] = get_psm_solid_collision_term(collision_rule, psm_config, p)
if psm_config.object_force_field is not None:
collision_assignments += get_psm_force_from_solid_collision(solid_collisions[p], method.stencil,
psm_config.object_force_field, p)
for i, main in enumerate(collision_rule.main_assignments):
rhs = main.rhs
for p in range(psm_config.max_particles_per_cell):
rhs += solid_collisions[p][i]
collision_assignments.append(Assignment(main.lhs, rhs))
collision_assignments = AssignmentCollection(collision_assignments)
ac = LbmCollisionRule(method, collision_assignments, collision_rule.subexpressions,
ac = LbmCollisionRule(method, replace_fraction_symbol_with_field(collision_assignments,
psm_config.fraction_field_symbol, psm_config.fraction_field),
replace_fraction_symbol_with_field(collision_rule.subexpressions,
psm_config.fraction_field_symbol, psm_config.fraction_field),
collision_rule.simplification_hints)
ac.topological_sort()
return ac
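Written out, the three solid_collision variants assembled in get_psm_solid_collision_term are (a transcription of the code, with f_i the pre-collision populations, \bar{i} the inverse direction, B_p the fraction of overlapping object p, f^{eq,F} and f^{eq,S} the equilibria evaluated at the fluid and the object velocity, and omega the method's first relaxation rate):

\[
\Omega^{S,p}_i =
\begin{cases}
B_p\left[\left(f_{\bar i} - f^{\mathrm{eq},F}_{\bar i}\right) - \left(f_i - f^{\mathrm{eq},S}_i\right)\right] & \text{solid\_collision} = 1\\[4pt]
B_p\left[\left(f^{\mathrm{eq},S}_i - f_i\right) + (1-\omega)\left(f_i - f^{\mathrm{eq},F}_i\right)\right] & \text{solid\_collision} = 2\\[4pt]
B_p\left[\left(f_{\bar i} - f^{\mathrm{eq},S}_{\bar i}\right) - \left(f_i - f^{\mathrm{eq},S}_i\right)\right] & \text{solid\_collision} = 3
\end{cases}
\]

and get_psm_force_from_solid_collision accumulates the hydrodynamic force on object p as

\[
F_{p,\alpha} = -\sum_i \Omega^{S,p}_i\, c_{i\alpha}
\]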
......@@ -120,6 +120,26 @@ def create_stream_pull_with_output_kernel(lb_method, src_field, dst_field=None,
simplification_hints=output_eq_collection.simplification_hints)
def create_copy_kernel(stencil, src_field, dst_field, accessor=StreamPullTwoFieldsAccessor()):
"""Creates a copy kernel, which can be used to transfer information from src to dst field.
Args:
stencil: lattice Boltzmann stencil which is used in the form of a tuple of tuples
src_field: field used for reading pdf values
dst_field: field used for writing pdf values
accessor: instance of PdfFieldAccessor, defining where to read and write values
to create e.g. a fused stream-collide kernel. See 'fieldaccess.PdfFieldAccessor'
Returns:
AssignmentCollection of a copy update rule
"""
temporary_symbols = sp.symbols(f'copied:{stencil.Q}')
subexpressions = [Assignment(tmp, acc) for tmp, acc in zip(temporary_symbols, accessor.write(src_field, stencil))]
main_assignments = [Assignment(acc, tmp) for acc, tmp in zip(accessor.write(dst_field, stencil), temporary_symbols)]
return AssignmentCollection(main_assignments, subexpressions=subexpressions)
# ---------------------------------- Pdf array creation for various layouts --------------------------------------------
......
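A minimal usage sketch for the new create_copy_kernel; the field descriptors and the module path lbmpy.updatekernels are assumptions for illustration:

import pystencils as ps
from lbmpy.stencils import LBStencil
from lbmpy.enums import Stencil
from lbmpy.updatekernels import create_copy_kernel

stencil = LBStencil(Stencil.D2Q9)
src, dst = ps.fields(f"src({stencil.Q}), dst({stencil.Q}): double[2D]")

# The returned AssignmentCollection reads every pdf into a temporary and writes
# it to the destination field, so it can be compiled like any other update rule.
copy_ac = create_copy_kernel(stencil, src, dst)
copy_kernel = ps.create_kernel(copy_ac).compile()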
......@@ -5,36 +5,52 @@ import numpy as np
from lbmpy.stencils import LBStencil
from pystencils.slicing import get_slice_before_ghost_layer, get_ghost_region_slice
from lbmpy.creationfunctions import create_lb_update_rule, LBMConfig, LBMOptimisation
from lbmpy.advanced_streaming.communication import get_communication_slices, _fix_length_one_slices, \
LBMPeriodicityHandling
from lbmpy.advanced_streaming.communication import (
get_communication_slices,
_fix_length_one_slices,
LBMPeriodicityHandling,
periodic_pdf_gpu_copy_kernel,
)
from lbmpy.advanced_streaming.utility import streaming_patterns, Timestep
from lbmpy.enums import Stencil
import pytest
@pytest.mark.parametrize('stencil', [Stencil.D2Q9, Stencil.D3Q15, Stencil.D3Q19, Stencil.D3Q27])
@pytest.mark.parametrize('streaming_pattern', streaming_patterns)
@pytest.mark.parametrize('timestep', [Timestep.EVEN, Timestep.ODD])
@pytest.mark.parametrize(
"stencil", [Stencil.D2Q9, Stencil.D3Q15, Stencil.D3Q19, Stencil.D3Q27]
)
@pytest.mark.parametrize("streaming_pattern", streaming_patterns)
@pytest.mark.parametrize("timestep", [Timestep.EVEN, Timestep.ODD])
def test_slices_not_empty(stencil, streaming_pattern, timestep):
stencil = LBStencil(stencil)
arr = np.zeros((4,) * stencil.D + (stencil.Q,))
slices = get_communication_slices(stencil, streaming_pattern=streaming_pattern, prev_timestep=timestep,
ghost_layers=1)
slices = get_communication_slices(
stencil,
streaming_pattern=streaming_pattern,
prev_timestep=timestep,
ghost_layers=1,
)
for _, slices_list in slices.items():
for src, dst in slices_list:
assert all(s != 0 for s in arr[src].shape)
assert all(s != 0 for s in arr[dst].shape)
@pytest.mark.parametrize('stencil', [Stencil.D2Q9, Stencil.D3Q15, Stencil.D3Q19, Stencil.D3Q27])
@pytest.mark.parametrize('streaming_pattern', streaming_patterns)
@pytest.mark.parametrize('timestep', [Timestep.EVEN, Timestep.ODD])
@pytest.mark.parametrize(
"stencil", [Stencil.D2Q9, Stencil.D3Q15, Stencil.D3Q19, Stencil.D3Q27]
)
@pytest.mark.parametrize("streaming_pattern", streaming_patterns)
@pytest.mark.parametrize("timestep", [Timestep.EVEN, Timestep.ODD])
def test_src_dst_same_shape(stencil, streaming_pattern, timestep):
stencil = LBStencil(stencil)
arr = np.zeros((4,) * stencil.D + (stencil.Q,))
slices = get_communication_slices(stencil, streaming_pattern=streaming_pattern, prev_timestep=timestep,
ghost_layers=1)
slices = get_communication_slices(
stencil,
streaming_pattern=streaming_pattern,
prev_timestep=timestep,
ghost_layers=1,
)
for _, slices_list in slices.items():
for src, dst in slices_list:
src_shape = arr[src].shape
......@@ -42,12 +58,15 @@ def test_src_dst_same_shape(stencil, streaming_pattern, timestep):
assert src_shape == dst_shape
@pytest.mark.parametrize('stencil', [Stencil.D2Q9, Stencil.D3Q15, Stencil.D3Q19, Stencil.D3Q27])
@pytest.mark.parametrize(
"stencil", [Stencil.D2Q9, Stencil.D3Q15, Stencil.D3Q19, Stencil.D3Q27]
)
def test_pull_communication_slices(stencil):
stencil = LBStencil(stencil)
slices = get_communication_slices(
stencil, streaming_pattern='pull', prev_timestep=Timestep.BOTH, ghost_layers=1)
stencil, streaming_pattern="pull", prev_timestep=Timestep.BOTH, ghost_layers=1
)
for i, d in enumerate(stencil):
if i == 0:
continue
......@@ -58,21 +77,115 @@ def test_pull_communication_slices(stencil):
dst = s[1][:-1]
break
inner_slice = _fix_length_one_slices(get_slice_before_ghost_layer(d, ghost_layers=1))
inner_slice = _fix_length_one_slices(
get_slice_before_ghost_layer(d, ghost_layers=1)
)
inv_dir = (-e for e in d)
gl_slice = _fix_length_one_slices(get_ghost_region_slice(inv_dir, ghost_layers=1))
gl_slice = _fix_length_one_slices(
get_ghost_region_slice(inv_dir, ghost_layers=1)
)
assert src == inner_slice
assert dst == gl_slice
@pytest.mark.parametrize('stencil_name', [Stencil.D2Q9, Stencil.D3Q15, Stencil.D3Q19, Stencil.D3Q27])
@pytest.mark.parametrize("direction", LBStencil(Stencil.D3Q27).stencil_entries)
@pytest.mark.parametrize("pull", [False, True])
def test_gpu_comm_kernels(direction: tuple, pull: bool):
pytest.importorskip("cupy")
stencil = LBStencil(Stencil.D3Q27)
inv_dir = stencil[stencil.inverse_index(direction)]
target = ps.Target.GPU
domain_size = (4,) * stencil.D
dh: ps.datahandling.SerialDataHandling = ps.create_data_handling(
domain_size,
periodicity=(True,) * stencil.D,
parallel=False,
default_target=target,
)
field = dh.add_array("field", values_per_cell=2)
if pull:
dst_slice = get_ghost_region_slice(inv_dir)
src_slice = get_slice_before_ghost_layer(direction)
else:
dst_slice = get_slice_before_ghost_layer(direction)
src_slice = get_ghost_region_slice(inv_dir)
src_slice += (1,)
dst_slice += (1,)
kernel = periodic_pdf_gpu_copy_kernel(field, src_slice, dst_slice)
dh.cpu_arrays[field.name][src_slice] = 42.0
dh.all_to_gpu()
dh.run_kernel(kernel)
dh.all_to_cpu()
np.testing.assert_equal(dh.cpu_arrays[field.name][dst_slice], 42.0)
@pytest.mark.parametrize("stencil", [Stencil.D2Q9, Stencil.D3Q19])
@pytest.mark.parametrize("streaming_pattern", streaming_patterns)
def test_direct_copy_and_kernels_equivalence(stencil: Stencil, streaming_pattern: str):
pytest.importorskip("cupy")
target = ps.Target.GPU
stencil = LBStencil(stencil)
domain_size = (4,) * stencil.D
dh: ps.datahandling.SerialDataHandling = ps.create_data_handling(
domain_size,
periodicity=(True,) * stencil.D,
parallel=False,
default_target=target,
)
pdfs_a = dh.add_array("pdfs_a", values_per_cell=stencil.Q)
pdfs_b = dh.add_array("pdfs_b", values_per_cell=stencil.Q)
dh.fill(pdfs_a.name, 0.0, ghost_layers=True)
dh.fill(pdfs_b.name, 0.0, ghost_layers=True)
for q in range(stencil.Q):
sl = ps.make_slice[:4, :4, q] if stencil.D == 2 else ps.make_slice[:4, :4, :4, q]
dh.cpu_arrays[pdfs_a.name][sl] = q
dh.cpu_arrays[pdfs_b.name][sl] = q
dh.all_to_gpu()
direct_copy = LBMPeriodicityHandling(stencil, dh, pdfs_a.name, streaming_pattern, cupy_direct_copy=True)
kernels_copy = LBMPeriodicityHandling(stencil, dh, pdfs_b.name, streaming_pattern, cupy_direct_copy=False)
direct_copy(Timestep.EVEN)
kernels_copy(Timestep.EVEN)
dh.all_to_cpu()
np.testing.assert_equal(
dh.cpu_arrays[pdfs_a.name],
dh.cpu_arrays[pdfs_b.name]
)
@pytest.mark.parametrize(
"stencil_name", [Stencil.D2Q9, Stencil.D3Q15, Stencil.D3Q19, Stencil.D3Q27]
)
def test_optimised_and_full_communication_equivalence(stencil_name):
target = ps.Target.CPU
stencil = LBStencil(stencil_name)
domain_size = (4, ) * stencil.D
domain_size = (4,) * stencil.D
dh = ps.create_data_handling(domain_size, periodicity=(True, ) * stencil.D,
parallel=False, default_target=target)
dh = ps.create_data_handling(
domain_size,
periodicity=(True,) * stencil.D,
parallel=False,
default_target=target,
)
pdf = dh.add_array("pdf", values_per_cell=len(stencil), dtype=np.int64)
dh.fill("pdf", 0, ghost_layers=True)
......@@ -82,9 +195,9 @@ def test_optimised_and_full_communication_equivalence(stencil_name):
gl = dh.ghost_layers_of_field("pdf")
num = 0
for idx, x in np.ndenumerate(dh.cpu_arrays['pdf']):
dh.cpu_arrays['pdf'][idx] = num
dh.cpu_arrays['pdf_tmp'][idx] = num
for idx, x in np.ndenumerate(dh.cpu_arrays["pdf"]):
dh.cpu_arrays["pdf"][idx] = num
dh.cpu_arrays["pdf_tmp"][idx] = num
num += 1
lbm_config = LBMConfig(stencil=stencil, kernel_type="stream_pull_only")
......@@ -95,21 +208,27 @@ def test_optimised_and_full_communication_equivalence(stencil_name):
ast = ps.create_kernel(ac, config=config)
stream = ast.compile()
full_communication = dh.synchronization_function(pdf.name, target=dh.default_target, optimization={"openmp": True})
full_communication = dh.synchronization_function(
pdf.name, target=dh.default_target, optimization={"openmp": True}
)
full_communication()
dh.run_kernel(stream)
dh.swap("pdf", "pdf_tmp")
pdf_full_communication = np.copy(dh.cpu_arrays['pdf'])
pdf_full_communication = np.copy(dh.cpu_arrays["pdf"])
num = 0
for idx, x in np.ndenumerate(dh.cpu_arrays['pdf']):
dh.cpu_arrays['pdf'][idx] = num
dh.cpu_arrays['pdf_tmp'][idx] = num
for idx, x in np.ndenumerate(dh.cpu_arrays["pdf"]):
dh.cpu_arrays["pdf"][idx] = num
dh.cpu_arrays["pdf_tmp"][idx] = num
num += 1
optimised_communication = LBMPeriodicityHandling(stencil=stencil, data_handling=dh, pdf_field_name=pdf.name,
streaming_pattern='pull')
optimised_communication = LBMPeriodicityHandling(
stencil=stencil,
data_handling=dh,
pdf_field_name=pdf.name,
streaming_pattern="pull",
)
optimised_communication()
dh.run_kernel(stream)
dh.swap("pdf", "pdf_tmp")
......@@ -119,9 +238,14 @@ def test_optimised_and_full_communication_equivalence(stencil_name):
for j in range(gl, domain_size[1]):
for k in range(gl, domain_size[2]):
for f in range(len(stencil)):
assert dh.cpu_arrays['pdf'][i, j, k, f] == pdf_full_communication[i, j, k, f], print(f)
assert (
dh.cpu_arrays["pdf"][i, j, k, f]
== pdf_full_communication[i, j, k, f]
), print(f)
else:
for i in range(gl, domain_size[0]):
for j in range(gl, domain_size[1]):
for f in range(len(stencil)):
assert dh.cpu_arrays['pdf'][i, j, f] == pdf_full_communication[i, j, f]
assert (
dh.cpu_arrays["pdf"][i, j, f] == pdf_full_communication[i, j, f]
)
......@@ -5,13 +5,16 @@ import pytest
import pystencils as ps
from pystencils import get_code_str
from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets, get_vector_instruction_set
from pystencils.backends.simd_instruction_sets import (
get_supported_instruction_sets,
get_vector_instruction_set,
)
from pystencils.cpu.cpujit import get_compiler_config
from pystencils.enums import Target
from pystencils.rng import PhiloxTwoDoubles
from lbmpy.creationfunctions import *
from lbmpy.forcemodels import Guo
from lbmpy.forcemodels import Guo
from lbmpy.macroscopic_value_kernels import macroscopic_values_setter
import numpy as np
from lbmpy.enums import Stencil
......@@ -20,13 +23,18 @@ from lbmpy.stencils import LBStencil
def _skip_instruction_sets_windows(instruction_sets):
if get_compiler_config()['os'] == 'windows':
if get_compiler_config()["os"] == "windows":
# skip instruction sets supported by the CPU but not by the compiler
if 'avx' in instruction_sets and ('/arch:avx2' not in get_compiler_config()['flags'].lower()
and '/arch:avx512' not in get_compiler_config()['flags'].lower()):
instruction_sets.remove('avx')
if 'avx512' in instruction_sets and '/arch:avx512' not in get_compiler_config()['flags'].lower():
instruction_sets.remove('avx512')
if "avx" in instruction_sets and (
"/arch:avx2" not in get_compiler_config()["flags"].lower()
and "/arch:avx512" not in get_compiler_config()["flags"].lower()
):
instruction_sets.remove("avx")
if (
"avx512" in instruction_sets
and "/arch:avx512" not in get_compiler_config()["flags"].lower()
):
instruction_sets.remove("avx512")
return instruction_sets
......@@ -35,11 +43,13 @@ def single_component_maxwell(x1, x2, kT, mass):
x = np.linspace(x1, x2, 1000)
try:
trapezoid = np.trapezoid # since numpy 2.0
trapezoid = np.trapezoid # since numpy 2.0
except AttributeError:
trapezoid = np.trapz
return trapezoid(np.exp(-mass * x ** 2 / (2. * kT)), x) / np.sqrt(2. * np.pi * kT / mass)
return trapezoid(np.exp(-mass * x**2 / (2.0 * kT)), x) / np.sqrt(
2.0 * np.pi * kT / mass
)
def rr_getter(moment_group):
......@@ -71,53 +81,86 @@ def second_order_moment_tensor_assignments(function_values, stencil, output_fiel
"""Assignments for calculating the pressure tensor"""
assert len(function_values) == len(stencil)
dim = len(stencil[0])
return [ps.Assignment(output_field(i, j),
sum(c[i] * c[j] * f for f, c in zip(function_values, stencil)))
for i in range(dim) for j in range(dim)]
return [
ps.Assignment(
output_field(i, j),
sum(c[i] * c[j] * f for f, c in zip(function_values, stencil)),
)
for i in range(dim)
for j in range(dim)
]
def add_pressure_output_to_collision_rule(collision_rule, pressure_field):
pressure_ouput = second_order_moment_tensor_assignments(collision_rule.method.pre_collision_pdf_symbols,
collision_rule.method.stencil, pressure_field)
pressure_ouput = second_order_moment_tensor_assignments(
collision_rule.method.pre_collision_pdf_symbols,
collision_rule.method.stencil,
pressure_field,
)
collision_rule.main_assignments = collision_rule.main_assignments + pressure_ouput
def get_fluctuating_lb(size=None, kT=None,
omega_shear=None, omega_bulk=None, omega_odd=None, omega_even=None,
rho_0=None, target=None):
def get_fluctuating_lb(
size=None,
kT=None,
omega_shear=None,
omega_bulk=None,
omega_odd=None,
omega_even=None,
rho_0=None,
target=None,
zero_centered: bool = False,
):
# Parameters
stencil = LBStencil(Stencil.D3Q19)
# Setup data handling
dh = ps.create_data_handling((size,) * stencil.D, periodicity=True, default_target=target)
src = dh.add_array('src', values_per_cell=stencil.Q, layout='f')
dst = dh.add_array_like('dst', 'src')
rho = dh.add_array('rho', layout='f', latex_name='\\rho', values_per_cell=1)
u = dh.add_array('u', values_per_cell=dh.dim, layout='f')
pressure_field = dh.add_array('pressure', values_per_cell=(
3, 3), layout='f', gpu=target == Target.GPU)
dh = ps.create_data_handling(
(size,) * stencil.D, periodicity=True, default_target=target
)
src = dh.add_array("src", values_per_cell=stencil.Q, layout="f")
dst = dh.add_array_like("dst", "src")
rho = dh.add_array("rho", layout="f", latex_name="\\rho", values_per_cell=1)
u = dh.add_array("u", values_per_cell=dh.dim, layout="f")
pressure_field = dh.add_array(
"pressure", values_per_cell=(3, 3), layout="f", gpu=target == Target.GPU
)
force_field = dh.add_array(
'force', values_per_cell=stencil.D, layout='f', gpu=target == Target.GPU)
"force", values_per_cell=stencil.D, layout="f", gpu=target == Target.GPU
)
# Method setup
lbm_config = LBMConfig(stencil=stencil, method=Method.MRT, compressible=True,
weighted=True, zero_centered=False, relaxation_rates=rr_getter,
force_model=Guo(force=force_field.center_vector),
fluctuating={'temperature': kT},
kernel_type='collide_only')
lbm_config = LBMConfig(
stencil=stencil,
method=Method.MRT,
compressible=True,
weighted=True,
zero_centered=zero_centered,
relaxation_rates=rr_getter,
force_model=Guo(force=force_field.center_vector),
fluctuating={"temperature": kT},
kernel_type="collide_only",
)
lb_method = create_lb_method(lbm_config=lbm_config)
lbm_config.lb_method = lb_method
lbm_opt = LBMOptimisation(symbolic_field=src, cse_global=True)
collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt)
add_pressure_output_to_collision_rule(collision_rule, pressure_field)
collision = create_lb_update_rule(collision_rule=collision_rule,
lbm_config=lbm_config, lbm_optimisation=lbm_opt)
stream = create_stream_pull_with_output_kernel(collision.method, src, dst,
{'density': rho, 'velocity': u})
lbm_opt = LBMOptimisation(symbolic_field=src, cse_global=True)
collision_rule = create_lb_collision_rule(
lbm_config=lbm_config, lbm_optimisation=lbm_opt
)
# add_pressure_output_to_collision_rule(collision_rule, pressure_field)
collision = create_lb_update_rule(
collision_rule=collision_rule, lbm_config=lbm_config, lbm_optimisation=lbm_opt
)
stream = create_stream_pull_with_output_kernel(
collision.method,
src,
dst,
{"density": rho, "velocity": u, "moment2": pressure_field},
)
config = ps.CreateKernelConfig(cpu_openmp=False, target=dh.default_target)
......@@ -128,15 +171,18 @@ def get_fluctuating_lb(size=None, kT=None,
sync_pdfs = dh.synchronization_function([src.name])
# Initialization
init = macroscopic_values_setter(collision.method, velocity=(0,) * dh.dim,
pdfs=src.center_vector, density=rho.center)
init = macroscopic_values_setter(
collision.method,
velocity=(0,) * dh.dim,
pdfs=src.center_vector,
density=rho_0
)
init_kernel = ps.create_kernel(init, ghost_layers=0).compile()
dh.fill(rho.name, rho_0)
dh.fill(u.name, np.nan, ghost_layers=True, inner_ghost_layers=True)
dh.fill(u.name, 0)
dh.fill(force_field.name, np.nan,
ghost_layers=True, inner_ghost_layers=True)
dh.fill(force_field.name, np.nan, ghost_layers=True, inner_ghost_layers=True)
dh.fill(force_field.name, 0)
dh.run_kernel(init_kernel)
......@@ -144,8 +190,15 @@ def get_fluctuating_lb(size=None, kT=None,
def time_loop(start, steps):
dh.all_to_gpu()
for i in range(start, start + steps):
dh.run_kernel(collision_kernel, omega_shear=omega_shear, omega_bulk=omega_bulk,
omega_odd=omega_odd, omega_even=omega_even, seed=42, time_step=i)
dh.run_kernel(
collision_kernel,
omega_shear=omega_shear,
omega_bulk=omega_bulk,
omega_odd=omega_odd,
omega_even=omega_even,
seed=42,
time_step=i,
)
sync_pdfs()
dh.run_kernel(stream_kernel)
......@@ -156,13 +209,27 @@ def get_fluctuating_lb(size=None, kT=None,
return dh, time_loop
def test_resting_fluid(target=Target.CPU):
rho_0 = 0.86
kT = 4E-4
L = [60] * 3
dh, time_loop = get_fluctuating_lb(size=L[0], target=target,
rho_0=rho_0, kT=kT,
omega_shear=0.8, omega_bulk=0.5, omega_even=.04, omega_odd=0.3)
@pytest.mark.parametrize(
"zero_centered", [False, True], ids=["regular-storage", "zero-centered"]
)
@pytest.mark.parametrize(
"domain_size", [8, 60]
)
def test_resting_fluid(zero_centered: bool, domain_size: int, target=Target.CPU):
rho_0 = 0.86
kT = 4e-4
L = [domain_size] * 3
dh, time_loop = get_fluctuating_lb(
size=L[0],
target=target,
rho_0=rho_0,
kT=kT,
omega_shear=0.8,
omega_bulk=0.5,
omega_even=0.04,
omega_odd=0.3,
zero_centered=zero_centered,
)
# Test
t = 0
@@ -176,38 +243,43 @@ def test_resting_fluid(target=Target.CPU):
res_u = dh.gather_array("u").reshape((-1, 3))
res_rho = dh.gather_array("rho").reshape((-1,))
# mass conservation
# density per cell fluctuates, but total mass is conserved
np.testing.assert_allclose(np.mean(res_rho), rho_0, atol=3E-12)
# momentum conservation
momentum = np.dot(res_rho, res_u)
np.testing.assert_allclose(momentum, [0, 0, 0], atol=1E-10)
np.testing.assert_allclose(momentum, [0, 0, 0], atol=1e-10)
# temperature
# temperature (fluctuates around pre-set kT)
kinetic_energy = 1 / 2 * np.dot(res_rho, res_u * res_u) / np.prod(L)
np.testing.assert_allclose(
kinetic_energy, [kT / 2] * 3, atol=kT * 0.01)
kT_tol = 0.075 * (16 / domain_size) ** (3 / 2)
np.testing.assert_allclose(kinetic_energy, [kT / 2] * 3, rtol=kT_tol)
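# Illustrative sketch, not used by the test: with cell mass ~ rho_0 and unit cell
# volume, equipartition gives <u_i**2> = kT / rho_0 per velocity component, so the
# per-component average of 1/2 * rho * u_i**2 fluctuates around kT / 2. The factor
# (16 / domain_size) ** (3 / 2) in the tolerances reflects that the standard error
# of such averages shrinks like 1 / sqrt(number of cells) = domain_size ** (-3 / 2),
# written relative to a 16**3 box.
def _equipartition_sketch(kT=4e-4, rho_0=0.86, n_cells=16**3, seed=7):
    rng = np.random.default_rng(seed)
    u_i = rng.normal(0.0, np.sqrt(kT / rho_0), n_cells)  # Gaussian velocity samples
    return 0.5 * np.mean(rho_0 * u_i**2)  # fluctuates around kT / 2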
# velocity distribution
v_hist, v_bins = np.histogram(
res_u, bins=11, range=(-.075, .075), density=True)
res_u, bins=11, range=(-0.075, 0.075), density=True
)
# Calculate expected values from single
v_expected = []
for j in range(len(v_hist)):
# Maxwell distribution
res = 1. / (v_bins[j + 1] - v_bins[j]) * \
single_component_maxwell(
v_bins[j], v_bins[j + 1], kT, rho_0)
res = (
1.0
/ (v_bins[j + 1] - v_bins[j])
* single_component_maxwell(v_bins[j], v_bins[j + 1], kT, rho_0)
)
v_expected.append(res)
v_expected = np.array(v_expected)
# 10% accuracy on the entire histogram
np.testing.assert_allclose(v_hist, v_expected, rtol=0.1)
# 1% accuracy on the middle part
hist_tol_all = 0.75 * (16 / domain_size) ** (3 / 2)
np.testing.assert_allclose(v_hist, v_expected, rtol=hist_tol_all)
hist_tol_center = hist_tol_all / 10
remove = 3
np.testing.assert_allclose(
v_hist[remove:-remove], v_expected[remove:-remove], rtol=0.01)
v_hist[remove:-remove], v_expected[remove:-remove], rtol=hist_tol_center
)
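# Hedged sketch of what single_component_maxwell (defined earlier in this file)
# presumably evaluates: the probability mass of one Cartesian velocity component
# in [v_min, v_max] for a Maxwell-Boltzmann (Gaussian) distribution with variance
# kT / mass, here written in closed form via the error function. Dividing by the
# bin width, as done above, turns it into the average density over the bin, which
# is what np.histogram(..., density=True) reports.
def _maxwell_bin_mass_sketch(v_min, v_max, kT, mass):
    from math import erf, sqrt
    scale = sqrt(mass / (2.0 * kT))
    return 0.5 * (erf(v_max * scale) - erf(v_min * scale))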
# pressure tensor against expressions from
# Duenweg, Schiller, Ladd, https://arxiv.org/abs/0707.1581
@@ -220,19 +292,35 @@ def test_resting_fluid(target=Target.CPU):
# Diagonal elements are rho c_s^2 + <u,u>. When the fluid is
# thermalized, the expectation value of <u,u> = kT due to the
# equi-partition theorem.
p_av_expected = np.diag([rho_0 * c_s ** 2 + kT] * 3)
p_av_expected = np.diag([rho_0 * c_s**2 + kT] * 3)
pressure_atol = c_s**2 / 200 * (16 / domain_size) ** (3 / 2)
np.testing.assert_allclose(
np.mean(res_pressure, axis=0), p_av_expected, atol=c_s ** 2 / 2000)
np.mean(res_pressure, axis=0), p_av_expected, atol=pressure_atol)
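# Spelled out with the usual lattice speed of sound (c_s**2 = 1/3, assuming that
# is how c_s is defined earlier in this test): each diagonal entry should average
# to rho_0 * c_s**2 + kT = 0.86 / 3 + 4e-4 ≈ 0.2871, while the off-diagonal
# entries average to zero.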
def test_point_force(target=Target.CPU):
@pytest.mark.parametrize(
"zero_centered", [False, True], ids=["regular-storage", "zero-centered"]
)
@pytest.mark.parametrize(
"domain_size", [8, 60]
)
def test_point_force(zero_centered: bool, domain_size: int, target=Target.CPU):
"""Test momentum balance for a thermalized fluid with applied point forces"""
rho_0 = 0.86
kT = 4E-4
L = [8] * 3
dh, time_loop = get_fluctuating_lb(size=L[0], target=target,
rho_0=rho_0, kT=kT,
omega_shear=0.8, omega_bulk=0.5, omega_even=.04, omega_odd=0.3)
kT = 4e-4
L = [domain_size] * 3
dh, time_loop = get_fluctuating_lb(
size=L[0],
target=target,
rho_0=rho_0,
kT=kT,
omega_shear=0.8,
omega_bulk=0.5,
omega_even=0.8,
omega_odd=0.8,
zero_centered=zero_centered
)
# Test
t = 0
@@ -241,17 +329,17 @@ def test_point_force(target=Target.CPU):
introduced_momentum = np.zeros(3)
for i in range(100):
point_force = 1E-5 * (np.random.random(3) - .5)
point_force = 1e-2 / domain_size ** (3 / 2) * (np.random.random(3) - 0.5)
introduced_momentum += point_force
# Note that ghost layers are included in the indexing
force_pos = np.random.randint(1, L[0] - 2, size=3)
dh.cpu_arrays["force"][force_pos[0],
force_pos[1], force_pos[2]] = point_force
dh.cpu_arrays["force"][force_pos[0], force_pos[1], force_pos[2]] = point_force
t = time_loop(t, 1)
res_u = dh.gather_array("u").reshape((-1, 3))
res_rho = dh.gather_array("rho").reshape((-1,))
# mass conservation
np.testing.assert_allclose(np.mean(res_rho), rho_0, atol=3E-12)
@@ -259,52 +347,72 @@ def test_point_force(target=Target.CPU):
# momentum conservation
momentum = np.dot(res_rho, res_u)
np.testing.assert_allclose(
momentum, introduced_momentum + 0.5 * point_force, atol=1E-10)
dh.cpu_arrays["force"][force_pos[0],
force_pos[1], force_pos[2]] = np.zeros(3)
@pytest.mark.skipif(not get_supported_instruction_sets(), reason="No vector instruction sets supported")
@pytest.mark.parametrize('data_type', ("float32", "float64"))
@pytest.mark.parametrize('assume_aligned', (True, False))
@pytest.mark.parametrize('assume_inner_stride_one', (True, False))
@pytest.mark.parametrize('assume_sufficient_line_padding', (True, False))
def test_vectorization(data_type, assume_aligned, assume_inner_stride_one, assume_sufficient_line_padding):
momentum, introduced_momentum + 0.5 * point_force, atol=1e-10
)
dh.cpu_arrays["force"][force_pos[0], force_pos[1], force_pos[2]] = np.zeros(3)
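# The 0.5 * point_force term comes from the half-force shift of the Guo scheme:
# the bare pdf momentum grows by the full force for every step it is applied,
# while the velocity output additionally contains half of the force currently
# stored in the force field. Since the field is only zeroed after the check, the
# expected total momentum is the sum of all applied forces plus half of the last
# one. A minimal sketch of that expectation (not used by the test):
def _expected_momentum_sketch(applied_forces, current_force):
    return np.sum(applied_forces, axis=0) + 0.5 * current_force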
@pytest.mark.skipif(
not get_supported_instruction_sets(), reason="No vector instruction sets supported"
)
@pytest.mark.parametrize("data_type", ("float32", "float64"))
@pytest.mark.parametrize("assume_aligned", (True, False))
@pytest.mark.parametrize("assume_inner_stride_one", (True, False))
@pytest.mark.parametrize("assume_sufficient_line_padding", (True, False))
def test_vectorization(
data_type, assume_aligned, assume_inner_stride_one, assume_sufficient_line_padding
):
stencil = LBStencil(Stencil.D3Q19)
pdfs, pdfs_tmp = ps.fields(f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {data_type}[3D]", layout='fzyx')
pdfs, pdfs_tmp = ps.fields(
f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {data_type}[3D]", layout="fzyx"
)
method = create_mrt_orthogonal(
stencil=stencil,
stencil=stencil, compressible=True, weighted=True, relaxation_rates=rr_getter
)
rng_node = (
ps.rng.PhiloxTwoDoubles if data_type == "float64" else ps.rng.PhiloxFourFloats
)
lbm_config = LBMConfig(
lb_method=method,
fluctuating={
"temperature": sp.Symbol("kT"),
"rng_node": rng_node,
"block_offsets": tuple([0] * stencil.D),
},
compressible=True,
weighted=True,
relaxation_rates=rr_getter)
rng_node = ps.rng.PhiloxTwoDoubles if data_type == "float64" else ps.rng.PhiloxFourFloats
lbm_config = LBMConfig(lb_method=method, fluctuating={'temperature': sp.Symbol("kT"),
'rng_node': rng_node,
'block_offsets': tuple([0] * stencil.D)},
compressible=True, zero_centered=False,
stencil=method.stencil, kernel_type='collide_only')
lbm_opt = LBMOptimisation(cse_global=True, symbolic_field=pdfs, symbolic_temporary_field=pdfs_tmp)
zero_centered=False,
stencil=method.stencil,
kernel_type="collide_only",
)
lbm_opt = LBMOptimisation(
cse_global=True, symbolic_field=pdfs, symbolic_temporary_field=pdfs_tmp
)
collision = create_lb_update_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt)
instruction_sets = _skip_instruction_sets_windows(get_supported_instruction_sets())
instruction_set = instruction_sets[-1]
config = ps.CreateKernelConfig(target=Target.CPU,
data_type=data_type, default_number_float=data_type,
cpu_vectorize_info={'instruction_set': instruction_set,
'assume_aligned': assume_aligned,
'assume_inner_stride_one': assume_inner_stride_one,
'assume_sufficient_line_padding': assume_sufficient_line_padding,
}
)
if not assume_inner_stride_one and 'storeS' not in get_vector_instruction_set(data_type, instruction_set):
config = ps.CreateKernelConfig(
target=Target.CPU,
data_type=data_type,
default_number_float=data_type,
cpu_vectorize_info={
"instruction_set": instruction_set,
"assume_aligned": assume_aligned,
"assume_inner_stride_one": assume_inner_stride_one,
"assume_sufficient_line_padding": assume_sufficient_line_padding,
},
)
if not assume_inner_stride_one and "storeS" not in get_vector_instruction_set(
data_type, instruction_set
):
with pytest.warns(UserWarning) as pytest_warnings:
ast = ps.create_kernel(collision, config=config)
assert 'Could not vectorize loop' in pytest_warnings[0].message.args[0]
assert "Could not vectorize loop" in pytest_warnings[0].message.args[0]
else:
ast = ps.create_kernel(collision, config=config)
ast.compile()
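# Without assume_inner_stride_one the vectorised loop needs strided ("storeS")
# stores; instruction sets that lack them cannot be vectorised here, so pystencils
# only emits a "Could not vectorize loop" warning, which the branch above checks.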
@@ -312,31 +420,54 @@ def test_vectorization(data_type, assume_aligned, assume_inner_stride_one, assum
print(code)
@pytest.mark.parametrize('data_type', ("float32", "float64"))
@pytest.mark.parametrize('assume_aligned', (True, False))
@pytest.mark.parametrize('assume_inner_stride_one', (True, False))
@pytest.mark.parametrize('assume_sufficient_line_padding', (True, False))
def test_fluctuating_lb_issue_188_wlb(data_type, assume_aligned,
assume_inner_stride_one, assume_sufficient_line_padding):
@pytest.mark.parametrize("data_type", ("float32", "float64"))
@pytest.mark.parametrize("assume_aligned", (True, False))
@pytest.mark.parametrize("assume_inner_stride_one", (True, False))
@pytest.mark.parametrize("assume_sufficient_line_padding", (True, False))
def test_fluctuating_lb_issue_188_wlb(
data_type, assume_aligned, assume_inner_stride_one, assume_sufficient_line_padding
):
stencil = LBStencil(Stencil.D3Q19)
temperature = sp.symbols("temperature")
pdfs, pdfs_tmp = ps.fields(f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {data_type}[3D]", layout='fzyx')
rng_node = ps.rng.PhiloxTwoDoubles if data_type == "float64" else ps.rng.PhiloxFourFloats
fluctuating = {'temperature': temperature,
'block_offsets': 'walberla',
'rng_node': rng_node}
lbm_config = LBMConfig(stencil=stencil, method=Method.MRT, compressible=True,
weighted=True, zero_centered=False, relaxation_rate=1.4,
fluctuating=fluctuating)
lbm_opt = LBMOptimisation(symbolic_field=pdfs, symbolic_temporary_field=pdfs_tmp, cse_global=True)
pdfs, pdfs_tmp = ps.fields(
f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {data_type}[3D]", layout="fzyx"
)
rng_node = (
ps.rng.PhiloxTwoDoubles if data_type == "float64" else ps.rng.PhiloxFourFloats
)
fluctuating = {
"temperature": temperature,
"block_offsets": "walberla",
"rng_node": rng_node,
}
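# "block_offsets": "walberla" presumably leaves the per-block offsets that seed
# the cell-wise RNG as runtime kernel parameters (filled in by waLBerla-generated
# code), so different blocks of a distributed domain draw independent noise, in
# contrast to the fixed tuple of zeros used in test_vectorization above.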
lbm_config = LBMConfig(
stencil=stencil,
method=Method.MRT,
compressible=True,
weighted=True,
zero_centered=False,
relaxation_rate=1.4,
fluctuating=fluctuating,
)
lbm_opt = LBMOptimisation(
symbolic_field=pdfs, symbolic_temporary_field=pdfs_tmp, cse_global=True
)
up = create_lb_update_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt)
cpu_vectorize_info = {'instruction_set': 'avx', 'assume_inner_stride_one': True, 'assume_aligned': True}
config = ps.CreateKernelConfig(target=ps.Target.CPU, data_type=data_type, default_number_float=data_type,
cpu_vectorize_info=cpu_vectorize_info)
cpu_vectorize_info = {
"instruction_set": "avx",
"assume_inner_stride_one": True,
"assume_aligned": True,
}
config = ps.CreateKernelConfig(
target=ps.Target.CPU,
data_type=data_type,
default_number_float=data_type,
cpu_vectorize_info=cpu_vectorize_info,
)
ast = create_kernel(up, config=config)
code = ps.get_code_str(ast)
......