diff --git a/src/pystencils_autodiff/graph_datahandling.py b/src/pystencils_autodiff/graph_datahandling.py
index f33b89f5797c525e4a6539bd684a453aac2d9c90..0e68177b70a83775270a84ca2743b67c5b1c89ff 100644
--- a/src/pystencils_autodiff/graph_datahandling.py
+++ b/src/pystencils_autodiff/graph_datahandling.py
@@ -7,6 +7,7 @@
 """
 
 """
+from copy import copy
 from enum import Enum
 
 import numpy as np
@@ -37,6 +38,17 @@ class DataTransferKind(str, Enum):
         return self in [self.HOST_TO_DEVICE, self.DEVICE_TO_HOST, self.SWAP]
 
 
+# class BoundaryHandling:
+    # def __init__(self, field: pystencils.Field):
+        # self.field = field
+
+    # def __str__(self):
+        # return f'BoundaryHandling on {self.field}'
+
+    # def __repr__(self):
+        # return self.__str__()
+
+
 class DataTransfer:
     def __init__(self, field: pystencils.Field, kind: DataTransferKind):
         self.field = field
@@ -67,9 +79,10 @@ class Communication(DataTransfer):
 
 
 class KernelCall:
-    def __init__(self, kernel: pystencils.kernel_wrapper.KernelWrapper, kwargs):
+    def __init__(self, kernel: pystencils.kernel_wrapper.KernelWrapper, kwargs, tmp_field_swaps=[]):
         self.kernel = kernel
         self.kwargs = kwargs
+        self.tmp_field_swaps = tmp_field_swaps
 
     def __str__(self):
         return "Call " + str(self.kernel.ast.function_name)
@@ -91,7 +104,7 @@ class TimeloopRun:
 
     @property
     def asts(self):
-        return self.timeloop._single_step_asts
+        return [s.kernel.ast for s in self.timeloop._single_step_asts if hasattr(s, 'kernel')]
 
 
 class GraphDataHandling(pystencils.datahandling.SerialDataHandling):
@@ -113,17 +126,22 @@ class GraphDataHandling(pystencils.datahandling.SerialDataHandling):
             self._single_step_functions.append(f)
 
         def add_call(self, functor, argument_list):
-            for argument_dict in argument_list:
-                self._single_step_asts.append((functor, argument_dict) if not hasattr(functor, 'ast') else functor.ast)
-
-            if hasattr(functor, 'kernel'):
-                functor = functor.kernel
-            if not isinstance(argument_list, list):
-                argument_list = [argument_list]
+            if hasattr(functor, 'ast'):
+                self._single_step_asts.append(KernelCall(functor, {}))
+            else:
+                former_queue = self.parent.call_queue
+                self.parent.call_queue = []
+                #
+                functor(*argument_list)
+                self._single_step_asts.extend(self.parent.call_queue)
+                self.parent.call_queue = former_queue
 
         def run(self, time_steps=1):
-            self.parent.call_queue.append(TimeloopRun(self, time_steps))
+            former_call_queue = copy(self.parent.call_queue)
+            self.parent.call_queue = []
             super().run(time_steps)
+            self.parent.call_queue = former_call_queue
+            former_call_queue.append(TimeloopRun(self, time_steps))
 
         def swap(self, src, dst, is_gpu):
             if isinstance(src, str):
@@ -135,10 +153,20 @@ class GraphDataHandling(pystencils.datahandling.SerialDataHandling):
     def __init__(self, *args, **kwargs):
 
         self.call_queue = []
+        self._timeloop_record = None
         super().__init__(*args, **kwargs)
 
-    def add_array(self, name, values_per_cell=1, dtype=np.float64, latex_name=None, ghost_layers=None, layout=None,
-                  cpu=True, gpu=None, alignment=False, field_type=FieldType.GENERIC):
+    def add_array(self,
+                  name,
+                  values_per_cell=1,
+                  dtype=np.float64,
+                  latex_name=None,
+                  ghost_layers=None,
+                  layout=None,
+                  cpu=True,
+                  gpu=None,
+                  alignment=False,
+                  field_type=FieldType.GENERIC):
 
         super().add_array(name,
                           values_per_cell,
@@ -189,10 +217,17 @@ class GraphDataHandling(pystencils.datahandling.SerialDataHandling):
             self.call_queue.append(DataTransfer(self._fields[name], DataTransferKind.HOST_TO_DEVICE))
 
     def synchronization_function(self, names, stencil=None, target=None, **_):
-        for name in names:
-            gpu = target == 'gpu'
-            self.call_queue.append(Communication(self._fields[name], stencil, gpu))
-        super().synchronization_function(names, stencil=None, target=None, **_)
+        def func():
+            for name in names:
+                gpu = target == 'gpu'
+                self.call_queue.append(Communication(self._fields[name], stencil, gpu))
+            pystencils.datahandling.SerialDataHandling.synchronization_function(self,
+                                                                                names,
+                                                                                stencil=None,
+                                                                                target=None,
+                                                                                **_)
+
+        return func
 
     def __str__(self):
         return '\n'.join(str(c) for c in self.call_queue)
@@ -205,6 +240,23 @@ class GraphDataHandling(pystencils.datahandling.SerialDataHandling):
         self.call_queue.append('Fill ' + array_name)
         super().fill(array_name, val, value_idx, slice_obj, ghost_layers, inner_ghost_layers)
 
+    def merge_swaps_with_kernel_calls(self, call_queue=None):
+        # TODO(seitz): should be moved to ComputationGraph
+
+        if call_queue is None:
+            call_queue = self.call_queue
+
+        relevant_swaps = [(swap, predecessor) for (swap, predecessor) in zip(call_queue[1:], call_queue[:-1])
+                          if isinstance(swap, Swap) and isinstance(predecessor, KernelCall)]
+        for s, pred in relevant_swaps:
+            call_queue.remove(s)
+            if (s.field, s.destination) not in pred.tmp_field_swaps:
+                pred.tmp_field_swaps.append((s.field, s.destination))
+
+        for t in call_queue:
+            if isinstance(t, TimeloopRun):
+                self.merge_swaps_with_kernel_calls(t.timeloop._single_step_asts)
+
     # TODO
     # def reduce_float_sequence(self, sequence, operation, all_reduce=False) -> np.array:
         # return np.array(sequence)
diff --git a/src/pystencils_autodiff/walberla.py b/src/pystencils_autodiff/walberla.py
index cf2839d35ea5b46fc01d45252bb2adf405f9f059..87d2066eb6ab661f10f7ae66a0d5a0fa16c8703d 100644
--- a/src/pystencils_autodiff/walberla.py
+++ b/src/pystencils_autodiff/walberla.py
@@ -17,20 +17,25 @@ from stringcase import camelcase, pascalcase
 
 import pystencils
 from pystencils.astnodes import SympyAssignment
-from pystencils.data_types import TypedSymbol
+from pystencils.data_types import TypedSymbol, create_type
 from pystencils_autodiff._file_io import read_template_from_file
 from pystencils_autodiff.framework_integration.astnodes import JinjaCppFile
 
 
+def _make_field_type(field, on_gpu):
+    from pystencils_walberla.jinja_filters import make_field_type, get_field_fsize
+    f_size = get_field_fsize(field)
+
+    return make_field_type(pystencils.data_types.get_base_type(field.dtype), f_size, on_gpu)
+
+
 class FieldType(JinjaCppFile):
 
     TEMPLATE = jinja2.Template("{{ field_type }}")
 
     def __init__(self, field: pystencils.Field, on_gpu: bool):
-        from pystencils_walberla.jinja_filters import make_field_type, get_field_fsize
 
-        f_size = get_field_fsize(field)
-        field_type = make_field_type(pystencils.data_types.get_base_type(field.dtype), f_size, on_gpu)
+        field_type = _make_field_type(field, on_gpu)
 
         ast_dict = {'on_gpu': on_gpu,
                     'field_type': field_type
@@ -393,16 +398,22 @@ class FillFromFlagField(JinjaCppFile):
 
 class LbCommunicationSetup(JinjaCppFile):
 
-    TEMPLATE = jinja2.Template("""blockforest::communication::UniformBufferedScheme<lbm::{{ lb_model_type }}::CommunicationStencil> {{ communication }}( blocks );
-{{ communication }}.addPackInfo( make_shared< lbm::PdfFieldPackInfo<lbm::{{ lb_model_type }}> >( {{ pdf_id }} ) );
+    TEMPLATE = jinja2.Template("""
+{{CommonicationScheme}}<lbm::{{ lb_model_type }}::CommunicationStencil> {{ communication }}( blocks );
+{{ communication }}.addPackInfo( make_shared< {{ packinfo_class }} >( {{ pdf_id }} ) );
  """)  # noqa
 
-    def __init__(self, lb_model_type, pdf_id):
+    def __init__(self, lb_model_type, pdf_id, packinfo_class, gpu):
         self._symbol = TypedSymbol('communication', 'auto')
+        self.gpu = gpu
+
         ast_dict = {
             'lb_model_type': lb_model_type,
             'pdf_id': pdf_id,
             'communication': self._symbol,
+            'packinfo_class': packinfo_class,
+            'CommonicationScheme': ('cuda::communication::UniformGPUScheme'
+                                    if gpu else 'blockforest::communication::UniformBufferedScheme')
         }
         super().__init__(ast_dict)
 
@@ -414,7 +425,14 @@ class LbCommunicationSetup(JinjaCppFile):
     def symbols_defined(self):
         return {self.symbol}
 
-    headers = ['"blockforest/communication/UniformBufferedScheme.h"', '"lbm/communication/PdfFieldPackInfo.h"']
+    @property
+    def headers(self):
+        if self.gpu:
+            return ['"cuda/communication/UniformGPUScheme.h"', '"lbm/communication/PdfFieldPackInfo.h"', '"PackInfo.h"']
+        else:
+            return ['"blockforest/communication/UniformBufferedScheme.h"',
+                    '"lbm/communication/PdfFieldPackInfo.h"',
+                    '"PackInfo.h"']
 
 
 class BeforeFunction(JinjaCppFile):
@@ -490,6 +508,40 @@ class TimeLoop(JinjaCppFile):
     headers = ['"timeloop/all.h"']
 
 
+class ForLoop(JinjaCppFile):
+
+    TEMPLATE = jinja2.Template("""
+for( {{node.loop_symbol.dtype}} {{node.loop_symbol}} = {{node.loop_start}}; {{node.loop_symbol}} <= {{node.loop_end}}; {{node.loop_symbol}}+= {{node.loop_increment}} )  {
+{%- for c in children %}
+{{ c | indent(3) }}
+{%- endfor -%}
+}
+""")  # noqa
+
+    def __init__(self,
+                 loop_start,
+                 loop_end,
+                 children,
+                 loop_symbol=TypedSymbol('t_', create_type('int64')),
+                 loop_increment=1):
+        ast_dict = {
+            'loop_symbol': loop_symbol,
+            'loop_start': loop_start,
+            'loop_end': loop_end,
+            'children': children,
+            'loop_increment': loop_increment
+        }
+        super().__init__(ast_dict)
+
+    @property
+    def loop_symbol(self):
+        return self.ast_dict.loop_symbol
+
+    @property
+    def symbols_defined(self):
+        return {self.loop_symbol}
+
+
 class U_Rho_Adaptor(JinjaCppFile):
     """Docstring for U_Rho_Adaptor. """
 
@@ -683,7 +735,13 @@ class InitBoundaryHandling(JinjaCppFile):
 {% endfor %}
 """)  # noqa
 
-    def __init__(self, block_forest, flag_field_id, pdf_field_id, boundary_conditions, boundary_kernel: dict, field_allocations):
+    def __init__(self,
+                 block_forest,
+                 flag_field_id,
+                 pdf_field_id,
+                 boundary_conditions,
+                 boundary_kernel: dict,
+                 field_allocations):
         self.fluid = FlagUidDefinition("fluid")
         ast_dict = {'fluid_uid_definition': self.fluid,
                     'geometry_initialization': BoundaryHandlingFromConfig(block_forest,
@@ -768,7 +826,11 @@ class GeneratedBoundaryInitialization(JinjaCppFile):
 class SweepCreation(JinjaCppFile):
     TEMPLATE = jinja2.Template("""{{ sweep_class_name }}( {{ parameter_str }} )""")  # noqa
 
-    def __init__(self, sweep_class_name: str, field_allocation: AllocateAllFields, ast, parameters_to_ignore=None):
+    def __init__(self,
+                 sweep_class_name: str,
+                 field_allocation: AllocateAllFields,
+                 ast,
+                 parameters_to_ignore=[]):
         def resolve_parameter(p):
             if ast.target == 'cpu':
                 dict = field_allocation._cpu_allocations
@@ -780,7 +842,8 @@ class SweepCreation(JinjaCppFile):
         parameters = ast.get_parameters()
         parameter_ids = [resolve_parameter(p)
                          for p in parameters
-                         if p.is_field_pointer or not p.is_field_parameter]
+                         if (p.is_field_pointer and p.symbol.field_name not in parameters_to_ignore)
+                         or not p.is_field_parameter]
 
         ast_dict = {'sweep_class_name': sweep_class_name,
                     'parameter_ids': parameter_ids,
@@ -806,3 +869,52 @@ for( auto& block : *{{block_forest}}) {{sweep_class_name | lower() }}(&block);""
                     'sweep_class_name': functor.ast_dict.sweep_class_name,
                     'block_forest': block_forest}
         super().__init__(ast_dict)
+
+
+class FieldCopy(JinjaCppFile):
+    TEMPLATE = jinja2.Template("""cuda::fieldCpy<{{ src_type }}, {{ dst_type }}>({{ block_forest }}, {{ src_id }}, {{ dst_id }});""")  # noqa
+
+    def __init__(self, block_forest, src_id, src_field, src_gpu, dst_id, dst_field, dst_gpu):
+        src_type = _make_field_type(src_field, src_gpu)
+        dst_type = _make_field_type(dst_field, dst_gpu)
+        ast_dict = {'src_id': src_id,
+                    'dst_id': dst_id,
+                    'src_type': src_type,
+                    'dst_type': dst_type,
+                    'block_forest': block_forest}
+        super().__init__(ast_dict)
+
+    headers = ['"cuda/FieldCopy.h"']
+
+
+class SwapFields(JinjaCppFile):
+    """
+     .. warn::
+
+         Prefer temporary fields in sweeps over this class! Two full fields have higher memory usage.
+
+    """
+    TEMPLATE = jinja2.Template("""std::swap({{field_id0}}, {{field_id1}});""")
+
+    def __init__(self, field_id0, field_id1):
+        ast_dict = {'field_id0': field_id0,
+                    'field_id1': field_id1}
+        super().__init__(ast_dict)
+
+    headers = ["<algorithm>"]
+
+
+class Communication(JinjaCppFile):
+    """
+     .. warn::
+
+         Prefer temporary fields in sweeps over this class! Two full fields have higher memory usage.
+
+    """
+    TEMPLATE = jinja2.Template("""communication()""")
+
+    def __init__(self, gpu):
+        ast_dict = {'gpu': gpu}
+        super().__init__(ast_dict)
+
+    headers = ["<algorithm>"]
diff --git a/src/pystencils_autodiff/wald_und_wiesen_simulation.py b/src/pystencils_autodiff/wald_und_wiesen_simulation.py
index 662a6d0f0d04dcee9c46645f1913f67eb99bf38a..9100dfb82720c616f9ecf2c35bbb75c030aef27d 100644
--- a/src/pystencils_autodiff/wald_und_wiesen_simulation.py
+++ b/src/pystencils_autodiff/wald_und_wiesen_simulation.py
@@ -7,19 +7,20 @@
 
 """
 
+import itertools
 from typing import Dict
 
 import sympy as sp
-from stringcase import pascalcase
+from stringcase import camelcase, pascalcase
 
 import lbmpy_walberla
 import pystencils
 import pystencils_walberla.codegen
 from pystencils.astnodes import Block, EmptyLine
 from pystencils_autodiff.walberla import (
-    AllocateAllFields, CMakeLists, DefinitionsHeader, InitBoundaryHandling, LbCommunicationSetup,
-    ResolveUndefinedSymbols, RunTimeLoop, SweepCreation, SweepOverAllBlocks, TimeLoop,
-    UniformBlockforestFromConfig, WalberlaMain, WalberlaModule)
+    AllocateAllFields, CMakeLists, DefinitionsHeader, FieldCopy, InitBoundaryHandling,
+    LbCommunicationSetup, ResolveUndefinedSymbols, RunTimeLoop, SwapFields, SweepCreation,
+    SweepOverAllBlocks, TimeLoop, UniformBlockforestFromConfig, WalberlaMain, WalberlaModule)
 
 
 class WaldUndWiesenSimulation():
@@ -35,7 +36,8 @@ class WaldUndWiesenSimulation():
                  codegen_context,
                  boundary_handling: pystencils.boundaries.BoundaryHandling = None,
                  lb_rule=None,
-                 refinement_scaling=None):
+                 refinement_scaling=None,
+                 boundary_handling_target='gpu'):
         self._data_handling = graph_data_handling
         self._lb_rule = lb_rule
         self._refinement_scaling = refinement_scaling
@@ -49,6 +51,17 @@ class WaldUndWiesenSimulation():
         self._with_gui = False
         self._with_gui_default = False
         self._boundary_kernels = {}
+        self._boundary_handling_target = boundary_handling_target
+        pystencils_walberla.codegen.generate_pack_info_for_field(
+            self._codegen_context,
+            'PackInfo',
+            pystencils.Field.create_generic(graph_data_handling.fields['ldc_pdf'].name,
+                                            graph_data_handling.fields['ldc_pdf'].spatial_dimensions,
+                                            graph_data_handling.fields['ldc_pdf'].dtype.numpy_dtype,
+                                            graph_data_handling.fields['ldc_pdf'].index_dimensions,
+                                            index_shape=graph_data_handling.fields['ldc_pdf'].index_shape,),
+            target=self._boundary_handling_target)
+        self._packinfo_class = 'PackInfo'
 
     def _create_helper_files(self) -> Dict[str, str]:
         if self._lb_rule:
@@ -58,7 +71,13 @@ class WaldUndWiesenSimulation():
             if self._boundary_handling:
                 for bc in self.boundary_conditions:
                     self._boundary_kernels.update({bc.name: lbmpy_walberla.generate_boundary(
-                        self._codegen_context, pascalcase(bc.name), bc, self._lb_rule.method)})
+                        self._codegen_context,
+                        pascalcase(bc.name),
+                        bc,
+                        self._lb_rule.method,
+                        target=self._boundary_handling_target)},
+                    )
+                    self._bh_cycler = itertools.cycle(self._boundary_kernels.keys())
 
     def _create_module(self):
         if self._lb_rule:
@@ -77,10 +96,11 @@ class WaldUndWiesenSimulation():
 
         if self._lb_rule:
             pdf_field_id = field_allocations._gpu_allocations.get(
-                'ldc_pdfSrc', field_allocations._cpu_allocations['ldc_pdfSrc']).symbol
+                'ldc_pdf', field_allocations._cpu_allocations['ldc_pdf']).symbol
         else:
             pdf_field_id = None
 
+        self._data_handling.merge_swaps_with_kernel_calls()
         call_nodes = filter(lambda x: x, [self._graph_to_sweep(c) for c in self._data_handling.call_queue])
 
         module = WalberlaModule(WalberlaMain(Block([
@@ -96,7 +116,9 @@ class WaldUndWiesenSimulation():
                                          self._field_allocations)
                     if self._boundary_handling else EmptyLine(),
                     LbCommunicationSetup(self._lb_model_name,
-                                         pdf_field_id)
+                                         pdf_field_id,
+                                         self._packinfo_class,
+                                         self._boundary_handling_target)
                     if self._lb_rule else EmptyLine(),
                     *call_nodes
                 ]), self.parameter_config_block)
@@ -130,28 +152,79 @@ class WaldUndWiesenSimulation():
         return self._boundary_handling._boundary_object_to_boundary_info.keys()
 
     def _graph_to_sweep(self, c):
-        from pystencils_autodiff.graph_datahandling import KernelCall, TimeloopRun
+        from pystencils_autodiff.graph_datahandling import KernelCall, TimeloopRun, DataTransferKind, DataTransfer
 
         if isinstance(c, KernelCall):
             sweep_class_name = next(self._kernel_class_generator)
             pystencils_walberla.codegen.generate_sweep(
                 self._codegen_context, sweep_class_name, c.kernel.ast)
-            rtn = SweepOverAllBlocks(SweepCreation(sweep_class_name, self._field_allocations,
-                                                   c.kernel.ast), self._block_forest.blocks)
+            rtn = SweepOverAllBlocks(SweepCreation(sweep_class_name,
+                                                   self._field_allocations,
+                                                   c.kernel.ast,
+                                                   parameters_to_ignore=[s[1].name for s in c.tmp_field_swaps]),
+                                     self._block_forest.blocks)
 
         elif isinstance(c, TimeloopRun):
             sweeps = []
             for a in c.timeloop._single_step_asts:
-                if 'indexField' in [f.name for f in a.fields_accessed]:
-                    continue
-                sweep_class_name = next(self._kernel_class_generator)
-                pystencils_walberla.codegen.generate_sweep(
-                    self._codegen_context, sweep_class_name, a)
-                sweeps.append(SweepCreation(sweep_class_name, self._field_allocations, a))
+                if isinstance(a, KernelCall):
+                    if 'indexField' in [f.name for f in a.kernel.ast.fields_accessed]:
+                        bh = next(self._bh_cycler)
+                        sweeps.append(camelcase(bh))
+                        continue
+                    sweep_class_name = next(self._kernel_class_generator)
+                    pystencils_walberla.codegen.generate_sweep(
+                        self._codegen_context, sweep_class_name, a.kernel.ast, field_swaps=a.tmp_field_swaps)
+                    sweeps.append(SweepCreation(sweep_class_name,
+                                                self._field_allocations,
+                                                a.kernel.ast,
+                                                parameters_to_ignore=[s[1].name for s in a.tmp_field_swaps]))
+
+                elif isinstance(c, DataTransfer):
+                    if c.kind in (DataTransferKind.HOST_COMMUNICATION, DataTransferKind.DEVICE_COMMUNICATION):
+                        src = self._field_allocations._cpu_allocations[c.field.name].symbol
+                        dst = self._field_allocations._cpu_allocations[c.destination.name].symbol
+                        rtn = SwapFields(src, dst)
+                    elif c.kind == DataTransferKind.DEVICE_SWAP:
+                        src = self._field_allocations._gpu_allocations[c.field.name].symbol
+                        dst = self._field_allocations._gpu_allocations[c.destination.name].symbol
+                        rtn = SwapFields(src, dst)
+                    elif c.kind == DataTransferKind.HOST_TO_DEVICE:
+                        src = self._field_allocations._cpu_allocations[c.field.name].symbol
+                        dst = self._field_allocations._gpu_allocations[c.field.name].symbol
+                        rtn = FieldCopy(self._block_forest.blocks, src, c.field, False, dst, c.field, True)
+                    elif c.kind == DataTransferKind.DEVICE_TO_HOST:
+                        src = self._field_allocations._gpu_allocations[c.field.name].symbol
+                        dst = self._field_allocations._cpu_allocations[c.field.name].symbol
+                        rtn = FieldCopy(self._block_forest.blocks, src, c.field, True, dst, c.field, False)
+                    else:
+                        rtn = None
+                else:
+                    print(f'time {c}')
 
             loop = TimeLoop(self._block_forest.blocks, [], sweeps, [], sp.S(c.time_steps))
             rtn = Block([loop, RunTimeLoop(self._block_forest.blocks, loop, self._with_gui, self._with_gui_default)])
 
+        elif isinstance(c, DataTransfer):
+            if c.kind == DataTransferKind.HOST_SWAP:
+                src = self._field_allocations._cpu_allocations[c.field.name].symbol
+                dst = self._field_allocations._cpu_allocations[c.destination.name].symbol
+                rtn = SwapFields(src, dst)
+            elif c.kind == DataTransferKind.DEVICE_SWAP:
+                src = self._field_allocations._gpu_allocations[c.field.name].symbol
+                dst = self._field_allocations._gpu_allocations[c.destination.name].symbol
+                rtn = SwapFields(src, dst)
+            elif c.kind == DataTransferKind.HOST_TO_DEVICE:
+                src = self._field_allocations._cpu_allocations[c.field.name].symbol
+                dst = self._field_allocations._gpu_allocations[c.field.name].symbol
+                rtn = FieldCopy(self._block_forest.blocks, src, c.field, False, dst, c.field, True)
+            elif c.kind == DataTransferKind.DEVICE_TO_HOST:
+                src = self._field_allocations._gpu_allocations[c.field.name].symbol
+                dst = self._field_allocations._cpu_allocations[c.field.name].symbol
+                rtn = FieldCopy(self._block_forest.blocks, src, c.field, True, dst, c.field, False)
+            else:
+                rtn = None
         else:
-            return None
+            rtn = None
+
         return rtn
diff --git a/tests/test_walberla.py b/tests/test_walberla.py
index a9954b779baf1969b22f5810b185de2e1232555a..40ae8cbe6646b14f63d496e7cae05e461d2c18bb 100644
--- a/tests/test_walberla.py
+++ b/tests/test_walberla.py
@@ -12,9 +12,8 @@ from os.path import dirname, join
 
 import numpy as np
 
-import lbmpy_walberla
 import pystencils
-from lbmpy.creationfunctions import create_lb_collision_rule, create_lbm_kernel
+from lbmpy.creationfunctions import create_lb_collision_rule
 from pystencils.astnodes import Block, EmptyLine, SympyAssignment
 from pystencils.data_types import TypedSymbol, create_type
 from pystencils_autodiff._file_io import write_file
@@ -83,6 +82,8 @@ def test_wald_wiesen_lbm():
         lbm_step = ldc_setup(domain_size=(30, 30), optimization=opt_params,
                              fixed_loop_sizes=False, lid_velocity=lid_velocity)
 
+        # del lbm_step.data_handling.gpu_arrays.ldc_pdf_tmp
+
         sim = WaldUndWiesenSimulation(lbm_step.data_handling,
                                       ctx,
                                       lbm_step.boundary_handling,