Markus Holzer · Markus Holzer
--- a/lbmpy/advanced_streaming/communication.py

+ 108

− 123
+++ b/lbmpy/advanced_streaming/communication.py

+ 108

− 123
 import itertools
-from pystencils import Field, Assignment
+from pystencils import CreateKernelConfig, Field, Assignment, AssignmentCollection
 from pystencils.slicing import shift_slice, get_slice_before_ghost_layer, normalize_slice
 from lbmpy.advanced_streaming.utility import is_inplace, get_accessor, numeric_index, Timestep, get_timesteps
 from pystencils.datahandling import SerialDataHandling
 @@ -7,50 +7,95 @@ from pystencils.enums import Target
 from itertools import chain


-def _trim_slice_in_direction(slices, direction):
-    assert len(slices) == len(direction)
+class LBMPeriodicityHandling:

-    result = []
-    for s, d in zip(slices, direction):
-        if isinstance(s, int):
-            result.append(s)
-            continue
-        start = s.start + 1 if d == -1 else s.start
-        stop = s.stop - 1 if d == 1 else s.stop
-        result.append(slice(start, stop, s.step))
+    def __init__(self, stencil, data_handling, pdf_field_name,
+                 streaming_pattern='pull', ghost_layers=1,
+                 pycuda_direct_copy=True):
+        """
+            Periodicity Handling for Lattice Boltzmann Streaming.

-    return tuple(result)
+            **On the usage with cuda:**
+            - pycuda allows the copying of sliced arrays within device memory using the numpy syntax,
+            e.g. `dst[:,0] = src[:,-1]`. In this implementation, this is the default for periodicity
+            handling. Alternatively, if you set `pycuda_direct_copy=False`, GPU kernels are generated and
+            compiled. The compiled kernels are almost twice as fast in execution as pycuda array copying,
+            but especially for large stencils like D3Q27, their compilation can take up to 20 seconds.
+            Choose your weapon depending on your use case.
+        """
+        if not isinstance(data_handling, SerialDataHandling):
+            raise ValueError('Only serial data handling is supported!')

+        self.stencil = stencil
+        self.dim = stencil.D
+        self.dh = data_handling

-def _extend_dir(direction):
-    if len(direction) == 0:
-        yield tuple()
-    elif direction[0] == 0:
-        for d in [-1, 0, 1]:
-            for rest in _extend_dir(direction[1:]):
-                yield (d, ) + rest
-    else:
-        for rest in _extend_dir(direction[1:]):
-            yield (direction[0], ) + rest
+        assert data_handling.default_target in [Target.CPU, Target.GPU]
+        self.target = data_handling.default_target

+        self.pdf_field_name = pdf_field_name
+        self.ghost_layers = ghost_layers
+        self.periodicity = data_handling.periodicity
+        self.inplace_pattern = is_inplace(streaming_pattern)

-def _get_neighbour_transform(direction, ghost_layers):
-    return tuple(d * (ghost_layers + 1) for d in direction)
+        self.cpu = self.target == Target.CPU
+        self.pycuda_direct_copy = self.target == Target.GPU and pycuda_direct_copy

+        def is_copy_direction(direction):
+            s = 0
+            for d, p in zip(direction, self.periodicity):
+                s += abs(d)
+                if d != 0 and not p:
+                    return False

-def _fix_length_one_slices(slices):
-    """Slices of length one are replaced by their start value for correct periodic shifting"""
-    if isinstance(slices, int):
-        return slices
-    elif isinstance(slices, slice):
-        if slices.stop is not None and abs(slices.start - slices.stop) == 1:
-            return slices.start
-        elif slices.stop is None and slices.start == -1:
-            return -1  # [-1:] also has length one
+            return s != 0
+
+        full_stencil = itertools.product(*([-1, 0, 1] for _ in range(self.dim)))
+        copy_directions = tuple(filter(is_copy_direction, full_stencil))
+        self.comm_slices = []
+        timesteps = get_timesteps(streaming_pattern)
+        for timestep in timesteps:
+            slices_per_comm_dir = get_communication_slices(stencil=stencil,
+                                                           comm_stencil=copy_directions,
+                                                           streaming_pattern=streaming_pattern,
+                                                           prev_timestep=timestep,
+                                                           ghost_layers=ghost_layers)
+            self.comm_slices.append(list(chain.from_iterable(v for k, v in slices_per_comm_dir.items())))
+
+        if self.target == Target.GPU and not pycuda_direct_copy:
+            self.device_copy_kernels = list()
+            for timestep in timesteps:
+                self.device_copy_kernels.append(self._compile_copy_kernels(timestep))
+
+    def __call__(self, prev_timestep=Timestep.BOTH):
+        if self.cpu:
+            self._periodicity_handling_cpu(prev_timestep)
        else:
-            return slices
-    else:
-        return tuple(_fix_length_one_slices(s) for s in slices)
+            self._periodicity_handling_gpu(prev_timestep)
+
+    def _periodicity_handling_cpu(self, prev_timestep):
+        arr = self.dh.cpu_arrays[self.pdf_field_name]
+        comm_slices = self.comm_slices[prev_timestep.idx]
+        for src, dst in comm_slices:
+            arr[dst] = arr[src]
+
+    def _compile_copy_kernels(self, timestep):
+        pdf_field = self.dh.fields[self.pdf_field_name]
+        kernels = []
+        for src, dst in self.comm_slices[timestep.idx]:
+            kernels.append(
+                periodic_pdf_copy_kernel(pdf_field, src, dst, target=self.target))
+        return kernels
+
+    def _periodicity_handling_gpu(self, prev_timestep):
+        arr = self.dh.gpu_arrays[self.pdf_field_name]
+        if self.pycuda_direct_copy:
+            for src, dst in self.comm_slices[prev_timestep.idx]:
+                arr[dst] = arr[src]
+        else:
+            kernel_args = {self.pdf_field_name: arr}
+            for kernel in self.device_copy_kernels[prev_timestep.idx]:
+                kernel(**kernel_args)


 def get_communication_slices(
 @@ -110,7 +155,6 @@ def periodic_pdf_copy_kernel(pdf_field, src_slice, dst_slice,
    src_slice = src_slice[:-1]
    dst_slice = dst_slice[:-1]

-    # TODO this is the domain_size with GL
    if domain_size is None:
        domain_size = pdf_field.spatial_shape

 @@ -127,8 +171,9 @@ def periodic_pdf_copy_kernel(pdf_field, src_slice, dst_slice,
    assert offset == [_stop(s1) - _stop(s2) for s1, s2 in zip(normalized_from_slice, normalized_to_slice)], \
        "Slices have to have same size"

-    copy_eq = Assignment(pdf_field(pdf_idx), pdf_field[tuple(offset)](pdf_idx))
-    ast = create_cuda_kernel([copy_eq], iteration_slice=dst_slice, skip_independence_check=True)
+    copy_eq = AssignmentCollection(main_assignments=[Assignment(pdf_field(pdf_idx), pdf_field[tuple(offset)](pdf_idx))])
+    config = CreateKernelConfig(iteration_slice=dst_slice, skip_independence_check=True)
+    ast = create_cuda_kernel(copy_eq, config=config)
    if target == Target.GPU:
        from pystencils.gpucuda import make_python_function
        return make_python_function(ast)
 @@ -136,92 +181,32 @@ def periodic_pdf_copy_kernel(pdf_field, src_slice, dst_slice,
        raise ValueError('Invalid target:', target)


-class LBMPeriodicityHandling:
-
-    def __init__(self, stencil, data_handling, pdf_field_name,
-                 streaming_pattern='pull', ghost_layers=1,
-                 pycuda_direct_copy=True):
-        """
-            Periodicity Handling for Lattice Boltzmann Streaming.
-
-            **On the usage with cuda:**
-            - pycuda allows the copying of sliced arrays within device memory using the numpy syntax,
-            e.g. `dst[:,0] = src[:,-1]`. In this implementation, this is the default for periodicity
-            handling. Alternatively, if you set `pycuda_direct_copy=False`, GPU kernels are generated and
-            compiled. The compiled kernels are almost twice as fast in execution as pycuda array copying,
-            but especially for large stencils like D3Q27, their compilation can take up to 20 seconds. 
-            Choose your weapon depending on your use case.
-        """
-        if not isinstance(data_handling, SerialDataHandling):
-            raise ValueError('Only serial data handling is supported!')
-
-        self.stencil = stencil
-        self.dim = stencil.D
-        self.dh = data_handling
-
-        target = data_handling.default_target
-        assert target in [Target.CPU, Target.GPU]
-
-        self.pdf_field_name = pdf_field_name
-        self.ghost_layers = ghost_layers
-        periodicity = data_handling.periodicity
-        self.inplace_pattern = is_inplace(streaming_pattern)
-        self.target = target
-        self.cpu = target == Target.CPU
-        self.pycuda_direct_copy = target == Target.GPU and pycuda_direct_copy
-
-        def is_copy_direction(direction):
-            s = 0
-            for d, p in zip(direction, periodicity):
-                s += abs(d)
-                if d != 0 and not p:
-                    return False
-
-            return s != 0
-
-        full_stencil = itertools.product(*([-1, 0, 1] for _ in range(self.dim)))
-        copy_directions = tuple(filter(is_copy_direction, full_stencil))
-        self.comm_slices = []
-        timesteps = get_timesteps(streaming_pattern)
-        for timestep in timesteps:
-            slices_per_comm_dir = get_communication_slices(stencil=stencil,
-                                                           comm_stencil=copy_directions,
-                                                           streaming_pattern=streaming_pattern,
-                                                           prev_timestep=timestep,
-                                                           ghost_layers=ghost_layers)
-            self.comm_slices.append(list(chain.from_iterable(v for k, v in slices_per_comm_dir.items())))
-
-        if target == Target.GPU and not pycuda_direct_copy:
-            self.device_copy_kernels = []
-            for timestep in timesteps:
-                self.device_copy_kernels.append(self._compile_copy_kernels(timestep))
+def _extend_dir(direction):
+    if len(direction) == 0:
+        yield tuple()
+    elif direction[0] == 0:
+        for d in [-1, 0, 1]:
+            for rest in _extend_dir(direction[1:]):
+                yield (d, ) + rest
+    else:
+        for rest in _extend_dir(direction[1:]):
+            yield (direction[0], ) + rest

-    def __call__(self, prev_timestep=Timestep.BOTH):
-        if self.cpu:
-            self._periodicity_handling_cpu(prev_timestep)
-        else:
-            self._periodicity_handling_gpu(prev_timestep)

-    def _periodicity_handling_cpu(self, prev_timestep):
-        arr = self.dh.cpu_arrays[self.pdf_field_name]
-        comm_slices = self.comm_slices[prev_timestep.idx]
-        for src, dst in comm_slices:
-            arr[dst] = arr[src]
+def _get_neighbour_transform(direction, ghost_layers):
+    return tuple(d * (ghost_layers + 1) for d in direction)

-    def _compile_copy_kernels(self, timestep):
-        pdf_field = self.dh.fields[self.pdf_field_name]
-        kernels = []
-        for src, dst in self.comm_slices[timestep.idx]:
-            kernels.append(
-                periodic_pdf_copy_kernel(pdf_field, src, dst, target=self.target))
-        return kernels

-    def _periodicity_handling_gpu(self, prev_timestep):
-        arr = self.dh.gpu_arrays[self.pdf_field_name]
-        if self.pycuda_direct_copy:
-            for src, dst in self.comm_slices[prev_timestep.idx]:
-                arr[dst] = arr[src]
+def _fix_length_one_slices(slices):
+    """Slices of length one are replaced by their start value for correct periodic shifting"""
+    if isinstance(slices, int):
+        return slices
+    elif isinstance(slices, slice):
+        if slices.stop is not None and abs(slices.start - slices.stop) == 1:
+            return slices.start
+        elif slices.stop is None and slices.start == -1:
+            return -1  # [-1:] also has length one
        else:
-            kernel_args = {self.pdf_field_name: arr}
-            for kernel in self.device_copy_kernels[prev_timestep.idx]:
-                kernel(**kernel_args)
+            return slices
+    else:
+        return tuple(_fix_length_one_slices(s) for s in slices)