Markus Holzer · Markus Holzer · 939bed5f · 825c6991 · 0cd17895 · 31bd0e0a
--- a/pystencils/gpu/indexing.py

+ 153

− 81
+++ b/pystencils/gpu/indexing.py

+ 153

− 81
 import abc
 from functools import partial
 import math
+from typing import Tuple

 import sympy as sp
 from sympy.core.cache import cacheit

-from pystencils.astnodes import Block, Conditional
+from pystencils.astnodes import Block, Conditional, SympyAssignment
 from pystencils.typing import TypedSymbol, create_type
-from pystencils.integer_functions import div_ceil, div_floor
-from pystencils.slicing import normalize_slice
+from pystencils.integer_functions import div_ceil, div_floor, int_div
 from pystencils.sympyextensions import is_integer_sequence, prod


 @@ -33,12 +33,37 @@ GRID_DIM = [ThreadIndexingSymbol("gridDim." + coord, create_type("int32")) for c

 class AbstractIndexing(abc.ABC):
    """
-    Abstract base class for all Indexing classes. An Indexing class defines how a multidimensional
-    field is mapped to GPU's block and grid system. It calculates indices based on GPU's thread and block indices
-    and computes the number of blocks and threads a kernel is started with. The Indexing class is created with
-    a pystencils field, a slice to iterate over, and further optional parameters that must have default values.
+    Abstract base class for all Indexing classes. An Indexing class defines how an iteration space is mapped
+    to GPU's block and grid system. It calculates indices based on GPU's thread and block indices
+    and computes the number of blocks and threads a kernel is started with.
+    The Indexing class is created with an iteration space that is given as list of slices to determine start, stop
+    and the step size for each coordinate. Further the data_layout is given as tuple to determine the fast and slow
+    coordinates. This is important to get an optimal mapping of coordinates to GPU threads.
    """

+    def __init__(self, iteration_space: Tuple[slice], data_layout: Tuple):
+        """Store the iteration space and data layout shared by all indexing schemes.
+
+        Args:
+            iteration_space: one entry per coordinate; every entry has to be a ``slice`` instance
+            data_layout: layout tuple, stored unchanged and exposed through ``data_layout``
+        """
+        # Reject anything that is not a slice up front (asserts are skipped when Python runs with -O)
+        for iter_space in iteration_space:
+            assert isinstance(iter_space, slice), f"iteration_space must be of type Tuple[slice], " \
+                                                  f"not tuple of type {type(iter_space)}"
+        self._iteration_space = iteration_space
+        self._data_layout = data_layout
+        # The dimensionality is defined by the number of slices
+        self._dim = len(iteration_space)
+
+    @property
+    def iteration_space(self):
+        """Iteration space to loop over, exactly as it was passed to the constructor (one slice per coordinate)"""
+        return self._iteration_space
+
+    @property
+    def data_layout(self):
+        """Data layout of the kernel's arrays, exactly as it was passed to the constructor"""
+        return self._data_layout
+
+    @property
+    def dim(self):
+        """Number of spatial dimensions, i.e. the number of slices in the iteration space"""
+        return self._dim
+
    @property
    @abc.abstractmethod
    def coordinates(self):
 @@ -50,6 +75,17 @@ class AbstractIndexing(abc.ABC):
        """Sympy symbols for GPU's block and thread indices, and block and grid dimensions. """
        return BLOCK_IDX + THREAD_IDX + BLOCK_DIM + GRID_DIM

+    @abc.abstractmethod
+    def add_loop_ctr_assignments(self, assignments, loop_counter_symbols):
+        """Adds assignments that define the loop counter symbols in terms of the GPU thread and block indices.
+
+        The implementations in this module put the loop counter assignments in front of ``assignments``.
+
+        Args:
+            assignments: list of assignments
+            loop_counter_symbols: typed symbols representing the loop counters
+        Returns:
+            list containing the loop counter assignments together with the given assignments
+        """
+
    @abc.abstractmethod
    def call_parameters(self, arr_shape):
        """Determine grid and block size for kernel call.
 @@ -92,8 +128,8 @@ class BlockIndexing(AbstractIndexing):
    """Generic indexing scheme that maps sub-blocks of an array to GPU blocks.

    Args:
-        field: pystencils field (common to all Indexing classes)
-        iteration_slice: slice that defines rectangular subarea which is iterated over
+        iteration_space: list of slices to determine start, stop and the step size for each coordinate
+        data_layout: tuple specifying loop order with innermost loop last. This is the same format as returned by `Field.layout`.
        permute_block_size_dependent_on_layout: if True the block_size is permuted such that the fastest coordinate
                                                gets the largest amount of threads
        compile_time_block_size: compile in concrete block size, otherwise the gpu variable 'blockDim' is used
 @@ -102,14 +138,16 @@ class BlockIndexing(AbstractIndexing):
        device_number: device number of the used GPU. By default, the zeroth device is used.
    """

-    def __init__(self, field, iteration_slice,
-                 block_size=(16, 16, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False,
+    def __init__(self, iteration_space: Tuple[slice], data_layout: Tuple,
+                 block_size=(128, 2, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False,
                 maximum_block_size=(1024, 1024, 64), device_number=None):
-        if field.spatial_dimensions > 3:
-            raise NotImplementedError("This indexing scheme supports at most 3 spatial dimensions")
+        super(BlockIndexing, self).__init__(iteration_space, data_layout)
+
+        if self._dim > 4:
+            raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions")

-        if permute_block_size_dependent_on_layout:
-            block_size = self.permute_block_size_according_to_layout(block_size, field.layout)
+        if permute_block_size_dependent_on_layout and self._dim < 4:
+            block_size = self.permute_block_size_according_to_layout(block_size, data_layout)

        self._block_size = block_size
        if maximum_block_size == 'auto':
 @@ -124,9 +162,6 @@ class BlockIndexing(AbstractIndexing):
                maximum_block_size = tuple(da[f"MaxBlockDim{c}"] for c in ["X", "Y", "Z"])

        self._maximum_block_size = maximum_block_size
-        self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
-        self._dim = field.spatial_dimensions
-        self._symbolic_shape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]
        self._compile_time_block_size = compile_time_block_size
        self._device_number = device_number

 @@ -140,17 +175,30 @@ class BlockIndexing(AbstractIndexing):

    @property
    def coordinates(self):
-        offsets = _get_start_from_slice(self._iterationSlice)
-        coordinates = [c + off for c, off in zip(self.cuda_indices, offsets)]
-
-        return coordinates[:self._dim]
+        """Expressions for the coordinates in terms of ``self.cuda_indices``.
+
+        For fewer than four dimensions every coordinate is the matching cuda index shifted by the start of
+        its slice. For four dimensions the first cuda index is split by an integer division and a modulo
+        into the first two coordinates; the other two coordinates use the second and third cuda index,
+        each shifted by the start of its slice.
+        """
+        if self._dim < 4:
+            coordinates = [c + iter_slice.start for c, iter_slice in zip(self.cuda_indices, self._iteration_space)]
+            return coordinates[:self._dim]
+        else:
+            coordinates = list()
+            # NOTE(review): the divisor below is the extent of axis 0 while the modulus further down is the
+            # extent of axis 1. Splitting one flattened index normally uses the same extent for both -
+            # confirm that this is intended for iteration spaces where the two extents differ.
+            # NOTE(review): unlike the other coordinates, the starts of axis 0 and 1 are not added - confirm.
+            width = self._iteration_space[0].stop - self.iteration_space[0].start
+            coordinates.append(int_div(self.cuda_indices[0], width))
+            width = self._iteration_space[1].stop - self.iteration_space[1].start
+            coordinates.append(sp.Mod(self.cuda_indices[0], width))
+            coordinates.append(self.cuda_indices[1] + self.iteration_space[2].start)
+            coordinates.append(self.cuda_indices[2] + self.iteration_space[3].start)
+            return coordinates
+
+    def add_loop_ctr_assignments(self, assignments, loop_counter_symbols):
+        """Return ``assignments`` with the loop counter assignments of this indexing scheme put in front.
+
+        Args:
+            assignments: list of assignments (it is concatenated with ``+``, so it has to be a list)
+            loop_counter_symbols: symbols of the loop counters, one per coordinate
+        """
+        cell_idx_assignments = _loop_ctr_assignments(loop_counter_symbols, self.coordinates, self._iteration_space)
+        return cell_idx_assignments + assignments

    def call_parameters(self, arr_shape):
-        substitution_dict = {sym: value for sym, value in zip(self._symbolic_shape, arr_shape) if sym is not None}
+        numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space, arr_shape)
+        widths = _get_widths(numeric_iteration_slice)
+
+        if len(widths) > 3:
+            widths = [widths[0] * widths[1], widths[2], widths[3]]

-        widths = [end - start for start, end in zip(_get_start_from_slice(self._iterationSlice),
-                                                    _get_end_from_slice(self._iterationSlice, arr_shape))]
-        widths = sp.Matrix(widths).subs(substitution_dict)
        extend_bs = (1,) * (3 - len(self._block_size))
        block_size = self._block_size + extend_bs
        if not self._compile_time_block_size:
 @@ -171,20 +219,22 @@ class BlockIndexing(AbstractIndexing):

    def guard(self, kernel_content, arr_shape):
+        """Wrap ``kernel_content`` in a conditional that only lets threads inside the iteration space pass.
+
+        Args:
+            kernel_content: node that is placed inside the conditional
+            arr_shape: shape of the array; only the first ``dim`` entries are used to resolve
+                       symbolic slice bounds
+        Returns:
+            Block that contains a single Conditional around ``kernel_content``
+        """
        arr_shape = arr_shape[:self._dim]
-        end = _get_end_from_slice(self._iterationSlice, arr_shape)
-
-        conditions = [c < e for c, e in zip(self.coordinates, end)]
-        for cuda_index, iter_slice in zip(self.cuda_indices, self._iterationSlice):
-            if isinstance(iter_slice, slice) and iter_slice.step > 1:
-                conditions.append(sp.Eq(sp.Mod(cuda_index, iter_slice.step), 0))
+        numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space, arr_shape)
+        # A stop of 0 is treated as 1 so that the check `c < e` can still pass for index 0
+        end = [s.stop if s.stop != 0 else 1 for s in numeric_iteration_slice]

+        if self._dim < 4:
+            conditions = [c < e for c, e in zip(self.coordinates, end)]
+        else:
+            # In the 4D case cuda index k is paired with iteration_space[k + 1] ...
+            coordinates = [c + iter_slice.start for c, iter_slice in zip(self.cuda_indices, self._iteration_space[1:])]
+            # ... so it has to be bounded by end[k + 1] as well. Zipping against `end` directly would
+            # check the indices of axis 2 and 3 against the stops of axis 1 and 2.
+            conditions = [c < e for c, e in zip(coordinates, end[1:])]
+            # The first cuda index runs over the flattened axes 0 and 1 and gets the combined bound
+            conditions[0] = coordinates[0] < (end[0] * end[1])
        condition = conditions[0]
        for c in conditions[1:]:
            condition = sp.And(condition, c)
        return Block([Conditional(condition, kernel_content)])

-    def iteration_space(self, arr_shape):
-        return _iteration_space(self._iterationSlice, arr_shape)
+    def numeric_iteration_space(self, arr_shape):
+        """Iteration space with symbolic start/stop bounds resolved against the concrete ``arr_shape``."""
+        return _get_numeric_iteration_slice(self._iteration_space, arr_shape)

    def limit_block_size_by_register_restriction(self, block_size, required_registers_per_thread):
        """Shrinks the block_size if there are too many registers used per block.
 @@ -246,38 +296,45 @@ class LineIndexing(AbstractIndexing):
    The fastest coordinate is indexed with thread_idx.x, the remaining coordinates are mapped to block_idx.{x,y,z}
    This indexing scheme supports up to 4 spatial dimensions, where the innermost dimensions is not larger than the
    maximum amount of threads allowed in a GPU block (which depends on device).
+
+    Args:
+        iteration_space: list of slices to determine start, stop and the step size for each coordinate
+        data_layout: tuple to determine the fast and slow coordinates.
    """

-    def __init__(self, field, iteration_slice):
-        available_indices = [THREAD_IDX[0]] + BLOCK_IDX
-        if field.spatial_dimensions > 4:
+    def __init__(self, iteration_space: Tuple[slice], data_layout: Tuple):
+        """Set up line indexing for the given iteration space.
+
+        Raises:
+            NotImplementedError: if the iteration space has more than four slices
+        """
+        super(LineIndexing, self).__init__(iteration_space, data_layout)
+
+        if len(iteration_space) > 4:
            raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions")

-        coordinates = available_indices[:field.spatial_dimensions]
+    @property
+    def cuda_indices(self):
+        """GPU index symbols in coordinate order.
+
+        Starts from ``THREAD_IDX[0]`` followed by the ``BLOCK_IDX`` symbols, cut to ``dim`` entries. The
+        entry at position 0 is then swapped with the entry at position ``data_layout[-1]``, so the thread
+        index ends up on that coordinate. The list is rebuilt on every access.
+        """
+        available_indices = [THREAD_IDX[0]] + BLOCK_IDX
+        coordinates = available_indices[:self.dim]

-        fastest_coordinate = field.layout[-1]
+        fastest_coordinate = self.data_layout[-1]
        coordinates[0], coordinates[fastest_coordinate] = coordinates[fastest_coordinate], coordinates[0]

-        self._coordinates = coordinates
-        self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
-        self._symbolicShape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]
+        return coordinates

    @property
    def coordinates(self):
-        return [i + offset for i, offset in zip(self._coordinates, _get_start_from_slice(self._iterationSlice))]
+        """Coordinate expressions: each entry of ``cuda_indices`` shifted by the start of the matching slice."""
+        return [i + o.start for i, o in zip(self.cuda_indices, self._iteration_space)]

-    def call_parameters(self, arr_shape):
-        substitution_dict = {sym: value for sym, value in zip(self._symbolicShape, arr_shape) if sym is not None}
+    def add_loop_ctr_assignments(self, assignments, loop_counter_symbols):
+        """Return ``assignments`` with the loop counter assignments of this indexing scheme put in front.
+
+        Args:
+            assignments: list of assignments (it is concatenated with ``+``, so it has to be a list)
+            loop_counter_symbols: symbols of the loop counters, one per coordinate
+        """
+        cell_idx_assignments = _loop_ctr_assignments(loop_counter_symbols, self.coordinates, self._iteration_space)
+        return cell_idx_assignments + assignments

-        widths = [end - start for start, end in zip(_get_start_from_slice(self._iterationSlice),
-                                                    _get_end_from_slice(self._iterationSlice, arr_shape))]
-        widths = sp.Matrix(widths).subs(substitution_dict)
+    def call_parameters(self, arr_shape):
+        numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space, arr_shape)
+        widths = _get_widths(numeric_iteration_slice)

        def get_shape_of_cuda_idx(cuda_idx):
-            if cuda_idx not in self._coordinates:
+            if cuda_idx not in self.cuda_indices:
                return 1
            else:
-                idx = self._coordinates.index(cuda_idx)
+                idx = self.cuda_indices.index(cuda_idx)
                return widths[idx]

        return {'block': tuple([get_shape_of_cuda_idx(idx) for idx in THREAD_IDX]),
 @@ -292,50 +349,65 @@ class LineIndexing(AbstractIndexing):
    def symbolic_parameters(self):
        return set()

-    def iteration_space(self, arr_shape):
-        return _iteration_space(self._iterationSlice, arr_shape)
+    def numeric_iteration_space(self, arr_shape):
+        """Iteration space with symbolic start/stop bounds resolved against the concrete ``arr_shape``."""
+        return _get_numeric_iteration_slice(self._iteration_space, arr_shape)


 # -------------------------------------- Helper functions --------------------------------------------------------------

-def _get_start_from_slice(iteration_slice):
+def _get_numeric_iteration_slice(iteration_slice, arr_shape):
+    """Resolve symbolic slice bounds against a concrete array shape.
+
+    Slices and shape entries are paired with ``zip``, so surplus entries of the longer sequence are ignored.
+    A start or stop that is not a Python ``int`` has to contain exactly one free symbol; that symbol is
+    substituted by the shape entry of the same axis. The step has to be a Python ``int``.
+
+    Args:
+        iteration_slice: sequence of slices, one per coordinate
+        arr_shape: concrete extent for each coordinate
+    Returns:
+        list of slices with the substituted bounds
+    """
    res = []
-    for slice_component in iteration_slice:
-        if type(slice_component) is slice:
-            res.append(slice_component.start if slice_component.start is not None else 0)
-        else:
-            assert isinstance(slice_component, int)
-            res.append(slice_component)
+    for slice_component, shape in zip(iteration_slice, arr_shape):
+        result_slice = slice_component
+        if not isinstance(result_slice.start, int):
+            start = result_slice.start
+            # A non-int bound without any free symbol (e.g. a symbolic constant) fails this check as well
+            assert len(start.free_symbols) == 1
+            start = start.subs({symbol: shape for symbol in start.free_symbols})
+            result_slice = slice(start, result_slice.stop, result_slice.step)
+        if not isinstance(result_slice.stop, int):
+            stop = result_slice.stop
+            assert len(stop.free_symbols) == 1
+            stop = stop.subs({symbol: shape for symbol in stop.free_symbols})
+            result_slice = slice(result_slice.start, stop, result_slice.step)
+        assert isinstance(result_slice.step, int)
+        res.append(result_slice)
    return res


-def _get_end_from_slice(iteration_slice, arr_shape):
-    iter_slice = normalize_slice(iteration_slice, arr_shape)
-    res = []
-    for slice_component in iter_slice:
-        if type(slice_component) is slice:
-            res.append(slice_component.stop)
+def _get_widths(iteration_slice):
+    """Number of iterations along every coordinate of the iteration space.
+
+    For a step of 1 the width is ``stop - start``, where an empty range is reported as width 1.
+    For any other step the width is ``(stop - start) / step`` rounded up.
+
+    Args:
+        iteration_slice: sequence of slices whose step is a Python ``int``
+    Returns:
+        list with one width per slice
+    """
+    widths = []
+    for iter_slice in iteration_slice:
+        step = iter_slice.step
+        assert isinstance(step, int), f"Step can only be of type int not of type {type(step)}"
+        start = iter_slice.start
+        stop = iter_slice.stop
+        if step == 1:
+            if stop - start == 0:
+                widths.append(1)
+            else:
+                widths.append(stop - start)
        else:
-            assert isinstance(slice_component, int)
-            res.append(slice_component + 1)
-    return res
+            if isinstance(start, int) and isinstance(stop, int):
+                # Exact integer ceil-division. True division always yields a float for Python ints, so a
+                # check for an int quotient can never succeed, and the float round trip may lose precision.
+                widths.append(-((start - stop) // step))
+            else:
+                width = (stop - start) / step
+                if isinstance(width, float):
+                    widths.append(math.ceil(width))
+                else:
+                    # Symbolic bounds: keep the rounding symbolic
+                    widths.append(div_ceil(stop - start, step))
+    return widths


-def _get_steps_from_slice(iteration_slice):
-    res = []
-    for slice_component in iteration_slice:
-        if type(slice_component) is slice:
-            res.append(slice_component.step)
+def _loop_ctr_assignments(loop_counter_symbols, coordinates, iteration_space):
+    """Create one assignment per loop counter that defines it through the given coordinate expression.
+
+    Per coordinate: a slice with a step larger than 1 assigns ``coordinate * step``, a slice whose start
+    equals its stop assigns 0, everything else assigns the coordinate itself.
+
+    Args:
+        loop_counter_symbols: symbols of the loop counters
+        coordinates: coordinate expressions, paired with the loop counters by position
+        iteration_space: slices, paired with the loop counters by position
+    Returns:
+        list of SympyAssignment objects, one per zipped triple
+    """
+    loop_ctr_assignments = []
+    for loop_counter, coordinate, iter_slice in zip(loop_counter_symbols, coordinates, iteration_space):
+        # NOTE(review): the indexing classes in this module pass coordinates that already contain the
+        # start of the slice, so the start is multiplied by the step here as well - confirm this is intended.
+        if isinstance(iter_slice, slice) and iter_slice.step > 1:
+            loop_ctr_assignments.append(SympyAssignment(loop_counter, coordinate * iter_slice.step))
+        elif iter_slice.start == iter_slice.stop:
+            loop_ctr_assignments.append(SympyAssignment(loop_counter, 0))
        else:
-            res.append(1)
-    return res
-
+            loop_ctr_assignments.append(SympyAssignment(loop_counter, coordinate))

-def _iteration_space(iteration_slice, arr_shape):
-    starts = _get_start_from_slice(iteration_slice)
-    ends = _get_end_from_slice(iteration_slice, arr_shape)
-    steps = _get_steps_from_slice(iteration_slice)
-    return [slice(start, end, step) for start, end, step in zip(starts, ends, steps)]
+    return loop_ctr_assignments


 def indexing_creator_from_params(gpu_indexing, gpu_indexing_params):