From fd6ff7ef04d4e1f11d533cab6a8a7b0230ca32d3 Mon Sep 17 00:00:00 2001
From: Markus Holzer <markus.holzer@fau.de>
Date: Sat, 12 Aug 2023 10:27:28 +0200
Subject: [PATCH] Improved slicing

---
 pystencils/gpu/indexing.py          | 66 +++++++++++++++++++++++++----
 pystencils/gpu/kernelcreation.py    | 19 +++------
 pystencils_tests/test_buffer_gpu.py |  9 ++--
 3 files changed, 71 insertions(+), 23 deletions(-)

diff --git a/pystencils/gpu/indexing.py b/pystencils/gpu/indexing.py
index 133934be3..8ea09a8d4 100644
--- a/pystencils/gpu/indexing.py
+++ b/pystencils/gpu/indexing.py
@@ -6,7 +6,7 @@ from typing import Tuple
 import sympy as sp
 from sympy.core.cache import cacheit
 
-from pystencils.astnodes import Block, Conditional
+from pystencils.astnodes import Block, Conditional, SympyAssignment
 from pystencils.typing import TypedSymbol, create_type
 from pystencils.integer_functions import div_ceil, div_floor
 from pystencils.sympyextensions import is_integer_sequence, prod
@@ -75,6 +75,17 @@ class AbstractIndexing(abc.ABC):
         """Sympy symbols for GPU's block and thread indices, and block and grid dimensions. """
         return BLOCK_IDX + THREAD_IDX + BLOCK_DIM + GRID_DIM
 
+    @abc.abstractmethod
+    def add_loop_ctr_assignments(self, assignments, loop_counter_symbols):
+        """Adds assignments for the loop counter symbols depending on the gpu threads.
+
+        Args:
+            assignments: list of assignments
+            loop_counter_symbols: typed symbols representing the loop counters
+        Returns:
+            assignments with assignments for the loop counters
+        """
+
     @abc.abstractmethod
     def call_parameters(self, arr_shape):
         """Determine grid and block size for kernel call.
@@ -167,10 +178,13 @@ class BlockIndexing(AbstractIndexing):
         coordinates = [c + iter_slice.start for c, iter_slice in zip(self.cuda_indices, self._iteration_space)]
         return coordinates[:self._dim]
 
+    def add_loop_ctr_assignments(self, assignments, loop_counter_symbols):
+        cell_idx_assignments = _loop_ctr_assignments(loop_counter_symbols, self.coordinates, self._iteration_space)
+        return cell_idx_assignments + assignments
+
     def call_parameters(self, arr_shape):
         numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space, arr_shape)
-        widths = [s.stop - s.start for s in numeric_iteration_slice]
-        widths = [w if w != 0 else 1 for w in widths]
+        widths = _get_widths(numeric_iteration_slice)
 
         extend_bs = (1,) * (3 - len(self._block_size))
         block_size = self._block_size + extend_bs
@@ -196,10 +210,6 @@ class BlockIndexing(AbstractIndexing):
         end = [s.stop if s.stop != 0 else 1 for s in numeric_iteration_slice]
 
         conditions = [c < e for c, e in zip(self.coordinates, end)]
-        for cuda_index, iter_slice in zip(self.cuda_indices, self._iteration_space):
-            if isinstance(iter_slice, slice) and iter_slice.step > 1:
-                conditions.append(sp.Eq(sp.Mod(cuda_index, iter_slice.step), 0))
-
         condition = conditions[0]
         for c in conditions[1:]:
             condition = sp.And(condition, c)
@@ -294,9 +304,13 @@ class LineIndexing(AbstractIndexing):
     def coordinates(self):
         return [i + o.start for i, o in zip(self.cuda_indices, self._iteration_space)]
 
+    def add_loop_ctr_assignments(self, assignments, loop_counter_symbols):
+        cell_idx_assignments = _loop_ctr_assignments(loop_counter_symbols, self.coordinates, self._iteration_space)
+        return cell_idx_assignments + assignments
+
     def call_parameters(self, arr_shape):
         numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space, arr_shape)
-        widths = [len(range(*s.indices(s.stop))) for s in numeric_iteration_slice]
+        widths = _get_widths(numeric_iteration_slice)
 
         def get_shape_of_cuda_idx(cuda_idx):
             if cuda_idx not in self.cuda_indices:
@@ -342,6 +356,42 @@ def _get_numeric_iteration_slice(iteration_slice, arr_shape):
     return res
 
 
+def _get_widths(iteration_slice):
+    widths = []
+    for iter_slice in iteration_slice:
+        step = iter_slice.step
+        assert isinstance(step, int), f"Step can only be of type int not of type {type(step)}"
+        start = iter_slice.start
+        stop = iter_slice.stop
+        if step == 1:
+            if stop - start == 0:
+                widths.append(1)
+            else:
+                widths.append(stop - start)
+        else:
+            width = (stop - start) / step
+            if isinstance(width, int):
+                widths.append(width)
+            elif isinstance(width, float):
+                widths.append(math.ceil(width))
+            else:
+                widths.append(div_ceil(stop - start, step))
+    return widths
+
+
+def _loop_ctr_assignments(loop_counter_symbols, coordinates, iteration_space):
+    loop_ctr_assignments = []
+    for loop_counter, coordinate, iter_slice in zip(loop_counter_symbols, coordinates, iteration_space):
+        if isinstance(iter_slice, slice) and iter_slice.step > 1:
+            loop_ctr_assignments.append(SympyAssignment(loop_counter, coordinate * iter_slice.step))
+        elif iter_slice.start == iter_slice.stop:
+            loop_ctr_assignments.append(SympyAssignment(loop_counter, 0))
+        else:
+            loop_ctr_assignments.append(SympyAssignment(loop_counter, coordinate))
+
+    return loop_ctr_assignments
+
+
 def indexing_creator_from_params(gpu_indexing, gpu_indexing_params):
     if isinstance(gpu_indexing, str):
         if gpu_indexing == 'block':
diff --git a/pystencils/gpu/kernelcreation.py b/pystencils/gpu/kernelcreation.py
index 3c70de0fb..23fd02694 100644
--- a/pystencils/gpu/kernelcreation.py
+++ b/pystencils/gpu/kernelcreation.py
@@ -70,18 +70,12 @@ def create_cuda_kernel(assignments: Union[AssignmentCollection, NodeCollection],
         iteration_space = normalize_slice(iteration_slice, common_shape)
 
     iteration_space = tuple([s if isinstance(s, slice) else slice(s, s, 1) for s in iteration_space])
+    loop_counter_symbols = [LoopOverCoordinate.get_loop_counter_symbol(i) for i in range(len(iteration_space))]
 
     indexing = indexing_creator(iteration_space=iteration_space, data_layout=common_field.layout)
-    coord_mapping = indexing.coordinates
-
-    cell_idx_assignments = [SympyAssignment(LoopOverCoordinate.get_loop_counter_symbol(i), value)
-                            for i, value in enumerate(coord_mapping)]
-    cell_idx_symbols = [LoopOverCoordinate.get_loop_counter_symbol(i) for i, _ in enumerate(coord_mapping)]
-    assignments = cell_idx_assignments + assignments
-
-    block = Block(assignments)
+    assignments = indexing.add_loop_ctr_assignments(assignments, loop_counter_symbols)
+    block = indexing.guard(Block(assignments), common_shape)
 
-    block = indexing.guard(block, common_shape)
     unify_shape_symbols(block, common_shape=common_shape, fields=fields_without_buffers)
 
     ast = KernelFunction(block,
@@ -98,10 +92,11 @@ def create_cuda_kernel(assignments: Union[AssignmentCollection, NodeCollection],
                                                          f.spatial_dimensions, f.index_dimensions)
                          for f in all_fields}
 
-    coord_mapping = {f.name: cell_idx_symbols for f in all_fields}
+    coord_mapping = {f.name: loop_counter_symbols for f in all_fields}
 
     if any(FieldType.is_buffer(f) for f in all_fields):
-        resolve_buffer_accesses(ast, get_base_buffer_index(ast, cell_idx_symbols, iteration_space), read_only_fields)
+        base_buffer_index = get_base_buffer_index(ast, loop_counter_symbols, iteration_space)
+        resolve_buffer_accesses(ast, base_buffer_index, read_only_fields)
 
     resolve_field_accesses(ast, read_only_fields, field_to_base_pointer_info=base_pointer_info,
                            field_to_fixed_coordinates=coord_mapping)
@@ -153,7 +148,7 @@ def created_indexed_cuda_kernel(assignments: Union[AssignmentCollection, NodeCol
             data_type = ind_f.dtype
             if data_type.has_element(name):
                 rhs = ind_f[0](name)
-                lhs = TypedSymbol(name, np.int64)
+                lhs = TypedSymbol(name, data_type.get_element_type(name))
                 return SympyAssignment(lhs, rhs)
         raise ValueError(f"Index {name} not found in any of the passed index fields")
 
diff --git a/pystencils_tests/test_buffer_gpu.py b/pystencils_tests/test_buffer_gpu.py
index d2ec6808e..944488d97 100644
--- a/pystencils_tests/test_buffer_gpu.py
+++ b/pystencils_tests/test_buffer_gpu.py
@@ -275,7 +275,8 @@ def test_buffer_indexing():
     assert len(spatial_shape_symbols) <= 3
 
 
-def test_iteration_slices():
+@pytest.mark.parametrize('gpu_indexing', ("block", "line"))
+def test_iteration_slices(gpu_indexing):
     num_cell_values = 19
     dt = np.uint64
     fields = _generate_fields(dt=dt, stencil_directions=num_cell_values)
@@ -303,7 +304,8 @@ def test_iteration_slices():
         gpu_dst_arr.fill(0)
 
         config = CreateKernelConfig(target=Target.GPU, iteration_slice=pack_slice,
-                                    data_type={'src_field': gpu_src_arr.dtype, 'buffer': gpu_buffer_arr.dtype})
+                                    data_type={'src_field': gpu_src_arr.dtype, 'buffer': gpu_buffer_arr.dtype},
+                                    gpu_indexing=gpu_indexing)
 
         pack_code = create_kernel(pack_eqs, config=config)
         pack_kernel = pack_code.compile()
@@ -316,7 +318,8 @@ def test_iteration_slices():
             unpack_eqs.append(eq)
 
         config = CreateKernelConfig(target=Target.GPU, iteration_slice=pack_slice,
-                                    data_type={'dst_field': gpu_dst_arr.dtype, 'buffer': gpu_buffer_arr.dtype})
+                                    data_type={'dst_field': gpu_dst_arr.dtype, 'buffer': gpu_buffer_arr.dtype},
+                                    gpu_indexing=gpu_indexing)
 
         unpack_code = create_kernel(unpack_eqs, config=config)
         unpack_kernel = unpack_code.compile()
-- 
GitLab