From fd6ff7ef04d4e1f11d533cab6a8a7b0230ca32d3 Mon Sep 17 00:00:00 2001 From: Markus Holzer <markus.holzer@fau.de> Date: Sat, 12 Aug 2023 10:27:28 +0200 Subject: [PATCH] Improved slicing --- pystencils/gpu/indexing.py | 66 +++++++++++++++++++++++++---- pystencils/gpu/kernelcreation.py | 19 +++------ pystencils_tests/test_buffer_gpu.py | 9 ++-- 3 files changed, 71 insertions(+), 23 deletions(-) diff --git a/pystencils/gpu/indexing.py b/pystencils/gpu/indexing.py index 133934be3..8ea09a8d4 100644 --- a/pystencils/gpu/indexing.py +++ b/pystencils/gpu/indexing.py @@ -6,7 +6,7 @@ from typing import Tuple import sympy as sp from sympy.core.cache import cacheit -from pystencils.astnodes import Block, Conditional +from pystencils.astnodes import Block, Conditional, SympyAssignment from pystencils.typing import TypedSymbol, create_type from pystencils.integer_functions import div_ceil, div_floor from pystencils.sympyextensions import is_integer_sequence, prod @@ -75,6 +75,17 @@ class AbstractIndexing(abc.ABC): """Sympy symbols for GPU's block and thread indices, and block and grid dimensions. """ return BLOCK_IDX + THREAD_IDX + BLOCK_DIM + GRID_DIM + @abc.abstractmethod + def add_loop_ctr_assignments(self, assignments, loop_counter_symbols): + """Adds assignments for the loop counter symbols depending on the gpu threads. + + Args: + assignments: list of assignments + loop_counter_symbols: typed symbols representing the loop counters + Returns: + assignments with assignments for the loop counters + """ + @abc.abstractmethod def call_parameters(self, arr_shape): """Determine grid and block size for kernel call. @@ -167,10 +178,13 @@ class BlockIndexing(AbstractIndexing): coordinates = [c + iter_slice.start for c, iter_slice in zip(self.cuda_indices, self._iteration_space)] return coordinates[:self._dim] + def add_loop_ctr_assignments(self, assignments, loop_counter_symbols): + cell_idx_assignments = _loop_ctr_assignments(loop_counter_symbols, self.coordinates, self._iteration_space) + return cell_idx_assignments + assignments + def call_parameters(self, arr_shape): numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space, arr_shape) - widths = [s.stop - s.start for s in numeric_iteration_slice] - widths = [w if w != 0 else 1 for w in widths] + widths = _get_widths(numeric_iteration_slice) extend_bs = (1,) * (3 - len(self._block_size)) block_size = self._block_size + extend_bs @@ -196,10 +210,6 @@ class BlockIndexing(AbstractIndexing): end = [s.stop if s.stop != 0 else 1 for s in numeric_iteration_slice] conditions = [c < e for c, e in zip(self.coordinates, end)] - for cuda_index, iter_slice in zip(self.cuda_indices, self._iteration_space): - if isinstance(iter_slice, slice) and iter_slice.step > 1: - conditions.append(sp.Eq(sp.Mod(cuda_index, iter_slice.step), 0)) - condition = conditions[0] for c in conditions[1:]: condition = sp.And(condition, c) @@ -294,9 +304,13 @@ class LineIndexing(AbstractIndexing): def coordinates(self): return [i + o.start for i, o in zip(self.cuda_indices, self._iteration_space)] + def add_loop_ctr_assignments(self, assignments, loop_counter_symbols): + cell_idx_assignments = _loop_ctr_assignments(loop_counter_symbols, self.coordinates, self._iteration_space) + return cell_idx_assignments + assignments + def call_parameters(self, arr_shape): numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space, arr_shape) - widths = [len(range(*s.indices(s.stop))) for s in numeric_iteration_slice] + widths = _get_widths(numeric_iteration_slice) def get_shape_of_cuda_idx(cuda_idx): if cuda_idx not in self.cuda_indices: @@ -342,6 +356,42 @@ def _get_numeric_iteration_slice(iteration_slice, arr_shape): return res +def _get_widths(iteration_slice): + widths = [] + for iter_slice in iteration_slice: + step = iter_slice.step + assert isinstance(step, int), f"Step can only be of type int not of type {type(step)}" + start = iter_slice.start + stop = iter_slice.stop + if step == 1: + if stop - start == 0: + widths.append(1) + else: + widths.append(stop - start) + else: + width = (stop - start) / step + if isinstance(width, int): + widths.append(width) + elif isinstance(width, float): + widths.append(math.ceil(width)) + else: + widths.append(div_ceil(stop - start, step)) + return widths + + +def _loop_ctr_assignments(loop_counter_symbols, coordinates, iteration_space): + loop_ctr_assignments = [] + for loop_counter, coordinate, iter_slice in zip(loop_counter_symbols, coordinates, iteration_space): + if isinstance(iter_slice, slice) and iter_slice.step > 1: + loop_ctr_assignments.append(SympyAssignment(loop_counter, coordinate * iter_slice.step)) + elif iter_slice.start == iter_slice.stop: + loop_ctr_assignments.append(SympyAssignment(loop_counter, 0)) + else: + loop_ctr_assignments.append(SympyAssignment(loop_counter, coordinate)) + + return loop_ctr_assignments + + def indexing_creator_from_params(gpu_indexing, gpu_indexing_params): if isinstance(gpu_indexing, str): if gpu_indexing == 'block': diff --git a/pystencils/gpu/kernelcreation.py b/pystencils/gpu/kernelcreation.py index 3c70de0fb..23fd02694 100644 --- a/pystencils/gpu/kernelcreation.py +++ b/pystencils/gpu/kernelcreation.py @@ -70,18 +70,12 @@ def create_cuda_kernel(assignments: Union[AssignmentCollection, NodeCollection], iteration_space = normalize_slice(iteration_slice, common_shape) iteration_space = tuple([s if isinstance(s, slice) else slice(s, s, 1) for s in iteration_space]) + loop_counter_symbols = [LoopOverCoordinate.get_loop_counter_symbol(i) for i in range(len(iteration_space))] indexing = indexing_creator(iteration_space=iteration_space, data_layout=common_field.layout) - coord_mapping = indexing.coordinates - - cell_idx_assignments = [SympyAssignment(LoopOverCoordinate.get_loop_counter_symbol(i), value) - for i, value in enumerate(coord_mapping)] - cell_idx_symbols = [LoopOverCoordinate.get_loop_counter_symbol(i) for i, _ in enumerate(coord_mapping)] - assignments = cell_idx_assignments + assignments - - block = Block(assignments) + assignments = indexing.add_loop_ctr_assignments(assignments, loop_counter_symbols) + block = indexing.guard(Block(assignments), common_shape) - block = indexing.guard(block, common_shape) unify_shape_symbols(block, common_shape=common_shape, fields=fields_without_buffers) ast = KernelFunction(block, @@ -98,10 +92,11 @@ def create_cuda_kernel(assignments: Union[AssignmentCollection, NodeCollection], f.spatial_dimensions, f.index_dimensions) for f in all_fields} - coord_mapping = {f.name: cell_idx_symbols for f in all_fields} + coord_mapping = {f.name: loop_counter_symbols for f in all_fields} if any(FieldType.is_buffer(f) for f in all_fields): - resolve_buffer_accesses(ast, get_base_buffer_index(ast, cell_idx_symbols, iteration_space), read_only_fields) + base_buffer_index = get_base_buffer_index(ast, loop_counter_symbols, iteration_space) + resolve_buffer_accesses(ast, base_buffer_index, read_only_fields) resolve_field_accesses(ast, read_only_fields, field_to_base_pointer_info=base_pointer_info, field_to_fixed_coordinates=coord_mapping) @@ -153,7 +148,7 @@ def created_indexed_cuda_kernel(assignments: Union[AssignmentCollection, NodeCol data_type = ind_f.dtype if data_type.has_element(name): rhs = ind_f[0](name) - lhs = TypedSymbol(name, np.int64) + lhs = TypedSymbol(name, data_type.get_element_type(name)) return SympyAssignment(lhs, rhs) raise ValueError(f"Index {name} not found in any of the passed index fields") diff --git a/pystencils_tests/test_buffer_gpu.py b/pystencils_tests/test_buffer_gpu.py index d2ec6808e..944488d97 100644 --- a/pystencils_tests/test_buffer_gpu.py +++ b/pystencils_tests/test_buffer_gpu.py @@ -275,7 +275,8 @@ def test_buffer_indexing(): assert len(spatial_shape_symbols) <= 3 -def test_iteration_slices(): +@pytest.mark.parametrize('gpu_indexing', ("block", "line")) +def test_iteration_slices(gpu_indexing): num_cell_values = 19 dt = np.uint64 fields = _generate_fields(dt=dt, stencil_directions=num_cell_values) @@ -303,7 +304,8 @@ def test_iteration_slices(): gpu_dst_arr.fill(0) config = CreateKernelConfig(target=Target.GPU, iteration_slice=pack_slice, - data_type={'src_field': gpu_src_arr.dtype, 'buffer': gpu_buffer_arr.dtype}) + data_type={'src_field': gpu_src_arr.dtype, 'buffer': gpu_buffer_arr.dtype}, + gpu_indexing=gpu_indexing) pack_code = create_kernel(pack_eqs, config=config) pack_kernel = pack_code.compile() @@ -316,7 +318,8 @@ def test_iteration_slices(): unpack_eqs.append(eq) config = CreateKernelConfig(target=Target.GPU, iteration_slice=pack_slice, - data_type={'dst_field': gpu_dst_arr.dtype, 'buffer': gpu_buffer_arr.dtype}) + data_type={'dst_field': gpu_dst_arr.dtype, 'buffer': gpu_buffer_arr.dtype}, + gpu_indexing=gpu_indexing) unpack_code = create_kernel(unpack_eqs, config=config) unpack_kernel = unpack_code.compile() -- GitLab