diff --git a/pystencils/gpu/indexing.py b/pystencils/gpu/indexing.py index 05837445af9649906498ee988b18a4bb973804dc..27cb39e454913db4faab579dc366a879c1eb04b7 100644 --- a/pystencils/gpu/indexing.py +++ b/pystencils/gpu/indexing.py @@ -1,14 +1,14 @@ import abc from functools import partial import math +from typing import List, Tuple import sympy as sp from sympy.core.cache import cacheit -from pystencils.astnodes import Block, Conditional +from pystencils.astnodes import Block, Conditional, SympyAssignment from pystencils.typing import TypedSymbol, create_type from pystencils.integer_functions import div_ceil, div_floor -from pystencils.slicing import normalize_slice from pystencils.sympyextensions import is_integer_sequence, prod @@ -33,12 +33,37 @@ GRID_DIM = [ThreadIndexingSymbol("gridDim." + coord, create_type("int32")) for c class AbstractIndexing(abc.ABC): """ - Abstract base class for all Indexing classes. An Indexing class defines how a multidimensional - field is mapped to GPU's block and grid system. It calculates indices based on GPU's thread and block indices - and computes the number of blocks and threads a kernel is started with. The Indexing class is created with - a pystencils field, a slice to iterate over, and further optional parameters that must have default values. + Abstract base class for all Indexing classes. An Indexing class defines how an iteration space is mapped + to GPU's block and grid system. It calculates indices based on GPU's thread and block indices + and computes the number of blocks and threads a kernel is started with. + The Indexing class is created with an iteration space that is given as list of slices to determine start, stop + and the step size for each coordinate. Further the data_layout is given as tuple to determine the fast and slow + coordinates. This is important to get an optimal mapping of coordinates to GPU threads. """ + def __init__(self, iteration_space: Tuple[slice], data_layout: Tuple): + for iter_space in iteration_space: + assert isinstance(iter_space, slice), f"iteration_space must be of type Tuple[slice], " \ + f"not tuple of type {type(iter_space)}" + self._iteration_space = iteration_space + self._data_layout = data_layout + self._dim = len(iteration_space) + + @property + def iteration_space(self): + """Iteration space to loop over""" + return self._iteration_space + + @property + def data_layout(self): + """Data layout of the kernels arrays""" + return self._data_layout + + @property + def dim(self): + """Number of spatial dimensions""" + return self._dim + @property @abc.abstractmethod def coordinates(self): @@ -50,6 +75,16 @@ class AbstractIndexing(abc.ABC): """Sympy symbols for GPU's block and thread indices, and block and grid dimensions. """ return BLOCK_IDX + THREAD_IDX + BLOCK_DIM + GRID_DIM + @abc.abstractmethod + def get_loop_ctr_assignments(self, loop_counter_symbols) -> List[SympyAssignment]: + """Adds assignments for the loop counter symbols depending on the gpu threads. + + Args: + loop_counter_symbols: typed symbols representing the loop counters + Returns: + assignments for the loop counters + """ + @abc.abstractmethod def call_parameters(self, arr_shape): """Determine grid and block size for kernel call. @@ -92,8 +127,9 @@ class BlockIndexing(AbstractIndexing): """Generic indexing scheme that maps sub-blocks of an array to GPU blocks. Args: - field: pystencils field (common to all Indexing classes) - iteration_slice: slice that defines rectangular subarea which is iterated over + iteration_space: list of slices to determine start, stop and the step size for each coordinate + data_layout: tuple specifying loop order with innermost loop last. + This is the same format as returned by `Field.layout`. permute_block_size_dependent_on_layout: if True the block_size is permuted such that the fastest coordinate gets the largest amount of threads compile_time_block_size: compile in concrete block size, otherwise the gpu variable 'blockDim' is used @@ -102,14 +138,16 @@ class BlockIndexing(AbstractIndexing): device_number: device number of the used GPU. By default, the zeroth device is used. """ - def __init__(self, field, iteration_slice, - block_size=(16, 16, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False, + def __init__(self, iteration_space: Tuple[slice], data_layout: Tuple[int], + block_size=(128, 2, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False, maximum_block_size=(1024, 1024, 64), device_number=None): - if field.spatial_dimensions > 3: - raise NotImplementedError("This indexing scheme supports at most 3 spatial dimensions") + super(BlockIndexing, self).__init__(iteration_space, data_layout) - if permute_block_size_dependent_on_layout: - block_size = self.permute_block_size_according_to_layout(block_size, field.layout) + if self._dim > 4: + raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions") + + if permute_block_size_dependent_on_layout and self._dim < 4: + block_size = self.permute_block_size_according_to_layout(block_size, data_layout) self._block_size = block_size if maximum_block_size == 'auto': @@ -124,9 +162,6 @@ class BlockIndexing(AbstractIndexing): maximum_block_size = tuple(da[f"MaxBlockDim{c}"] for c in ["X", "Y", "Z"]) self._maximum_block_size = maximum_block_size - self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape) - self._dim = field.spatial_dimensions - self._symbolic_shape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape] self._compile_time_block_size = compile_time_block_size self._device_number = device_number @@ -140,17 +175,28 @@ class BlockIndexing(AbstractIndexing): @property def coordinates(self): - offsets = _get_start_from_slice(self._iterationSlice) - coordinates = [c + off for c, off in zip(self.cuda_indices, offsets)] + if self._dim < 4: + coordinates = [c + iter_slice.start for c, iter_slice in zip(self.cuda_indices, self._iteration_space)] + return coordinates[:self._dim] + else: + coordinates = list() + width = self._iteration_space[1].stop - self.iteration_space[1].start + coordinates.append(div_floor(self.cuda_indices[0], width)) + coordinates.append(sp.Mod(self.cuda_indices[0], width)) + coordinates.append(self.cuda_indices[1] + self.iteration_space[2].start) + coordinates.append(self.cuda_indices[2] + self.iteration_space[3].start) + return coordinates - return coordinates[:self._dim] + def get_loop_ctr_assignments(self, loop_counter_symbols): + return _loop_ctr_assignments(loop_counter_symbols, self.coordinates, self._iteration_space) def call_parameters(self, arr_shape): - substitution_dict = {sym: value for sym, value in zip(self._symbolic_shape, arr_shape) if sym is not None} + numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space, arr_shape) + widths = _get_widths(numeric_iteration_slice) + + if len(widths) > 3: + widths = [widths[0] * widths[1], widths[2], widths[3]] - widths = [end - start for start, end in zip(_get_start_from_slice(self._iterationSlice), - _get_end_from_slice(self._iterationSlice, arr_shape))] - widths = sp.Matrix(widths).subs(substitution_dict) extend_bs = (1,) * (3 - len(self._block_size)) block_size = self._block_size + extend_bs if not self._compile_time_block_size: @@ -171,20 +217,22 @@ class BlockIndexing(AbstractIndexing): def guard(self, kernel_content, arr_shape): arr_shape = arr_shape[:self._dim] - end = _get_end_from_slice(self._iterationSlice, arr_shape) - - conditions = [c < e for c, e in zip(self.coordinates, end)] - for cuda_index, iter_slice in zip(self.cuda_indices, self._iterationSlice): - if isinstance(iter_slice, slice) and iter_slice.step > 1: - conditions.append(sp.Eq(sp.Mod(cuda_index, iter_slice.step), 0)) + numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space, arr_shape) + end = [s.stop if s.stop != 0 else 1 for s in numeric_iteration_slice] + if self._dim < 4: + conditions = [c < e for c, e in zip(self.coordinates, end)] + else: + end = [end[0] * end[1], end[2], end[3]] + coordinates = [c + iter_slice.start for c, iter_slice in zip(self.cuda_indices, self._iteration_space[1:])] + conditions = [c < e for c, e in zip(coordinates, end)] condition = conditions[0] for c in conditions[1:]: condition = sp.And(condition, c) return Block([Conditional(condition, kernel_content)]) - def iteration_space(self, arr_shape): - return _iteration_space(self._iterationSlice, arr_shape) + def numeric_iteration_space(self, arr_shape): + return _get_numeric_iteration_slice(self._iteration_space, arr_shape) def limit_block_size_by_register_restriction(self, block_size, required_registers_per_thread): """Shrinks the block_size if there are too many registers used per block. @@ -246,38 +294,44 @@ class LineIndexing(AbstractIndexing): The fastest coordinate is indexed with thread_idx.x, the remaining coordinates are mapped to block_idx.{x,y,z} This indexing scheme supports up to 4 spatial dimensions, where the innermost dimensions is not larger than the maximum amount of threads allowed in a GPU block (which depends on device). + + Args: + iteration_space: list of slices to determine start, stop and the step size for each coordinate + data_layout: tuple to determine the fast and slow coordinates. """ - def __init__(self, field, iteration_slice): - available_indices = [THREAD_IDX[0]] + BLOCK_IDX - if field.spatial_dimensions > 4: + def __init__(self, iteration_space: Tuple[slice], data_layout: Tuple): + super(LineIndexing, self).__init__(iteration_space, data_layout) + + if len(iteration_space) > 4: raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions") - coordinates = available_indices[:field.spatial_dimensions] + @property + def cuda_indices(self): + available_indices = [THREAD_IDX[0]] + BLOCK_IDX + coordinates = available_indices[:self.dim] - fastest_coordinate = field.layout[-1] + fastest_coordinate = self.data_layout[-1] coordinates[0], coordinates[fastest_coordinate] = coordinates[fastest_coordinate], coordinates[0] - self._coordinates = coordinates - self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape) - self._symbolicShape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape] + return coordinates @property def coordinates(self): - return [i + offset for i, offset in zip(self._coordinates, _get_start_from_slice(self._iterationSlice))] + return [i + o.start for i, o in zip(self.cuda_indices, self._iteration_space)] - def call_parameters(self, arr_shape): - substitution_dict = {sym: value for sym, value in zip(self._symbolicShape, arr_shape) if sym is not None} + def get_loop_ctr_assignments(self, loop_counter_symbols): + return _loop_ctr_assignments(loop_counter_symbols, self.coordinates, self._iteration_space) - widths = [end - start for start, end in zip(_get_start_from_slice(self._iterationSlice), - _get_end_from_slice(self._iterationSlice, arr_shape))] - widths = sp.Matrix(widths).subs(substitution_dict) + def call_parameters(self, arr_shape): + numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space, arr_shape) + widths = _get_widths(numeric_iteration_slice) def get_shape_of_cuda_idx(cuda_idx): - if cuda_idx not in self._coordinates: + if cuda_idx not in self.cuda_indices: return 1 else: - idx = self._coordinates.index(cuda_idx) + idx = self.cuda_indices.index(cuda_idx) return widths[idx] return {'block': tuple([get_shape_of_cuda_idx(idx) for idx in THREAD_IDX]), @@ -292,50 +346,65 @@ class LineIndexing(AbstractIndexing): def symbolic_parameters(self): return set() - def iteration_space(self, arr_shape): - return _iteration_space(self._iterationSlice, arr_shape) + def numeric_iteration_space(self, arr_shape): + return _get_numeric_iteration_slice(self._iteration_space, arr_shape) # -------------------------------------- Helper functions -------------------------------------------------------------- -def _get_start_from_slice(iteration_slice): +def _get_numeric_iteration_slice(iteration_slice, arr_shape): res = [] - for slice_component in iteration_slice: - if type(slice_component) is slice: - res.append(slice_component.start if slice_component.start is not None else 0) - else: - assert isinstance(slice_component, int) - res.append(slice_component) + for slice_component, shape in zip(iteration_slice, arr_shape): + result_slice = slice_component + if not isinstance(result_slice.start, int): + start = result_slice.start + assert len(start.free_symbols) == 1 + start = start.subs({symbol: shape for symbol in start.free_symbols}) + result_slice = slice(start, result_slice.stop, result_slice.step) + if not isinstance(result_slice.stop, int): + stop = result_slice.stop + assert len(stop.free_symbols) == 1 + stop = stop.subs({symbol: shape for symbol in stop.free_symbols}) + result_slice = slice(result_slice.start, stop, result_slice.step) + assert isinstance(result_slice.step, int) + res.append(result_slice) return res -def _get_end_from_slice(iteration_slice, arr_shape): - iter_slice = normalize_slice(iteration_slice, arr_shape) - res = [] - for slice_component in iter_slice: - if type(slice_component) is slice: - res.append(slice_component.stop) +def _get_widths(iteration_slice): + widths = [] + for iter_slice in iteration_slice: + step = iter_slice.step + assert isinstance(step, int), f"Step can only be of type int not of type {type(step)}" + start = iter_slice.start + stop = iter_slice.stop + if step == 1: + if stop - start == 0: + widths.append(1) + else: + widths.append(stop - start) else: - assert isinstance(slice_component, int) - res.append(slice_component + 1) - return res + width = (stop - start) / step + if isinstance(width, int): + widths.append(width) + elif isinstance(width, float): + widths.append(math.ceil(width)) + else: + widths.append(div_ceil(stop - start, step)) + return widths -def _get_steps_from_slice(iteration_slice): - res = [] - for slice_component in iteration_slice: - if type(slice_component) is slice: - res.append(slice_component.step) +def _loop_ctr_assignments(loop_counter_symbols, coordinates, iteration_space): + loop_ctr_assignments = [] + for loop_counter, coordinate, iter_slice in zip(loop_counter_symbols, coordinates, iteration_space): + if isinstance(iter_slice, slice) and iter_slice.step > 1: + loop_ctr_assignments.append(SympyAssignment(loop_counter, coordinate * iter_slice.step)) + elif iter_slice.start == iter_slice.stop: + loop_ctr_assignments.append(SympyAssignment(loop_counter, 0)) else: - res.append(1) - return res - + loop_ctr_assignments.append(SympyAssignment(loop_counter, coordinate)) -def _iteration_space(iteration_slice, arr_shape): - starts = _get_start_from_slice(iteration_slice) - ends = _get_end_from_slice(iteration_slice, arr_shape) - steps = _get_steps_from_slice(iteration_slice) - return [slice(start, end, step) for start, end, step in zip(starts, ends, steps)] + return loop_ctr_assignments def indexing_creator_from_params(gpu_indexing, gpu_indexing_params): diff --git a/pystencils/gpu/kernelcreation.py b/pystencils/gpu/kernelcreation.py index 066038cde246934d1694ee20613434ac9e3dc678..c0d6e71d05e3a2250249224e3a12e3daa30978d0 100644 --- a/pystencils/gpu/kernelcreation.py +++ b/pystencils/gpu/kernelcreation.py @@ -1,7 +1,5 @@ from typing import Union -import numpy as np - from pystencils.astnodes import Block, KernelFunction, LoopOverCoordinate, SympyAssignment from pystencils.config import CreateKernelConfig from pystencils.typing import StructType, TypedSymbol @@ -12,6 +10,7 @@ from pystencils.gpu.gpujit import make_python_function from pystencils.node_collection import NodeCollection from pystencils.gpu.indexing import indexing_creator_from_params from pystencils.simp.assignment_collection import AssignmentCollection +from pystencils.slicing import normalize_slice from pystencils.transformations import ( get_base_buffer_index, get_common_field, parse_base_pointer_info, resolve_buffer_accesses, resolve_field_accesses, unify_shape_symbols) @@ -64,17 +63,18 @@ def create_cuda_kernel(assignments: Union[AssignmentCollection, NodeCollection], iteration_slice.append(slice(ghost_layers[i][0], -ghost_layers[i][1] if ghost_layers[i][1] > 0 else None)) - indexing = indexing_creator(field=common_field, iteration_slice=iteration_slice) - coord_mapping = indexing.coordinates + iteration_space = normalize_slice(iteration_slice, common_shape) + else: + iteration_space = normalize_slice(iteration_slice, common_shape) - cell_idx_assignments = [SympyAssignment(LoopOverCoordinate.get_loop_counter_symbol(i), value) - for i, value in enumerate(coord_mapping)] - cell_idx_symbols = [LoopOverCoordinate.get_loop_counter_symbol(i) for i, _ in enumerate(coord_mapping)] - assignments = cell_idx_assignments + assignments + iteration_space = tuple([s if isinstance(s, slice) else slice(s, s, 1) for s in iteration_space]) + loop_counter_symbols = [LoopOverCoordinate.get_loop_counter_symbol(i) for i in range(len(iteration_space))] - block = Block(assignments) + indexing = indexing_creator(iteration_space=iteration_space, data_layout=common_field.layout) + loop_counter_assignments = indexing.get_loop_ctr_assignments(loop_counter_symbols) + assignments = loop_counter_assignments + assignments + block = indexing.guard(Block(assignments), common_shape) - block = indexing.guard(block, common_shape) unify_shape_symbols(block, common_shape=common_shape, fields=fields_without_buffers) ast = KernelFunction(block, @@ -91,11 +91,11 @@ def create_cuda_kernel(assignments: Union[AssignmentCollection, NodeCollection], f.spatial_dimensions, f.index_dimensions) for f in all_fields} - coord_mapping = {f.name: cell_idx_symbols for f in all_fields} + coord_mapping = {f.name: loop_counter_symbols for f in all_fields} if any(FieldType.is_buffer(f) for f in all_fields): - iteration_space = indexing.iteration_space(common_shape) - resolve_buffer_accesses(ast, get_base_buffer_index(ast, cell_idx_symbols, iteration_space), read_only_fields) + base_buffer_index = get_base_buffer_index(ast, loop_counter_symbols, iteration_space) + resolve_buffer_accesses(ast, base_buffer_index, read_only_fields) resolve_field_accesses(ast, read_only_fields, field_to_base_pointer_info=base_pointer_info, field_to_fixed_coordinates=coord_mapping) @@ -147,7 +147,7 @@ def created_indexed_cuda_kernel(assignments: Union[AssignmentCollection, NodeCol data_type = ind_f.dtype if data_type.has_element(name): rhs = ind_f[0](name) - lhs = TypedSymbol(name, np.int64) + lhs = TypedSymbol(name, data_type.get_element_type(name)) return SympyAssignment(lhs, rhs) raise ValueError(f"Index {name} not found in any of the passed index fields") @@ -156,8 +156,12 @@ def created_indexed_cuda_kernel(assignments: Union[AssignmentCollection, NodeCol coordinate_typed_symbols = [eq.lhs for eq in coordinate_symbol_assignments] idx_field = list(index_fields)[0] - indexing = indexing_creator(field=idx_field, - iteration_slice=[slice(None, None, None)] * len(idx_field.spatial_shape)) + + iteration_space = normalize_slice(tuple([slice(None, None, None)]) * len(idx_field.spatial_shape), + idx_field.spatial_shape) + + indexing = indexing_creator(iteration_space=iteration_space, + data_layout=idx_field.layout) function_body = Block(coordinate_symbol_assignments + assignments) function_body = indexing.guard(function_body, get_common_field(index_fields).spatial_shape) diff --git a/pystencils_tests/test_buffer_gpu.py b/pystencils_tests/test_buffer_gpu.py index d2ec6808e8db52eb69427aef9a94ce53631047eb..944488d97f182dfd6b961e38e648b4d16b3d19ef 100644 --- a/pystencils_tests/test_buffer_gpu.py +++ b/pystencils_tests/test_buffer_gpu.py @@ -275,7 +275,8 @@ def test_buffer_indexing(): assert len(spatial_shape_symbols) <= 3 -def test_iteration_slices(): +@pytest.mark.parametrize('gpu_indexing', ("block", "line")) +def test_iteration_slices(gpu_indexing): num_cell_values = 19 dt = np.uint64 fields = _generate_fields(dt=dt, stencil_directions=num_cell_values) @@ -303,7 +304,8 @@ def test_iteration_slices(): gpu_dst_arr.fill(0) config = CreateKernelConfig(target=Target.GPU, iteration_slice=pack_slice, - data_type={'src_field': gpu_src_arr.dtype, 'buffer': gpu_buffer_arr.dtype}) + data_type={'src_field': gpu_src_arr.dtype, 'buffer': gpu_buffer_arr.dtype}, + gpu_indexing=gpu_indexing) pack_code = create_kernel(pack_eqs, config=config) pack_kernel = pack_code.compile() @@ -316,7 +318,8 @@ def test_iteration_slices(): unpack_eqs.append(eq) config = CreateKernelConfig(target=Target.GPU, iteration_slice=pack_slice, - data_type={'dst_field': gpu_dst_arr.dtype, 'buffer': gpu_buffer_arr.dtype}) + data_type={'dst_field': gpu_dst_arr.dtype, 'buffer': gpu_buffer_arr.dtype}, + gpu_indexing=gpu_indexing) unpack_code = create_kernel(unpack_eqs, config=config) unpack_kernel = unpack_code.compile() diff --git a/pystencils_tests/test_gpu.py b/pystencils_tests/test_gpu.py index df452e2b1bd2ec55ecc57c0325ba994a3dac894f..04d616d4ead4b84df6ff1ee46dec83d8ce1c5503 100644 --- a/pystencils_tests/test_gpu.py +++ b/pystencils_tests/test_gpu.py @@ -8,7 +8,7 @@ from scipy.ndimage import convolve from pystencils import Assignment, Field, fields, CreateKernelConfig, create_kernel, Target from pystencils.gpu import BlockIndexing from pystencils.simp import sympy_cse_on_assignment_list -from pystencils.slicing import add_ghost_layers, make_slice, remove_ghost_layers +from pystencils.slicing import add_ghost_layers, make_slice, remove_ghost_layers, normalize_slice try: import cupy @@ -164,14 +164,17 @@ def test_periodicity(): @pytest.mark.parametrize("device_number", device_numbers) def test_block_indexing(device_number): f = fields("f: [3D]") - bi = BlockIndexing(f, make_slice[:, :, :], block_size=(16, 8, 2), permute_block_size_dependent_on_layout=False) + s = normalize_slice(make_slice[:, :, :], f.spatial_shape) + bi = BlockIndexing(s, f.layout, block_size=(16, 8, 2), + permute_block_size_dependent_on_layout=False) assert bi.call_parameters((3, 2, 32))['block'] == (3, 2, 32) assert bi.call_parameters((32, 2, 32))['block'] == (16, 2, 8) - bi = BlockIndexing(f, make_slice[:, :, :], block_size=(32, 1, 1), permute_block_size_dependent_on_layout=False) + bi = BlockIndexing(s, f.layout, block_size=(32, 1, 1), + permute_block_size_dependent_on_layout=False) assert bi.call_parameters((1, 16, 16))['block'] == (1, 16, 2) - bi = BlockIndexing(f, make_slice[:, :, :], block_size=(16, 8, 2), + bi = BlockIndexing(s, f.layout, block_size=(16, 8, 2), maximum_block_size="auto", device_number=device_number) # This function should be used if number of needed registers is known. Can be determined with func.num_regs @@ -187,3 +190,23 @@ def test_block_indexing(device_number): assert np.prod(blocks) * registers_per_thread < max_registers_per_block + +@pytest.mark.parametrize('gpu_indexing', ("block", "line")) +@pytest.mark.parametrize('layout', ("C", "F")) +@pytest.mark.parametrize('shape', ((5, 5, 5, 5), (3, 17, 387, 4), (23, 44, 21, 11))) +def test_four_dimensional_kernel(gpu_indexing, layout, shape): + n_elements = np.prod(shape) + + arr_cpu = np.arange(n_elements, dtype=np.float64).reshape(shape, order=layout) + arr_gpu = cp.asarray(arr_cpu) + + iteration_slice = make_slice[:, :, :, :] + f = Field.create_from_numpy_array("f", arr_cpu) + update_rule = [Assignment(f.center, sp.Symbol("value"))] + + config = CreateKernelConfig(target=Target.GPU, gpu_indexing=gpu_indexing, iteration_slice=iteration_slice) + ast = create_kernel(update_rule, config=config) + kernel = ast.compile() + + kernel(f=arr_gpu, value=np.float64(42.0)) + np.testing.assert_equal(arr_gpu.get(), np.ones(shape) * 42.0)