Commit e43aa476 authored by Markus Holzer

pull indexing

parents 728521f7 31bd0e0a
Pipeline #55167 failed
import abc
from functools import partial
import math
from typing import Tuple

import sympy as sp
from sympy.core.cache import cacheit

from pystencils.astnodes import Block, Conditional, SympyAssignment
from pystencils.typing import TypedSymbol, create_type
from pystencils.integer_functions import div_ceil, div_floor, int_div
from pystencils.sympyextensions import is_integer_sequence, prod
@@ -33,12 +33,37 @@ GRID_DIM = [ThreadIndexingSymbol("gridDim." + coord, create_type("int32")) for c
class AbstractIndexing(abc.ABC):
    """
    Abstract base class for all Indexing classes. An Indexing class defines how an iteration space is mapped
    to the GPU's block and grid system. It calculates indices based on the GPU's thread and block indices
    and computes the number of blocks and threads a kernel is started with.
    The Indexing class is created with an iteration space that is given as a list of slices to determine start,
    stop and the step size for each coordinate. Furthermore, the data_layout is given as a tuple to determine
    the fast and slow coordinates. This is important to get an optimal mapping of coordinates to GPU threads.
    """
    def __init__(self, iteration_space: Tuple[slice], data_layout: Tuple):
        for iter_space in iteration_space:
            assert isinstance(iter_space, slice), f"iteration_space must be of type Tuple[slice], " \
                                                  f"not a tuple containing an element of type {type(iter_space)}"

        self._iteration_space = iteration_space
        self._data_layout = data_layout
        self._dim = len(iteration_space)
    @property
    def iteration_space(self):
        """Iteration space to loop over"""
        return self._iteration_space

    @property
    def data_layout(self):
        """Data layout of the kernel's arrays"""
        return self._data_layout

    @property
    def dim(self):
        """Number of spatial dimensions"""
        return self._dim
    @property
    @abc.abstractmethod
    def coordinates(self):

@@ -50,6 +75,17 @@ class AbstractIndexing(abc.ABC):
        """Sympy symbols for GPU's block and thread indices, and block and grid dimensions."""
        return BLOCK_IDX + THREAD_IDX + BLOCK_DIM + GRID_DIM
    @abc.abstractmethod
    def add_loop_ctr_assignments(self, assignments, loop_counter_symbols):
        """Adds assignments for the loop counter symbols depending on the GPU threads.

        Args:
            assignments: list of assignments
            loop_counter_symbols: typed symbols representing the loop counters
        Returns:
            the assignments, extended by assignments for the loop counters
        """
    @abc.abstractmethod
    def call_parameters(self, arr_shape):
        """Determine grid and block size for kernel call.

@@ -92,8 +128,8 @@ class BlockIndexing(AbstractIndexing):
"""Generic indexing scheme that maps sub-blocks of an array to GPU blocks. """Generic indexing scheme that maps sub-blocks of an array to GPU blocks.
Args: Args:
field: pystencils field (common to all Indexing classes) iteration_space: list of slices to determine start, stop and the step size for each coordinate
iteration_slice: slice that defines rectangular subarea which is iterated over data_layout: tuple to determine the fast and slow coordinates.
permute_block_size_dependent_on_layout: if True the block_size is permuted such that the fastest coordinate permute_block_size_dependent_on_layout: if True the block_size is permuted such that the fastest coordinate
gets the largest amount of threads gets the largest amount of threads
compile_time_block_size: compile in concrete block size, otherwise the gpu variable 'blockDim' is used compile_time_block_size: compile in concrete block size, otherwise the gpu variable 'blockDim' is used
...@@ -102,14 +138,16 @@ class BlockIndexing(AbstractIndexing): ...@@ -102,14 +138,16 @@ class BlockIndexing(AbstractIndexing):
device_number: device number of the used GPU. By default, the zeroth device is used. device_number: device number of the used GPU. By default, the zeroth device is used.
""" """
    def __init__(self, iteration_space: Tuple[slice], data_layout: Tuple,
                 block_size=(128, 2, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False,
                 maximum_block_size=(1024, 1024, 64), device_number=None):
        super(BlockIndexing, self).__init__(iteration_space, data_layout)

        if self._dim > 4:
            raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions")

        if permute_block_size_dependent_on_layout and self._dim < 4:
            block_size = self.permute_block_size_according_to_layout(block_size, data_layout)

        self._block_size = block_size
        if maximum_block_size == 'auto':
@@ -124,9 +162,6 @@ class BlockIndexing(AbstractIndexing):
            maximum_block_size = tuple(da[f"MaxBlockDim{c}"] for c in ["X", "Y", "Z"])

        self._maximum_block_size = maximum_block_size
        self._compile_time_block_size = compile_time_block_size
        self._device_number = device_number
@@ -140,17 +175,30 @@ class BlockIndexing(AbstractIndexing):
    @property
    def coordinates(self):
        if self._dim < 4:
            coordinates = [c + iter_slice.start for c, iter_slice in zip(self.cuda_indices, self._iteration_space)]
            return coordinates[:self._dim]
        else:
            coordinates = list()
            width = self._iteration_space[0].stop - self.iteration_space[0].start
            coordinates.append(int_div(self.cuda_indices[0], width))
            width = self._iteration_space[1].stop - self.iteration_space[1].start
            coordinates.append(sp.Mod(self.cuda_indices[0], width))
            coordinates.append(self.cuda_indices[1] + self.iteration_space[2].start)
            coordinates.append(self.cuda_indices[2] + self.iteration_space[3].start)
            return coordinates
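        # Illustrative sketch (not part of the commit): in four dimensions the two
        # slowest coordinates share the x thread index. call_parameters below launches
        # width_0 * width_1 threads in x, and the fused index is split back into two
        # coordinates via integer division and modulo, e.g. for a fused index 7 and an
        # inner width of 5 one gets 7 // 5 == 1 and 7 % 5 == 2. int_div and sp.Mod
        # express the same split on the symbolic thread index.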
    def add_loop_ctr_assignments(self, assignments, loop_counter_symbols):
        cell_idx_assignments = _loop_ctr_assignments(loop_counter_symbols, self.coordinates, self._iteration_space)
        return cell_idx_assignments + assignments
    def call_parameters(self, arr_shape):
        numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space, arr_shape)
        widths = _get_widths(numeric_iteration_slice)
        if len(widths) > 3:
            widths = [widths[0] * widths[1], widths[2], widths[3]]

        extend_bs = (1,) * (3 - len(self._block_size))
        block_size = self._block_size + extend_bs
        if not self._compile_time_block_size:
@@ -171,20 +219,22 @@ class BlockIndexing(AbstractIndexing):
    def guard(self, kernel_content, arr_shape):
        arr_shape = arr_shape[:self._dim]
        numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space, arr_shape)
        end = [s.stop if s.stop != 0 else 1 for s in numeric_iteration_slice]

        if self._dim < 4:
            conditions = [c < e for c, e in zip(self.coordinates, end)]
        else:
            coordinates = [c + iter_slice.start for c, iter_slice in zip(self.cuda_indices, self._iteration_space[1:])]
            conditions = [c < e for c, e in zip(coordinates, end)]
            conditions[0] = coordinates[0] < (end[0] * end[1])

        condition = conditions[0]
        for c in conditions[1:]:
            condition = sp.And(condition, c)
        return Block([Conditional(condition, kernel_content)])
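        # Illustrative sketch (not part of the commit): for a 2D iteration space
        # (slice(1, 30, 1), slice(1, 20, 1)) the generated guard is conceptually
        #     (blockIdx.x * blockDim.x + threadIdx.x + 1 < 30) and
        #     (blockIdx.y * blockDim.y + threadIdx.y + 1 < 20)
        # i.e. one upper-bound check per coordinate, combined with sympy.And.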
    def numeric_iteration_space(self, arr_shape):
        return _get_numeric_iteration_slice(self._iteration_space, arr_shape)

    def limit_block_size_by_register_restriction(self, block_size, required_registers_per_thread):
        """Shrinks the block_size if there are too many registers used per block.
@@ -246,38 +296,45 @@ class LineIndexing(AbstractIndexing):
    The fastest coordinate is indexed with thread_idx.x, the remaining coordinates are mapped to block_idx.{x,y,z}.
    This indexing scheme supports up to 4 spatial dimensions, where the innermost dimension is not larger than the
    maximum amount of threads allowed in a GPU block (which depends on the device).

    Args:
        iteration_space: list of slices to determine start, stop and the step size for each coordinate
        data_layout: tuple to determine the fast and slow coordinates
    """
    def __init__(self, iteration_space: Tuple[slice], data_layout: Tuple):
        super(LineIndexing, self).__init__(iteration_space, data_layout)

        if len(iteration_space) > 4:
            raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions")
    @property
    def cuda_indices(self):
        available_indices = [THREAD_IDX[0]] + BLOCK_IDX
        coordinates = available_indices[:self.dim]

        fastest_coordinate = self.data_layout[-1]
        coordinates[0], coordinates[fastest_coordinate] = coordinates[fastest_coordinate], coordinates[0]

        return coordinates
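        # Illustrative example (not part of the commit): for three dimensions and
        # data_layout (0, 1, 2) the fastest coordinate is 2, so the initial list
        # [threadIdx.x, blockIdx.x, blockIdx.y] becomes [blockIdx.y, blockIdx.x, threadIdx.x]
        # after the swap, i.e. the innermost (fastest) coordinate is driven by threadIdx.x.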
    @property
    def coordinates(self):
        return [i + o.start for i, o in zip(self.cuda_indices, self._iteration_space)]
    def add_loop_ctr_assignments(self, assignments, loop_counter_symbols):
        cell_idx_assignments = _loop_ctr_assignments(loop_counter_symbols, self.coordinates, self._iteration_space)
        return cell_idx_assignments + assignments
    def call_parameters(self, arr_shape):
        numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space, arr_shape)
        widths = _get_widths(numeric_iteration_slice)
        def get_shape_of_cuda_idx(cuda_idx):
            if cuda_idx not in self.cuda_indices:
                return 1
            else:
                idx = self.cuda_indices.index(cuda_idx)
                return widths[idx]

        return {'block': tuple([get_shape_of_cuda_idx(idx) for idx in THREAD_IDX]),
@@ -292,50 +349,65 @@ class LineIndexing(AbstractIndexing):
    def symbolic_parameters(self):
        return set()

    def numeric_iteration_space(self, arr_shape):
        return _get_numeric_iteration_slice(self._iteration_space, arr_shape)
# -------------------------------------- Helper functions --------------------------------------------------------------
def _get_numeric_iteration_slice(iteration_slice, arr_shape):
    res = []
    for slice_component, shape in zip(iteration_slice, arr_shape):
        result_slice = slice_component
        if not isinstance(result_slice.start, int):
            start = result_slice.start
            assert len(start.free_symbols) == 1
            start = start.subs({symbol: shape for symbol in start.free_symbols})
            result_slice = slice(start, result_slice.stop, result_slice.step)
        if not isinstance(result_slice.stop, int):
            stop = result_slice.stop
            assert len(stop.free_symbols) == 1
            stop = stop.subs({symbol: shape for symbol in stop.free_symbols})
            result_slice = slice(result_slice.start, stop, result_slice.step)
        assert isinstance(result_slice.step, int)
        res.append(result_slice)
    return res
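

# Illustrative example (not part of the commit): a symbolic stop such as `size_0 - 1`,
# as produced when a slice like [1:-1] is normalized against a symbolic field shape,
# is replaced by the concrete array extent (size_0 is a hypothetical sympy.Symbol):
#     _get_numeric_iteration_slice([slice(1, size_0 - 1, 1)], (20,))
#     # -> [slice(1, 19, 1)]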
def _get_widths(iteration_slice):
    widths = []
    for iter_slice in iteration_slice:
        step = iter_slice.step
        assert isinstance(step, int), f"Step can only be of type int not of type {type(step)}"
        start = iter_slice.start
        stop = iter_slice.stop
        if step == 1:
            if stop - start == 0:
                widths.append(1)
            else:
                widths.append(stop - start)
        else:
            width = (stop - start) / step
            if isinstance(width, int):
                widths.append(width)
            elif isinstance(width, float):
                widths.append(math.ceil(width))
            else:
                widths.append(div_ceil(stop - start, step))
    return widths
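

# Illustrative example (not part of the commit): for purely numeric slices the widths
# are plain extents, and stepped slices are rounded up, e.g.
#     _get_widths([slice(0, 10, 1), slice(1, 10, 3)])
#     # -> [10, 3]          (ceil((10 - 1) / 3) == 3)
# A degenerate slice with start == stop still yields a width of 1.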
def _loop_ctr_assignments(loop_counter_symbols, coordinates, iteration_space):
    loop_ctr_assignments = []
    for loop_counter, coordinate, iter_slice in zip(loop_counter_symbols, coordinates, iteration_space):
        if isinstance(iter_slice, slice) and iter_slice.step > 1:
            loop_ctr_assignments.append(SympyAssignment(loop_counter, coordinate * iter_slice.step))
        elif iter_slice.start == iter_slice.stop:
            loop_ctr_assignments.append(SympyAssignment(loop_counter, 0))
        else:
            loop_ctr_assignments.append(SympyAssignment(loop_counter, coordinate))

    return loop_ctr_assignments
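

# Illustrative example (not part of the commit): for loop counter symbols c0, c1, c2 and
# an iteration space (slice(0, 32, 2), slice(4, 4, 1), slice(0, 16, 1)) the generated
# assignments are conceptually
#     c0 <- coordinate_0 * 2      (strided slice: the counter advances in steps of 2)
#     c1 <- 0                     (degenerate slice with start == stop)
#     c2 <- coordinate_2          (plain slice)
# where coordinate_i is the GPU index expression supplied by the Indexing class.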
def indexing_creator_from_params(gpu_indexing, gpu_indexing_params):
...
from typing import Union

from pystencils.astnodes import Block, KernelFunction, LoopOverCoordinate, SympyAssignment
from pystencils.config import CreateKernelConfig
from pystencils.typing import StructType, TypedSymbol
@@ -12,6 +10,7 @@ from pystencils.gpu.gpujit import make_python_function
from pystencils.node_collection import NodeCollection
from pystencils.gpu.indexing import indexing_creator_from_params
from pystencils.simp.assignment_collection import AssignmentCollection
from pystencils.slicing import normalize_slice
from pystencils.transformations import (
    get_base_buffer_index, get_common_field, parse_base_pointer_info,
    resolve_buffer_accesses, resolve_field_accesses, unify_shape_symbols)
@@ -64,18 +63,17 @@ def create_cuda_kernel(assignments: Union[AssignmentCollection, NodeCollection],
            iteration_slice.append(slice(ghost_layers[i][0],
                                         -ghost_layers[i][1] if ghost_layers[i][1] > 0 else None))

        iteration_space = normalize_slice(iteration_slice, common_shape)
    else:
        iteration_space = normalize_slice(iteration_slice, common_shape)

    iteration_space = tuple([s if isinstance(s, slice) else slice(s, s, 1) for s in iteration_space])
    loop_counter_symbols = [LoopOverCoordinate.get_loop_counter_symbol(i) for i in range(len(iteration_space))]

    indexing = indexing_creator(iteration_space=iteration_space, data_layout=common_field.layout)
    assignments = indexing.add_loop_ctr_assignments(assignments, loop_counter_symbols)

    block = indexing.guard(Block(assignments), common_shape)
    unify_shape_symbols(block, common_shape=common_shape, fields=fields_without_buffers)

    ast = KernelFunction(block,
@@ -92,11 +90,11 @@ def create_cuda_kernel(assignments: Union[AssignmentCollection, NodeCollection],
                                                         f.spatial_dimensions, f.index_dimensions)
                         for f in all_fields}

    coord_mapping = {f.name: loop_counter_symbols for f in all_fields}

    if any(FieldType.is_buffer(f) for f in all_fields):
        base_buffer_index = get_base_buffer_index(ast, loop_counter_symbols, iteration_space)
        resolve_buffer_accesses(ast, base_buffer_index, read_only_fields)

    resolve_field_accesses(ast, read_only_fields, field_to_base_pointer_info=base_pointer_info,
                           field_to_fixed_coordinates=coord_mapping)
@@ -148,7 +146,7 @@ def created_indexed_cuda_kernel(assignments: Union[AssignmentCollection, NodeCol
            data_type = ind_f.dtype
            if data_type.has_element(name):
                rhs = ind_f[0](name)
                lhs = TypedSymbol(name, data_type.get_element_type(name))
                return SympyAssignment(lhs, rhs)
        raise ValueError(f"Index {name} not found in any of the passed index fields")
@@ -157,8 +155,12 @@ def created_indexed_cuda_kernel(assignments: Union[AssignmentCollection, NodeCol
    coordinate_typed_symbols = [eq.lhs for eq in coordinate_symbol_assignments]

    idx_field = list(index_fields)[0]

    iteration_space = normalize_slice(tuple([slice(None, None, None)]) * len(idx_field.spatial_shape),
                                      idx_field.spatial_shape)

    indexing = indexing_creator(iteration_space=iteration_space,
                                data_layout=idx_field.layout)

    function_body = Block(coordinate_symbol_assignments + assignments)
    function_body = indexing.guard(function_body, get_common_field(index_fields).spatial_shape)
...
@@ -275,7 +275,8 @@ def test_buffer_indexing():
    assert len(spatial_shape_symbols) <= 3


@pytest.mark.parametrize('gpu_indexing', ("block", "line"))
def test_iteration_slices(gpu_indexing):
    num_cell_values = 19
    dt = np.uint64
    fields = _generate_fields(dt=dt, stencil_directions=num_cell_values)
@@ -303,7 +304,8 @@ def test_iteration_slices():
    gpu_dst_arr.fill(0)

    config = CreateKernelConfig(target=Target.GPU, iteration_slice=pack_slice,
                                data_type={'src_field': gpu_src_arr.dtype, 'buffer': gpu_buffer_arr.dtype},
                                gpu_indexing=gpu_indexing)
    pack_code = create_kernel(pack_eqs, config=config)
    pack_kernel = pack_code.compile()
@@ -316,7 +318,8 @@ def test_iteration_slices():
        unpack_eqs.append(eq)

    config = CreateKernelConfig(target=Target.GPU, iteration_slice=pack_slice,
                                data_type={'dst_field': gpu_dst_arr.dtype, 'buffer': gpu_buffer_arr.dtype},
                                gpu_indexing=gpu_indexing)
    unpack_code = create_kernel(unpack_eqs, config=config)
    unpack_kernel = unpack_code.compile()
...
@@ -8,7 +8,7 @@ from scipy.ndimage import convolve
from pystencils import Assignment, Field, fields, CreateKernelConfig, create_kernel, Target
from pystencils.gpu import BlockIndexing
from pystencils.simp import sympy_cse_on_assignment_list
from pystencils.slicing import add_ghost_layers, make_slice, remove_ghost_layers, normalize_slice

try:
    import cupy
@@ -164,14 +164,17 @@ def test_periodicity():
@pytest.mark.parametrize("device_number", device_numbers)
def test_block_indexing(device_number):
    f = fields("f: [3D]")
    s = normalize_slice(make_slice[:, :, :], f.spatial_shape)
    bi = BlockIndexing(s, f.layout, block_size=(16, 8, 2),
                       permute_block_size_dependent_on_layout=False)
    assert bi.call_parameters((3, 2, 32))['block'] == (3, 2, 32)
    assert bi.call_parameters((32, 2, 32))['block'] == (16, 2, 8)

    bi = BlockIndexing(s, f.layout, block_size=(32, 1, 1),
                       permute_block_size_dependent_on_layout=False)
    assert bi.call_parameters((1, 16, 16))['block'] == (1, 16, 2)

    bi = BlockIndexing(s, f.layout, block_size=(16, 8, 2),
                       maximum_block_size="auto", device_number=device_number)

    # This function should be used if number of needed registers is known. Can be determined with func.num_regs
@@ -187,3 +190,19 @@ def test_block_indexing(device_number):
    assert np.prod(blocks) * registers_per_thread < max_registers_per_block
@pytest.mark.parametrize('gpu_indexing', ("block", "line"))
def test_four_dimensional_kernel(gpu_indexing):
    arr_cpu = np.arange(625, dtype=np.float64).reshape((5, 5, 5, 5), order='C')
    arr_gpu = cp.asarray(arr_cpu)

    iteration_slice = make_slice[:, :, :, :]

    f = Field.create_generic("f", 4)
    update_rule = [Assignment(f.center, sp.Symbol("value"))]

    config = CreateKernelConfig(target=Target.GPU, gpu_indexing=gpu_indexing, iteration_slice=iteration_slice)
    ast = create_kernel(sympy_cse_on_assignment_list(update_rule), config=config)
    kernel = ast.compile()

    kernel(f=arr_gpu, value=np.float64(42.0))
    np.testing.assert_equal(arr_gpu.get(), np.ones((5, 5, 5, 5)) * 42.0)
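# Note (illustrative, not part of the commit): this test exercises the new 4D code paths
# added above. With gpu_indexing="block" the two slowest coordinates are fused into the
# x thread index and recovered with int_div / sp.Mod; with gpu_indexing="line" the
# innermost coordinate is driven by threadIdx.x and the remaining ones by blockIdx.{x,y,z}.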