diff --git a/.flake8 b/.flake8 index 3f946922a5f68e39bc02c65d4a5f4ffdb10e05e1..05758405a65664809b3a0a71893554a1af8e3381 100644 --- a/.flake8 +++ b/.flake8 @@ -4,4 +4,4 @@ exclude=src/pystencils/jupyter.py, src/pystencils/plot.py src/pystencils/session.py src/pystencils/old -ignore = W293 W503 W291 C901 E741 +ignore = W293 W503 W291 C901 E741 E704 diff --git a/conftest.py b/conftest.py index 0c6c49153770bc40c90cb09e990d12561f7bad7a..4e8e2b73ab49ff1f881e755b19f9a46e07c0837e 100644 --- a/conftest.py +++ b/conftest.py @@ -203,3 +203,8 @@ else: return IPyNbFile.from_parent(fspath=path, parent=parent) else: return IPyNbFile(path, parent) + + +# Fixtures + +from tests.fixtures import * diff --git a/src/pystencils/backend/jit/gpu_cupy.py b/src/pystencils/backend/jit/gpu_cupy.py index 563a9c06a7261343e6435737c47854a55d54a05e..1dd18767160a626ff7972ebb78f83bb3e64a1efc 100644 --- a/src/pystencils/backend/jit/gpu_cupy.py +++ b/src/pystencils/backend/jit/gpu_cupy.py @@ -41,6 +41,7 @@ class CupyKernelWrapper(KernelWrapper): self._kfunc: GpuKernelFunction = kfunc self._raw_kernel = raw_kernel self._block_size = block_size + self._num_blocks: tuple[int, int, int] | None = None self._args_cache: dict[Any, tuple] = dict() @property @@ -59,6 +60,14 @@ class CupyKernelWrapper(KernelWrapper): def block_size(self, bs: tuple[int, int, int]): self._block_size = bs + @property + def num_blocks(self) -> tuple[int, int, int] | None: + return self._num_blocks + + @num_blocks.setter + def num_blocks(self, nb: tuple[int, int, int] | None): + self._num_blocks = nb + def __call__(self, **kwargs: Any): kernel_args, launch_grid = self._get_cached_args(**kwargs) device = self._get_device(kernel_args) @@ -72,7 +81,7 @@ class CupyKernelWrapper(KernelWrapper): return devices.pop() def _get_cached_args(self, **kwargs): - key = (self._block_size,) + tuple((k, id(v)) for k, v in kwargs.items()) + key = (self._block_size, self._num_blocks) + tuple((k, id(v)) for k, v in kwargs.items()) if key not in self._args_cache: args = self._get_args(**kwargs) @@ -185,25 +194,36 @@ class CupyKernelWrapper(KernelWrapper): symbolic_threads_range = self._kfunc.threads_range - threads_range: list[int] = [ - evaluate_expression(expr, valuation) - for expr in symbolic_threads_range.num_work_items - ] + if self._num_blocks is not None: + launch_grid = LaunchGrid(self._num_blocks, self._block_size) - if symbolic_threads_range.dim < 3: - threads_range += [1] * (3 - symbolic_threads_range.dim) + elif symbolic_threads_range is not None: + threads_range: list[int] = [ + evaluate_expression(expr, valuation) + for expr in symbolic_threads_range.num_work_items + ] - def div_ceil(a, b): - return a // b if a % b == 0 else a // b + 1 + if symbolic_threads_range.dim < 3: + threads_range += [1] * (3 - symbolic_threads_range.dim) - # TODO: Refine this? - grid_size = tuple( - div_ceil(threads, tpb) - for threads, tpb in zip(threads_range, self._block_size) - ) - assert len(grid_size) == 3 + def div_ceil(a, b): + return a // b if a % b == 0 else a // b + 1 + + # TODO: Refine this? + num_blocks = tuple( + div_ceil(threads, tpb) + for threads, tpb in zip(threads_range, self._block_size) + ) + assert len(num_blocks) == 3 + + launch_grid = LaunchGrid(num_blocks, self._block_size) - launch_grid = LaunchGrid(grid_size, self._block_size) + else: + raise JitError( + "Unable to determine launch grid for GPU kernel invocation: " + "No manual grid size was specified, and the number of threads could not " + "be determined automatically." + ) return tuple(args), launch_grid diff --git a/src/pystencils/backend/kernelcreation/ast_factory.py b/src/pystencils/backend/kernelcreation/ast_factory.py index 56bcea11db40aa4b71d9570952958791e7ef35da..3b07ef862d80bcc5058975cd2083fbe1704f2d82 100644 --- a/src/pystencils/backend/kernelcreation/ast_factory.py +++ b/src/pystencils/backend/kernelcreation/ast_factory.py @@ -139,6 +139,13 @@ class AstFactory: self._typify(self.parse_index(iter_slice) + self.parse_index(1)) ) step = self.parse_index(1) + + if normalize_to is not None: + upper_limit = self.parse_index(normalize_to) + if isinstance(start, PsConstantExpr) and start.constant.value < 0: + start = fold(self._typify(upper_limit.clone() + start)) + stop = fold(self._typify(upper_limit.clone() + stop)) + else: start = self._parse_any_index( iter_slice.start if iter_slice.start is not None else 0 @@ -157,21 +164,21 @@ class AstFactory: f"Invalid value for `slice.step`: {step.constant.value}" ) - if normalize_to is not None: - upper_limit = self.parse_index(normalize_to) - if isinstance(start, PsConstantExpr) and start.constant.value < 0: - start = fold(self._typify(upper_limit.clone() + start)) + if normalize_to is not None: + upper_limit = self.parse_index(normalize_to) + if isinstance(start, PsConstantExpr) and start.constant.value < 0: + start = fold(self._typify(upper_limit.clone() + start)) - if stop is None: - stop = upper_limit - elif isinstance(stop, PsConstantExpr) and stop.constant.value < 0: - stop = fold(self._typify(upper_limit.clone() + stop)) + if stop is None: + stop = upper_limit + elif isinstance(stop, PsConstantExpr) and stop.constant.value < 0: + stop = fold(self._typify(upper_limit.clone() + stop)) + + elif stop is None: + raise ValueError( + "Cannot parse a slice with `stop == None` if no normalization limit is given" + ) - elif stop is None: - raise ValueError( - "Cannot parse a slice with `stop == None` if no normalization limit is given" - ) - assert stop is not None # for mypy return start, stop, step diff --git a/src/pystencils/backend/kernelcreation/iteration_space.py b/src/pystencils/backend/kernelcreation/iteration_space.py index 50a0e31400e2c6793c7b2bd09108394373cc4052..9df9883ce67d6d856335a7a7a9537f829b7df11e 100644 --- a/src/pystencils/backend/kernelcreation/iteration_space.py +++ b/src/pystencils/backend/kernelcreation/iteration_space.py @@ -6,6 +6,7 @@ from functools import reduce from operator import mul from ...defaults import DEFAULTS +from ...config import _AUTO_TYPE, AUTO from ...simp import AssignmentCollection from ...field import Field, FieldType @@ -195,21 +196,25 @@ class FullIterationSpace(IterationSpace): def dimensions(self): """The dimensions of this iteration space""" return self._dimensions + + @property + def counters(self) -> tuple[PsSymbol, ...]: + return tuple(dim.counter for dim in self._dimensions) @property - def lower(self): + def lower(self) -> tuple[PsExpression, ...]: """Lower limits of each dimension""" - return (dim.start for dim in self._dimensions) + return tuple(dim.start for dim in self._dimensions) @property - def upper(self): + def upper(self) -> tuple[PsExpression, ...]: """Upper limits of each dimension""" - return (dim.stop for dim in self._dimensions) + return tuple(dim.stop for dim in self._dimensions) @property - def steps(self): + def steps(self) -> tuple[PsExpression, ...]: """Iteration steps of each dimension""" - return (dim.step for dim in self._dimensions) + return tuple(dim.step for dim in self._dimensions) @property def archetype_field(self) -> Field | None: @@ -412,7 +417,7 @@ def create_sparse_iteration_space( def create_full_iteration_space( ctx: KernelCreationContext, assignments: AssignmentCollection, - ghost_layers: None | int | Sequence[int | tuple[int, int]] = None, + ghost_layers: None | _AUTO_TYPE | int | Sequence[int | tuple[int, int]] = None, iteration_slice: None | int | slice | tuple[int | slice, ...] = None, ) -> IterationSpace: assert not ctx.fields.index_fields @@ -452,16 +457,7 @@ def create_full_iteration_space( # Otherwise, if an iteration slice was specified, use that # Otherwise, use the inferred ghost layers - if ghost_layers is not None: - ctx.metadata["ghost_layers"] = ghost_layers - return FullIterationSpace.create_with_ghost_layers( - ctx, ghost_layers, archetype_field - ) - elif iteration_slice is not None: - return FullIterationSpace.create_from_slice( - ctx, iteration_slice, archetype_field - ) - else: + if ghost_layers is AUTO: if len(domain_field_accesses) > 0: inferred_gls = max( [fa.required_ghost_layers for fa in domain_field_accesses] @@ -473,3 +469,15 @@ def create_full_iteration_space( return FullIterationSpace.create_with_ghost_layers( ctx, inferred_gls, archetype_field ) + elif ghost_layers is not None: + assert not isinstance(ghost_layers, _AUTO_TYPE) + ctx.metadata["ghost_layers"] = ghost_layers + return FullIterationSpace.create_with_ghost_layers( + ctx, ghost_layers, archetype_field + ) + elif iteration_slice is not None: + return FullIterationSpace.create_from_slice( + ctx, iteration_slice, archetype_field + ) + else: + assert False, "unreachable code" diff --git a/src/pystencils/backend/kernelfunction.py b/src/pystencils/backend/kernelfunction.py index 2e08978fb239c72d86da44f28d81532bbc6e917f..afd54c1107e13e73d2141cd9b0d7da2e97f0ef3e 100644 --- a/src/pystencils/backend/kernelfunction.py +++ b/src/pystencils/backend/kernelfunction.py @@ -262,7 +262,7 @@ class GpuKernelFunction(KernelFunction): def __init__( self, body: PsBlock, - threads_range: GpuThreadsRange, + threads_range: GpuThreadsRange | None, target: Target, name: str, parameters: Sequence[KernelParameter], @@ -276,7 +276,7 @@ class GpuKernelFunction(KernelFunction): self._threads_range = threads_range @property - def threads_range(self) -> GpuThreadsRange: + def threads_range(self) -> GpuThreadsRange | None: return self._threads_range @@ -284,14 +284,16 @@ def create_gpu_kernel_function( ctx: KernelCreationContext, platform: Platform, body: PsBlock, - threads_range: GpuThreadsRange, + threads_range: GpuThreadsRange | None, function_name: str, target_spec: Target, jit: JitBase, ): undef_symbols = collect_undefined_symbols(body) - for threads in threads_range.num_work_items: - undef_symbols |= collect_undefined_symbols(threads) + + if threads_range is not None: + for threads in threads_range.num_work_items: + undef_symbols |= collect_undefined_symbols(threads) params = _get_function_params(ctx, undef_symbols) req_headers = _get_headers(ctx, platform, body) diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index 323dcc5a9b306990a11f85037f305794c8729625..7ebbd4fd4762821d3beb6f0cbbc04a33f775ecdf 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -1,3 +1,5 @@ +from warnings import warn + from ...types import constify from ..exceptions import MaterializationError from .generic_gpu import GenericGpu, GpuThreadsRange @@ -7,7 +9,7 @@ from ..kernelcreation import ( IterationSpace, FullIterationSpace, SparseIterationSpace, - AstFactory + AstFactory, ) from ..kernelcreation.context import KernelCreationContext @@ -43,6 +45,7 @@ GRID_DIM = [ class CudaPlatform(GenericGpu): + """Platform for CUDA-based GPUs.""" def __init__( self, ctx: KernelCreationContext, indexing_cfg: GpuIndexingConfig | None = None @@ -57,7 +60,7 @@ class CudaPlatform(GenericGpu): def materialize_iteration_space( self, body: PsBlock, ispace: IterationSpace - ) -> tuple[PsBlock, GpuThreadsRange]: + ) -> tuple[PsBlock, GpuThreadsRange | None]: if isinstance(ispace, FullIterationSpace): return self._prepend_dense_translation(body, ispace) elif isinstance(ispace, SparseIterationSpace): @@ -112,6 +115,11 @@ class CudaPlatform(GenericGpu): case MathFunctions.Abs if dtype.width == 16: cfunc = CFunction(" __habs", arg_types, dtype) + case _: + raise MaterializationError( + f"Cannot materialize call to function {func}" + ) + call.function = cfunc return call @@ -123,9 +131,21 @@ class CudaPlatform(GenericGpu): def _prepend_dense_translation( self, body: PsBlock, ispace: FullIterationSpace - ) -> tuple[PsBlock, GpuThreadsRange]: + ) -> tuple[PsBlock, GpuThreadsRange | None]: dimensions = ispace.dimensions_in_loop_order() - launch_config = GpuThreadsRange.from_ispace(ispace) + + if not self._cfg.manual_launch_grid: + try: + threads_range = GpuThreadsRange.from_ispace(ispace) + except MaterializationError as e: + warn( + str(e.args[0]) + + "\nIf this is intended, set `manual_launch_grid=True` in the code generator configuration.", + UserWarning, + ) + threads_range = None + else: + threads_range = None indexing_decls = [] conds = [] @@ -146,6 +166,8 @@ class CudaPlatform(GenericGpu): if not self._cfg.omit_range_check: conds.append(PsLt(ctr, dim.stop)) + indexing_decls = indexing_decls[::-1] + if conds: condition: PsExpression = conds[0] for cond in conds[1:]: @@ -155,7 +177,7 @@ class CudaPlatform(GenericGpu): body.statements = indexing_decls + body.statements ast = body - return ast, launch_config + return ast, threads_range def _prepend_sparse_translation( self, body: PsBlock, ispace: SparseIterationSpace diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py index 774b9405cd04b8dc6489cd6b6ae36e4aa563f157..4c990429d8fd33f800dbf4f1f7797b7520c93b39 100644 --- a/src/pystencils/backend/platforms/generic_gpu.py +++ b/src/pystencils/backend/platforms/generic_gpu.py @@ -10,6 +10,7 @@ from ..kernelcreation.iteration_space import ( SparseIterationSpace, ) from .platform import Platform +from ..exceptions import MaterializationError class GpuThreadsRange: @@ -56,6 +57,19 @@ class GpuThreadsRange: raise NotImplementedError( f"Cannot create a GPU threads range for an {len(dimensions)}-dimensional iteration space" ) + + from ..ast.analysis import collect_undefined_symbols as collect + + for dim in dimensions: + symbs = collect(dim.start) | collect(dim.stop) | collect(dim.step) + for ctr in ispace.counters: + if ctr in symbs: + raise MaterializationError( + "Unable to construct GPU threads range for iteration space: " + f"Limits of dimension counter {dim.counter.name} " + f"depend on another dimension's counter {ctr.name}" + ) + work_items = [ispace.actual_iterations(dim) for dim in dimensions] return GpuThreadsRange(work_items) @@ -63,6 +77,6 @@ class GpuThreadsRange: class GenericGpu(Platform): @abstractmethod def materialize_iteration_space( - self, block: PsBlock, ispace: IterationSpace - ) -> tuple[PsBlock, GpuThreadsRange]: + self, body: PsBlock, ispace: IterationSpace + ) -> tuple[PsBlock, GpuThreadsRange | None]: pass diff --git a/src/pystencils/backend/platforms/platform.py b/src/pystencils/backend/platforms/platform.py index cab4d0a7b0143eabd4e40602b45ef2f66592ee8c..2c7ee1c5f4750eac0375bc31a3f44b9eea50642b 100644 --- a/src/pystencils/backend/platforms/platform.py +++ b/src/pystencils/backend/platforms/platform.py @@ -27,7 +27,7 @@ class Platform(ABC): @abstractmethod def materialize_iteration_space( - self, block: PsBlock, ispace: IterationSpace + self, body: PsBlock, ispace: IterationSpace ) -> PsBlock | tuple[PsBlock, Any]: pass diff --git a/src/pystencils/backend/transformations/loop_vectorizer.py b/src/pystencils/backend/transformations/loop_vectorizer.py index c89698193d9764996cfab7a5c5fd7e70358fbac6..e1e4fea502c08de86e13de5e3c251f1b7a7d0ee6 100644 --- a/src/pystencils/backend/transformations/loop_vectorizer.py +++ b/src/pystencils/backend/transformations/loop_vectorizer.py @@ -8,7 +8,7 @@ from ..kernelcreation import KernelCreationContext from ..constants import PsConstant from ..ast import PsAstNode from ..ast.structural import PsLoop, PsBlock, PsDeclaration -from ..ast.expressions import PsExpression +from ..ast.expressions import PsExpression, PsTernary, PsGt from ..ast.vector import PsVecBroadcast from ..ast.analysis import collect_undefined_symbols @@ -18,7 +18,7 @@ from .rewrite import substitute_symbols class LoopVectorizer: """Vectorize loops. - + The loop vectorizer provides methods to vectorize single loops inside an AST using a given number of vector lanes. During vectorization, the loop body is transformed using the `AstVectorizer`, @@ -64,29 +64,26 @@ class LoopVectorizer: @overload def vectorize_select_loops( self, node: PsBlock, predicate: Callable[[PsLoop], bool] - ) -> PsBlock: - ... + ) -> PsBlock: ... @overload def vectorize_select_loops( self, node: PsLoop, predicate: Callable[[PsLoop], bool] - ) -> PsLoop | PsBlock: - ... + ) -> PsLoop | PsBlock: ... @overload def vectorize_select_loops( self, node: PsAstNode, predicate: Callable[[PsLoop], bool] - ) -> PsAstNode: - ... + ) -> PsAstNode: ... def vectorize_select_loops( self, node: PsAstNode, predicate: Callable[[PsLoop], bool] ) -> PsAstNode: """Select and vectorize loops from a syntax tree according to a predicate. - + Finds each loop inside a subtree and evaluates ``predicate`` on them. If ``predicate(loop)`` evaluates to `True`, the loop is vectorized. - + Loops nested inside a vectorized loop will not be processed. Args: @@ -139,7 +136,7 @@ class LoopVectorizer: # Generate vectorized loop body simd_body = self._vectorize_ast(loop.body, vc) - + if vector_ctr in collect_undefined_symbols(simd_body): simd_body.statements.insert(0, vector_counter_decl) @@ -186,20 +183,31 @@ class LoopVectorizer: trailing_start = self._ctx.get_new_symbol( f"__{scalar_ctr.name}_trailing_start", scalar_ctr.get_dtype() ) + trailing_start_decl = self._type_fold( PsDeclaration( PsExpression.make(trailing_start), - ( + PsTernary( + # If at least one vectorized iteration took place... + PsGt( + PsExpression.make(simd_stop), + simd_start.clone(), + ), + # start from the smallest non-valid multiple of simd_step, offset from simd_start ( - PsExpression.make(simd_stop) - - simd_start.clone() - - PsExpression.make(PsConstant(1)) + ( + PsExpression.make(simd_stop) + - simd_start.clone() + - PsExpression.make(PsConstant(1)) + ) + / PsExpression.make(simd_step) + + PsExpression.make(PsConstant(1)) ) - / PsExpression.make(simd_step) - + PsExpression.make(PsConstant(1)) - ) - * PsExpression.make(simd_step) - + simd_start.clone(), + * PsExpression.make(simd_step) + + simd_start.clone(), + # otherwise start at zero + simd_start.clone(), + ), ) ) diff --git a/src/pystencils/boundaries/boundaryhandling.py b/src/pystencils/boundaries/boundaryhandling.py index 5c0869c2923a5091471fe60b4a32ee9a83aee1fa..fe8dd7d0059940841277f954cc322a42d2d744b6 100644 --- a/src/pystencils/boundaries/boundaryhandling.py +++ b/src/pystencils/boundaries/boundaryhandling.py @@ -314,7 +314,7 @@ class BoundaryHandling: def _create_boundary_kernel(self, symbolic_field, symbolic_index_field, boundary_obj): return create_boundary_kernel(symbolic_field, symbolic_index_field, self.stencil, boundary_obj, - target=self._target,) # cpu_openmp=self._openmp) TODO: replace + target=self._target, cpu_openmp=self._openmp) def _create_index_fields(self): dh = self._data_handling diff --git a/src/pystencils/config.py b/src/pystencils/config.py index 43918d386696c6c1f2a6287554a7520643557377..c08ddc16198627adba97169e02724371eca62158 100644 --- a/src/pystencils/config.py +++ b/src/pystencils/config.py @@ -28,6 +28,14 @@ class PsOptionsError(Exception): """Indicates an option clash in the `CreateKernelConfig`.""" +class _AUTO_TYPE: + ... + + +AUTO = _AUTO_TYPE() +"""Special value that can be passed to some options for invoking automatic behaviour.""" + + @dataclass class OpenMpConfig: """Parameters controlling kernel parallelization using OpenMP.""" @@ -182,6 +190,14 @@ class GpuIndexingConfig: block_size: tuple[int, int, int] | None = None """Desired block size for the execution of GPU kernels. May be overridden later by the runtime system.""" + manual_launch_grid: bool = False + """Always require a manually specified launch grid when running this kernel. + + If set to `True`, the code generator will not attempt to infer the size of + the launch grid from the kernel. + The launch grid will then have to be specified manually at runtime. + """ + sycl_automatic_block_size: bool = True """If set to `True` while generating for `Target.SYCL`, let the SYCL runtime decide on the block size. @@ -213,32 +229,43 @@ class CreateKernelConfig: function_name: str = "kernel" """Name of the generated function""" - ghost_layers: None | int | Sequence[int | tuple[int, int]] = None + ghost_layers: None | _AUTO_TYPE | int | Sequence[int | tuple[int, int]] = None """Specifies the number of ghost layers of the iteration region. Options: - - `None`: Required ghost layers are inferred from field accesses + - :py:data:`AUTO <pystencils.config.AUTO>`: Required ghost layers are inferred from field accesses - `int`: A uniform number of ghost layers in each spatial coordinate is applied - ``Sequence[int, tuple[int, int]]``: Ghost layers are specified for each spatial coordinate. In each coordinate, a single integer specifies the ghost layers at both the lower and upper iteration limit, while a pair of integers specifies the lower and upper ghost layers separately. When manually specifying ghost layers, it is the user's responsibility to avoid out-of-bounds memory accesses. - If ``ghost_layers=None`` is specified, the iteration region may otherwise be set using the `iteration_slice` option. + + .. note:: + At most one of `ghost_layers`, `iteration_slice`, and `index_field` may be set. """ - iteration_slice: None | Sequence[slice] = None + iteration_slice: None | int | slice | tuple[int | slice] = None """Specifies the kernel's iteration slice. - - `iteration_slice` may only be set if ``ghost_layers=None``. - If it is set, a slice must be specified for each spatial coordinate. - TODO: Specification of valid slices and their behaviour + + Example: + >>> cfg = CreateKernelConfig( + ... iteration_slice=ps.make_slice[3:14, 2:-2] + ... ) + >>> cfg.iteration_slice + (slice(3, 14, None), slice(2, -2, None)) + + .. note:: + At most one of `ghost_layers`, `iteration_slice`, and `index_field` may be set. """ index_field: Field | None = None """Index field for a sparse kernel. If this option is set, a sparse kernel with the given field as index field will be generated. + + .. note:: + At most one of `ghost_layers`, `iteration_slice`, and `index_field` may be set. """ """Data Types""" diff --git a/src/pystencils/datahandling/serial_datahandling.py b/src/pystencils/datahandling/serial_datahandling.py index 8521dda1014ab4e031706a1b21f4ae3c28259a05..6a5ce573085b0380196e208c7d19ec16cf5fbb37 100644 --- a/src/pystencils/datahandling/serial_datahandling.py +++ b/src/pystencils/datahandling/serial_datahandling.py @@ -291,7 +291,10 @@ class SerialDataHandling(DataHandling): def synchronization_function(self, names, stencil=None, target=None, functor=None, **_): if target is None: target = self.default_target - assert target in (Target.CPU, Target.GPU) + + if not (target.is_cpu() or target == Target.CUDA): + raise ValueError(f"Unsupported target: {target}") + if not hasattr(names, '__len__') or type(names) is str: names = [names] @@ -325,7 +328,7 @@ class SerialDataHandling(DataHandling): values_per_cell = values_per_cell[0] if len(filtered_stencil) > 0: - if target == Target.CPU: + if target.is_cpu(): if functor is None: from pystencils.slicing import get_periodic_boundary_functor functor = get_periodic_boundary_functor diff --git a/src/pystencils/kernelcreation.py b/src/pystencils/kernelcreation.py index 651a67cf2092a93e9e1cab3f393c2edd1baf15a9..548fbc9bba8c1606fbba2929324e9cea273b73b3 100644 --- a/src/pystencils/kernelcreation.py +++ b/src/pystencils/kernelcreation.py @@ -6,6 +6,7 @@ from .config import ( CreateKernelConfig, OpenMpConfig, VectorizationConfig, + AUTO ) from .backend import KernelFunction from .types import create_numeric_type, PsIntegerType, PsScalarType @@ -91,49 +92,18 @@ class DefaultKernelCreationDriver: self, assignments: AssignmentCollection | Sequence[AssignmentBase] | AssignmentBase, ): - if isinstance(assignments, AssignmentBase): - assignments = [assignments] - - if not isinstance(assignments, AssignmentCollection): - assignments = AssignmentCollection(assignments) # type: ignore - - _ = _parse_simplification_hints(assignments) - - analysis = KernelAnalysis( - self._ctx, - not self._cfg.skip_independence_check, - not self._cfg.allow_double_writes, + kernel_body = self.parse_kernel_body( + assignments ) - analysis(assignments) - - if len(self._ctx.fields.index_fields) > 0 or self._cfg.index_field is not None: - ispace = create_sparse_iteration_space( - self._ctx, assignments, index_field=self._cfg.index_field - ) - else: - ispace = create_full_iteration_space( - self._ctx, - assignments, - ghost_layers=self._cfg.ghost_layers, - iteration_slice=self._cfg.iteration_slice, - ) - - self._ctx.set_iteration_space(ispace) - - freeze = FreezeExpressions(self._ctx) - kernel_body = freeze(assignments) - - typify = Typifier(self._ctx) - kernel_body = typify(kernel_body) match self._platform: case GenericCpu(): kernel_ast = self._platform.materialize_iteration_space( - kernel_body, ispace + kernel_body, self._ctx.get_iteration_space() ) case GenericGpu(): kernel_ast, gpu_threads = self._platform.materialize_iteration_space( - kernel_body, ispace + kernel_body, self._ctx.get_iteration_space() ) # Fold and extract constants @@ -179,6 +149,53 @@ class DefaultKernelCreationDriver: self._cfg.get_jit(), ) + def parse_kernel_body( + self, + assignments: AssignmentCollection | Sequence[AssignmentBase] | AssignmentBase, + ) -> PsBlock: + if isinstance(assignments, AssignmentBase): + assignments = [assignments] + + if not isinstance(assignments, AssignmentCollection): + assignments = AssignmentCollection(assignments) # type: ignore + + _ = _parse_simplification_hints(assignments) + + analysis = KernelAnalysis( + self._ctx, + not self._cfg.skip_independence_check, + not self._cfg.allow_double_writes, + ) + analysis(assignments) + + if self._cfg.index_field is not None: + ispace = create_sparse_iteration_space( + self._ctx, assignments, index_field=self._cfg.index_field + ) + else: + gls = self._cfg.ghost_layers + islice = self._cfg.iteration_slice + + if gls is None and islice is None: + gls = AUTO + + ispace = create_full_iteration_space( + self._ctx, + assignments, + ghost_layers=gls, + iteration_slice=islice, + ) + + self._ctx.set_iteration_space(ispace) + + freeze = FreezeExpressions(self._ctx) + kernel_body = freeze(assignments) + + typify = Typifier(self._ctx) + kernel_body = typify(kernel_body) + + return kernel_body + def _transform_for_cpu(self, kernel_ast: PsBlock): canonicalize = CanonicalizeSymbols(self._ctx, True) kernel_ast = cast(PsBlock, canonicalize(kernel_ast)) diff --git a/src/pystencils/simp/simplificationstrategy.py b/src/pystencils/simp/simplificationstrategy.py index 22ffa34d04bc2731f615bd685137c8abebf9d58b..7cba94f8bb80e69afe039a1bc88822e627781a23 100644 --- a/src/pystencils/simp/simplificationstrategy.py +++ b/src/pystencils/simp/simplificationstrategy.py @@ -57,7 +57,7 @@ class SimplificationStrategy: def __str__(self): try: - import tabulate + from tabulate import tabulate return tabulate(self.elements, headers=['Name', 'Runtime', 'Adds', 'Muls', 'Divs', 'Total']) except ImportError: result = "Name, Adds, Muls, Divs, Runtime\n" diff --git a/src/pystencils/types/types.py b/src/pystencils/types/types.py index ae751992db61c7bcb2230c5c4dec780eb7ed6d4c..7645a452ffaad83de802da90e10b93fa43e1d3dc 100644 --- a/src/pystencils/types/types.py +++ b/src/pystencils/types/types.py @@ -683,7 +683,7 @@ class PsIeeeFloatType(PsScalarType): def create_constant(self, value: Any) -> Any: np_type = self.NUMPY_TYPES[self._width] - if isinstance(value, (int, float, np.floating)): + if isinstance(value, (int, float, np.integer, np.floating)): finfo = np.finfo(np_type) # type: ignore if value < finfo.min or value > finfo.max: raise PsTypeError( diff --git a/tests/fixtures.py b/tests/fixtures.py new file mode 100644 index 0000000000000000000000000000000000000000..7c95216147d02b7943b0c218af6fbb5b5175559b --- /dev/null +++ b/tests/fixtures.py @@ -0,0 +1,71 @@ +"""Fixtures for the pystencils test suite + +This module provides a number of fixtures used by the pystencils test suite. +Use these fixtures wherever applicable to extend the code surface area covered +by your tests: + +- All tests that should work for every target should use the `target` fixture +- All tests that should work with the highest optimization level for every target + should use the `gen_config` fixture +- Use the `xp` fixture to access the correct array module (numpy or cupy) depending + on the target + +""" + +import pytest + +from types import ModuleType +from dataclasses import replace + +import pystencils as ps + +AVAILABLE_TARGETS = [ps.Target.GenericCPU] + +try: + import cupy + + AVAILABLE_TARGETS += [ps.Target.CUDA] +except ImportError: + pass + +AVAILABLE_TARGETS += ps.Target.available_vector_cpu_targets() +TARGET_IDS = [t.name for t in AVAILABLE_TARGETS] + +@pytest.fixture(params=AVAILABLE_TARGETS, ids=TARGET_IDS) +def target(request) -> ps.Target: + """Provides all code generation targets available on the current hardware""" + return request.param + +@pytest.fixture +def gen_config(target: ps.Target): + """Default codegen configuration for the current target. + + For GPU targets, set default indexing options. + For vector-CPU targets, set default vectorization config. + """ + + gen_config = ps.CreateKernelConfig(target=target) + + if target.is_vector_cpu(): + gen_config = replace( + gen_config, + cpu_optim=ps.CpuOptimConfig( + vectorize=ps.VectorizationConfig(assume_inner_stride_one=True) + ), + ) + + return gen_config + +@pytest.fixture() +def xp(target: ps.Target) -> ModuleType: + """Primary array module for the current target. + + Returns: + `cupy` if `target == Target.CUDA`, and `numpy` otherwise + """ + if target == ps.Target.CUDA: + import cupy as xp + return xp + else: + import numpy as np + return np diff --git a/tests/kernelcreation/test_domain_kernels.py b/tests/kernelcreation/test_domain_kernels.py index b9cebb3548851af29406acb93a3a4ea14127e35e..d02bfd8e46e8fc8c8f19bcebffc0db52787ff1bd 100644 --- a/tests/kernelcreation/test_domain_kernels.py +++ b/tests/kernelcreation/test_domain_kernels.py @@ -18,35 +18,6 @@ from pystencils.assignment import assignment_from_stencil from pystencils.kernelcreation import create_kernel, KernelFunction from pystencils.backend.emission import emit_code -AVAILABLE_TARGETS = [Target.GenericCPU] - -try: - import cupy - - AVAILABLE_TARGETS += [Target.CUDA] -except ImportError: - pass - -AVAILABLE_TARGETS += Target.available_vector_cpu_targets() -TEST_IDS = [t.name for t in AVAILABLE_TARGETS] - - -@pytest.fixture(params=AVAILABLE_TARGETS, ids=TEST_IDS) -def gen_config(request): - target: Target = request.param - - gen_config = CreateKernelConfig(target=target) - - if Target._VECTOR in target: - gen_config = replace( - gen_config, - cpu_optim=CpuOptimConfig( - vectorize=VectorizationConfig(assume_inner_stride_one=True) - ), - ) - - return gen_config - def inspect_dp_kernel(kernel: KernelFunction, gen_config: CreateKernelConfig): code = emit_code(kernel) diff --git a/tests/kernelcreation/test_functions.py b/tests/kernelcreation/test_functions.py index e16201f819161fc611ec16b09dc98a7d7abf9445..cab1affcb3a344033458829ba28765a9a1ff8c48 100644 --- a/tests/kernelcreation/test_functions.py +++ b/tests/kernelcreation/test_functions.py @@ -7,7 +7,6 @@ from pystencils.backend.ast import dfs_preorder from pystencils.backend.ast.expressions import PsCall - def unary_function(name, xp): return { "exp": (sp.exp, xp.exp), diff --git a/tests/kernelcreation/test_iteration_slices.py b/tests/kernelcreation/test_iteration_slices.py new file mode 100644 index 0000000000000000000000000000000000000000..fb7f37eba99e2625a0eeb050a49575f657342ca5 --- /dev/null +++ b/tests/kernelcreation/test_iteration_slices.py @@ -0,0 +1,192 @@ +import numpy as np +import sympy as sp +import pytest + +from dataclasses import replace + +from pystencils import ( + DEFAULTS, + Assignment, + Field, + TypedSymbol, + create_kernel, + make_slice, + Target, + CreateKernelConfig, + GpuIndexingConfig, + DynamicType, +) +from pystencils.sympyextensions.integer_functions import int_rem +from pystencils.simp import sympy_cse_on_assignment_list +from pystencils.slicing import normalize_slice +from pystencils.backend.jit.gpu_cupy import CupyKernelWrapper + + +def test_sliced_iteration(): + size = (4, 4) + src_arr = np.ones(size) + dst_arr = np.zeros_like(src_arr) + src_field = Field.create_from_numpy_array("src", src_arr) + dst_field = Field.create_from_numpy_array("dst", dst_arr) + + a, b = sp.symbols("a b") + update_rule = Assignment( + dst_field[0, 0], + ( + a * src_field[0, 1] + + a * src_field[0, -1] + + b * src_field[1, 0] + + b * src_field[-1, 0] + ) + / 4, + ) + + x_end = TypedSymbol("x_end", "int") + s = make_slice[1:x_end, 1] + x_end_value = size[1] - 1 + kernel = create_kernel( + sympy_cse_on_assignment_list([update_rule]), iteration_slice=s + ).compile() + + kernel(src=src_arr, dst=dst_arr, a=1.0, b=1.0, x_end=x_end_value) + + expected_result = np.zeros(size) + expected_result[1:x_end_value, 1] = 1 + np.testing.assert_almost_equal(expected_result, dst_arr) + + +@pytest.mark.parametrize( + "islice", + [ + make_slice[1:-1, 1:-1], + make_slice[3, 2:-2], + make_slice[2:-2:2, ::3], + make_slice[10:, :-5:2], + make_slice[-5:-1, -1], + make_slice[-3, -1] + ], +) +def test_numerical_slices(gen_config: CreateKernelConfig, xp, islice): + shape = (16, 16) + + f_arr = xp.zeros(shape) + expected = xp.zeros_like(f_arr) + expected[islice] = 1.0 + + f = Field.create_from_numpy_array("f", f_arr) + + update = Assignment(f.center(), 1) + gen_config = replace(gen_config, iteration_slice=islice) + + try: + kernel = create_kernel(update, gen_config).compile() + except NotImplementedError: + if gen_config.target.is_vector_cpu(): + # TODO Gather/Scatter not implemented yet + pytest.xfail("Gather/Scatter not available yet") + + kernel(f=f_arr) + + xp.testing.assert_array_equal(f_arr, expected) + + +def test_symbolic_slice(gen_config: CreateKernelConfig, xp): + shape = (16, 16) + + sx, sy, ex, ey = [ + TypedSymbol(n, DynamicType.INDEX_TYPE) for n in ("sx", "sy", "ex", "ey") + ] + + f_arr = xp.zeros(shape) + + f = Field.create_from_numpy_array("f", f_arr) + + update = Assignment(f.center(), 1) + islice = make_slice[sy:ey, sx:ex] + gen_config = replace(gen_config, iteration_slice=islice) + kernel = create_kernel(update, gen_config).compile() + + for slic in [make_slice[:, :], make_slice[1:-1, 2:-2], make_slice[8:14, 7:11]]: + slic = normalize_slice(slic, shape) + expected = xp.zeros_like(f_arr) + expected[slic] = 1.0 + + f_arr[:] = 0.0 + + kernel( + f=f_arr, + sy=slic[0].start, + ey=slic[0].stop, + sx=slic[1].start, + ex=slic[1].stop, + ) + + xp.testing.assert_array_equal(f_arr, expected) + + +def test_triangle_pattern(gen_config: CreateKernelConfig, xp): + shape = (16, 16) + + f_arr = xp.zeros(shape) + f = Field.create_from_numpy_array("f", f_arr) + + expected = xp.zeros_like(f_arr) + for r in range(shape[0]): + expected[r, r:] = 1.0 + + update = Assignment(f.center(), 1) + outer_counter = DEFAULTS.spatial_counters[0] + islice = make_slice[:, outer_counter:] + gen_config = replace(gen_config, iteration_slice=islice) + + if gen_config.target == Target.CUDA: + gen_config = replace( + gen_config, gpu_indexing=GpuIndexingConfig(manual_launch_grid=True) + ) + + kernel = create_kernel(update, gen_config).compile() + + if isinstance(kernel, CupyKernelWrapper): + kernel.block_size = shape + (1,) + kernel.num_blocks = (1, 1, 1) + + kernel(f=f_arr) + + xp.testing.assert_array_equal(f_arr, expected) + + +def test_red_black_pattern(gen_config: CreateKernelConfig, xp): + shape = (16, 16) + + f_arr = xp.zeros(shape) + f = Field.create_from_numpy_array("f", f_arr) + + expected = xp.zeros_like(f_arr) + for r in range(shape[0]): + start = 0 if r % 2 == 0 else 1 + expected[r, start::2] = 1.0 + + update = Assignment(f.center(), 1) + outer_counter = DEFAULTS.spatial_counters[0] + start = sp.Piecewise((0, sp.Eq(int_rem(outer_counter, 2), 0)), (1, True)) + islice = make_slice[:, start::2] + gen_config = replace(gen_config, iteration_slice=islice) + + if gen_config.target == Target.CUDA: + gen_config = replace( + gen_config, gpu_indexing=GpuIndexingConfig(manual_launch_grid=True) + ) + + try: + kernel = create_kernel(update, gen_config).compile() + except NotImplementedError: + if gen_config.target.is_vector_cpu(): + pytest.xfail("Gather/Scatter not implemented yet") + + if isinstance(kernel, CupyKernelWrapper): + kernel.block_size = (8, 16, 1) + kernel.num_blocks = (1, 1, 1) + + kernel(f=f_arr) + + xp.testing.assert_array_equal(f_arr, expected) diff --git a/tests/kernelcreation/test_sliced_iteration.py b/tests/kernelcreation/test_sliced_iteration.py deleted file mode 100644 index 5eff0a89d4d23386d00e7408a19ece93453d7d9d..0000000000000000000000000000000000000000 --- a/tests/kernelcreation/test_sliced_iteration.py +++ /dev/null @@ -1,29 +0,0 @@ -import numpy as np -import sympy as sp - -from pystencils import Assignment, Field, TypedSymbol, create_kernel, make_slice -from pystencils.simp import sympy_cse_on_assignment_list - - -def test_sliced_iteration(): - size = (4, 4) - src_arr = np.ones(size) - dst_arr = np.zeros_like(src_arr) - src_field = Field.create_from_numpy_array('src', src_arr) - dst_field = Field.create_from_numpy_array('dst', dst_arr) - - a, b = sp.symbols("a b") - update_rule = Assignment(dst_field[0, 0], - (a * src_field[0, 1] + a * src_field[0, -1] + - b * src_field[1, 0] + b * src_field[-1, 0]) / 4) - - x_end = TypedSymbol("x_end", "int") - s = make_slice[1:x_end, 1] - x_end_value = size[1] - 1 - kernel = create_kernel(sympy_cse_on_assignment_list([update_rule]), iteration_slice=s).compile() - - kernel(src=src_arr, dst=dst_arr, a=1.0, b=1.0, x_end=x_end_value) - - expected_result = np.zeros(size) - expected_result[1:x_end_value, 1] = 1 - np.testing.assert_almost_equal(expected_result, dst_arr) diff --git a/tests/nbackend/kernelcreation/test_iteration_space.py b/tests/nbackend/kernelcreation/test_iteration_space.py index 5d56abd2b818fa74fbd48aac0216d472112f8c64..abc1c9820002eb08454ef5bac7c0e1ba2bfca3ba 100644 --- a/tests/nbackend/kernelcreation/test_iteration_space.py +++ b/tests/nbackend/kernelcreation/test_iteration_space.py @@ -129,6 +129,36 @@ def test_slices_with_negative_start(): ) +def test_negative_singular_slices(): + ctx = KernelCreationContext() + factory = AstFactory(ctx) + + archetype_field = Field.create_generic("f", spatial_dimensions=2, layout="fzyx") + ctx.add_field(archetype_field) + archetype_arr = ctx.get_buffer(archetype_field) + + islice = (-2, -1) + ispace = FullIterationSpace.create_from_slice(ctx, islice, archetype_field) + + dims = ispace.dimensions + + assert dims[0].start.structurally_equal( + PsExpression.make(archetype_arr.shape[0]) + factory.parse_index(-2) + ) + + assert dims[0].stop.structurally_equal( + PsExpression.make(archetype_arr.shape[0]) + factory.parse_index(-1) + ) + + assert dims[1].start.structurally_equal( + PsExpression.make(archetype_arr.shape[1]) + factory.parse_index(-1) + ) + + assert dims[1].stop.structurally_equal( + PsExpression.make(archetype_arr.shape[1]) + ) + + def test_field_independent_slices(): ctx = KernelCreationContext() diff --git a/tests/nbackend/test_vectorization.py b/tests/nbackend/test_vectorization.py index aeb0ebaa0601537331d4daaee1bfcbfdcccb2e6a..55330c9ee8d5d675379418748aa085ab4ce3ae73 100644 --- a/tests/nbackend/test_vectorization.py +++ b/tests/nbackend/test_vectorization.py @@ -89,6 +89,11 @@ TEST_SETUPS: list[VectorTestSetup] = list( TEST_IDS = [t.name for t in TEST_SETUPS] +@pytest.fixture(params=TEST_SETUPS, ids=TEST_IDS) +def vectorization_setup(request) -> VectorTestSetup: + return request.param + + def create_vector_kernel( assignments: list[Assignment], field: Field, @@ -139,9 +144,9 @@ def create_vector_kernel( return kernel -@pytest.mark.parametrize("setup", TEST_SETUPS, ids=TEST_IDS) @pytest.mark.parametrize("ghost_layers", [0, 2]) -def test_update_kernel(setup: VectorTestSetup, ghost_layers: int): +def test_update_kernel(vectorization_setup: VectorTestSetup, ghost_layers: int): + setup = vectorization_setup src, dst = fields(f"src(2), dst(4): {setup.numeric_dtype}[2D]", layout="fzyx") x = sp.symbols("x_:4") @@ -197,8 +202,8 @@ def test_update_kernel(setup: VectorTestSetup, ghost_layers: int): np.testing.assert_equal(dst_arr[:, -i, :], 0.0) -@pytest.mark.parametrize("setup", TEST_SETUPS, ids=TEST_IDS) -def test_trailing_iterations(setup: VectorTestSetup): +def test_trailing_iterations(vectorization_setup: VectorTestSetup): + setup = vectorization_setup f = fields(f"f(1): {setup.numeric_dtype}[1D]", layout="fzyx") update = [Assignment(f(0), 2 * f(0))] @@ -216,3 +221,24 @@ def test_trailing_iterations(setup: VectorTestSetup): kernel(f=f_arr) np.testing.assert_equal(f_arr, 2.0) + + +def test_only_trailing_iterations(vectorization_setup: VectorTestSetup): + setup = vectorization_setup + f = fields(f"f(1): {setup.numeric_dtype}[1D]", layout="fzyx") + + update = [Assignment(f(0), 2 * f(0))] + + kernel = create_vector_kernel(update, f, setup) + + for trailing_iters in range(1, setup.lanes): + shape = (trailing_iters, 1) + f_arr = create_numpy_array_with_layout( + shape, layout=(1, 0), dtype=setup.numeric_dtype.numpy_dtype + ) + + f_arr[:] = 1.0 + + kernel(f=f_arr) + + np.testing.assert_equal(f_arr, 2.0)