From 9a15d1f77df46864d3d16068c71f99ec07b51628 Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Mon, 10 Feb 2025 18:39:26 +0100 Subject: [PATCH] Implement Lambda and GpuLaunchGridConstraints. Start on backend WorkItemMapping. --- .../backend/platforms/generic_gpu.py | 26 ++++++++-- src/pystencils/codegen/config.py | 30 +++++++++++ src/pystencils/codegen/gpu_indexing.py | 51 +++++++++++++++++++ src/pystencils/codegen/lambdas.py | 45 ++++++++++++++++ 4 files changed, 149 insertions(+), 3 deletions(-) create mode 100644 src/pystencils/codegen/gpu_indexing.py create mode 100644 src/pystencils/codegen/lambdas.py diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py index 15df36cdd..f22c3c99b 100644 --- a/src/pystencils/backend/platforms/generic_gpu.py +++ b/src/pystencils/backend/platforms/generic_gpu.py @@ -1,6 +1,6 @@ from __future__ import annotations from typing import TYPE_CHECKING -from abc import abstractmethod +from abc import ABC, abstractmethod from ..ast.expressions import PsExpression from ..ast.structural import PsBlock @@ -16,6 +16,26 @@ if TYPE_CHECKING: from ...codegen.kernel import GpuThreadsRange +class WorkItemMapping(ABC): + """Signature for work-item mappings used to modify the thread index mapping behavior""" + + @abstractmethod + def __call__( + self, + block_idx: tuple[PsExpression, PsExpression, PsExpression], + thread_idx: tuple[PsExpression, PsExpression, PsExpression], + ispace_rank: int, + ) -> tuple[PsExpression, ...]: + """Compute a work item index from the current block index, thread index, and iteration space dimensionality. + + Implementations of this method must return a tuple with `ispace_rank` entries, + containing expressions for the compressed index of the work item identified by the + given GPU block and thread index triples. + (The *compressed index* is the work item's index before application + of the iteration space's lower limits and strides.) 
+ """ + + class GenericGpu(Platform): @abstractmethod def materialize_iteration_space( @@ -38,13 +58,13 @@ class GenericGpu(Platform): @classmethod def _threads_from_full_ispace(cls, ispace: FullIterationSpace) -> GpuThreadsRange: from ...codegen.kernel import GpuThreadsRange - + dimensions = ispace.dimensions_in_loop_order()[::-1] if len(dimensions) > 3: raise NotImplementedError( f"Cannot create a GPU threads range for an {len(dimensions)}-dimensional iteration space" ) - + from ..ast.analysis import collect_undefined_symbols as collect for dim in dimensions: diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py index bce075731..96ab13ea0 100644 --- a/src/pystencils/codegen/config.py +++ b/src/pystencils/codegen/config.py @@ -3,6 +3,7 @@ from __future__ import annotations from warnings import warn from abc import ABC from collections.abc import Collection +from enum import Enum, auto from typing import TYPE_CHECKING, Sequence, Generic, TypeVar, Callable, Any, cast from dataclasses import dataclass, InitVar, fields @@ -331,6 +332,35 @@ class CpuOptions(ConfigBase): """ +class GpuIndexingScheme(Enum): + """Available index translation schemes for GPU kernels.""" + + Linear3D = auto() + """Map coordinates to global thread indices. + + Supports up to three-dimensional iteration spaces. + For each dimension (with known start, stop and step values), compute the current iteration + point as ``start + step * (blockIdx.c * blockDim.c * threadDim.c)`` + (where c :math:`\\in` (x, y, z)). + """ + + Blockwise4D = auto() + """On a 3D grid of 1D blocks, map the fastest coordinate onto the intra-block thread index, + and slower coordinates onto the block index. + + Supports up to four-dimensional iteration spaces. + Using this indexing scheme, the iteration counters of up to four dimensions are assigned + like follows, from slowest to fastest: + + .. 
code-block:: C++ + + ctr_3 = blockIdx.z; + ctr_2 = blockIdx.y; + ctr_1 = blockIdx.x; + ctr_0 = threadIdx.x; + """ + + @dataclass class GpuOptions(ConfigBase): """Configuration options specific to GPU targets.""" diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py new file mode 100644 index 000000000..1d9bf9c2c --- /dev/null +++ b/src/pystencils/codegen/gpu_indexing.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +from itertools import chain + +from .lambdas import Lambda +from .parameters import Parameter + + +_ConstraintTriple = tuple[Lambda | None, Lambda | None, Lambda | None] + + +class GpuLaunchGridConstraints: + """Constraints on the number of threads and blocks on the GPU launch grid for a given kernel. + + This constraint set determines all or some of + the number of threads on a GPU block as well as the number of blocks on the GPU grid, + statically or depending on runtime parameters. + """ + + def __init__( + self, + block_size: _ConstraintTriple | None = None, + grid_size: _ConstraintTriple | None = None, + ) -> None: + self._block_size: _ConstraintTriple = ( + (None, None, None) if block_size is None else block_size + ) + self._grid_size: _ConstraintTriple = ( + (None, None, None) if grid_size is None else grid_size + ) + + params = set() + for constr in chain(self._block_size, self._grid_size): + if constr is not None: + params |= set(constr.parameters) + self._params = frozenset(params) + + @property + def parameters(self) -> frozenset[Parameter]: + """Parameters to this set of constraints""" + return self._params + + @property + def block_size(self) -> _ConstraintTriple: + """Constraints on the number of threads per block""" + return self._block_size + + @property + def grid_size(self) -> _ConstraintTriple: + """Constraints on the number of blocks on the grid""" + return self._grid_size diff --git a/src/pystencils/codegen/lambdas.py b/src/pystencils/codegen/lambdas.py new file mode 100644 index
000000000..dd0fb571d --- /dev/null +++ b/src/pystencils/codegen/lambdas.py @@ -0,0 +1,45 @@ +from __future__ import annotations +from typing import Sequence + +import numpy as np + +from .parameters import Parameter +from ..types import PsType + +from ..backend.ast.expressions import PsExpression + + +class Lambda: + """A one-line function emitted by the code generator as an auxiliary object.""" + + def __init__(self, expr: PsExpression, params: Sequence[Parameter]): + self._expr = expr + self._params = tuple(params) + self._return_type = expr.get_dtype() + + @property + def parameters(self) -> tuple[Parameter, ...]: + """Parameters to this lambda""" + return self._params + + @property + def return_type(self) -> PsType: + """Return type of this lambda""" + return self._return_type + + def __call__(self, **kwargs) -> np.generic: + """Evaluate this lambda with the given arguments. + + The lambda must receive a value for each parameter listed in `parameters`. + """ + from ..backend.ast.expressions import evaluate_expression + return evaluate_expression(self._expr, kwargs) + + def __str__(self) -> str: + return str(self._expr) + + def c_code(self) -> str: + """Print the C code of this lambda""" + from ..backend.emission import CAstPrinter + printer = CAstPrinter() + return printer(self._expr) -- GitLab