diff --git a/src/pystencils/backend/platforms/platform.py b/src/pystencils/backend/platforms/platform.py index 2c7ee1c5f4750eac0375bc31a3f44b9eea50642b..9b7e642b5b6634e9f65a6b18b54785e89f6df362 100644 --- a/src/pystencils/backend/platforms/platform.py +++ b/src/pystencils/backend/platforms/platform.py @@ -1,5 +1,4 @@ from abc import ABC, abstractmethod -from typing import Any from ..ast.structural import PsBlock from ..ast.expressions import PsCall, PsExpression @@ -28,7 +27,7 @@ class Platform(ABC): @abstractmethod def materialize_iteration_space( self, body: PsBlock, ispace: IterationSpace - ) -> PsBlock | tuple[PsBlock, Any]: + ) -> PsBlock: pass @abstractmethod diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index 152fceba832e5be49df9405a5d32989be315dc4e..b14fc027255481a51a34767495b2a2c8b77b59d1 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -1,5 +1,5 @@ from __future__ import annotations -from typing import cast, Sequence, Iterable, TYPE_CHECKING +from typing import cast, Sequence, Iterable, Callable, TYPE_CHECKING from dataclasses import dataclass, replace from .target import Target @@ -15,7 +15,7 @@ from .config import ( from .kernel import Kernel, GpuKernel from .properties import PsSymbolProperty, FieldBasePtr from .parameters import Parameter -from .gpu_indexing import GpuIndexing, GpuLaunchGridConstraints +from .gpu_indexing import GpuIndexing, GpuLaunchConfiguration from ..field import Field from ..types import PsIntegerType, PsScalarType @@ -40,7 +40,6 @@ from ..backend.platforms import ( Platform, GenericCpu, GenericVectorCpu, - GenericGpu, ) from ..backend.exceptions import VectorizationError @@ -166,17 +165,9 @@ class DefaultKernelCreationDriver: ) -> Kernel: kernel_body = self.parse_kernel_body(assignments) - match self._platform: - case GenericCpu(): - kernel_ast = self._platform.materialize_iteration_space( - kernel_body, self._ctx.get_iteration_space() - ) - case GenericGpu(): - kernel_ast = self._platform.materialize_iteration_space( - kernel_body, self._ctx.get_iteration_space() - ) - case _: - assert False, "unexpected platform" + kernel_ast = self._platform.materialize_iteration_space( + kernel_body, self._ctx.get_iteration_space() + ) if self._intermediates is not None: self._intermediates.materialized_ispace = kernel_ast.clone() @@ -220,14 +211,16 @@ class DefaultKernelCreationDriver: self._cfg.get_jit(), ) else: + assert self._gpu_indexing is not None + return create_gpu_kernel_function( self._ctx, self._platform, kernel_ast, - self._gpu_indexing, self._cfg.get_option("function_name"), self._target, self._cfg.get_jit(), + self._gpu_indexing.get_launch_config, ) def parse_kernel_body( @@ -500,30 +493,24 @@ def create_gpu_kernel_function( ctx: KernelCreationContext, platform: Platform, body: PsBlock, - indexing: GpuIndexing | None, function_name: str, target_spec: Target, jit: JitBase, + launch_config_factory: Callable[[GpuKernel], GpuLaunchConfiguration], ) -> GpuKernel: undef_symbols = collect_undefined_symbols(body) - launch_grid_constraints = ( - indexing.get_launch_grid_constraints() - if indexing is not None - else GpuLaunchGridConstraints() - ) - params = _get_function_params(ctx, undef_symbols) req_headers = _get_headers(ctx, platform, body) kfunc = GpuKernel( body, - launch_grid_constraints, target_spec, function_name, params, req_headers, jit, + launch_config_factory, ) kfunc.metadata.update(ctx.metadata) return kfunc diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index 2b84ef00751ef12bc07f5642e71f7a9805e6184a..08134a6221609bf39f249622bccb1eb8ebabda38 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -1,12 +1,12 @@ from __future__ import annotations from abc import ABC, abstractmethod -from typing import cast +from typing import cast, Any from itertools import chain from .functions import Lambda +from .kernel import GpuKernel from .parameters import Parameter -from .properties import GpuBlockSize from .errors import CodegenError from ..backend.kernelcreation import ( @@ -18,34 +18,30 @@ from ..backend.platforms.cuda import ThreadToIndexMapping from ..backend.ast.expressions import PsExpression -_ConstraintTriple = tuple[Lambda | None, Lambda | None, Lambda | None] +_Dim3Lambda = tuple[Lambda, Lambda, Lambda] -class GpuLaunchGridConstraints: - """Constraints on the number of threads and blocks on the GPU launch grid for a given kernel. +class GpuLaunchConfiguration: + """Base class for launch configurations for CUDA and HIP kernels. - This constraints set determines all or some of - the number of threads on a GPU block as well as the number of blocks on the GPU grid, - statically or depending on runtime parameters. + Args: + block_size: A triple of lambdas determining the GPU block size + grid_size: A triple of lambdas determining the GPU grid size + config_parameters: Set containing all parameters to the given lambdas that are not also + parameters to the associated kernel """ def __init__( self, - block_size: _ConstraintTriple | None = None, - grid_size: _ConstraintTriple | None = None, + block_size: _Dim3Lambda, + grid_size: _Dim3Lambda, + config_parameters: set[Parameter], ) -> None: - self._block_size: _ConstraintTriple = ( - (None, None, None) if block_size is None else block_size - ) - self._grid_size: _ConstraintTriple = ( - (None, None, None) if grid_size is None else grid_size - ) + self._block_size = block_size + self._grid_size = grid_size - params = set() - for constr in chain(self._block_size, self._grid_size): - if constr is not None: - params |= set(constr.parameters) - self._params = frozenset(params) + self._params = frozenset(config_parameters) + self._valuation: dict[Parameter, Any] = dict() @property def parameters(self) -> frozenset[Parameter]: @@ -53,12 +49,18 @@ class GpuLaunchGridConstraints: return self._params @property - def block_size(self) -> _ConstraintTriple: + def parameter_values(self) -> dict[Parameter, Any]: + """Values for all parameters that are specific to the launch grid configuration and not + also kernel parameters.""" + return self._valuation + + @property + def block_size(self) -> _Dim3Lambda: """Constraints on the number of threads per block""" return self._block_size @property - def grid_size(self) -> _ConstraintTriple: + def grid_size(self) -> _Dim3Lambda: """Constraints on the number of blocks on the grid""" return self._grid_size @@ -68,7 +70,7 @@ class GpuIndexing(ABC): def get_thread_mapping(self) -> ThreadToIndexMapping | None: ... @abstractmethod - def get_launch_grid_constraints(self) -> GpuLaunchGridConstraints: ... + def get_launch_config(self, kernel: GpuKernel) -> GpuLaunchConfiguration: ... class Linear3DGpuIndexing(GpuIndexing): @@ -85,29 +87,48 @@ class Linear3DGpuIndexing(GpuIndexing): return Linear3DMapping() - def get_launch_grid_constraints(self) -> GpuLaunchGridConstraints: + def get_launch_config(self, kernel: GpuKernel) -> GpuLaunchConfiguration: + block_size, grid_size = self._prepare_launch_grid() + + kernel_params = set(kernel.parameters) + launch_config_params = ( + set().union( + *(lb.parameters for lb in chain(block_size, grid_size)) + ) + - kernel_params + ) + + return GpuLaunchConfiguration( + block_size=cast(_Dim3Lambda, tuple(block_size)), + grid_size=cast(_Dim3Lambda, tuple(grid_size)), + config_parameters=launch_config_params, + ) + + def _prepare_launch_grid(self): work_items = self._get_work_items() rank = len(work_items) from ..backend.constants import PsConstant from ..backend.ast.expressions import PsExpression, PsIntDiv - block_size_constraints = [None] * rank + [ - Lambda(self._factory.parse_index(1), ()) for _ in range(3 - rank) - ] - block_size_symbols = [ self._ctx.get_new_symbol(f"gpuBlockSize_{c}") for c in range(rank) ] - for c, bs in enumerate(block_size_symbols): - bs.add_property(GpuBlockSize(c)) + + block_size = [ + Lambda.from_expression(self._ctx, self._factory.parse_index(bs_symb)) + for bs_symb in block_size_symbols + ] + [ + Lambda.from_expression(self._ctx, self._factory.parse_index(1)) + for _ in range(3 - rank) + ] def div_ceil(a: PsExpression, b: PsExpression): return self._factory.parse_index( PsIntDiv(a + b - PsExpression.make(PsConstant(1)), b) ) - grid_size_constraints = [ + grid_size = [ Lambda.from_expression( self._ctx, div_ceil(witems, PsExpression.make(bsize)) ) @@ -117,10 +138,7 @@ class Linear3DGpuIndexing(GpuIndexing): for _ in range(3 - rank) ] - return GpuLaunchGridConstraints( - block_size=cast(_ConstraintTriple, tuple(block_size_constraints)), - grid_size=cast(_ConstraintTriple, tuple(grid_size_constraints)), - ) + return block_size, grid_size def _get_work_items(self) -> tuple[PsExpression, ...]: ispace = self._ctx.get_iteration_space() diff --git a/src/pystencils/codegen/kernel.py b/src/pystencils/codegen/kernel.py index 8038f24b017bf3b2a51cc0adb87959f8a9ea3c5b..67ef6554cb8cfb00c19ad3bd9b7609076d47a8e8 100644 --- a/src/pystencils/codegen/kernel.py +++ b/src/pystencils/codegen/kernel.py @@ -6,7 +6,6 @@ from itertools import chain from .target import Target from .parameters import Parameter -from .gpu_indexing import GpuLaunchGridConstraints from ..backend.ast.structural import PsBlock from ..field import Field @@ -14,6 +13,7 @@ from .._deprecation import _deprecated if TYPE_CHECKING: from ..jit import JitBase + from .gpu_indexing import GpuLaunchConfiguration class Kernel: @@ -118,17 +118,16 @@ class GpuKernel(Kernel): def __init__( self, body: PsBlock, - launch_grid_constraints: GpuLaunchGridConstraints, target: Target, name: str, parameters: Sequence[Parameter], required_headers: set[str], jit: JitBase, + launch_config_factory: Callable[[GpuKernel], GpuLaunchConfiguration], ): super().__init__(body, target, name, parameters, required_headers, jit) - self._launch_grid_constraints = launch_grid_constraints + self._launch_config_factory = launch_config_factory - @property - def launch_grid_constraints(self) -> GpuLaunchGridConstraints: + def get_launch_configuration(self) -> GpuLaunchConfiguration: """Object exposing the total size of the launch grid this kernel expects to be executed with.""" - return self._launch_grid_constraints + return self._launch_config_factory(self) diff --git a/src/pystencils/codegen/properties.py b/src/pystencils/codegen/properties.py index df76489db175fb7fc576755a1008edb47f142493..d377fb3d35d99b59c4f364cc4d066b736bfd9140 100644 --- a/src/pystencils/codegen/properties.py +++ b/src/pystencils/codegen/properties.py @@ -39,8 +39,3 @@ class FieldBasePtr(UniqueSymbolProperty): FieldProperty = FieldShape | FieldStride | FieldBasePtr _FieldProperty = (FieldShape, FieldStride, FieldBasePtr) - - -@dataclass(frozen=True) -class GpuBlockSize(UniqueSymbolProperty): - coordinate: int diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py index afdbd5097dbd3b33957e1dc342d51ca7e03992c4..f3f834f769767a0165aaa29f97d7196a9ef2dd5e 100644 --- a/src/pystencils/jit/gpu_cupy.py +++ b/src/pystencils/jit/gpu_cupy.py @@ -18,6 +18,7 @@ from ..codegen import ( GpuKernel, Parameter, ) +from ..codegen.gpu_indexing import GpuLaunchConfiguration from ..codegen.properties import FieldShape, FieldStride, FieldBasePtr from ..types import PsStructType, PsPointerType @@ -35,38 +36,24 @@ class CupyKernelWrapper(KernelWrapper): self, kfunc: GpuKernel, raw_kernel: Any, - block_size: tuple[int, int, int], ): self._kfunc: GpuKernel = kfunc + self._launch_config = kfunc.get_launch_configuration() self._raw_kernel = raw_kernel - self._block_size = block_size - self._grid_size: tuple[int, int, int] | None = None self._args_cache: dict[Any, tuple] = dict() @property def kernel_function(self) -> GpuKernel: return self._kfunc + + @property + def launch_config(self) -> GpuLaunchConfiguration: + return self._launch_config @property def raw_kernel(self): return self._raw_kernel - @property - def block_size(self) -> tuple[int, int, int]: - return self._block_size - - @block_size.setter - def block_size(self, bs: tuple[int, int, int]): - self._block_size = bs - - @property - def num_blocks(self) -> tuple[int, int, int] | None: - return self._grid_size - - @num_blocks.setter - def num_blocks(self, nb: tuple[int, int, int] | None): - self._grid_size = nb - def __call__(self, **kwargs: Any): kernel_args, launch_grid = self._get_cached_args(**kwargs) device = self._get_device(kernel_args) @@ -80,9 +67,10 @@ class CupyKernelWrapper(KernelWrapper): return devices.pop() def _get_cached_args(self, **kwargs): - key = (self._block_size, self._grid_size) + tuple( - (k, id(v)) for k, v in kwargs.items() - ) + launch_config_params = self._launch_config.parameter_values + key = tuple( + (k, v) for k, v in launch_config_params.items() + ) + tuple((k, id(v)) for k, v in kwargs.items()) if key not in self._args_cache: args = self._get_args(**kwargs) @@ -203,48 +191,32 @@ class CupyKernelWrapper(KernelWrapper): # Determine launch grid - from ..codegen.gpu_indexing import GpuBlockSize - - constraints = self._kfunc.launch_grid_constraints + launch_cfg_valuation = valuation.copy() + launch_cfg_valuation.update( + { + param.name: value + for param, value in self._launch_config.parameter_values.items() + } + ) - for cparam in constraints.parameters: - for prop in cparam.properties: - match prop: - case GpuBlockSize(coord): - valuation[cparam.name] = self._block_size[coord] - break - else: - valuation[cparam.name] = kwargs[cparam.name] - - # launch_block_size: list[int] = [] - # for coord, (bsize_constr, user_bsize) in enumerate( - # zip(constraints.block_size, self._block_size) - # ): - # if bsize_constr is None: - # launch_grid_size - - launch_block_size = [ - ( - int(bsize_constr(**valuation)) - if bsize_constr is not None - else self._block_size[coord] - ) - for coord, bsize_constr in enumerate(constraints.block_size) - ] - - launch_grid_size = [ - ( - int(gsize_constr(**valuation)) - if gsize_constr is not None - else self._grid_size[coord] - ) - for coord, gsize_constr in enumerate(constraints.grid_size) - ] + block_size = cast( + tuple[int, int, int], + tuple( + int(component(**launch_cfg_valuation)) + for component in self._launch_config.block_size + ), + ) - return tuple(args), LaunchGrid( - tuple(launch_grid_size), tuple(launch_block_size) + grid_size = cast( + tuple[int, int, int], + tuple( + int(component(**launch_cfg_valuation)) + for component in self._launch_config.grid_size + ), ) + return tuple(args), LaunchGrid(grid_size, block_size) + class CupyJit(JitBase): @@ -261,26 +233,26 @@ class CupyJit(JitBase): tuple(default_block_size) + (1,) * (3 - len(default_block_size)), ) - def compile(self, kfunc: Kernel) -> KernelWrapper: + def compile(self, kernel: Kernel) -> KernelWrapper: if not HAVE_CUPY: raise JitError( "`cupy` is not installed: just-in-time-compilation of CUDA kernels is unavailable." ) - if not isinstance(kfunc, GpuKernel) or kfunc.target != Target.CUDA: + if not isinstance(kernel, GpuKernel) or kernel.target != Target.CUDA: raise ValueError( "The CupyJit just-in-time compiler only accepts kernels generated for CUDA or HIP" ) options = self._compiler_options() - prelude = self._prelude(kfunc) - kernel_code = self._kernel_code(kfunc) + prelude = self._prelude(kernel) + kernel_code = self._kernel_code(kernel) code = prelude + kernel_code raw_kernel = cp.RawKernel( - code, kfunc.name, options=options, backend="nvrtc", jitify=True + code, kernel.name, options=options, backend="nvrtc", jitify=True ) - return CupyKernelWrapper(kfunc, raw_kernel, self._default_block_size) + return CupyKernelWrapper(kernel, raw_kernel) def _compiler_options(self) -> tuple[str, ...]: options = ["-w", "-std=c++11"]