diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py
index 47a64df647e79fb931224b226550b5b68b6415ce..2d62f286b5a8c30799c408175f0dfa0d10c83740 100644
--- a/src/pystencils/codegen/config.py
+++ b/src/pystencils/codegen/config.py
@@ -381,7 +381,11 @@ class GpuOptions(ConfigBase):
     """

     block_size: BasicOption[tuple[int, int, int] | _AUTO_TYPE] = BasicOption(AUTO)
-    """Desired block size for the execution of GPU kernels. May be overridden later by the runtime system."""
+    """Desired block size for the execution of GPU kernels.
+
+    This option only takes effect if `Linear3D` is chosen as an indexing scheme.
+    The block size may be overridden at runtime.
+    """

     manual_launch_grid: BasicOption[bool] = BasicOption(False)
     """Always require a manually specified launch grid when running this kernel.
@@ -596,11 +600,8 @@ class CreateKernelConfig(ConfigBase):
         elif self.get_target() == Target.CUDA:
             try:
                 from ..jit.gpu_cupy import CupyJit
-
-                if self.gpu is not None and self.gpu.block_size is not None:
-                    return CupyJit(self.gpu.block_size)
-                else:
-                    return CupyJit()
+
+                return CupyJit()
             except ImportError:
                 from ..jit import no_jit

diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index f7eb8ddb41fe869cbd5b86358d31fdababca7afb..14a95c84d899638ea796d13cfddf7dd4e7ccd04f 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -220,7 +220,7 @@ class DefaultKernelCreationDriver:
             self._cfg.get_option("function_name"),
             self._target,
             self._cfg.get_jit(),
-            self._gpu_indexing.get_launch_config,
+            self._gpu_indexing.get_launch_config_factory(),
         )

     def parse_kernel_body(
@@ -397,15 +397,13 @@ class DefaultKernelCreationDriver:
         if self._target != Target.CUDA:
             return None

-        idx_scheme = self._cfg.gpu.get_option("indexing_scheme")
+        from .gpu_indexing import dim3

-        match idx_scheme:
-            case None | GpuIndexingScheme.Linear3D:
-                from .gpu_indexing import Linear3DGpuIndexing
+        idx_scheme: GpuIndexingScheme = self._cfg.gpu.get_option("indexing_scheme")
+        block_size: dim3 | _AUTO_TYPE = self._cfg.gpu.get_option("block_size")
+        manual_launch_grid: bool = self._cfg.gpu.get_option("manual_launch_grid")

-                return Linear3DGpuIndexing(self._ctx)
-            case _:
-                raise NotImplementedError()
+        return GpuIndexing(self._ctx, idx_scheme, block_size, manual_launch_grid)

     def _get_platform(self) -> Platform:
         if Target._CPU in self._target:
@@ -496,7 +494,7 @@ def create_gpu_kernel_function(
     function_name: str,
     target_spec: Target,
     jit: JitBase,
-    launch_config_factory: Callable[[GpuKernel], GpuLaunchConfiguration],
+    launch_config_factory: Callable[[], GpuLaunchConfiguration],
 ) -> GpuKernel:

     undef_symbols = collect_undefined_symbols(body)
diff --git a/src/pystencils/codegen/functions.py b/src/pystencils/codegen/functions.py
index 2779fa289e04cda9bc47fd46e48ff0ada9a98ad1..f6be3b1f3446c6b9a25a0013f0e06d099edf5bed 100644
--- a/src/pystencils/codegen/functions.py
+++ b/src/pystencils/codegen/functions.py
@@ -1,7 +1,5 @@
 from __future__ import annotations
-from typing import Sequence
-
-import numpy as np
+from typing import Sequence, Any

 from .parameters import Parameter
 from ..types import PsType
@@ -36,7 +34,7 @@ class Lambda:
         """Return type of this lambda"""
         return self._return_type

-    def __call__(self, **kwargs) -> np.generic:
+    def __call__(self, **kwargs) -> Any:
         """Evaluate this lambda with the given arguments.

         The lambda must receive a value for each parameter listed in `parameters`.
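Taken together, the three file diffs above move the GPU block size out of the JIT layer and into the code generation options: `CupyJit` no longer accepts a block size, and `GpuOptions.block_size` is only honored by the `Linear3D` indexing scheme. A minimal usage sketch, not part of the patch; it assumes the usual top-level pystencils API, and `asms` is a placeholder for an assignment collection defined elsewhere:

```python
# Hedged sketch: route the block size through the code generator options
# instead of constructing CupyJit(block_size) as before this patch.
from pystencils import CreateKernelConfig, Target, create_kernel

cfg = CreateKernelConfig(target=Target.CUDA)
cfg.gpu.block_size = (128, 2, 1)  # only takes effect with the Linear3D scheme

kernel = create_kernel(asms, cfg)  # the driver now always builds a plain CupyJit()
```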
diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 24189bf635b0cd8a5c64b8ceb3d65a14a4121d36..1e23a820e85fc7250b9d49196ba5d11ee0526f4d 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -5,7 +5,6 @@ from typing import cast, Any, Callable
 from itertools import chain

 from .functions import Lambda
-from .kernel import GpuKernel
 from .parameters import Parameter
 from .errors import CodegenError
 from .config import GpuIndexingScheme, _AUTO_TYPE
@@ -15,7 +14,7 @@ from ..backend.kernelcreation import (
     FullIterationSpace,
     SparseIterationSpace,
 )

-from ..backend.platforms.cuda import ThreadToIndexMapping
+

 from ..backend.ast.expressions import PsExpression
@@ -24,7 +23,7 @@
 _Dim3Params = tuple[Parameter, Parameter, Parameter]
 _Dim3Lambda = tuple[Lambda, Lambda, Lambda]

-class GpuLaunchConfiguration:
+class GpuLaunchConfiguration(ABC):
     """Base class for launch configurations for CUDA and HIP kernels.

     Args:
@@ -34,33 +33,61 @@ class GpuLaunchConfiguration:
             parameters to the associated kernel
     """

+    @property
+    @abstractmethod
+    def parameters(self) -> frozenset[Parameter]:
+        """Parameters of this launch configuration"""
+
+    @abstractmethod
+    def evaluate(self, **kwargs) -> tuple[dim3, dim3]:
+        """Compute block and grid size for a kernel launch.
+
+        Args:
+            kwargs: Valuation providing a value for each parameter listed in `parameters`
+        """
+
+    @abstractmethod
+    def jit_cache_key(self) -> Any:
+        """Return a hashable object that represents any user-configurable options of
+        this launch configuration, such that when the configuration changes, the JIT parameter
+        cache is invalidated."""
+
+
+class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
+    """Launch configuration that is dynamically computed from kernel parameters.
+
+    This launch configuration permits no further user customization.
+    """
+
     def __init__(
         self,
         block_size: _Dim3Lambda,
         grid_size: _Dim3Lambda,
-        config_parameters: set[Parameter],
     ) -> None:
         self._block_size = block_size
         self._grid_size = grid_size
-        self._params = frozenset(config_parameters)
-        self._valuation: dict[Parameter, Any] = dict()
+        self._params: frozenset[Parameter] = frozenset().union(
+            *(lb.parameters for lb in chain(block_size, grid_size))
+        )

     @property
     def parameters(self) -> frozenset[Parameter]:
-        """Parameters to this set of constraints"""
+        """Parameters of this launch configuration"""
         return self._params

-    def get_valuation(self) -> dict[Parameter, Any]:
-        """Values for all parameters that are specific to the launch grid configuration and not
-        also kernel parameters."""
-        return self._valuation
+    def evaluate(self, **kwargs) -> tuple[dim3, dim3]:
+        """Compute block and grid size for a kernel launch.

-    def get_block_size(self) -> _Dim3Lambda:
-        return self._block_size
+        Args:
+            kwargs: Valuation providing a value for each parameter listed in `parameters`
+        """
+        block_size = tuple(int(bs(**kwargs)) for bs in self._block_size)
+        grid_size = tuple(int(gs(**kwargs)) for gs in self._grid_size)
+        return cast(dim3, block_size), cast(dim3, grid_size)

-    def get_grid_size(self) -> _Dim3Lambda:
-        return self._grid_size
+    def jit_cache_key(self) -> Any:
+        return ()


 class ManualLaunchConfiguration(GpuLaunchConfiguration):
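The hunk above turns `GpuLaunchConfiguration` into a proper interface: `parameters` exposes the free launch parameters, `evaluate` maps a valuation of those parameters to a `(block_size, grid_size)` pair, and `jit_cache_key` tells the JIT when cached kernel arguments must be discarded. A purely illustrative subclass showing the contract (not part of the patch; import paths follow this patch's module layout):

```python
from typing import Any

from pystencils.codegen.gpu_indexing import GpuLaunchConfiguration, dim3
from pystencils.codegen.parameters import Parameter


class SingleBlockLaunchConfiguration(GpuLaunchConfiguration):
    """Toy example: always launch exactly one 256-thread block."""

    @property
    def parameters(self) -> frozenset[Parameter]:
        return frozenset()  # the launch grid depends on no kernel parameters

    def evaluate(self, **kwargs) -> tuple[dim3, dim3]:
        return (256, 1, 1), (1, 1, 1)

    def jit_cache_key(self) -> Any:
        return ()  # nothing user-configurable, so the key never changes
```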
@@ -71,89 +98,93 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration):

     def __init__(
         self,
-        block_size: _Dim3Lambda,
-        grid_size: _Dim3Lambda,
-        block_size_params: _Dim3Params,
-        grid_size_params: _Dim3Params,
-    ):
-        super().__init__(
-            cast(_Dim3Lambda, block_size),
-            cast(_Dim3Lambda, grid_size),
-            set(block_size_params).union(grid_size_params),
-        )
-        self._block_size_params = block_size_params
-        self._grid_size_params = grid_size_params
-
-        self._user_block_size: dim3 | None = None
-        self._user_grid_size: dim3 | None = None
+    ) -> None:
+        self._block_size: dim3 | None = None
+        self._grid_size: dim3 | None = None

     @property
     def block_size(self) -> dim3 | None:
-        return self._user_block_size
+        return self._block_size

     @block_size.setter
     def block_size(self, val: dim3):
-        self._user_block_size = val
+        self._block_size = val

     @property
     def grid_size(self) -> dim3 | None:
-        return self._user_grid_size
+        return self._grid_size

     @grid_size.setter
     def grid_size(self, val: dim3):
-        self._user_grid_size = val
-
-    def get_valuation(self) -> dict[Parameter, Any]:
-        if self._user_block_size is None:
-            raise AttributeError("No GPU block size was specified")
+        self._grid_size = val

-        if self._user_grid_size is None:
-            raise AttributeError("No GPU grid size was specified")
+    @property
+    def parameters(self) -> frozenset[Parameter]:
+        return frozenset()

-        valuation: dict[Parameter, Any] = dict()
+    def evaluate(self, **kwargs) -> tuple[dim3, dim3]:
+        if self._block_size is None:
+            raise AttributeError("No GPU block size was set by the user.")

-        for bs_param, bs in zip(self._block_size_params, self._user_block_size):
-            valuation[bs_param] = bs
+        if self._grid_size is None:
+            raise AttributeError("No GPU grid size was set by the user.")

-        for gs_param, gs in zip(self._grid_size_params, self._user_grid_size):
-            valuation[gs_param] = gs
+        return self._block_size, self._grid_size

-        return valuation
+    def jit_cache_key(self) -> Any:
+        return (self._block_size, self._grid_size)


-class GridFromBlockSizeConfiguration(GpuLaunchConfiguration):
-    """GPU launch configuration that computes the grid size from a user-defined block size."""
+class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
+    """GPU launch configuration that permits the user to set a block size dynamically."""

     def __init__(
         self,
-        block_size: _Dim3Lambda,
-        grid_size: _Dim3Lambda,
+        block_size_expr: _Dim3Lambda,
+        grid_size_expr: _Dim3Lambda,
         block_size_params: _Dim3Params,
         default_block_size: dim3 | None = None,
     ) -> None:
-        super().__init__(block_size, grid_size, set(block_size_params))
+        self._block_size_expr = block_size_expr
+        self._grid_size_expr = grid_size_expr
         self._block_size_params = block_size_params

-        self._user_block_size: dim3 | None = default_block_size
+        self._block_size: dim3 | None = default_block_size
+
+        self._params: frozenset[Parameter] = frozenset().union(
+            *(lb.parameters for lb in chain(block_size_expr, grid_size_expr))
+        ) - set(self._block_size_params)

     @property
     def block_size(self) -> dim3 | None:
-        return self._user_block_size
+        return self._block_size

     @block_size.setter
     def block_size(self, val: dim3):
-        self._user_block_size = val
+        self._block_size = val
+
+    @property
+    def parameters(self) -> frozenset[Parameter]:
+        """Parameters of this launch configuration"""
+        return self._params

-    def get_valuation(self) -> dict[Parameter, Any]:
-        if self._user_block_size is None:
-            raise AttributeError("No GPU block size was specified")
+    def evaluate(self, **kwargs) -> tuple[dim3, dim3]:
+        if self._block_size is None:
+            raise AttributeError("No GPU block size was specified by the user.")

-        valuation: dict[Parameter, Any] = dict()
+        kwargs.update(
+            {
+                param.name: value
+                for param, value in zip(self._block_size_params, self._block_size)
+            }
+        )

-        for bs_param, bs in zip(self._block_size_params, self._user_block_size):
-            valuation[bs_param] = bs
+        block_size = tuple(int(bs(**kwargs)) for bs in self._block_size_expr)
+        grid_size = tuple(int(gs(**kwargs)) for gs in self._grid_size_expr)
+        return cast(dim3, block_size), cast(dim3, grid_size)

-        return valuation
+    def jit_cache_key(self) -> Any:
+        return self._block_size


 class GpuIndexing(ABC):
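Of the two concrete configurations above, `ManualLaunchConfiguration` backs the `manual_launch_grid` option and refuses to evaluate until the user has supplied both sizes, while `DynamicBlockSizeLaunchConfiguration` recomputes the grid size from a user-adjustable block size. A short, illustrative sketch of the manual variant's behavior:

```python
from pystencils.codegen.gpu_indexing import ManualLaunchConfiguration

manual = ManualLaunchConfiguration()

try:
    manual.evaluate()  # neither size has been set yet
except AttributeError as e:
    print(e)  # "No GPU block size was set by the user."

manual.block_size = (64, 4, 1)
manual.grid_size = (16, 16, 1)
block, grid = manual.evaluate()  # ((64, 4, 1), (16, 16, 1))
```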
@@ -182,65 +213,19 @@ class GpuIndexing(ABC):
             case GpuIndexingScheme.Blockwise4D:
                 return Blockwise4DMapping()

-    def get_launch_config_factory(
-        self, scheme: GpuIndexingScheme
-    ) -> Callable[[], GpuLaunchConfiguration]:
+    def get_launch_config_factory(self) -> Callable[[], GpuLaunchConfiguration]:
         if self._manual_launch_grid:
-            return self._manual_config_factory()
-
-        raise NotImplementedError()
-
-    def _manual_config_factory(self) -> Callable[[], ManualLaunchConfiguration]:
-        ctx = self._ctx
-
-        block_size_symbols = [
-            ctx.get_new_symbol(f"gpuBlockSize_{c}", ctx.index_dtype) for c in range(3)
-        ]
-        grid_size_symbols = [
-            ctx.get_new_symbol(f"gpuGridSize_{c}", ctx.index_dtype) for c in range(3)
-        ]
-
-        block_size = tuple(
-            Lambda.from_expression(ctx, PsExpression.make(bs))
-            for bs in block_size_symbols
-        )
+            return ManualLaunchConfiguration

-        grid_size = tuple(
-            Lambda.from_expression(ctx, PsExpression.make(gs))
-            for gs in grid_size_symbols
-        )
-
-        from .driver import _symbol_to_param
-
-        bs_params = [_symbol_to_param(ctx, s) for s in block_size_symbols]
-        gs_params = [_symbol_to_param(ctx, s) for s in grid_size_symbols]
-
-        def factory():
-            return ManualLaunchConfiguration(
-                cast(_Dim3Lambda, block_size),
-                cast(_Dim3Lambda, grid_size),
-                cast(_Dim3Params, bs_params),
-                cast(_Dim3Params, gs_params),
-            )
-
-        return factory
-
-
-class Linear3DGpuIndexing(GpuIndexing):
-
-    def __init__(self, ctx: KernelCreationContext) -> None:
-        self._ctx = ctx
-
-        from ..backend.kernelcreation import AstFactory
-
-        self._factory = AstFactory(self._ctx)
-
-    def get_thread_mapping(self) -> ThreadToIndexMapping:
-        from ..backend.platforms.cuda import Linear3DMapping
-
-        return Linear3DMapping()
+        match self._scheme:
+            case GpuIndexingScheme.Linear3D:
+                return self._get_linear3d_config_factory()
+            case GpuIndexingScheme.Blockwise4D:
+                return self._get_blockwise4d_config_factory()

-    def get_launch_config(self, kernel: GpuKernel) -> GpuLaunchConfiguration:
+    def _get_linear3d_config_factory(
+        self,
+    ) -> Callable[[], DynamicBlockSizeLaunchConfiguration]:
         work_items = self._get_work_items()
         rank = len(work_items)
@@ -280,13 +265,65 @@ class Linear3DGpuIndexing(GpuIndexing):
             _symbol_to_param(self._ctx, s) for s in block_size_symbols
         )

-        return GridFromBlockSizeConfiguration(
-            cast(_Dim3Lambda, tuple(block_size)),
-            cast(_Dim3Lambda, tuple(grid_size)),
-            cast(tuple[Parameter, Parameter, Parameter], block_size_params),
-        )
+        def factory():
+            return DynamicBlockSizeLaunchConfiguration(
+                cast(_Dim3Lambda, tuple(block_size)),
+                cast(_Dim3Lambda, tuple(grid_size)),
+                cast(_Dim3Params, block_size_params),
+                self._get_default_block_size(rank),
+            )
+
+        return factory
+
+    def _get_default_block_size(self, rank: int) -> dim3:
+        if isinstance(self._block_size, _AUTO_TYPE):
+            match rank:
+                case 1:
+                    return (256, 1, 1)
+                case 2:
+                    return (128, 2, 1)
+                case 3:
+                    return (128, 2, 2)
+                case _:
+                    assert False, "unreachable code"
+        else:
+            return self._block_size
+
+    def _get_blockwise4d_config_factory(
+        self,
+    ) -> Callable[[], AutomaticLaunchConfiguration]:
+        work_items = self._get_work_items()[::-1]  # Want this ordered fastest first
+        rank = len(work_items)
+
+        if rank > 4:
+            raise ValueError(f"Iteration space rank is too large: {rank}")
+
+        block_size = (
+            Lambda.from_expression(self._ctx, work_items[0]),
+            Lambda.from_expression(self._ctx, self._factory.parse_index(1)),
+            Lambda.from_expression(self._ctx, self._factory.parse_index(1)),
+        )
+
+        grid_size = tuple(
+            Lambda.from_expression(self._ctx, wit) for wit in work_items[1:]
+        ) + tuple(
+            Lambda.from_expression(self._ctx, self._factory.parse_index(1))
+            for _ in range(4 - rank)
+        )
+
+        def factory():
+            return AutomaticLaunchConfiguration(
+                block_size,
+                cast(_Dim3Lambda, grid_size),
+            )
+
+        return factory
+
     def _get_work_items(self) -> tuple[PsExpression, ...]:
+        """Return a tuple of expressions representing the number of work items
+        in each dimension of the kernel's iteration space,
+        ordered from slowest to fastest dimension.
+        """
         ispace = self._ctx.get_iteration_space()
         match ispace:
             case FullIterationSpace():
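`GpuIndexing` now dispatches on the configured scheme: `Linear3D` yields a `DynamicBlockSizeLaunchConfiguration` seeded with a rank-dependent default block size ((256, 1, 1), (128, 2, 1) or (128, 2, 2)), while `Blockwise4D` maps the fastest coordinate to threads and the remaining ones to blocks, producing a fully automatic configuration that ignores `gpu.block_size`. Selecting the scheme might look as follows (sketch only; the import path for `GpuIndexingScheme` is assumed from this patch's module layout):

```python
from pystencils import CreateKernelConfig, Target
from pystencils.codegen.config import GpuIndexingScheme  # assumed import path

cfg = CreateKernelConfig(target=Target.CUDA)
cfg.gpu.indexing_scheme = GpuIndexingScheme.Blockwise4D  # launch grid derived automatically
```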
diff --git a/src/pystencils/codegen/kernel.py b/src/pystencils/codegen/kernel.py
index 67ef6554cb8cfb00c19ad3bd9b7609076d47a8e8..181e6ad3b5d1cb1f1835d8b4e656c39f65a1316b 100644
--- a/src/pystencils/codegen/kernel.py
+++ b/src/pystencils/codegen/kernel.py
@@ -6,6 +6,8 @@ from itertools import chain
 from .target import Target
 from .parameters import Parameter

+from .gpu_indexing import GpuLaunchConfiguration
+
 from ..backend.ast.structural import PsBlock
 from ..field import Field

@@ -13,7 +15,6 @@ from .._deprecation import _deprecated

 if TYPE_CHECKING:
     from ..jit import JitBase
-    from .gpu_indexing import GpuLaunchConfiguration


 class Kernel:
@@ -123,11 +124,11 @@ class GpuKernel(Kernel):
         parameters: Sequence[Parameter],
         required_headers: set[str],
         jit: JitBase,
-        launch_config_factory: Callable[[GpuKernel], GpuLaunchConfiguration],
+        launch_config_factory: Callable[[], GpuLaunchConfiguration],
     ):
         super().__init__(body, target, name, parameters, required_headers, jit)
         self._launch_config_factory = launch_config_factory

     def get_launch_configuration(self) -> GpuLaunchConfiguration:
-        """Object exposing the total size of the launch grid this kernel expects to be executed with."""
-        return self._launch_config_factory(self)
+        """Create a launch configuration object for this kernel."""
+        return self._launch_config_factory()
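Since the factory is now nullary, a `GpuKernel` can hand out independent launch configurations without inspecting itself. An illustrative consequence (assuming `kernel` is a `GpuKernel` produced by the driver above, and that the factory constructs a new object per call, as all three factories in this patch do):

```python
lc_a = kernel.get_launch_configuration()
lc_b = kernel.get_launch_configuration()
assert lc_a is not lc_b  # each call constructs a fresh configuration object
```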
diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py
index d4f1c020437b4ffeacc462f2ded1277933dee75d..42d9a685feef80611a7441d2d1c453a039e2b52d 100644
--- a/src/pystencils/jit/gpu_cupy.py
+++ b/src/pystencils/jit/gpu_cupy.py
@@ -45,7 +45,7 @@ class CupyKernelWrapper(KernelWrapper):
     @property
     def kernel_function(self) -> GpuKernel:
         return self._kfunc
-
+
     @property
     def launch_config(self) -> GpuLaunchConfiguration:
         return self._launch_config
@@ -67,10 +67,9 @@ class CupyKernelWrapper(KernelWrapper):
         return devices.pop()

     def _get_cached_args(self, **kwargs):
-        launch_config_params = self._launch_config.get_valuation
-        key = tuple(
-            (k, v) for k, v in launch_config_params.items()
-        ) + tuple((k, id(v)) for k, v in kwargs.items())
+        key = (self._launch_config.jit_cache_key(),) + tuple(
+            (k, id(v)) for k, v in kwargs.items()
+        )

         if key not in self._args_cache:
             args = self._get_args(**kwargs)
@@ -80,7 +79,7 @@ class CupyKernelWrapper(KernelWrapper):
         return self._args_cache[key]

     def _get_args(self, **kwargs) -> tuple[tuple, LaunchGrid]:
-        args = []
+        kernel_args = []
         valuation: dict[str, Any] = dict()

         def add_arg(name: str, arg: Any, dtype: PsType):
@@ -88,7 +87,7 @@ class CupyKernelWrapper(KernelWrapper):
             assert nptype is not None
             typecast = nptype.type
             arg = typecast(arg)
-            args.append(arg)
+            kernel_args.append(arg)
             valuation[name] = arg

         field_shapes = set()
@@ -168,7 +167,7 @@ class CupyKernelWrapper(KernelWrapper):
                                 f"Expected {field.dtype}, got {arr.dtype}"
                             )
                         check_shape(kparam, arr)
-                        args.append(arr)
+                        kernel_args.append(arr)
                         break

                 case FieldShape(field, coord):
@@ -191,31 +190,21 @@ class CupyKernelWrapper(KernelWrapper):

         # Determine launch grid

-        launch_cfg_valuation = valuation.copy()
-        launch_cfg_valuation.update(
-            {
-                param.name: value
-                for param, value in self._launch_config.get_valuation.items()
-            }
-        )
+        def add_launch_config_arg(name: str, arg: Any, dtype: PsType):
+            nptype = dtype.numpy_dtype
+            assert nptype is not None
+            typecast = nptype.type
+            arg = typecast(arg)
+            valuation[name] = arg

-        block_size = cast(
-            tuple[int, int, int],
-            tuple(
-                int(component(**launch_cfg_valuation))
-                for component in self._launch_config.get_block_size()
-            ),
-        )
+        for cparam in self._launch_config.parameters:
+            if cparam.name not in valuation:
+                val = kwargs[cparam.name]
+                add_launch_config_arg(cparam.name, val, cparam.dtype)

-        grid_size = cast(
-            tuple[int, int, int],
-            tuple(
-                int(component(**launch_cfg_valuation))
-                for component in self._launch_config.get_grid_size()
-            ),
-        )
+        block_size, grid_size = self._launch_config.evaluate(**valuation)

-        return tuple(args), LaunchGrid(grid_size, block_size)
+        return tuple(kernel_args), LaunchGrid(grid_size, block_size)


 class CupyJit(JitBase):
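End to end, the cupy wrapper now obtains block and grid size exclusively through `evaluate`, and keys its argument cache on `jit_cache_key`, so changing the block size invalidates cached arguments. A hedged end-to-end sketch; `asms`, `f_arr` and `g_arr` are placeholders, and `compile()` is assumed to return the `CupyKernelWrapper` shown above:

```python
from pystencils import CreateKernelConfig, Target, create_kernel

cfg = CreateKernelConfig(target=Target.CUDA)
kernel = create_kernel(asms, cfg).compile()  # assumed to yield a CupyKernelWrapper

kernel.launch_config.block_size = (128, 2, 2)
kernel(f=f_arr, g=g_arr)  # arguments cached under key ((128, 2, 2), ...)

kernel.launch_config.block_size = (64, 4, 2)
kernel(f=f_arr, g=g_arr)  # jit_cache_key changed, so arguments are re-assembled
```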