diff --git a/docs/source/backend/gpu_codegen.md b/docs/source/backend/gpu_codegen.md index 1082669e640c989cad0b994ac70818f22a3517bb..e5035d7c7b4f2f21699f04b3cc0a7245a7ba2ff4 100644 --- a/docs/source/backend/gpu_codegen.md +++ b/docs/source/backend/gpu_codegen.md @@ -72,6 +72,13 @@ These depend on the type of the launch configuration: while the `AutomaticLaunchConfiguration` permits no modification and computes grid and block size directly from kernel parameters, the `ManualLaunchConfiguration` requires the user to manually specifiy both grid and block size. +The `DynamicBlockSizeLaunchConfiguration` dynamically computes the grid size from either the default block size +or a computed block size. Computing block sizes can be signaled by the user via the `trim_block_size` or +`fit_block_size` member functions. These function receive an initial block size as an argument and adapt it. +The `trim_block_size` function trims the initial block size with the sizes of the iteration space, i.e. it takes +the minimum value of both sizes per dimension. The `fit_block_size` performs a block fitting algorithm that adapts +the initial block size by incrementally enlarging the trimmed block size until it is large enough +and aligns with the warp size. The `evaluate` method can only be used from within a Python runtime environment. When exporting pystencils CUDA kernels for external use in C++ projects, diff --git a/docs/source/user_manual/gpu_kernels.md b/docs/source/user_manual/gpu_kernels.md index 610c61ddf647331d7b77b06968e489b4dcc76293..48a24e703584e8c87365550c7a7d5d8338feb717 100644 --- a/docs/source/user_manual/gpu_kernels.md +++ b/docs/source/user_manual/gpu_kernels.md @@ -111,24 +111,49 @@ For most kernels with an at most three-dimensional iteration space, this behavior is sufficient and desired. It can be enforced by setting `gpu.indexing_scheme = "Linear3D"`. -If the `Linear3D` indexing scheme is used, you may modifiy the GPU thread block size in two places. -The default block size for the kernel can be set via the {any}`gpu.block_size <GpuOptions.block_size>` -code generator option; -if none is specified, a default depending on the iteration space's dimensionality will be used. - -The block size can furthermore be modified at the compiled kernel's wrapper object via the -`launch_config.block_size` attribute: +The GPU thread block size of a compiled kernel's wrapper object can only be directly modified +for manual launch configurations, cf. the section for [manual launch configurations](#manual_launch_grids). +Linear indexing schemes without manual launch configurations either employ default block sizes +or compute block sizes using user-exposed member functions that adapt the initial block size that was +passed as an argument. The available functions are :meth:`fit_block_size` and :meth:`trim_block_size`. +The :meth:`trim_block_size` function trims the user-defined initial block size with the iteration space. +The :meth:`fit_block_size` employs a fitting algorithm that finds a suitable block configuration that is +divisible by the hardware's warp size. ```{code-cell} ipython3 +:tags: [raises-exception] kfunc = kernel.compile() -kfunc.launch_config.block_size = (256, 2, 1) -# Run the kernel +# three different configuration cases for block size: + +# a) Nothing +# -> use default block size, perform no fitting, no trimming + +# b) Activate fitting with initial block size +kfunc.launch_config.fit_block_size((8, 8, 4)) + +# c) Activate trimming with initial block size +kfunc.launch_config.trim_block_size((8, 8, 4)) + +# finally run the kernel... kfunc(f=f_arr, g=g_arr) ``` +Block sizes aligning with multiples of the hardware's warp size allow for a better usage of the GPUs resources. +Even though such block size configurations are not enforced, notifying our code generator via the +`GpuOptions.assume_warp_aligned_block_size` option that the configured block size is divisible by the warp size allows +for further optimization potential, e.g. for warp-level reductions. +When setting this option to `True`, the user has to make sure that this alignment applies. +For [manual launch configurations](#manual_launch_grids), this can be achieved by manually providing suitable +block sizes via the `launch_config.block_size`. For the other launch configurations, this criterion is guaranteed +by using the default block sizes in pystencils. Using :meth:`fit_block_size` also guarantees this while also producing +block sizes that are better customized towards the kernel's iteration space. For :meth:`trim_block_size`, +the trimmed block's dimension that is closest the next multiple of the warp size is rounded up to the next multiple +of the warp size. For all cases, the final block size is checked against the imposed hardware limits and an error +is thrown in case these limits are exceeded. + In any case. pystencils will automatically compute the grid size from the shapes of the kernel's array arguments -and the given thread block size. +and the final thread block size. :::{attention} diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index d630594ba8a633e04e2b058e74f77d604f6d95a9..c97fad413198c16b87d6c27949e70d6aa59a3aa1 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -117,7 +117,7 @@ class Blockwise4DMapping(ThreadMapping): THREAD_IDX[0], BLOCK_IDX[0], BLOCK_IDX[1], - BLOCK_IDX[2] + BLOCK_IDX[2], ] def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]: @@ -163,23 +163,19 @@ class Blockwise4DMapping(ThreadMapping): class CudaPlatform(GenericGpu): """Platform for CUDA-based GPUs. - + Args: ctx: The kernel creation context - omit_range_check: If `True`, generated index translation code will not check if the point identified - by block and thread indices is actually contained in the iteration space thread_mapping: Callback object which defines the mapping of thread indices onto iteration space points """ def __init__( self, ctx: KernelCreationContext, - omit_range_check: bool = False, thread_mapping: ThreadMapping | None = None, ) -> None: super().__init__(ctx) - self._omit_range_check = omit_range_check self._thread_mapping = ( thread_mapping if thread_mapping is not None else Linear3DMapping() ) @@ -282,19 +278,12 @@ class CudaPlatform(GenericGpu): indexing_decls.append( self._typify(PsDeclaration(ctr_expr, ctr_mapping[dim.counter])) ) - if not self._omit_range_check: - conds.append(PsLt(ctr_expr, dim.stop)) - - if conds: - condition: PsExpression = conds[0] - for cond in conds[1:]: - condition = PsAnd(condition, cond) - ast = PsBlock(indexing_decls + [PsConditional(condition, body)]) - else: - body.statements = indexing_decls + body.statements - ast = body + conds.append(PsLt(ctr_expr, dim.stop)) - return ast + condition: PsExpression = conds[0] + for cond in conds[1:]: + condition = PsAnd(condition, cond) + return PsBlock(indexing_decls + [PsConditional(condition, body)]) def _prepend_sparse_translation( self, body: PsBlock, ispace: SparseIterationSpace @@ -324,12 +313,6 @@ class CudaPlatform(GenericGpu): ] body.statements = mappings + body.statements - if not self._omit_range_check: - stop = PsExpression.make(ispace.index_list.shape[0]) - condition = PsLt(sparse_ctr_expr.clone(), stop) - ast = PsBlock([sparse_idx_decl, PsConditional(condition, body)]) - else: - body.statements = [sparse_idx_decl] + body.statements - ast = body - - return ast + stop = PsExpression.make(ispace.index_list.shape[0]) + condition = PsLt(sparse_ctr_expr.clone(), stop) + return PsBlock([sparse_idx_decl, PsConditional(condition, body)]) diff --git a/src/pystencils/backend/platforms/sycl.py b/src/pystencils/backend/platforms/sycl.py index f3c4bb3d5559fa5b418b3f3ca51b3cf43aa99b2f..d16c4f51b0609e50b1047c8f2fbd3824b8630b18 100644 --- a/src/pystencils/backend/platforms/sycl.py +++ b/src/pystencils/backend/platforms/sycl.py @@ -35,12 +35,10 @@ class SyclPlatform(Platform): def __init__( self, ctx: KernelCreationContext, - omit_range_check: bool = False, automatic_block_size: bool = False, ): super().__init__(ctx) - self._omit_range_check = omit_range_check self._automatic_block_size = automatic_block_size @property @@ -136,8 +134,7 @@ class SyclPlatform(Platform): indexing_decls.append( PsDeclaration(ctr, dim.start + work_item_idx * dim.step) ) - if not self._omit_range_check: - conds.append(PsLt(ctr, dim.stop)) + conds.append(PsLt(ctr, dim.stop)) if conds: condition: PsExpression = conds[0] @@ -182,15 +179,9 @@ class SyclPlatform(Platform): ] body.statements = mappings + body.statements - if not self._omit_range_check: - stop = PsExpression.make(ispace.index_list.shape[0]) - condition = PsLt(sparse_ctr, stop) - ast = PsBlock([sparse_idx_decl, PsConditional(condition, body)]) - else: - body.statements = [sparse_idx_decl] + body.statements - ast = body - - return ast + stop = PsExpression.make(ispace.index_list.shape[0]) + condition = PsLt(sparse_ctr, stop) + return PsBlock([sparse_idx_decl, PsConditional(condition, body)]) def _item_type(self, rank: int): if not self._automatic_block_size: diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py index 0d43b40e35b9e90b1f13e2ba7bc279274f466859..d6f8e403cdce0c99853ee1127567e11c14b92b32 100644 --- a/src/pystencils/codegen/config.py +++ b/src/pystencils/codegen/config.py @@ -372,30 +372,38 @@ class GpuOptions(ConfigBase): indexing_scheme: Option[GpuIndexingScheme, str] = Option(GpuIndexingScheme.Linear3D) """Thread indexing scheme for dense GPU kernels.""" - omit_range_check: BasicOption[bool] = BasicOption(False) - """If set to `True`, omit the iteration counter range check. + manual_launch_grid: BasicOption[bool] = BasicOption(False) + """Always require a manually specified launch grid when running this kernel. - By default, the code generator introduces a check if the iteration counters computed from GPU block and thread - indices are within the prescribed loop range. - This check can be discarded through this option, at your own peril. + If set to `True`, the code generator will not attempt to infer the size of + the launch grid from the kernel. + The launch grid will then have to be specified manually at runtime. """ - block_size: BasicOption[tuple[int, int, int] | _AUTO_TYPE] = BasicOption(AUTO) - """Desired block size for the execution of GPU kernels. + warp_size: BasicOption[int] = BasicOption() + """Specifies the size of a warp (CUDA) or wavefront (HIP). - This option only takes effect if `Linear3D <GpuIndexingScheme.Linear3D>` - is chosen as an indexing scheme. - The block size may be overridden at runtime. + If this option is not set the default value for the given target will be automatically used. """ - manual_launch_grid: BasicOption[bool] = BasicOption(False) - """Always require a manually specified launch grid when running this kernel. + assume_warp_aligned_block_size: BasicOption[bool] = BasicOption(False) + """Specifies whether block sizes are divisible by the hardware's warp size. - If set to `True`, the code generator will not attempt to infer the size of - the launch grid from the kernel. - The launch grid will then have to be specified manually at runtime. + If set to `True`, the code generator can employ optimizations that require this assumption, + e.g. warp-level reductions. + The pystencils Cupy runtime also checks if user-provided block sizes fulfill this criterion. """ + @staticmethod + def default_warp_size(target: Target): + match target: + case Target.CUDA: + return 32 + case _: + raise NotImplementedError( + f"No default warp/wavefront size known for target {target}" + ) + @indexing_scheme.validate def _validate_idx_scheme(self, val: str | GpuIndexingScheme): if isinstance(val, GpuIndexingScheme): @@ -601,7 +609,7 @@ class CreateKernelConfig(ConfigBase): elif self.get_target() == Target.CUDA: try: from ..jit.gpu_cupy import CupyJit - + return CupyJit() except ImportError: @@ -734,9 +742,7 @@ class CreateKernelConfig(ConfigBase): UserWarning, ) - self.gpu = GpuOptions( - block_size=gpu_indexing_params.get("block_size", None) - ) + self.gpu = GpuOptions() def _deprecated_option(name, instead): # pragma: no cover diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index b8f9c71015765e638ddca22278dfa15c0e5bcaa1..65352f5205125b05acbd088bc42e9205cd7eb415 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -11,6 +11,7 @@ from .config import ( GhostLayerSpec, IterationSliceSpec, GpuIndexingScheme, + GpuOptions, ) from .kernel import Kernel, GpuKernel from .properties import PsSymbolProperty, FieldBasePtr @@ -401,13 +402,22 @@ class DefaultKernelCreationDriver: if self._target != Target.CUDA: return None - from .gpu_indexing import dim3 - idx_scheme: GpuIndexingScheme = self._cfg.gpu.get_option("indexing_scheme") - block_size: dim3 | _AUTO_TYPE = self._cfg.gpu.get_option("block_size") manual_launch_grid: bool = self._cfg.gpu.get_option("manual_launch_grid") + assume_warp_aligned_block_size: bool = self._cfg.gpu.get_option("assume_warp_aligned_block_size") + warp_size: int | None = self._cfg.gpu.get_option("warp_size") - return GpuIndexing(self._ctx, idx_scheme, block_size, manual_launch_grid) + if warp_size is None: + warp_size = GpuOptions.default_warp_size(self._target) + + return GpuIndexing( + self._ctx, + self._target, + idx_scheme, + warp_size, + manual_launch_grid, + assume_warp_aligned_block_size, + ) def _get_platform(self) -> Platform: if Target._CPU in self._target: @@ -437,9 +447,6 @@ class DefaultKernelCreationDriver: ) elif self._target.is_gpu(): - gpu_opts = self._cfg.gpu - omit_range_check: bool = gpu_opts.get_option("omit_range_check") - match self._target: case Target.CUDA: from ..backend.platforms import CudaPlatform @@ -452,18 +459,15 @@ class DefaultKernelCreationDriver: return CudaPlatform( self._ctx, - omit_range_check=omit_range_check, thread_mapping=thread_mapping, ) elif self._target == Target.SYCL: from ..backend.platforms import SyclPlatform auto_block_size: bool = self._cfg.sycl.get_option("automatic_block_size") - omit_range_check = self._cfg.gpu.get_option("omit_range_check") return SyclPlatform( self._ctx, - omit_range_check=omit_range_check, automatic_block_size=auto_block_size, ) diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index 27d6fc817d5a9193c3faa4b170d907987fe6022e..d473e5b4a7408d2be77f0c4f2bc3b55a5ca9df0a 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -1,13 +1,16 @@ from __future__ import annotations from abc import ABC, abstractmethod +from dataclasses import dataclass from typing import cast, Any, Callable from itertools import chain +from warnings import warn from .functions import Lambda from .parameters import Parameter from .errors import CodegenError -from .config import GpuIndexingScheme, _AUTO_TYPE +from .config import GpuIndexingScheme +from .target import Target from ..backend.kernelcreation import ( KernelCreationContext, @@ -16,13 +19,36 @@ from ..backend.kernelcreation import ( ) from ..backend.platforms.cuda import ThreadMapping -from ..backend.ast.expressions import PsExpression +from ..backend.ast.expressions import PsExpression, PsIntDiv +from math import prod +from ..utils import ceil_to_multiple dim3 = tuple[int, int, int] _Dim3Lambda = tuple[Lambda, Lambda, Lambda] +@dataclass +class HardwareProperties: + warp_size: int + max_threads_per_block: int + max_block_sizes: dim3 + + def block_size_exceeds_hw_limits( + self, + block_size: tuple[int, ...] + ) -> bool: + """Checks if provided block size conforms limits given by the hardware.""" + + return ( + any( + size > max_size + for size, max_size in zip(block_size, self.max_block_sizes) + ) + or prod(block_size) > self.max_threads_per_block + ) + + class GpuLaunchConfiguration(ABC): """Base class for launch configurations for CUDA and HIP kernels. @@ -33,6 +59,18 @@ class GpuLaunchConfiguration(ABC): parameters to the associated kernel """ + @property + @abstractmethod + def block_size(self) -> dim3 | None: + """Returns desired block size if available.""" + pass + + @block_size.setter + @abstractmethod + def block_size(self, val: dim3): + """Sets desired block size if possible.""" + pass + @property @abstractmethod def parameters(self) -> frozenset[Parameter]: @@ -52,6 +90,25 @@ class GpuLaunchConfiguration(ABC): this launch configuration, such that when the configuration changes, the JIT parameter cache is invalidated.""" + @staticmethod + def get_default_block_size(rank: int) -> dim3: + """Returns the default block size configuration used by the generator.""" + + match rank: + case 1: + return (256, 1, 1) + case 2: + return (16, 16, 1) + case 3: + return (8, 8, 4) + case _: + assert False, "unreachable code" + + @staticmethod + def _excessive_block_size_error_msg(block_size: tuple[int, ...]): + return f"Unable to determine GPU block size for this kernel. \ + Final block size was too large: {block_size}." + class AutomaticLaunchConfiguration(GpuLaunchConfiguration): """Launch configuration that is dynamically computed from kernel parameters. @@ -63,20 +120,37 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration): self, block_size: _Dim3Lambda, grid_size: _Dim3Lambda, + hw_props: HardwareProperties, + assume_warp_aligned_block_size: bool, ) -> None: self._block_size = block_size self._grid_size = grid_size + self._hw_props = hw_props + self._assume_warp_aligned_block_size = assume_warp_aligned_block_size self._params: frozenset[Parameter] = frozenset().union( *(lb.parameters for lb in chain(block_size, grid_size)) ) + @property + def block_size(self) -> dim3 | None: + """Block size is only available when `evaluate` is called.""" + return None + + @block_size.setter + def block_size(self, val: dim3): + AttributeError("Setting `block_size` on an automatic launch configuration has no effect.") + @property def parameters(self) -> frozenset[Parameter]: return self._params def evaluate(self, **kwargs) -> tuple[dim3, dim3]: block_size = tuple(int(bs(**kwargs)) for bs in self._block_size) + + if self._hw_props.block_size_exceeds_hw_limits(block_size): + raise CodegenError(f"Block size {block_size} exceeds hardware limits.") + grid_size = tuple(int(gs(**kwargs)) for gs in self._grid_size) return cast(dim3, block_size), cast(dim3, grid_size) @@ -91,8 +165,12 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration): """ def __init__( - self, + self, hw_props: HardwareProperties, assume_warp_aligned_block_size: bool = False ) -> None: + self._assume_warp_aligned_block_size = assume_warp_aligned_block_size + + self._hw_props = hw_props + self._block_size: dim3 | None = None self._grid_size: dim3 | None = None @@ -123,6 +201,18 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration): if self._grid_size is None: raise AttributeError("No GPU grid size was set by the user.") + if ( + self._assume_warp_aligned_block_size + and prod(self._block_size) % self._hw_props.warp_size != 0 + ): + raise CodegenError( + "Specified block sizes must align with warp size with " + "`assume_warp_aligned_block_size` enabled." + ) + + if self._hw_props.block_size_exceeds_hw_limits(self._block_size): + raise CodegenError(self._excessive_block_size_error_msg(self._block_size)) + return self._block_size, self._grid_size def jit_cache_key(self) -> Any: @@ -130,24 +220,60 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration): class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): - """GPU launch configuration that permits the user to set a block size and dynamically computes the grid size. + """GPU launch configuration that dynamically computes the grid size from either the default block size + or a computed block size. Computing block sizes can be triggerred via the :meth:`trim_block_size` or + :meth:`fit_block_size` member functions. These functions adapt a user-defined initial block size that they + receive as an argument. The adaptation of the initial block sizes is described in the following: - The actual launch grid size is computed from the user-defined ``user_block_size`` and the number of work items - in the kernel's iteration space as follows. For each dimension :math:`c \\in \\{ x, y, z \\}`, - - if ``user_block_size.c > num_work_items.c``, ``block_size = num_work_items.c`` and ``grid_size.c = 1``; - - otherwise, ``block_size.c = user_block_size.c`` and ``grid_size.c = ceil(num_work_items.c / block_size.c)``. + - if :meth:`fit_block_size` was chosen: + + the initial block size is adapted such that it aligns with multiples of the hardware's warp size. + This is done using a fitting algorithm first trims the initial block size with the iteration space + and increases it incrementally until it is large enough and coincides with multiples of the warp size, i.e. + + ``block_size.c = _fit_block_size_to_it_space(iter_space.c, init_block_size.c, hardware_properties)`` + + The fitted block size also guarantees the user usage of `GpuOptions.assume_warp_aligned_block_size`. + + - elif :meth:`trim_block_size` was chosen: + + a trimming between the number of work items and the kernel's iteration space occurs, i.e. + + - if ``init_block_size.c > num_work_items.c``, ``block_size = num_work_items.c`` + - otherwise, ``block_size.c = init_block_size.c`` + + When `GpuOptions.assume_warp_aligned_block_size` is set, we ensure warp-alignment by + rounding the block size dimension that is closest the next multiple of the warp size. + + - otherwise: the default block size is taken i.e. + + ``block_size.c = get_default_block_size(rank=3).c`` + + The actual launch grid size is then computed as follows. + + ``grid_size.c = ceil(num_work_items.c / block_size.c)``. """ def __init__( self, num_work_items: _Dim3Lambda, - default_block_size: dim3 | None = None, + hw_props: HardwareProperties, + assume_warp_aligned_block_size: bool, ) -> None: self._num_work_items = num_work_items - self._block_size: dim3 | None = default_block_size + self._hw_props = hw_props + + self._assume_warp_aligned_block_size = assume_warp_aligned_block_size + + default_bs = GpuLaunchConfiguration.get_default_block_size(len(num_work_items)) + self._default_block_size = default_bs + self._init_block_size: dim3 = default_bs + self._compute_block_size: ( + Callable[[dim3, dim3, HardwareProperties], tuple[int, ...]] | None + ) = None self._params: frozenset[Parameter] = frozenset().union( *(wit.parameters for wit in num_work_items) @@ -159,44 +285,188 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): dimension from kernel parameters.""" return self._num_work_items + @property + def parameters(self) -> frozenset[Parameter]: + """Parameters of this launch configuration""" + return self._params + @property def block_size(self) -> dim3 | None: - """The desired GPU block size.""" - return self._block_size + """Block size is only available when `evaluate` is called.""" + return None @block_size.setter def block_size(self, val: dim3): - self._block_size = val + AttributeError("Setting `block_size` on an dynamic launch configuration has no effect.") + + @staticmethod + def _round_block_sizes_to_warp_size( + to_round: tuple[int, ...], warp_size: int + ) -> tuple[int, ...]: + # check if already aligns with warp size + if prod(to_round) % warp_size == 0: + return tuple(to_round) + + # find index of element closest to warp size and round up + index_to_round = to_round.index(max(to_round, key=lambda i: abs(i % warp_size))) + if index_to_round + 1 < len(to_round): + return ( + *to_round[:index_to_round], + ceil_to_multiple(to_round[index_to_round], warp_size), + *to_round[index_to_round + 1:], + ) + else: + return ( + *to_round[:index_to_round], + ceil_to_multiple(to_round[index_to_round], warp_size), + ) - @property - def parameters(self) -> frozenset[Parameter]: - """Parameters of this launch configuration""" - return self._params + def trim_block_size(self, block_size: dim3): + def call_trimming_factory( + it: dim3, + bs: dim3, + hw: HardwareProperties, + ): + return self._trim_block_size_to_it_space(it, bs, hw) - def evaluate(self, **kwargs) -> tuple[dim3, dim3]: - if self._block_size is None: - raise AttributeError("No GPU block size was specified by the user!") + self._init_block_size = block_size + self._compute_block_size = call_trimming_factory + + def _trim_block_size_to_it_space( + self, + it_space: dim3, + block_size: dim3, + hw_props: HardwareProperties, + ) -> tuple[int, ...]: + """Returns specified block sizes trimmed with iteration space. + Raises CodegenError if trimmed block size does not conform hardware limits. + """ + + ret = tuple([min(b, i) for b, i in zip(block_size, it_space)]) + if hw_props.block_size_exceeds_hw_limits(ret): + raise CodegenError(self._excessive_block_size_error_msg(ret)) + + if ( + self._assume_warp_aligned_block_size + and prod(ret) % self._hw_props.warp_size != 0 + ): + self._round_block_sizes_to_warp_size(ret, hw_props.warp_size) + + return ret + + def fit_block_size(self, block_size: dim3): + def call_fitting_factory( + it: dim3, + bs: dim3, + hw: HardwareProperties, + ): + return self._fit_block_size_to_it_space(it, bs, hw) + + self._init_block_size = block_size + self._compute_block_size = call_fitting_factory + + def _fit_block_size_to_it_space( + self, + it_space: dim3, + block_size: dim3, + hw_props: HardwareProperties, + ) -> tuple[int, ...]: + """Returns an optimized block size configuration with block sizes being aligned with the warp size. + Raises CodegenError if optimal block size could not be found or does not conform hardware limits. + """ + def trim(to_trim: list[int]) -> list[int]: + return [min(b, i) for b, i in zip(to_trim, it_space)] + + def check_sizes_and_return(ret: tuple[int, ...]) -> tuple[int, ...]: + if hw_props.block_size_exceeds_hw_limits(ret): + raise CodegenError(self._excessive_block_size_error_msg(ret)) + return ret + + trimmed = trim(list(block_size)) + if ( + prod(trimmed) >= hw_props.warp_size + and prod(trimmed) % hw_props.warp_size == 0 + ): + # case 1: greater than min block size -> use trimmed result + return check_sizes_and_return(tuple(trimmed)) + + prev_trim_size = 0 + resize_order = [0, 2, 1] if len(it_space) == 3 else range(len(it_space)) + while prod(trimmed) is not prev_trim_size: + prev_trim_size = prod(trimmed) + + # case 2: trimmed block is equivalent to the whole iteration space + if all(b == i for b, i in zip(trimmed, it_space)): + return check_sizes_and_return( + self._round_block_sizes_to_warp_size( + tuple(trimmed), hw_props.warp_size + ) + ) + else: + # double block size in each dimension until block is large enough (or case 2 triggers) + for d in resize_order: + trimmed[d] *= 2 + + # trim fastest moving dim to multiples of warp size + if ( + d == 0 + and trimmed[d] > hw_props.warp_size + and trimmed[d] % hw_props.warp_size != 0 + ): + # subtract remainder + trimmed[d] = trimmed[d] - (trimmed[d] % hw_props.warp_size) + + # check if block sizes are within hardware capabilities + trimmed[d] = min(trimmed[d], hw_props.max_block_sizes[d]) + + # trim again + trimmed = trim(trimmed) + + # case 3: trim block is large enough + if prod(trimmed) >= hw_props.warp_size: + return check_sizes_and_return( + self._round_block_sizes_to_warp_size( + tuple(trimmed), hw_props.warp_size + ) + ) + + raise CodegenError("Unable to determine GPU block size for this kernel.") + + def evaluate(self, **kwargs) -> tuple[dim3, dim3]: from ..utils import div_ceil num_work_items = cast( dim3, tuple(int(wit(**kwargs)) for wit in self._num_work_items) ) - reduced_block_size = cast( - dim3, - tuple(min(wit, bs) for wit, bs in zip(num_work_items, self._block_size)), - ) + + block_size: dim3 + if self._compute_block_size: + try: + computed_bs = self._compute_block_size( + num_work_items, self._init_block_size, self._hw_props + ) + + block_size = cast(dim3, computed_bs) + except CodegenError as e: + block_size = self._default_block_size + warn( + f"CodeGenError occurred: {getattr(e, 'message', repr(e))}. " + f"Block size fitting could not determine optimal block size configuration. " + f"Defaulting back to {self._default_block_size}." + ) + else: + block_size = self._default_block_size + grid_size = cast( dim3, - tuple( - div_ceil(wit, bs) for wit, bs in zip(num_work_items, reduced_block_size) - ), + tuple(div_ceil(wit, bs) for wit, bs in zip(num_work_items, block_size)), ) - return reduced_block_size, grid_size + return block_size, grid_size def jit_cache_key(self) -> Any: - return self._block_size + return () class GpuIndexing: @@ -218,14 +488,24 @@ class GpuIndexing: def __init__( self, ctx: KernelCreationContext, + target: Target, scheme: GpuIndexingScheme, - default_block_size: dim3 | _AUTO_TYPE | None = None, + warp_size: int, manual_launch_grid: bool = False, + assume_warp_aligned_block_size: bool = False, ) -> None: self._ctx = ctx + self._target = target self._scheme = scheme - self._default_block_size = default_block_size + self._warp_size = warp_size self._manual_launch_grid = manual_launch_grid + self._assume_warp_aligned_block_size = assume_warp_aligned_block_size + + self._hw_props = HardwareProperties( + warp_size, + self.get_max_threads_per_block(target), + self.get_max_block_sizes(target), + ) from ..backend.kernelcreation import AstFactory from .driver import KernelFactory @@ -233,6 +513,26 @@ class GpuIndexing: self._ast_factory = AstFactory(self._ctx) self._kernel_factory = KernelFactory(self._ctx) + @staticmethod + def get_max_block_sizes(target: Target): + match target: + case Target.CUDA: + return (1024, 1024, 64) + case _: + raise CodegenError( + f"Cannot determine max GPU block sizes for target {target}" + ) + + @staticmethod + def get_max_threads_per_block(target: Target): + match target: + case Target.CUDA: + return 1024 + case _: + raise CodegenError( + f"Cannot determine max GPU threads per block for target {target}" + ) + def get_thread_mapping(self) -> ThreadMapping: """Retrieve a thread mapping object for use by the backend""" @@ -247,7 +547,13 @@ class GpuIndexing: def get_launch_config_factory(self) -> Callable[[], GpuLaunchConfiguration]: """Retrieve a factory for the launch configuration for later consumption by the runtime system""" if self._manual_launch_grid: - return ManualLaunchConfiguration + + def factory(): + return ManualLaunchConfiguration( + self._hw_props, self._assume_warp_aligned_block_size + ) + + return factory match self._scheme: case GpuIndexingScheme.Linear3D: @@ -268,10 +574,9 @@ class GpuIndexing: ) work_items_expr += tuple( - self._ast_factory.parse_index(1) - for _ in range(3 - rank) + self._ast_factory.parse_index(1) for _ in range(3 - rank) ) - + num_work_items = cast( _Dim3Lambda, tuple(self._kernel_factory.create_lambda(wit) for wit in work_items_expr), @@ -280,28 +585,12 @@ class GpuIndexing: def factory(): return DynamicBlockSizeLaunchConfiguration( num_work_items, - self._get_default_block_size(rank), + self._hw_props, + self._assume_warp_aligned_block_size, ) return factory - def _get_default_block_size(self, rank: int) -> dim3: - if self._default_block_size is None: - raise CodegenError("The default block size option was not set") - - if isinstance(self._default_block_size, _AUTO_TYPE): - match rank: - case 1: - return (256, 1, 1) - case 2: - return (128, 2, 1) - case 3: - return (128, 2, 2) - case _: - assert False, "unreachable code" - else: - return self._default_block_size - def _get_blockwise4d_config_factory( self, ) -> Callable[[], AutomaticLaunchConfiguration]: @@ -311,8 +600,19 @@ class GpuIndexing: if rank > 4: raise ValueError(f"Iteration space rank is too large: {rank}") + # impossible to use block size determination function since the iteration space is unknown + # -> round block size in fastest moving dimension up to multiple of warp size + rounded_block_size: PsExpression + if self._assume_warp_aligned_block_size: + warp_size = self._ast_factory.parse_index(self._hw_props.warp_size) + rounded_block_size = self._ast_factory.parse_index( + PsIntDiv(work_items[0].clone() + warp_size.clone() - self._ast_factory.parse_index(1), + warp_size.clone()) * warp_size.clone()) + else: + rounded_block_size = work_items[0] + block_size = ( - self._kernel_factory.create_lambda(work_items[0]), + self._kernel_factory.create_lambda(rounded_block_size), self._kernel_factory.create_lambda(self._ast_factory.parse_index(1)), self._kernel_factory.create_lambda(self._ast_factory.parse_index(1)), ) @@ -328,6 +628,8 @@ class GpuIndexing: return AutomaticLaunchConfiguration( block_size, cast(_Dim3Lambda, grid_size), + self._hw_props, + self._assume_warp_aligned_block_size, ) return factory diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py index d45abf8786e8fc1789f8eae551a07b6aa556db67..4e36e369e3b65f3f6e8c8f268feae9e16e1b6b56 100644 --- a/src/pystencils/jit/gpu_cupy.py +++ b/src/pystencils/jit/gpu_cupy.py @@ -1,4 +1,4 @@ -from typing import Any, Sequence, cast, Callable +from typing import Any, Callable from dataclasses import dataclass try: @@ -207,19 +207,9 @@ class CupyKernelWrapper(KernelWrapper): class CupyJit(JitBase): - def __init__(self, default_block_size: Sequence[int] = (128, 2, 1)): + def __init__(self): self._runtime_headers = {"<cstdint>"} - if len(default_block_size) > 3: - raise ValueError( - f"Invalid block size: {default_block_size}. Must be at most three-dimensional." - ) - - self._default_block_size: tuple[int, int, int] = cast( - tuple[int, int, int], - tuple(default_block_size) + (1,) * (3 - len(default_block_size)), - ) - def compile(self, kernel: Kernel) -> KernelWrapper: if not HAVE_CUPY: raise JitError( diff --git a/src/pystencils/utils.py b/src/pystencils/utils.py index 0049d0a2ce36471c3cfb98bf39c965d33c24868b..5ef13f31f891ce7904ac1701272deeb1a337d488 100644 --- a/src/pystencils/utils.py +++ b/src/pystencils/utils.py @@ -316,3 +316,8 @@ def div_ceil(divident, divisor): The result is unspecified if either argument is negative.""" return c_intdiv(divident + divisor - 1, divisor) + + +def ceil_to_multiple(divident, divisor): + """Rounds 'divident' to the next multiple of 'divisor'.""" + return div_ceil(divident, divisor) * divisor diff --git a/tests/kernelcreation/test_gpu.py b/tests/kernelcreation/test_gpu.py index f1905b1fcb7c7406f43cfb94af2928b6f35bc3f8..70b4edd35852b1d2ff674e6cb0421bf4fea66e1f 100644 --- a/tests/kernelcreation/test_gpu.py +++ b/tests/kernelcreation/test_gpu.py @@ -12,6 +12,7 @@ from pystencils import ( create_kernel, Target, ) +from pystencils.codegen.gpu_indexing import GpuIndexing, HardwareProperties from pystencils.slicing import ( add_ghost_layers, @@ -19,6 +20,7 @@ from pystencils.slicing import ( remove_ghost_layers, normalize_slice, ) +from math import prod try: import cupy as cp @@ -29,10 +31,12 @@ except ImportError: @pytest.mark.parametrize("indexing_scheme", ["linear3d", "blockwise4d"]) -@pytest.mark.parametrize("omit_range_check", [False, True]) @pytest.mark.parametrize("manual_grid", [False, True]) +@pytest.mark.parametrize("assume_warp_aligned_block_size", [False, True]) def test_indexing_options_3d( - indexing_scheme: str, omit_range_check: bool, manual_grid: bool + indexing_scheme: str, + manual_grid: bool, + assume_warp_aligned_block_size: bool, ): src, dst = fields("src, dst: [3D]") asm = Assignment( @@ -47,8 +51,8 @@ def test_indexing_options_3d( cfg = CreateKernelConfig(target=Target.CUDA) cfg.gpu.indexing_scheme = indexing_scheme - cfg.gpu.omit_range_check = omit_range_check cfg.gpu.manual_launch_grid = manual_grid + cfg.gpu.assume_warp_aligned_block_size = assume_warp_aligned_block_size ast = create_kernel(asm, cfg) kernel = ast.compile() @@ -59,14 +63,100 @@ def test_indexing_options_3d( if manual_grid: match indexing_scheme: case "linear3d": - kernel.launch_config.block_size = (10, 8, 8) - kernel.launch_config.grid_size = (4, 4, 2) + if assume_warp_aligned_block_size: + kernel.launch_config.block_size = (8, 10, 8) + kernel.launch_config.grid_size = (5, 4, 2) + else: + kernel.launch_config.block_size = (10, 10, 8) + kernel.launch_config.grid_size = (4, 4, 2) case "blockwise4d": - kernel.launch_config.block_size = (40, 1, 1) - kernel.launch_config.grid_size = (32, 16, 1) + if assume_warp_aligned_block_size: + kernel.launch_config.block_size = (64, 1, 1) + kernel.launch_config.grid_size = (32, 16, 1) + else: + kernel.launch_config.block_size = (40, 1, 1) + kernel.launch_config.grid_size = (32, 16, 1) elif indexing_scheme == "linear3d": - kernel.launch_config.block_size = (10, 8, 8) + if assume_warp_aligned_block_size: + kernel.launch_config.block_size = (32, 8, 2) + else: + kernel.launch_config.block_size = (10, 10, 10) + + kernel(src=src_arr, dst=dst_arr) + + expected = cp.zeros_like(src_arr) + expected[1:-1, 1:-1, 1:-1].fill(6.0) + + cp.testing.assert_allclose(dst_arr, expected) + +@pytest.mark.parametrize("iteration_space", + [(8, 4, 4), (3, 8, 8), (3, 3, 16), (17, 3, 3), (3, 12, 56), (65, 65, 65), (3, 7, 9)]) +@pytest.mark.parametrize("initial_block_size", + [(8, 4, 4), (3, 8, 8), (3, 3, 16), (2, 2, 64), (8, 2, 1), (3, 1, 32), (32, 1, 1), (1, 2, 3)]) +@pytest.mark.parametrize("assume_warp_aligned_block_size", [True, False]) +@pytest.mark.parametrize("use_block_fitting", [True, False]) +def test_block_size_adaptations( + iteration_space: tuple[int, int, int], + initial_block_size: tuple[int, int, int], + assume_warp_aligned_block_size: bool, + use_block_fitting: bool, +): + src, dst = fields("src, dst: [3D]") + asm = Assignment( + dst.center(), + src[-1, 0, 0] + + src[1, 0, 0] + + src[0, -1, 0] + + src[0, 1, 0] + + src[0, 0, -1] + + src[0, 0, 1], + ) + + target = Target.CUDA + cfg = CreateKernelConfig(target=target) + cfg.gpu.indexing_scheme = "linear3d" + cfg.gpu.assume_warp_aligned_block_size = assume_warp_aligned_block_size + + warp_size = cfg.gpu.default_warp_size(target) + max_threads_per_block = GpuIndexing.get_max_threads_per_block(target) + max_block_sizes = GpuIndexing.get_max_block_sizes(target) + + ast = create_kernel(asm, cfg) + kernel = ast.compile() + + if use_block_fitting: + # test internal block fitting function later used in `kernel.launch_config.fit_block_size` + internal_block_size = kernel.launch_config._fit_block_size_to_it_space( + iteration_space, + initial_block_size, + HardwareProperties(warp_size, max_threads_per_block, max_block_sizes), + ) + + # checks if criterion for warp size alignment is fulfilled + def check_suitability(b): + return prod(b) >= warp_size and prod(b) % warp_size == 0 + + # block size fitting should not modify an already ideal configuration + # -> check if ideal configurations are modified + if ( + check_suitability(initial_block_size) + and all(x == y for x, y in zip(initial_block_size, iteration_space)) # trimming may alter results + ): + assert all(x == y for x, y in zip(initial_block_size, internal_block_size)), \ + f"Initial block size unnecessarily adapted from {initial_block_size} to {internal_block_size}." + + assert check_suitability(internal_block_size), \ + "Determined block size shall be divisible by warp size." + + # set block size via fitting algorithm + kernel.launch_config.fit_block_size(initial_block_size) + else: + # set block size via trimming algorithm + kernel.launch_config.trim_block_size(initial_block_size) + + src_arr = cp.ones(iteration_space) + dst_arr = cp.zeros_like(src_arr) kernel(src=src_arr, dst=dst_arr) @@ -77,10 +167,10 @@ def test_indexing_options_3d( @pytest.mark.parametrize("indexing_scheme", ["linear3d", "blockwise4d"]) -@pytest.mark.parametrize("omit_range_check", [False, True]) @pytest.mark.parametrize("manual_grid", [False, True]) +@pytest.mark.parametrize("assume_warp_aligned_block_size", [False, True]) def test_indexing_options_2d( - indexing_scheme: str, omit_range_check: bool, manual_grid: bool + indexing_scheme: str, manual_grid: bool, assume_warp_aligned_block_size: bool ): src, dst = fields("src, dst: [2D]") asm = Assignment( @@ -93,8 +183,8 @@ def test_indexing_options_2d( cfg = CreateKernelConfig(target=Target.CUDA) cfg.gpu.indexing_scheme = indexing_scheme - cfg.gpu.omit_range_check = omit_range_check cfg.gpu.manual_launch_grid = manual_grid + cfg.gpu.assume_warp_aligned_block_size = assume_warp_aligned_block_size ast = create_kernel(asm, cfg) kernel = ast.compile() @@ -105,14 +195,25 @@ def test_indexing_options_2d( if manual_grid: match indexing_scheme: case "linear3d": - kernel.launch_config.block_size = (10, 8, 1) - kernel.launch_config.grid_size = (4, 2, 1) + if assume_warp_aligned_block_size: + kernel.launch_config.block_size = (8, 8, 1) + kernel.launch_config.grid_size = (5, 2, 1) + else: + kernel.launch_config.block_size = (10, 8, 1) + kernel.launch_config.grid_size = (4, 2, 1) case "blockwise4d": - kernel.launch_config.block_size = (40, 1, 1) - kernel.launch_config.grid_size = (16, 1, 1) + if assume_warp_aligned_block_size: + kernel.launch_config.block_size = (64, 1, 1) + kernel.launch_config.grid_size = (16, 1, 1) + else: + kernel.launch_config.block_size = (40, 1, 1) + kernel.launch_config.grid_size = (16, 1, 1) elif indexing_scheme == "linear3d": - kernel.launch_config.block_size = (10, 8, 1) + if assume_warp_aligned_block_size: + kernel.launch_config.block_size = (8, 8, 1) + else: + kernel.launch_config.block_size = (10, 8, 1) kernel(src=src_arr, dst=dst_arr)