From 73295d2776dcf3b753e9584a422b39514088c124 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Fri, 21 Feb 2025 20:21:02 +0100 Subject: [PATCH 01/54] Throw out omit_range_check configuration option --- src/pystencils/backend/platforms/cuda.py | 18 ++++-------------- src/pystencils/backend/platforms/sycl.py | 17 ++++------------- src/pystencils/codegen/config.py | 8 -------- src/pystencils/codegen/driver.py | 4 ---- tests/kernelcreation/test_gpu.py | 4 +--- 5 files changed, 9 insertions(+), 42 deletions(-) diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index e896fc2bb..3d45117e2 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -166,20 +166,16 @@ class CudaPlatform(GenericGpu): Args: ctx: The kernel creation context - omit_range_check: If `True`, generated index translation code will not check if the point identified - by block and thread indices is actually contained in the iteration space thread_mapping: Callback object which defines the mapping of thread indices onto iteration space points """ def __init__( self, ctx: KernelCreationContext, - omit_range_check: bool = False, thread_mapping: ThreadMapping | None = None, ) -> None: super().__init__(ctx) - self._omit_range_check = omit_range_check self._thread_mapping = ( thread_mapping if thread_mapping is not None else Linear3DMapping() ) @@ -282,8 +278,7 @@ class CudaPlatform(GenericGpu): indexing_decls.append( self._typify(PsDeclaration(ctr_expr, ctr_mapping[dim.counter])) ) - if not self._omit_range_check: - conds.append(PsLt(ctr_expr, dim.stop)) + conds.append(PsLt(ctr_expr, dim.stop)) if conds: condition: PsExpression = conds[0] @@ -324,12 +319,7 @@ class CudaPlatform(GenericGpu): ] body.statements = mappings + body.statements - if not self._omit_range_check: - stop = PsExpression.make(ispace.index_list.shape[0]) - condition = PsLt(sparse_ctr_expr.clone(), stop) - ast = PsBlock([sparse_idx_decl, PsConditional(condition, body)]) - else: - body.statements = [sparse_idx_decl] + body.statements - ast = body + stop = PsExpression.make(ispace.index_list.shape[0]) + condition = PsLt(sparse_ctr_expr.clone(), stop) + return PsBlock([sparse_idx_decl, PsConditional(condition, body)]) - return ast diff --git a/src/pystencils/backend/platforms/sycl.py b/src/pystencils/backend/platforms/sycl.py index f3c4bb3d5..d16c4f51b 100644 --- a/src/pystencils/backend/platforms/sycl.py +++ b/src/pystencils/backend/platforms/sycl.py @@ -35,12 +35,10 @@ class SyclPlatform(Platform): def __init__( self, ctx: KernelCreationContext, - omit_range_check: bool = False, automatic_block_size: bool = False, ): super().__init__(ctx) - self._omit_range_check = omit_range_check self._automatic_block_size = automatic_block_size @property @@ -136,8 +134,7 @@ class SyclPlatform(Platform): indexing_decls.append( PsDeclaration(ctr, dim.start + work_item_idx * dim.step) ) - if not self._omit_range_check: - conds.append(PsLt(ctr, dim.stop)) + conds.append(PsLt(ctr, dim.stop)) if conds: condition: PsExpression = conds[0] @@ -182,15 +179,9 @@ class SyclPlatform(Platform): ] body.statements = mappings + body.statements - if not self._omit_range_check: - stop = PsExpression.make(ispace.index_list.shape[0]) - condition = PsLt(sparse_ctr, stop) - ast = PsBlock([sparse_idx_decl, PsConditional(condition, body)]) - else: - body.statements = [sparse_idx_decl] + body.statements - ast = body - - return ast + stop = PsExpression.make(ispace.index_list.shape[0]) + 
condition = PsLt(sparse_ctr, stop) + return PsBlock([sparse_idx_decl, PsConditional(condition, body)]) def _item_type(self, rank: int): if not self._automatic_block_size: diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py index 0d43b40e3..9856ad632 100644 --- a/src/pystencils/codegen/config.py +++ b/src/pystencils/codegen/config.py @@ -372,14 +372,6 @@ class GpuOptions(ConfigBase): indexing_scheme: Option[GpuIndexingScheme, str] = Option(GpuIndexingScheme.Linear3D) """Thread indexing scheme for dense GPU kernels.""" - omit_range_check: BasicOption[bool] = BasicOption(False) - """If set to `True`, omit the iteration counter range check. - - By default, the code generator introduces a check if the iteration counters computed from GPU block and thread - indices are within the prescribed loop range. - This check can be discarded through this option, at your own peril. - """ - block_size: BasicOption[tuple[int, int, int] | _AUTO_TYPE] = BasicOption(AUTO) """Desired block size for the execution of GPU kernels. diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index b8f9c7101..ac933e2e5 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -438,7 +438,6 @@ class DefaultKernelCreationDriver: elif self._target.is_gpu(): gpu_opts = self._cfg.gpu - omit_range_check: bool = gpu_opts.get_option("omit_range_check") match self._target: case Target.CUDA: @@ -452,18 +451,15 @@ class DefaultKernelCreationDriver: return CudaPlatform( self._ctx, - omit_range_check=omit_range_check, thread_mapping=thread_mapping, ) elif self._target == Target.SYCL: from ..backend.platforms import SyclPlatform auto_block_size: bool = self._cfg.sycl.get_option("automatic_block_size") - omit_range_check = self._cfg.gpu.get_option("omit_range_check") return SyclPlatform( self._ctx, - omit_range_check=omit_range_check, automatic_block_size=auto_block_size, ) diff --git a/tests/kernelcreation/test_gpu.py b/tests/kernelcreation/test_gpu.py index 10b37e610..38e849c3c 100644 --- a/tests/kernelcreation/test_gpu.py +++ b/tests/kernelcreation/test_gpu.py @@ -29,10 +29,9 @@ except ImportError: @pytest.mark.parametrize("indexing_scheme", ["linear3d", "blockwise4d"]) -@pytest.mark.parametrize("omit_range_check", [False, True]) @pytest.mark.parametrize("manual_grid", [False, True]) def test_indexing_options( - indexing_scheme: str, omit_range_check: bool, manual_grid: bool + indexing_scheme: str, manual_grid: bool ): src, dst = fields("src, dst: [3D]") asm = Assignment( @@ -47,7 +46,6 @@ def test_indexing_options( cfg = CreateKernelConfig(target=Target.CUDA) cfg.gpu.indexing_scheme = indexing_scheme - cfg.gpu.omit_range_check = omit_range_check cfg.gpu.manual_launch_grid = manual_grid ast = create_kernel(asm, cfg) -- GitLab From 0a907233b111343317abf14a47ecccbbd55ae64c Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Fri, 21 Feb 2025 21:12:49 +0100 Subject: [PATCH 02/54] Introduce warp_size as GpuOption and to round block sizes to multiples of it --- src/pystencils/codegen/config.py | 19 +++++++++++++++++++ src/pystencils/codegen/driver.py | 7 ++++++- src/pystencils/codegen/gpu_indexing.py | 25 +++++++++++++++++++++---- 3 files changed, 46 insertions(+), 5 deletions(-) diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py index 9856ad632..fcf1190f5 100644 --- a/src/pystencils/codegen/config.py +++ b/src/pystencils/codegen/config.py @@ -388,6 +388,25 @@ class GpuOptions(ConfigBase): The launch grid 
will then have to be specified manually at runtime. """ + warp_size: BasicOption[int] = BasicOption() + """Specifies the size of a warp (CUDA) or wavefront (HIP). + + If set to `None` (the default), the default value for the given target will be automatically used. + """ + + @staticmethod + def default_warp_size(target: Target): + if target.is_vector_cpu(): + raise ValueError(f"Given target {target} is no GPU target.") + + match target: + case Target.CUDA: + return 32 + case _: + raise NotImplementedError( + f"No default warp/wavefront size known for target {target}" + ) + @indexing_scheme.validate def _validate_idx_scheme(self, val: str | GpuIndexingScheme): if isinstance(val, GpuIndexingScheme): diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index ac933e2e5..8286d962a 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -11,6 +11,7 @@ from .config import ( GhostLayerSpec, IterationSliceSpec, GpuIndexingScheme, + GpuOptions, ) from .kernel import Kernel, GpuKernel from .properties import PsSymbolProperty, FieldBasePtr @@ -406,8 +407,12 @@ class DefaultKernelCreationDriver: idx_scheme: GpuIndexingScheme = self._cfg.gpu.get_option("indexing_scheme") block_size: dim3 | _AUTO_TYPE = self._cfg.gpu.get_option("block_size") manual_launch_grid: bool = self._cfg.gpu.get_option("manual_launch_grid") + warp_size: int | None = self._cfg.gpu.get_option("warp_size") - return GpuIndexing(self._ctx, idx_scheme, block_size, manual_launch_grid) + if warp_size is None: + warp_size = GpuOptions.default_warp_size(self._target) + + return GpuIndexing(self._ctx, idx_scheme, warp_size, block_size, manual_launch_grid) def _get_platform(self) -> Platform: if Target._CPU in self._target: diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index c93f0f959..08a24d761 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -8,6 +8,7 @@ from .functions import Lambda from .parameters import Parameter from .errors import CodegenError from .config import GpuIndexingScheme, _AUTO_TYPE +from ..backend.constants import PsConstant from ..backend.kernelcreation import ( KernelCreationContext, @@ -16,8 +17,7 @@ from ..backend.kernelcreation import ( ) from ..backend.platforms.cuda import ThreadMapping -from ..backend.ast.expressions import PsExpression - +from ..backend.ast.expressions import PsExpression, PsIntDiv dim3 = tuple[int, int, int] _Dim3Lambda = tuple[Lambda, Lambda, Lambda] @@ -63,12 +63,14 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration): self, block_size: _Dim3Lambda, grid_size: _Dim3Lambda, + warp_size: Lambda ) -> None: self._block_size = block_size self._grid_size = grid_size + self._warp_size = warp_size self._params: frozenset[Parameter] = frozenset().union( - *(lb.parameters for lb in chain(block_size, grid_size)) + *(lb.parameters for lb in chain(block_size, grid_size, warp_size)) ) @property @@ -76,7 +78,13 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration): return self._params def evaluate(self, **kwargs) -> tuple[dim3, dim3]: + def ceil_to_multiple(arg1, arg2): + PsIntDiv(arg1 + arg2 - PsExpression.make(PsConstant(1)), arg2) * arg2 + block_size = tuple(int(bs(**kwargs)) for bs in self._block_size) + # round block size in fastest moving dimension up to multiple of warp size + block_size = (ceil_to_multiple(block_size[0], self._warp_size), *block_size[1:]) + grid_size = tuple(int(gs(**kwargs)) for gs in self._grid_size) return 
cast(dim3, block_size), cast(dim3, grid_size) @@ -143,10 +151,13 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): def __init__( self, num_work_items: _Dim3Lambda, + warp_size: Lambda, default_block_size: dim3 | None = None, ) -> None: self._num_work_items = num_work_items + self._warp_size = warp_size + self._block_size: dim3 | None = default_block_size self._params: frozenset[Parameter] = frozenset().union( @@ -184,7 +195,8 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): ) reduced_block_size = cast( dim3, - tuple(min(wit, bs) for wit, bs in zip(num_work_items, self._block_size)), + tuple(div_ceil(min(wit, bs), self._warp_size) * self._warp_size + for wit, bs in zip(num_work_items, self._block_size)), ) grid_size = cast( dim3, @@ -219,11 +231,13 @@ class GpuIndexing: self, ctx: KernelCreationContext, scheme: GpuIndexingScheme, + warp_size: int, default_block_size: dim3 | _AUTO_TYPE | None = None, manual_launch_grid: bool = False, ) -> None: self._ctx = ctx self._scheme = scheme + self._warp_size = warp_size self._default_block_size = default_block_size self._manual_launch_grid = manual_launch_grid @@ -319,10 +333,13 @@ class GpuIndexing: for _ in range(4 - rank) ) + warp_size = self._kernel_factory.create_lambda(PsExpression.make(PsConstant(self._warp_size))) + def factory(): return AutomaticLaunchConfiguration( block_size, cast(_Dim3Lambda, grid_size), + warp_size ) return factory -- GitLab From 1b7d1283e6c0d95f1739720b327715400f7a0a40 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Mon, 24 Feb 2025 11:59:42 +0100 Subject: [PATCH 03/54] Add missing warp size parameter to DynamicBlockSizeLaunchConfiguration --- src/pystencils/codegen/gpu_indexing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index 08a24d761..6cd82a645 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -151,7 +151,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): def __init__( self, num_work_items: _Dim3Lambda, - warp_size: Lambda, + warp_size: int, default_block_size: dim3 | None = None, ) -> None: self._num_work_items = num_work_items @@ -289,6 +289,7 @@ class GpuIndexing: def factory(): return DynamicBlockSizeLaunchConfiguration( num_work_items, + self._warp_size, self._get_default_block_size(rank), ) -- GitLab From d9744fed3d3caa9bd315718c4e06f8bd5edfe7c9 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Mon, 24 Feb 2025 12:10:43 +0100 Subject: [PATCH 04/54] Fix lint --- src/pystencils/backend/platforms/cuda.py | 1 - src/pystencils/codegen/driver.py | 2 -- 2 files changed, 3 deletions(-) diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index 3d45117e2..ab5d6c19b 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -322,4 +322,3 @@ class CudaPlatform(GenericGpu): stop = PsExpression.make(ispace.index_list.shape[0]) condition = PsLt(sparse_ctr_expr.clone(), stop) return PsBlock([sparse_idx_decl, PsConditional(condition, body)]) - diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index 8286d962a..6f041b6a7 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -442,8 +442,6 @@ class DefaultKernelCreationDriver: ) elif self._target.is_gpu(): - gpu_opts = self._cfg.gpu - match self._target: case 
Target.CUDA: from ..backend.platforms import CudaPlatform -- GitLab From 675c6bd71acac5cec54a5306390e928e3df1ce82 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Mon, 24 Feb 2025 12:14:15 +0100 Subject: [PATCH 05/54] Fix typecheck --- src/pystencils/codegen/gpu_indexing.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index 6cd82a645..731abcc64 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -20,6 +20,7 @@ from ..backend.platforms.cuda import ThreadMapping from ..backend.ast.expressions import PsExpression, PsIntDiv dim3 = tuple[int, int, int] +_Dim1Lambda = tuple[Lambda] _Dim3Lambda = tuple[Lambda, Lambda, Lambda] @@ -63,7 +64,7 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration): self, block_size: _Dim3Lambda, grid_size: _Dim3Lambda, - warp_size: Lambda + warp_size: _Dim1Lambda ) -> None: self._block_size = block_size self._grid_size = grid_size @@ -83,7 +84,7 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration): block_size = tuple(int(bs(**kwargs)) for bs in self._block_size) # round block size in fastest moving dimension up to multiple of warp size - block_size = (ceil_to_multiple(block_size[0], self._warp_size), *block_size[1:]) + block_size = (ceil_to_multiple(block_size[0], self._warp_size[0]), *block_size[1:]) grid_size = tuple(int(gs(**kwargs)) for gs in self._grid_size) return cast(dim3, block_size), cast(dim3, grid_size) @@ -334,7 +335,7 @@ class GpuIndexing: for _ in range(4 - rank) ) - warp_size = self._kernel_factory.create_lambda(PsExpression.make(PsConstant(self._warp_size))) + warp_size = (self._kernel_factory.create_lambda(PsExpression.make(PsConstant(self._warp_size))),) def factory(): return AutomaticLaunchConfiguration( -- GitLab From eddb7882d3eafde8be8471a8c3accc6637e552f1 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Mon, 24 Feb 2025 13:29:25 +0100 Subject: [PATCH 06/54] Only apply warp-size rounding to x block size --- src/pystencils/codegen/gpu_indexing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index 731abcc64..048dcd2fe 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -196,8 +196,8 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): ) reduced_block_size = cast( dim3, - tuple(div_ceil(min(wit, bs), self._warp_size) * self._warp_size - for wit, bs in zip(num_work_items, self._block_size)), + (div_ceil(min(num_work_items[0], self._block_size[0]), self._warp_size) * self._warp_size,) + + tuple(min(wit, bs) for wit, bs in zip(num_work_items[1:], self._block_size[1:])) ) grid_size = cast( dim3, -- GitLab From 3072fbbc71beb46c3b066d55a75373f239626a01 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Mon, 24 Feb 2025 14:51:44 +0100 Subject: [PATCH 07/54] Add algorithm for determining best fit between GPU block config and iteration space --- src/pystencils/codegen/gpu_indexing.py | 63 +++++++++++++++++++++++--- 1 file changed, 56 insertions(+), 7 deletions(-) diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index 048dcd2fe..f9b3f572c 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -17,7 +17,11 @@ from ..backend.kernelcreation import ( ) from 
..backend.platforms.cuda import ThreadMapping -from ..backend.ast.expressions import PsExpression, PsIntDiv +from ..backend.ast.expressions import PsExpression +from math import prod + +from ..sympyextensions.integer_functions import div_ceil +from ..types.quick import UInt dim3 = tuple[int, int, int] _Dim1Lambda = tuple[Lambda] @@ -53,6 +57,51 @@ class GpuLaunchConfiguration(ABC): this launch configuration, such that when the configuration changes, the JIT parameter cache is invalidated.""" + @staticmethod + def determine_block_size(it_space: tuple[int, int, int], block_size: tuple[int, int, int], + warp_size: int) -> tuple[int, ...]: + + def trim(to_trim: list[int]) -> list[int]: + return [min(b, i) for b, i in zip(to_trim, it_space)] + + trimmed = trim(list(block_size)) + if prod(trimmed) >= warp_size: + # case 1: greater than min block size -> use trimmed result + if prod(trimmed) % warp_size == 0: + return tuple(trimmed) + else: + trimmed[0] = div_ceil(trimmed[0], warp_size) * warp_size + + prev_trim_size = 0 + resize_order = [0, 2, 1] if len(it_space) == 3 else range(len(it_space)) + max_block_sizes = [1024, 1024, 64] # TODO: move to target + while prod(trimmed) is not prev_trim_size: + prev_trim_size = prod(trimmed) + + # case 2: trimmed block is equivalent to the whole iteration space + if all(b == i for b, i in zip(trimmed, it_space)): + return tuple(trimmed) + else: + # double block size in each dimension until block is large enough (or case 2 triggers) + for d in resize_order: + trimmed[d] *= 2 + + # trim fastest moving dim to multiples of warp size + if d == 0 and trimmed[d] > warp_size and trimmed[d] % warp_size != 0: + trimmed[d] = trimmed[d] - (trimmed[d] % warp_size) # subtract remainder + + # check if block sizes are within hardware capabilities + trimmed[d] = min(trimmed[d], max_block_sizes[d]) + + # trim again + trimmed = trim(trimmed) + + # case 3: trim block is large enough + if prod(trimmed) >= warp_size: + return tuple(trimmed) + + raise CodegenError("Unable to determine GPU block size for this kernel.") + class AutomaticLaunchConfiguration(GpuLaunchConfiguration): """Launch configuration that is dynamically computed from kernel parameters. 
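# Illustrative sketch (not code from this patch): a simplified stand-in for the
# determine_block_size() logic introduced above. It shows the two core steps --
# trim the requested block to the iteration space, then round the fastest-moving
# (x) dimension up to a multiple of the warp size while respecting the per-block
# thread limit. The function name and the example numbers are assumptions.
from math import prod


def _ceil_to_multiple(value: int, multiple: int) -> int:
    # smallest multiple of `multiple` that is >= `value`
    return -(-value // multiple) * multiple


def fit_block_size(it_space, requested, warp_size=32, max_threads_per_block=1024):
    # step 1: never spawn more threads per dimension than there are iteration points
    trimmed = [min(b, i) for b, i in zip(requested, it_space)]
    # step 2: align the x dimension with the warp size so no warp stays partially idle
    trimmed[0] = _ceil_to_multiple(trimmed[0], warp_size)
    if prod(trimmed) > max_threads_per_block:
        raise ValueError(f"{tuple(trimmed)} exceeds {max_threads_per_block} threads per block")
    return tuple(trimmed)


# A (40, 5, 3) iteration space with a (128, 16, 4) default block: trimming gives
# (40, 5, 3), and rounding x up to the warp size gives (64, 5, 3) -> 960 threads.
assert fit_block_size((40, 5, 3), (128, 16, 4)) == (64, 5, 3)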
@@ -80,11 +129,12 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration): def evaluate(self, **kwargs) -> tuple[dim3, dim3]: def ceil_to_multiple(arg1, arg2): - PsIntDiv(arg1 + arg2 - PsExpression.make(PsConstant(1)), arg2) * arg2 + return div_ceil(arg1 + arg2 - 1, arg2) * arg2 + # impossible to use block size determination function since the iteration space is unknown + # -> round block size in fastest moving dimension up to multiple of warp size block_size = tuple(int(bs(**kwargs)) for bs in self._block_size) - # round block size in fastest moving dimension up to multiple of warp size - block_size = (ceil_to_multiple(block_size[0], self._warp_size[0]), *block_size[1:]) + block_size = (ceil_to_multiple(block_size[0], int(self._warp_size[0](**kwargs))), *block_size[1:]) grid_size = tuple(int(gs(**kwargs)) for gs in self._grid_size) return cast(dim3, block_size), cast(dim3, grid_size) @@ -196,8 +246,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): ) reduced_block_size = cast( dim3, - (div_ceil(min(num_work_items[0], self._block_size[0]), self._warp_size) * self._warp_size,) - + tuple(min(wit, bs) for wit, bs in zip(num_work_items[1:], self._block_size[1:])) + self.determine_block_size(num_work_items, self._block_size, self._warp_size), ) grid_size = cast( dim3, @@ -335,7 +384,7 @@ class GpuIndexing: for _ in range(4 - rank) ) - warp_size = (self._kernel_factory.create_lambda(PsExpression.make(PsConstant(self._warp_size))),) + warp_size = (self._kernel_factory.create_lambda(PsExpression.make(PsConstant(self._warp_size, UInt(32)))),) def factory(): return AutomaticLaunchConfiguration( -- GitLab From 799c22f0cfb068c42403080db8457b745cfb64c1 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Mon, 24 Feb 2025 16:50:31 +0100 Subject: [PATCH 08/54] Introduce BasicOption max_block_sizes to GPU targets and employ in GpuIndexing --- src/pystencils/codegen/config.py | 16 +++++++++++++--- src/pystencils/codegen/driver.py | 5 ++++- src/pystencils/codegen/gpu_indexing.py | 12 +++++++++--- 3 files changed, 26 insertions(+), 7 deletions(-) diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py index fcf1190f5..94a8f270d 100644 --- a/src/pystencils/codegen/config.py +++ b/src/pystencils/codegen/config.py @@ -380,6 +380,9 @@ class GpuOptions(ConfigBase): The block size may be overridden at runtime. """ + max_block_sizes: BasicOption[tuple[int, int, int] | _AUTO_TYPE] = BasicOption() + """Maximum dimension size of a thread block in dimensions (x, y, z).""" + manual_launch_grid: BasicOption[bool] = BasicOption(False) """Always require a manually specified launch grid when running this kernel. 
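# Hedged usage sketch, not part of the patch: how the GpuOptions touched in this
# series are meant to be set from user code, following the pattern already used in
# tests/kernelcreation/test_gpu.py above. The top-level import path and the
# attribute-style assignment of options are assumptions based on that test code.
from pystencils import CreateKernelConfig, Target

cfg = CreateKernelConfig(target=Target.CUDA)
cfg.gpu.indexing_scheme = "linear3d"   # as in test_indexing_options
cfg.gpu.warp_size = 32                 # optional; falls back to the target's default
cfg.gpu.manual_launch_grid = False     # let pystencils derive the launch grid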
@@ -396,9 +399,6 @@ class GpuOptions(ConfigBase): @staticmethod def default_warp_size(target: Target): - if target.is_vector_cpu(): - raise ValueError(f"Given target {target} is no GPU target.") - match target: case Target.CUDA: return 32 @@ -407,6 +407,16 @@ class GpuOptions(ConfigBase): f"No default warp/wavefront size known for target {target}" ) + @staticmethod + def default_maximum_block_sizes(target: Target): + match target: + case Target.CUDA: + return (1024, 1024, 64) + case _: + raise NotImplementedError( + f"No default value known for maximum GPU block sizes available on target {target}" + ) + @indexing_scheme.validate def _validate_idx_scheme(self, val: str | GpuIndexingScheme): if isinstance(val, GpuIndexingScheme): diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index 6f041b6a7..ca2caca35 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -408,11 +408,14 @@ class DefaultKernelCreationDriver: block_size: dim3 | _AUTO_TYPE = self._cfg.gpu.get_option("block_size") manual_launch_grid: bool = self._cfg.gpu.get_option("manual_launch_grid") warp_size: int | None = self._cfg.gpu.get_option("warp_size") + max_block_sizes: tuple[int, int, int] | None = self._cfg.gpu.get_option("max_block_sizes") if warp_size is None: warp_size = GpuOptions.default_warp_size(self._target) + if max_block_sizes is None: + max_block_sizes = GpuOptions.default_maximum_block_sizes(self._target) - return GpuIndexing(self._ctx, idx_scheme, warp_size, block_size, manual_launch_grid) + return GpuIndexing(self._ctx, idx_scheme, warp_size, max_block_sizes, block_size, manual_launch_grid) def _get_platform(self) -> Platform: if Target._CPU in self._target: diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index f9b3f572c..ce5def203 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -59,7 +59,8 @@ class GpuLaunchConfiguration(ABC): @staticmethod def determine_block_size(it_space: tuple[int, int, int], block_size: tuple[int, int, int], - warp_size: int) -> tuple[int, ...]: + max_block_sizes: tuple[int, int, int], warp_size: int) -> tuple[int, ...]: + """Returns an optimized block size configuration with block sizes being aligned with the warp size.""" def trim(to_trim: list[int]) -> list[int]: return [min(b, i) for b, i in zip(to_trim, it_space)] @@ -74,7 +75,6 @@ class GpuLaunchConfiguration(ABC): prev_trim_size = 0 resize_order = [0, 2, 1] if len(it_space) == 3 else range(len(it_space)) - max_block_sizes = [1024, 1024, 64] # TODO: move to target while prod(trimmed) is not prev_trim_size: prev_trim_size = prod(trimmed) @@ -203,12 +203,15 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): self, num_work_items: _Dim3Lambda, warp_size: int, + max_block_sizes: dim3, default_block_size: dim3 | None = None, ) -> None: self._num_work_items = num_work_items self._warp_size = warp_size + self._max_block_sizes = max_block_sizes + self._block_size: dim3 | None = default_block_size self._params: frozenset[Parameter] = frozenset().union( @@ -246,7 +249,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): ) reduced_block_size = cast( dim3, - self.determine_block_size(num_work_items, self._block_size, self._warp_size), + self.determine_block_size(num_work_items, self._block_size, self._max_block_sizes, self._warp_size), ) grid_size = cast( dim3, @@ -282,12 +285,14 @@ class GpuIndexing: ctx: KernelCreationContext, scheme: 
GpuIndexingScheme, warp_size: int, + max_block_sizes: dim3, default_block_size: dim3 | _AUTO_TYPE | None = None, manual_launch_grid: bool = False, ) -> None: self._ctx = ctx self._scheme = scheme self._warp_size = warp_size + self._max_block_sizes = max_block_sizes self._default_block_size = default_block_size self._manual_launch_grid = manual_launch_grid @@ -340,6 +345,7 @@ class GpuIndexing: return DynamicBlockSizeLaunchConfiguration( num_work_items, self._warp_size, + self._max_block_sizes, self._get_default_block_size(rank), ) -- GitLab From e010924c25662932911ca1c98ed2527abd0417db Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Mon, 24 Feb 2025 16:54:43 +0100 Subject: [PATCH 09/54] Add missing comment [skip ci] --- src/pystencils/codegen/config.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py index 94a8f270d..8e6ae0805 100644 --- a/src/pystencils/codegen/config.py +++ b/src/pystencils/codegen/config.py @@ -381,7 +381,10 @@ class GpuOptions(ConfigBase): """ max_block_sizes: BasicOption[tuple[int, int, int] | _AUTO_TYPE] = BasicOption() - """Maximum dimension size of a thread block in dimensions (x, y, z).""" + """Maximum dimension size of a thread block in dimensions (x, y, z). + + If set to `None` (the default), the default value for the given target will be automatically used. + """ manual_launch_grid: BasicOption[bool] = BasicOption(False) """Always require a manually specified launch grid when running this kernel. -- GitLab From a3a7744c3f88972645c0c9386b0f0b89e44970d0 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 26 Feb 2025 18:18:25 +0100 Subject: [PATCH 10/54] Slightly adapt algorithm for determining block sizes with known iteration space --- src/pystencils/codegen/gpu_indexing.py | 34 ++++++++++++++++++++------ 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index ce5def203..bdf2225c0 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -58,20 +58,35 @@ class GpuLaunchConfiguration(ABC): cache is invalidated.""" @staticmethod - def determine_block_size(it_space: tuple[int, int, int], block_size: tuple[int, int, int], + def determine_block_size(it_space: tuple[int, ...], block_size: tuple[int, int, int], max_block_sizes: tuple[int, int, int], warp_size: int) -> tuple[int, ...]: """Returns an optimized block size configuration with block sizes being aligned with the warp size.""" def trim(to_trim: list[int]) -> list[int]: return [min(b, i) for b, i in zip(to_trim, it_space)] + def ceil_to_multiple(to_round: list[int]) -> tuple[int, ...]: + def round(val): + return div_ceil(val, warp_size) * warp_size + + # find index of element closest to warp size and round up + index_to_round = to_round.index(max(to_round, key=lambda i: abs(i % warp_size))) + if index_to_round+1 < len(to_round): + return (*to_round[:index_to_round], round(to_round[index_to_round]), *to_round[index_to_round+1:]) + else: + return (*to_round[:index_to_round], round(to_round[index_to_round])) + + def check_sizes_and_return(to_return: tuple[int, ...]) -> tuple[int, ...]: + if not all(size < max_size for size, max_size in zip(to_return, max_block_sizes)) \ + or prod(to_return) > max(max_block_sizes): + raise CodegenError(f"Unable to determine GPU block size for this kernel for initial block size config" + f"{block_size} and 
iteration space {it_space}.") + return to_return + trimmed = trim(list(block_size)) - if prod(trimmed) >= warp_size: + if prod(trimmed) >= warp_size and prod(trimmed) % warp_size == 0: # case 1: greater than min block size -> use trimmed result - if prod(trimmed) % warp_size == 0: - return tuple(trimmed) - else: - trimmed[0] = div_ceil(trimmed[0], warp_size) * warp_size + return check_sizes_and_return(tuple(trimmed)) prev_trim_size = 0 resize_order = [0, 2, 1] if len(it_space) == 3 else range(len(it_space)) @@ -80,7 +95,10 @@ class GpuLaunchConfiguration(ABC): # case 2: trimmed block is equivalent to the whole iteration space if all(b == i for b, i in zip(trimmed, it_space)): - return tuple(trimmed) + if prod(trimmed) % warp_size != 0: + return check_sizes_and_return(ceil_to_multiple(trimmed)) + else: + return check_sizes_and_return(tuple(trimmed)) else: # double block size in each dimension until block is large enough (or case 2 triggers) for d in resize_order: @@ -98,7 +116,7 @@ class GpuLaunchConfiguration(ABC): # case 3: trim block is large enough if prod(trimmed) >= warp_size: - return tuple(trimmed) + return check_sizes_and_return(tuple(trimmed)) raise CodegenError("Unable to determine GPU block size for this kernel.") -- GitLab From a77092d80731936dec9457c28cbdb10d97ce2d65 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 26 Feb 2025 18:48:11 +0100 Subject: [PATCH 11/54] Remove unnecessary condition --- src/pystencils/backend/platforms/cuda.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index ab5d6c19b..461597ae4 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -280,16 +280,10 @@ class CudaPlatform(GenericGpu): ) conds.append(PsLt(ctr_expr, dim.stop)) - if conds: - condition: PsExpression = conds[0] - for cond in conds[1:]: - condition = PsAnd(condition, cond) - ast = PsBlock(indexing_decls + [PsConditional(condition, body)]) - else: - body.statements = indexing_decls + body.statements - ast = body - - return ast + condition: PsExpression = conds[0] + for cond in conds[1:]: + condition = PsAnd(condition, cond) + return PsBlock(indexing_decls + [PsConditional(condition, body)]) def _prepend_sparse_translation( self, body: PsBlock, ispace: SparseIterationSpace -- GitLab From cbe094b6a257799685066ba88ce86e99accdee83 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 26 Feb 2025 19:36:34 +0100 Subject: [PATCH 12/54] Fix lint --- src/pystencils/codegen/gpu_indexing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index bdf2225c0..bcb24e0e0 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -71,8 +71,8 @@ class GpuLaunchConfiguration(ABC): # find index of element closest to warp size and round up index_to_round = to_round.index(max(to_round, key=lambda i: abs(i % warp_size))) - if index_to_round+1 < len(to_round): - return (*to_round[:index_to_round], round(to_round[index_to_round]), *to_round[index_to_round+1:]) + if index_to_round + 1 < len(to_round): + return (*to_round[:index_to_round], round(to_round[index_to_round]), *to_round[index_to_round + 1:]) else: return (*to_round[:index_to_round], round(to_round[index_to_round])) -- GitLab From 267e2d4128afde5ebe5a5b9e54704dc14c2543fb Mon Sep 17 00:00:00 2001 
From: zy69guqi <richard.angersbach@fau.de> Date: Mon, 3 Mar 2025 16:58:16 +0100 Subject: [PATCH 13/54] Adapt formatting --- src/pystencils/backend/platforms/cuda.py | 4 +- src/pystencils/codegen/config.py | 2 +- src/pystencils/codegen/driver.py | 13 ++++- src/pystencils/codegen/gpu_indexing.py | 62 ++++++++++++++++-------- tests/kernelcreation/test_gpu.py | 4 +- 5 files changed, 57 insertions(+), 28 deletions(-) diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index 461597ae4..218cae0f6 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -117,7 +117,7 @@ class Blockwise4DMapping(ThreadMapping): THREAD_IDX[0], BLOCK_IDX[0], BLOCK_IDX[1], - BLOCK_IDX[2] + BLOCK_IDX[2], ] def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]: @@ -163,7 +163,7 @@ class Blockwise4DMapping(ThreadMapping): class CudaPlatform(GenericGpu): """Platform for CUDA-based GPUs. - + Args: ctx: The kernel creation context thread_mapping: Callback object which defines the mapping of thread indices onto iteration space points diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py index 8e6ae0805..e369dde4a 100644 --- a/src/pystencils/codegen/config.py +++ b/src/pystencils/codegen/config.py @@ -625,7 +625,7 @@ class CreateKernelConfig(ConfigBase): elif self.get_target() == Target.CUDA: try: from ..jit.gpu_cupy import CupyJit - + return CupyJit() except ImportError: diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index ca2caca35..9404f9754 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -408,14 +408,23 @@ class DefaultKernelCreationDriver: block_size: dim3 | _AUTO_TYPE = self._cfg.gpu.get_option("block_size") manual_launch_grid: bool = self._cfg.gpu.get_option("manual_launch_grid") warp_size: int | None = self._cfg.gpu.get_option("warp_size") - max_block_sizes: tuple[int, int, int] | None = self._cfg.gpu.get_option("max_block_sizes") + max_block_sizes: tuple[int, int, int] | None = self._cfg.gpu.get_option( + "max_block_sizes" + ) if warp_size is None: warp_size = GpuOptions.default_warp_size(self._target) if max_block_sizes is None: max_block_sizes = GpuOptions.default_maximum_block_sizes(self._target) - return GpuIndexing(self._ctx, idx_scheme, warp_size, max_block_sizes, block_size, manual_launch_grid) + return GpuIndexing( + self._ctx, + idx_scheme, + warp_size, + max_block_sizes, + block_size, + manual_launch_grid, + ) def _get_platform(self) -> Platform: if Target._CPU in self._target: diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index bcb24e0e0..5bdb0b24a 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -58,8 +58,12 @@ class GpuLaunchConfiguration(ABC): cache is invalidated.""" @staticmethod - def determine_block_size(it_space: tuple[int, ...], block_size: tuple[int, int, int], - max_block_sizes: tuple[int, int, int], warp_size: int) -> tuple[int, ...]: + def determine_block_size( + it_space: tuple[int, ...], + block_size: tuple[int, int, int], + max_block_sizes: tuple[int, int, int], + warp_size: int, + ) -> tuple[int, ...]: """Returns an optimized block size configuration with block sizes being aligned with the warp size.""" def trim(to_trim: list[int]) -> list[int]: @@ -70,17 +74,26 @@ class GpuLaunchConfiguration(ABC): return div_ceil(val, warp_size) * warp_size # find index of element closest to warp 
size and round up - index_to_round = to_round.index(max(to_round, key=lambda i: abs(i % warp_size))) + index_to_round = to_round.index( + max(to_round, key=lambda i: abs(i % warp_size)) + ) if index_to_round + 1 < len(to_round): - return (*to_round[:index_to_round], round(to_round[index_to_round]), *to_round[index_to_round + 1:]) + return ( + *to_round[:index_to_round], + round(to_round[index_to_round]), + *to_round[index_to_round + 1 :], + ) else: return (*to_round[:index_to_round], round(to_round[index_to_round])) def check_sizes_and_return(to_return: tuple[int, ...]) -> tuple[int, ...]: - if not all(size < max_size for size, max_size in zip(to_return, max_block_sizes)) \ - or prod(to_return) > max(max_block_sizes): - raise CodegenError(f"Unable to determine GPU block size for this kernel for initial block size config" - f"{block_size} and iteration space {it_space}.") + if not all( + size < max_size for size, max_size in zip(to_return, max_block_sizes) + ) or prod(to_return) > max(max_block_sizes): + raise CodegenError( + f"Unable to determine GPU block size for this kernel for initial block size config" + f"{block_size} and iteration space {it_space}." + ) return to_return trimmed = trim(list(block_size)) @@ -105,8 +118,13 @@ class GpuLaunchConfiguration(ABC): trimmed[d] *= 2 # trim fastest moving dim to multiples of warp size - if d == 0 and trimmed[d] > warp_size and trimmed[d] % warp_size != 0: - trimmed[d] = trimmed[d] - (trimmed[d] % warp_size) # subtract remainder + if ( + d == 0 + and trimmed[d] > warp_size + and trimmed[d] % warp_size != 0 + ): + # subtract remainder + trimmed[d] = trimmed[d] - (trimmed[d] % warp_size) # check if block sizes are within hardware capabilities trimmed[d] = min(trimmed[d], max_block_sizes[d]) @@ -128,10 +146,7 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration): """ def __init__( - self, - block_size: _Dim3Lambda, - grid_size: _Dim3Lambda, - warp_size: _Dim1Lambda + self, block_size: _Dim3Lambda, grid_size: _Dim3Lambda, warp_size: _Dim1Lambda ) -> None: self._block_size = block_size self._grid_size = grid_size @@ -152,7 +167,10 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration): # impossible to use block size determination function since the iteration space is unknown # -> round block size in fastest moving dimension up to multiple of warp size block_size = tuple(int(bs(**kwargs)) for bs in self._block_size) - block_size = (ceil_to_multiple(block_size[0], int(self._warp_size[0](**kwargs))), *block_size[1:]) + block_size = ( + ceil_to_multiple(block_size[0], int(self._warp_size[0](**kwargs))), + *block_size[1:], + ) grid_size = tuple(int(gs(**kwargs)) for gs in self._grid_size) return cast(dim3, block_size), cast(dim3, grid_size) @@ -267,7 +285,9 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): ) reduced_block_size = cast( dim3, - self.determine_block_size(num_work_items, self._block_size, self._max_block_sizes, self._warp_size), + self.determine_block_size( + num_work_items, self._block_size, self._max_block_sizes, self._warp_size + ), ) grid_size = cast( dim3, @@ -408,13 +428,15 @@ class GpuIndexing: for _ in range(4 - rank) ) - warp_size = (self._kernel_factory.create_lambda(PsExpression.make(PsConstant(self._warp_size, UInt(32)))),) + warp_size = ( + self._kernel_factory.create_lambda( + PsExpression.make(PsConstant(self._warp_size, UInt(32))) + ), + ) def factory(): return AutomaticLaunchConfiguration( - block_size, - cast(_Dim3Lambda, grid_size), - warp_size + block_size, cast(_Dim3Lambda, grid_size), 
warp_size ) return factory diff --git a/tests/kernelcreation/test_gpu.py b/tests/kernelcreation/test_gpu.py index 38e849c3c..ad3d2bd3e 100644 --- a/tests/kernelcreation/test_gpu.py +++ b/tests/kernelcreation/test_gpu.py @@ -30,9 +30,7 @@ except ImportError: @pytest.mark.parametrize("indexing_scheme", ["linear3d", "blockwise4d"]) @pytest.mark.parametrize("manual_grid", [False, True]) -def test_indexing_options( - indexing_scheme: str, manual_grid: bool -): +def test_indexing_options(indexing_scheme: str, manual_grid: bool): src, dst = fields("src, dst: [3D]") asm = Assignment( dst.center(), -- GitLab From a8f193c14ae6bc808afb35743a215bab1feb2e2e Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Mon, 3 Mar 2025 17:22:10 +0100 Subject: [PATCH 14/54] Add small description of block config determination in docs for gpu codegen --- docs/source/backend/gpu_codegen.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/source/backend/gpu_codegen.md b/docs/source/backend/gpu_codegen.md index 1082669e6..e0d9549c4 100644 --- a/docs/source/backend/gpu_codegen.md +++ b/docs/source/backend/gpu_codegen.md @@ -72,6 +72,12 @@ These depend on the type of the launch configuration: while the `AutomaticLaunchConfiguration` permits no modification and computes grid and block size directly from kernel parameters, the `ManualLaunchConfiguration` requires the user to manually specifiy both grid and block size. +The `DynamicBlockSizeLaunchConfiguration` permits the user to set a block size which is used to then dynamically compute +the grid size. However, the actual block size being used may differ from the user specification. This may origin from +trimming operations between the original block size and the iteration space to avoid spawning unnecessary large blocks. +Another factor affecting the actual block size is having block sizes that match with the warp size given by the hardware +for an improved performance. Here, the block size is rounded to a multiple of the warp size while considering the +maximum amount of threads per block. The `evaluate` method can only be used from within a Python runtime environment. 
When exporting pystencils CUDA kernels for external use in C++ projects, -- GitLab From f12d2f8b727b03859f4d8a4185b7c9cfd1a72264 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Tue, 4 Mar 2025 11:52:10 +0100 Subject: [PATCH 15/54] Fix lint --- src/pystencils/codegen/gpu_indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index 5bdb0b24a..72de41979 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -81,7 +81,7 @@ class GpuLaunchConfiguration(ABC): return ( *to_round[:index_to_round], round(to_round[index_to_round]), - *to_round[index_to_round + 1 :], + *to_round[index_to_round + 1:], ) else: return (*to_round[:index_to_round], round(to_round[index_to_round])) -- GitLab From 4ea0ea261c4512351e2decc83a5ddca33da5bbc2 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 5 Mar 2025 13:28:33 +0100 Subject: [PATCH 16/54] Move ceil_to_multiple to utils module --- src/pystencils/codegen/gpu_indexing.py | 5 +---- src/pystencils/utils.py | 5 +++++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index 72de41979..539e4b58b 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -20,7 +20,7 @@ from ..backend.platforms.cuda import ThreadMapping from ..backend.ast.expressions import PsExpression from math import prod -from ..sympyextensions.integer_functions import div_ceil +from ..utils import div_ceil, ceil_to_multiple from ..types.quick import UInt dim3 = tuple[int, int, int] @@ -161,9 +161,6 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration): return self._params def evaluate(self, **kwargs) -> tuple[dim3, dim3]: - def ceil_to_multiple(arg1, arg2): - return div_ceil(arg1 + arg2 - 1, arg2) * arg2 - # impossible to use block size determination function since the iteration space is unknown # -> round block size in fastest moving dimension up to multiple of warp size block_size = tuple(int(bs(**kwargs)) for bs in self._block_size) diff --git a/src/pystencils/utils.py b/src/pystencils/utils.py index 0049d0a2c..234f65a5e 100644 --- a/src/pystencils/utils.py +++ b/src/pystencils/utils.py @@ -316,3 +316,8 @@ def div_ceil(divident, divisor): The result is unspecified if either argument is negative.""" return c_intdiv(divident + divisor - 1, divisor) + + +def ceil_to_multiple(divident, divisor): + """Rounds 'divident' to the next multiple of 'divisor'.""" + return div_ceil(divident + divisor - 1, divisor) * divisor -- GitLab From 35e93e54291933b8bf65b0c6278e025f6238766c Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 5 Mar 2025 13:32:15 +0100 Subject: [PATCH 17/54] Omit lambda for warp size in GpuIndexing --- src/pystencils/codegen/gpu_indexing.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index 539e4b58b..557eb8ba4 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -24,7 +24,6 @@ from ..utils import div_ceil, ceil_to_multiple from ..types.quick import UInt dim3 = tuple[int, int, int] -_Dim1Lambda = tuple[Lambda] _Dim3Lambda = tuple[Lambda, Lambda, Lambda] @@ -146,7 +145,7 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration): """ def __init__( - self, 
block_size: _Dim3Lambda, grid_size: _Dim3Lambda, warp_size: _Dim1Lambda + self, block_size: _Dim3Lambda, grid_size: _Dim3Lambda, warp_size: int ) -> None: self._block_size = block_size self._grid_size = grid_size @@ -165,7 +164,7 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration): # -> round block size in fastest moving dimension up to multiple of warp size block_size = tuple(int(bs(**kwargs)) for bs in self._block_size) block_size = ( - ceil_to_multiple(block_size[0], int(self._warp_size[0](**kwargs))), + ceil_to_multiple(block_size[0], self._warp_size), *block_size[1:], ) @@ -425,15 +424,9 @@ class GpuIndexing: for _ in range(4 - rank) ) - warp_size = ( - self._kernel_factory.create_lambda( - PsExpression.make(PsConstant(self._warp_size, UInt(32))) - ), - ) - def factory(): return AutomaticLaunchConfiguration( - block_size, cast(_Dim3Lambda, grid_size), warp_size + block_size, cast(_Dim3Lambda, grid_size), self._warp_size ) return factory -- GitLab From 9891308beaea99fbe4ea4101199bdc00fc7c89b1 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 5 Mar 2025 13:36:53 +0100 Subject: [PATCH 18/54] Refactor condition for failure in check_sizes_and_return and emit reason in error message --- src/pystencils/codegen/gpu_indexing.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index 557eb8ba4..91f89bdbf 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -86,12 +86,13 @@ class GpuLaunchConfiguration(ABC): return (*to_round[:index_to_round], round(to_round[index_to_round])) def check_sizes_and_return(to_return: tuple[int, ...]) -> tuple[int, ...]: - if not all( - size < max_size for size, max_size in zip(to_return, max_block_sizes) + if any( + size >= max_size for size, max_size in zip(to_return, max_block_sizes) ) or prod(to_return) > max(max_block_sizes): raise CodegenError( f"Unable to determine GPU block size for this kernel for initial block size config" f"{block_size} and iteration space {it_space}." + f"Final block size was too large: {to_return}." ) return to_return -- GitLab From e3ca4e0546b88a3eaa7c5fae56de8c9722e19b1e Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Wed, 5 Mar 2025 13:38:55 +0100 Subject: [PATCH 19/54] Apply 2 suggestion(s) to 2 file(s) Co-authored-by: Frederik Hennig <frederik.hennig@fau.de> --- docs/source/backend/gpu_codegen.md | 8 ++++---- src/pystencils/codegen/config.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/source/backend/gpu_codegen.md b/docs/source/backend/gpu_codegen.md index e0d9549c4..70b9db75b 100644 --- a/docs/source/backend/gpu_codegen.md +++ b/docs/source/backend/gpu_codegen.md @@ -73,10 +73,10 @@ while the `AutomaticLaunchConfiguration` permits no modification and computes gr parameters, the `ManualLaunchConfiguration` requires the user to manually specifiy both grid and block size. The `DynamicBlockSizeLaunchConfiguration` permits the user to set a block size which is used to then dynamically compute -the grid size. However, the actual block size being used may differ from the user specification. This may origin from -trimming operations between the original block size and the iteration space to avoid spawning unnecessary large blocks. -Another factor affecting the actual block size is having block sizes that match with the warp size given by the hardware -for an improved performance. 
Here, the block size is rounded to a multiple of the warp size while considering the +the grid size. However, the actual block size being used may differ from the user specification. This may be due to +trimming operations between the original block size and the iteration space to avoid spawning unnecessarily large blocks. +Another factor affecting the actual block size is the need to have block sizes that match with the warp size given by the hardware +for improved performance. Here, the block size is rounded to a multiple of the warp size while considering the maximum amount of threads per block. The `evaluate` method can only be used from within a Python runtime environment. diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py index e369dde4a..803b4a7e6 100644 --- a/src/pystencils/codegen/config.py +++ b/src/pystencils/codegen/config.py @@ -397,7 +397,7 @@ class GpuOptions(ConfigBase): warp_size: BasicOption[int] = BasicOption() """Specifies the size of a warp (CUDA) or wavefront (HIP). - If set to `None` (the default), the default value for the given target will be automatically used. + If this option is not set the default value for the given target will be automatically used. """ @staticmethod -- GitLab From 221296727be890c48a34f3b4f0eaa905765d53eb Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 5 Mar 2025 15:35:25 +0100 Subject: [PATCH 20/54] Minor fix for parameters of automatic launch config --- src/pystencils/codegen/gpu_indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index 91f89bdbf..462cb0981 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -153,7 +153,7 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration): self._warp_size = warp_size self._params: frozenset[Parameter] = frozenset().union( - *(lb.parameters for lb in chain(block_size, grid_size, warp_size)) + *(lb.parameters for lb in chain(block_size, grid_size)) ) @property -- GitLab From 2b50014e48bd2755b9f25760bb7330836122ea5e Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 5 Mar 2025 16:58:11 +0100 Subject: [PATCH 21/54] Remove max_block_sizes config option and add getters for gpu targets in GpuIndexing. Encapsulate in HardwareProperties dataclass --- src/pystencils/codegen/config.py | 16 ----- src/pystencils/codegen/driver.py | 7 +- src/pystencils/codegen/gpu_indexing.py | 94 +++++++++++++++----------- 3 files changed, 57 insertions(+), 60 deletions(-) diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py index e369dde4a..40401045d 100644 --- a/src/pystencils/codegen/config.py +++ b/src/pystencils/codegen/config.py @@ -380,12 +380,6 @@ class GpuOptions(ConfigBase): The block size may be overridden at runtime. """ - max_block_sizes: BasicOption[tuple[int, int, int] | _AUTO_TYPE] = BasicOption() - """Maximum dimension size of a thread block in dimensions (x, y, z). - - If set to `None` (the default), the default value for the given target will be automatically used. - """ - manual_launch_grid: BasicOption[bool] = BasicOption(False) """Always require a manually specified launch grid when running this kernel. 
@@ -410,16 +404,6 @@ class GpuOptions(ConfigBase): f"No default warp/wavefront size known for target {target}" ) - @staticmethod - def default_maximum_block_sizes(target: Target): - match target: - case Target.CUDA: - return (1024, 1024, 64) - case _: - raise NotImplementedError( - f"No default value known for maximum GPU block sizes available on target {target}" - ) - @indexing_scheme.validate def _validate_idx_scheme(self, val: str | GpuIndexingScheme): if isinstance(val, GpuIndexingScheme): diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index 9404f9754..c70bc6c68 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -408,20 +408,15 @@ class DefaultKernelCreationDriver: block_size: dim3 | _AUTO_TYPE = self._cfg.gpu.get_option("block_size") manual_launch_grid: bool = self._cfg.gpu.get_option("manual_launch_grid") warp_size: int | None = self._cfg.gpu.get_option("warp_size") - max_block_sizes: tuple[int, int, int] | None = self._cfg.gpu.get_option( - "max_block_sizes" - ) if warp_size is None: warp_size = GpuOptions.default_warp_size(self._target) - if max_block_sizes is None: - max_block_sizes = GpuOptions.default_maximum_block_sizes(self._target) return GpuIndexing( self._ctx, + self._target, idx_scheme, warp_size, - max_block_sizes, block_size, manual_launch_grid, ) diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index 462cb0981..797458dd9 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -1,6 +1,7 @@ from __future__ import annotations from abc import ABC, abstractmethod +from dataclasses import dataclass from typing import cast, Any, Callable from itertools import chain @@ -8,7 +9,7 @@ from .functions import Lambda from .parameters import Parameter from .errors import CodegenError from .config import GpuIndexingScheme, _AUTO_TYPE -from ..backend.constants import PsConstant +from .target import Target from ..backend.kernelcreation import ( KernelCreationContext, @@ -20,13 +21,19 @@ from ..backend.platforms.cuda import ThreadMapping from ..backend.ast.expressions import PsExpression from math import prod -from ..utils import div_ceil, ceil_to_multiple -from ..types.quick import UInt +from ..utils import ceil_to_multiple dim3 = tuple[int, int, int] _Dim3Lambda = tuple[Lambda, Lambda, Lambda] +@dataclass +class HardwareProperties: + warp_size: int + max_threads_per_block: int + max_block_sizes: dim3 + + class GpuLaunchConfiguration(ABC): """Base class for launch configurations for CUDA and HIP kernels. 
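# Illustrative sketch, not part of the patch: what the limits bundled in the new
# HardwareProperties dataclass above mean for a candidate block size. The numbers
# (warp size 32, 1024 threads per block, per-dimension maxima (1024, 1024, 64))
# mirror the CUDA values returned by the getters added to GpuIndexing further
# below; the helper name is an assumption.
from math import prod

warp_size = 32
max_threads_per_block = 1024
max_block_sizes = (1024, 1024, 64)


def block_size_ok(block: tuple[int, int, int]) -> bool:
    # each dimension must respect its per-dimension maximum ...
    within_dims = all(b <= m for b, m in zip(block, max_block_sizes))
    # ... the total thread count must fit one block, and should be warp-aligned
    return (
        within_dims
        and prod(block) <= max_threads_per_block
        and prod(block) % warp_size == 0
    )


print(block_size_ok((128, 8, 1)))  # True: 1024 warp-aligned threads
print(block_size_ok((32, 32, 2)))  # False: 2048 threads exceed the per-block limit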
@@ -60,35 +67,32 @@ class GpuLaunchConfiguration(ABC): def determine_block_size( it_space: tuple[int, ...], block_size: tuple[int, int, int], - max_block_sizes: tuple[int, int, int], - warp_size: int, + hw_props: HardwareProperties ) -> tuple[int, ...]: """Returns an optimized block size configuration with block sizes being aligned with the warp size.""" def trim(to_trim: list[int]) -> list[int]: return [min(b, i) for b, i in zip(to_trim, it_space)] - def ceil_to_multiple(to_round: list[int]) -> tuple[int, ...]: - def round(val): - return div_ceil(val, warp_size) * warp_size - + def round_blocksizes(to_round: list[int]) -> tuple[int, ...]: # find index of element closest to warp size and round up index_to_round = to_round.index( - max(to_round, key=lambda i: abs(i % warp_size)) + max(to_round, key=lambda i: abs(i % hw_props.warp_size)) ) if index_to_round + 1 < len(to_round): return ( *to_round[:index_to_round], - round(to_round[index_to_round]), + ceil_to_multiple(to_round[index_to_round], hw_props.warp_size), *to_round[index_to_round + 1:], ) else: - return (*to_round[:index_to_round], round(to_round[index_to_round])) + return (*to_round[:index_to_round], + ceil_to_multiple(to_round[index_to_round], hw_props.warp_size)) def check_sizes_and_return(to_return: tuple[int, ...]) -> tuple[int, ...]: if any( - size >= max_size for size, max_size in zip(to_return, max_block_sizes) - ) or prod(to_return) > max(max_block_sizes): + size >= max_size for size, max_size in zip(to_return, hw_props.max_block_sizes) + ) or prod(to_return) > hw_props.max_threads_per_block: raise CodegenError( f"Unable to determine GPU block size for this kernel for initial block size config" f"{block_size} and iteration space {it_space}." @@ -97,7 +101,7 @@ class GpuLaunchConfiguration(ABC): return to_return trimmed = trim(list(block_size)) - if prod(trimmed) >= warp_size and prod(trimmed) % warp_size == 0: + if prod(trimmed) >= hw_props.warp_size and prod(trimmed) % hw_props.warp_size == 0: # case 1: greater than min block size -> use trimmed result return check_sizes_and_return(tuple(trimmed)) @@ -108,8 +112,8 @@ class GpuLaunchConfiguration(ABC): # case 2: trimmed block is equivalent to the whole iteration space if all(b == i for b, i in zip(trimmed, it_space)): - if prod(trimmed) % warp_size != 0: - return check_sizes_and_return(ceil_to_multiple(trimmed)) + if prod(trimmed) % hw_props.warp_size != 0: + return check_sizes_and_return(round_blocksizes(trimmed)) else: return check_sizes_and_return(tuple(trimmed)) else: @@ -120,20 +124,20 @@ class GpuLaunchConfiguration(ABC): # trim fastest moving dim to multiples of warp size if ( d == 0 - and trimmed[d] > warp_size - and trimmed[d] % warp_size != 0 + and trimmed[d] > hw_props.warp_size + and trimmed[d] % hw_props.warp_size != 0 ): # subtract remainder - trimmed[d] = trimmed[d] - (trimmed[d] % warp_size) + trimmed[d] = trimmed[d] - (trimmed[d] % hw_props.warp_size) # check if block sizes are within hardware capabilities - trimmed[d] = min(trimmed[d], max_block_sizes[d]) + trimmed[d] = min(trimmed[d], hw_props.max_block_sizes[d]) # trim again trimmed = trim(trimmed) # case 3: trim block is large enough - if prod(trimmed) >= warp_size: + if prod(trimmed) >= hw_props.warp_size and prod(trimmed) % hw_props.warp_size != 0: return check_sizes_and_return(tuple(trimmed)) raise CodegenError("Unable to determine GPU block size for this kernel.") @@ -146,11 +150,11 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration): """ def __init__( - self, block_size: _Dim3Lambda, 
grid_size: _Dim3Lambda, warp_size: int + self, block_size: _Dim3Lambda, grid_size: _Dim3Lambda, hw_props: HardwareProperties ) -> None: self._block_size = block_size self._grid_size = grid_size - self._warp_size = warp_size + self._hw_props = hw_props self._params: frozenset[Parameter] = frozenset().union( *(lb.parameters for lb in chain(block_size, grid_size)) @@ -165,7 +169,7 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration): # -> round block size in fastest moving dimension up to multiple of warp size block_size = tuple(int(bs(**kwargs)) for bs in self._block_size) block_size = ( - ceil_to_multiple(block_size[0], self._warp_size), + ceil_to_multiple(block_size[0], self._hw_props.warp_size), *block_size[1:], ) @@ -235,15 +239,12 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): def __init__( self, num_work_items: _Dim3Lambda, - warp_size: int, - max_block_sizes: dim3, + hw_props: HardwareProperties, default_block_size: dim3 | None = None, ) -> None: self._num_work_items = num_work_items - self._warp_size = warp_size - - self._max_block_sizes = max_block_sizes + self._hw_props = hw_props self._block_size: dim3 | None = default_block_size @@ -282,9 +283,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): ) reduced_block_size = cast( dim3, - self.determine_block_size( - num_work_items, self._block_size, self._max_block_sizes, self._warp_size - ), + self.determine_block_size(num_work_items, self._block_size, self._hw_props), ) grid_size = cast( dim3, @@ -318,25 +317,45 @@ class GpuIndexing: def __init__( self, ctx: KernelCreationContext, + target: Target, scheme: GpuIndexingScheme, warp_size: int, - max_block_sizes: dim3, default_block_size: dim3 | _AUTO_TYPE | None = None, manual_launch_grid: bool = False, ) -> None: self._ctx = ctx + self._target = target self._scheme = scheme self._warp_size = warp_size - self._max_block_sizes = max_block_sizes self._default_block_size = default_block_size self._manual_launch_grid = manual_launch_grid + self._hw_props = HardwareProperties(warp_size, + self.get_max_threads_per_block(target), + self.get_max_block_sizes(target)) + from ..backend.kernelcreation import AstFactory from .driver import KernelFactory self._ast_factory = AstFactory(self._ctx) self._kernel_factory = KernelFactory(self._ctx) + @staticmethod + def get_max_block_sizes(target: Target): + match target: + case Target.CUDA: + return (1024, 1024, 64) + case _: + raise CodegenError(f"Cannot determine max GPU block sizes for target {target}") + + @staticmethod + def get_max_threads_per_block(target: Target): + match target: + case Target.CUDA: + return 1024 + case _: + raise CodegenError(f"Cannot determine max GPU threads per block for target {target}") + def get_thread_mapping(self) -> ThreadMapping: """Retrieve a thread mapping object for use by the backend""" @@ -379,8 +398,7 @@ class GpuIndexing: def factory(): return DynamicBlockSizeLaunchConfiguration( num_work_items, - self._warp_size, - self._max_block_sizes, + self._hw_props, self._get_default_block_size(rank), ) @@ -427,7 +445,7 @@ class GpuIndexing: def factory(): return AutomaticLaunchConfiguration( - block_size, cast(_Dim3Lambda, grid_size), self._warp_size + block_size, cast(_Dim3Lambda, grid_size), self._hw_props ) return factory -- GitLab From 60d6323b795e35cefd9e521ee3e186c6fd074413 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 5 Mar 2025 17:01:00 +0100 Subject: [PATCH 22/54] Adapt formatting of gpu_indexing.py --- 
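A much-simplified, standalone sketch of what the block-size fitting logic above aims for: trim the requested block to the iteration space, then make the total thread count a multiple of the warp size. The in-tree `determine_block_size` additionally grows dimensions step by step and respects the hardware limits, so this is an illustration of the idea, not the actual algorithm:

```python
from math import prod

def ceil_to_multiple(value: int, divisor: int) -> int:
    """Round value up to the next multiple of divisor."""
    return -(-value // divisor) * divisor

def naive_fit(it_space: tuple[int, ...], block_size: tuple[int, ...], warp_size: int = 32):
    # 1) never request more threads per dimension than there are work items
    trimmed = [min(b, i) for b, i in zip(block_size, it_space)]
    # 2) if the total thread count is not a warp multiple, round one dimension up;
    #    the surplus threads are masked out by the kernel's range check
    if prod(trimmed) % warp_size != 0:
        trimmed[0] = ceil_to_multiple(trimmed[0], warp_size)
    return tuple(trimmed)

assert naive_fit((128, 128, 128), (8, 8, 4)) == (8, 8, 4)  # 256 threads, already warp-aligned
assert naive_fit((17, 3, 3), (8, 8, 4)) == (32, 3, 3)      # 72 threads rounded up to 288
```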
src/pystencils/codegen/gpu_indexing.py | 49 ++++++++++++++++++-------- 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index 797458dd9..ecc389f18 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -67,7 +67,7 @@ class GpuLaunchConfiguration(ABC): def determine_block_size( it_space: tuple[int, ...], block_size: tuple[int, int, int], - hw_props: HardwareProperties + hw_props: HardwareProperties, ) -> tuple[int, ...]: """Returns an optimized block size configuration with block sizes being aligned with the warp size.""" @@ -86,13 +86,19 @@ class GpuLaunchConfiguration(ABC): *to_round[index_to_round + 1:], ) else: - return (*to_round[:index_to_round], - ceil_to_multiple(to_round[index_to_round], hw_props.warp_size)) + return ( + *to_round[:index_to_round], + ceil_to_multiple(to_round[index_to_round], hw_props.warp_size), + ) def check_sizes_and_return(to_return: tuple[int, ...]) -> tuple[int, ...]: - if any( - size >= max_size for size, max_size in zip(to_return, hw_props.max_block_sizes) - ) or prod(to_return) > hw_props.max_threads_per_block: + if ( + any( + size >= max_size + for size, max_size in zip(to_return, hw_props.max_block_sizes) + ) + or prod(to_return) > hw_props.max_threads_per_block + ): raise CodegenError( f"Unable to determine GPU block size for this kernel for initial block size config" f"{block_size} and iteration space {it_space}." @@ -101,7 +107,10 @@ class GpuLaunchConfiguration(ABC): return to_return trimmed = trim(list(block_size)) - if prod(trimmed) >= hw_props.warp_size and prod(trimmed) % hw_props.warp_size == 0: + if ( + prod(trimmed) >= hw_props.warp_size + and prod(trimmed) % hw_props.warp_size == 0 + ): # case 1: greater than min block size -> use trimmed result return check_sizes_and_return(tuple(trimmed)) @@ -137,7 +146,10 @@ class GpuLaunchConfiguration(ABC): trimmed = trim(trimmed) # case 3: trim block is large enough - if prod(trimmed) >= hw_props.warp_size and prod(trimmed) % hw_props.warp_size != 0: + if ( + prod(trimmed) >= hw_props.warp_size + and prod(trimmed) % hw_props.warp_size != 0 + ): return check_sizes_and_return(tuple(trimmed)) raise CodegenError("Unable to determine GPU block size for this kernel.") @@ -150,7 +162,10 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration): """ def __init__( - self, block_size: _Dim3Lambda, grid_size: _Dim3Lambda, hw_props: HardwareProperties + self, + block_size: _Dim3Lambda, + grid_size: _Dim3Lambda, + hw_props: HardwareProperties, ) -> None: self._block_size = block_size self._grid_size = grid_size @@ -330,9 +345,11 @@ class GpuIndexing: self._default_block_size = default_block_size self._manual_launch_grid = manual_launch_grid - self._hw_props = HardwareProperties(warp_size, - self.get_max_threads_per_block(target), - self.get_max_block_sizes(target)) + self._hw_props = HardwareProperties( + warp_size, + self.get_max_threads_per_block(target), + self.get_max_block_sizes(target), + ) from ..backend.kernelcreation import AstFactory from .driver import KernelFactory @@ -346,7 +363,9 @@ class GpuIndexing: case Target.CUDA: return (1024, 1024, 64) case _: - raise CodegenError(f"Cannot determine max GPU block sizes for target {target}") + raise CodegenError( + f"Cannot determine max GPU block sizes for target {target}" + ) @staticmethod def get_max_threads_per_block(target: Target): @@ -354,7 +373,9 @@ class GpuIndexing: case Target.CUDA: return 1024 case _: - raise 
CodegenError(f"Cannot determine max GPU threads per block for target {target}") + raise CodegenError( + f"Cannot determine max GPU threads per block for target {target}" + ) def get_thread_mapping(self) -> ThreadMapping: """Retrieve a thread mapping object for use by the backend""" -- GitLab From 26ba005a4ba9e2f50f446572c531b2539ee4a74f Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 6 Mar 2025 15:52:03 +0100 Subject: [PATCH 23/54] Slightly adapt block size fitting algorithm --- src/pystencils/codegen/gpu_indexing.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index ecc389f18..7edb7c8e6 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -94,14 +94,14 @@ class GpuLaunchConfiguration(ABC): def check_sizes_and_return(to_return: tuple[int, ...]) -> tuple[int, ...]: if ( any( - size >= max_size + size > max_size for size, max_size in zip(to_return, hw_props.max_block_sizes) ) or prod(to_return) > hw_props.max_threads_per_block ): raise CodegenError( - f"Unable to determine GPU block size for this kernel for initial block size config" - f"{block_size} and iteration space {it_space}." + f"Unable to determine GPU block size for this kernel for initial block size config " + f"{block_size} and iteration space {it_space}. " f"Final block size was too large: {to_return}." ) return to_return @@ -148,7 +148,7 @@ class GpuLaunchConfiguration(ABC): # case 3: trim block is large enough if ( prod(trimmed) >= hw_props.warp_size - and prod(trimmed) % hw_props.warp_size != 0 + and prod(trimmed) % hw_props.warp_size == 0 ): return check_sizes_and_return(tuple(trimmed)) -- GitLab From 59903a15073b945d7d5d9e8c5aecf8792858aac4 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 6 Mar 2025 15:53:06 +0100 Subject: [PATCH 24/54] Add test checking generated GPU launch configurations --- tests/kernelcreation/test_gpu.py | 69 ++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/tests/kernelcreation/test_gpu.py b/tests/kernelcreation/test_gpu.py index ad3d2bd3e..5ddb2978c 100644 --- a/tests/kernelcreation/test_gpu.py +++ b/tests/kernelcreation/test_gpu.py @@ -12,6 +12,7 @@ from pystencils import ( create_kernel, Target, ) +from pystencils.codegen.gpu_indexing import GpuIndexing, HardwareProperties from pystencils.slicing import ( add_ghost_layers, @@ -19,6 +20,7 @@ from pystencils.slicing import ( remove_ghost_layers, normalize_slice, ) +from math import prod try: import cupy as cp @@ -72,6 +74,73 @@ def test_indexing_options(indexing_scheme: str, manual_grid: bool): cp.testing.assert_allclose(dst_arr, expected) +@pytest.mark.parametrize("indexing_scheme", ["linear3d", "blockwise4d"]) +@pytest.mark.parametrize("iteration_space", + [(8, 4, 4), (3, 8, 8), (3, 3, 16), (17, 3, 3), (3, 12, 56), (65, 65, 65)]) +@pytest.mark.parametrize("initial_block_size", + [(8, 4, 4), (3, 8, 8), (3, 3, 16), (8, 2, 1), (3, 1, 32), (32, 1, 1), (1, 2, 3)]) +def test_indexing_launch_config( + indexing_scheme: str, + iteration_space: tuple[int, int, int], + initial_block_size: tuple[int, int, int] +): + + src, dst = fields("src, dst: [3D]") + asm = Assignment( + dst.center(), + src[-1, 0, 0] + + src[1, 0, 0] + + src[0, -1, 0] + + src[0, 1, 0] + + src[0, 0, -1] + + src[0, 0, 1], + ) + + target = Target.CUDA + cfg = CreateKernelConfig(target=target) + cfg.gpu.indexing_scheme = indexing_scheme + + ast = 
create_kernel(asm, cfg) + kernel = ast.compile() + + warp_size = cfg.gpu.default_warp_size(target) + max_threads_per_block = GpuIndexing.get_max_threads_per_block(target) + max_block_sizes = GpuIndexing.get_max_block_sizes(target) + + block_size = kernel.launch_config.determine_block_size( + iteration_space, + initial_block_size, + HardwareProperties(warp_size, max_threads_per_block, max_block_sizes) + ) + + def check_suitability(b): + return prod(b) >= warp_size and prod(b) % warp_size == 0 + + # block size fitting should not modify an already ideal configuration + # -> check if ideal configurations are modified + if ( + check_suitability(initial_block_size) + and all(x == y for x, y in zip(initial_block_size, iteration_space)) # trimming may alter results + ): + assert all(x == y for x, y in zip(initial_block_size, block_size)), \ + f"Initial block size unnecessarily adapted from {initial_block_size} to {block_size}." + + assert check_suitability(block_size), \ + "Determined block size shall be divisible by warp size." + + kernel.launch_config.block_size = block_size + + src_arr = cp.ones(iteration_space) + dst_arr = cp.zeros_like(src_arr) + + kernel(src=src_arr, dst=dst_arr) + + expected = cp.zeros_like(src_arr) + expected[1:-1, 1:-1, 1:-1].fill(6.0) + + cp.testing.assert_allclose(dst_arr, expected) + + def test_invalid_indexing_schemes(): src, dst = fields("src, dst: [4D]") asm = Assignment(src.center(0), dst.center(0)) -- GitLab From 5c840aa50a1c013e44e4709cef5764130aaf203a Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 6 Mar 2025 16:45:38 +0100 Subject: [PATCH 25/54] Rename determine_block_size to fit_block_size --- src/pystencils/codegen/gpu_indexing.py | 4 ++-- tests/kernelcreation/test_gpu.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index 7edb7c8e6..3e41088c9 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -64,7 +64,7 @@ class GpuLaunchConfiguration(ABC): cache is invalidated.""" @staticmethod - def determine_block_size( + def fit_block_size( it_space: tuple[int, ...], block_size: tuple[int, int, int], hw_props: HardwareProperties, @@ -298,7 +298,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): ) reduced_block_size = cast( dim3, - self.determine_block_size(num_work_items, self._block_size, self._hw_props), + self.fit_block_size(num_work_items, self._block_size, self._hw_props), ) grid_size = cast( dim3, diff --git a/tests/kernelcreation/test_gpu.py b/tests/kernelcreation/test_gpu.py index 5ddb2978c..73f3ff3d2 100644 --- a/tests/kernelcreation/test_gpu.py +++ b/tests/kernelcreation/test_gpu.py @@ -107,7 +107,7 @@ def test_indexing_launch_config( max_threads_per_block = GpuIndexing.get_max_threads_per_block(target) max_block_sizes = GpuIndexing.get_max_block_sizes(target) - block_size = kernel.launch_config.determine_block_size( + block_size = kernel.launch_config.fit_block_size( iteration_space, initial_block_size, HardwareProperties(warp_size, max_threads_per_block, max_block_sizes) -- GitLab From 946eccef8981bacd505b88c4d4c2abb3da1b0d60 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 6 Mar 2025 16:46:01 +0100 Subject: [PATCH 26/54] Adapt docstring for DynamicBlockSizeLaunchConfiguration --- src/pystencils/codegen/gpu_indexing.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git 
a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index 3e41088c9..b06bc4e14 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -243,12 +243,13 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration): class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): """GPU launch configuration that permits the user to set a block size and dynamically computes the grid size. - The actual launch grid size is computed from the user-defined ``user_block_size`` and the number of work items - in the kernel's iteration space as follows. + In case the user-specified block size does not coincide with multiples of the warp size of the GPU, + it will be altered by a block size fitting algorithm. + + The actual launch grid size is then computed as follows. For each dimension :math:`c \\in \\{ x, y, z \\}`, - - if ``user_block_size.c > num_work_items.c``, ``block_size = num_work_items.c`` and ``grid_size.c = 1``; - - otherwise, ``block_size.c = user_block_size.c`` and ``grid_size.c = ceil(num_work_items.c / block_size.c)``. + ``block_size.c = fit_block_size(user_block_size.c)`` and ``grid_size.c = ceil(num_work_items.c / block_size.c)``. """ def __init__( -- GitLab From a5f511376461897de7e90485650a55e2696019ac Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 6 Mar 2025 17:01:27 +0100 Subject: [PATCH 27/54] Small adaptations to gpu_kernels.md user manual --- docs/source/user_manual/gpu_kernels.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/docs/source/user_manual/gpu_kernels.md b/docs/source/user_manual/gpu_kernels.md index 610c61ddf..8e102c642 100644 --- a/docs/source/user_manual/gpu_kernels.md +++ b/docs/source/user_manual/gpu_kernels.md @@ -111,11 +111,16 @@ For most kernels with an at most three-dimensional iteration space, this behavior is sufficient and desired. It can be enforced by setting `gpu.indexing_scheme = "Linear3D"`. -If the `Linear3D` indexing scheme is used, you may modifiy the GPU thread block size in two places. +If the `Linear3D` indexing scheme is used, you may modify the GPU thread block size in two places. The default block size for the kernel can be set via the {any}`gpu.block_size <GpuOptions.block_size>` code generator option; if none is specified, a default depending on the iteration space's dimensionality will be used. +For performance-related reasons, the block size shall coincide with multiples of the hardware's warp size. +We enforce this by applying the fitting algorithm found in `GpuLaunchConfiguration.fit_block_size` +to alter the initial block size configuration. +This modification only occurs if the initial configuration is not optimal. + The block size can furthermore be modified at the compiled kernel's wrapper object via the `launch_config.block_size` attribute: @@ -126,6 +131,8 @@ kfunc.launch_config.block_size = (256, 2, 1) # Run the kernel kfunc(f=f_arr, g=g_arr) ``` +Please note that when modifying the block size directly, it must align with the warp size, +since this way of modification trespasses our fitting algorithm. In any case. pystencils will automatically compute the grid size from the shapes of the kernel's array arguments and the given thread block size. 
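As the last sentence of the documentation hunk above says, the launch grid is derived from the iteration space and the thread block size; the arithmetic is plain ceiling division per dimension. A small standalone sketch of that computation (illustrative helpers, not the `pystencils.utils` functions themselves):

```python
def div_ceil(dividend: int, divisor: int) -> int:
    """Ceiling integer division."""
    return -(-dividend // divisor)

def launch_grid(num_work_items: tuple[int, ...], block_size: tuple[int, ...]):
    # one block per ceil(work_items / block_size) in every dimension
    return tuple(div_ceil(n, b) for n, b in zip(num_work_items, block_size))

# e.g. a 130x130x130 iteration space with the block size chosen above:
assert launch_grid((130, 130, 130), (256, 2, 1)) == (1, 65, 130)
```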
-- GitLab From a0cc7e8dae0b52e0c34959f599e5c61246d022e4 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Fri, 7 Mar 2025 13:25:12 +0100 Subject: [PATCH 28/54] Fix ceil_to_multiple --- src/pystencils/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pystencils/utils.py b/src/pystencils/utils.py index 234f65a5e..5ef13f31f 100644 --- a/src/pystencils/utils.py +++ b/src/pystencils/utils.py @@ -320,4 +320,4 @@ def div_ceil(divident, divisor): def ceil_to_multiple(divident, divisor): """Rounds 'divident' to the next multiple of 'divisor'.""" - return div_ceil(divident + divisor - 1, divisor) * divisor + return div_ceil(divident, divisor) * divisor -- GitLab From 09fcfd3d5c496f4dea4682a0ac6f7629b050caaa Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Fri, 7 Mar 2025 13:28:14 +0100 Subject: [PATCH 29/54] Terminate block size fitting if block size is large enough and round to next multiple of warp size --- src/pystencils/codegen/gpu_indexing.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index b06bc4e14..89b339990 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -74,7 +74,11 @@ class GpuLaunchConfiguration(ABC): def trim(to_trim: list[int]) -> list[int]: return [min(b, i) for b, i in zip(to_trim, it_space)] - def round_blocksizes(to_round: list[int]) -> tuple[int, ...]: + def round_block_sizes(to_round: list[int]) -> tuple[int, ...]: + # check if already aligns with warp size + if prod(to_round) % hw_props.warp_size == 0: + return tuple(to_round) + # find index of element closest to warp size and round up index_to_round = to_round.index( max(to_round, key=lambda i: abs(i % hw_props.warp_size)) @@ -121,10 +125,7 @@ class GpuLaunchConfiguration(ABC): # case 2: trimmed block is equivalent to the whole iteration space if all(b == i for b, i in zip(trimmed, it_space)): - if prod(trimmed) % hw_props.warp_size != 0: - return check_sizes_and_return(round_blocksizes(trimmed)) - else: - return check_sizes_and_return(tuple(trimmed)) + return check_sizes_and_return(round_block_sizes(trimmed)) else: # double block size in each dimension until block is large enough (or case 2 triggers) for d in resize_order: @@ -146,11 +147,8 @@ class GpuLaunchConfiguration(ABC): trimmed = trim(trimmed) # case 3: trim block is large enough - if ( - prod(trimmed) >= hw_props.warp_size - and prod(trimmed) % hw_props.warp_size == 0 - ): - return check_sizes_and_return(tuple(trimmed)) + if prod(trimmed) >= hw_props.warp_size: + return check_sizes_and_return(round_block_sizes(trimmed)) raise CodegenError("Unable to determine GPU block size for this kernel.") -- GitLab From 22f45248be16142ca2c56a7def2004c8c762caf3 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Fri, 7 Mar 2025 13:29:19 +0100 Subject: [PATCH 30/54] Add more edge cases to block size fitting test --- tests/kernelcreation/test_gpu.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/kernelcreation/test_gpu.py b/tests/kernelcreation/test_gpu.py index 73f3ff3d2..e130db608 100644 --- a/tests/kernelcreation/test_gpu.py +++ b/tests/kernelcreation/test_gpu.py @@ -76,9 +76,9 @@ def test_indexing_options(indexing_scheme: str, manual_grid: bool): @pytest.mark.parametrize("indexing_scheme", ["linear3d", "blockwise4d"]) @pytest.mark.parametrize("iteration_space", - [(8, 4, 4), (3, 8, 
8), (3, 3, 16), (17, 3, 3), (3, 12, 56), (65, 65, 65)]) + [(8, 4, 4), (3, 8, 8), (3, 3, 16), (17, 3, 3), (3, 12, 56), (65, 65, 65), (3, 7, 9)]) @pytest.mark.parametrize("initial_block_size", - [(8, 4, 4), (3, 8, 8), (3, 3, 16), (8, 2, 1), (3, 1, 32), (32, 1, 1), (1, 2, 3)]) + [(8, 4, 4), (3, 8, 8), (3, 3, 16), (2, 2, 64), (8, 2, 1), (3, 1, 32), (32, 1, 1), (1, 2, 3)]) def test_indexing_launch_config( indexing_scheme: str, iteration_space: tuple[int, int, int], -- GitLab From 88485c6b57df0200516ef29d8936c7a4eedd26b0 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Mon, 10 Mar 2025 18:35:16 +0100 Subject: [PATCH 31/54] Distribute default block size cubicly over dims --- src/pystencils/codegen/gpu_indexing.py | 4 ++-- src/pystencils/jit/gpu_cupy.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index 5bdb0b24a..e0d655225 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -398,9 +398,9 @@ class GpuIndexing: case 1: return (256, 1, 1) case 2: - return (128, 2, 1) + return (16, 16, 1) case 3: - return (128, 2, 2) + return (8, 8, 8) case _: assert False, "unreachable code" else: diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py index 1c771a427..1fddf6f77 100644 --- a/src/pystencils/jit/gpu_cupy.py +++ b/src/pystencils/jit/gpu_cupy.py @@ -207,7 +207,7 @@ class CupyKernelWrapper(KernelWrapper): class CupyJit(JitBase): - def __init__(self, default_block_size: Sequence[int] = (128, 2, 1)): + def __init__(self, default_block_size: Sequence[int] = (16, 16, 1)): self._runtime_headers = {"<cstdint>"} if len(default_block_size) > 3: -- GitLab From 4728c8e70217ca51214420d62ace2ff44fade312 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Mon, 10 Mar 2025 18:54:43 +0100 Subject: [PATCH 32/54] Use default block size if block size fitting fails --- src/pystencils/codegen/gpu_indexing.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index a29c2f7bf..3a05d9ea6 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -4,6 +4,7 @@ from abc import ABC, abstractmethod from dataclasses import dataclass from typing import cast, Any, Callable from itertools import chain +from warnings import warn from .functions import Lambda from .parameters import Parameter @@ -295,10 +296,15 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): num_work_items = cast( dim3, tuple(int(wit(**kwargs)) for wit in self._num_work_items) ) - reduced_block_size = cast( - dim3, - self.fit_block_size(num_work_items, self._block_size, self._hw_props), - ) + try: + reduced_block_size = cast( + dim3, + self.fit_block_size(num_work_items, self._block_size, self._hw_props), + ) + except CodegenError: + warn(f"Block size fitting could not determine optimal block size configuration. 
" + f"Defaulting back to {self._block_size}") + reduced_block_size = self._block_size grid_size = cast( dim3, tuple( -- GitLab From 0d8e962adaa969bfc17928da15772a1ee3d6b5f0 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Tue, 11 Mar 2025 12:58:22 +0100 Subject: [PATCH 33/54] Adapt 3d default gpu block size to reduce register pressure --- src/pystencils/codegen/gpu_indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index cef20b3f6..018b62c58 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -446,7 +446,7 @@ class GpuIndexing: case 2: return (16, 16, 1) case 3: - return (8, 8, 8) + return (8, 8, 4) case _: assert False, "unreachable code" else: -- GitLab From 2624d819df3f82f42b6f196f5671ef8b95227447 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Tue, 11 Mar 2025 19:32:53 +0100 Subject: [PATCH 34/54] Introduce assume_warp_aligned_block_size & use_block_size_fitting options --- src/pystencils/codegen/config.py | 17 +++ src/pystencils/codegen/driver.py | 4 + src/pystencils/codegen/gpu_indexing.py | 169 +++++++++++++++++-------- tests/kernelcreation/test_gpu.py | 59 +++++---- 4 files changed, 173 insertions(+), 76 deletions(-) diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py index cbc1b0bdb..37263e058 100644 --- a/src/pystencils/codegen/config.py +++ b/src/pystencils/codegen/config.py @@ -394,6 +394,23 @@ class GpuOptions(ConfigBase): If this option is not set the default value for the given target will be automatically used. """ + assume_warp_aligned_block_size: BasicOption[bool] = BasicOption(False) + """Specifies whether block sizes are divisible by the hardware's warp size. + + If set to `True`, the code generator can employ optimizations that require this assumption, + e.g. warp-level reductions. + The generator also checks if user-provided block sizes fulfill this criterion. + """ + + use_block_size_fitting: BasicOption[bool] = BasicOption(False) + """Experimental. Specifies if a fitting algorithm should be used at runtime to + compute optimal block sizes that align with the warp size, + i.e. their product is divisible by the warp size. + + This option only takes effect for `DynamicBlockSizeLaunchConfiguration`'s, + i.e. if `Linear3D <GpuIndexingScheme.Linear3D>` is chosen as an indexing scheme. 
+ """ + @staticmethod def default_warp_size(target: Target): match target: diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index c70bc6c68..d47b889c7 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -407,6 +407,8 @@ class DefaultKernelCreationDriver: idx_scheme: GpuIndexingScheme = self._cfg.gpu.get_option("indexing_scheme") block_size: dim3 | _AUTO_TYPE = self._cfg.gpu.get_option("block_size") manual_launch_grid: bool = self._cfg.gpu.get_option("manual_launch_grid") + use_block_size_fitting: bool = self._cfg.gpu.get_option("use_block_size_fitting") + assume_warp_aligned_block_size: bool = self._cfg.gpu.get_option("assume_warp_aligned_block_size") warp_size: int | None = self._cfg.gpu.get_option("warp_size") if warp_size is None: @@ -419,6 +421,8 @@ class DefaultKernelCreationDriver: warp_size, block_size, manual_launch_grid, + use_block_size_fitting, + assume_warp_aligned_block_size, ) def _get_platform(self) -> Platform: diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index 018b62c58..204737981 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -65,51 +65,77 @@ class GpuLaunchConfiguration(ABC): cache is invalidated.""" @staticmethod + def block_size_exceeds_hw_limits( + block_size: tuple[int, ...], + hw_props: HardwareProperties) -> bool: + """Checks if provided block size conforms limits given by the hardware.""" + + return any( + size > max_size + for size, max_size in zip(block_size, hw_props.max_block_sizes) + ) or prod(block_size) > hw_props.max_threads_per_block + + @staticmethod + def _gen_error_msg(block_size: tuple[int, ...]): + return f"Unable to determine GPU block size for this kernel. \ + Final block size was too large: {block_size}." + + @staticmethod + def _round_block_sizes_to_warp_size(to_round: list[int], warp_size: int) -> tuple[int, ...]: + # check if already aligns with warp size + if prod(to_round) % warp_size == 0: + return tuple(to_round) + + # find index of element closest to warp size and round up + index_to_round = to_round.index( + max(to_round, key=lambda i: abs(i % warp_size)) + ) + if index_to_round + 1 < len(to_round): + return ( + *to_round[:index_to_round], + ceil_to_multiple(to_round[index_to_round], warp_size), + *to_round[index_to_round + 1:], + ) + else: + return ( + *to_round[:index_to_round], + ceil_to_multiple(to_round[index_to_round], warp_size), + ) + + @classmethod + def trim_block_size( + cls, + it_space: tuple[int, ...], + block_size: tuple[int, ...], + hw_props: HardwareProperties, + ) -> tuple[int, ...]: + """Returns specified block sizes trimmed with iteration space. + Raises `CodegenError` if trimmed block size does not conform hardware limits. + """ + + ret = tuple([min(b, i) for b, i in zip(block_size, it_space)]) + if cls.block_size_exceeds_hw_limits(ret, hw_props): + raise CodegenError(cls._gen_error_msg(ret)) + return ret + + @classmethod def fit_block_size( + cls, it_space: tuple[int, ...], block_size: tuple[int, int, int], hw_props: HardwareProperties, ) -> tuple[int, ...]: - """Returns an optimized block size configuration with block sizes being aligned with the warp size.""" + """Returns an optimized block size configuration with block sizes being aligned with the warp size. + Raises `CodegenError` if optimal block size could not be found or does not conform hardware limits. 
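From user code, the two options added in this patch (and read by the driver in the hunk above) would be enabled as shown below. This is a usage sketch of the API at this point in the series; later patches rework these options again.

```python
from pystencils import CreateKernelConfig, Target

cfg = CreateKernelConfig(target=Target.CUDA)
cfg.gpu.indexing_scheme = "linear3d"
# promise the generator that block sizes will be warp-aligned
cfg.gpu.assume_warp_aligned_block_size = True
# let the dynamic launch configuration fit the block size at runtime
cfg.gpu.use_block_size_fitting = True
```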
+ """ def trim(to_trim: list[int]) -> list[int]: return [min(b, i) for b, i in zip(to_trim, it_space)] - def round_block_sizes(to_round: list[int]) -> tuple[int, ...]: - # check if already aligns with warp size - if prod(to_round) % hw_props.warp_size == 0: - return tuple(to_round) - - # find index of element closest to warp size and round up - index_to_round = to_round.index( - max(to_round, key=lambda i: abs(i % hw_props.warp_size)) - ) - if index_to_round + 1 < len(to_round): - return ( - *to_round[:index_to_round], - ceil_to_multiple(to_round[index_to_round], hw_props.warp_size), - *to_round[index_to_round + 1:], - ) - else: - return ( - *to_round[:index_to_round], - ceil_to_multiple(to_round[index_to_round], hw_props.warp_size), - ) - - def check_sizes_and_return(to_return: tuple[int, ...]) -> tuple[int, ...]: - if ( - any( - size > max_size - for size, max_size in zip(to_return, hw_props.max_block_sizes) - ) - or prod(to_return) > hw_props.max_threads_per_block - ): - raise CodegenError( - f"Unable to determine GPU block size for this kernel for initial block size config " - f"{block_size} and iteration space {it_space}. " - f"Final block size was too large: {to_return}." - ) - return to_return + def check_sizes_and_return(ret: tuple[int, ...]) -> tuple[int, ...]: + if cls.block_size_exceeds_hw_limits(ret, hw_props): + raise CodegenError(cls._gen_error_msg(ret)) + return ret trimmed = trim(list(block_size)) if ( @@ -126,7 +152,7 @@ class GpuLaunchConfiguration(ABC): # case 2: trimmed block is equivalent to the whole iteration space if all(b == i for b, i in zip(trimmed, it_space)): - return check_sizes_and_return(round_block_sizes(trimmed)) + return check_sizes_and_return(cls._round_block_sizes_to_warp_size(trimmed, hw_props.warp_size)) else: # double block size in each dimension until block is large enough (or case 2 triggers) for d in resize_order: @@ -149,7 +175,7 @@ class GpuLaunchConfiguration(ABC): # case 3: trim block is large enough if prod(trimmed) >= hw_props.warp_size: - return check_sizes_and_return(round_block_sizes(trimmed)) + return check_sizes_and_return(cls._round_block_sizes_to_warp_size(trimmed, hw_props.warp_size)) raise CodegenError("Unable to determine GPU block size for this kernel.") @@ -165,10 +191,12 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration): block_size: _Dim3Lambda, grid_size: _Dim3Lambda, hw_props: HardwareProperties, + assume_warp_aligned_block_size: bool, ) -> None: self._block_size = block_size self._grid_size = grid_size self._hw_props = hw_props + self._assume_warp_aligned_block_size = assume_warp_aligned_block_size self._params: frozenset[Parameter] = frozenset().union( *(lb.parameters for lb in chain(block_size, grid_size)) @@ -183,10 +211,14 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration): # -> round block size in fastest moving dimension up to multiple of warp size block_size = tuple(int(bs(**kwargs)) for bs in self._block_size) block_size = ( - ceil_to_multiple(block_size[0], self._hw_props.warp_size), + ceil_to_multiple(block_size[0], self._hw_props.warp_size) + if self._assume_warp_aligned_block_size else block_size[0], *block_size[1:], ) + if self.block_size_exceeds_hw_limits(block_size, self._hw_props): + raise CodegenError(f"Block size {block_size} exceeds hardware limits.") + grid_size = tuple(int(gs(**kwargs)) for gs in self._grid_size) return cast(dim3, block_size), cast(dim3, grid_size) @@ -202,7 +234,13 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration): def __init__( self, + hw_props: 
HardwareProperties, + assume_warp_aligned_block_size: bool = False ) -> None: + self._assume_warp_aligned_block_size = assume_warp_aligned_block_size + + self._hw_props = hw_props + self._block_size: dim3 | None = None self._grid_size: dim3 | None = None @@ -233,6 +271,10 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration): if self._grid_size is None: raise AttributeError("No GPU grid size was set by the user.") + if self._assume_warp_aligned_block_size and prod(self._block_size) % self._hw_props.warp_size != 0: + raise CodegenError("Specified block sizes must align with warp size with " + "`assume_warp_aligned_block_size` enabled.") + return self._block_size, self._grid_size def jit_cache_key(self) -> Any: @@ -255,12 +297,17 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): self, num_work_items: _Dim3Lambda, hw_props: HardwareProperties, + use_block_size_fitting: bool, + assume_warp_aligned_block_size: bool, default_block_size: dim3 | None = None, ) -> None: self._num_work_items = num_work_items self._hw_props = hw_props + self._use_block_size_fitting = use_block_size_fitting + self._assume_warp_aligned_block_size = assume_warp_aligned_block_size + self._block_size: dim3 | None = default_block_size self._params: frozenset[Parameter] = frozenset().union( @@ -296,23 +343,33 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): num_work_items = cast( dim3, tuple(int(wit(**kwargs)) for wit in self._num_work_items) ) + + computed_block_size: tuple[int, ...] try: - reduced_block_size = cast( - dim3, - self.fit_block_size(num_work_items, self._block_size, self._hw_props), - ) - except CodegenError: - warn(f"Block size fitting could not determine optimal block size configuration. " + if self._use_block_size_fitting: + computed_block_size = self.fit_block_size(num_work_items, self._block_size, self._hw_props) + else: + computed_block_size = self.trim_block_size(num_work_items, self._block_size, self._hw_props) + + # check if assumption for warp size alignment is met + if self._assume_warp_aligned_block_size and prod(computed_block_size) % self._hw_props.warp_size != 0: + raise CodegenError("Adapted block size is not divisible by warp size.") + except CodegenError as e: + warn(f"CodeGenError occurred: {getattr(e, 'message', repr(e))}. " + f"Block size fitting could not determine optimal block size configuration. 
" f"Defaulting back to {self._block_size}") - reduced_block_size = self._block_size + computed_block_size = self._block_size + + adapted_block_size = cast(dim3, computed_block_size) + grid_size = cast( dim3, tuple( - div_ceil(wit, bs) for wit, bs in zip(num_work_items, reduced_block_size) + div_ceil(wit, bs) for wit, bs in zip(num_work_items, adapted_block_size) ), ) - return reduced_block_size, grid_size + return adapted_block_size, grid_size def jit_cache_key(self) -> Any: return self._block_size @@ -342,6 +399,8 @@ class GpuIndexing: warp_size: int, default_block_size: dim3 | _AUTO_TYPE | None = None, manual_launch_grid: bool = False, + use_block_size_fitting: bool = False, + assume_warp_aligned_block_size: bool = False, ) -> None: self._ctx = ctx self._target = target @@ -349,6 +408,8 @@ class GpuIndexing: self._warp_size = warp_size self._default_block_size = default_block_size self._manual_launch_grid = manual_launch_grid + self._use_block_size_fitting = use_block_size_fitting + self._assume_warp_aligned_block_size = assume_warp_aligned_block_size self._hw_props = HardwareProperties( warp_size, @@ -396,7 +457,10 @@ class GpuIndexing: def get_launch_config_factory(self) -> Callable[[], GpuLaunchConfiguration]: """Retrieve a factory for the launch configuration for later consumption by the runtime system""" if self._manual_launch_grid: - return ManualLaunchConfiguration + def factory(): + return ManualLaunchConfiguration(self._hw_props, self._assume_warp_aligned_block_size) + + return factory match self._scheme: case GpuIndexingScheme.Linear3D: @@ -430,6 +494,8 @@ class GpuIndexing: return DynamicBlockSizeLaunchConfiguration( num_work_items, self._hw_props, + self._use_block_size_fitting, + self._assume_warp_aligned_block_size, self._get_default_block_size(rank), ) @@ -476,7 +542,10 @@ class GpuIndexing: def factory(): return AutomaticLaunchConfiguration( - block_size, cast(_Dim3Lambda, grid_size), self._hw_props + block_size, + cast(_Dim3Lambda, grid_size), + self._hw_props, + self._assume_warp_aligned_block_size ) return factory diff --git a/tests/kernelcreation/test_gpu.py b/tests/kernelcreation/test_gpu.py index 8dbeadb7f..6cbc8f813 100644 --- a/tests/kernelcreation/test_gpu.py +++ b/tests/kernelcreation/test_gpu.py @@ -75,12 +75,16 @@ def test_indexing_options_3d(indexing_scheme: str, manual_grid: bool): @pytest.mark.parametrize("indexing_scheme", ["linear3d", "blockwise4d"]) +@pytest.mark.parametrize("use_block_size_fitting", [True, False]) +@pytest.mark.parametrize("assume_warp_aligned_block_size", [True, False]) @pytest.mark.parametrize("iteration_space", [(8, 4, 4), (3, 8, 8), (3, 3, 16), (17, 3, 3), (3, 12, 56), (65, 65, 65), (3, 7, 9)]) @pytest.mark.parametrize("initial_block_size", [(8, 4, 4), (3, 8, 8), (3, 3, 16), (2, 2, 64), (8, 2, 1), (3, 1, 32), (32, 1, 1), (1, 2, 3)]) def test_indexing_launch_config( indexing_scheme: str, + use_block_size_fitting: bool, + assume_warp_aligned_block_size: bool, iteration_space: tuple[int, int, int], initial_block_size: tuple[int, int, int] ): @@ -99,36 +103,39 @@ def test_indexing_launch_config( target = Target.CUDA cfg = CreateKernelConfig(target=target) cfg.gpu.indexing_scheme = indexing_scheme + cfg.gpu.use_block_size_fitting = use_block_size_fitting + cfg.gpu.assume_warp_aligned_block_size = assume_warp_aligned_block_size ast = create_kernel(asm, cfg) kernel = ast.compile() - warp_size = cfg.gpu.default_warp_size(target) - max_threads_per_block = GpuIndexing.get_max_threads_per_block(target) - max_block_sizes = 
GpuIndexing.get_max_block_sizes(target) - - block_size = kernel.launch_config.fit_block_size( - iteration_space, - initial_block_size, - HardwareProperties(warp_size, max_threads_per_block, max_block_sizes) - ) - - def check_suitability(b): - return prod(b) >= warp_size and prod(b) % warp_size == 0 - - # block size fitting should not modify an already ideal configuration - # -> check if ideal configurations are modified - if ( - check_suitability(initial_block_size) - and all(x == y for x, y in zip(initial_block_size, iteration_space)) # trimming may alter results - ): - assert all(x == y for x, y in zip(initial_block_size, block_size)), \ - f"Initial block size unnecessarily adapted from {initial_block_size} to {block_size}." - - assert check_suitability(block_size), \ - "Determined block size shall be divisible by warp size." - - kernel.launch_config.block_size = block_size + if use_block_size_fitting: + warp_size = cfg.gpu.default_warp_size(target) + max_threads_per_block = GpuIndexing.get_max_threads_per_block(target) + max_block_sizes = GpuIndexing.get_max_block_sizes(target) + + block_size = kernel.launch_config.fit_block_size( + iteration_space, + initial_block_size, + HardwareProperties(warp_size, max_threads_per_block, max_block_sizes) + ) + + def check_suitability(b): + return prod(b) >= warp_size and prod(b) % warp_size == 0 + + # block size fitting should not modify an already ideal configuration + # -> check if ideal configurations are modified + if ( + check_suitability(initial_block_size) + and all(x == y for x, y in zip(initial_block_size, iteration_space)) # trimming may alter results + ): + assert all(x == y for x, y in zip(initial_block_size, block_size)), \ + f"Initial block size unnecessarily adapted from {initial_block_size} to {block_size}." + + assert check_suitability(block_size), \ + "Determined block size shall be divisible by warp size." + + kernel.launch_config.block_size = block_size src_arr = cp.ones(iteration_space) dst_arr = cp.zeros_like(src_arr) -- GitLab From c427c469e29784b98972979c2d75d6f2c50232ea Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Tue, 11 Mar 2025 19:35:08 +0100 Subject: [PATCH 35/54] Format gpu_indexing.py --- src/pystencils/codegen/gpu_indexing.py | 88 +++++++++++++++++--------- 1 file changed, 57 insertions(+), 31 deletions(-) diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index 204737981..edddce10e 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -66,14 +66,17 @@ class GpuLaunchConfiguration(ABC): @staticmethod def block_size_exceeds_hw_limits( - block_size: tuple[int, ...], - hw_props: HardwareProperties) -> bool: + block_size: tuple[int, ...], hw_props: HardwareProperties + ) -> bool: """Checks if provided block size conforms limits given by the hardware.""" - return any( - size > max_size - for size, max_size in zip(block_size, hw_props.max_block_sizes) - ) or prod(block_size) > hw_props.max_threads_per_block + return ( + any( + size > max_size + for size, max_size in zip(block_size, hw_props.max_block_sizes) + ) + or prod(block_size) > hw_props.max_threads_per_block + ) @staticmethod def _gen_error_msg(block_size: tuple[int, ...]): @@ -81,15 +84,15 @@ class GpuLaunchConfiguration(ABC): Final block size was too large: {block_size}." 
@staticmethod - def _round_block_sizes_to_warp_size(to_round: list[int], warp_size: int) -> tuple[int, ...]: + def _round_block_sizes_to_warp_size( + to_round: list[int], warp_size: int + ) -> tuple[int, ...]: # check if already aligns with warp size if prod(to_round) % warp_size == 0: return tuple(to_round) # find index of element closest to warp size and round up - index_to_round = to_round.index( - max(to_round, key=lambda i: abs(i % warp_size)) - ) + index_to_round = to_round.index(max(to_round, key=lambda i: abs(i % warp_size))) if index_to_round + 1 < len(to_round): return ( *to_round[:index_to_round], @@ -152,7 +155,9 @@ class GpuLaunchConfiguration(ABC): # case 2: trimmed block is equivalent to the whole iteration space if all(b == i for b, i in zip(trimmed, it_space)): - return check_sizes_and_return(cls._round_block_sizes_to_warp_size(trimmed, hw_props.warp_size)) + return check_sizes_and_return( + cls._round_block_sizes_to_warp_size(trimmed, hw_props.warp_size) + ) else: # double block size in each dimension until block is large enough (or case 2 triggers) for d in resize_order: @@ -175,7 +180,11 @@ class GpuLaunchConfiguration(ABC): # case 3: trim block is large enough if prod(trimmed) >= hw_props.warp_size: - return check_sizes_and_return(cls._round_block_sizes_to_warp_size(trimmed, hw_props.warp_size)) + return check_sizes_and_return( + cls._round_block_sizes_to_warp_size( + trimmed, hw_props.warp_size + ) + ) raise CodegenError("Unable to determine GPU block size for this kernel.") @@ -211,8 +220,11 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration): # -> round block size in fastest moving dimension up to multiple of warp size block_size = tuple(int(bs(**kwargs)) for bs in self._block_size) block_size = ( - ceil_to_multiple(block_size[0], self._hw_props.warp_size) - if self._assume_warp_aligned_block_size else block_size[0], + ( + ceil_to_multiple(block_size[0], self._hw_props.warp_size) + if self._assume_warp_aligned_block_size + else block_size[0] + ), *block_size[1:], ) @@ -233,9 +245,7 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration): """ def __init__( - self, - hw_props: HardwareProperties, - assume_warp_aligned_block_size: bool = False + self, hw_props: HardwareProperties, assume_warp_aligned_block_size: bool = False ) -> None: self._assume_warp_aligned_block_size = assume_warp_aligned_block_size @@ -271,9 +281,14 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration): if self._grid_size is None: raise AttributeError("No GPU grid size was set by the user.") - if self._assume_warp_aligned_block_size and prod(self._block_size) % self._hw_props.warp_size != 0: - raise CodegenError("Specified block sizes must align with warp size with " - "`assume_warp_aligned_block_size` enabled.") + if ( + self._assume_warp_aligned_block_size + and prod(self._block_size) % self._hw_props.warp_size != 0 + ): + raise CodegenError( + "Specified block sizes must align with warp size with " + "`assume_warp_aligned_block_size` enabled." + ) return self._block_size, self._grid_size @@ -347,17 +362,26 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): computed_block_size: tuple[int, ...] 
try: if self._use_block_size_fitting: - computed_block_size = self.fit_block_size(num_work_items, self._block_size, self._hw_props) + computed_block_size = self.fit_block_size( + num_work_items, self._block_size, self._hw_props + ) else: - computed_block_size = self.trim_block_size(num_work_items, self._block_size, self._hw_props) + computed_block_size = self.trim_block_size( + num_work_items, self._block_size, self._hw_props + ) # check if assumption for warp size alignment is met - if self._assume_warp_aligned_block_size and prod(computed_block_size) % self._hw_props.warp_size != 0: + if ( + self._assume_warp_aligned_block_size + and prod(computed_block_size) % self._hw_props.warp_size != 0 + ): raise CodegenError("Adapted block size is not divisible by warp size.") except CodegenError as e: - warn(f"CodeGenError occurred: {getattr(e, 'message', repr(e))}. " - f"Block size fitting could not determine optimal block size configuration. " - f"Defaulting back to {self._block_size}") + warn( + f"CodeGenError occurred: {getattr(e, 'message', repr(e))}. " + f"Block size fitting could not determine optimal block size configuration. " + f"Defaulting back to {self._block_size}" + ) computed_block_size = self._block_size adapted_block_size = cast(dim3, computed_block_size) @@ -457,8 +481,11 @@ class GpuIndexing: def get_launch_config_factory(self) -> Callable[[], GpuLaunchConfiguration]: """Retrieve a factory for the launch configuration for later consumption by the runtime system""" if self._manual_launch_grid: + def factory(): - return ManualLaunchConfiguration(self._hw_props, self._assume_warp_aligned_block_size) + return ManualLaunchConfiguration( + self._hw_props, self._assume_warp_aligned_block_size + ) return factory @@ -481,10 +508,9 @@ class GpuIndexing: ) work_items_expr += tuple( - self._ast_factory.parse_index(1) - for _ in range(3 - rank) + self._ast_factory.parse_index(1) for _ in range(3 - rank) ) - + num_work_items = cast( _Dim3Lambda, tuple(self._kernel_factory.create_lambda(wit) for wit in work_items_expr), @@ -545,7 +571,7 @@ class GpuIndexing: block_size, cast(_Dim3Lambda, grid_size), self._hw_props, - self._assume_warp_aligned_block_size + self._assume_warp_aligned_block_size, ) return factory -- GitLab From 81a3e2abd4e69b000d00c15856d557c450afa90e Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 12 Mar 2025 11:55:43 +0100 Subject: [PATCH 36/54] Fix docstrings --- src/pystencils/codegen/gpu_indexing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index edddce10e..f33cd7836 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -113,7 +113,7 @@ class GpuLaunchConfiguration(ABC): hw_props: HardwareProperties, ) -> tuple[int, ...]: """Returns specified block sizes trimmed with iteration space. - Raises `CodegenError` if trimmed block size does not conform hardware limits. + Raises CodegenError if trimmed block size does not conform hardware limits. """ ret = tuple([min(b, i) for b, i in zip(block_size, it_space)]) @@ -129,7 +129,7 @@ class GpuLaunchConfiguration(ABC): hw_props: HardwareProperties, ) -> tuple[int, ...]: """Returns an optimized block size configuration with block sizes being aligned with the warp size. - Raises `CodegenError` if optimal block size could not be found or does not conform hardware limits. 
+ Raises CodegenError if optimal block size could not be found or does not conform hardware limits. """ -- GitLab From 166656313f141f1ed943b2c885c3c933e0cffdc7 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 12 Mar 2025 12:44:36 +0100 Subject: [PATCH 37/54] Adapt documentation for recent changes --- docs/source/backend/gpu_codegen.md | 9 +++++---- docs/source/user_manual/gpu_kernels.md | 16 +++++++++------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/docs/source/backend/gpu_codegen.md b/docs/source/backend/gpu_codegen.md index 70b9db75b..6cc00a6e2 100644 --- a/docs/source/backend/gpu_codegen.md +++ b/docs/source/backend/gpu_codegen.md @@ -74,10 +74,11 @@ parameters, the `ManualLaunchConfiguration` requires the user to manually specifiy both grid and block size. The `DynamicBlockSizeLaunchConfiguration` permits the user to set a block size which is used to then dynamically compute the grid size. However, the actual block size being used may differ from the user specification. This may be due to -trimming operations between the original block size and the iteration space to avoid spawning unnecessarily large blocks. -Another factor affecting the actual block size is the need to have block sizes that match with the warp size given by the hardware -for improved performance. Here, the block size is rounded to a multiple of the warp size while considering the -maximum amount of threads per block. +trimming operations between the original block size and the iteration space to avoid spawning unnecessarily large +blocks. In case the `GpuOptions.use_block_size_fitting` option is set, a block fitting algorithm adapts the original +block size such that it aligns with the warp size given by the hardware for improved performance. The algorithm +`GpuLaunchConfiguration.fit_block_size` incrementally increases the trimmed block size until it is rounded to a multiple +of the warp size while considering the maximum number of threads per block. The `evaluate` method can only be used from within a Python runtime environment. When exporting pystencils CUDA kernels for external use in C++ projects, diff --git a/docs/source/user_manual/gpu_kernels.md b/docs/source/user_manual/gpu_kernels.md index 8e102c642..2443108c1 100644 --- a/docs/source/user_manual/gpu_kernels.md +++ b/docs/source/user_manual/gpu_kernels.md @@ -115,11 +115,7 @@ If the `Linear3D` indexing scheme is used, you may modify the GPU thread block s The default block size for the kernel can be set via the {any}`gpu.block_size <GpuOptions.block_size>` code generator option; if none is specified, a default depending on the iteration space's dimensionality will be used. - -For performance-related reasons, the block size shall coincide with multiples of the hardware's warp size. -We enforce this by applying the fitting algorithm found in `GpuLaunchConfiguration.fit_block_size` -to alter the initial block size configuration. -This modification only occurs if the initial configuration is not optimal. +Please note that this **initial block size configuration may be altered** by the code generator. 
The block size can furthermore be modified at the compiled kernel's wrapper object via the `launch_config.block_size` attribute: @@ -131,8 +127,14 @@ kfunc.launch_config.block_size = (256, 2, 1) # Run the kernel kfunc(f=f_arr, g=g_arr) ``` -Please note that when modifying the block size directly, it must align with the warp size, -since this way of modification trespasses our fitting algorithm. + +Block sizes aligning with multiples of the hardware's warp size allow for a better usage of the GPUs resources. +Even though such block size configurations are not enforced, notifying our code generator via the +`GpuOptions.assume_warp_aligned_block_size` option that the configured block size is divisible by the warp size allows +for further optimization potential, e.g. for warp-level reductions. +When setting this option to `True`, the user has to make sure that this alignment applies. This can be achieved by +manually providing suitable block sizes via the `launch_config.block_size` attribute or by enabling the option +`GpuOptions.fit_block_size` to automatically adapt the initial block size correspondingly. In any case. pystencils will automatically compute the grid size from the shapes of the kernel's array arguments and the given thread block size. -- GitLab From 3bfa575a7ee21bc7bb9b9ed76cab9aa36acf125f Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 12 Mar 2025 15:04:26 +0100 Subject: [PATCH 38/54] Adapt docstring for DynamicBlockSizeLaunchConfiguration --- src/pystencils/codegen/gpu_indexing.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index f33cd7836..5b9d6c9c9 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -299,13 +299,29 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration): class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): """GPU launch configuration that permits the user to set a block size and dynamically computes the grid size. - In case the user-specified block size does not coincide with multiples of the warp size of the GPU, - it will be altered by a block size fitting algorithm. + The actual launch grid size is computed from the user-defined ``user_block_size``, the number of work items + in the kernel's iteration space and potentially the `GpuOptions.warp_size` as follows. - The actual launch grid size is then computed as follows. For each dimension :math:`c \\in \\{ x, y, z \\}`, - ``block_size.c = fit_block_size(user_block_size.c)`` and ``grid_size.c = ceil(num_work_items.c / block_size.c)``. + - if `GpuOptions.use_block_size_fitting == True`: + In case the user-specified block size does not coincide with multiples of the warp size of the GPU, + it will be altered by a block size fitting algorithm, i.e. + + ``block_size.c = fit_block_size(user_block_size.c)`` + + The fitted block size also guarantees the user usage of `GpuOptions.assume_warp_aligned_block_size`. + + - otherwise: + In this case trimming between the number of work items and the kernel's iteration space occurs, i.e. + + - if ``user_block_size.c > num_work_items.c``, ``block_size = num_work_items.c`` + - otherwise, ``block_size.c = user_block_size.c`` + + + The actual launch grid size is then computed as follows. + + ``grid_size.c = ceil(num_work_items.c / block_size.c)``. 
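A small worked example of the two branches described in the docstring above (numbers are illustrative; the exact fitted block size depends on the fitting algorithm's growth order):

```python
from math import ceil, prod

num_work_items  = (3, 7, 9)
user_block_size = (8, 8, 4)
warp_size = 32

# trimming branch: clamp the user block size to the iteration space
trimmed = tuple(min(b, n) for b, n in zip(user_block_size, num_work_items))
assert trimmed == (3, 7, 4) and prod(trimmed) == 84   # 84 threads, not a warp multiple

# the fitting branch would instead enlarge/round the trimmed block until its
# thread count is a multiple of warp_size (e.g. to something like (3, 7, 32))

# grid size, same formula for both branches
grid = tuple(ceil(n / b) for n, b in zip(num_work_items, trimmed))
assert grid == (1, 1, 3)
```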
""" def __init__( -- GitLab From 7cafe934d8de5032c34dcaabc4b4e3e2957b79e4 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 12 Mar 2025 16:12:12 +0100 Subject: [PATCH 39/54] Fix docstring --- src/pystencils/codegen/gpu_indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index 5b9d6c9c9..2c5e89100 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -304,7 +304,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): For each dimension :math:`c \\in \\{ x, y, z \\}`, - - if `GpuOptions.use_block_size_fitting == True`: + - if `GpuOptions.use_block_size_fitting`: In case the user-specified block size does not coincide with multiples of the warp size of the GPU, it will be altered by a block size fitting algorithm, i.e. -- GitLab From 9ebeacfeb726fb14be570e40f2fd981cbae15940 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Fri, 14 Mar 2025 14:05:45 +0100 Subject: [PATCH 40/54] Remove block_size and use_block_size_fitting options and introduce `trim_block_size` and `fit_block_size` instance functions for DynamicBlockSizeLaunchConfigurations --- src/pystencils/codegen/config.py | 17 -- src/pystencils/codegen/driver.py | 4 - src/pystencils/codegen/gpu_indexing.py | 376 +++++++++++++------------ src/pystencils/jit/gpu_cupy.py | 5 +- tests/kernelcreation/test_gpu.py | 105 ++++--- 5 files changed, 275 insertions(+), 232 deletions(-) diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py index 37263e058..56d1f234c 100644 --- a/src/pystencils/codegen/config.py +++ b/src/pystencils/codegen/config.py @@ -372,14 +372,6 @@ class GpuOptions(ConfigBase): indexing_scheme: Option[GpuIndexingScheme, str] = Option(GpuIndexingScheme.Linear3D) """Thread indexing scheme for dense GPU kernels.""" - block_size: BasicOption[tuple[int, int, int] | _AUTO_TYPE] = BasicOption(AUTO) - """Desired block size for the execution of GPU kernels. - - This option only takes effect if `Linear3D <GpuIndexingScheme.Linear3D>` - is chosen as an indexing scheme. - The block size may be overridden at runtime. - """ - manual_launch_grid: BasicOption[bool] = BasicOption(False) """Always require a manually specified launch grid when running this kernel. @@ -402,15 +394,6 @@ class GpuOptions(ConfigBase): The generator also checks if user-provided block sizes fulfill this criterion. """ - use_block_size_fitting: BasicOption[bool] = BasicOption(False) - """Experimental. Specifies if a fitting algorithm should be used at runtime to - compute optimal block sizes that align with the warp size, - i.e. their product is divisible by the warp size. - - This option only takes effect for `DynamicBlockSizeLaunchConfiguration`'s, - i.e. if `Linear3D <GpuIndexingScheme.Linear3D>` is chosen as an indexing scheme. 
- """ - @staticmethod def default_warp_size(target: Target): match target: diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index d47b889c7..bd5fc8b0b 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -405,9 +405,7 @@ class DefaultKernelCreationDriver: from .gpu_indexing import dim3 idx_scheme: GpuIndexingScheme = self._cfg.gpu.get_option("indexing_scheme") - block_size: dim3 | _AUTO_TYPE = self._cfg.gpu.get_option("block_size") manual_launch_grid: bool = self._cfg.gpu.get_option("manual_launch_grid") - use_block_size_fitting: bool = self._cfg.gpu.get_option("use_block_size_fitting") assume_warp_aligned_block_size: bool = self._cfg.gpu.get_option("assume_warp_aligned_block_size") warp_size: int | None = self._cfg.gpu.get_option("warp_size") @@ -419,9 +417,7 @@ class DefaultKernelCreationDriver: self._target, idx_scheme, warp_size, - block_size, manual_launch_grid, - use_block_size_fitting, assume_warp_aligned_block_size, ) diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index 2c5e89100..c25aa3154 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -9,7 +9,7 @@ from warnings import warn from .functions import Lambda from .parameters import Parameter from .errors import CodegenError -from .config import GpuIndexingScheme, _AUTO_TYPE +from .config import GpuIndexingScheme from .target import Target from ..backend.kernelcreation import ( @@ -45,6 +45,18 @@ class GpuLaunchConfiguration(ABC): parameters to the associated kernel """ + @property + @abstractmethod + def block_size(self) -> dim3 | None: + """Returns desired block size if available.""" + pass + + @block_size.setter + @abstractmethod + def block_size(self, val: dim3): + """Sets desired block size if possible.""" + pass + @property @abstractmethod def parameters(self) -> frozenset[Parameter]: @@ -64,9 +76,24 @@ class GpuLaunchConfiguration(ABC): this launch configuration, such that when the configuration changes, the JIT parameter cache is invalidated.""" + @staticmethod + def get_default_block_size(rank: int) -> dim3: + """Returns the default block size configuration used by the generator.""" + + match rank: + case 1: + return (256, 1, 1) + case 2: + return (16, 16, 1) + case 3: + return (8, 8, 4) + case _: + assert False, "unreachable code" + @staticmethod def block_size_exceeds_hw_limits( - block_size: tuple[int, ...], hw_props: HardwareProperties + block_size: tuple[int, ...], + hw_props: HardwareProperties ) -> bool: """Checks if provided block size conforms limits given by the hardware.""" @@ -79,115 +106,10 @@ class GpuLaunchConfiguration(ABC): ) @staticmethod - def _gen_error_msg(block_size: tuple[int, ...]): + def _excessive_block_size_error_msg(block_size: tuple[int, ...]): return f"Unable to determine GPU block size for this kernel. \ Final block size was too large: {block_size}." 
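The rank-dependent defaults introduced in `get_default_block_size` can be summarized in a short sketch (the tuples mirror the values in the hunk above; the 32-thread warp is an assumption for illustration):

```python
from math import prod

# Default block sizes per iteration-space rank, as listed in get_default_block_size
defaults = {1: (256, 1, 1), 2: (16, 16, 1), 3: (8, 8, 4)}

for rank, bs in defaults.items():
    # each default spans 256 threads, i.e. a whole number of 32-thread warps
    print(rank, bs, prod(bs) % 32 == 0)
```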
- @staticmethod - def _round_block_sizes_to_warp_size( - to_round: list[int], warp_size: int - ) -> tuple[int, ...]: - # check if already aligns with warp size - if prod(to_round) % warp_size == 0: - return tuple(to_round) - - # find index of element closest to warp size and round up - index_to_round = to_round.index(max(to_round, key=lambda i: abs(i % warp_size))) - if index_to_round + 1 < len(to_round): - return ( - *to_round[:index_to_round], - ceil_to_multiple(to_round[index_to_round], warp_size), - *to_round[index_to_round + 1:], - ) - else: - return ( - *to_round[:index_to_round], - ceil_to_multiple(to_round[index_to_round], warp_size), - ) - - @classmethod - def trim_block_size( - cls, - it_space: tuple[int, ...], - block_size: tuple[int, ...], - hw_props: HardwareProperties, - ) -> tuple[int, ...]: - """Returns specified block sizes trimmed with iteration space. - Raises CodegenError if trimmed block size does not conform hardware limits. - """ - - ret = tuple([min(b, i) for b, i in zip(block_size, it_space)]) - if cls.block_size_exceeds_hw_limits(ret, hw_props): - raise CodegenError(cls._gen_error_msg(ret)) - return ret - - @classmethod - def fit_block_size( - cls, - it_space: tuple[int, ...], - block_size: tuple[int, int, int], - hw_props: HardwareProperties, - ) -> tuple[int, ...]: - """Returns an optimized block size configuration with block sizes being aligned with the warp size. - Raises CodegenError if optimal block size could not be found or does not conform hardware limits. - """ - - def trim(to_trim: list[int]) -> list[int]: - return [min(b, i) for b, i in zip(to_trim, it_space)] - - def check_sizes_and_return(ret: tuple[int, ...]) -> tuple[int, ...]: - if cls.block_size_exceeds_hw_limits(ret, hw_props): - raise CodegenError(cls._gen_error_msg(ret)) - return ret - - trimmed = trim(list(block_size)) - if ( - prod(trimmed) >= hw_props.warp_size - and prod(trimmed) % hw_props.warp_size == 0 - ): - # case 1: greater than min block size -> use trimmed result - return check_sizes_and_return(tuple(trimmed)) - - prev_trim_size = 0 - resize_order = [0, 2, 1] if len(it_space) == 3 else range(len(it_space)) - while prod(trimmed) is not prev_trim_size: - prev_trim_size = prod(trimmed) - - # case 2: trimmed block is equivalent to the whole iteration space - if all(b == i for b, i in zip(trimmed, it_space)): - return check_sizes_and_return( - cls._round_block_sizes_to_warp_size(trimmed, hw_props.warp_size) - ) - else: - # double block size in each dimension until block is large enough (or case 2 triggers) - for d in resize_order: - trimmed[d] *= 2 - - # trim fastest moving dim to multiples of warp size - if ( - d == 0 - and trimmed[d] > hw_props.warp_size - and trimmed[d] % hw_props.warp_size != 0 - ): - # subtract remainder - trimmed[d] = trimmed[d] - (trimmed[d] % hw_props.warp_size) - - # check if block sizes are within hardware capabilities - trimmed[d] = min(trimmed[d], hw_props.max_block_sizes[d]) - - # trim again - trimmed = trim(trimmed) - - # case 3: trim block is large enough - if prod(trimmed) >= hw_props.warp_size: - return check_sizes_and_return( - cls._round_block_sizes_to_warp_size( - trimmed, hw_props.warp_size - ) - ) - - raise CodegenError("Unable to determine GPU block size for this kernel.") - class AutomaticLaunchConfiguration(GpuLaunchConfiguration): """Launch configuration that is dynamically computed from kernel parameters. 
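The warp-size rounding used by these helpers boils down to ceiling a value to the next multiple; a minimal stand-in for the `ceil_to_multiple` utility (warp size 32 assumed purely for illustration):

```python
# Round a block dimension up to the next multiple of the warp size
def ceil_to_multiple(value: int, multiple: int) -> int:
    return ((value + multiple - 1) // multiple) * multiple

warp_size = 32  # assumed for illustration
print(ceil_to_multiple(40, warp_size))  # 64
print(ceil_to_multiple(32, warp_size))  # 32 (already aligned)
```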
@@ -211,6 +133,15 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration): *(lb.parameters for lb in chain(block_size, grid_size)) ) + @property + def block_size(self) -> dim3 | None: + """Block size is only available when `evaluate` is called.""" + return None + + @block_size.setter + def block_size(self, val: dim3): + warn("Fruitless attempt to set `block_size`.") + @property def parameters(self) -> frozenset[Parameter]: return self._params @@ -290,6 +221,9 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration): "`assume_warp_aligned_block_size` enabled." ) + if self.block_size_exceeds_hw_limits(self._block_size, self._hw_props): + raise CodegenError(self._excessive_block_size_error_msg(self._block_size)) + return self._block_size, self._grid_size def jit_cache_key(self) -> Any: @@ -328,18 +262,16 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): self, num_work_items: _Dim3Lambda, hw_props: HardwareProperties, - use_block_size_fitting: bool, assume_warp_aligned_block_size: bool, - default_block_size: dim3 | None = None, ) -> None: self._num_work_items = num_work_items self._hw_props = hw_props - self._use_block_size_fitting = use_block_size_fitting self._assume_warp_aligned_block_size = assume_warp_aligned_block_size - self._block_size: dim3 | None = default_block_size + self._init_block_size: dim3 | None = None + self._compute_block_size: Callable[[tuple[int, ...], tuple[int, ...], HardwareProperties], tuple[int, ...]] | None = None self._params: frozenset[Parameter] = frozenset().union( *(wit.parameters for wit in num_work_items) @@ -351,68 +283,189 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): dimension from kernel parameters.""" return self._num_work_items + @property + def parameters(self) -> frozenset[Parameter]: + """Parameters of this launch configuration""" + return self._params + @property def block_size(self) -> dim3 | None: - """The desired GPU block size.""" - return self._block_size + """Block size is only available when `evaluate` is called.""" + return None @block_size.setter def block_size(self, val: dim3): - self._block_size = val + warn("Fruitless attempt to set `block_size`.") - @property - def parameters(self) -> frozenset[Parameter]: - """Parameters of this launch configuration""" - return self._params + @staticmethod + def _round_block_sizes_to_warp_size( + to_round: tuple[int, ...], + warp_size: int + ) -> tuple[int, ...]: + # check if already aligns with warp size + if prod(to_round) % warp_size == 0: + return tuple(to_round) - def evaluate(self, **kwargs) -> tuple[dim3, dim3]: - if self._block_size is None: - raise AttributeError("No GPU block size was specified by the user!") + # find index of element closest to warp size and round up + index_to_round = to_round.index(max(to_round, key=lambda i: abs(i % warp_size))) + if index_to_round + 1 < len(to_round): + return ( + *to_round[:index_to_round], + ceil_to_multiple(to_round[index_to_round], warp_size), + *to_round[index_to_round + 1:], + ) + else: + return ( + *to_round[:index_to_round], + ceil_to_multiple(to_round[index_to_round], warp_size), + ) + + def trim_block_size(self, block_size: dim3): + def call_trimming_factory( + it: dim3, + bs: dim3, + hw: HardwareProperties, + ): + return self._trim_block_size_to_it_space(it, bs, hw) + self._init_block_size = block_size + self._compute_block_size = call_trimming_factory + + def _trim_block_size_to_it_space( + self, + it_space: dim3, + block_size: dim3, + hw_props: HardwareProperties, + ) -> tuple[int, ...]: + 
"""Returns specified block sizes trimmed with iteration space. + Raises CodegenError if trimmed block size does not conform hardware limits. + """ + + ret = tuple([min(b, i) for b, i in zip(block_size, it_space)]) + if self.block_size_exceeds_hw_limits(ret, hw_props): + raise CodegenError(self._excessive_block_size_error_msg(ret)) + + if ( + self._assume_warp_aligned_block_size + and prod(ret) % self._hw_props.warp_size != 0 + ): + self._round_block_sizes_to_warp_size(ret, hw_props.warp_size) + + return ret + + def fit_block_size(self, block_size: dim3): + def call_fitting_factory( + it: dim3, + bs: dim3, + hw: HardwareProperties, + ): + return self._fit_block_size_to_it_space(it, bs, hw) + + self._init_block_size = block_size + self._compute_block_size = call_fitting_factory + + def _fit_block_size_to_it_space( + self, + it_space: dim3, + block_size: dim3, + hw_props: HardwareProperties, + ) -> tuple[int, ...]: + """Returns an optimized block size configuration with block sizes being aligned with the warp size. + Raises CodegenError if optimal block size could not be found or does not conform hardware limits. + """ + + def trim(to_trim: list[int]) -> list[int]: + return [min(b, i) for b, i in zip(to_trim, it_space)] + + def check_sizes_and_return(ret: tuple[int, ...]) -> tuple[int, ...]: + if self.block_size_exceeds_hw_limits(ret, hw_props): + raise CodegenError(self._excessive_block_size_error_msg(ret)) + return ret + + trimmed = trim(list(block_size)) + if ( + prod(trimmed) >= hw_props.warp_size + and prod(trimmed) % hw_props.warp_size == 0 + ): + # case 1: greater than min block size -> use trimmed result + return check_sizes_and_return(tuple(trimmed)) + + prev_trim_size = 0 + resize_order = [0, 2, 1] if len(it_space) == 3 else range(len(it_space)) + while prod(trimmed) is not prev_trim_size: + prev_trim_size = prod(trimmed) + + # case 2: trimmed block is equivalent to the whole iteration space + if all(b == i for b, i in zip(trimmed, it_space)): + return check_sizes_and_return( + self._round_block_sizes_to_warp_size(tuple(trimmed), hw_props.warp_size) + ) + else: + # double block size in each dimension until block is large enough (or case 2 triggers) + for d in resize_order: + trimmed[d] *= 2 + + # trim fastest moving dim to multiples of warp size + if ( + d == 0 + and trimmed[d] > hw_props.warp_size + and trimmed[d] % hw_props.warp_size != 0 + ): + # subtract remainder + trimmed[d] = trimmed[d] - (trimmed[d] % hw_props.warp_size) + + # check if block sizes are within hardware capabilities + trimmed[d] = min(trimmed[d], hw_props.max_block_sizes[d]) + + # trim again + trimmed = trim(trimmed) + + # case 3: trim block is large enough + if prod(trimmed) >= hw_props.warp_size: + return check_sizes_and_return( + self._round_block_sizes_to_warp_size( + tuple(trimmed), hw_props.warp_size + ) + ) + + raise CodegenError("Unable to determine GPU block size for this kernel.") + + def evaluate(self, **kwargs) -> tuple[dim3, dim3]: from ..utils import div_ceil num_work_items = cast( dim3, tuple(int(wit(**kwargs)) for wit in self._num_work_items) ) - computed_block_size: tuple[int, ...] 
- try: - if self._use_block_size_fitting: - computed_block_size = self.fit_block_size( - num_work_items, self._block_size, self._hw_props - ) - else: - computed_block_size = self.trim_block_size( - num_work_items, self._block_size, self._hw_props + block_size: dim3 + default_bs = GpuLaunchConfiguration.get_default_block_size(len(num_work_items)) + if self._compute_block_size: + try: + computed_bs = self._compute_block_size( + num_work_items, self._init_block_size, self._hw_props) + + block_size = cast(dim3, computed_bs) + except CodegenError as e: + block_size = default_bs + warn( + f"CodeGenError occurred: {getattr(e, 'message', repr(e))}. " + f"Block size fitting could not determine optimal block size configuration. " + f"Defaulting back to {default_bs}." ) - - # check if assumption for warp size alignment is met - if ( - self._assume_warp_aligned_block_size - and prod(computed_block_size) % self._hw_props.warp_size != 0 - ): - raise CodegenError("Adapted block size is not divisible by warp size.") - except CodegenError as e: - warn( - f"CodeGenError occurred: {getattr(e, 'message', repr(e))}. " - f"Block size fitting could not determine optimal block size configuration. " - f"Defaulting back to {self._block_size}" - ) - computed_block_size = self._block_size - - adapted_block_size = cast(dim3, computed_block_size) + else: + block_size = default_bs grid_size = cast( dim3, tuple( - div_ceil(wit, bs) for wit, bs in zip(num_work_items, adapted_block_size) + div_ceil(wit, bs) for wit, bs in zip(num_work_items, block_size) ), ) - return adapted_block_size, grid_size + return block_size, grid_size def jit_cache_key(self) -> Any: - return self._block_size + return () class GpuIndexing: @@ -437,18 +490,14 @@ class GpuIndexing: target: Target, scheme: GpuIndexingScheme, warp_size: int, - default_block_size: dim3 | _AUTO_TYPE | None = None, manual_launch_grid: bool = False, - use_block_size_fitting: bool = False, assume_warp_aligned_block_size: bool = False, ) -> None: self._ctx = ctx self._target = target self._scheme = scheme self._warp_size = warp_size - self._default_block_size = default_block_size self._manual_launch_grid = manual_launch_grid - self._use_block_size_fitting = use_block_size_fitting self._assume_warp_aligned_block_size = assume_warp_aligned_block_size self._hw_props = HardwareProperties( @@ -536,30 +585,11 @@ class GpuIndexing: return DynamicBlockSizeLaunchConfiguration( num_work_items, self._hw_props, - self._use_block_size_fitting, self._assume_warp_aligned_block_size, - self._get_default_block_size(rank), ) return factory - def _get_default_block_size(self, rank: int) -> dim3: - if self._default_block_size is None: - raise CodegenError("The default block size option was not set") - - if isinstance(self._default_block_size, _AUTO_TYPE): - match rank: - case 1: - return (256, 1, 1) - case 2: - return (16, 16, 1) - case 3: - return (8, 8, 4) - case _: - assert False, "unreachable code" - else: - return self._default_block_size - def _get_blockwise4d_config_factory( self, ) -> Callable[[], AutomaticLaunchConfiguration]: diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py index f63af091f..7d9db4711 100644 --- a/src/pystencils/jit/gpu_cupy.py +++ b/src/pystencils/jit/gpu_cupy.py @@ -207,7 +207,10 @@ class CupyKernelWrapper(KernelWrapper): class CupyJit(JitBase): - def __init__(self, default_block_size: Sequence[int] = (16, 16, 1)): + def __init__( + self, + default_block_size: Sequence[int] = GpuLaunchConfiguration.get_default_block_size(rank=3) + ): 
self._runtime_headers = {"<cstdint>"} if len(default_block_size) > 3: diff --git a/tests/kernelcreation/test_gpu.py b/tests/kernelcreation/test_gpu.py index 6cbc8f813..ae2a6d43d 100644 --- a/tests/kernelcreation/test_gpu.py +++ b/tests/kernelcreation/test_gpu.py @@ -32,7 +32,12 @@ except ImportError: @pytest.mark.parametrize("indexing_scheme", ["linear3d", "blockwise4d"]) @pytest.mark.parametrize("manual_grid", [False, True]) -def test_indexing_options_3d(indexing_scheme: str, manual_grid: bool): +@pytest.mark.parametrize("assume_warp_aligned_block_size", [False, True]) +def test_indexing_options_3d( + indexing_scheme: str, + manual_grid: bool, + assume_warp_aligned_block_size: bool, +): src, dst = fields("src, dst: [3D]") asm = Assignment( dst.center(), @@ -47,6 +52,7 @@ def test_indexing_options_3d(indexing_scheme: str, manual_grid: bool): cfg = CreateKernelConfig(target=Target.CUDA) cfg.gpu.indexing_scheme = indexing_scheme cfg.gpu.manual_launch_grid = manual_grid + cfg.gpu.assume_warp_aligned_block_size = assume_warp_aligned_block_size ast = create_kernel(asm, cfg) kernel = ast.compile() @@ -57,14 +63,25 @@ def test_indexing_options_3d(indexing_scheme: str, manual_grid: bool): if manual_grid: match indexing_scheme: case "linear3d": - kernel.launch_config.block_size = (10, 8, 8) - kernel.launch_config.grid_size = (4, 4, 2) + if assume_warp_aligned_block_size: + kernel.launch_config.block_size = (8, 10, 8) + kernel.launch_config.grid_size = (5, 4, 2) + else: + kernel.launch_config.block_size = (10, 10, 8) + kernel.launch_config.grid_size = (4, 4, 2) case "blockwise4d": - kernel.launch_config.block_size = (40, 1, 1) - kernel.launch_config.grid_size = (32, 16, 1) + if assume_warp_aligned_block_size: + kernel.launch_config.block_size = (64, 1, 1) + kernel.launch_config.grid_size = (32, 16, 1) + else: + kernel.launch_config.block_size = (40, 1, 1) + kernel.launch_config.grid_size = (32, 16, 1) elif indexing_scheme == "linear3d": - kernel.launch_config.block_size = (10, 8, 8) + if assume_warp_aligned_block_size: + kernel.launch_config.block_size = (32, 8, 2) + else: + kernel.launch_config.block_size = (10, 10, 10) kernel(src=src_arr, dst=dst_arr) @@ -73,22 +90,18 @@ def test_indexing_options_3d(indexing_scheme: str, manual_grid: bool): cp.testing.assert_allclose(dst_arr, expected) - -@pytest.mark.parametrize("indexing_scheme", ["linear3d", "blockwise4d"]) -@pytest.mark.parametrize("use_block_size_fitting", [True, False]) -@pytest.mark.parametrize("assume_warp_aligned_block_size", [True, False]) @pytest.mark.parametrize("iteration_space", [(8, 4, 4), (3, 8, 8), (3, 3, 16), (17, 3, 3), (3, 12, 56), (65, 65, 65), (3, 7, 9)]) @pytest.mark.parametrize("initial_block_size", [(8, 4, 4), (3, 8, 8), (3, 3, 16), (2, 2, 64), (8, 2, 1), (3, 1, 32), (32, 1, 1), (1, 2, 3)]) -def test_indexing_launch_config( - indexing_scheme: str, - use_block_size_fitting: bool, - assume_warp_aligned_block_size: bool, +@pytest.mark.parametrize("assume_warp_aligned_block_size", [True, False]) +@pytest.mark.parametrize("use_block_fitting", [True, False]) +def test_block_size_adaptations( iteration_space: tuple[int, int, int], - initial_block_size: tuple[int, int, int] + initial_block_size: tuple[int, int, int], + assume_warp_aligned_block_size: bool, + use_block_fitting: bool, ): - src, dst = fields("src, dst: [3D]") asm = Assignment( dst.center(), @@ -102,24 +115,25 @@ def test_indexing_launch_config( target = Target.CUDA cfg = CreateKernelConfig(target=target) - cfg.gpu.indexing_scheme = indexing_scheme - 
cfg.gpu.use_block_size_fitting = use_block_size_fitting + cfg.gpu.indexing_scheme = "linear3d" cfg.gpu.assume_warp_aligned_block_size = assume_warp_aligned_block_size + warp_size = cfg.gpu.default_warp_size(target) + max_threads_per_block = GpuIndexing.get_max_threads_per_block(target) + max_block_sizes = GpuIndexing.get_max_block_sizes(target) + ast = create_kernel(asm, cfg) kernel = ast.compile() - if use_block_size_fitting: - warp_size = cfg.gpu.default_warp_size(target) - max_threads_per_block = GpuIndexing.get_max_threads_per_block(target) - max_block_sizes = GpuIndexing.get_max_block_sizes(target) - - block_size = kernel.launch_config.fit_block_size( + if use_block_fitting: + # test internal block fitting function later used in `kernel.launch_config.fit_block_size` + internal_block_size = kernel.launch_config._fit_block_size_to_it_space( iteration_space, initial_block_size, HardwareProperties(warp_size, max_threads_per_block, max_block_sizes) ) + # checks if criterion for warp size alignment is fulfilled def check_suitability(b): return prod(b) >= warp_size and prod(b) % warp_size == 0 @@ -127,15 +141,19 @@ def test_indexing_launch_config( # -> check if ideal configurations are modified if ( check_suitability(initial_block_size) - and all(x == y for x, y in zip(initial_block_size, iteration_space)) # trimming may alter results + and all(x == y for x, y in zip(initial_block_size, iteration_space)) # trimming may alter results ): - assert all(x == y for x, y in zip(initial_block_size, block_size)), \ - f"Initial block size unnecessarily adapted from {initial_block_size} to {block_size}." + assert all(x == y for x, y in zip(initial_block_size, internal_block_size)), \ + f"Initial block size unnecessarily adapted from {initial_block_size} to {internal_block_size}." - assert check_suitability(block_size), \ + assert check_suitability(internal_block_size), \ "Determined block size shall be divisible by warp size." 
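The `check_suitability` criterion asserted here can be read as a tiny standalone predicate (warp size 32 assumed for illustration; the test itself queries it from the target):

```python
from math import prod

warp_size = 32  # assumed; the test obtains it via default_warp_size(target)

def warp_aligned(block_size: tuple[int, int, int]) -> bool:
    # suitable if the block spans at least one warp and a whole number of warps
    threads = prod(block_size)
    return threads >= warp_size and threads % warp_size == 0

print(warp_aligned((8, 8, 4)))     # True: 256 threads
print(warp_aligned((10, 10, 10)))  # False: 1000 threads is not a multiple of 32
```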
- kernel.launch_config.block_size = block_size + # set block size via fitting algorithm + kernel.launch_config.fit_block_size(initial_block_size) + else: + # set block size via trimming algorithm + kernel.launch_config.trim_block_size(initial_block_size) src_arr = cp.ones(iteration_space) dst_arr = cp.zeros_like(src_arr) @@ -149,10 +167,12 @@ def test_indexing_launch_config( @pytest.mark.parametrize("indexing_scheme", ["linear3d", "blockwise4d"]) -@pytest.mark.parametrize("omit_range_check", [False, True]) @pytest.mark.parametrize("manual_grid", [False, True]) +@pytest.mark.parametrize("assume_warp_aligned_block_size", [False, True]) def test_indexing_options_2d( - indexing_scheme: str, omit_range_check: bool, manual_grid: bool + indexing_scheme: str, + manual_grid: bool, + assume_warp_aligned_block_size: bool ): src, dst = fields("src, dst: [2D]") asm = Assignment( @@ -165,8 +185,8 @@ def test_indexing_options_2d( cfg = CreateKernelConfig(target=Target.CUDA) cfg.gpu.indexing_scheme = indexing_scheme - cfg.gpu.omit_range_check = omit_range_check cfg.gpu.manual_launch_grid = manual_grid + cfg.gpu.assume_warp_aligned_block_size = assume_warp_aligned_block_size ast = create_kernel(asm, cfg) kernel = ast.compile() @@ -177,14 +197,25 @@ def test_indexing_options_2d( if manual_grid: match indexing_scheme: case "linear3d": - kernel.launch_config.block_size = (10, 8, 1) - kernel.launch_config.grid_size = (4, 2, 1) + if assume_warp_aligned_block_size: + kernel.launch_config.block_size = (8, 8, 1) + kernel.launch_config.grid_size = (5, 2, 1) + else: + kernel.launch_config.block_size = (10, 8, 1) + kernel.launch_config.grid_size = (4, 2, 1) case "blockwise4d": - kernel.launch_config.block_size = (40, 1, 1) - kernel.launch_config.grid_size = (16, 1, 1) + if assume_warp_aligned_block_size: + kernel.launch_config.block_size = (64, 1, 1) + kernel.launch_config.grid_size = (16, 1, 1) + else: + kernel.launch_config.block_size = (40, 1, 1) + kernel.launch_config.grid_size = (16, 1, 1) elif indexing_scheme == "linear3d": - kernel.launch_config.block_size = (10, 8, 1) + if assume_warp_aligned_block_size: + kernel.launch_config.block_size = (8, 8, 1) + else: + kernel.launch_config.block_size = (10, 8, 1) kernel(src=src_arr, dst=dst_arr) -- GitLab From dde422637cec5298cf90b3cf8c82655ebc594a2e Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Fri, 14 Mar 2025 14:08:45 +0100 Subject: [PATCH 41/54] Fix lint --- src/pystencils/codegen/driver.py | 2 - src/pystencils/codegen/gpu_indexing.py | 66 ++++++++++++++------------ tests/kernelcreation/test_gpu.py | 20 ++++---- 3 files changed, 44 insertions(+), 44 deletions(-) diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index bd5fc8b0b..65352f520 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -402,8 +402,6 @@ class DefaultKernelCreationDriver: if self._target != Target.CUDA: return None - from .gpu_indexing import dim3 - idx_scheme: GpuIndexingScheme = self._cfg.gpu.get_option("indexing_scheme") manual_launch_grid: bool = self._cfg.gpu.get_option("manual_launch_grid") assume_warp_aligned_block_size: bool = self._cfg.gpu.get_option("assume_warp_aligned_block_size") diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index c25aa3154..31e89353c 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -92,8 +92,7 @@ class GpuLaunchConfiguration(ABC): @staticmethod def block_size_exceeds_hw_limits( 
- block_size: tuple[int, ...], - hw_props: HardwareProperties + block_size: tuple[int, ...], hw_props: HardwareProperties ) -> bool: """Checks if provided block size conforms limits given by the hardware.""" @@ -271,7 +270,12 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): self._assume_warp_aligned_block_size = assume_warp_aligned_block_size self._init_block_size: dim3 | None = None - self._compute_block_size: Callable[[tuple[int, ...], tuple[int, ...], HardwareProperties], tuple[int, ...]] | None = None + self._compute_block_size: ( + Callable[ + [tuple[int, ...], tuple[int, ...], HardwareProperties], tuple[int, ...] + ] + | None + ) = None self._params: frozenset[Parameter] = frozenset().union( *(wit.parameters for wit in num_work_items) @@ -299,8 +303,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): @staticmethod def _round_block_sizes_to_warp_size( - to_round: tuple[int, ...], - warp_size: int + to_round: tuple[int, ...], warp_size: int ) -> tuple[int, ...]: # check if already aligns with warp size if prod(to_round) % warp_size == 0: @@ -322,9 +325,9 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): def trim_block_size(self, block_size: dim3): def call_trimming_factory( - it: dim3, - bs: dim3, - hw: HardwareProperties, + it: dim3, + bs: dim3, + hw: HardwareProperties, ): return self._trim_block_size_to_it_space(it, bs, hw) @@ -332,10 +335,10 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): self._compute_block_size = call_trimming_factory def _trim_block_size_to_it_space( - self, - it_space: dim3, - block_size: dim3, - hw_props: HardwareProperties, + self, + it_space: dim3, + block_size: dim3, + hw_props: HardwareProperties, ) -> tuple[int, ...]: """Returns specified block sizes trimmed with iteration space. Raises CodegenError if trimmed block size does not conform hardware limits. @@ -346,8 +349,8 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): raise CodegenError(self._excessive_block_size_error_msg(ret)) if ( - self._assume_warp_aligned_block_size - and prod(ret) % self._hw_props.warp_size != 0 + self._assume_warp_aligned_block_size + and prod(ret) % self._hw_props.warp_size != 0 ): self._round_block_sizes_to_warp_size(ret, hw_props.warp_size) @@ -355,9 +358,9 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): def fit_block_size(self, block_size: dim3): def call_fitting_factory( - it: dim3, - bs: dim3, - hw: HardwareProperties, + it: dim3, + bs: dim3, + hw: HardwareProperties, ): return self._fit_block_size_to_it_space(it, bs, hw) @@ -365,10 +368,10 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): self._compute_block_size = call_fitting_factory def _fit_block_size_to_it_space( - self, - it_space: dim3, - block_size: dim3, - hw_props: HardwareProperties, + self, + it_space: dim3, + block_size: dim3, + hw_props: HardwareProperties, ) -> tuple[int, ...]: """Returns an optimized block size configuration with block sizes being aligned with the warp size. Raises CodegenError if optimal block size could not be found or does not conform hardware limits. 
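The trimming step performed by `_trim_block_size_to_it_space` is essentially an element-wise minimum; a rough sketch with made-up sizes:

```python
# Per dimension, the block may not be larger than the iteration space
def trim(block_size, it_space):
    return tuple(min(b, i) for b, i in zip(block_size, it_space))

print(trim((16, 16, 4), (8, 100, 2)))  # (8, 16, 2)
print(trim((8, 8, 4), (64, 64, 64)))   # (8, 8, 4) -> unchanged
```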
@@ -384,8 +387,8 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): trimmed = trim(list(block_size)) if ( - prod(trimmed) >= hw_props.warp_size - and prod(trimmed) % hw_props.warp_size == 0 + prod(trimmed) >= hw_props.warp_size + and prod(trimmed) % hw_props.warp_size == 0 ): # case 1: greater than min block size -> use trimmed result return check_sizes_and_return(tuple(trimmed)) @@ -398,7 +401,9 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): # case 2: trimmed block is equivalent to the whole iteration space if all(b == i for b, i in zip(trimmed, it_space)): return check_sizes_and_return( - self._round_block_sizes_to_warp_size(tuple(trimmed), hw_props.warp_size) + self._round_block_sizes_to_warp_size( + tuple(trimmed), hw_props.warp_size + ) ) else: # double block size in each dimension until block is large enough (or case 2 triggers) @@ -407,9 +412,9 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): # trim fastest moving dim to multiples of warp size if ( - d == 0 - and trimmed[d] > hw_props.warp_size - and trimmed[d] % hw_props.warp_size != 0 + d == 0 + and trimmed[d] > hw_props.warp_size + and trimmed[d] % hw_props.warp_size != 0 ): # subtract remainder trimmed[d] = trimmed[d] - (trimmed[d] % hw_props.warp_size) @@ -442,7 +447,8 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): if self._compute_block_size: try: computed_bs = self._compute_block_size( - num_work_items, self._init_block_size, self._hw_props) + num_work_items, self._init_block_size, self._hw_props + ) block_size = cast(dim3, computed_bs) except CodegenError as e: @@ -457,9 +463,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): grid_size = cast( dim3, - tuple( - div_ceil(wit, bs) for wit, bs in zip(num_work_items, block_size) - ), + tuple(div_ceil(wit, bs) for wit, bs in zip(num_work_items, block_size)), ) return block_size, grid_size diff --git a/tests/kernelcreation/test_gpu.py b/tests/kernelcreation/test_gpu.py index ae2a6d43d..70b4edd35 100644 --- a/tests/kernelcreation/test_gpu.py +++ b/tests/kernelcreation/test_gpu.py @@ -34,9 +34,9 @@ except ImportError: @pytest.mark.parametrize("manual_grid", [False, True]) @pytest.mark.parametrize("assume_warp_aligned_block_size", [False, True]) def test_indexing_options_3d( - indexing_scheme: str, - manual_grid: bool, - assume_warp_aligned_block_size: bool, + indexing_scheme: str, + manual_grid: bool, + assume_warp_aligned_block_size: bool, ): src, dst = fields("src, dst: [3D]") asm = Assignment( @@ -97,10 +97,10 @@ def test_indexing_options_3d( @pytest.mark.parametrize("assume_warp_aligned_block_size", [True, False]) @pytest.mark.parametrize("use_block_fitting", [True, False]) def test_block_size_adaptations( - iteration_space: tuple[int, int, int], - initial_block_size: tuple[int, int, int], - assume_warp_aligned_block_size: bool, - use_block_fitting: bool, + iteration_space: tuple[int, int, int], + initial_block_size: tuple[int, int, int], + assume_warp_aligned_block_size: bool, + use_block_fitting: bool, ): src, dst = fields("src, dst: [3D]") asm = Assignment( @@ -130,7 +130,7 @@ def test_block_size_adaptations( internal_block_size = kernel.launch_config._fit_block_size_to_it_space( iteration_space, initial_block_size, - HardwareProperties(warp_size, max_threads_per_block, max_block_sizes) + HardwareProperties(warp_size, max_threads_per_block, max_block_sizes), ) # checks if criterion for warp size alignment is fulfilled @@ -170,9 +170,7 @@ def test_block_size_adaptations( 
@pytest.mark.parametrize("manual_grid", [False, True]) @pytest.mark.parametrize("assume_warp_aligned_block_size", [False, True]) def test_indexing_options_2d( - indexing_scheme: str, - manual_grid: bool, - assume_warp_aligned_block_size: bool + indexing_scheme: str, manual_grid: bool, assume_warp_aligned_block_size: bool ): src, dst = fields("src, dst: [2D]") asm = Assignment( -- GitLab From 7b818fa39bc395619d59ce99d5d5f251d51bc63c Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Fri, 14 Mar 2025 14:36:53 +0100 Subject: [PATCH 42/54] Fix typecheck --- src/pystencils/codegen/config.py | 4 +--- src/pystencils/codegen/gpu_indexing.py | 16 +++++++--------- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py index 56d1f234c..04ceaf93c 100644 --- a/src/pystencils/codegen/config.py +++ b/src/pystencils/codegen/config.py @@ -742,9 +742,7 @@ class CreateKernelConfig(ConfigBase): UserWarning, ) - self.gpu = GpuOptions( - block_size=gpu_indexing_params.get("block_size", None) - ) + self.gpu = GpuOptions() def _deprecated_option(name, instead): # pragma: no cover diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index 31e89353c..b80a5027e 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -269,12 +269,11 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): self._assume_warp_aligned_block_size = assume_warp_aligned_block_size - self._init_block_size: dim3 | None = None + default_bs = GpuLaunchConfiguration.get_default_block_size(len(num_work_items)) + self._default_block_size = default_bs + self._init_block_size: dim3 = default_bs self._compute_block_size: ( - Callable[ - [tuple[int, ...], tuple[int, ...], HardwareProperties], tuple[int, ...] - ] - | None + Callable[[dim3, dim3, HardwareProperties], tuple[int, ...]] | None ) = None self._params: frozenset[Parameter] = frozenset().union( @@ -443,7 +442,6 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): ) block_size: dim3 - default_bs = GpuLaunchConfiguration.get_default_block_size(len(num_work_items)) if self._compute_block_size: try: computed_bs = self._compute_block_size( @@ -452,14 +450,14 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): block_size = cast(dim3, computed_bs) except CodegenError as e: - block_size = default_bs + block_size = self._default_block_size warn( f"CodeGenError occurred: {getattr(e, 'message', repr(e))}. " f"Block size fitting could not determine optimal block size configuration. " - f"Defaulting back to {default_bs}." + f"Defaulting back to {self._default_block_size}." 
) else: - block_size = default_bs + block_size = self._default_block_size grid_size = cast( dim3, -- GitLab From b400aabfc82ea32ef413f6803d41c984970fb022 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Fri, 14 Mar 2025 17:27:47 +0100 Subject: [PATCH 43/54] Adapt documentation for CUDA block sizes --- docs/source/backend/gpu_codegen.md | 14 ++++----- docs/source/user_manual/gpu_kernels.md | 43 +++++++++++++++++--------- src/pystencils/codegen/gpu_indexing.py | 37 +++++++++++++++------- 3 files changed, 61 insertions(+), 33 deletions(-) diff --git a/docs/source/backend/gpu_codegen.md b/docs/source/backend/gpu_codegen.md index 6cc00a6e2..e5035d7c7 100644 --- a/docs/source/backend/gpu_codegen.md +++ b/docs/source/backend/gpu_codegen.md @@ -72,13 +72,13 @@ These depend on the type of the launch configuration: while the `AutomaticLaunchConfiguration` permits no modification and computes grid and block size directly from kernel parameters, the `ManualLaunchConfiguration` requires the user to manually specifiy both grid and block size. -The `DynamicBlockSizeLaunchConfiguration` permits the user to set a block size which is used to then dynamically compute -the grid size. However, the actual block size being used may differ from the user specification. This may be due to -trimming operations between the original block size and the iteration space to avoid spawning unnecessarily large -blocks. In case the `GpuOptions.use_block_size_fitting` option is set, a block fitting algorithm adapts the original -block size such that it aligns with the warp size given by the hardware for improved performance. The algorithm -`GpuLaunchConfiguration.fit_block_size` incrementally increases the trimmed block size until it is rounded to a multiple -of the warp size while considering the maximum amount of threads per block. +The `DynamicBlockSizeLaunchConfiguration` dynamically computes the grid size from either the default block size +or a computed block size. Computing block sizes can be signaled by the user via the `trim_block_size` or +`fit_block_size` member functions. These function receive an initial block size as an argument and adapt it. +The `trim_block_size` function trims the initial block size with the sizes of the iteration space, i.e. it takes +the minimum value of both sizes per dimension. The `fit_block_size` performs a block fitting algorithm that adapts +the initial block size by incrementally enlarging the trimmed block size until it is large enough +and aligns with the warp size. The `evaluate` method can only be used from within a Python runtime environment. When exporting pystencils CUDA kernels for external use in C++ projects, diff --git a/docs/source/user_manual/gpu_kernels.md b/docs/source/user_manual/gpu_kernels.md index 2443108c1..0a6620123 100644 --- a/docs/source/user_manual/gpu_kernels.md +++ b/docs/source/user_manual/gpu_kernels.md @@ -111,20 +111,30 @@ For most kernels with an at most three-dimensional iteration space, this behavior is sufficient and desired. It can be enforced by setting `gpu.indexing_scheme = "Linear3D"`. -If the `Linear3D` indexing scheme is used, you may modify the GPU thread block size in two places. -The default block size for the kernel can be set via the {any}`gpu.block_size <GpuOptions.block_size>` -code generator option; -if none is specified, a default depending on the iteration space's dimensionality will be used. -Please note that this **initial block size configuration may be altered** by the code generator. 
- -The block size can furthermore be modified at the compiled kernel's wrapper object via the -`launch_config.block_size` attribute: +The GPU thread block size of a compiled kernel's wrapper object can only be directly modified +for manual launch configurations, cf. the section for [manual launch configurations](#manual_launch_grids). +Linear indexing schemes without manual launch configurations either employ default block sizes +or compute block sizes using user-exposed member functions that adapt the initial block size that was +passed as an argument. The available functions are `fit_block_size` and `trim_block_size`. +The `trim_block_size` function trims the user-defined initial block size with the iteration space. +The `fit_block_size` employs a fitting algorithm that finds a suitable block configuration that is +divisible by the hardware's warp size. ```{code-cell} ipython3 kfunc = kernel.compile() -kfunc.launch_config.block_size = (256, 2, 1) -# Run the kernel +# three different configuration cases for block size: + +# a) Nothing +# -> use default block size, perform no fitting, no trimming + +# b) Activate fitting with initial block size +kfunc.launch_config.fit_block_size(256, 2, 1) + +# c) Activate trimming with initial block size +kfunc.launch_config.trim_block_size(256, 2, 1) + +# finally run the kernel... kfunc(f=f_arr, g=g_arr) ``` @@ -132,12 +142,17 @@ Block sizes aligning with multiples of the hardware's warp size allow for a bett Even though such block size configurations are not enforced, notifying our code generator via the `GpuOptions.assume_warp_aligned_block_size` option that the configured block size is divisible by the warp size allows for further optimization potential, e.g. for warp-level reductions. -When setting this option to `True`, the user has to make sure that this alignment applies. This can be achieved by -manually providing suitable block sizes via the `launch_config.block_size` attribute or by enabling the option -`GpuOptions.fit_block_size` to automatically adapt the initial block size correspondingly. +When setting this option to `True`, the user has to make sure that this alignment applies. +For [manual launch configurations](#manual_launch_grids), this can be achieved by manually providing suitable +block sizes via the `launch_config.block_size`. For the other launch configurations, this criterion is guaranteed +by using the default block sizes in pystencils. Using `fit_block_size` also guarantees this while also producing +block sizes that are better customized towards the kernel's iteration space. For `trim_block_size`, +the trimmed block's dimension that is closest the next multiple of the warp size is rounded up to the next multiple +of the warp size. For all cases, the final block size is checked against the imposed hardware limits and an error +is thrown in case these limits are exceeded. In any case. pystencils will automatically compute the grid size from the shapes of the kernel's array arguments -and the given thread block size. +and the final thread block size. 
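For the manual case mentioned above, a sketch in the spirit of the tests in this series (assuming the usual top-level pystencils imports, the CUDA target and cupy; field shapes, block and grid sizes are arbitrary examples):

```python
import cupy as cp
from pystencils import Assignment, CreateKernelConfig, Target, create_kernel, fields

src, dst = fields("src, dst: [3D]")
asm = Assignment(dst.center(), 2 * src.center())

cfg = CreateKernelConfig(target=Target.CUDA)
cfg.gpu.indexing_scheme = "linear3d"
cfg.gpu.manual_launch_grid = True
cfg.gpu.assume_warp_aligned_block_size = True  # we promise a warp-aligned block size

kernel = create_kernel(asm, cfg).compile()

src_arr = cp.ones((34, 34, 34))
dst_arr = cp.zeros_like(src_arr)

kernel.launch_config.block_size = (32, 8, 2)  # 512 threads, a multiple of a 32-thread warp
kernel.launch_config.grid_size = (2, 5, 17)   # chosen to cover the whole iteration space
kernel(src=src_arr, dst=dst_arr)
```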
:::{attention} diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index b80a5027e..026d4bf07 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -230,27 +230,34 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration): class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): - """GPU launch configuration that permits the user to set a block size and dynamically computes the grid size. - - The actual launch grid size is computed from the user-defined ``user_block_size``, the number of work items - in the kernel's iteration space and potentially the `GpuOptions.warp_size` as follows. + """GPU launch configuration that dynamically computes the grid size from either the default block size + or a computed block size. Computing block sizes can be triggerred via the `trim_block_size` or `fit_block_size` + member functions. These functions adapt a user-defined initial block size that they receive as an argument. + The adaptation of the initial block sizes is described in the following: For each dimension :math:`c \\in \\{ x, y, z \\}`, - - if `GpuOptions.use_block_size_fitting`: - In case the user-specified block size does not coincide with multiples of the warp size of the GPU, - it will be altered by a block size fitting algorithm, i.e. + - if `fit_block_size` was chosen, the initial block size is adapted such that it aligns with + multiples of the hardware's warp size. This is done using a fitting algorithm first trims + the initial block size with the iteration space and increases it incrementally until it is large enough + and coincides with multiples of the warp size, i.e. - ``block_size.c = fit_block_size(user_block_size.c)`` + ``block_size.c = _fit_block_size_to_it_space(iter_space.c, init_block_size.c, hardware_properties)`` The fitted block size also guarantees the user usage of `GpuOptions.assume_warp_aligned_block_size`. - - otherwise: - In this case trimming between the number of work items and the kernel's iteration space occurs, i.e. + - elif `trim_block_size` was chosen, a trimming between the number of work items + and the kernel's iteration space occurs, i.e. + + - if ``init_block_size.c > num_work_items.c``, ``block_size = num_work_items.c`` + - otherwise, ``block_size.c = init_block_size.c`` - - if ``user_block_size.c > num_work_items.c``, ``block_size = num_work_items.c`` - - otherwise, ``block_size.c = user_block_size.c`` + When `GpuOptions.assume_warp_aligned_block_size` is set, we ensure warp-alignment by + rounding the block size dimension that is closest the next multiple of the warp size. + - else the default block size is taken, i.e. + + ``block_size.c = get_default_block_size(rank=3).c`` The actual launch grid size is then computed as follows. 
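To make the three cases described in this docstring concrete, here is a simplified standalone model (this is not the pystencils implementation; the fitting step here merely grows the fastest dimension until the thread count is a whole number of 32-thread warps):

```python
from math import prod

WARP = 32
DEFAULT_3D = (8, 8, 4)

def choose_block_size(it_space, init=None, mode="default"):
    if init is None or mode == "default":
        return DEFAULT_3D
    trimmed = tuple(min(b, i) for b, i in zip(init, it_space))
    if mode == "trim":
        return trimmed
    # mode == "fit": enlarge the fastest dimension until the block is warp-aligned
    x, y, z = trimmed
    while prod((x, y, z)) < WARP or prod((x, y, z)) % WARP != 0:
        x += 1
    return (x, y, z)

print(choose_block_size((64, 64, 64)))                      # (8, 8, 4): default
print(choose_block_size((40, 10, 10), (64, 8, 8), "trim"))  # (40, 8, 8): trimmed
print(choose_block_size((40, 10, 10), (16, 8, 8), "fit"))   # already warp-aligned -> (16, 8, 8)
```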
@@ -330,6 +337,9 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): ): return self._trim_block_size_to_it_space(it, bs, hw) + if self._compute_block_size: + warn("Overwriting an already existing callback function for computing the block size.") + self._init_block_size = block_size self._compute_block_size = call_trimming_factory @@ -363,6 +373,9 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): ): return self._fit_block_size_to_it_space(it, bs, hw) + if self._compute_block_size: + warn("Overwriting an already existing callback function for computing the block size.") + self._init_block_size = block_size self._compute_block_size = call_fitting_factory -- GitLab From 403cbbcf05264e6096e40151aea78161acbb1002 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Mon, 17 Mar 2025 13:44:48 +0100 Subject: [PATCH 44/54] Fix docs --- docs/source/user_manual/gpu_kernels.md | 15 ++++++++------- src/pystencils/codegen/gpu_indexing.py | 22 ++++++++++++---------- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/docs/source/user_manual/gpu_kernels.md b/docs/source/user_manual/gpu_kernels.md index 0a6620123..146c2c428 100644 --- a/docs/source/user_manual/gpu_kernels.md +++ b/docs/source/user_manual/gpu_kernels.md @@ -115,12 +115,13 @@ The GPU thread block size of a compiled kernel's wrapper object can only be dire for manual launch configurations, cf. the section for [manual launch configurations](#manual_launch_grids). Linear indexing schemes without manual launch configurations either employ default block sizes or compute block sizes using user-exposed member functions that adapt the initial block size that was -passed as an argument. The available functions are `fit_block_size` and `trim_block_size`. -The `trim_block_size` function trims the user-defined initial block size with the iteration space. -The `fit_block_size` employs a fitting algorithm that finds a suitable block configuration that is +passed as an argument. The available functions are :meth:`fit_block_size` and :meth:`trim_block_size`. +The :meth:`trim_block_size` function trims the user-defined initial block size with the iteration space. +The :meth:`fit_block_size` employs a fitting algorithm that finds a suitable block configuration that is divisible by the hardware's warp size. ```{code-cell} ipython3 +:tags: [raises-exception] kfunc = kernel.compile() # three different configuration cases for block size: @@ -129,10 +130,10 @@ kfunc = kernel.compile() # -> use default block size, perform no fitting, no trimming # b) Activate fitting with initial block size -kfunc.launch_config.fit_block_size(256, 2, 1) +kfunc.launch_config.fit_block_size((8, 8, 4)) # c) Activate trimming with initial block size -kfunc.launch_config.trim_block_size(256, 2, 1) +#kfunc.launch_config.trim_block_size((8, 8, 4)) # finally run the kernel... kfunc(f=f_arr, g=g_arr) @@ -145,8 +146,8 @@ for further optimization potential, e.g. for warp-level reductions. When setting this option to `True`, the user has to make sure that this alignment applies. For [manual launch configurations](#manual_launch_grids), this can be achieved by manually providing suitable block sizes via the `launch_config.block_size`. For the other launch configurations, this criterion is guaranteed -by using the default block sizes in pystencils. Using `fit_block_size` also guarantees this while also producing -block sizes that are better customized towards the kernel's iteration space. 
For `trim_block_size`, +by using the default block sizes in pystencils. Using :meth:`fit_block_size` also guarantees this while also producing +block sizes that are better customized towards the kernel's iteration space. For :meth:`trim_block_size`, the trimmed block's dimension that is closest the next multiple of the warp size is rounded up to the next multiple of the warp size. For all cases, the final block size is checked against the imposed hardware limits and an error is thrown in case these limits are exceeded. diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index 026d4bf07..16b0826e7 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -231,23 +231,25 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration): class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): """GPU launch configuration that dynamically computes the grid size from either the default block size - or a computed block size. Computing block sizes can be triggerred via the `trim_block_size` or `fit_block_size` - member functions. These functions adapt a user-defined initial block size that they receive as an argument. - The adaptation of the initial block sizes is described in the following: + or a computed block size. Computing block sizes can be triggerred via the :meth:`trim_block_size` or + :meth:`fit_block_size` member functions. These functions adapt a user-defined initial block size that they + receive as an argument. The adaptation of the initial block sizes is described in the following: For each dimension :math:`c \\in \\{ x, y, z \\}`, - - if `fit_block_size` was chosen, the initial block size is adapted such that it aligns with - multiples of the hardware's warp size. This is done using a fitting algorithm first trims - the initial block size with the iteration space and increases it incrementally until it is large enough - and coincides with multiples of the warp size, i.e. + - if :meth:`fit_block_size` was chosen: + + the initial block size is adapted such that it aligns with multiples of the hardware's warp size. + This is done using a fitting algorithm first trims the initial block size with the iteration space + and increases it incrementally until it is large enough and coincides with multiples of the warp size, i.e. ``block_size.c = _fit_block_size_to_it_space(iter_space.c, init_block_size.c, hardware_properties)`` The fitted block size also guarantees the user usage of `GpuOptions.assume_warp_aligned_block_size`. - - elif `trim_block_size` was chosen, a trimming between the number of work items - and the kernel's iteration space occurs, i.e. + - elif :meth:`trim_block_size` was chosen: + + a trimming between the number of work items and the kernel's iteration space occurs, i.e. - if ``init_block_size.c > num_work_items.c``, ``block_size = num_work_items.c`` - otherwise, ``block_size.c = init_block_size.c`` @@ -255,7 +257,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): When `GpuOptions.assume_warp_aligned_block_size` is set, we ensure warp-alignment by rounding the block size dimension that is closest the next multiple of the warp size. - - else the default block size is taken, i.e. + - otherwise: the default block size is taken i.e. 
``block_size.c = get_default_block_size(rank=3).c`` -- GitLab From 27ae1f428a94163528d8ee3d4cd282e08f0f5c1a Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Tue, 18 Mar 2025 12:00:36 +0100 Subject: [PATCH 45/54] Adapt warnings --- src/pystencils/codegen/gpu_indexing.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index 16b0826e7..a497411c7 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -139,7 +139,7 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration): @block_size.setter def block_size(self, val: dim3): - warn("Fruitless attempt to set `block_size`.") + warn("Setting `block_size` on an automatic launch configuration has no effect.", UserWarning) @property def parameters(self) -> frozenset[Parameter]: @@ -307,7 +307,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): @block_size.setter def block_size(self, val: dim3): - warn("Fruitless attempt to set `block_size`.") + warn("Setting `block_size` on an dynamic launch configuration has no effect.", UserWarning) @staticmethod def _round_block_sizes_to_warp_size( @@ -339,9 +339,6 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): ): return self._trim_block_size_to_it_space(it, bs, hw) - if self._compute_block_size: - warn("Overwriting an already existing callback function for computing the block size.") - self._init_block_size = block_size self._compute_block_size = call_trimming_factory @@ -375,9 +372,6 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): ): return self._fit_block_size_to_it_space(it, bs, hw) - if self._compute_block_size: - warn("Overwriting an already existing callback function for computing the block size.") - self._init_block_size = block_size self._compute_block_size = call_fitting_factory -- GitLab From a6125bc7beb1e69bcfbf06b20d6906e6cd5ede5d Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Tue, 18 Mar 2025 12:01:47 +0100 Subject: [PATCH 46/54] Move block_size_exceeds_hw_limits to HardwareProperties --- src/pystencils/codegen/gpu_indexing.py | 36 +++++++++++++------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index a497411c7..042af70db 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -34,6 +34,20 @@ class HardwareProperties: max_threads_per_block: int max_block_sizes: dim3 + def block_size_exceeds_hw_limits( + self, + block_size: tuple[int, ...] + ) -> bool: + """Checks if provided block size conforms limits given by the hardware.""" + + return ( + any( + size > max_size + for size, max_size in zip(block_size, self.max_block_sizes) + ) + or prod(block_size) > self.max_threads_per_block + ) + class GpuLaunchConfiguration(ABC): """Base class for launch configurations for CUDA and HIP kernels. 
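A self-contained sketch of the `HardwareProperties`-style check that was just moved onto the dataclass (the limits are typical CUDA values used for illustration, not queried from a real device):

```python
from dataclasses import dataclass
from math import prod

@dataclass
class HwProps:
    warp_size: int
    max_threads_per_block: int
    max_block_sizes: tuple[int, int, int]

    def block_size_exceeds_hw_limits(self, block_size: tuple[int, ...]) -> bool:
        # reject if any dimension or the total thread count exceeds the device limits
        return (
            any(b > m for b, m in zip(block_size, self.max_block_sizes))
            or prod(block_size) > self.max_threads_per_block
        )

hw = HwProps(32, 1024, (1024, 1024, 64))
print(hw.block_size_exceeds_hw_limits((16, 16, 4)))  # False: 1024 threads, within limits
print(hw.block_size_exceeds_hw_limits((8, 8, 128)))  # True: z exceeds its per-dimension limit
```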
@@ -90,20 +104,6 @@ class GpuLaunchConfiguration(ABC): case _: assert False, "unreachable code" - @staticmethod - def block_size_exceeds_hw_limits( - block_size: tuple[int, ...], hw_props: HardwareProperties - ) -> bool: - """Checks if provided block size conforms limits given by the hardware.""" - - return ( - any( - size > max_size - for size, max_size in zip(block_size, hw_props.max_block_sizes) - ) - or prod(block_size) > hw_props.max_threads_per_block - ) - @staticmethod def _excessive_block_size_error_msg(block_size: tuple[int, ...]): return f"Unable to determine GPU block size for this kernel. \ @@ -158,7 +158,7 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration): *block_size[1:], ) - if self.block_size_exceeds_hw_limits(block_size, self._hw_props): + if self._hw_props.block_size_exceeds_hw_limits(block_size): raise CodegenError(f"Block size {block_size} exceeds hardware limits.") grid_size = tuple(int(gs(**kwargs)) for gs in self._grid_size) @@ -220,7 +220,7 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration): "`assume_warp_aligned_block_size` enabled." ) - if self.block_size_exceeds_hw_limits(self._block_size, self._hw_props): + if self._hw_props.block_size_exceeds_hw_limits(self._block_size): raise CodegenError(self._excessive_block_size_error_msg(self._block_size)) return self._block_size, self._grid_size @@ -353,7 +353,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): """ ret = tuple([min(b, i) for b, i in zip(block_size, it_space)]) - if self.block_size_exceeds_hw_limits(ret, hw_props): + if hw_props.block_size_exceeds_hw_limits(ret): raise CodegenError(self._excessive_block_size_error_msg(ret)) if ( @@ -389,7 +389,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): return [min(b, i) for b, i in zip(to_trim, it_space)] def check_sizes_and_return(ret: tuple[int, ...]) -> tuple[int, ...]: - if self.block_size_exceeds_hw_limits(ret, hw_props): + if hw_props.block_size_exceeds_hw_limits(ret): raise CodegenError(self._excessive_block_size_error_msg(ret)) return ret -- GitLab From 74b1457af0851606743d6421a61bc0396edfca1f Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Tue, 18 Mar 2025 12:02:12 +0100 Subject: [PATCH 47/54] Remove _default_block_size from CupyJit --- src/pystencils/jit/gpu_cupy.py | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py index 7d9db4711..5a078df9f 100644 --- a/src/pystencils/jit/gpu_cupy.py +++ b/src/pystencils/jit/gpu_cupy.py @@ -207,22 +207,9 @@ class CupyKernelWrapper(KernelWrapper): class CupyJit(JitBase): - def __init__( - self, - default_block_size: Sequence[int] = GpuLaunchConfiguration.get_default_block_size(rank=3) - ): + def __init__(self): self._runtime_headers = {"<cstdint>"} - if len(default_block_size) > 3: - raise ValueError( - f"Invalid block size: {default_block_size}. Must be at most three-dimensional." 
- ) - - self._default_block_size: tuple[int, int, int] = cast( - tuple[int, int, int], - tuple(default_block_size) + (1,) * (3 - len(default_block_size)), - ) - def compile(self, kernel: Kernel) -> KernelWrapper: if not HAVE_CUPY: raise JitError( -- GitLab From 5737f64d2769201388fc1eaa4124e30b0a9c15cd Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Tue, 18 Mar 2025 12:02:42 +0100 Subject: [PATCH 48/54] Adapt docs --- docs/source/user_manual/gpu_kernels.md | 2 +- src/pystencils/codegen/config.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/user_manual/gpu_kernels.md b/docs/source/user_manual/gpu_kernels.md index 146c2c428..48a24e703 100644 --- a/docs/source/user_manual/gpu_kernels.md +++ b/docs/source/user_manual/gpu_kernels.md @@ -133,7 +133,7 @@ kfunc = kernel.compile() kfunc.launch_config.fit_block_size((8, 8, 4)) # c) Activate trimming with initial block size -#kfunc.launch_config.trim_block_size((8, 8, 4)) +kfunc.launch_config.trim_block_size((8, 8, 4)) # finally run the kernel... kfunc(f=f_arr, g=g_arr) diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py index 04ceaf93c..d6f8e403c 100644 --- a/src/pystencils/codegen/config.py +++ b/src/pystencils/codegen/config.py @@ -391,7 +391,7 @@ class GpuOptions(ConfigBase): If set to `True`, the code generator can employ optimizations that require this assumption, e.g. warp-level reductions. - The generator also checks if user-provided block sizes fulfill this criterion. + The pystencils Cupy runtime also checks if user-provided block sizes fulfill this criterion. """ @staticmethod -- GitLab From d733102e178bca762284a640d0084a7a889ceceb Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Tue, 18 Mar 2025 12:08:14 +0100 Subject: [PATCH 49/54] Move ceil_to_multiple of block sizes for automatic launch configs to factory --- src/pystencils/codegen/gpu_indexing.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index 042af70db..724106ec2 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -146,17 +146,7 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration): return self._params def evaluate(self, **kwargs) -> tuple[dim3, dim3]: - # impossible to use block size determination function since the iteration space is unknown - # -> round block size in fastest moving dimension up to multiple of warp size block_size = tuple(int(bs(**kwargs)) for bs in self._block_size) - block_size = ( - ( - ceil_to_multiple(block_size[0], self._hw_props.warp_size) - if self._assume_warp_aligned_block_size - else block_size[0] - ), - *block_size[1:], - ) if self._hw_props.block_size_exceeds_hw_limits(block_size): raise CodegenError(f"Block size {block_size} exceeds hardware limits.") @@ -610,8 +600,16 @@ class GpuIndexing: if rank > 4: raise ValueError(f"Iteration space rank is too large: {rank}") + # impossible to use block size determination function since the iteration space is unknown + # -> round block size in fastest moving dimension up to multiple of warp size + rounded_block_size: PsExpression + if self._assume_warp_aligned_block_size: + rounded_block_size = ceil_to_multiple(work_items[0], self._hw_props.warp_size) + else: + rounded_block_size = work_items[0] + block_size = ( - self._kernel_factory.create_lambda(work_items[0]), + self._kernel_factory.create_lambda(rounded_block_size), 
             self._kernel_factory.create_lambda(self._ast_factory.parse_index(1)),
             self._kernel_factory.create_lambda(self._ast_factory.parse_index(1)),
         )
-- 
GitLab


From 3ad5b4a4ea09aa82b1fc4f24771cf4b691e3969e Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Tue, 18 Mar 2025 12:09:25 +0100
Subject: [PATCH 50/54] Fix lint

---
 src/pystencils/jit/gpu_cupy.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py
index 5a078df9f..4e36e369e 100644
--- a/src/pystencils/jit/gpu_cupy.py
+++ b/src/pystencils/jit/gpu_cupy.py
@@ -1,4 +1,4 @@
-from typing import Any, Sequence, cast, Callable
+from typing import Any, Callable
 from dataclasses import dataclass
 
 try:
-- 
GitLab


From a3fd0c243d713b9e22a04779c360af0786bc9732 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Tue, 18 Mar 2025 12:36:06 +0100
Subject: [PATCH 51/54] Fix type for ceil_to_multiple in automatic launch
 config lambda

---
 src/pystencils/codegen/gpu_indexing.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 724106ec2..52efee81d 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -11,6 +11,7 @@ from .parameters import Parameter
 from .errors import CodegenError
 from .config import GpuIndexingScheme
 from .target import Target
+from ..backend.constants import PsConstant
 
 from ..backend.kernelcreation import (
     KernelCreationContext,
@@ -604,7 +605,9 @@ class GpuIndexing:
         # -> round block size in fastest moving dimension up to multiple of warp size
         rounded_block_size: PsExpression
         if self._assume_warp_aligned_block_size:
-            rounded_block_size = ceil_to_multiple(work_items[0], self._hw_props.warp_size)
+            rounded_block_size = ceil_to_multiple(
+                work_items[0],
+                PsExpression.make(PsConstant(self._hw_props.warp_size, work_items[0].dtype)))
         else:
             rounded_block_size = work_items[0]
 
-- 
GitLab


From 1a2b8f74552467455d0dcbd1cf023646ad2a7f41 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Tue, 18 Mar 2025 12:48:53 +0100
Subject: [PATCH 52/54] Throw attribute error for wrongfully used block size
 setter

---
 src/pystencils/codegen/gpu_indexing.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 52efee81d..bcffbbcae 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -140,7 +140,7 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
 
     @block_size.setter
     def block_size(self, val: dim3):
-        warn("Setting `block_size` on an automatic launch configuration has no effect.", UserWarning)
+        raise AttributeError("Setting `block_size` on an automatic launch configuration has no effect.")
 
     @property
     def parameters(self) -> frozenset[Parameter]:
@@ -298,7 +298,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
 
     @block_size.setter
     def block_size(self, val: dim3):
-        warn("Setting `block_size` on a dynamic launch configuration has no effect.", UserWarning)
+        raise AttributeError("Setting `block_size` on a dynamic launch configuration has no effect.")
 
     @staticmethod
     def _round_block_sizes_to_warp_size(
-- 
GitLab


From f65cd9bf27082525cfb97cf3958bda75bae4719e Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Tue, 18 Mar 2025 12:50:14 +0100
Subject: [PATCH 53/54] Create expression for
ceil_to_multiple manually --- src/pystencils/codegen/gpu_indexing.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index bcffbbcae..d5f0aead2 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -20,7 +20,7 @@ from ..backend.kernelcreation import ( ) from ..backend.platforms.cuda import ThreadMapping -from ..backend.ast.expressions import PsExpression +from ..backend.ast.expressions import PsExpression, PsIntDiv from math import prod from ..utils import ceil_to_multiple @@ -605,9 +605,10 @@ class GpuIndexing: # -> round block size in fastest moving dimension up to multiple of warp size rounded_block_size: PsExpression if self._assume_warp_aligned_block_size: - rounded_block_size = ceil_to_multiple( - work_items[0], - PsExpression.make(PsConstant(self._hw_props.warp_size, work_items[0].dtype))) + warp_size = self._ast_factory.parse_index(self._hw_props.warp_size) + rounded_block_size = self._ast_factory.parse_index( + PsIntDiv(work_items[0].clone() + warp_size.clone() - self._ast_factory.parse_index(1), + warp_size.clone()) * warp_size.clone()) else: rounded_block_size = work_items[0] -- GitLab From c84ad702852e746a19221cfb80bdc2cefc634918 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Tue, 18 Mar 2025 14:28:13 +0100 Subject: [PATCH 54/54] Fix lint --- src/pystencils/codegen/gpu_indexing.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index d5f0aead2..d473e5b4a 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -11,7 +11,6 @@ from .parameters import Parameter from .errors import CodegenError from .config import GpuIndexingScheme from .target import Target -from ..backend.constants import PsConstant from ..backend.kernelcreation import ( KernelCreationContext, -- GitLab
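
Note on the arithmetic behind PATCH 53/54: the PsIntDiv expression assembled there is the usual
round-up-to-a-multiple identity, ((x + w - 1) // w) * w with w the warp size, i.e. the same rounding
that the `ceil_to_multiple` helper expresses, now spelled out directly on index expressions.
A minimal plain-Python sketch of just that arithmetic follows (a standalone illustration with a
hypothetical function name, not the pystencils implementation itself):

def round_up_to_multiple(value: int, factor: int) -> int:
    # Adding (factor - 1) before the integer division turns floor division into
    # ceiling division; multiplying back by `factor` yields the next multiple.
    return ((value + factor - 1) // factor) * factor

# e.g. rounding the fastest-moving block dimension up to a warp size of 32:
assert round_up_to_multiple(100, 32) == 128
assert round_up_to_multiple(64, 32) == 64

Building the expression from PsIntDiv and parse_index, instead of calling ceil_to_multiple on a
PsConstant operand as in PATCH 51, is also what lets PATCH 54 drop the now-unused PsConstant import.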