From 7c371fb43a0eb731268fa3fe8fd7861f0b4b5f68 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Wed, 19 Mar 2025 13:03:39 +0000
Subject: [PATCH] make no assumptions about warp_size for Target.HIP

---
 src/pystencils/codegen/driver.py       |  4 +++-
 src/pystencils/codegen/gpu_indexing.py | 20 ++++++++++++++------
 tests/kernelcreation/test_gpu.py       |  5 +++--
 3 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index 543d9db2f..e9fc69b76 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 from typing import cast, Sequence, Callable, TYPE_CHECKING
 from dataclasses import dataclass, replace
+from warnings import warn
 
 from .target import Target
 from .config import (
@@ -410,7 +411,8 @@ class DefaultKernelCreationDriver:
         if warp_size is None:
             warp_size = GpuOptions.default_warp_size(self._target)
 
-        # TODO: Warn if warp_size is None and assume_warp_aligned_block_size is True
+        if warp_size is None and assume_warp_aligned_block_size:
+            warn("GPU warp size is unknown - ignoring assumption `assume_warp_aligned_block_size`.")
 
         return GpuIndexing(
             self._ctx,
diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index b5e70043f..09570e345 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -30,7 +30,7 @@ _Dim3Lambda = tuple[Lambda, Lambda, Lambda]
 
 @dataclass
 class HardwareProperties:
-    warp_size: int
+    warp_size: int | None
     max_threads_per_block: int
     max_block_sizes: dim3
 
@@ -204,6 +204,7 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration):
 
         if (
             self._assume_warp_aligned_block_size
+            and self._hw_props.warp_size is not None
             and prod(self._block_size) % self._hw_props.warp_size != 0
         ):
             raise CodegenError(
@@ -316,7 +317,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
                 return (
                     *to_round[:index_to_round],
                     ceil_to_multiple(to_round[index_to_round], warp_size),
-                    *to_round[index_to_round + 1 :],
+                    *to_round[index_to_round + 1:],
                 )
             else:
                 return (
@@ -351,7 +352,8 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
 
         if (
             self._assume_warp_aligned_block_size
-            and prod(ret) % self._hw_props.warp_size != 0
+            and hw_props.warp_size is not None
+            and prod(ret) % hw_props.warp_size != 0
         ):
             self._round_block_sizes_to_warp_size(ret, hw_props.warp_size)
 
@@ -387,6 +389,10 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
             return ret
 
         trimmed = trim(list(block_size))
+
+        if hw_props.warp_size is None:
+            return tuple(trimmed)
+
         if (
             prod(trimmed) >= hw_props.warp_size
             and prod(trimmed) % hw_props.warp_size == 0
@@ -493,14 +499,13 @@ class GpuIndexing:
         ctx: KernelCreationContext,
         target: Target,
         scheme: GpuIndexingScheme,
-        warp_size: int,
+        warp_size: int | None,
         manual_launch_grid: bool = False,
         assume_warp_aligned_block_size: bool = False,
     ) -> None:
         self._ctx = ctx
         self._target = target
         self._scheme = scheme
-        self._warp_size = warp_size
         self._manual_launch_grid = manual_launch_grid
         self._assume_warp_aligned_block_size = assume_warp_aligned_block_size
 
@@ -608,7 +613,10 @@ class GpuIndexing:
             # impossible to use block size determination function since the iteration space is unknown
             # -> round block size in fastest moving dimension up to multiple of warp size
             rounded_block_size: PsExpression
-            if self._assume_warp_aligned_block_size:
+            if (
+                self._assume_warp_aligned_block_size
+                and self._hw_props.warp_size is not None
+            ):
                 warp_size = self._ast_factory.parse_index(self._hw_props.warp_size)
                 rounded_block_size = self._ast_factory.parse_index(
                     PsIntDiv(
diff --git a/tests/kernelcreation/test_gpu.py b/tests/kernelcreation/test_gpu.py
index bbe9aedd9..a3f8a5482 100644
--- a/tests/kernelcreation/test_gpu.py
+++ b/tests/kernelcreation/test_gpu.py
@@ -145,8 +145,9 @@ def test_block_size_adaptations(
     cfg = CreateKernelConfig(target=target)
     cfg.gpu.indexing_scheme = "linear3d"
     cfg.gpu.assume_warp_aligned_block_size = assume_warp_aligned_block_size
-
-    warp_size = cfg.gpu.default_warp_size(cfg.get_target())
+
+    warp_size = 32
+    cfg.gpu.warp_size = warp_size
 
     ast = create_kernel(asm, cfg)
     kernel = ast.compile()
--
GitLab
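
Reviewer note: below is a minimal sketch of the behavior this patch introduces, assuming the public pystencils API as used in the test above (CreateKernelConfig, create_kernel) plus ps.fields and ps.Assignment for setting up a kernel, and assuming that GpuOptions.default_warp_size now returns None for Target.HIP, as the subject line implies. The field and assignment setup is purely illustrative; only the config flags come from this patch.

    import warnings
    import pystencils as ps

    # Hypothetical minimal kernel; any assignment collection would do here.
    src, dst = ps.fields("src, dst: float64[3D]")
    asm = ps.Assignment(dst.center, src.center)

    cfg = ps.CreateKernelConfig(target=ps.Target.HIP)
    cfg.gpu.indexing_scheme = "linear3d"
    cfg.gpu.assume_warp_aligned_block_size = True
    # cfg.gpu.warp_size is deliberately left unset: with this patch there is
    # no default warp size for HIP, so the driver warns and drops the
    # alignment assumption instead of silently assuming a warp size.

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        kernel = ps.create_kernel(asm, cfg)

    assert any("warp size is unknown" in str(w.message) for w in caught)

Setting cfg.gpu.warp_size = 32 explicitly, as the updated test now does, restores the warp-aligned block size rounding on all code paths.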