Commit 7c371fb4 authored by Frederik Hennig

make no assumptions about warp_size for Target.HIP

parent 9e729903
Merge request !458: HIP Target and Platform
Pipeline #76708 canceled
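
The change below threads an optional warp size through the code generation pipeline. For Target.HIP no warp width is assumed, since AMD GPUs execute in wavefronts of either 32 or 64 lanes depending on the architecture; NVIDIA targets keep their fixed 32-lane default. A minimal sketch of the resulting lookup (the real `GpuOptions.default_warp_size` takes a `Target` enum; the body shown here is an assumption, not the shipped code):

    from __future__ import annotations

    def default_warp_size(target: str) -> int | None:
        # Hypothetical stand-in: strings instead of the Target enum.
        if target == "CUDA":
            return 32    # NVIDIA warps are always 32 lanes wide
        return None      # HIP: wavefront width (32 or 64) is device-dependent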
 from __future__ import annotations
 from typing import cast, Sequence, Callable, TYPE_CHECKING
 from dataclasses import dataclass, replace
+from warnings import warn
 
 from .target import Target
 from .config import (
@@ -410,7 +411,8 @@ class DefaultKernelCreationDriver:
         if warp_size is None:
             warp_size = GpuOptions.default_warp_size(self._target)
 
-        # TODO: Warn if warp_size is None and assume_warp_aligned_block_size is True
+        if warp_size is None and assume_warp_aligned_block_size:
+            warn("GPU warp size is unknown - ignoring assumption `assume_warp_aligned_block_size`.")
 
         return GpuIndexing(
             self._ctx,
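
Requesting warp-aligned block sizes on a target with unknown warp size now degrades to a warning instead of silently assuming a width. A self-contained sketch of the observable behavior (variable names mirror the hunk above; the surrounding driver is elided):

    import warnings

    warp_size = None                        # e.g. Target.HIP with no user override
    assume_warp_aligned_block_size = True

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        if warp_size is None and assume_warp_aligned_block_size:
            warnings.warn(
                "GPU warp size is unknown - ignoring assumption "
                "`assume_warp_aligned_block_size`."
            )
    assert len(caught) == 1                 # the assumption was dropped, with notice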
@@ -30,7 +30,7 @@ _Dim3Lambda = tuple[Lambda, Lambda, Lambda]
 
 @dataclass
 class HardwareProperties:
-    warp_size: int
+    warp_size: int | None
     max_threads_per_block: int
     max_block_sizes: dim3
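
`HardwareProperties.warp_size` becomes optional. A self-contained sketch of the changed dataclass (the `dim3` alias and the concrete device limits are illustrative assumptions):

    from __future__ import annotations
    from dataclasses import dataclass

    dim3 = tuple[int, int, int]  # assumed alias matching the field annotation

    @dataclass
    class HardwareProperties:
        warp_size: int | None    # None: target gives no warp-width guarantee
        max_threads_per_block: int
        max_block_sizes: dim3

    # A HIP device whose wavefront width is unknown at code generation time:
    hip_props = HardwareProperties(None, 1024, (1024, 1024, 64))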
@@ -204,6 +204,7 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration):
         if (
             self._assume_warp_aligned_block_size
+            and self._hw_props.warp_size is not None
             and prod(self._block_size) % self._hw_props.warp_size != 0
         ):
             raise CodegenError(
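
The manual launch configuration thus only validates the block size against the warp size when one is actually known. A condensed, runnable sketch of that check (`CodegenError` is replaced by `ValueError` to keep the example self-contained):

    from math import prod

    def check_block_size(
        block_size: tuple[int, int, int],
        warp_size: int | None,
        assume_warp_aligned_block_size: bool,
    ) -> None:
        if (
            assume_warp_aligned_block_size
            and warp_size is not None        # unknown warp size: skip the check
            and prod(block_size) % warp_size != 0
        ):
            raise ValueError("Block size is not a multiple of the warp size.")

    check_block_size((16, 16, 1), None, True)  # passes: warp size unknown
    check_block_size((16, 16, 1), 32, True)    # passes: 256 % 32 == 0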
@@ -316,7 +317,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
             return (
                 *to_round[:index_to_round],
                 ceil_to_multiple(to_round[index_to_round], warp_size),
-                *to_round[index_to_round + 1 :],
+                *to_round[index_to_round + 1:],
             )
         else:
             return (
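
The tuple splice above rounds a single dimension up to the next warp multiple. A plausible implementation of the `ceil_to_multiple` helper (its actual definition is not part of this diff) together with a worked example:

    def ceil_to_multiple(value: int, factor: int) -> int:
        # Round value up to the nearest multiple of factor.
        return ((value + factor - 1) // factor) * factor

    to_round = (30, 8, 4)   # illustrative block size
    index_to_round = 0      # fastest-moving dimension
    warp_size = 32

    rounded = (
        *to_round[:index_to_round],
        ceil_to_multiple(to_round[index_to_round], warp_size),
        *to_round[index_to_round + 1:],
    )
    assert rounded == (32, 8, 4)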
@@ -351,7 +352,8 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
         if (
             self._assume_warp_aligned_block_size
-            and prod(ret) % self._hw_props.warp_size != 0
+            and hw_props.warp_size is not None
+            and prod(ret) % hw_props.warp_size != 0
         ):
             self._round_block_sizes_to_warp_size(ret, hw_props.warp_size)
@@ -387,6 +389,10 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
             return ret
 
         trimmed = trim(list(block_size))
 
+        if hw_props.warp_size is None:
+            return tuple(trimmed)
+
         if (
             prod(trimmed) >= hw_props.warp_size
             and prod(trimmed) % hw_props.warp_size == 0
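
With no warp size available, the trimmed block size is now returned unchanged; warp-multiple fitting only proceeds with a concrete width. A condensed sketch of that control flow (the function name and the preceding `trim` step are simplified away):

    from math import prod

    def fit_block_size(trimmed: list[int], warp_size: int | None) -> tuple[int, ...]:
        if warp_size is None:
            return tuple(trimmed)  # new early exit: nothing to align against
        if prod(trimmed) >= warp_size and prod(trimmed) % warp_size == 0:
            return tuple(trimmed)  # already a warp multiple
        # otherwise round the fastest-moving dimension up (see the earlier hunk)
        trimmed[0] = ((trimmed[0] + warp_size - 1) // warp_size) * warp_size
        return tuple(trimmed)

    assert fit_block_size([30, 7, 4], None) == (30, 7, 4)
    assert fit_block_size([30, 7, 4], 32) == (32, 7, 4)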
@@ -493,14 +499,13 @@ class GpuIndexing:
         ctx: KernelCreationContext,
         target: Target,
         scheme: GpuIndexingScheme,
-        warp_size: int,
+        warp_size: int | None,
         manual_launch_grid: bool = False,
         assume_warp_aligned_block_size: bool = False,
     ) -> None:
         self._ctx = ctx
         self._target = target
         self._scheme = scheme
-        self._warp_size = warp_size
         self._manual_launch_grid = manual_launch_grid
         self._assume_warp_aligned_block_size = assume_warp_aligned_block_size
@@ -608,7 +613,10 @@ class GpuIndexing:
         # impossible to use block size determination function since the iteration space is unknown
         # -> round block size in fastest moving dimension up to multiple of warp size
         rounded_block_size: PsExpression
-        if self._assume_warp_aligned_block_size:
+        if (
+            self._assume_warp_aligned_block_size
+            and self._hw_props.warp_size is not None
+        ):
             warp_size = self._ast_factory.parse_index(self._hw_props.warp_size)
             rounded_block_size = self._ast_factory.parse_index(
                 PsIntDiv(
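
The `PsIntDiv` expression assembled here presumably encodes the usual integer ceiling-division rounding, evaluated at kernel launch time rather than in Python. In plain arithmetic the same computation reads:

    # Assumption: the truncated PsIntDiv expression computes
    # (bs + warp - 1) / warp * warp in integer arithmetic,
    # i.e. bs rounded up to the next multiple of warp.
    def round_up(bs: int, warp: int) -> int:
        return (bs + warp - 1) // warp * warp

    assert round_up(100, 32) == 128
    assert round_up(128, 32) == 128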
@@ -145,8 +145,9 @@ def test_block_size_adaptations(
     cfg = CreateKernelConfig(target=target)
     cfg.gpu.indexing_scheme = "linear3d"
     cfg.gpu.assume_warp_aligned_block_size = assume_warp_aligned_block_size
-    warp_size = cfg.gpu.default_warp_size(cfg.get_target())
+    warp_size = 32
+    cfg.gpu.warp_size = warp_size
 
     ast = create_kernel(asm, cfg)
     kernel = ast.compile()
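
The test now pins the warp size explicitly instead of querying a target-dependent default, which no longer exists for HIP. Users targeting HIP who want warp-aligned block sizes must do the same; a hedged usage sketch following the configuration API visible in the test:

    from pystencils import CreateKernelConfig, Target

    cfg = CreateKernelConfig(target=Target.HIP)
    cfg.gpu.indexing_scheme = "linear3d"
    cfg.gpu.assume_warp_aligned_block_size = True
    cfg.gpu.warp_size = 64  # e.g. AMD CDNA wavefronts; RDNA devices use 32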