From 7c371fb43a0eb731268fa3fe8fd7861f0b4b5f68 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Wed, 19 Mar 2025 13:03:39 +0000
Subject: [PATCH] make no assumptions about warp_size for Target.HIP

---
 src/pystencils/codegen/driver.py       |  4 +++-
 src/pystencils/codegen/gpu_indexing.py | 20 ++++++++++++++------
 tests/kernelcreation/test_gpu.py       |  5 +++--
 3 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index 543d9db2f..e9fc69b76 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 from typing import cast, Sequence, Callable, TYPE_CHECKING
 from dataclasses import dataclass, replace
+from warnings import warn
 
 from .target import Target
 from .config import (
@@ -410,7 +411,8 @@ class DefaultKernelCreationDriver:
         if warp_size is None:
             warp_size = GpuOptions.default_warp_size(self._target)
 
-        # TODO: Warn if warp_size is None and assume_warp_aligned_block_size is True
+        if warp_size is None and assume_warp_aligned_block_size:
+            warn("GPU warp size is unknown - ignoring assumption `assume_warp_aligned_block_size`.")
 
         return GpuIndexing(
             self._ctx,
diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index b5e70043f..09570e345 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -30,7 +30,7 @@ _Dim3Lambda = tuple[Lambda, Lambda, Lambda]
 
 @dataclass
 class HardwareProperties:
-    warp_size: int
+    warp_size: int | None
     max_threads_per_block: int
     max_block_sizes: dim3
 
@@ -204,6 +204,7 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration):
 
         if (
             self._assume_warp_aligned_block_size
+            and self._hw_props.warp_size is not None
             and prod(self._block_size) % self._hw_props.warp_size != 0
         ):
             raise CodegenError(
@@ -316,7 +317,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
             return (
                 *to_round[:index_to_round],
                 ceil_to_multiple(to_round[index_to_round], warp_size),
-                *to_round[index_to_round + 1 :],
+                *to_round[index_to_round + 1:],
             )
         else:
             return (
@@ -351,7 +352,8 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
 
         if (
             self._assume_warp_aligned_block_size
-            and prod(ret) % self._hw_props.warp_size != 0
+            and hw_props.warp_size is not None
+            and prod(ret) % hw_props.warp_size != 0
         ):
             self._round_block_sizes_to_warp_size(ret, hw_props.warp_size)
 
@@ -387,6 +389,10 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
             return ret
 
         trimmed = trim(list(block_size))
+
+        if hw_props.warp_size is None:
+            return tuple(trimmed)
+
         if (
             prod(trimmed) >= hw_props.warp_size
             and prod(trimmed) % hw_props.warp_size == 0
@@ -493,14 +499,13 @@ class GpuIndexing:
         ctx: KernelCreationContext,
         target: Target,
         scheme: GpuIndexingScheme,
-        warp_size: int,
+        warp_size: int | None,
         manual_launch_grid: bool = False,
         assume_warp_aligned_block_size: bool = False,
     ) -> None:
         self._ctx = ctx
         self._target = target
         self._scheme = scheme
-        self._warp_size = warp_size
         self._manual_launch_grid = manual_launch_grid
         self._assume_warp_aligned_block_size = assume_warp_aligned_block_size
 
@@ -608,7 +613,10 @@ class GpuIndexing:
         # impossible to use block size determination function since the iteration space is unknown
         # -> round block size in fastest moving dimension up to multiple of warp size
         rounded_block_size: PsExpression
-        if self._assume_warp_aligned_block_size:
+        if (
+            self._assume_warp_aligned_block_size
+            and self._hw_props.warp_size is not None
+        ):
             warp_size = self._ast_factory.parse_index(self._hw_props.warp_size)
             rounded_block_size = self._ast_factory.parse_index(
                 PsIntDiv(
diff --git a/tests/kernelcreation/test_gpu.py b/tests/kernelcreation/test_gpu.py
index bbe9aedd9..a3f8a5482 100644
--- a/tests/kernelcreation/test_gpu.py
+++ b/tests/kernelcreation/test_gpu.py
@@ -145,8 +145,9 @@ def test_block_size_adaptations(
     cfg = CreateKernelConfig(target=target)
     cfg.gpu.indexing_scheme = "linear3d"
     cfg.gpu.assume_warp_aligned_block_size = assume_warp_aligned_block_size
-
-    warp_size = cfg.gpu.default_warp_size(cfg.get_target())
+
+    warp_size = 32
+    cfg.gpu.warp_size = warp_size
 
     ast = create_kernel(asm, cfg)
     kernel = ast.compile()
-- 
GitLab