From 73295d2776dcf3b753e9584a422b39514088c124 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Fri, 21 Feb 2025 20:21:02 +0100
Subject: [PATCH 01/54] Throw out omit_range_check configuration option

---
 src/pystencils/backend/platforms/cuda.py | 18 ++++--------------
 src/pystencils/backend/platforms/sycl.py | 17 ++++-------------
 src/pystencils/codegen/config.py         |  8 --------
 src/pystencils/codegen/driver.py         |  4 ----
 tests/kernelcreation/test_gpu.py         |  4 +---
 5 files changed, 9 insertions(+), 42 deletions(-)

diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index e896fc2bb..3d45117e2 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -166,20 +166,16 @@ class CudaPlatform(GenericGpu):
     
     Args:
         ctx: The kernel creation context
-        omit_range_check: If `True`, generated index translation code will not check if the point identified
-            by block and thread indices is actually contained in the iteration space
         thread_mapping: Callback object which defines the mapping of thread indices onto iteration space points
     """
 
     def __init__(
         self,
         ctx: KernelCreationContext,
-        omit_range_check: bool = False,
         thread_mapping: ThreadMapping | None = None,
     ) -> None:
         super().__init__(ctx)
 
-        self._omit_range_check = omit_range_check
         self._thread_mapping = (
             thread_mapping if thread_mapping is not None else Linear3DMapping()
         )
@@ -282,8 +278,7 @@ class CudaPlatform(GenericGpu):
             indexing_decls.append(
                 self._typify(PsDeclaration(ctr_expr, ctr_mapping[dim.counter]))
             )
-            if not self._omit_range_check:
-                conds.append(PsLt(ctr_expr, dim.stop))
+            conds.append(PsLt(ctr_expr, dim.stop))
 
         if conds:
             condition: PsExpression = conds[0]
@@ -324,12 +319,7 @@ class CudaPlatform(GenericGpu):
         ]
         body.statements = mappings + body.statements
 
-        if not self._omit_range_check:
-            stop = PsExpression.make(ispace.index_list.shape[0])
-            condition = PsLt(sparse_ctr_expr.clone(), stop)
-            ast = PsBlock([sparse_idx_decl, PsConditional(condition, body)])
-        else:
-            body.statements = [sparse_idx_decl] + body.statements
-            ast = body
+        stop = PsExpression.make(ispace.index_list.shape[0])
+        condition = PsLt(sparse_ctr_expr.clone(), stop)
+        return PsBlock([sparse_idx_decl, PsConditional(condition, body)])
 
-        return ast
diff --git a/src/pystencils/backend/platforms/sycl.py b/src/pystencils/backend/platforms/sycl.py
index f3c4bb3d5..d16c4f51b 100644
--- a/src/pystencils/backend/platforms/sycl.py
+++ b/src/pystencils/backend/platforms/sycl.py
@@ -35,12 +35,10 @@ class SyclPlatform(Platform):
     def __init__(
         self,
         ctx: KernelCreationContext,
-        omit_range_check: bool = False,
         automatic_block_size: bool = False,
     ):
         super().__init__(ctx)
 
-        self._omit_range_check = omit_range_check
         self._automatic_block_size = automatic_block_size
 
     @property
@@ -136,8 +134,7 @@ class SyclPlatform(Platform):
             indexing_decls.append(
                 PsDeclaration(ctr, dim.start + work_item_idx * dim.step)
             )
-            if not self._omit_range_check:
-                conds.append(PsLt(ctr, dim.stop))
+            conds.append(PsLt(ctr, dim.stop))
 
         if conds:
             condition: PsExpression = conds[0]
@@ -182,15 +179,9 @@ class SyclPlatform(Platform):
         ]
         body.statements = mappings + body.statements
 
-        if not self._omit_range_check:
-            stop = PsExpression.make(ispace.index_list.shape[0])
-            condition = PsLt(sparse_ctr, stop)
-            ast = PsBlock([sparse_idx_decl, PsConditional(condition, body)])
-        else:
-            body.statements = [sparse_idx_decl] + body.statements
-            ast = body
-
-        return ast
+        stop = PsExpression.make(ispace.index_list.shape[0])
+        condition = PsLt(sparse_ctr, stop)
+        return PsBlock([sparse_idx_decl, PsConditional(condition, body)])
 
     def _item_type(self, rank: int):
         if not self._automatic_block_size:
diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py
index 0d43b40e3..9856ad632 100644
--- a/src/pystencils/codegen/config.py
+++ b/src/pystencils/codegen/config.py
@@ -372,14 +372,6 @@ class GpuOptions(ConfigBase):
     indexing_scheme: Option[GpuIndexingScheme, str] = Option(GpuIndexingScheme.Linear3D)
     """Thread indexing scheme for dense GPU kernels."""
 
-    omit_range_check: BasicOption[bool] = BasicOption(False)
-    """If set to `True`, omit the iteration counter range check.
-    
-    By default, the code generator introduces a check if the iteration counters computed from GPU block and thread
-    indices are within the prescribed loop range.
-    This check can be discarded through this option, at your own peril.
-    """
-
     block_size: BasicOption[tuple[int, int, int] | _AUTO_TYPE] = BasicOption(AUTO)
     """Desired block size for the execution of GPU kernels.
     
diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index b8f9c7101..ac933e2e5 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -438,7 +438,6 @@ class DefaultKernelCreationDriver:
 
         elif self._target.is_gpu():
             gpu_opts = self._cfg.gpu
-            omit_range_check: bool = gpu_opts.get_option("omit_range_check")
 
             match self._target:
                 case Target.CUDA:
@@ -452,18 +451,15 @@ class DefaultKernelCreationDriver:
 
                     return CudaPlatform(
                         self._ctx,
-                        omit_range_check=omit_range_check,
                         thread_mapping=thread_mapping,
                     )
         elif self._target == Target.SYCL:
             from ..backend.platforms import SyclPlatform
 
             auto_block_size: bool = self._cfg.sycl.get_option("automatic_block_size")
-            omit_range_check = self._cfg.gpu.get_option("omit_range_check")
 
             return SyclPlatform(
                 self._ctx,
-                omit_range_check=omit_range_check,
                 automatic_block_size=auto_block_size,
             )
 
diff --git a/tests/kernelcreation/test_gpu.py b/tests/kernelcreation/test_gpu.py
index 10b37e610..38e849c3c 100644
--- a/tests/kernelcreation/test_gpu.py
+++ b/tests/kernelcreation/test_gpu.py
@@ -29,10 +29,9 @@ except ImportError:
 
 
 @pytest.mark.parametrize("indexing_scheme", ["linear3d", "blockwise4d"])
-@pytest.mark.parametrize("omit_range_check", [False, True])
 @pytest.mark.parametrize("manual_grid", [False, True])
 def test_indexing_options(
-    indexing_scheme: str, omit_range_check: bool, manual_grid: bool
+    indexing_scheme: str, manual_grid: bool
 ):
     src, dst = fields("src, dst: [3D]")
     asm = Assignment(
@@ -47,7 +46,6 @@ def test_indexing_options(
 
     cfg = CreateKernelConfig(target=Target.CUDA)
     cfg.gpu.indexing_scheme = indexing_scheme
-    cfg.gpu.omit_range_check = omit_range_check
     cfg.gpu.manual_launch_grid = manual_grid
 
     ast = create_kernel(asm, cfg)
-- 
GitLab


From 0a907233b111343317abf14a47ecccbbd55ae64c Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Fri, 21 Feb 2025 21:12:49 +0100
Subject: [PATCH 02/54] Introduce warp_size as a GpuOption and round block
 sizes to multiples of it

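A minimal usage sketch (illustrative only, following the option-setting
pattern from tests/kernelcreation/test_gpu.py); if the option is left
unset, GpuOptions.default_warp_size() supplies 32 for Target.CUDA:

    from pystencils import CreateKernelConfig, Target

    cfg = CreateKernelConfig(target=Target.CUDA)
    # explicit value; leaving it unset falls back to the target default (32 for CUDA)
    cfg.gpu.warp_size = 32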
---
 src/pystencils/codegen/config.py       | 19 +++++++++++++++++++
 src/pystencils/codegen/driver.py       |  7 ++++++-
 src/pystencils/codegen/gpu_indexing.py | 25 +++++++++++++++++++++----
 3 files changed, 46 insertions(+), 5 deletions(-)

diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py
index 9856ad632..fcf1190f5 100644
--- a/src/pystencils/codegen/config.py
+++ b/src/pystencils/codegen/config.py
@@ -388,6 +388,25 @@ class GpuOptions(ConfigBase):
     The launch grid will then have to be specified manually at runtime.
     """
 
+    warp_size: BasicOption[int] = BasicOption()
+    """Specifies the size of a warp (CUDA) or wavefront (HIP).
+    
+    If set to `None` (the default), the default value for the given target will be automatically used.
+    """
+
+    @staticmethod
+    def default_warp_size(target: Target):
+        if target.is_vector_cpu():
+            raise ValueError(f"Given target {target} is no GPU target.")
+
+        match target:
+            case Target.CUDA:
+                return 32
+            case _:
+                raise NotImplementedError(
+                    f"No default warp/wavefront size known for target {target}"
+                )
+
     @indexing_scheme.validate
     def _validate_idx_scheme(self, val: str | GpuIndexingScheme):
         if isinstance(val, GpuIndexingScheme):
diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index ac933e2e5..8286d962a 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -11,6 +11,7 @@ from .config import (
     GhostLayerSpec,
     IterationSliceSpec,
     GpuIndexingScheme,
+    GpuOptions,
 )
 from .kernel import Kernel, GpuKernel
 from .properties import PsSymbolProperty, FieldBasePtr
@@ -406,8 +407,12 @@ class DefaultKernelCreationDriver:
         idx_scheme: GpuIndexingScheme = self._cfg.gpu.get_option("indexing_scheme")
         block_size: dim3 | _AUTO_TYPE = self._cfg.gpu.get_option("block_size")
         manual_launch_grid: bool = self._cfg.gpu.get_option("manual_launch_grid")
+        warp_size: int | None = self._cfg.gpu.get_option("warp_size")
 
-        return GpuIndexing(self._ctx, idx_scheme, block_size, manual_launch_grid)
+        if warp_size is None:
+            warp_size = GpuOptions.default_warp_size(self._target)
+
+        return GpuIndexing(self._ctx, idx_scheme, warp_size, block_size, manual_launch_grid)
 
     def _get_platform(self) -> Platform:
         if Target._CPU in self._target:
diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index c93f0f959..08a24d761 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -8,6 +8,7 @@ from .functions import Lambda
 from .parameters import Parameter
 from .errors import CodegenError
 from .config import GpuIndexingScheme, _AUTO_TYPE
+from ..backend.constants import PsConstant
 
 from ..backend.kernelcreation import (
     KernelCreationContext,
@@ -16,8 +17,7 @@ from ..backend.kernelcreation import (
 )
 from ..backend.platforms.cuda import ThreadMapping
 
-from ..backend.ast.expressions import PsExpression
-
+from ..backend.ast.expressions import PsExpression, PsIntDiv
 
 dim3 = tuple[int, int, int]
 _Dim3Lambda = tuple[Lambda, Lambda, Lambda]
@@ -63,12 +63,14 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
         self,
         block_size: _Dim3Lambda,
         grid_size: _Dim3Lambda,
+        warp_size: Lambda
     ) -> None:
         self._block_size = block_size
         self._grid_size = grid_size
+        self._warp_size = warp_size
 
         self._params: frozenset[Parameter] = frozenset().union(
-            *(lb.parameters for lb in chain(block_size, grid_size))
+            *(lb.parameters for lb in chain(block_size, grid_size, warp_size))
         )
 
     @property
@@ -76,7 +78,13 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
         return self._params
 
     def evaluate(self, **kwargs) -> tuple[dim3, dim3]:
+        def ceil_to_multiple(arg1, arg2):
+            PsIntDiv(arg1 + arg2 - PsExpression.make(PsConstant(1)), arg2) * arg2
+
         block_size = tuple(int(bs(**kwargs)) for bs in self._block_size)
+        # round block size in fastest moving dimension up to multiple of warp size
+        block_size = (ceil_to_multiple(block_size[0], self._warp_size), *block_size[1:])
+
         grid_size = tuple(int(gs(**kwargs)) for gs in self._grid_size)
         return cast(dim3, block_size), cast(dim3, grid_size)
 
@@ -143,10 +151,13 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
     def __init__(
         self,
         num_work_items: _Dim3Lambda,
+        warp_size: Lambda,
         default_block_size: dim3 | None = None,
     ) -> None:
         self._num_work_items = num_work_items
 
+        self._warp_size = warp_size
+
         self._block_size: dim3 | None = default_block_size
 
         self._params: frozenset[Parameter] = frozenset().union(
@@ -184,7 +195,8 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
         )
         reduced_block_size = cast(
             dim3,
-            tuple(min(wit, bs) for wit, bs in zip(num_work_items, self._block_size)),
+            tuple(div_ceil(min(wit, bs), self._warp_size) * self._warp_size
+                  for wit, bs in zip(num_work_items, self._block_size)),
         )
         grid_size = cast(
             dim3,
@@ -219,11 +231,13 @@ class GpuIndexing:
         self,
         ctx: KernelCreationContext,
         scheme: GpuIndexingScheme,
+        warp_size: int,
         default_block_size: dim3 | _AUTO_TYPE | None = None,
         manual_launch_grid: bool = False,
     ) -> None:
         self._ctx = ctx
         self._scheme = scheme
+        self._warp_size = warp_size
         self._default_block_size = default_block_size
         self._manual_launch_grid = manual_launch_grid
 
@@ -319,10 +333,13 @@ class GpuIndexing:
             for _ in range(4 - rank)
         )
 
+        warp_size = self._kernel_factory.create_lambda(PsExpression.make(PsConstant(self._warp_size)))
+
         def factory():
             return AutomaticLaunchConfiguration(
                 block_size,
                 cast(_Dim3Lambda, grid_size),
+                warp_size
             )
 
         return factory
-- 
GitLab


From 1b7d1283e6c0d95f1739720b327715400f7a0a40 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Mon, 24 Feb 2025 11:59:42 +0100
Subject: [PATCH 03/54] Add missing warp size parameter to
 DynamicBlockSizeLaunchConfiguration

---
 src/pystencils/codegen/gpu_indexing.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 08a24d761..6cd82a645 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -151,7 +151,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
     def __init__(
         self,
         num_work_items: _Dim3Lambda,
-        warp_size: Lambda,
+        warp_size: int,
         default_block_size: dim3 | None = None,
     ) -> None:
         self._num_work_items = num_work_items
@@ -289,6 +289,7 @@ class GpuIndexing:
         def factory():
             return DynamicBlockSizeLaunchConfiguration(
                 num_work_items,
+                self._warp_size,
                 self._get_default_block_size(rank),
             )
 
-- 
GitLab


From d9744fed3d3caa9bd315718c4e06f8bd5edfe7c9 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Mon, 24 Feb 2025 12:10:43 +0100
Subject: [PATCH 04/54] Fix lint

---
 src/pystencils/backend/platforms/cuda.py | 1 -
 src/pystencils/codegen/driver.py         | 2 --
 2 files changed, 3 deletions(-)

diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index 3d45117e2..ab5d6c19b 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -322,4 +322,3 @@ class CudaPlatform(GenericGpu):
         stop = PsExpression.make(ispace.index_list.shape[0])
         condition = PsLt(sparse_ctr_expr.clone(), stop)
         return PsBlock([sparse_idx_decl, PsConditional(condition, body)])
-
diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index 8286d962a..6f041b6a7 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -442,8 +442,6 @@ class DefaultKernelCreationDriver:
                 )
 
         elif self._target.is_gpu():
-            gpu_opts = self._cfg.gpu
-
             match self._target:
                 case Target.CUDA:
                     from ..backend.platforms import CudaPlatform
-- 
GitLab


From 675c6bd71acac5cec54a5306390e928e3df1ce82 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Mon, 24 Feb 2025 12:14:15 +0100
Subject: [PATCH 05/54] Fix typecheck

---
 src/pystencils/codegen/gpu_indexing.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 6cd82a645..731abcc64 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -20,6 +20,7 @@ from ..backend.platforms.cuda import ThreadMapping
 from ..backend.ast.expressions import PsExpression, PsIntDiv
 
 dim3 = tuple[int, int, int]
+_Dim1Lambda = tuple[Lambda]
 _Dim3Lambda = tuple[Lambda, Lambda, Lambda]
 
 
@@ -63,7 +64,7 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
         self,
         block_size: _Dim3Lambda,
         grid_size: _Dim3Lambda,
-        warp_size: Lambda
+        warp_size: _Dim1Lambda
     ) -> None:
         self._block_size = block_size
         self._grid_size = grid_size
@@ -83,7 +84,7 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
 
         block_size = tuple(int(bs(**kwargs)) for bs in self._block_size)
         # round block size in fastest moving dimension up to multiple of warp size
-        block_size = (ceil_to_multiple(block_size[0], self._warp_size), *block_size[1:])
+        block_size = (ceil_to_multiple(block_size[0], self._warp_size[0]), *block_size[1:])
 
         grid_size = tuple(int(gs(**kwargs)) for gs in self._grid_size)
         return cast(dim3, block_size), cast(dim3, grid_size)
@@ -334,7 +335,7 @@ class GpuIndexing:
             for _ in range(4 - rank)
         )
 
-        warp_size = self._kernel_factory.create_lambda(PsExpression.make(PsConstant(self._warp_size)))
+        warp_size = (self._kernel_factory.create_lambda(PsExpression.make(PsConstant(self._warp_size))),)
 
         def factory():
             return AutomaticLaunchConfiguration(
-- 
GitLab


From eddb7882d3eafde8be8471a8c3accc6637e552f1 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Mon, 24 Feb 2025 13:29:25 +0100
Subject: [PATCH 06/54] Only apply warp-size rounding to x block size

---
 src/pystencils/codegen/gpu_indexing.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 731abcc64..048dcd2fe 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -196,8 +196,8 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
         )
         reduced_block_size = cast(
             dim3,
-            tuple(div_ceil(min(wit, bs), self._warp_size) * self._warp_size
-                  for wit, bs in zip(num_work_items, self._block_size)),
+            (div_ceil(min(num_work_items[0], self._block_size[0]), self._warp_size) * self._warp_size,)
+            + tuple(min(wit, bs) for wit, bs in zip(num_work_items[1:], self._block_size[1:]))
         )
         grid_size = cast(
             dim3,
-- 
GitLab


From 3072fbbc71beb46c3b066d55a75373f239626a01 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Mon, 24 Feb 2025 14:51:44 +0100
Subject: [PATCH 07/54] Add algorithm for determining best fit between GPU
 block config and iteration space

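The core idea as a standalone sketch (simplified, not the exact control flow
of determine_block_size; a warp size of 32 and the iteration space values are
assumed for illustration): the requested block size is first trimmed to the
iteration space, then the fastest-moving dimension is aligned with the warp
size.

    from math import prod

    def trim(block, it_space):
        # clamp each block dimension to the iteration space extent
        return tuple(min(b, i) for b, i in zip(block, it_space))

    def round_up(value, multiple):
        # smallest multiple of 'multiple' that is >= 'value'
        return (value + multiple - 1) // multiple * multiple

    warp_size = 32
    it_space = (50, 2, 2)                # hypothetical iteration space
    block = trim((128, 2, 2), it_space)  # -> (50, 2, 2): 200 threads
    if prod(block) % warp_size != 0:
        block = (round_up(block[0], warp_size), *block[1:])
    print(block)                         # (64, 2, 2): 256 threads, warp-aligned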
---
 src/pystencils/codegen/gpu_indexing.py | 63 +++++++++++++++++++++++---
 1 file changed, 56 insertions(+), 7 deletions(-)

diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 048dcd2fe..f9b3f572c 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -17,7 +17,11 @@ from ..backend.kernelcreation import (
 )
 from ..backend.platforms.cuda import ThreadMapping
 
-from ..backend.ast.expressions import PsExpression, PsIntDiv
+from ..backend.ast.expressions import PsExpression
+from math import prod
+
+from ..sympyextensions.integer_functions import div_ceil
+from ..types.quick import UInt
 
 dim3 = tuple[int, int, int]
 _Dim1Lambda = tuple[Lambda]
@@ -53,6 +57,51 @@ class GpuLaunchConfiguration(ABC):
         this launch configuration, such that when the configuration changes, the JIT parameter
         cache is invalidated."""
 
+    @staticmethod
+    def determine_block_size(it_space: tuple[int, int, int], block_size: tuple[int, int, int],
+                             warp_size: int) -> tuple[int, ...]:
+
+        def trim(to_trim: list[int]) -> list[int]:
+            return [min(b, i) for b, i in zip(to_trim, it_space)]
+
+        trimmed = trim(list(block_size))
+        if prod(trimmed) >= warp_size:
+            # case 1: greater than min block size -> use trimmed result
+            if prod(trimmed) % warp_size == 0:
+                return tuple(trimmed)
+            else:
+                trimmed[0] = div_ceil(trimmed[0], warp_size) * warp_size
+
+        prev_trim_size = 0
+        resize_order = [0, 2, 1] if len(it_space) == 3 else range(len(it_space))
+        max_block_sizes = [1024, 1024, 64]  # TODO: move to target
+        while prod(trimmed) is not prev_trim_size:
+            prev_trim_size = prod(trimmed)
+
+            # case 2: trimmed block is equivalent to the whole iteration space
+            if all(b == i for b, i in zip(trimmed, it_space)):
+                return tuple(trimmed)
+            else:
+                # double block size in each dimension until block is large enough (or case 2 triggers)
+                for d in resize_order:
+                    trimmed[d] *= 2
+
+                    # trim fastest moving dim to multiples of warp size
+                    if d == 0 and trimmed[d] > warp_size and trimmed[d] % warp_size != 0:
+                        trimmed[d] = trimmed[d] - (trimmed[d] % warp_size)  # subtract remainder
+
+                    # check if block sizes are within hardware capabilities
+                    trimmed[d] = min(trimmed[d], max_block_sizes[d])
+
+                    # trim again
+                    trimmed = trim(trimmed)
+
+                    # case 3: trim block is large enough
+                    if prod(trimmed) >= warp_size:
+                        return tuple(trimmed)
+
+        raise CodegenError("Unable to determine GPU block size for this kernel.")
+
 
 class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
     """Launch configuration that is dynamically computed from kernel parameters.
@@ -80,11 +129,12 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
 
     def evaluate(self, **kwargs) -> tuple[dim3, dim3]:
         def ceil_to_multiple(arg1, arg2):
-            PsIntDiv(arg1 + arg2 - PsExpression.make(PsConstant(1)), arg2) * arg2
+            return div_ceil(arg1 + arg2 - 1, arg2) * arg2
 
+        # impossible to use block size determination function since the iteration space is unknown
+        # -> round block size in fastest moving dimension up to multiple of warp size
         block_size = tuple(int(bs(**kwargs)) for bs in self._block_size)
-        # round block size in fastest moving dimension up to multiple of warp size
-        block_size = (ceil_to_multiple(block_size[0], self._warp_size[0]), *block_size[1:])
+        block_size = (ceil_to_multiple(block_size[0], int(self._warp_size[0](**kwargs))), *block_size[1:])
 
         grid_size = tuple(int(gs(**kwargs)) for gs in self._grid_size)
         return cast(dim3, block_size), cast(dim3, grid_size)
@@ -196,8 +246,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
         )
         reduced_block_size = cast(
             dim3,
-            (div_ceil(min(num_work_items[0], self._block_size[0]), self._warp_size) * self._warp_size,)
-            + tuple(min(wit, bs) for wit, bs in zip(num_work_items[1:], self._block_size[1:]))
+            self.determine_block_size(num_work_items, self._block_size, self._warp_size),
         )
         grid_size = cast(
             dim3,
@@ -335,7 +384,7 @@ class GpuIndexing:
             for _ in range(4 - rank)
         )
 
-        warp_size = (self._kernel_factory.create_lambda(PsExpression.make(PsConstant(self._warp_size))),)
+        warp_size = (self._kernel_factory.create_lambda(PsExpression.make(PsConstant(self._warp_size, UInt(32)))),)
 
         def factory():
             return AutomaticLaunchConfiguration(
-- 
GitLab


From 799c22f0cfb068c42403080db8457b745cfb64c1 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Mon, 24 Feb 2025 16:50:31 +0100
Subject: [PATCH 08/54] Introduce BasicOption max_block_sizes to GPU targets
 and employ in GpuIndexing

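Hypothetical usage of the new option (same setting pattern as in
tests/kernelcreation/test_gpu.py); if left unset,
GpuOptions.default_maximum_block_sizes() provides (1024, 1024, 64) for
Target.CUDA:

    from pystencils import CreateKernelConfig, Target

    cfg = CreateKernelConfig(target=Target.CUDA)
    cfg.gpu.max_block_sizes = (1024, 1024, 64)  # matches the CUDA default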
---
 src/pystencils/codegen/config.py       | 16 +++++++++++++---
 src/pystencils/codegen/driver.py       |  5 ++++-
 src/pystencils/codegen/gpu_indexing.py | 12 +++++++++---
 3 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py
index fcf1190f5..94a8f270d 100644
--- a/src/pystencils/codegen/config.py
+++ b/src/pystencils/codegen/config.py
@@ -380,6 +380,9 @@ class GpuOptions(ConfigBase):
     The block size may be overridden at runtime.
     """
 
+    max_block_sizes: BasicOption[tuple[int, int, int] | _AUTO_TYPE] = BasicOption()
+    """Maximum dimension size of a thread block in dimensions (x, y, z)."""
+
     manual_launch_grid: BasicOption[bool] = BasicOption(False)
     """Always require a manually specified launch grid when running this kernel.
     
@@ -396,9 +399,6 @@ class GpuOptions(ConfigBase):
 
     @staticmethod
     def default_warp_size(target: Target):
-        if target.is_vector_cpu():
-            raise ValueError(f"Given target {target} is no GPU target.")
-
         match target:
             case Target.CUDA:
                 return 32
@@ -407,6 +407,16 @@ class GpuOptions(ConfigBase):
                     f"No default warp/wavefront size known for target {target}"
                 )
 
+    @staticmethod
+    def default_maximum_block_sizes(target: Target):
+        match target:
+            case Target.CUDA:
+                return (1024, 1024, 64)
+            case _:
+                raise NotImplementedError(
+                    f"No default value known for maximum GPU block sizes available on target {target}"
+                )
+
     @indexing_scheme.validate
     def _validate_idx_scheme(self, val: str | GpuIndexingScheme):
         if isinstance(val, GpuIndexingScheme):
diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index 6f041b6a7..ca2caca35 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -408,11 +408,14 @@ class DefaultKernelCreationDriver:
         block_size: dim3 | _AUTO_TYPE = self._cfg.gpu.get_option("block_size")
         manual_launch_grid: bool = self._cfg.gpu.get_option("manual_launch_grid")
         warp_size: int | None = self._cfg.gpu.get_option("warp_size")
+        max_block_sizes: tuple[int, int, int] | None = self._cfg.gpu.get_option("max_block_sizes")
 
         if warp_size is None:
             warp_size = GpuOptions.default_warp_size(self._target)
+        if max_block_sizes is None:
+            max_block_sizes = GpuOptions.default_maximum_block_sizes(self._target)
 
-        return GpuIndexing(self._ctx, idx_scheme, warp_size, block_size, manual_launch_grid)
+        return GpuIndexing(self._ctx, idx_scheme, warp_size, max_block_sizes, block_size, manual_launch_grid)
 
     def _get_platform(self) -> Platform:
         if Target._CPU in self._target:
diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index f9b3f572c..ce5def203 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -59,7 +59,8 @@ class GpuLaunchConfiguration(ABC):
 
     @staticmethod
     def determine_block_size(it_space: tuple[int, int, int], block_size: tuple[int, int, int],
-                             warp_size: int) -> tuple[int, ...]:
+                             max_block_sizes: tuple[int, int, int], warp_size: int) -> tuple[int, ...]:
+        """Returns an optimized block size configuration with block sizes being aligned with the warp size."""
 
         def trim(to_trim: list[int]) -> list[int]:
             return [min(b, i) for b, i in zip(to_trim, it_space)]
@@ -74,7 +75,6 @@ class GpuLaunchConfiguration(ABC):
 
         prev_trim_size = 0
         resize_order = [0, 2, 1] if len(it_space) == 3 else range(len(it_space))
-        max_block_sizes = [1024, 1024, 64]  # TODO: move to target
         while prod(trimmed) is not prev_trim_size:
             prev_trim_size = prod(trimmed)
 
@@ -203,12 +203,15 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
         self,
         num_work_items: _Dim3Lambda,
         warp_size: int,
+        max_block_sizes: dim3,
         default_block_size: dim3 | None = None,
     ) -> None:
         self._num_work_items = num_work_items
 
         self._warp_size = warp_size
 
+        self._max_block_sizes = max_block_sizes
+
         self._block_size: dim3 | None = default_block_size
 
         self._params: frozenset[Parameter] = frozenset().union(
@@ -246,7 +249,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
         )
         reduced_block_size = cast(
             dim3,
-            self.determine_block_size(num_work_items, self._block_size, self._warp_size),
+            self.determine_block_size(num_work_items, self._block_size, self._max_block_sizes, self._warp_size),
         )
         grid_size = cast(
             dim3,
@@ -282,12 +285,14 @@ class GpuIndexing:
         ctx: KernelCreationContext,
         scheme: GpuIndexingScheme,
         warp_size: int,
+        max_block_sizes: dim3,
         default_block_size: dim3 | _AUTO_TYPE | None = None,
         manual_launch_grid: bool = False,
     ) -> None:
         self._ctx = ctx
         self._scheme = scheme
         self._warp_size = warp_size
+        self._max_block_sizes = max_block_sizes
         self._default_block_size = default_block_size
         self._manual_launch_grid = manual_launch_grid
 
@@ -340,6 +345,7 @@ class GpuIndexing:
             return DynamicBlockSizeLaunchConfiguration(
                 num_work_items,
                 self._warp_size,
+                self._max_block_sizes,
                 self._get_default_block_size(rank),
             )
 
-- 
GitLab


From e010924c25662932911ca1c98ed2527abd0417db Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Mon, 24 Feb 2025 16:54:43 +0100
Subject: [PATCH 09/54] Add missing comment [skip ci]

---
 src/pystencils/codegen/config.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py
index 94a8f270d..8e6ae0805 100644
--- a/src/pystencils/codegen/config.py
+++ b/src/pystencils/codegen/config.py
@@ -381,7 +381,10 @@ class GpuOptions(ConfigBase):
     """
 
     max_block_sizes: BasicOption[tuple[int, int, int] | _AUTO_TYPE] = BasicOption()
-    """Maximum dimension size of a thread block in dimensions (x, y, z)."""
+    """Maximum dimension size of a thread block in dimensions (x, y, z).
+    
+    If set to `None` (the default), the default value for the given target will be automatically used.
+    """
 
     manual_launch_grid: BasicOption[bool] = BasicOption(False)
     """Always require a manually specified launch grid when running this kernel.
-- 
GitLab


From a3a7744c3f88972645c0c9386b0f0b89e44970d0 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 26 Feb 2025 18:18:25 +0100
Subject: [PATCH 10/54] Slightly adapt algorithm for determining block sizes
 with known iteration space

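Standalone illustration of the adapted rounding rule (a warp size of 32 and
the example extents are assumed): the dimension whose extent has the largest
remainder modulo the warp size is rounded up to a warp multiple.

    warp_size = 32
    trimmed = [17, 1, 200]   # remainders mod 32: 17, 1, 8
    idx = trimmed.index(max(trimmed, key=lambda v: abs(v % warp_size)))
    trimmed[idx] = -(-trimmed[idx] // warp_size) * warp_size  # ceil to multiple
    print(trimmed)           # [32, 1, 200]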
---
 src/pystencils/codegen/gpu_indexing.py | 34 ++++++++++++++++++++------
 1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index ce5def203..bdf2225c0 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -58,20 +58,35 @@ class GpuLaunchConfiguration(ABC):
         cache is invalidated."""
 
     @staticmethod
-    def determine_block_size(it_space: tuple[int, int, int], block_size: tuple[int, int, int],
+    def determine_block_size(it_space: tuple[int, ...], block_size: tuple[int, int, int],
                              max_block_sizes: tuple[int, int, int], warp_size: int) -> tuple[int, ...]:
         """Returns an optimized block size configuration with block sizes being aligned with the warp size."""
 
         def trim(to_trim: list[int]) -> list[int]:
             return [min(b, i) for b, i in zip(to_trim, it_space)]
 
+        def ceil_to_multiple(to_round: list[int]) -> tuple[int, ...]:
+            def round(val):
+                return div_ceil(val, warp_size) * warp_size
+
+            # find index of element closest to warp size and round up
+            index_to_round = to_round.index(max(to_round, key=lambda i: abs(i % warp_size)))
+            if index_to_round+1 < len(to_round):
+                return (*to_round[:index_to_round], round(to_round[index_to_round]), *to_round[index_to_round+1:])
+            else:
+                return (*to_round[:index_to_round], round(to_round[index_to_round]))
+
+        def check_sizes_and_return(to_return: tuple[int, ...]) -> tuple[int, ...]:
+            if not all(size < max_size for size, max_size in zip(to_return, max_block_sizes)) \
+                    or prod(to_return) > max(max_block_sizes):
+                raise CodegenError(f"Unable to determine GPU block size for this kernel for initial block size config"
+                                   f"{block_size} and iteration space {it_space}.")
+            return to_return
+
         trimmed = trim(list(block_size))
-        if prod(trimmed) >= warp_size:
+        if prod(trimmed) >= warp_size and prod(trimmed) % warp_size == 0:
             # case 1: greater than min block size -> use trimmed result
-            if prod(trimmed) % warp_size == 0:
-                return tuple(trimmed)
-            else:
-                trimmed[0] = div_ceil(trimmed[0], warp_size) * warp_size
+            return check_sizes_and_return(tuple(trimmed))
 
         prev_trim_size = 0
         resize_order = [0, 2, 1] if len(it_space) == 3 else range(len(it_space))
@@ -80,7 +95,10 @@ class GpuLaunchConfiguration(ABC):
 
             # case 2: trimmed block is equivalent to the whole iteration space
             if all(b == i for b, i in zip(trimmed, it_space)):
-                return tuple(trimmed)
+                if prod(trimmed) % warp_size != 0:
+                    return check_sizes_and_return(ceil_to_multiple(trimmed))
+                else:
+                    return check_sizes_and_return(tuple(trimmed))
             else:
                 # double block size in each dimension until block is large enough (or case 2 triggers)
                 for d in resize_order:
@@ -98,7 +116,7 @@ class GpuLaunchConfiguration(ABC):
 
                     # case 3: trim block is large enough
                     if prod(trimmed) >= warp_size:
-                        return tuple(trimmed)
+                        return check_sizes_and_return(tuple(trimmed))
 
         raise CodegenError("Unable to determine GPU block size for this kernel.")
 
-- 
GitLab


From a77092d80731936dec9457c28cbdb10d97ce2d65 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 26 Feb 2025 18:48:11 +0100
Subject: [PATCH 11/54] Remove unnecessary condition

---
 src/pystencils/backend/platforms/cuda.py | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index ab5d6c19b..461597ae4 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -280,16 +280,10 @@ class CudaPlatform(GenericGpu):
             )
             conds.append(PsLt(ctr_expr, dim.stop))
 
-        if conds:
-            condition: PsExpression = conds[0]
-            for cond in conds[1:]:
-                condition = PsAnd(condition, cond)
-            ast = PsBlock(indexing_decls + [PsConditional(condition, body)])
-        else:
-            body.statements = indexing_decls + body.statements
-            ast = body
-
-        return ast
+        condition: PsExpression = conds[0]
+        for cond in conds[1:]:
+            condition = PsAnd(condition, cond)
+        return PsBlock(indexing_decls + [PsConditional(condition, body)])
 
     def _prepend_sparse_translation(
         self, body: PsBlock, ispace: SparseIterationSpace
-- 
GitLab


From cbe094b6a257799685066ba88ce86e99accdee83 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 26 Feb 2025 19:36:34 +0100
Subject: [PATCH 12/54] Fix lint

---
 src/pystencils/codegen/gpu_indexing.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index bdf2225c0..bcb24e0e0 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -71,8 +71,8 @@ class GpuLaunchConfiguration(ABC):
 
             # find index of element closest to warp size and round up
             index_to_round = to_round.index(max(to_round, key=lambda i: abs(i % warp_size)))
-            if index_to_round+1 < len(to_round):
-                return (*to_round[:index_to_round], round(to_round[index_to_round]), *to_round[index_to_round+1:])
+            if index_to_round + 1 < len(to_round):
+                return (*to_round[:index_to_round], round(to_round[index_to_round]), *to_round[index_to_round + 1:])
             else:
                 return (*to_round[:index_to_round], round(to_round[index_to_round]))
 
-- 
GitLab


From 267e2d4128afde5ebe5a5b9e54704dc14c2543fb Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Mon, 3 Mar 2025 16:58:16 +0100
Subject: [PATCH 13/54] Adapt formatting

---
 src/pystencils/backend/platforms/cuda.py |  4 +-
 src/pystencils/codegen/config.py         |  2 +-
 src/pystencils/codegen/driver.py         | 13 ++++-
 src/pystencils/codegen/gpu_indexing.py   | 62 ++++++++++++++++--------
 tests/kernelcreation/test_gpu.py         |  4 +-
 5 files changed, 57 insertions(+), 28 deletions(-)

diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index 461597ae4..218cae0f6 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -117,7 +117,7 @@ class Blockwise4DMapping(ThreadMapping):
         THREAD_IDX[0],
         BLOCK_IDX[0],
         BLOCK_IDX[1],
-        BLOCK_IDX[2]
+        BLOCK_IDX[2],
     ]
 
     def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]:
@@ -163,7 +163,7 @@ class Blockwise4DMapping(ThreadMapping):
 
 class CudaPlatform(GenericGpu):
     """Platform for CUDA-based GPUs.
-    
+
     Args:
         ctx: The kernel creation context
         thread_mapping: Callback object which defines the mapping of thread indices onto iteration space points
diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py
index 8e6ae0805..e369dde4a 100644
--- a/src/pystencils/codegen/config.py
+++ b/src/pystencils/codegen/config.py
@@ -625,7 +625,7 @@ class CreateKernelConfig(ConfigBase):
             elif self.get_target() == Target.CUDA:
                 try:
                     from ..jit.gpu_cupy import CupyJit
-                    
+
                     return CupyJit()
 
                 except ImportError:
diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index ca2caca35..9404f9754 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -408,14 +408,23 @@ class DefaultKernelCreationDriver:
         block_size: dim3 | _AUTO_TYPE = self._cfg.gpu.get_option("block_size")
         manual_launch_grid: bool = self._cfg.gpu.get_option("manual_launch_grid")
         warp_size: int | None = self._cfg.gpu.get_option("warp_size")
-        max_block_sizes: tuple[int, int, int] | None = self._cfg.gpu.get_option("max_block_sizes")
+        max_block_sizes: tuple[int, int, int] | None = self._cfg.gpu.get_option(
+            "max_block_sizes"
+        )
 
         if warp_size is None:
             warp_size = GpuOptions.default_warp_size(self._target)
         if max_block_sizes is None:
             max_block_sizes = GpuOptions.default_maximum_block_sizes(self._target)
 
-        return GpuIndexing(self._ctx, idx_scheme, warp_size, max_block_sizes, block_size, manual_launch_grid)
+        return GpuIndexing(
+            self._ctx,
+            idx_scheme,
+            warp_size,
+            max_block_sizes,
+            block_size,
+            manual_launch_grid,
+        )
 
     def _get_platform(self) -> Platform:
         if Target._CPU in self._target:
diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index bcb24e0e0..5bdb0b24a 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -58,8 +58,12 @@ class GpuLaunchConfiguration(ABC):
         cache is invalidated."""
 
     @staticmethod
-    def determine_block_size(it_space: tuple[int, ...], block_size: tuple[int, int, int],
-                             max_block_sizes: tuple[int, int, int], warp_size: int) -> tuple[int, ...]:
+    def determine_block_size(
+        it_space: tuple[int, ...],
+        block_size: tuple[int, int, int],
+        max_block_sizes: tuple[int, int, int],
+        warp_size: int,
+    ) -> tuple[int, ...]:
         """Returns an optimized block size configuration with block sizes being aligned with the warp size."""
 
         def trim(to_trim: list[int]) -> list[int]:
@@ -70,17 +74,26 @@ class GpuLaunchConfiguration(ABC):
                 return div_ceil(val, warp_size) * warp_size
 
             # find index of element closest to warp size and round up
-            index_to_round = to_round.index(max(to_round, key=lambda i: abs(i % warp_size)))
+            index_to_round = to_round.index(
+                max(to_round, key=lambda i: abs(i % warp_size))
+            )
             if index_to_round + 1 < len(to_round):
-                return (*to_round[:index_to_round], round(to_round[index_to_round]), *to_round[index_to_round + 1:])
+                return (
+                    *to_round[:index_to_round],
+                    round(to_round[index_to_round]),
+                    *to_round[index_to_round + 1 :],
+                )
             else:
                 return (*to_round[:index_to_round], round(to_round[index_to_round]))
 
         def check_sizes_and_return(to_return: tuple[int, ...]) -> tuple[int, ...]:
-            if not all(size < max_size for size, max_size in zip(to_return, max_block_sizes)) \
-                    or prod(to_return) > max(max_block_sizes):
-                raise CodegenError(f"Unable to determine GPU block size for this kernel for initial block size config"
-                                   f"{block_size} and iteration space {it_space}.")
+            if not all(
+                size < max_size for size, max_size in zip(to_return, max_block_sizes)
+            ) or prod(to_return) > max(max_block_sizes):
+                raise CodegenError(
+                    f"Unable to determine GPU block size for this kernel for initial block size config"
+                    f"{block_size} and iteration space {it_space}."
+                )
             return to_return
 
         trimmed = trim(list(block_size))
@@ -105,8 +118,13 @@ class GpuLaunchConfiguration(ABC):
                     trimmed[d] *= 2
 
                     # trim fastest moving dim to multiples of warp size
-                    if d == 0 and trimmed[d] > warp_size and trimmed[d] % warp_size != 0:
-                        trimmed[d] = trimmed[d] - (trimmed[d] % warp_size)  # subtract remainder
+                    if (
+                        d == 0
+                        and trimmed[d] > warp_size
+                        and trimmed[d] % warp_size != 0
+                    ):
+                        # subtract remainder
+                        trimmed[d] = trimmed[d] - (trimmed[d] % warp_size)
 
                     # check if block sizes are within hardware capabilities
                     trimmed[d] = min(trimmed[d], max_block_sizes[d])
@@ -128,10 +146,7 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
     """
 
     def __init__(
-        self,
-        block_size: _Dim3Lambda,
-        grid_size: _Dim3Lambda,
-        warp_size: _Dim1Lambda
+        self, block_size: _Dim3Lambda, grid_size: _Dim3Lambda, warp_size: _Dim1Lambda
     ) -> None:
         self._block_size = block_size
         self._grid_size = grid_size
@@ -152,7 +167,10 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
         # impossible to use block size determination function since the iteration space is unknown
         # -> round block size in fastest moving dimension up to multiple of warp size
         block_size = tuple(int(bs(**kwargs)) for bs in self._block_size)
-        block_size = (ceil_to_multiple(block_size[0], int(self._warp_size[0](**kwargs))), *block_size[1:])
+        block_size = (
+            ceil_to_multiple(block_size[0], int(self._warp_size[0](**kwargs))),
+            *block_size[1:],
+        )
 
         grid_size = tuple(int(gs(**kwargs)) for gs in self._grid_size)
         return cast(dim3, block_size), cast(dim3, grid_size)
@@ -267,7 +285,9 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
         )
         reduced_block_size = cast(
             dim3,
-            self.determine_block_size(num_work_items, self._block_size, self._max_block_sizes, self._warp_size),
+            self.determine_block_size(
+                num_work_items, self._block_size, self._max_block_sizes, self._warp_size
+            ),
         )
         grid_size = cast(
             dim3,
@@ -408,13 +428,15 @@ class GpuIndexing:
             for _ in range(4 - rank)
         )
 
-        warp_size = (self._kernel_factory.create_lambda(PsExpression.make(PsConstant(self._warp_size, UInt(32)))),)
+        warp_size = (
+            self._kernel_factory.create_lambda(
+                PsExpression.make(PsConstant(self._warp_size, UInt(32)))
+            ),
+        )
 
         def factory():
             return AutomaticLaunchConfiguration(
-                block_size,
-                cast(_Dim3Lambda, grid_size),
-                warp_size
+                block_size, cast(_Dim3Lambda, grid_size), warp_size
             )
 
         return factory
diff --git a/tests/kernelcreation/test_gpu.py b/tests/kernelcreation/test_gpu.py
index 38e849c3c..ad3d2bd3e 100644
--- a/tests/kernelcreation/test_gpu.py
+++ b/tests/kernelcreation/test_gpu.py
@@ -30,9 +30,7 @@ except ImportError:
 
 @pytest.mark.parametrize("indexing_scheme", ["linear3d", "blockwise4d"])
 @pytest.mark.parametrize("manual_grid", [False, True])
-def test_indexing_options(
-    indexing_scheme: str, manual_grid: bool
-):
+def test_indexing_options(indexing_scheme: str, manual_grid: bool):
     src, dst = fields("src, dst: [3D]")
     asm = Assignment(
         dst.center(),
-- 
GitLab


From a8f193c14ae6bc808afb35743a215bab1feb2e2e Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Mon, 3 Mar 2025 17:22:10 +0100
Subject: [PATCH 14/54] Add small description of block config determination in
 docs for gpu codegen

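Illustrative sketch of the behaviour described in the added paragraph
(option names as used in tests/kernelcreation/test_gpu.py; the resulting
sizes depend on the iteration space and warp size at launch time):

    from pystencils import CreateKernelConfig, Target

    cfg = CreateKernelConfig(target=Target.CUDA)
    cfg.gpu.block_size = (10, 10, 1)  # a request, not a guarantee:
    # the dynamic launch configuration may trim this to the iteration space
    # and round it towards a warp-size multiple before launching the kernel.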
---
 docs/source/backend/gpu_codegen.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/docs/source/backend/gpu_codegen.md b/docs/source/backend/gpu_codegen.md
index 1082669e6..e0d9549c4 100644
--- a/docs/source/backend/gpu_codegen.md
+++ b/docs/source/backend/gpu_codegen.md
@@ -72,6 +72,12 @@ These depend on the type of the launch configuration:
 while the `AutomaticLaunchConfiguration` permits no modification and computes grid and block size directly from kernel
 parameters,
 the `ManualLaunchConfiguration` requires the user to manually specifiy both grid and block size.
+The `DynamicBlockSizeLaunchConfiguration` permits the user to set a block size which is used to then dynamically compute
+the grid size. However, the actual block size being used may differ from the user specification. This may origin from
+trimming operations between the original block size and the iteration space to avoid spawning unnecessary large blocks.
+Another factor affecting the actual block size is having block sizes that match with the warp size given by the hardware 
+for an improved performance. Here, the block size is rounded to a multiple of the warp size while considering the
+maximum amount of threads per block.
 
 The `evaluate` method can only be used from within a Python runtime environment.
 When exporting pystencils CUDA kernels for external use in C++ projects,
-- 
GitLab


From f12d2f8b727b03859f4d8a4185b7c9cfd1a72264 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Tue, 4 Mar 2025 11:52:10 +0100
Subject: [PATCH 15/54] Fix lint

---
 src/pystencils/codegen/gpu_indexing.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 5bdb0b24a..72de41979 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -81,7 +81,7 @@ class GpuLaunchConfiguration(ABC):
                 return (
                     *to_round[:index_to_round],
                     round(to_round[index_to_round]),
-                    *to_round[index_to_round + 1 :],
+                    *to_round[index_to_round + 1:],
                 )
             else:
                 return (*to_round[:index_to_round], round(to_round[index_to_round]))
-- 
GitLab


From 4ea0ea261c4512351e2decc83a5ddca33da5bbc2 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 5 Mar 2025 13:28:33 +0100
Subject: [PATCH 16/54] Move ceil_to_multiple to utils module

---
 src/pystencils/codegen/gpu_indexing.py | 5 +----
 src/pystencils/utils.py                | 5 +++++
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 72de41979..539e4b58b 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -20,7 +20,7 @@ from ..backend.platforms.cuda import ThreadMapping
 from ..backend.ast.expressions import PsExpression
 from math import prod
 
-from ..sympyextensions.integer_functions import div_ceil
+from ..utils import div_ceil, ceil_to_multiple
 from ..types.quick import UInt
 
 dim3 = tuple[int, int, int]
@@ -161,9 +161,6 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
         return self._params
 
     def evaluate(self, **kwargs) -> tuple[dim3, dim3]:
-        def ceil_to_multiple(arg1, arg2):
-            return div_ceil(arg1 + arg2 - 1, arg2) * arg2
-
         # impossible to use block size determination function since the iteration space is unknown
         # -> round block size in fastest moving dimension up to multiple of warp size
         block_size = tuple(int(bs(**kwargs)) for bs in self._block_size)
diff --git a/src/pystencils/utils.py b/src/pystencils/utils.py
index 0049d0a2c..234f65a5e 100644
--- a/src/pystencils/utils.py
+++ b/src/pystencils/utils.py
@@ -316,3 +316,8 @@ def div_ceil(divident, divisor):
     The result is unspecified if either argument is negative."""
 
     return c_intdiv(divident + divisor - 1, divisor)
+
+
+def ceil_to_multiple(divident, divisor):
+    """Rounds 'divident' to the next multiple of 'divisor'."""
+    return div_ceil(divident + divisor - 1, divisor) * divisor
-- 
GitLab


From 35e93e54291933b8bf65b0c6278e025f6238766c Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 5 Mar 2025 13:32:15 +0100
Subject: [PATCH 17/54] Omit lambda for warp size in GpuIndexing

---
 src/pystencils/codegen/gpu_indexing.py | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 539e4b58b..557eb8ba4 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -24,7 +24,6 @@ from ..utils import div_ceil, ceil_to_multiple
 from ..types.quick import UInt
 
 dim3 = tuple[int, int, int]
-_Dim1Lambda = tuple[Lambda]
 _Dim3Lambda = tuple[Lambda, Lambda, Lambda]
 
 
@@ -146,7 +145,7 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
     """
 
     def __init__(
-        self, block_size: _Dim3Lambda, grid_size: _Dim3Lambda, warp_size: _Dim1Lambda
+        self, block_size: _Dim3Lambda, grid_size: _Dim3Lambda, warp_size: int
     ) -> None:
         self._block_size = block_size
         self._grid_size = grid_size
@@ -165,7 +164,7 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
         # -> round block size in fastest moving dimension up to multiple of warp size
         block_size = tuple(int(bs(**kwargs)) for bs in self._block_size)
         block_size = (
-            ceil_to_multiple(block_size[0], int(self._warp_size[0](**kwargs))),
+            ceil_to_multiple(block_size[0], self._warp_size),
             *block_size[1:],
         )
 
@@ -425,15 +424,9 @@ class GpuIndexing:
             for _ in range(4 - rank)
         )
 
-        warp_size = (
-            self._kernel_factory.create_lambda(
-                PsExpression.make(PsConstant(self._warp_size, UInt(32)))
-            ),
-        )
-
         def factory():
             return AutomaticLaunchConfiguration(
-                block_size, cast(_Dim3Lambda, grid_size), warp_size
+                block_size, cast(_Dim3Lambda, grid_size), self._warp_size
             )
 
         return factory
-- 
GitLab


From 9891308beaea99fbe4ea4101199bdc00fc7c89b1 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 5 Mar 2025 13:36:53 +0100
Subject: [PATCH 18/54] Refactor condition for failure in
 check_sizes_and_return and emit reason in error message

---
 src/pystencils/codegen/gpu_indexing.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 557eb8ba4..91f89bdbf 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -86,12 +86,13 @@ class GpuLaunchConfiguration(ABC):
                 return (*to_round[:index_to_round], round(to_round[index_to_round]))
 
         def check_sizes_and_return(to_return: tuple[int, ...]) -> tuple[int, ...]:
-            if not all(
-                size < max_size for size, max_size in zip(to_return, max_block_sizes)
+            if any(
+                size >= max_size for size, max_size in zip(to_return, max_block_sizes)
             ) or prod(to_return) > max(max_block_sizes):
                 raise CodegenError(
                     f"Unable to determine GPU block size for this kernel for initial block size config"
                     f"{block_size} and iteration space {it_space}."
+                    f"Final block size was too large: {to_return}."
                 )
             return to_return
 
-- 
GitLab


From e3ca4e0546b88a3eaa7c5fae56de8c9722e19b1e Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Wed, 5 Mar 2025 13:38:55 +0100
Subject: [PATCH 19/54] Apply 2 suggestion(s) to 2 file(s)

Co-authored-by: Frederik Hennig <frederik.hennig@fau.de>
---
 docs/source/backend/gpu_codegen.md | 8 ++++----
 src/pystencils/codegen/config.py   | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/docs/source/backend/gpu_codegen.md b/docs/source/backend/gpu_codegen.md
index e0d9549c4..70b9db75b 100644
--- a/docs/source/backend/gpu_codegen.md
+++ b/docs/source/backend/gpu_codegen.md
@@ -73,10 +73,10 @@ while the `AutomaticLaunchConfiguration` permits no modification and computes gr
 parameters,
 the `ManualLaunchConfiguration` requires the user to manually specify both grid and block size.
 The `DynamicBlockSizeLaunchConfiguration` permits the user to set a block size which is used to then dynamically compute
-the grid size. However, the actual block size being used may differ from the user specification. This may origin from
-trimming operations between the original block size and the iteration space to avoid spawning unnecessary large blocks.
-Another factor affecting the actual block size is having block sizes that match with the warp size given by the hardware 
-for an improved performance. Here, the block size is rounded to a multiple of the warp size while considering the
+the grid size. However, the actual block size being used may differ from the user specification. This may be due to
+trimming operations between the original block size and the iteration space to avoid spawning unnecessarily large blocks.
+Another factor affecting the actual block size is the need to have block sizes that match with the warp size given by the hardware 
+for improved performance. Here, the block size is rounded to a multiple of the warp size while considering the
 maximum amount of threads per block.
 
 The `evaluate` method can only be used from within a Python runtime environment.
diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py
index e369dde4a..803b4a7e6 100644
--- a/src/pystencils/codegen/config.py
+++ b/src/pystencils/codegen/config.py
@@ -397,7 +397,7 @@ class GpuOptions(ConfigBase):
     warp_size: BasicOption[int] = BasicOption()
     """Specifies the size of a warp (CUDA) or wavefront (HIP).
     
-    If set to `None` (the default), the default value for the given target will be automatically used.
+    If this option is not set the default value for the given target will be automatically used.
     """
 
     @staticmethod
-- 
GitLab


From 221296727be890c48a34f3b4f0eaa905765d53eb Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 5 Mar 2025 15:35:25 +0100
Subject: [PATCH 20/54] Minor fix for parameters of automatic launch config

---
 src/pystencils/codegen/gpu_indexing.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 91f89bdbf..462cb0981 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -153,7 +153,7 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
         self._warp_size = warp_size
 
         self._params: frozenset[Parameter] = frozenset().union(
-            *(lb.parameters for lb in chain(block_size, grid_size, warp_size))
+            *(lb.parameters for lb in chain(block_size, grid_size))
         )
 
     @property
-- 
GitLab


From 2b50014e48bd2755b9f25760bb7330836122ea5e Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 5 Mar 2025 16:58:11 +0100
Subject: [PATCH 21/54] Remove max_block_sizes config option and add getters
 for gpu targets in GpuIndexing. Encapsulate in HardwareProperties dataclass

---
 src/pystencils/codegen/config.py       | 16 -----
 src/pystencils/codegen/driver.py       |  7 +-
 src/pystencils/codegen/gpu_indexing.py | 94 +++++++++++++++-----------
 3 files changed, 57 insertions(+), 60 deletions(-)

diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py
index e369dde4a..40401045d 100644
--- a/src/pystencils/codegen/config.py
+++ b/src/pystencils/codegen/config.py
@@ -380,12 +380,6 @@ class GpuOptions(ConfigBase):
     The block size may be overridden at runtime.
     """
 
-    max_block_sizes: BasicOption[tuple[int, int, int] | _AUTO_TYPE] = BasicOption()
-    """Maximum dimension size of a thread block in dimensions (x, y, z).
-    
-    If set to `None` (the default), the default value for the given target will be automatically used.
-    """
-
     manual_launch_grid: BasicOption[bool] = BasicOption(False)
     """Always require a manually specified launch grid when running this kernel.
     
@@ -410,16 +404,6 @@ class GpuOptions(ConfigBase):
                     f"No default warp/wavefront size known for target {target}"
                 )
 
-    @staticmethod
-    def default_maximum_block_sizes(target: Target):
-        match target:
-            case Target.CUDA:
-                return (1024, 1024, 64)
-            case _:
-                raise NotImplementedError(
-                    f"No default value known for maximum GPU block sizes available on target {target}"
-                )
-
     @indexing_scheme.validate
     def _validate_idx_scheme(self, val: str | GpuIndexingScheme):
         if isinstance(val, GpuIndexingScheme):
diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index 9404f9754..c70bc6c68 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -408,20 +408,15 @@ class DefaultKernelCreationDriver:
         block_size: dim3 | _AUTO_TYPE = self._cfg.gpu.get_option("block_size")
         manual_launch_grid: bool = self._cfg.gpu.get_option("manual_launch_grid")
         warp_size: int | None = self._cfg.gpu.get_option("warp_size")
-        max_block_sizes: tuple[int, int, int] | None = self._cfg.gpu.get_option(
-            "max_block_sizes"
-        )
 
         if warp_size is None:
             warp_size = GpuOptions.default_warp_size(self._target)
-        if max_block_sizes is None:
-            max_block_sizes = GpuOptions.default_maximum_block_sizes(self._target)
 
         return GpuIndexing(
             self._ctx,
+            self._target,
             idx_scheme,
             warp_size,
-            max_block_sizes,
             block_size,
             manual_launch_grid,
         )
diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 462cb0981..797458dd9 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
+from dataclasses import dataclass
 from typing import cast, Any, Callable
 from itertools import chain
 
@@ -8,7 +9,7 @@ from .functions import Lambda
 from .parameters import Parameter
 from .errors import CodegenError
 from .config import GpuIndexingScheme, _AUTO_TYPE
-from ..backend.constants import PsConstant
+from .target import Target
 
 from ..backend.kernelcreation import (
     KernelCreationContext,
@@ -20,13 +21,19 @@ from ..backend.platforms.cuda import ThreadMapping
 from ..backend.ast.expressions import PsExpression
 from math import prod
 
-from ..utils import div_ceil, ceil_to_multiple
-from ..types.quick import UInt
+from ..utils import ceil_to_multiple
 
 dim3 = tuple[int, int, int]
 _Dim3Lambda = tuple[Lambda, Lambda, Lambda]
 
 
+@dataclass
+class HardwareProperties:
+    warp_size: int
+    max_threads_per_block: int
+    max_block_sizes: dim3
+
+
 class GpuLaunchConfiguration(ABC):
     """Base class for launch configurations for CUDA and HIP kernels.
 
@@ -60,35 +67,32 @@ class GpuLaunchConfiguration(ABC):
     def determine_block_size(
         it_space: tuple[int, ...],
         block_size: tuple[int, int, int],
-        max_block_sizes: tuple[int, int, int],
-        warp_size: int,
+        hw_props: HardwareProperties
     ) -> tuple[int, ...]:
         """Returns an optimized block size configuration with block sizes being aligned with the warp size."""
 
         def trim(to_trim: list[int]) -> list[int]:
             return [min(b, i) for b, i in zip(to_trim, it_space)]
 
-        def ceil_to_multiple(to_round: list[int]) -> tuple[int, ...]:
-            def round(val):
-                return div_ceil(val, warp_size) * warp_size
-
+        def round_blocksizes(to_round: list[int]) -> tuple[int, ...]:
             # find index of element closest to warp size and round up
             index_to_round = to_round.index(
-                max(to_round, key=lambda i: abs(i % warp_size))
+                max(to_round, key=lambda i: abs(i % hw_props.warp_size))
             )
             if index_to_round + 1 < len(to_round):
                 return (
                     *to_round[:index_to_round],
-                    round(to_round[index_to_round]),
+                    ceil_to_multiple(to_round[index_to_round], hw_props.warp_size),
                     *to_round[index_to_round + 1:],
                 )
             else:
-                return (*to_round[:index_to_round], round(to_round[index_to_round]))
+                return (*to_round[:index_to_round],
+                        ceil_to_multiple(to_round[index_to_round], hw_props.warp_size))
 
         def check_sizes_and_return(to_return: tuple[int, ...]) -> tuple[int, ...]:
             if any(
-                size >= max_size for size, max_size in zip(to_return, max_block_sizes)
-            ) or prod(to_return) > max(max_block_sizes):
+                size >= max_size for size, max_size in zip(to_return, hw_props.max_block_sizes)
+            ) or prod(to_return) > hw_props.max_threads_per_block:
                 raise CodegenError(
                     f"Unable to determine GPU block size for this kernel for initial block size config"
                     f"{block_size} and iteration space {it_space}."
@@ -97,7 +101,7 @@ class GpuLaunchConfiguration(ABC):
             return to_return
 
         trimmed = trim(list(block_size))
-        if prod(trimmed) >= warp_size and prod(trimmed) % warp_size == 0:
+        if prod(trimmed) >= hw_props.warp_size and prod(trimmed) % hw_props.warp_size == 0:
             # case 1: greater than min block size -> use trimmed result
             return check_sizes_and_return(tuple(trimmed))
 
@@ -108,8 +112,8 @@ class GpuLaunchConfiguration(ABC):
 
             # case 2: trimmed block is equivalent to the whole iteration space
             if all(b == i for b, i in zip(trimmed, it_space)):
-                if prod(trimmed) % warp_size != 0:
-                    return check_sizes_and_return(ceil_to_multiple(trimmed))
+                if prod(trimmed) % hw_props.warp_size != 0:
+                    return check_sizes_and_return(round_blocksizes(trimmed))
                 else:
                     return check_sizes_and_return(tuple(trimmed))
             else:
@@ -120,20 +124,20 @@ class GpuLaunchConfiguration(ABC):
                     # trim fastest moving dim to multiples of warp size
                     if (
                         d == 0
-                        and trimmed[d] > warp_size
-                        and trimmed[d] % warp_size != 0
+                        and trimmed[d] > hw_props.warp_size
+                        and trimmed[d] % hw_props.warp_size != 0
                     ):
                         # subtract remainder
-                        trimmed[d] = trimmed[d] - (trimmed[d] % warp_size)
+                        trimmed[d] = trimmed[d] - (trimmed[d] % hw_props.warp_size)
 
                     # check if block sizes are within hardware capabilities
-                    trimmed[d] = min(trimmed[d], max_block_sizes[d])
+                    trimmed[d] = min(trimmed[d], hw_props.max_block_sizes[d])
 
                     # trim again
                     trimmed = trim(trimmed)
 
                     # case 3: trim block is large enough
-                    if prod(trimmed) >= warp_size:
+                    if prod(trimmed) >= hw_props.warp_size and prod(trimmed) % hw_props.warp_size != 0:
                         return check_sizes_and_return(tuple(trimmed))
 
         raise CodegenError("Unable to determine GPU block size for this kernel.")
@@ -146,11 +150,11 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
     """
 
     def __init__(
-        self, block_size: _Dim3Lambda, grid_size: _Dim3Lambda, warp_size: int
+        self, block_size: _Dim3Lambda, grid_size: _Dim3Lambda, hw_props: HardwareProperties
     ) -> None:
         self._block_size = block_size
         self._grid_size = grid_size
-        self._warp_size = warp_size
+        self._hw_props = hw_props
 
         self._params: frozenset[Parameter] = frozenset().union(
             *(lb.parameters for lb in chain(block_size, grid_size))
@@ -165,7 +169,7 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
         # -> round block size in fastest moving dimension up to multiple of warp size
         block_size = tuple(int(bs(**kwargs)) for bs in self._block_size)
         block_size = (
-            ceil_to_multiple(block_size[0], self._warp_size),
+            ceil_to_multiple(block_size[0], self._hw_props.warp_size),
             *block_size[1:],
         )
 
@@ -235,15 +239,12 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
     def __init__(
         self,
         num_work_items: _Dim3Lambda,
-        warp_size: int,
-        max_block_sizes: dim3,
+        hw_props: HardwareProperties,
         default_block_size: dim3 | None = None,
     ) -> None:
         self._num_work_items = num_work_items
 
-        self._warp_size = warp_size
-
-        self._max_block_sizes = max_block_sizes
+        self._hw_props = hw_props
 
         self._block_size: dim3 | None = default_block_size
 
@@ -282,9 +283,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
         )
         reduced_block_size = cast(
             dim3,
-            self.determine_block_size(
-                num_work_items, self._block_size, self._max_block_sizes, self._warp_size
-            ),
+            self.determine_block_size(num_work_items, self._block_size, self._hw_props),
         )
         grid_size = cast(
             dim3,
@@ -318,25 +317,45 @@ class GpuIndexing:
     def __init__(
         self,
         ctx: KernelCreationContext,
+        target: Target,
         scheme: GpuIndexingScheme,
         warp_size: int,
-        max_block_sizes: dim3,
         default_block_size: dim3 | _AUTO_TYPE | None = None,
         manual_launch_grid: bool = False,
     ) -> None:
         self._ctx = ctx
+        self._target = target
         self._scheme = scheme
         self._warp_size = warp_size
-        self._max_block_sizes = max_block_sizes
         self._default_block_size = default_block_size
         self._manual_launch_grid = manual_launch_grid
 
+        self._hw_props = HardwareProperties(warp_size,
+                                            self.get_max_threads_per_block(target),
+                                            self.get_max_block_sizes(target))
+
         from ..backend.kernelcreation import AstFactory
         from .driver import KernelFactory
 
         self._ast_factory = AstFactory(self._ctx)
         self._kernel_factory = KernelFactory(self._ctx)
 
+    @staticmethod
+    def get_max_block_sizes(target: Target):
+        match target:
+            case Target.CUDA:
+                return (1024, 1024, 64)
+            case _:
+                raise CodegenError(f"Cannot determine max GPU block sizes for target {target}")
+
+    @staticmethod
+    def get_max_threads_per_block(target: Target):
+        match target:
+            case Target.CUDA:
+                return 1024
+            case _:
+                raise CodegenError(f"Cannot determine max GPU threads per block for target {target}")
+
     def get_thread_mapping(self) -> ThreadMapping:
         """Retrieve a thread mapping object for use by the backend"""
 
@@ -379,8 +398,7 @@ class GpuIndexing:
         def factory():
             return DynamicBlockSizeLaunchConfiguration(
                 num_work_items,
-                self._warp_size,
-                self._max_block_sizes,
+                self._hw_props,
                 self._get_default_block_size(rank),
             )
 
@@ -427,7 +445,7 @@ class GpuIndexing:
 
         def factory():
             return AutomaticLaunchConfiguration(
-                block_size, cast(_Dim3Lambda, grid_size), self._warp_size
+                block_size, cast(_Dim3Lambda, grid_size), self._hw_props
             )
 
         return factory
-- 
GitLab
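
To illustrate how the new `HardwareProperties` dataclass bundles the limits that were previously passed as separate arguments, here is a small standalone sketch. The CUDA numbers are taken from the `get_max_*` getters added above; the warp size of 32 is the usual CUDA default, and `within_limits` is a hypothetical helper, not pystencils API.

```python
from dataclasses import dataclass
from math import prod

@dataclass
class HardwareProperties:
    warp_size: int
    max_threads_per_block: int
    max_block_sizes: tuple[int, int, int]

cuda = HardwareProperties(warp_size=32, max_threads_per_block=1024,
                          max_block_sizes=(1024, 1024, 64))

def within_limits(block_size: tuple[int, int, int], hw: HardwareProperties) -> bool:
    # both the per-dimension limits and the total thread budget must be respected
    return (all(s <= m for s, m in zip(block_size, hw.max_block_sizes))
            and prod(block_size) <= hw.max_threads_per_block)

print(within_limits((32, 8, 4), cuda))    # True: 1024 threads, all dimensions in range
print(within_limits((1, 1, 128), cuda))   # False: z exceeds the per-dimension limit of 64
```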


From 60d6323b795e35cefd9e521ee3e186c6fd074413 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 5 Mar 2025 17:01:00 +0100
Subject: [PATCH 22/54] Adapt formatting of gpu_indexing.py

---
 src/pystencils/codegen/gpu_indexing.py | 49 ++++++++++++++++++--------
 1 file changed, 35 insertions(+), 14 deletions(-)

diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 797458dd9..ecc389f18 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -67,7 +67,7 @@ class GpuLaunchConfiguration(ABC):
     def determine_block_size(
         it_space: tuple[int, ...],
         block_size: tuple[int, int, int],
-        hw_props: HardwareProperties
+        hw_props: HardwareProperties,
     ) -> tuple[int, ...]:
         """Returns an optimized block size configuration with block sizes being aligned with the warp size."""
 
@@ -86,13 +86,19 @@ class GpuLaunchConfiguration(ABC):
                     *to_round[index_to_round + 1:],
                 )
             else:
-                return (*to_round[:index_to_round],
-                        ceil_to_multiple(to_round[index_to_round], hw_props.warp_size))
+                return (
+                    *to_round[:index_to_round],
+                    ceil_to_multiple(to_round[index_to_round], hw_props.warp_size),
+                )
 
         def check_sizes_and_return(to_return: tuple[int, ...]) -> tuple[int, ...]:
-            if any(
-                size >= max_size for size, max_size in zip(to_return, hw_props.max_block_sizes)
-            ) or prod(to_return) > hw_props.max_threads_per_block:
+            if (
+                any(
+                    size >= max_size
+                    for size, max_size in zip(to_return, hw_props.max_block_sizes)
+                )
+                or prod(to_return) > hw_props.max_threads_per_block
+            ):
                 raise CodegenError(
                     f"Unable to determine GPU block size for this kernel for initial block size config"
                     f"{block_size} and iteration space {it_space}."
@@ -101,7 +107,10 @@ class GpuLaunchConfiguration(ABC):
             return to_return
 
         trimmed = trim(list(block_size))
-        if prod(trimmed) >= hw_props.warp_size and prod(trimmed) % hw_props.warp_size == 0:
+        if (
+            prod(trimmed) >= hw_props.warp_size
+            and prod(trimmed) % hw_props.warp_size == 0
+        ):
             # case 1: greater than min block size -> use trimmed result
             return check_sizes_and_return(tuple(trimmed))
 
@@ -137,7 +146,10 @@ class GpuLaunchConfiguration(ABC):
                     trimmed = trim(trimmed)
 
                     # case 3: trim block is large enough
-                    if prod(trimmed) >= hw_props.warp_size and prod(trimmed) % hw_props.warp_size != 0:
+                    if (
+                        prod(trimmed) >= hw_props.warp_size
+                        and prod(trimmed) % hw_props.warp_size != 0
+                    ):
                         return check_sizes_and_return(tuple(trimmed))
 
         raise CodegenError("Unable to determine GPU block size for this kernel.")
@@ -150,7 +162,10 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
     """
 
     def __init__(
-        self, block_size: _Dim3Lambda, grid_size: _Dim3Lambda, hw_props: HardwareProperties
+        self,
+        block_size: _Dim3Lambda,
+        grid_size: _Dim3Lambda,
+        hw_props: HardwareProperties,
     ) -> None:
         self._block_size = block_size
         self._grid_size = grid_size
@@ -330,9 +345,11 @@ class GpuIndexing:
         self._default_block_size = default_block_size
         self._manual_launch_grid = manual_launch_grid
 
-        self._hw_props = HardwareProperties(warp_size,
-                                            self.get_max_threads_per_block(target),
-                                            self.get_max_block_sizes(target))
+        self._hw_props = HardwareProperties(
+            warp_size,
+            self.get_max_threads_per_block(target),
+            self.get_max_block_sizes(target),
+        )
 
         from ..backend.kernelcreation import AstFactory
         from .driver import KernelFactory
@@ -346,7 +363,9 @@ class GpuIndexing:
             case Target.CUDA:
                 return (1024, 1024, 64)
             case _:
-                raise CodegenError(f"Cannot determine max GPU block sizes for target {target}")
+                raise CodegenError(
+                    f"Cannot determine max GPU block sizes for target {target}"
+                )
 
     @staticmethod
     def get_max_threads_per_block(target: Target):
@@ -354,7 +373,9 @@ class GpuIndexing:
             case Target.CUDA:
                 return 1024
             case _:
-                raise CodegenError(f"Cannot determine max GPU threads per block for target {target}")
+                raise CodegenError(
+                    f"Cannot determine max GPU threads per block for target {target}"
+                )
 
     def get_thread_mapping(self) -> ThreadMapping:
         """Retrieve a thread mapping object for use by the backend"""
-- 
GitLab


From 26ba005a4ba9e2f50f446572c531b2539ee4a74f Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 6 Mar 2025 15:52:03 +0100
Subject: [PATCH 23/54] Slightly adapt block size fitting algorithm

---
 src/pystencils/codegen/gpu_indexing.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index ecc389f18..7edb7c8e6 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -94,14 +94,14 @@ class GpuLaunchConfiguration(ABC):
         def check_sizes_and_return(to_return: tuple[int, ...]) -> tuple[int, ...]:
             if (
                 any(
-                    size >= max_size
+                    size > max_size
                     for size, max_size in zip(to_return, hw_props.max_block_sizes)
                 )
                 or prod(to_return) > hw_props.max_threads_per_block
             ):
                 raise CodegenError(
-                    f"Unable to determine GPU block size for this kernel for initial block size config"
-                    f"{block_size} and iteration space {it_space}."
+                    f"Unable to determine GPU block size for this kernel for initial block size config "
+                    f"{block_size} and iteration space {it_space}. "
                     f"Final block size was too large: {to_return}."
                 )
             return to_return
@@ -148,7 +148,7 @@ class GpuLaunchConfiguration(ABC):
                     # case 3: trim block is large enough
                     if (
                         prod(trimmed) >= hw_props.warp_size
-                        and prod(trimmed) % hw_props.warp_size != 0
+                        and prod(trimmed) % hw_props.warp_size == 0
                     ):
                         return check_sizes_and_return(tuple(trimmed))
 
-- 
GitLab
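
A quick standalone check of why the comparison was relaxed from `>=` to `>`: a block size that exactly hits a hardware limit, such as 1024 threads along x on CUDA, is still valid and must not be rejected. Plain Python, limits as in the getters introduced above.

```python
max_block_sizes = (1024, 1024, 64)
block = (1024, 1, 1)                  # exactly at the x limit, 1024 threads in total

rejected_before = any(s >= m for s, m in zip(block, max_block_sizes))  # True  -> false positive
rejected_after = any(s > m for s, m in zip(block, max_block_sizes))    # False -> accepted
print(rejected_before, rejected_after)
```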


From 59903a15073b945d7d5d9e8c5aecf8792858aac4 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 6 Mar 2025 15:53:06 +0100
Subject: [PATCH 24/54] Add test checking generated GPU launch configurations

---
 tests/kernelcreation/test_gpu.py | 69 ++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)

diff --git a/tests/kernelcreation/test_gpu.py b/tests/kernelcreation/test_gpu.py
index ad3d2bd3e..5ddb2978c 100644
--- a/tests/kernelcreation/test_gpu.py
+++ b/tests/kernelcreation/test_gpu.py
@@ -12,6 +12,7 @@ from pystencils import (
     create_kernel,
     Target,
 )
+from pystencils.codegen.gpu_indexing import GpuIndexing, HardwareProperties
 
 from pystencils.slicing import (
     add_ghost_layers,
@@ -19,6 +20,7 @@ from pystencils.slicing import (
     remove_ghost_layers,
     normalize_slice,
 )
+from math import prod
 
 try:
     import cupy as cp
@@ -72,6 +74,73 @@ def test_indexing_options(indexing_scheme: str, manual_grid: bool):
     cp.testing.assert_allclose(dst_arr, expected)
 
 
+@pytest.mark.parametrize("indexing_scheme", ["linear3d", "blockwise4d"])
+@pytest.mark.parametrize("iteration_space",
+                         [(8, 4, 4), (3, 8, 8), (3, 3, 16), (17, 3, 3), (3, 12, 56), (65, 65, 65)])
+@pytest.mark.parametrize("initial_block_size",
+                         [(8, 4, 4), (3, 8, 8), (3, 3, 16), (8, 2, 1), (3, 1, 32), (32, 1, 1), (1, 2, 3)])
+def test_indexing_launch_config(
+        indexing_scheme: str,
+        iteration_space: tuple[int, int, int],
+        initial_block_size: tuple[int, int, int]
+):
+
+    src, dst = fields("src, dst: [3D]")
+    asm = Assignment(
+        dst.center(),
+        src[-1, 0, 0]
+        + src[1, 0, 0]
+        + src[0, -1, 0]
+        + src[0, 1, 0]
+        + src[0, 0, -1]
+        + src[0, 0, 1],
+    )
+
+    target = Target.CUDA
+    cfg = CreateKernelConfig(target=target)
+    cfg.gpu.indexing_scheme = indexing_scheme
+
+    ast = create_kernel(asm, cfg)
+    kernel = ast.compile()
+
+    warp_size = cfg.gpu.default_warp_size(target)
+    max_threads_per_block = GpuIndexing.get_max_threads_per_block(target)
+    max_block_sizes = GpuIndexing.get_max_block_sizes(target)
+
+    block_size = kernel.launch_config.determine_block_size(
+        iteration_space,
+        initial_block_size,
+        HardwareProperties(warp_size, max_threads_per_block, max_block_sizes)
+    )
+
+    def check_suitability(b):
+        return prod(b) >= warp_size and prod(b) % warp_size == 0
+
+    # block size fitting should not modify an already ideal configuration
+    # -> check if ideal configurations are modified
+    if (
+            check_suitability(initial_block_size)
+        and all(x == y for x, y in zip(initial_block_size, iteration_space))  # trimming may alter results
+    ):
+        assert all(x == y for x, y in zip(initial_block_size, block_size)), \
+            f"Initial block size unnecessarily adapted from {initial_block_size} to {block_size}."
+
+    assert check_suitability(block_size), \
+        "Determined block size shall be divisible by warp size."
+
+    kernel.launch_config.block_size = block_size
+
+    src_arr = cp.ones(iteration_space)
+    dst_arr = cp.zeros_like(src_arr)
+
+    kernel(src=src_arr, dst=dst_arr)
+
+    expected = cp.zeros_like(src_arr)
+    expected[1:-1, 1:-1, 1:-1].fill(6.0)
+
+    cp.testing.assert_allclose(dst_arr, expected)
+
+
 def test_invalid_indexing_schemes():
     src, dst = fields("src, dst: [4D]")
     asm = Assignment(src.center(0), dst.center(0))
-- 
GitLab


From 5c840aa50a1c013e44e4709cef5764130aaf203a Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 6 Mar 2025 16:45:38 +0100
Subject: [PATCH 25/54] Rename determine_block_size to fit_block_size

---
 src/pystencils/codegen/gpu_indexing.py | 4 ++--
 tests/kernelcreation/test_gpu.py       | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 7edb7c8e6..3e41088c9 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -64,7 +64,7 @@ class GpuLaunchConfiguration(ABC):
         cache is invalidated."""
 
     @staticmethod
-    def determine_block_size(
+    def fit_block_size(
         it_space: tuple[int, ...],
         block_size: tuple[int, int, int],
         hw_props: HardwareProperties,
@@ -298,7 +298,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
         )
         reduced_block_size = cast(
             dim3,
-            self.determine_block_size(num_work_items, self._block_size, self._hw_props),
+            self.fit_block_size(num_work_items, self._block_size, self._hw_props),
         )
         grid_size = cast(
             dim3,
diff --git a/tests/kernelcreation/test_gpu.py b/tests/kernelcreation/test_gpu.py
index 5ddb2978c..73f3ff3d2 100644
--- a/tests/kernelcreation/test_gpu.py
+++ b/tests/kernelcreation/test_gpu.py
@@ -107,7 +107,7 @@ def test_indexing_launch_config(
     max_threads_per_block = GpuIndexing.get_max_threads_per_block(target)
     max_block_sizes = GpuIndexing.get_max_block_sizes(target)
 
-    block_size = kernel.launch_config.determine_block_size(
+    block_size = kernel.launch_config.fit_block_size(
         iteration_space,
         initial_block_size,
         HardwareProperties(warp_size, max_threads_per_block, max_block_sizes)
-- 
GitLab


From 946eccef8981bacd505b88c4d4c2abb3da1b0d60 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 6 Mar 2025 16:46:01 +0100
Subject: [PATCH 26/54] Adapt docstring for DynamicBlockSizeLaunchConfiguration

---
 src/pystencils/codegen/gpu_indexing.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 3e41088c9..b06bc4e14 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -243,12 +243,13 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration):
 class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
     """GPU launch configuration that permits the user to set a block size and dynamically computes the grid size.
 
-    The actual launch grid size is computed from the user-defined ``user_block_size`` and the number of work items
-    in the kernel's iteration space as follows.
+    In case the user-specified block size is not aligned with the warp size of the GPU,
+    it will be altered by a block size fitting algorithm.
+
+    The actual launch grid size is then computed as follows.
     For each dimension :math:`c \\in \\{ x, y, z \\}`,
 
-    - if ``user_block_size.c > num_work_items.c``, ``block_size = num_work_items.c`` and ``grid_size.c = 1``;
-    - otherwise, ``block_size.c = user_block_size.c`` and ``grid_size.c = ceil(num_work_items.c / block_size.c)``.
+    ``block_size.c = fit_block_size(user_block_size.c)`` and ``grid_size.c = ceil(num_work_items.c / block_size.c)``.
     """
 
     def __init__(
-- 
GitLab
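
As a worked example of the formula in the docstring above (a standalone sketch; `div_ceil` is assumed to be plain ceiling division as in `pystencils.utils`):

```python
def div_ceil(a: int, b: int) -> int:
    return -(-a // b)   # ceiling division

num_work_items = (65, 65, 65)
block_size = (64, 4, 1)              # e.g. a fitted, warp-aligned block size
grid_size = tuple(div_ceil(n, b) for n, b in zip(num_work_items, block_size))
print(grid_size)                     # (2, 17, 65): enough blocks to cover all work items
```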


From a5f511376461897de7e90485650a55e2696019ac Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 6 Mar 2025 17:01:27 +0100
Subject: [PATCH 27/54] Small adaptations to gpu_kernels.md user manual

---
 docs/source/user_manual/gpu_kernels.md | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/docs/source/user_manual/gpu_kernels.md b/docs/source/user_manual/gpu_kernels.md
index 610c61ddf..8e102c642 100644
--- a/docs/source/user_manual/gpu_kernels.md
+++ b/docs/source/user_manual/gpu_kernels.md
@@ -111,11 +111,16 @@ For most kernels with an at most three-dimensional iteration space,
 this behavior is sufficient and desired.
 It can be enforced by setting `gpu.indexing_scheme = "Linear3D"`.
 
-If the `Linear3D` indexing scheme is used, you may modifiy the GPU thread block size in two places.
+If the `Linear3D` indexing scheme is used, you may modify the GPU thread block size in two places.
 The default block size for the kernel can be set via the {any}`gpu.block_size <GpuOptions.block_size>` 
 code generator option;
 if none is specified, a default depending on the iteration space's dimensionality will be used.
 
+For performance reasons, the block size should be aligned with the hardware's warp size.
+We enforce this by applying the fitting algorithm found in `GpuLaunchConfiguration.fit_block_size`
+to adapt the initial block size configuration.
+This modification only occurs if the initial configuration is not already aligned.
+
 The block size can furthermore be modified at the compiled kernel's wrapper object via the
 `launch_config.block_size` attribute:
 
@@ -126,6 +131,8 @@ kfunc.launch_config.block_size = (256, 2, 1)
 # Run the kernel
 kfunc(f=f_arr, g=g_arr)
 ```
+Please note that when modifying the block size directly, it must align with the warp size,
+since setting it this way bypasses our fitting algorithm.
 
 In any case, pystencils will automatically compute the grid size from the shapes of the kernel's array arguments
 and the given thread block size.
-- 
GitLab
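
The note on directly setting `launch_config.block_size` can be checked with a tiny helper like the following (hypothetical, not pystencils API); the warp size of 32 is the common CUDA value.

```python
from math import prod

def is_warp_aligned(block_size: tuple[int, int, int], warp_size: int = 32) -> bool:
    # the total number of threads per block should be a multiple of the warp size
    return prod(block_size) % warp_size == 0

assert is_warp_aligned((256, 2, 1))       # the block size used in the manual's example
assert not is_warp_aligned((3, 7, 9))     # 189 threads -> not a multiple of 32
```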


From a0cc7e8dae0b52e0c34959f599e5c61246d022e4 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Fri, 7 Mar 2025 13:25:12 +0100
Subject: [PATCH 28/54] Fix ceil_to_multiple

---
 src/pystencils/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pystencils/utils.py b/src/pystencils/utils.py
index 234f65a5e..5ef13f31f 100644
--- a/src/pystencils/utils.py
+++ b/src/pystencils/utils.py
@@ -320,4 +320,4 @@ def div_ceil(divident, divisor):
 
 def ceil_to_multiple(divident, divisor):
     """Rounds 'divident' to the next multiple of 'divisor'."""
-    return div_ceil(divident + divisor - 1, divisor) * divisor
+    return div_ceil(divident, divisor) * divisor
-- 
GitLab
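
A worked example of the off-by-one fixed above, assuming `div_ceil` performs plain ceiling division (which is what the fix implies): adding `divisor - 1` before a ceiling division rounds exact multiples up one step too far.

```python
def div_ceil(a: int, b: int) -> int:
    return -(-a // b)   # ceiling division, stand-in for pystencils.utils.div_ceil

def ceil_to_multiple_old(a, b):
    return div_ceil(a + b - 1, b) * b   # variant removed by this patch

def ceil_to_multiple_new(a, b):
    return div_ceil(a, b) * b           # fixed variant

print(ceil_to_multiple_old(32, 32), ceil_to_multiple_new(32, 32))  # 64 vs 32: old over-rounds
print(ceil_to_multiple_old(33, 32), ceil_to_multiple_new(33, 32))  # 64 vs 64: agree otherwise
```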


From 09fcfd3d5c496f4dea4682a0ac6f7629b050caaa Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Fri, 7 Mar 2025 13:28:14 +0100
Subject: [PATCH 29/54] Terminate block size fitting if block size is large
 enough and round to next multiple of warp size

---
 src/pystencils/codegen/gpu_indexing.py | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index b06bc4e14..89b339990 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -74,7 +74,11 @@ class GpuLaunchConfiguration(ABC):
         def trim(to_trim: list[int]) -> list[int]:
             return [min(b, i) for b, i in zip(to_trim, it_space)]
 
-        def round_blocksizes(to_round: list[int]) -> tuple[int, ...]:
+        def round_block_sizes(to_round: list[int]) -> tuple[int, ...]:
+            # check if already aligns with warp size
+            if prod(to_round) % hw_props.warp_size == 0:
+                return tuple(to_round)
+
             # find index of element closest to warp size and round up
             index_to_round = to_round.index(
                 max(to_round, key=lambda i: abs(i % hw_props.warp_size))
@@ -121,10 +125,7 @@ class GpuLaunchConfiguration(ABC):
 
             # case 2: trimmed block is equivalent to the whole iteration space
             if all(b == i for b, i in zip(trimmed, it_space)):
-                if prod(trimmed) % hw_props.warp_size != 0:
-                    return check_sizes_and_return(round_blocksizes(trimmed))
-                else:
-                    return check_sizes_and_return(tuple(trimmed))
+                return check_sizes_and_return(round_block_sizes(trimmed))
             else:
                 # double block size in each dimension until block is large enough (or case 2 triggers)
                 for d in resize_order:
@@ -146,11 +147,8 @@ class GpuLaunchConfiguration(ABC):
                     trimmed = trim(trimmed)
 
                     # case 3: trim block is large enough
-                    if (
-                        prod(trimmed) >= hw_props.warp_size
-                        and prod(trimmed) % hw_props.warp_size == 0
-                    ):
-                        return check_sizes_and_return(tuple(trimmed))
+                    if prod(trimmed) >= hw_props.warp_size:
+                        return check_sizes_and_return(round_block_sizes(trimmed))
 
         raise CodegenError("Unable to determine GPU block size for this kernel.")
 
-- 
GitLab
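
A simplified, standalone restatement of the rounding step introduced above: if the block's thread count is already a multiple of the warp size, nothing changes; otherwise the dimension with the largest remainder modulo the warp size is rounded up.

```python
from math import prod

def ceil_to_multiple(a: int, b: int) -> int:
    return -(-a // b) * b

def round_block_sizes(to_round: list[int], warp_size: int = 32) -> tuple[int, ...]:
    if prod(to_round) % warp_size == 0:        # already warp aligned -> keep as is
        return tuple(to_round)
    i = to_round.index(max(to_round, key=lambda v: abs(v % warp_size)))
    return (*to_round[:i], ceil_to_multiple(to_round[i], warp_size), *to_round[i + 1:])

print(round_block_sizes([16, 2, 1]))   # (16, 2, 1): 32 threads, left untouched
print(round_block_sizes([3, 7, 9]))    # (3, 7, 32): z rounded up, 672 threads = 21 warps
```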


From 22f45248be16142ca2c56a7def2004c8c762caf3 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Fri, 7 Mar 2025 13:29:19 +0100
Subject: [PATCH 30/54] Add more edge cases to block size fitting test

---
 tests/kernelcreation/test_gpu.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/kernelcreation/test_gpu.py b/tests/kernelcreation/test_gpu.py
index 73f3ff3d2..e130db608 100644
--- a/tests/kernelcreation/test_gpu.py
+++ b/tests/kernelcreation/test_gpu.py
@@ -76,9 +76,9 @@ def test_indexing_options(indexing_scheme: str, manual_grid: bool):
 
 @pytest.mark.parametrize("indexing_scheme", ["linear3d", "blockwise4d"])
 @pytest.mark.parametrize("iteration_space",
-                         [(8, 4, 4), (3, 8, 8), (3, 3, 16), (17, 3, 3), (3, 12, 56), (65, 65, 65)])
+                         [(8, 4, 4), (3, 8, 8), (3, 3, 16), (17, 3, 3), (3, 12, 56), (65, 65, 65), (3, 7, 9)])
 @pytest.mark.parametrize("initial_block_size",
-                         [(8, 4, 4), (3, 8, 8), (3, 3, 16), (8, 2, 1), (3, 1, 32), (32, 1, 1), (1, 2, 3)])
+                         [(8, 4, 4), (3, 8, 8), (3, 3, 16), (2, 2, 64), (8, 2, 1), (3, 1, 32), (32, 1, 1), (1, 2, 3)])
 def test_indexing_launch_config(
         indexing_scheme: str,
         iteration_space: tuple[int, int, int],
-- 
GitLab


From 88485c6b57df0200516ef29d8936c7a4eedd26b0 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Mon, 10 Mar 2025 18:35:16 +0100
Subject: [PATCH 31/54] Distribute default block size cubically over dims

---
 src/pystencils/codegen/gpu_indexing.py | 4 ++--
 src/pystencils/jit/gpu_cupy.py         | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 5bdb0b24a..e0d655225 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -398,9 +398,9 @@ class GpuIndexing:
                 case 1:
                     return (256, 1, 1)
                 case 2:
-                    return (128, 2, 1)
+                    return (16, 16, 1)
                 case 3:
-                    return (128, 2, 2)
+                    return (8, 8, 8)
                 case _:
                     assert False, "unreachable code"
         else:
diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py
index 1c771a427..1fddf6f77 100644
--- a/src/pystencils/jit/gpu_cupy.py
+++ b/src/pystencils/jit/gpu_cupy.py
@@ -207,7 +207,7 @@ class CupyKernelWrapper(KernelWrapper):
 
 class CupyJit(JitBase):
 
-    def __init__(self, default_block_size: Sequence[int] = (128, 2, 1)):
+    def __init__(self, default_block_size: Sequence[int] = (16, 16, 1)):
         self._runtime_headers = {"<cstdint>"}
 
         if len(default_block_size) > 3:
-- 
GitLab
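
For context, the thread budget of the old and new default block sizes (values from the hunks above):

```python
from math import prod

print(prod((128, 2, 1)), prod((16, 16, 1)))   # 2D default: 256 threads either way
print(prod((128, 2, 2)), prod((8, 8, 8)))     # 3D default: 512 threads either way
```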


From 4728c8e70217ca51214420d62ace2ff44fade312 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Mon, 10 Mar 2025 18:54:43 +0100
Subject: [PATCH 32/54] Use default block size if block size fitting fails

---
 src/pystencils/codegen/gpu_indexing.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index a29c2f7bf..3a05d9ea6 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -4,6 +4,7 @@ from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from typing import cast, Any, Callable
 from itertools import chain
+from warnings import warn
 
 from .functions import Lambda
 from .parameters import Parameter
@@ -295,10 +296,15 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
         num_work_items = cast(
             dim3, tuple(int(wit(**kwargs)) for wit in self._num_work_items)
         )
-        reduced_block_size = cast(
-            dim3,
-            self.fit_block_size(num_work_items, self._block_size, self._hw_props),
-        )
+        try:
+            reduced_block_size = cast(
+                dim3,
+                self.fit_block_size(num_work_items, self._block_size, self._hw_props),
+            )
+        except CodegenError:
+            warn(f"Block size fitting could not determine optimal block size configuration. "
+                 f"Defaulting back to {self._block_size}")
+            reduced_block_size = self._block_size
         grid_size = cast(
             dim3,
             tuple(
-- 
GitLab
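
A minimal, self-contained sketch of the fallback pattern added above; `CodegenError` and `fit_block_size` are stand-ins for the pystencils symbols, and the failure is simulated.

```python
from warnings import warn

class CodegenError(Exception):          # stand-in for pystencils' CodegenError
    pass

def fit_block_size(num_work_items, block_size, hw_props):
    raise CodegenError("no warp-aligned block size found")   # simulated failure

def evaluate_block_size(num_work_items, block_size, hw_props=None):
    try:
        return fit_block_size(num_work_items, block_size, hw_props)
    except CodegenError:
        warn("Block size fitting could not determine optimal block size configuration. "
             f"Defaulting back to {block_size}")
        return block_size

print(evaluate_block_size((3, 3, 3), (16, 16, 1)))   # -> (16, 16, 1) after the warning
```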


From 0d8e962adaa969bfc17928da15772a1ee3d6b5f0 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Tue, 11 Mar 2025 12:58:22 +0100
Subject: [PATCH 33/54] Adapt 3d default gpu block size to reduce register
 pressure

---
 src/pystencils/codegen/gpu_indexing.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index cef20b3f6..018b62c58 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -446,7 +446,7 @@ class GpuIndexing:
                 case 2:
                     return (16, 16, 1)
                 case 3:
-                    return (8, 8, 8)
+                    return (8, 8, 4)
                 case _:
                     assert False, "unreachable code"
         else:
-- 
GitLab


From 2624d819df3f82f42b6f196f5671ef8b95227447 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Tue, 11 Mar 2025 19:32:53 +0100
Subject: [PATCH 34/54] Introduce assume_warp_aligned_block_size &
 use_block_size_fitting options

---
 src/pystencils/codegen/config.py       |  17 +++
 src/pystencils/codegen/driver.py       |   4 +
 src/pystencils/codegen/gpu_indexing.py | 169 +++++++++++++++++--------
 tests/kernelcreation/test_gpu.py       |  59 +++++----
 4 files changed, 173 insertions(+), 76 deletions(-)

diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py
index cbc1b0bdb..37263e058 100644
--- a/src/pystencils/codegen/config.py
+++ b/src/pystencils/codegen/config.py
@@ -394,6 +394,23 @@ class GpuOptions(ConfigBase):
     If this option is not set the default value for the given target will be automatically used.
     """
 
+    assume_warp_aligned_block_size: BasicOption[bool] = BasicOption(False)
+    """Specifies whether block sizes are divisible by the hardware's warp size.
+    
+    If set to `True`, the code generator can employ optimizations that require this assumption, 
+    e.g. warp-level reductions.
+    The generator also checks if user-provided block sizes fulfill this criterion.
+    """
+
+    use_block_size_fitting: BasicOption[bool] = BasicOption(False)
+    """Experimental. Specifies if a fitting algorithm should be used at runtime to 
+    compute optimal block sizes that align with the warp size, 
+    i.e. their product is divisible by the warp size.
+    
+    This option only takes effect for the `DynamicBlockSizeLaunchConfiguration`, 
+    i.e. if `Linear3D <GpuIndexingScheme.Linear3D>` is chosen as an indexing scheme.
+    """
+
     @staticmethod
     def default_warp_size(target: Target):
         match target:
diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index c70bc6c68..d47b889c7 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -407,6 +407,8 @@ class DefaultKernelCreationDriver:
         idx_scheme: GpuIndexingScheme = self._cfg.gpu.get_option("indexing_scheme")
         block_size: dim3 | _AUTO_TYPE = self._cfg.gpu.get_option("block_size")
         manual_launch_grid: bool = self._cfg.gpu.get_option("manual_launch_grid")
+        use_block_size_fitting: bool = self._cfg.gpu.get_option("use_block_size_fitting")
+        assume_warp_aligned_block_size: bool = self._cfg.gpu.get_option("assume_warp_aligned_block_size")
         warp_size: int | None = self._cfg.gpu.get_option("warp_size")
 
         if warp_size is None:
@@ -419,6 +421,8 @@ class DefaultKernelCreationDriver:
             warp_size,
             block_size,
             manual_launch_grid,
+            use_block_size_fitting,
+            assume_warp_aligned_block_size,
         )
 
     def _get_platform(self) -> Platform:
diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 018b62c58..204737981 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -65,51 +65,77 @@ class GpuLaunchConfiguration(ABC):
         cache is invalidated."""
 
     @staticmethod
+    def block_size_exceeds_hw_limits(
+            block_size: tuple[int, ...],
+            hw_props: HardwareProperties) -> bool:
+        """Checks if provided block size conforms limits given by the hardware."""
+
+        return any(
+            size > max_size
+            for size, max_size in zip(block_size, hw_props.max_block_sizes)
+        ) or prod(block_size) > hw_props.max_threads_per_block
+
+    @staticmethod
+    def _gen_error_msg(block_size: tuple[int, ...]):
+        return f"Unable to determine GPU block size for this kernel. \
+        Final block size was too large: {block_size}."
+
+    @staticmethod
+    def _round_block_sizes_to_warp_size(to_round: list[int], warp_size: int) -> tuple[int, ...]:
+        # check if already aligns with warp size
+        if prod(to_round) % warp_size == 0:
+            return tuple(to_round)
+
+        # find index of element closest to warp size and round up
+        index_to_round = to_round.index(
+            max(to_round, key=lambda i: abs(i % warp_size))
+        )
+        if index_to_round + 1 < len(to_round):
+            return (
+                *to_round[:index_to_round],
+                ceil_to_multiple(to_round[index_to_round], warp_size),
+                *to_round[index_to_round + 1:],
+            )
+        else:
+            return (
+                *to_round[:index_to_round],
+                ceil_to_multiple(to_round[index_to_round], warp_size),
+            )
+
+    @classmethod
+    def trim_block_size(
+        cls,
+        it_space: tuple[int, ...],
+        block_size: tuple[int, ...],
+        hw_props: HardwareProperties,
+    ) -> tuple[int, ...]:
+        """Returns specified block sizes trimmed with iteration space.
+        Raises `CodegenError` if trimmed block size does not conform hardware limits.
+        """
+
+        ret = tuple([min(b, i) for b, i in zip(block_size, it_space)])
+        if cls.block_size_exceeds_hw_limits(ret, hw_props):
+            raise CodegenError(cls._gen_error_msg(ret))
+        return ret
+
+    @classmethod
     def fit_block_size(
+        cls,
         it_space: tuple[int, ...],
         block_size: tuple[int, int, int],
         hw_props: HardwareProperties,
     ) -> tuple[int, ...]:
-        """Returns an optimized block size configuration with block sizes being aligned with the warp size."""
+        """Returns an optimized block size configuration with block sizes being aligned with the warp size.
+        Raises `CodegenError` if optimal block size could not be found or does not conform hardware limits.
+        """
 
         def trim(to_trim: list[int]) -> list[int]:
             return [min(b, i) for b, i in zip(to_trim, it_space)]
 
-        def round_block_sizes(to_round: list[int]) -> tuple[int, ...]:
-            # check if already aligns with warp size
-            if prod(to_round) % hw_props.warp_size == 0:
-                return tuple(to_round)
-
-            # find index of element closest to warp size and round up
-            index_to_round = to_round.index(
-                max(to_round, key=lambda i: abs(i % hw_props.warp_size))
-            )
-            if index_to_round + 1 < len(to_round):
-                return (
-                    *to_round[:index_to_round],
-                    ceil_to_multiple(to_round[index_to_round], hw_props.warp_size),
-                    *to_round[index_to_round + 1:],
-                )
-            else:
-                return (
-                    *to_round[:index_to_round],
-                    ceil_to_multiple(to_round[index_to_round], hw_props.warp_size),
-                )
-
-        def check_sizes_and_return(to_return: tuple[int, ...]) -> tuple[int, ...]:
-            if (
-                any(
-                    size > max_size
-                    for size, max_size in zip(to_return, hw_props.max_block_sizes)
-                )
-                or prod(to_return) > hw_props.max_threads_per_block
-            ):
-                raise CodegenError(
-                    f"Unable to determine GPU block size for this kernel for initial block size config "
-                    f"{block_size} and iteration space {it_space}. "
-                    f"Final block size was too large: {to_return}."
-                )
-            return to_return
+        def check_sizes_and_return(ret: tuple[int, ...]) -> tuple[int, ...]:
+            if cls.block_size_exceeds_hw_limits(ret, hw_props):
+                raise CodegenError(cls._gen_error_msg(ret))
+            return ret
 
         trimmed = trim(list(block_size))
         if (
@@ -126,7 +152,7 @@ class GpuLaunchConfiguration(ABC):
 
             # case 2: trimmed block is equivalent to the whole iteration space
             if all(b == i for b, i in zip(trimmed, it_space)):
-                return check_sizes_and_return(round_block_sizes(trimmed))
+                return check_sizes_and_return(cls._round_block_sizes_to_warp_size(trimmed, hw_props.warp_size))
             else:
                 # double block size in each dimension until block is large enough (or case 2 triggers)
                 for d in resize_order:
@@ -149,7 +175,7 @@ class GpuLaunchConfiguration(ABC):
 
                     # case 3: trim block is large enough
                     if prod(trimmed) >= hw_props.warp_size:
-                        return check_sizes_and_return(round_block_sizes(trimmed))
+                        return check_sizes_and_return(cls._round_block_sizes_to_warp_size(trimmed, hw_props.warp_size))
 
         raise CodegenError("Unable to determine GPU block size for this kernel.")
 
@@ -165,10 +191,12 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
         block_size: _Dim3Lambda,
         grid_size: _Dim3Lambda,
         hw_props: HardwareProperties,
+        assume_warp_aligned_block_size: bool,
     ) -> None:
         self._block_size = block_size
         self._grid_size = grid_size
         self._hw_props = hw_props
+        self._assume_warp_aligned_block_size = assume_warp_aligned_block_size
 
         self._params: frozenset[Parameter] = frozenset().union(
             *(lb.parameters for lb in chain(block_size, grid_size))
@@ -183,10 +211,14 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
         # -> round block size in fastest moving dimension up to multiple of warp size
         block_size = tuple(int(bs(**kwargs)) for bs in self._block_size)
         block_size = (
-            ceil_to_multiple(block_size[0], self._hw_props.warp_size),
+            ceil_to_multiple(block_size[0], self._hw_props.warp_size)
+            if self._assume_warp_aligned_block_size else block_size[0],
             *block_size[1:],
         )
 
+        if self.block_size_exceeds_hw_limits(block_size, self._hw_props):
+            raise CodegenError(f"Block size {block_size} exceeds hardware limits.")
+
         grid_size = tuple(int(gs(**kwargs)) for gs in self._grid_size)
         return cast(dim3, block_size), cast(dim3, grid_size)
 
@@ -202,7 +234,13 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration):
 
     def __init__(
         self,
+        hw_props: HardwareProperties,
+        assume_warp_aligned_block_size: bool = False
     ) -> None:
+        self._assume_warp_aligned_block_size = assume_warp_aligned_block_size
+
+        self._hw_props = hw_props
+
         self._block_size: dim3 | None = None
         self._grid_size: dim3 | None = None
 
@@ -233,6 +271,10 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration):
         if self._grid_size is None:
             raise AttributeError("No GPU grid size was set by the user.")
 
+        if self._assume_warp_aligned_block_size and prod(self._block_size) % self._hw_props.warp_size != 0:
+            raise CodegenError("Specified block sizes must align with warp size with "
+                               "`assume_warp_aligned_block_size` enabled.")
+
         return self._block_size, self._grid_size
 
     def jit_cache_key(self) -> Any:
@@ -255,12 +297,17 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
         self,
         num_work_items: _Dim3Lambda,
         hw_props: HardwareProperties,
+        use_block_size_fitting: bool,
+        assume_warp_aligned_block_size: bool,
         default_block_size: dim3 | None = None,
     ) -> None:
         self._num_work_items = num_work_items
 
         self._hw_props = hw_props
 
+        self._use_block_size_fitting = use_block_size_fitting
+        self._assume_warp_aligned_block_size = assume_warp_aligned_block_size
+
         self._block_size: dim3 | None = default_block_size
 
         self._params: frozenset[Parameter] = frozenset().union(
@@ -296,23 +343,33 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
         num_work_items = cast(
             dim3, tuple(int(wit(**kwargs)) for wit in self._num_work_items)
         )
+
+        computed_block_size: tuple[int, ...]
         try:
-            reduced_block_size = cast(
-                dim3,
-                self.fit_block_size(num_work_items, self._block_size, self._hw_props),
-            )
-        except CodegenError:
-            warn(f"Block size fitting could not determine optimal block size configuration. "
+            if self._use_block_size_fitting:
+                computed_block_size = self.fit_block_size(num_work_items, self._block_size, self._hw_props)
+            else:
+                computed_block_size = self.trim_block_size(num_work_items, self._block_size, self._hw_props)
+
+            # check if assumption for warp size alignment is met
+            if self._assume_warp_aligned_block_size and prod(computed_block_size) % self._hw_props.warp_size != 0:
+                raise CodegenError("Adapted block size is not divisible by warp size.")
+        except CodegenError as e:
+            warn(f"CodeGenError occurred: {getattr(e, 'message', repr(e))}. "
+                 f"Block size fitting could not determine optimal block size configuration. "
                  f"Defaulting back to {self._block_size}")
-            reduced_block_size = self._block_size
+            computed_block_size = self._block_size
+
+        adapted_block_size = cast(dim3, computed_block_size)
+
         grid_size = cast(
             dim3,
             tuple(
-                div_ceil(wit, bs) for wit, bs in zip(num_work_items, reduced_block_size)
+                div_ceil(wit, bs) for wit, bs in zip(num_work_items, adapted_block_size)
             ),
         )
 
-        return reduced_block_size, grid_size
+        return adapted_block_size, grid_size
 
     def jit_cache_key(self) -> Any:
         return self._block_size
@@ -342,6 +399,8 @@ class GpuIndexing:
         warp_size: int,
         default_block_size: dim3 | _AUTO_TYPE | None = None,
         manual_launch_grid: bool = False,
+        use_block_size_fitting: bool = False,
+        assume_warp_aligned_block_size: bool = False,
     ) -> None:
         self._ctx = ctx
         self._target = target
@@ -349,6 +408,8 @@ class GpuIndexing:
         self._warp_size = warp_size
         self._default_block_size = default_block_size
         self._manual_launch_grid = manual_launch_grid
+        self._use_block_size_fitting = use_block_size_fitting
+        self._assume_warp_aligned_block_size = assume_warp_aligned_block_size
 
         self._hw_props = HardwareProperties(
             warp_size,
@@ -396,7 +457,10 @@ class GpuIndexing:
     def get_launch_config_factory(self) -> Callable[[], GpuLaunchConfiguration]:
         """Retrieve a factory for the launch configuration for later consumption by the runtime system"""
         if self._manual_launch_grid:
-            return ManualLaunchConfiguration
+            def factory():
+                return ManualLaunchConfiguration(self._hw_props, self._assume_warp_aligned_block_size)
+
+            return factory
 
         match self._scheme:
             case GpuIndexingScheme.Linear3D:
@@ -430,6 +494,8 @@ class GpuIndexing:
             return DynamicBlockSizeLaunchConfiguration(
                 num_work_items,
                 self._hw_props,
+                self._use_block_size_fitting,
+                self._assume_warp_aligned_block_size,
                 self._get_default_block_size(rank),
             )
 
@@ -476,7 +542,10 @@ class GpuIndexing:
 
         def factory():
             return AutomaticLaunchConfiguration(
-                block_size, cast(_Dim3Lambda, grid_size), self._hw_props
+                block_size,
+                cast(_Dim3Lambda, grid_size),
+                self._hw_props,
+                self._assume_warp_aligned_block_size
             )
 
         return factory
diff --git a/tests/kernelcreation/test_gpu.py b/tests/kernelcreation/test_gpu.py
index 8dbeadb7f..6cbc8f813 100644
--- a/tests/kernelcreation/test_gpu.py
+++ b/tests/kernelcreation/test_gpu.py
@@ -75,12 +75,16 @@ def test_indexing_options_3d(indexing_scheme: str, manual_grid: bool):
 
 
 @pytest.mark.parametrize("indexing_scheme", ["linear3d", "blockwise4d"])
+@pytest.mark.parametrize("use_block_size_fitting", [True, False])
+@pytest.mark.parametrize("assume_warp_aligned_block_size", [True, False])
 @pytest.mark.parametrize("iteration_space",
                          [(8, 4, 4), (3, 8, 8), (3, 3, 16), (17, 3, 3), (3, 12, 56), (65, 65, 65), (3, 7, 9)])
 @pytest.mark.parametrize("initial_block_size",
                          [(8, 4, 4), (3, 8, 8), (3, 3, 16), (2, 2, 64), (8, 2, 1), (3, 1, 32), (32, 1, 1), (1, 2, 3)])
 def test_indexing_launch_config(
         indexing_scheme: str,
+        use_block_size_fitting: bool,
+        assume_warp_aligned_block_size: bool,
         iteration_space: tuple[int, int, int],
         initial_block_size: tuple[int, int, int]
 ):
@@ -99,36 +103,39 @@ def test_indexing_launch_config(
     target = Target.CUDA
     cfg = CreateKernelConfig(target=target)
     cfg.gpu.indexing_scheme = indexing_scheme
+    cfg.gpu.use_block_size_fitting = use_block_size_fitting
+    cfg.gpu.assume_warp_aligned_block_size = assume_warp_aligned_block_size
 
     ast = create_kernel(asm, cfg)
     kernel = ast.compile()
 
-    warp_size = cfg.gpu.default_warp_size(target)
-    max_threads_per_block = GpuIndexing.get_max_threads_per_block(target)
-    max_block_sizes = GpuIndexing.get_max_block_sizes(target)
-
-    block_size = kernel.launch_config.fit_block_size(
-        iteration_space,
-        initial_block_size,
-        HardwareProperties(warp_size, max_threads_per_block, max_block_sizes)
-    )
-
-    def check_suitability(b):
-        return prod(b) >= warp_size and prod(b) % warp_size == 0
-
-    # block size fitting should not modify an already ideal configuration
-    # -> check if ideal configurations are modified
-    if (
-            check_suitability(initial_block_size)
-        and all(x == y for x, y in zip(initial_block_size, iteration_space))  # trimming may alter results
-    ):
-        assert all(x == y for x, y in zip(initial_block_size, block_size)), \
-            f"Initial block size unnecessarily adapted from {initial_block_size} to {block_size}."
-
-    assert check_suitability(block_size), \
-        "Determined block size shall be divisible by warp size."
-
-    kernel.launch_config.block_size = block_size
+    if use_block_size_fitting:
+        warp_size = cfg.gpu.default_warp_size(target)
+        max_threads_per_block = GpuIndexing.get_max_threads_per_block(target)
+        max_block_sizes = GpuIndexing.get_max_block_sizes(target)
+
+        block_size = kernel.launch_config.fit_block_size(
+            iteration_space,
+            initial_block_size,
+            HardwareProperties(warp_size, max_threads_per_block, max_block_sizes)
+        )
+
+        def check_suitability(b):
+            return prod(b) >= warp_size and prod(b) % warp_size == 0
+
+        # block size fitting should not modify an already ideal configuration
+        # -> check if ideal configurations are modified
+        if (
+                check_suitability(initial_block_size)
+            and all(x == y for x, y in zip(initial_block_size, iteration_space))  # trimming may alter results
+        ):
+            assert all(x == y for x, y in zip(initial_block_size, block_size)), \
+                f"Initial block size unnecessarily adapted from {initial_block_size} to {block_size}."
+
+        assert check_suitability(block_size), \
+            "Determined block size shall be divisible by warp size."
+
+        kernel.launch_config.block_size = block_size
 
     src_arr = cp.ones(iteration_space)
     dst_arr = cp.zeros_like(src_arr)
-- 
GitLab
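
For orientation, the following standalone sketch (hypothetical helper names `launch_dims` and `div_ceil`, not pystencils API) mirrors the `evaluate` logic adapted above: the warp-alignment assumption is checked on the adapted block size, and the launch grid is derived by ceiling division of the work items.

```python
from math import prod


def div_ceil(a: int, b: int) -> int:
    # ceiling division without floats, as used for the grid size
    return -(-a // b)


def launch_dims(num_work_items, block_size, warp_size=32, warp_aligned=True):
    # mirrors the check in evaluate(): an adapted block size whose thread count
    # is not a multiple of the warp size violates `assume_warp_aligned_block_size`
    if warp_aligned and prod(block_size) % warp_size != 0:
        raise ValueError("Adapted block size is not divisible by warp size.")
    grid = tuple(div_ceil(w, b) for w, b in zip(num_work_items, block_size))
    return block_size, grid


print(launch_dims((65, 65, 65), (8, 8, 4)))  # ((8, 8, 4), (9, 9, 17))
```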


From c427c469e29784b98972979c2d75d6f2c50232ea Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Tue, 11 Mar 2025 19:35:08 +0100
Subject: [PATCH 35/54] Format gpu_indexing.py

---
 src/pystencils/codegen/gpu_indexing.py | 88 +++++++++++++++++---------
 1 file changed, 57 insertions(+), 31 deletions(-)

diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 204737981..edddce10e 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -66,14 +66,17 @@ class GpuLaunchConfiguration(ABC):
 
     @staticmethod
     def block_size_exceeds_hw_limits(
-            block_size: tuple[int, ...],
-            hw_props: HardwareProperties) -> bool:
+        block_size: tuple[int, ...], hw_props: HardwareProperties
+    ) -> bool:
         """Checks if provided block size conforms limits given by the hardware."""
 
-        return any(
-            size > max_size
-            for size, max_size in zip(block_size, hw_props.max_block_sizes)
-        ) or prod(block_size) > hw_props.max_threads_per_block
+        return (
+            any(
+                size > max_size
+                for size, max_size in zip(block_size, hw_props.max_block_sizes)
+            )
+            or prod(block_size) > hw_props.max_threads_per_block
+        )
 
     @staticmethod
     def _gen_error_msg(block_size: tuple[int, ...]):
@@ -81,15 +84,15 @@ class GpuLaunchConfiguration(ABC):
         Final block size was too large: {block_size}."
 
     @staticmethod
-    def _round_block_sizes_to_warp_size(to_round: list[int], warp_size: int) -> tuple[int, ...]:
+    def _round_block_sizes_to_warp_size(
+        to_round: list[int], warp_size: int
+    ) -> tuple[int, ...]:
         # check if already aligns with warp size
         if prod(to_round) % warp_size == 0:
             return tuple(to_round)
 
         # find index of element closest to warp size and round up
-        index_to_round = to_round.index(
-            max(to_round, key=lambda i: abs(i % warp_size))
-        )
+        index_to_round = to_round.index(max(to_round, key=lambda i: abs(i % warp_size)))
         if index_to_round + 1 < len(to_round):
             return (
                 *to_round[:index_to_round],
@@ -152,7 +155,9 @@ class GpuLaunchConfiguration(ABC):
 
             # case 2: trimmed block is equivalent to the whole iteration space
             if all(b == i for b, i in zip(trimmed, it_space)):
-                return check_sizes_and_return(cls._round_block_sizes_to_warp_size(trimmed, hw_props.warp_size))
+                return check_sizes_and_return(
+                    cls._round_block_sizes_to_warp_size(trimmed, hw_props.warp_size)
+                )
             else:
                 # double block size in each dimension until block is large enough (or case 2 triggers)
                 for d in resize_order:
@@ -175,7 +180,11 @@ class GpuLaunchConfiguration(ABC):
 
                     # case 3: trim block is large enough
                     if prod(trimmed) >= hw_props.warp_size:
-                        return check_sizes_and_return(cls._round_block_sizes_to_warp_size(trimmed, hw_props.warp_size))
+                        return check_sizes_and_return(
+                            cls._round_block_sizes_to_warp_size(
+                                trimmed, hw_props.warp_size
+                            )
+                        )
 
         raise CodegenError("Unable to determine GPU block size for this kernel.")
 
@@ -211,8 +220,11 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
         # -> round block size in fastest moving dimension up to multiple of warp size
         block_size = tuple(int(bs(**kwargs)) for bs in self._block_size)
         block_size = (
-            ceil_to_multiple(block_size[0], self._hw_props.warp_size)
-            if self._assume_warp_aligned_block_size else block_size[0],
+            (
+                ceil_to_multiple(block_size[0], self._hw_props.warp_size)
+                if self._assume_warp_aligned_block_size
+                else block_size[0]
+            ),
             *block_size[1:],
         )
 
@@ -233,9 +245,7 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration):
     """
 
     def __init__(
-        self,
-        hw_props: HardwareProperties,
-        assume_warp_aligned_block_size: bool = False
+        self, hw_props: HardwareProperties, assume_warp_aligned_block_size: bool = False
     ) -> None:
         self._assume_warp_aligned_block_size = assume_warp_aligned_block_size
 
@@ -271,9 +281,14 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration):
         if self._grid_size is None:
             raise AttributeError("No GPU grid size was set by the user.")
 
-        if self._assume_warp_aligned_block_size and prod(self._block_size) % self._hw_props.warp_size != 0:
-            raise CodegenError("Specified block sizes must align with warp size with "
-                               "`assume_warp_aligned_block_size` enabled.")
+        if (
+            self._assume_warp_aligned_block_size
+            and prod(self._block_size) % self._hw_props.warp_size != 0
+        ):
+            raise CodegenError(
+                "Specified block sizes must align with warp size with "
+                "`assume_warp_aligned_block_size` enabled."
+            )
 
         return self._block_size, self._grid_size
 
@@ -347,17 +362,26 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
         computed_block_size: tuple[int, ...]
         try:
             if self._use_block_size_fitting:
-                computed_block_size = self.fit_block_size(num_work_items, self._block_size, self._hw_props)
+                computed_block_size = self.fit_block_size(
+                    num_work_items, self._block_size, self._hw_props
+                )
             else:
-                computed_block_size = self.trim_block_size(num_work_items, self._block_size, self._hw_props)
+                computed_block_size = self.trim_block_size(
+                    num_work_items, self._block_size, self._hw_props
+                )
 
             # check if assumption for warp size alignment is met
-            if self._assume_warp_aligned_block_size and prod(computed_block_size) % self._hw_props.warp_size != 0:
+            if (
+                self._assume_warp_aligned_block_size
+                and prod(computed_block_size) % self._hw_props.warp_size != 0
+            ):
                 raise CodegenError("Adapted block size is not divisible by warp size.")
         except CodegenError as e:
-            warn(f"CodeGenError occurred: {getattr(e, 'message', repr(e))}. "
-                 f"Block size fitting could not determine optimal block size configuration. "
-                 f"Defaulting back to {self._block_size}")
+            warn(
+                f"CodeGenError occurred: {getattr(e, 'message', repr(e))}. "
+                f"Block size fitting could not determine optimal block size configuration. "
+                f"Defaulting back to {self._block_size}"
+            )
             computed_block_size = self._block_size
 
         adapted_block_size = cast(dim3, computed_block_size)
@@ -457,8 +481,11 @@ class GpuIndexing:
     def get_launch_config_factory(self) -> Callable[[], GpuLaunchConfiguration]:
         """Retrieve a factory for the launch configuration for later consumption by the runtime system"""
         if self._manual_launch_grid:
+
             def factory():
-                return ManualLaunchConfiguration(self._hw_props, self._assume_warp_aligned_block_size)
+                return ManualLaunchConfiguration(
+                    self._hw_props, self._assume_warp_aligned_block_size
+                )
 
             return factory
 
@@ -481,10 +508,9 @@ class GpuIndexing:
             )
 
         work_items_expr += tuple(
-            self._ast_factory.parse_index(1)
-            for _ in range(3 - rank)
+            self._ast_factory.parse_index(1) for _ in range(3 - rank)
         )
-        
+
         num_work_items = cast(
             _Dim3Lambda,
             tuple(self._kernel_factory.create_lambda(wit) for wit in work_items_expr),
@@ -545,7 +571,7 @@ class GpuIndexing:
                 block_size,
                 cast(_Dim3Lambda, grid_size),
                 self._hw_props,
-                self._assume_warp_aligned_block_size
+                self._assume_warp_aligned_block_size,
             )
 
         return factory
-- 
GitLab
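
As a quick illustration of the hardware-limit check reformatted in this patch, here is a minimal sketch; the limits are assumed CUDA-typical values, whereas in pystencils they come from `HardwareProperties`.

```python
from math import prod

# assumed CUDA-like limits; in pystencils these are provided by HardwareProperties
MAX_BLOCK_SIZES = (1024, 1024, 64)
MAX_THREADS_PER_BLOCK = 1024


def exceeds_hw_limits(block_size):
    # a block is invalid if any single dimension exceeds its limit
    # or the total thread count exceeds the per-block maximum
    return (
        any(s > m for s, m in zip(block_size, MAX_BLOCK_SIZES))
        or prod(block_size) > MAX_THREADS_PER_BLOCK
    )


print(exceeds_hw_limits((32, 32, 1)))  # False: 1024 threads, within limits
print(exceeds_hw_limits((32, 32, 2)))  # True: 2048 threads per block
```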


From 81a3e2abd4e69b000d00c15856d557c450afa90e Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 12 Mar 2025 11:55:43 +0100
Subject: [PATCH 36/54] Fix docstrings

---
 src/pystencils/codegen/gpu_indexing.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index edddce10e..f33cd7836 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -113,7 +113,7 @@ class GpuLaunchConfiguration(ABC):
         hw_props: HardwareProperties,
     ) -> tuple[int, ...]:
         """Returns specified block sizes trimmed with iteration space.
-        Raises `CodegenError` if trimmed block size does not conform hardware limits.
+        Raises CodegenError if trimmed block size does not conform hardware limits.
         """
 
         ret = tuple([min(b, i) for b, i in zip(block_size, it_space)])
@@ -129,7 +129,7 @@ class GpuLaunchConfiguration(ABC):
         hw_props: HardwareProperties,
     ) -> tuple[int, ...]:
         """Returns an optimized block size configuration with block sizes being aligned with the warp size.
-        Raises `CodegenError` if optimal block size could not be found or does not conform hardware limits.
+        Raises CodegenError if optimal block size could not be found or does not conform hardware limits.
         """
 
         def trim(to_trim: list[int]) -> list[int]:
-- 
GitLab


From 166656313f141f1ed943b2c885c3c933e0cffdc7 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 12 Mar 2025 12:44:36 +0100
Subject: [PATCH 37/54] Adapt docu for recent changes

---
 docs/source/backend/gpu_codegen.md     |  9 +++++----
 docs/source/user_manual/gpu_kernels.md | 16 +++++++++-------
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/docs/source/backend/gpu_codegen.md b/docs/source/backend/gpu_codegen.md
index 70b9db75b..6cc00a6e2 100644
--- a/docs/source/backend/gpu_codegen.md
+++ b/docs/source/backend/gpu_codegen.md
@@ -74,10 +74,11 @@ parameters,
 the `ManualLaunchConfiguration` requires the user to manually specify both grid and block size.
 The `DynamicBlockSizeLaunchConfiguration` permits the user to set a block size which is used to then dynamically compute
 the grid size. However, the actual block size being used may differ from the user specification. This may be due to
-trimming operations between the original block size and the iteration space to avoid spawning unnecessarily large blocks.
-Another factor affecting the actual block size is the need to have block sizes that match with the warp size given by the hardware 
-for improved performance. Here, the block size is rounded to a multiple of the warp size while considering the
-maximum amount of threads per block.
+trimming of the original block size against the iteration space to avoid spawning unnecessarily large 
+blocks. If the `GpuOptions.use_block_size_fitting` option is set, a block-fitting algorithm adapts the original 
+block size such that it aligns with the hardware's warp size for improved performance. The algorithm in 
+`GpuLaunchConfiguration.fit_block_size` incrementally enlarges the trimmed block size until its thread count is a 
+multiple of the warp size, while respecting the maximum number of threads per block.
 
 The `evaluate` method can only be used from within a Python runtime environment.
 When exporting pystencils CUDA kernels for external use in C++ projects,
diff --git a/docs/source/user_manual/gpu_kernels.md b/docs/source/user_manual/gpu_kernels.md
index 8e102c642..2443108c1 100644
--- a/docs/source/user_manual/gpu_kernels.md
+++ b/docs/source/user_manual/gpu_kernels.md
@@ -115,11 +115,7 @@ If the `Linear3D` indexing scheme is used, you may modify the GPU thread block s
 The default block size for the kernel can be set via the {any}`gpu.block_size <GpuOptions.block_size>` 
 code generator option;
 if none is specified, a default depending on the iteration space's dimensionality will be used.
-
-For performance-related reasons, the block size shall coincide with multiples of the hardware's warp size.
-We enforce this by applying the fitting algorithm found in `GpuLaunchConfiguration.fit_block_size` 
-to alter the initial block size configuration.
-This modification only occurs if the initial configuration is not optimal.
+Please note that this **initial block size configuration may be altered** by the code generator.
 
 The block size can furthermore be modified at the compiled kernel's wrapper object via the
 `launch_config.block_size` attribute:
@@ -131,8 +127,14 @@ kfunc.launch_config.block_size = (256, 2, 1)
 # Run the kernel
 kfunc(f=f_arr, g=g_arr)
 ```
-Please note that when modifying the block size directly, it must align with the warp size,
-since this way of modification trespasses our fitting algorithm.
+
+Block sizes that are multiples of the hardware's warp size allow for better utilization of the GPU's resources.
+While such block size configurations are not enforced, notifying the code generator via the 
+`GpuOptions.assume_warp_aligned_block_size` option that the configured block size is divisible by the warp size 
+enables further optimizations, e.g. warp-level reductions.
+When setting this option to `True`, the user must ensure that this alignment actually holds. This can be achieved by
+manually providing suitable block sizes via the `launch_config.block_size` attribute or by enabling the
+`GpuOptions.use_block_size_fitting` option to automatically adapt the initial block size accordingly.
 
 In any case, pystencils will automatically compute the grid size from the shapes of the kernel's array arguments
 and the given thread block size.
-- 
GitLab
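
Tying the two documentation changes above together, a hedged usage sketch (option names as they exist at this point in the series; patch 40 later replaces `use_block_size_fitting` with launch-configuration methods):

```python
import pystencils as ps

src, dst = ps.fields("src, dst: [3D]")
asm = ps.Assignment(dst.center(), src.center())

cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
cfg.gpu.indexing_scheme = "linear3d"
# promise the code generator a warp-aligned block size ...
cfg.gpu.assume_warp_aligned_block_size = True
# ... and let the fitting algorithm uphold that promise at runtime
cfg.gpu.use_block_size_fitting = True

kernel = ps.create_kernel(asm, cfg).compile()
# any manual override must then stay warp-aligned, e.g. 32 * 8 * 2 = 512 threads
kernel.launch_config.block_size = (32, 8, 2)
```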


From 3bfa575a7ee21bc7bb9b9ed76cab9aa36acf125f Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 12 Mar 2025 15:04:26 +0100
Subject: [PATCH 38/54] Adapt docstring for DynamicBlockSizeLaunchConfiguration

---
 src/pystencils/codegen/gpu_indexing.py | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index f33cd7836..5b9d6c9c9 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -299,13 +299,29 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration):
 class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
     """GPU launch configuration that permits the user to set a block size and dynamically computes the grid size.
 
-    In case the user-specified block size does not coincide with multiples of the warp size of the GPU,
-    it will be altered by a block size fitting algorithm.
+    The actual launch grid size is computed from the user-defined ``user_block_size``, the number of work items
+    in the kernel's iteration space and potentially the `GpuOptions.warp_size` as follows.
 
-    The actual launch grid size is then computed as follows.
     For each dimension :math:`c \\in \\{ x, y, z \\}`,
 
-    ``block_size.c = fit_block_size(user_block_size.c)`` and ``grid_size.c = ceil(num_work_items.c / block_size.c)``.
+    - if `GpuOptions.use_block_size_fitting == True`:
+        In case the user-specified block size does not coincide with multiples of the warp size of the GPU,
+        it will be altered by a block size fitting algorithm, i.e.
+
+        ``block_size.c = fit_block_size(user_block_size.c)``
+
+        The fitted block size also satisfies the alignment assumption made via `GpuOptions.assume_warp_aligned_block_size`.
+
+    - otherwise:
+        In this case, the user-specified block size is trimmed against the number of work items, i.e.
+
+        - if ``user_block_size.c > num_work_items.c``, ``block_size.c = num_work_items.c``
+        - otherwise, ``block_size.c = user_block_size.c``
+
+
+    The actual launch grid size is then computed as follows.
+
+    ``grid_size.c = ceil(num_work_items.c / block_size.c)``.
     """
 
     def __init__(
-- 
GitLab
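
The trimming branch and the grid size formula of the docstring above, written out as plain Python (no pystencils types involved; the numbers are an arbitrary example):

```python
from math import ceil

num_work_items = (65, 65, 65)    # work items per dimension
user_block_size = (128, 2, 1)    # user-specified block size

# block_size.c = min(user_block_size.c, num_work_items.c)
block_size = tuple(min(b, n) for b, n in zip(user_block_size, num_work_items))
# grid_size.c = ceil(num_work_items.c / block_size.c)
grid_size = tuple(ceil(n / b) for n, b in zip(num_work_items, block_size))

print(block_size)  # (65, 2, 1)
print(grid_size)   # (1, 33, 65)
```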


From 7cafe934d8de5032c34dcaabc4b4e3e2957b79e4 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 12 Mar 2025 16:12:12 +0100
Subject: [PATCH 39/54] Fix docstring

---
 src/pystencils/codegen/gpu_indexing.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 5b9d6c9c9..2c5e89100 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -304,7 +304,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
 
     For each dimension :math:`c \\in \\{ x, y, z \\}`,
 
-    - if `GpuOptions.use_block_size_fitting == True`:
+    - if `GpuOptions.use_block_size_fitting`:
         In case the user-specified block size does not coincide with multiples of the warp size of the GPU,
         it will be altered by a block size fitting algorithm, i.e.
 
-- 
GitLab


From 9ebeacfeb726fb14be570e40f2fd981cbae15940 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Fri, 14 Mar 2025 14:05:45 +0100
Subject: [PATCH 40/54] Remove block_size and use_block_size_fitting options
 and introduce `trim_block_size` and `fit_block_size` instance functions for
 DynamicBlockSizeLaunchConfigurations

---
 src/pystencils/codegen/config.py       |  17 --
 src/pystencils/codegen/driver.py       |   4 -
 src/pystencils/codegen/gpu_indexing.py | 376 +++++++++++++------------
 src/pystencils/jit/gpu_cupy.py         |   5 +-
 tests/kernelcreation/test_gpu.py       | 105 ++++---
 5 files changed, 275 insertions(+), 232 deletions(-)

diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py
index 37263e058..56d1f234c 100644
--- a/src/pystencils/codegen/config.py
+++ b/src/pystencils/codegen/config.py
@@ -372,14 +372,6 @@ class GpuOptions(ConfigBase):
     indexing_scheme: Option[GpuIndexingScheme, str] = Option(GpuIndexingScheme.Linear3D)
     """Thread indexing scheme for dense GPU kernels."""
 
-    block_size: BasicOption[tuple[int, int, int] | _AUTO_TYPE] = BasicOption(AUTO)
-    """Desired block size for the execution of GPU kernels.
-    
-    This option only takes effect if `Linear3D <GpuIndexingScheme.Linear3D>`
-    is chosen as an indexing scheme.
-    The block size may be overridden at runtime.
-    """
-
     manual_launch_grid: BasicOption[bool] = BasicOption(False)
     """Always require a manually specified launch grid when running this kernel.
     
@@ -402,15 +394,6 @@ class GpuOptions(ConfigBase):
     The generator also checks if user-provided block sizes fulfill this criterion.
     """
 
-    use_block_size_fitting: BasicOption[bool] = BasicOption(False)
-    """Experimental. Specifies if a fitting algorithm should be used at runtime to 
-    compute optimal block sizes that align with the warp size, 
-    i.e. their product is divisible by the warp size.
-    
-    This option only takes effect for `DynamicBlockSizeLaunchConfiguration`'s, 
-    i.e. if `Linear3D <GpuIndexingScheme.Linear3D>` is chosen as an indexing scheme.
-    """
-
     @staticmethod
     def default_warp_size(target: Target):
         match target:
diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index d47b889c7..bd5fc8b0b 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -405,9 +405,7 @@ class DefaultKernelCreationDriver:
         from .gpu_indexing import dim3
 
         idx_scheme: GpuIndexingScheme = self._cfg.gpu.get_option("indexing_scheme")
-        block_size: dim3 | _AUTO_TYPE = self._cfg.gpu.get_option("block_size")
         manual_launch_grid: bool = self._cfg.gpu.get_option("manual_launch_grid")
-        use_block_size_fitting: bool = self._cfg.gpu.get_option("use_block_size_fitting")
         assume_warp_aligned_block_size: bool = self._cfg.gpu.get_option("assume_warp_aligned_block_size")
         warp_size: int | None = self._cfg.gpu.get_option("warp_size")
 
@@ -419,9 +417,7 @@ class DefaultKernelCreationDriver:
             self._target,
             idx_scheme,
             warp_size,
-            block_size,
             manual_launch_grid,
-            use_block_size_fitting,
             assume_warp_aligned_block_size,
         )
 
diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 2c5e89100..c25aa3154 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -9,7 +9,7 @@ from warnings import warn
 from .functions import Lambda
 from .parameters import Parameter
 from .errors import CodegenError
-from .config import GpuIndexingScheme, _AUTO_TYPE
+from .config import GpuIndexingScheme
 from .target import Target
 
 from ..backend.kernelcreation import (
@@ -45,6 +45,18 @@ class GpuLaunchConfiguration(ABC):
             parameters to the associated kernel
     """
 
+    @property
+    @abstractmethod
+    def block_size(self) -> dim3 | None:
+        """Returns desired block size if available."""
+        pass
+
+    @block_size.setter
+    @abstractmethod
+    def block_size(self, val: dim3):
+        """Sets desired block size if possible."""
+        pass
+
     @property
     @abstractmethod
     def parameters(self) -> frozenset[Parameter]:
@@ -64,9 +76,24 @@ class GpuLaunchConfiguration(ABC):
         this launch configuration, such that when the configuration changes, the JIT parameter
         cache is invalidated."""
 
+    @staticmethod
+    def get_default_block_size(rank: int) -> dim3:
+        """Returns the default block size configuration used by the generator."""
+
+        match rank:
+            case 1:
+                return (256, 1, 1)
+            case 2:
+                return (16, 16, 1)
+            case 3:
+                return (8, 8, 4)
+            case _:
+                assert False, "unreachable code"
+
     @staticmethod
     def block_size_exceeds_hw_limits(
-        block_size: tuple[int, ...], hw_props: HardwareProperties
+            block_size: tuple[int, ...],
+            hw_props: HardwareProperties
     ) -> bool:
         """Checks if provided block size conforms limits given by the hardware."""
 
@@ -79,115 +106,10 @@ class GpuLaunchConfiguration(ABC):
         )
 
     @staticmethod
-    def _gen_error_msg(block_size: tuple[int, ...]):
+    def _excessive_block_size_error_msg(block_size: tuple[int, ...]):
         return f"Unable to determine GPU block size for this kernel. \
         Final block size was too large: {block_size}."
 
-    @staticmethod
-    def _round_block_sizes_to_warp_size(
-        to_round: list[int], warp_size: int
-    ) -> tuple[int, ...]:
-        # check if already aligns with warp size
-        if prod(to_round) % warp_size == 0:
-            return tuple(to_round)
-
-        # find index of element closest to warp size and round up
-        index_to_round = to_round.index(max(to_round, key=lambda i: abs(i % warp_size)))
-        if index_to_round + 1 < len(to_round):
-            return (
-                *to_round[:index_to_round],
-                ceil_to_multiple(to_round[index_to_round], warp_size),
-                *to_round[index_to_round + 1:],
-            )
-        else:
-            return (
-                *to_round[:index_to_round],
-                ceil_to_multiple(to_round[index_to_round], warp_size),
-            )
-
-    @classmethod
-    def trim_block_size(
-        cls,
-        it_space: tuple[int, ...],
-        block_size: tuple[int, ...],
-        hw_props: HardwareProperties,
-    ) -> tuple[int, ...]:
-        """Returns specified block sizes trimmed with iteration space.
-        Raises CodegenError if trimmed block size does not conform hardware limits.
-        """
-
-        ret = tuple([min(b, i) for b, i in zip(block_size, it_space)])
-        if cls.block_size_exceeds_hw_limits(ret, hw_props):
-            raise CodegenError(cls._gen_error_msg(ret))
-        return ret
-
-    @classmethod
-    def fit_block_size(
-        cls,
-        it_space: tuple[int, ...],
-        block_size: tuple[int, int, int],
-        hw_props: HardwareProperties,
-    ) -> tuple[int, ...]:
-        """Returns an optimized block size configuration with block sizes being aligned with the warp size.
-        Raises CodegenError if optimal block size could not be found or does not conform hardware limits.
-        """
-
-        def trim(to_trim: list[int]) -> list[int]:
-            return [min(b, i) for b, i in zip(to_trim, it_space)]
-
-        def check_sizes_and_return(ret: tuple[int, ...]) -> tuple[int, ...]:
-            if cls.block_size_exceeds_hw_limits(ret, hw_props):
-                raise CodegenError(cls._gen_error_msg(ret))
-            return ret
-
-        trimmed = trim(list(block_size))
-        if (
-            prod(trimmed) >= hw_props.warp_size
-            and prod(trimmed) % hw_props.warp_size == 0
-        ):
-            # case 1: greater than min block size -> use trimmed result
-            return check_sizes_and_return(tuple(trimmed))
-
-        prev_trim_size = 0
-        resize_order = [0, 2, 1] if len(it_space) == 3 else range(len(it_space))
-        while prod(trimmed) is not prev_trim_size:
-            prev_trim_size = prod(trimmed)
-
-            # case 2: trimmed block is equivalent to the whole iteration space
-            if all(b == i for b, i in zip(trimmed, it_space)):
-                return check_sizes_and_return(
-                    cls._round_block_sizes_to_warp_size(trimmed, hw_props.warp_size)
-                )
-            else:
-                # double block size in each dimension until block is large enough (or case 2 triggers)
-                for d in resize_order:
-                    trimmed[d] *= 2
-
-                    # trim fastest moving dim to multiples of warp size
-                    if (
-                        d == 0
-                        and trimmed[d] > hw_props.warp_size
-                        and trimmed[d] % hw_props.warp_size != 0
-                    ):
-                        # subtract remainder
-                        trimmed[d] = trimmed[d] - (trimmed[d] % hw_props.warp_size)
-
-                    # check if block sizes are within hardware capabilities
-                    trimmed[d] = min(trimmed[d], hw_props.max_block_sizes[d])
-
-                    # trim again
-                    trimmed = trim(trimmed)
-
-                    # case 3: trim block is large enough
-                    if prod(trimmed) >= hw_props.warp_size:
-                        return check_sizes_and_return(
-                            cls._round_block_sizes_to_warp_size(
-                                trimmed, hw_props.warp_size
-                            )
-                        )
-
-        raise CodegenError("Unable to determine GPU block size for this kernel.")
-
 
 class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
     """Launch configuration that is dynamically computed from kernel parameters.
@@ -211,6 +133,15 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
             *(lb.parameters for lb in chain(block_size, grid_size))
         )
 
+    @property
+    def block_size(self) -> dim3 | None:
+        """Block size is only available when `evaluate` is called."""
+        return None
+
+    @block_size.setter
+    def block_size(self, val: dim3):
+        warn("Fruitless attempt to set `block_size`.")
+
     @property
     def parameters(self) -> frozenset[Parameter]:
         return self._params
@@ -290,6 +221,9 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration):
                 "`assume_warp_aligned_block_size` enabled."
             )
 
+        if self.block_size_exceeds_hw_limits(self._block_size, self._hw_props):
+            raise CodegenError(self._excessive_block_size_error_msg(self._block_size))
+
         return self._block_size, self._grid_size
 
     def jit_cache_key(self) -> Any:
@@ -328,18 +262,16 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
         self,
         num_work_items: _Dim3Lambda,
         hw_props: HardwareProperties,
-        use_block_size_fitting: bool,
         assume_warp_aligned_block_size: bool,
-        default_block_size: dim3 | None = None,
     ) -> None:
         self._num_work_items = num_work_items
 
         self._hw_props = hw_props
 
-        self._use_block_size_fitting = use_block_size_fitting
         self._assume_warp_aligned_block_size = assume_warp_aligned_block_size
 
-        self._block_size: dim3 | None = default_block_size
+        self._init_block_size: dim3 | None = None
+        self._compute_block_size: Callable[[tuple[int, ...], tuple[int, ...], HardwareProperties], tuple[int, ...]] | None = None
 
         self._params: frozenset[Parameter] = frozenset().union(
             *(wit.parameters for wit in num_work_items)
@@ -351,68 +283,189 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
         dimension from kernel parameters."""
         return self._num_work_items
 
+    @property
+    def parameters(self) -> frozenset[Parameter]:
+        """Parameters of this launch configuration"""
+        return self._params
+
     @property
     def block_size(self) -> dim3 | None:
-        """The desired GPU block size."""
-        return self._block_size
+        """Block size is only available when `evaluate` is called."""
+        return None
 
     @block_size.setter
     def block_size(self, val: dim3):
-        self._block_size = val
+        warn("Fruitless attempt to set `block_size`.")
 
-    @property
-    def parameters(self) -> frozenset[Parameter]:
-        """Parameters of this launch configuration"""
-        return self._params
+    @staticmethod
+    def _round_block_sizes_to_warp_size(
+            to_round: tuple[int, ...],
+            warp_size: int
+    ) -> tuple[int, ...]:
+        # check if already aligns with warp size
+        if prod(to_round) % warp_size == 0:
+            return tuple(to_round)
 
-    def evaluate(self, **kwargs) -> tuple[dim3, dim3]:
-        if self._block_size is None:
-            raise AttributeError("No GPU block size was specified by the user!")
+        # find index of element closest to warp size and round up
+        index_to_round = to_round.index(max(to_round, key=lambda i: abs(i % warp_size)))
+        if index_to_round + 1 < len(to_round):
+            return (
+                *to_round[:index_to_round],
+                ceil_to_multiple(to_round[index_to_round], warp_size),
+                *to_round[index_to_round + 1:],
+            )
+        else:
+            return (
+                *to_round[:index_to_round],
+                ceil_to_multiple(to_round[index_to_round], warp_size),
+            )
+
+    def trim_block_size(self, block_size: dim3):
+        def call_trimming_factory(
+                it: dim3,
+                bs: dim3,
+                hw: HardwareProperties,
+        ):
+            return self._trim_block_size_to_it_space(it, bs, hw)
 
+        self._init_block_size = block_size
+        self._compute_block_size = call_trimming_factory
+
+    def _trim_block_size_to_it_space(
+            self,
+            it_space: dim3,
+            block_size: dim3,
+            hw_props: HardwareProperties,
+    ) -> tuple[int, ...]:
+        """Returns specified block sizes trimmed with iteration space.
+        Raises CodegenError if trimmed block size does not conform hardware limits.
+        """
+
+        ret = tuple([min(b, i) for b, i in zip(block_size, it_space)])
+        if self.block_size_exceeds_hw_limits(ret, hw_props):
+            raise CodegenError(self._excessive_block_size_error_msg(ret))
+
+        if (
+                self._assume_warp_aligned_block_size
+                and prod(ret) % self._hw_props.warp_size != 0
+        ):
+            ret = self._round_block_sizes_to_warp_size(ret, hw_props.warp_size)
+
+        return ret
+
+    def fit_block_size(self, block_size: dim3):
+        def call_fitting_factory(
+                it: dim3,
+                bs: dim3,
+                hw: HardwareProperties,
+        ):
+            return self._fit_block_size_to_it_space(it, bs, hw)
+
+        self._init_block_size = block_size
+        self._compute_block_size = call_fitting_factory
+
+    def _fit_block_size_to_it_space(
+            self,
+            it_space: dim3,
+            block_size: dim3,
+            hw_props: HardwareProperties,
+    ) -> tuple[int, ...]:
+        """Returns an optimized block size configuration with block sizes being aligned with the warp size.
+        Raises CodegenError if optimal block size could not be found or does not conform hardware limits.
+        """
+
+        def trim(to_trim: list[int]) -> list[int]:
+            return [min(b, i) for b, i in zip(to_trim, it_space)]
+
+        def check_sizes_and_return(ret: tuple[int, ...]) -> tuple[int, ...]:
+            if self.block_size_exceeds_hw_limits(ret, hw_props):
+                raise CodegenError(self._excessive_block_size_error_msg(ret))
+            return ret
+
+        trimmed = trim(list(block_size))
+        if (
+                prod(trimmed) >= hw_props.warp_size
+                and prod(trimmed) % hw_props.warp_size == 0
+        ):
+            # case 1: greater than min block size -> use trimmed result
+            return check_sizes_and_return(tuple(trimmed))
+
+        prev_trim_size = 0
+        resize_order = [0, 2, 1] if len(it_space) == 3 else range(len(it_space))
+        while prod(trimmed) != prev_trim_size:
+            prev_trim_size = prod(trimmed)
+
+            # case 2: trimmed block is equivalent to the whole iteration space
+            if all(b == i for b, i in zip(trimmed, it_space)):
+                return check_sizes_and_return(
+                    self._round_block_sizes_to_warp_size(tuple(trimmed), hw_props.warp_size)
+                )
+            else:
+                # double block size in each dimension until block is large enough (or case 2 triggers)
+                for d in resize_order:
+                    trimmed[d] *= 2
+
+                    # trim fastest moving dim to multiples of warp size
+                    if (
+                            d == 0
+                            and trimmed[d] > hw_props.warp_size
+                            and trimmed[d] % hw_props.warp_size != 0
+                    ):
+                        # subtract remainder
+                        trimmed[d] = trimmed[d] - (trimmed[d] % hw_props.warp_size)
+
+                    # check if block sizes are within hardware capabilities
+                    trimmed[d] = min(trimmed[d], hw_props.max_block_sizes[d])
+
+                    # trim again
+                    trimmed = trim(trimmed)
+
+                    # case 3: trim block is large enough
+                    if prod(trimmed) >= hw_props.warp_size:
+                        return check_sizes_and_return(
+                            self._round_block_sizes_to_warp_size(
+                                tuple(trimmed), hw_props.warp_size
+                            )
+                        )
+
+        raise CodegenError("Unable to determine GPU block size for this kernel.")
+
+    def evaluate(self, **kwargs) -> tuple[dim3, dim3]:
         from ..utils import div_ceil
 
         num_work_items = cast(
             dim3, tuple(int(wit(**kwargs)) for wit in self._num_work_items)
         )
 
-        computed_block_size: tuple[int, ...]
-        try:
-            if self._use_block_size_fitting:
-                computed_block_size = self.fit_block_size(
-                    num_work_items, self._block_size, self._hw_props
-                )
-            else:
-                computed_block_size = self.trim_block_size(
-                    num_work_items, self._block_size, self._hw_props
+        block_size: dim3
+        default_bs = GpuLaunchConfiguration.get_default_block_size(len(num_work_items))
+        if self._compute_block_size:
+            try:
+                computed_bs = self._compute_block_size(
+                    num_work_items, self._init_block_size, self._hw_props)
+
+                block_size = cast(dim3, computed_bs)
+            except CodegenError as e:
+                block_size = default_bs
+                warn(
+                    f"CodeGenError occurred: {getattr(e, 'message', repr(e))}. "
+                    f"Block size fitting could not determine optimal block size configuration. "
+                    f"Defaulting back to {default_bs}."
                 )
-
-            # check if assumption for warp size alignment is met
-            if (
-                self._assume_warp_aligned_block_size
-                and prod(computed_block_size) % self._hw_props.warp_size != 0
-            ):
-                raise CodegenError("Adapted block size is not divisible by warp size.")
-        except CodegenError as e:
-            warn(
-                f"CodeGenError occurred: {getattr(e, 'message', repr(e))}. "
-                f"Block size fitting could not determine optimal block size configuration. "
-                f"Defaulting back to {self._block_size}"
-            )
-            computed_block_size = self._block_size
-
-        adapted_block_size = cast(dim3, computed_block_size)
+        else:
+            block_size = default_bs
 
         grid_size = cast(
             dim3,
             tuple(
-                div_ceil(wit, bs) for wit, bs in zip(num_work_items, adapted_block_size)
+                div_ceil(wit, bs) for wit, bs in zip(num_work_items, block_size)
             ),
         )
 
-        return adapted_block_size, grid_size
+        return block_size, grid_size
 
     def jit_cache_key(self) -> Any:
-        return self._block_size
+        return ()
 
 
 class GpuIndexing:
@@ -437,18 +490,14 @@ class GpuIndexing:
         target: Target,
         scheme: GpuIndexingScheme,
         warp_size: int,
-        default_block_size: dim3 | _AUTO_TYPE | None = None,
         manual_launch_grid: bool = False,
-        use_block_size_fitting: bool = False,
         assume_warp_aligned_block_size: bool = False,
     ) -> None:
         self._ctx = ctx
         self._target = target
         self._scheme = scheme
         self._warp_size = warp_size
-        self._default_block_size = default_block_size
         self._manual_launch_grid = manual_launch_grid
-        self._use_block_size_fitting = use_block_size_fitting
         self._assume_warp_aligned_block_size = assume_warp_aligned_block_size
 
         self._hw_props = HardwareProperties(
@@ -536,30 +585,11 @@ class GpuIndexing:
             return DynamicBlockSizeLaunchConfiguration(
                 num_work_items,
                 self._hw_props,
-                self._use_block_size_fitting,
                 self._assume_warp_aligned_block_size,
-                self._get_default_block_size(rank),
             )
 
         return factory
 
-    def _get_default_block_size(self, rank: int) -> dim3:
-        if self._default_block_size is None:
-            raise CodegenError("The default block size option was not set")
-
-        if isinstance(self._default_block_size, _AUTO_TYPE):
-            match rank:
-                case 1:
-                    return (256, 1, 1)
-                case 2:
-                    return (16, 16, 1)
-                case 3:
-                    return (8, 8, 4)
-                case _:
-                    assert False, "unreachable code"
-        else:
-            return self._default_block_size
-
     def _get_blockwise4d_config_factory(
         self,
     ) -> Callable[[], AutomaticLaunchConfiguration]:
diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py
index f63af091f..7d9db4711 100644
--- a/src/pystencils/jit/gpu_cupy.py
+++ b/src/pystencils/jit/gpu_cupy.py
@@ -207,7 +207,10 @@ class CupyKernelWrapper(KernelWrapper):
 
 class CupyJit(JitBase):
 
-    def __init__(self, default_block_size: Sequence[int] = (16, 16, 1)):
+    def __init__(
+            self,
+            default_block_size: Sequence[int] = GpuLaunchConfiguration.get_default_block_size(rank=3)
+    ):
         self._runtime_headers = {"<cstdint>"}
 
         if len(default_block_size) > 3:
diff --git a/tests/kernelcreation/test_gpu.py b/tests/kernelcreation/test_gpu.py
index 6cbc8f813..ae2a6d43d 100644
--- a/tests/kernelcreation/test_gpu.py
+++ b/tests/kernelcreation/test_gpu.py
@@ -32,7 +32,12 @@ except ImportError:
 
 @pytest.mark.parametrize("indexing_scheme", ["linear3d", "blockwise4d"])
 @pytest.mark.parametrize("manual_grid", [False, True])
-def test_indexing_options_3d(indexing_scheme: str, manual_grid: bool):
+@pytest.mark.parametrize("assume_warp_aligned_block_size", [False, True])
+def test_indexing_options_3d(
+        indexing_scheme: str,
+        manual_grid: bool,
+        assume_warp_aligned_block_size: bool,
+):
     src, dst = fields("src, dst: [3D]")
     asm = Assignment(
         dst.center(),
@@ -47,6 +52,7 @@ def test_indexing_options_3d(indexing_scheme: str, manual_grid: bool):
     cfg = CreateKernelConfig(target=Target.CUDA)
     cfg.gpu.indexing_scheme = indexing_scheme
     cfg.gpu.manual_launch_grid = manual_grid
+    cfg.gpu.assume_warp_aligned_block_size = assume_warp_aligned_block_size
 
     ast = create_kernel(asm, cfg)
     kernel = ast.compile()
@@ -57,14 +63,25 @@ def test_indexing_options_3d(indexing_scheme: str, manual_grid: bool):
     if manual_grid:
         match indexing_scheme:
             case "linear3d":
-                kernel.launch_config.block_size = (10, 8, 8)
-                kernel.launch_config.grid_size = (4, 4, 2)
+                if assume_warp_aligned_block_size:
+                    kernel.launch_config.block_size = (8, 10, 8)
+                    kernel.launch_config.grid_size = (5, 4, 2)
+                else:
+                    kernel.launch_config.block_size = (10, 10, 8)
+                    kernel.launch_config.grid_size = (4, 4, 2)
             case "blockwise4d":
-                kernel.launch_config.block_size = (40, 1, 1)
-                kernel.launch_config.grid_size = (32, 16, 1)
+                if assume_warp_aligned_block_size:
+                    kernel.launch_config.block_size = (64, 1, 1)
+                    kernel.launch_config.grid_size = (32, 16, 1)
+                else:
+                    kernel.launch_config.block_size = (40, 1, 1)
+                    kernel.launch_config.grid_size = (32, 16, 1)
 
     elif indexing_scheme == "linear3d":
-        kernel.launch_config.block_size = (10, 8, 8)
+        if assume_warp_aligned_block_size:
+            kernel.launch_config.block_size = (32, 8, 2)
+        else:
+            kernel.launch_config.block_size = (10, 10, 10)
 
     kernel(src=src_arr, dst=dst_arr)
 
@@ -73,22 +90,18 @@ def test_indexing_options_3d(indexing_scheme: str, manual_grid: bool):
 
     cp.testing.assert_allclose(dst_arr, expected)
 
-
-@pytest.mark.parametrize("indexing_scheme", ["linear3d", "blockwise4d"])
-@pytest.mark.parametrize("use_block_size_fitting", [True, False])
-@pytest.mark.parametrize("assume_warp_aligned_block_size", [True, False])
 @pytest.mark.parametrize("iteration_space",
                          [(8, 4, 4), (3, 8, 8), (3, 3, 16), (17, 3, 3), (3, 12, 56), (65, 65, 65), (3, 7, 9)])
 @pytest.mark.parametrize("initial_block_size",
                          [(8, 4, 4), (3, 8, 8), (3, 3, 16), (2, 2, 64), (8, 2, 1), (3, 1, 32), (32, 1, 1), (1, 2, 3)])
-def test_indexing_launch_config(
-        indexing_scheme: str,
-        use_block_size_fitting: bool,
-        assume_warp_aligned_block_size: bool,
+@pytest.mark.parametrize("assume_warp_aligned_block_size", [True, False])
+@pytest.mark.parametrize("use_block_fitting", [True, False])
+def test_block_size_adaptations(
         iteration_space: tuple[int, int, int],
-        initial_block_size: tuple[int, int, int]
+        initial_block_size: tuple[int, int, int],
+        assume_warp_aligned_block_size: bool,
+        use_block_fitting: bool,
 ):
-
     src, dst = fields("src, dst: [3D]")
     asm = Assignment(
         dst.center(),
@@ -102,24 +115,25 @@ def test_indexing_launch_config(
 
     target = Target.CUDA
     cfg = CreateKernelConfig(target=target)
-    cfg.gpu.indexing_scheme = indexing_scheme
-    cfg.gpu.use_block_size_fitting = use_block_size_fitting
+    cfg.gpu.indexing_scheme = "linear3d"
     cfg.gpu.assume_warp_aligned_block_size = assume_warp_aligned_block_size
 
+    warp_size = cfg.gpu.default_warp_size(target)
+    max_threads_per_block = GpuIndexing.get_max_threads_per_block(target)
+    max_block_sizes = GpuIndexing.get_max_block_sizes(target)
+
     ast = create_kernel(asm, cfg)
     kernel = ast.compile()
 
-    if use_block_size_fitting:
-        warp_size = cfg.gpu.default_warp_size(target)
-        max_threads_per_block = GpuIndexing.get_max_threads_per_block(target)
-        max_block_sizes = GpuIndexing.get_max_block_sizes(target)
-
-        block_size = kernel.launch_config.fit_block_size(
+    if use_block_fitting:
+        # test internal block fitting function later used in `kernel.launch_config.fit_block_size`
+        internal_block_size = kernel.launch_config._fit_block_size_to_it_space(
             iteration_space,
             initial_block_size,
             HardwareProperties(warp_size, max_threads_per_block, max_block_sizes)
         )
 
+        # checks if criterion for warp size alignment is fulfilled
         def check_suitability(b):
             return prod(b) >= warp_size and prod(b) % warp_size == 0
 
@@ -127,15 +141,19 @@ def test_indexing_launch_config(
         # -> check if ideal configurations are modified
         if (
                 check_suitability(initial_block_size)
-            and all(x == y for x, y in zip(initial_block_size, iteration_space))  # trimming may alter results
+                and all(x == y for x, y in zip(initial_block_size, iteration_space))  # trimming may alter results
         ):
-            assert all(x == y for x, y in zip(initial_block_size, block_size)), \
-                f"Initial block size unnecessarily adapted from {initial_block_size} to {block_size}."
+            assert all(x == y for x, y in zip(initial_block_size, internal_block_size)), \
+                f"Initial block size unnecessarily adapted from {initial_block_size} to {internal_block_size}."
 
-        assert check_suitability(block_size), \
+        assert check_suitability(internal_block_size), \
             "Determined block size shall be divisible by warp size."
 
-        kernel.launch_config.block_size = block_size
+        # set block size via fitting algorithm
+        kernel.launch_config.fit_block_size(initial_block_size)
+    else:
+        # set block size via trimming algorithm
+        kernel.launch_config.trim_block_size(initial_block_size)
 
     src_arr = cp.ones(iteration_space)
     dst_arr = cp.zeros_like(src_arr)
@@ -149,10 +167,12 @@ def test_indexing_launch_config(
 
 
 @pytest.mark.parametrize("indexing_scheme", ["linear3d", "blockwise4d"])
-@pytest.mark.parametrize("omit_range_check", [False, True])
 @pytest.mark.parametrize("manual_grid", [False, True])
+@pytest.mark.parametrize("assume_warp_aligned_block_size", [False, True])
 def test_indexing_options_2d(
-    indexing_scheme: str, omit_range_check: bool, manual_grid: bool
+        indexing_scheme: str,
+        manual_grid: bool,
+        assume_warp_aligned_block_size: bool
 ):
     src, dst = fields("src, dst: [2D]")
     asm = Assignment(
@@ -165,8 +185,8 @@ def test_indexing_options_2d(
 
     cfg = CreateKernelConfig(target=Target.CUDA)
     cfg.gpu.indexing_scheme = indexing_scheme
-    cfg.gpu.omit_range_check = omit_range_check
     cfg.gpu.manual_launch_grid = manual_grid
+    cfg.gpu.assume_warp_aligned_block_size = assume_warp_aligned_block_size
 
     ast = create_kernel(asm, cfg)
     kernel = ast.compile()
@@ -177,14 +197,25 @@ def test_indexing_options_2d(
     if manual_grid:
         match indexing_scheme:
             case "linear3d":
-                kernel.launch_config.block_size = (10, 8, 1)
-                kernel.launch_config.grid_size = (4, 2, 1)
+                if assume_warp_aligned_block_size:
+                    kernel.launch_config.block_size = (8, 8, 1)
+                    kernel.launch_config.grid_size = (5, 2, 1)
+                else:
+                    kernel.launch_config.block_size = (10, 8, 1)
+                    kernel.launch_config.grid_size = (4, 2, 1)
             case "blockwise4d":
-                kernel.launch_config.block_size = (40, 1, 1)
-                kernel.launch_config.grid_size = (16, 1, 1)
+                if assume_warp_aligned_block_size:
+                    kernel.launch_config.block_size = (64, 1, 1)
+                    kernel.launch_config.grid_size = (16, 1, 1)
+                else:
+                    kernel.launch_config.block_size = (40, 1, 1)
+                    kernel.launch_config.grid_size = (16, 1, 1)
 
     elif indexing_scheme == "linear3d":
-        kernel.launch_config.block_size = (10, 8, 1)
+        if assume_warp_aligned_block_size:
+            kernel.launch_config.block_size = (8, 8, 1)
+        else:
+            kernel.launch_config.block_size = (10, 8, 1)
 
     kernel(src=src_arr, dst=dst_arr)
 
-- 
GitLab
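
A hedged end-to-end sketch of the interface introduced in this patch, following the adapted test above (identifiers as they appear in the patch; only one of the two adaptation calls is needed, the most recent call determines the strategy used at launch):

```python
import pystencils as ps

src, dst = ps.fields("src, dst: [3D]")
asm = ps.Assignment(dst.center(), src.center())

cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
cfg.gpu.indexing_scheme = "linear3d"
cfg.gpu.assume_warp_aligned_block_size = True

kernel = ps.create_kernel(asm, cfg).compile()

# block sizes are no longer a code generator option; instead, request an
# adaptation strategy on the launch configuration:
kernel.launch_config.trim_block_size((8, 8, 4))   # trim against the iteration space
# or, alternatively, warp-align the block size via the fitting algorithm:
kernel.launch_config.fit_block_size((8, 8, 4))
```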


From dde422637cec5298cf90b3cf8c82655ebc594a2e Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Fri, 14 Mar 2025 14:08:45 +0100
Subject: [PATCH 41/54] Fix lint

---
 src/pystencils/codegen/driver.py       |  2 -
 src/pystencils/codegen/gpu_indexing.py | 66 ++++++++++++++------------
 tests/kernelcreation/test_gpu.py       | 20 ++++----
 3 files changed, 44 insertions(+), 44 deletions(-)

diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index bd5fc8b0b..65352f520 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -402,8 +402,6 @@ class DefaultKernelCreationDriver:
         if self._target != Target.CUDA:
             return None
 
-        from .gpu_indexing import dim3
-
         idx_scheme: GpuIndexingScheme = self._cfg.gpu.get_option("indexing_scheme")
         manual_launch_grid: bool = self._cfg.gpu.get_option("manual_launch_grid")
         assume_warp_aligned_block_size: bool = self._cfg.gpu.get_option("assume_warp_aligned_block_size")
diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index c25aa3154..31e89353c 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -92,8 +92,7 @@ class GpuLaunchConfiguration(ABC):
 
     @staticmethod
     def block_size_exceeds_hw_limits(
-            block_size: tuple[int, ...],
-            hw_props: HardwareProperties
+        block_size: tuple[int, ...], hw_props: HardwareProperties
     ) -> bool:
         """Checks if provided block size conforms limits given by the hardware."""
 
@@ -271,7 +270,12 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
         self._assume_warp_aligned_block_size = assume_warp_aligned_block_size
 
         self._init_block_size: dim3 | None = None
-        self._compute_block_size: Callable[[tuple[int, ...], tuple[int, ...], HardwareProperties], tuple[int, ...]] | None = None
+        self._compute_block_size: (
+            Callable[
+                [tuple[int, ...], tuple[int, ...], HardwareProperties], tuple[int, ...]
+            ]
+            | None
+        ) = None
 
         self._params: frozenset[Parameter] = frozenset().union(
             *(wit.parameters for wit in num_work_items)
@@ -299,8 +303,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
 
     @staticmethod
     def _round_block_sizes_to_warp_size(
-            to_round: tuple[int, ...],
-            warp_size: int
+        to_round: tuple[int, ...], warp_size: int
     ) -> tuple[int, ...]:
         # check if already aligns with warp size
         if prod(to_round) % warp_size == 0:
@@ -322,9 +325,9 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
 
     def trim_block_size(self, block_size: dim3):
         def call_trimming_factory(
-                it: dim3,
-                bs: dim3,
-                hw: HardwareProperties,
+            it: dim3,
+            bs: dim3,
+            hw: HardwareProperties,
         ):
             return self._trim_block_size_to_it_space(it, bs, hw)
 
@@ -332,10 +335,10 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
         self._compute_block_size = call_trimming_factory
 
     def _trim_block_size_to_it_space(
-            self,
-            it_space: dim3,
-            block_size: dim3,
-            hw_props: HardwareProperties,
+        self,
+        it_space: dim3,
+        block_size: dim3,
+        hw_props: HardwareProperties,
     ) -> tuple[int, ...]:
         """Returns specified block sizes trimmed with iteration space.
         Raises CodegenError if the trimmed block size does not conform to hardware limits.
@@ -346,8 +349,8 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
             raise CodegenError(self._excessive_block_size_error_msg(ret))
 
         if (
-                self._assume_warp_aligned_block_size
-                and prod(ret) % self._hw_props.warp_size != 0
+            self._assume_warp_aligned_block_size
+            and prod(ret) % self._hw_props.warp_size != 0
         ):
             self._round_block_sizes_to_warp_size(ret, hw_props.warp_size)
 
@@ -355,9 +358,9 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
 
     def fit_block_size(self, block_size: dim3):
         def call_fitting_factory(
-                it: dim3,
-                bs: dim3,
-                hw: HardwareProperties,
+            it: dim3,
+            bs: dim3,
+            hw: HardwareProperties,
         ):
             return self._fit_block_size_to_it_space(it, bs, hw)
 
@@ -365,10 +368,10 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
         self._compute_block_size = call_fitting_factory
 
     def _fit_block_size_to_it_space(
-            self,
-            it_space: dim3,
-            block_size: dim3,
-            hw_props: HardwareProperties,
+        self,
+        it_space: dim3,
+        block_size: dim3,
+        hw_props: HardwareProperties,
     ) -> tuple[int, ...]:
         """Returns an optimized block size configuration with block sizes being aligned with the warp size.
         Raises CodegenError if an optimal block size could not be found or does not conform to hardware limits.
@@ -384,8 +387,8 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
 
         trimmed = trim(list(block_size))
         if (
-                prod(trimmed) >= hw_props.warp_size
-                and prod(trimmed) % hw_props.warp_size == 0
+            prod(trimmed) >= hw_props.warp_size
+            and prod(trimmed) % hw_props.warp_size == 0
         ):
             # case 1: greater than min block size -> use trimmed result
             return check_sizes_and_return(tuple(trimmed))
@@ -398,7 +401,9 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
             # case 2: trimmed block is equivalent to the whole iteration space
             if all(b == i for b, i in zip(trimmed, it_space)):
                 return check_sizes_and_return(
-                    self._round_block_sizes_to_warp_size(tuple(trimmed), hw_props.warp_size)
+                    self._round_block_sizes_to_warp_size(
+                        tuple(trimmed), hw_props.warp_size
+                    )
                 )
             else:
                 # double block size in each dimension until block is large enough (or case 2 triggers)
@@ -407,9 +412,9 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
 
                     # trim fastest moving dim to multiples of warp size
                     if (
-                            d == 0
-                            and trimmed[d] > hw_props.warp_size
-                            and trimmed[d] % hw_props.warp_size != 0
+                        d == 0
+                        and trimmed[d] > hw_props.warp_size
+                        and trimmed[d] % hw_props.warp_size != 0
                     ):
                         # subtract remainder
                         trimmed[d] = trimmed[d] - (trimmed[d] % hw_props.warp_size)
@@ -442,7 +447,8 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
         if self._compute_block_size:
             try:
                 computed_bs = self._compute_block_size(
-                    num_work_items, self._init_block_size, self._hw_props)
+                    num_work_items, self._init_block_size, self._hw_props
+                )
 
                 block_size = cast(dim3, computed_bs)
             except CodegenError as e:
@@ -457,9 +463,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
 
         grid_size = cast(
             dim3,
-            tuple(
-                div_ceil(wit, bs) for wit, bs in zip(num_work_items, block_size)
-            ),
+            tuple(div_ceil(wit, bs) for wit, bs in zip(num_work_items, block_size)),
         )
 
         return block_size, grid_size
diff --git a/tests/kernelcreation/test_gpu.py b/tests/kernelcreation/test_gpu.py
index ae2a6d43d..70b4edd35 100644
--- a/tests/kernelcreation/test_gpu.py
+++ b/tests/kernelcreation/test_gpu.py
@@ -34,9 +34,9 @@ except ImportError:
 @pytest.mark.parametrize("manual_grid", [False, True])
 @pytest.mark.parametrize("assume_warp_aligned_block_size", [False, True])
 def test_indexing_options_3d(
-        indexing_scheme: str,
-        manual_grid: bool,
-        assume_warp_aligned_block_size: bool,
+    indexing_scheme: str,
+    manual_grid: bool,
+    assume_warp_aligned_block_size: bool,
 ):
     src, dst = fields("src, dst: [3D]")
     asm = Assignment(
@@ -97,10 +97,10 @@ def test_indexing_options_3d(
 @pytest.mark.parametrize("assume_warp_aligned_block_size", [True, False])
 @pytest.mark.parametrize("use_block_fitting", [True, False])
 def test_block_size_adaptations(
-        iteration_space: tuple[int, int, int],
-        initial_block_size: tuple[int, int, int],
-        assume_warp_aligned_block_size: bool,
-        use_block_fitting: bool,
+    iteration_space: tuple[int, int, int],
+    initial_block_size: tuple[int, int, int],
+    assume_warp_aligned_block_size: bool,
+    use_block_fitting: bool,
 ):
     src, dst = fields("src, dst: [3D]")
     asm = Assignment(
@@ -130,7 +130,7 @@ def test_block_size_adaptations(
         internal_block_size = kernel.launch_config._fit_block_size_to_it_space(
             iteration_space,
             initial_block_size,
-            HardwareProperties(warp_size, max_threads_per_block, max_block_sizes)
+            HardwareProperties(warp_size, max_threads_per_block, max_block_sizes),
         )
 
         # checks if criterion for warp size alignment is fulfilled
@@ -170,9 +170,7 @@ def test_block_size_adaptations(
 @pytest.mark.parametrize("manual_grid", [False, True])
 @pytest.mark.parametrize("assume_warp_aligned_block_size", [False, True])
 def test_indexing_options_2d(
-        indexing_scheme: str,
-        manual_grid: bool,
-        assume_warp_aligned_block_size: bool
+    indexing_scheme: str, manual_grid: bool, assume_warp_aligned_block_size: bool
 ):
     src, dst = fields("src, dst: [2D]")
     asm = Assignment(
-- 
GitLab
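
The grid sizes used throughout these tests follow from the ceil-division in `evaluate` (see the `div_ceil(wit, bs)` hunk above). A minimal stand-alone sketch of that relation; `div_ceil` and `grid_from_block` are illustrative local helpers, not the pystencils API, and the 40x16x1 work-item count is only an assumed example:

```python
# Sketch of the grid-size derivation: one block per full or partial tile of work items.
def div_ceil(a: int, b: int) -> int:
    """Integer ceiling division for positive operands."""
    return -(-a // b)

def grid_from_block(num_work_items, block_size):
    return tuple(div_ceil(wit, bs) for wit, bs in zip(num_work_items, block_size))

# e.g. for an assumed 40x16x1 work-item count:
assert grid_from_block((40, 16, 1), (10, 8, 1)) == (4, 2, 1)
assert grid_from_block((40, 16, 1), (8, 8, 1)) == (5, 2, 1)
```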


From 7b818fa39bc395619d59ce99d5d5f251d51bc63c Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Fri, 14 Mar 2025 14:36:53 +0100
Subject: [PATCH 42/54] Fix typecheck

---
 src/pystencils/codegen/config.py       |  4 +---
 src/pystencils/codegen/gpu_indexing.py | 16 +++++++---------
 2 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py
index 56d1f234c..04ceaf93c 100644
--- a/src/pystencils/codegen/config.py
+++ b/src/pystencils/codegen/config.py
@@ -742,9 +742,7 @@ class CreateKernelConfig(ConfigBase):
                 UserWarning,
             )
 
-            self.gpu = GpuOptions(
-                block_size=gpu_indexing_params.get("block_size", None)
-            )
+            self.gpu = GpuOptions()
 
 
 def _deprecated_option(name, instead):  # pragma: no cover
diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 31e89353c..b80a5027e 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -269,12 +269,11 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
 
         self._assume_warp_aligned_block_size = assume_warp_aligned_block_size
 
-        self._init_block_size: dim3 | None = None
+        default_bs = GpuLaunchConfiguration.get_default_block_size(len(num_work_items))
+        self._default_block_size = default_bs
+        self._init_block_size: dim3 = default_bs
         self._compute_block_size: (
-            Callable[
-                [tuple[int, ...], tuple[int, ...], HardwareProperties], tuple[int, ...]
-            ]
-            | None
+            Callable[[dim3, dim3, HardwareProperties], tuple[int, ...]] | None
         ) = None
 
         self._params: frozenset[Parameter] = frozenset().union(
@@ -443,7 +442,6 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
         )
 
         block_size: dim3
-        default_bs = GpuLaunchConfiguration.get_default_block_size(len(num_work_items))
         if self._compute_block_size:
             try:
                 computed_bs = self._compute_block_size(
@@ -452,14 +450,14 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
 
                 block_size = cast(dim3, computed_bs)
             except CodegenError as e:
-                block_size = default_bs
+                block_size = self._default_block_size
                 warn(
                     f"CodeGenError occurred: {getattr(e, 'message', repr(e))}. "
                     f"Block size fitting could not determine optimal block size configuration. "
-                    f"Defaulting back to {default_bs}."
+                    f"Defaulting back to {self._default_block_size}."
                 )
         else:
-            block_size = default_bs
+            block_size = self._default_block_size
 
         grid_size = cast(
             dim3,
-- 
GitLab
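
The patch above caches the default block size at construction time and falls back to it whenever the block-size callback fails. A minimal sketch of that fallback pattern; `CodegenError` is stubbed locally and `resolve_block_size` is an illustrative helper, not part of pystencils:

```python
from warnings import warn

class CodegenError(Exception):
    """Local stand-in for the pystencils codegen error type."""

def resolve_block_size(compute, default_bs, num_work_items, init_bs, hw_props):
    """Use the computed block size if the callback succeeds, otherwise warn and fall back."""
    if compute is None:
        return default_bs
    try:
        return compute(num_work_items, init_bs, hw_props)
    except CodegenError as e:
        warn(
            f"CodeGenError occurred: {getattr(e, 'message', repr(e))}. "
            f"Defaulting back to {default_bs}."
        )
        return default_bs
```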


From b400aabfc82ea32ef413f6803d41c984970fb022 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Fri, 14 Mar 2025 17:27:47 +0100
Subject: [PATCH 43/54] Adapt documentation for CUDA block sizes

---
 docs/source/backend/gpu_codegen.md     | 14 ++++-----
 docs/source/user_manual/gpu_kernels.md | 43 +++++++++++++++++---------
 src/pystencils/codegen/gpu_indexing.py | 37 +++++++++++++++-------
 3 files changed, 61 insertions(+), 33 deletions(-)

diff --git a/docs/source/backend/gpu_codegen.md b/docs/source/backend/gpu_codegen.md
index 6cc00a6e2..e5035d7c7 100644
--- a/docs/source/backend/gpu_codegen.md
+++ b/docs/source/backend/gpu_codegen.md
@@ -72,13 +72,13 @@ These depend on the type of the launch configuration:
 while the `AutomaticLaunchConfiguration` permits no modification and computes grid and block size directly from kernel
 parameters,
 the `ManualLaunchConfiguration` requires the user to manually specify both grid and block size.
-The `DynamicBlockSizeLaunchConfiguration` permits the user to set a block size which is used to then dynamically compute
-the grid size. However, the actual block size being used may differ from the user specification. This may be due to
-trimming operations between the original block size and the iteration space to avoid spawning unnecessarily large 
-blocks. In case the `GpuOptions.use_block_size_fitting` option is set, a block fitting algorithm adapts the original 
-block size such that it aligns with the warp size given by the hardware for improved performance. The algorithm 
-`GpuLaunchConfiguration.fit_block_size` incrementally increases the trimmed block size until it is rounded to a multiple 
-of the warp size while considering the maximum amount of threads per block.
+The `DynamicBlockSizeLaunchConfiguration` dynamically computes the grid size from either the default block size
+or a computed block size. Computing block sizes can be signaled by the user via the `trim_block_size` or 
+`fit_block_size` member functions. These functions receive an initial block size as an argument and adapt it.
+The `trim_block_size` function trims the initial block size with the sizes of the iteration space, i.e. it takes 
+the minimum value of both sizes per dimension. The `fit_block_size` function performs a block fitting algorithm that adapts 
+the initial block size by incrementally enlarging the trimmed block size until it is large enough 
+and aligns with the warp size.
 
 The `evaluate` method can only be used from within a Python runtime environment.
 When exporting pystencils CUDA kernels for external use in C++ projects,
diff --git a/docs/source/user_manual/gpu_kernels.md b/docs/source/user_manual/gpu_kernels.md
index 2443108c1..0a6620123 100644
--- a/docs/source/user_manual/gpu_kernels.md
+++ b/docs/source/user_manual/gpu_kernels.md
@@ -111,20 +111,30 @@ For most kernels with an at most three-dimensional iteration space,
 this behavior is sufficient and desired.
 It can be enforced by setting `gpu.indexing_scheme = "Linear3D"`.
 
-If the `Linear3D` indexing scheme is used, you may modify the GPU thread block size in two places.
-The default block size for the kernel can be set via the {any}`gpu.block_size <GpuOptions.block_size>` 
-code generator option;
-if none is specified, a default depending on the iteration space's dimensionality will be used.
-Please note that this **initial block size configuration may be altered** by the code generator.
-
-The block size can furthermore be modified at the compiled kernel's wrapper object via the
-`launch_config.block_size` attribute:
+The GPU thread block size of a compiled kernel's wrapper object can only be directly modified
+for manual launch configurations, cf. the section for [manual launch configurations](#manual_launch_grids).
+Linear indexing schemes without manual launch configurations either employ default block sizes
+or compute block sizes using user-exposed member functions that adapt the initial block size that was
+passed as an argument. The available functions are `fit_block_size` and `trim_block_size`.
+The `trim_block_size` function trims the user-defined initial block size with the iteration space.
+The `fit_block_size` employs a fitting algorithm that finds a suitable block configuration that is
+divisible by the hardware's warp size.
 
 ```{code-cell} ipython3
 kfunc = kernel.compile()
-kfunc.launch_config.block_size = (256, 2, 1)
 
-# Run the kernel
+# three different configuration cases for block size:
+
+# a) Nothing
+# -> use default block size, perform no fitting, no trimming
+
+# b) Activate fitting with initial block size
+kfunc.launch_config.fit_block_size(256, 2, 1)
+
+# c) Activate trimming with initial block size
+kfunc.launch_config.trim_block_size(256, 2, 1)
+
+# finally run the kernel...
 kfunc(f=f_arr, g=g_arr)
 ```
 
@@ -132,12 +142,17 @@ Block sizes aligning with multiples of the hardware's warp size allow for a bett
 Even though such block size configurations are not enforced, notifying our code generator via the 
 `GpuOptions.assume_warp_aligned_block_size` option that the configured block size is divisible by the warp size allows 
 for further optimization potential, e.g. for warp-level reductions.
-When setting this option to `True`, the user has to make sure that this alignment applies. This can be achieved by
-manually providing suitable block sizes via the `launch_config.block_size` attribute or by enabling the option
-`GpuOptions.fit_block_size` to automatically adapt the initial block size correspondingly.
+When setting this option to `True`, the user has to make sure that this alignment applies. 
+For [manual launch configurations](#manual_launch_grids), this can be achieved by manually providing suitable 
+block sizes via the `launch_config.block_size`. For the other launch configurations, this criterion is guaranteed 
+by using the default block sizes in pystencils. Using `fit_block_size` also guarantees this while also producing
+block sizes that are better customized towards the kernel's iteration space. For `trim_block_size`, 
+the trimmed block's dimension that is closest to the next multiple of the warp size is rounded up to the next multiple
+of the warp size. For all cases, the final block size is checked against the imposed hardware limits and an error
+is thrown in case these limits are exceeded.
 
 In any case, pystencils will automatically compute the grid size from the shapes of the kernel's array arguments
-and the given thread block size.
+and the final thread block size.
 
 :::{attention}
 
diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index b80a5027e..026d4bf07 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -230,27 +230,34 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration):
 
 
 class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
-    """GPU launch configuration that permits the user to set a block size and dynamically computes the grid size.
-
-    The actual launch grid size is computed from the user-defined ``user_block_size``, the number of work items
-    in the kernel's iteration space and potentially the `GpuOptions.warp_size` as follows.
+    """GPU launch configuration that dynamically computes the grid size from either the default block size
+    or a computed block size. Computing block sizes can be triggered via the `trim_block_size` or `fit_block_size`
+    member functions. These functions adapt a user-defined initial block size that they receive as an argument.
+    The adaptation of the initial block sizes is described in the following:
 
     For each dimension :math:`c \\in \\{ x, y, z \\}`,
 
-    - if `GpuOptions.use_block_size_fitting`:
-        In case the user-specified block size does not coincide with multiples of the warp size of the GPU,
-        it will be altered by a block size fitting algorithm, i.e.
+    - if `fit_block_size` was chosen, the initial block size is adapted such that it aligns with
+    multiples of the hardware's warp size. This is done using a fitting algorithm that first trims
+    the initial block size with the iteration space and increases it incrementally until it is large enough
+    and coincides with multiples of the warp size, i.e.
 
-        ``block_size.c = fit_block_size(user_block_size.c)``
+        ``block_size.c = _fit_block_size_to_it_space(iter_space.c, init_block_size.c, hardware_properties)``
 
         The fitted block size also guarantees that `GpuOptions.assume_warp_aligned_block_size` can safely be used.
 
-    - otherwise:
-        In this case trimming between the number of work items and the kernel's iteration space occurs, i.e.
+    - elif `trim_block_size` was chosen, the initial block size is trimmed against the number of work items
+    in the kernel's iteration space, i.e.
+
+        - if ``init_block_size.c > num_work_items.c``, ``block_size = num_work_items.c``
+        - otherwise, ``block_size.c = init_block_size.c``
 
-        - if ``user_block_size.c > num_work_items.c``, ``block_size = num_work_items.c``
-        - otherwise, ``block_size.c = user_block_size.c``
+        When `GpuOptions.assume_warp_aligned_block_size` is set, we ensure warp-alignment by
+        rounding the block size dimension that is closest to the next multiple of the warp size up to that multiple.
 
+    - else the default block size is taken, i.e.
+
+        ``block_size.c = get_default_block_size(rank=3).c``
 
     The actual launch grid size is then computed as follows.
 
@@ -330,6 +337,9 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
         ):
             return self._trim_block_size_to_it_space(it, bs, hw)
 
+        if self._compute_block_size:
+            warn("Overwriting an already existing callback function for computing the block size.")
+
         self._init_block_size = block_size
         self._compute_block_size = call_trimming_factory
 
@@ -363,6 +373,9 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
         ):
             return self._fit_block_size_to_it_space(it, bs, hw)
 
+        if self._compute_block_size:
+            warn("Overwriting an already existing callback function for computing the block size.")
+
         self._init_block_size = block_size
         self._compute_block_size = call_fitting_factory
 
-- 
GitLab
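
The trimming rule described in the docstring above (per-dimension minimum against the iteration space, followed by optional warp-size rounding) condenses to a few lines. The sketch below is a simplified stand-alone variant that always rounds the fastest-moving dimension, whereas the actual implementation rounds the dimension closest to its next warp multiple; all names are illustrative:

```python
from math import prod

def trim_block_size(block_size, it_space, warp_size, warp_aligned=False):
    """Per-dimension minimum with the iteration space, optionally rounded to a warp multiple."""
    trimmed = tuple(min(b, i) for b, i in zip(block_size, it_space))
    if warp_aligned and prod(trimmed) % warp_size != 0:
        # simplified: round the fastest-moving dimension up to the next warp multiple
        x, *rest = trimmed
        x = ((x + warp_size - 1) // warp_size) * warp_size
        trimmed = (x, *rest)
    return trimmed

assert trim_block_size((256, 2, 1), (40, 16, 1), 32) == (40, 2, 1)
assert trim_block_size((256, 2, 1), (40, 16, 1), 32, warp_aligned=True) == (64, 2, 1)
```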


From 403cbbcf05264e6096e40151aea78161acbb1002 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Mon, 17 Mar 2025 13:44:48 +0100
Subject: [PATCH 44/54] Fix docs

---
 docs/source/user_manual/gpu_kernels.md | 15 ++++++++-------
 src/pystencils/codegen/gpu_indexing.py | 22 ++++++++++++----------
 2 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/docs/source/user_manual/gpu_kernels.md b/docs/source/user_manual/gpu_kernels.md
index 0a6620123..146c2c428 100644
--- a/docs/source/user_manual/gpu_kernels.md
+++ b/docs/source/user_manual/gpu_kernels.md
@@ -115,12 +115,13 @@ The GPU thread block size of a compiled kernel's wrapper object can only be dire
 for manual launch configurations, cf. the section for [manual launch configurations](#manual_launch_grids).
 Linear indexing schemes without manual launch configurations either employ default block sizes
 or compute block sizes using user-exposed member functions that adapt the initial block size that was
-passed as an argument. The available functions are `fit_block_size` and `trim_block_size`.
-The `trim_block_size` function trims the user-defined initial block size with the iteration space.
-The `fit_block_size` employs a fitting algorithm that finds a suitable block configuration that is
+passed as an argument. The available functions are :meth:`fit_block_size` and :meth:`trim_block_size`.
+The :meth:`trim_block_size` function trims the user-defined initial block size with the iteration space.
+The :meth:`fit_block_size` employs a fitting algorithm that finds a suitable block configuration that is
 divisible by the hardware's warp size.
 
 ```{code-cell} ipython3
+:tags: [raises-exception]
 kfunc = kernel.compile()
 
 # three different configuration cases for block size:
@@ -129,10 +130,10 @@ kfunc = kernel.compile()
 # -> use default block size, perform no fitting, no trimming
 
 # b) Activate fitting with initial block size
-kfunc.launch_config.fit_block_size(256, 2, 1)
+kfunc.launch_config.fit_block_size((8, 8, 4))
 
 # c) Activate trimming with initial block size
-kfunc.launch_config.trim_block_size(256, 2, 1)
+#kfunc.launch_config.trim_block_size((8, 8, 4))
 
 # finally run the kernel...
 kfunc(f=f_arr, g=g_arr)
@@ -145,8 +146,8 @@ for further optimization potential, e.g. for warp-level reductions.
 When setting this option to `True`, the user has to make sure that this alignment applies. 
 For [manual launch configurations](#manual_launch_grids), this can be achieved by manually providing suitable 
 block sizes via the `launch_config.block_size`. For the other launch configurations, this criterion is guaranteed 
-by using the default block sizes in pystencils. Using `fit_block_size` also guarantees this while also producing
-block sizes that are better customized towards the kernel's iteration space. For `trim_block_size`, 
+by using the default block sizes in pystencils. Using :meth:`fit_block_size` also guarantees this while also producing
+block sizes that are better customized towards the kernel's iteration space. For :meth:`trim_block_size`, 
 the trimmed block's dimension that is closest to the next multiple of the warp size is rounded up to the next multiple
 of the warp size. For all cases, the final block size is checked against the imposed hardware limits and an error
 is thrown in case these limits are exceeded.
diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 026d4bf07..16b0826e7 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -231,23 +231,25 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration):
 
 class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
     """GPU launch configuration that dynamically computes the grid size from either the default block size
-    or a computed block size. Computing block sizes can be triggered via the `trim_block_size` or `fit_block_size`
-    member functions. These functions adapt a user-defined initial block size that they receive as an argument.
-    The adaptation of the initial block sizes is described in the following:
+    or a computed block size. Computing block sizes can be triggered via the :meth:`trim_block_size` or
+    :meth:`fit_block_size` member functions. These functions adapt a user-defined initial block size that they
+    receive as an argument. The adaptation of the initial block sizes is described in the following:
 
     For each dimension :math:`c \\in \\{ x, y, z \\}`,
 
-    - if `fit_block_size` was chosen, the initial block size is adapted such that it aligns with
-    multiples of the hardware's warp size. This is done using a fitting algorithm that first trims
-    the initial block size with the iteration space and increases it incrementally until it is large enough
-    and coincides with multiples of the warp size, i.e.
+    - if :meth:`fit_block_size` was chosen:
+
+        the initial block size is adapted such that it aligns with multiples of the hardware's warp size.
+        This is done using a fitting algorithm that first trims the initial block size with the iteration space
+        and increases it incrementally until it is large enough and coincides with multiples of the warp size, i.e.
 
         ``block_size.c = _fit_block_size_to_it_space(iter_space.c, init_block_size.c, hardware_properties)``
 
         The fitted block size also guarantees that `GpuOptions.assume_warp_aligned_block_size` can safely be used.
 
-    - elif `trim_block_size` was chosen, the initial block size is trimmed against the number of work items
-    in the kernel's iteration space, i.e.
+    - elif :meth:`trim_block_size` was chosen:
+
+        the initial block size is trimmed against the number of work items in the kernel's iteration space, i.e.
 
         - if ``init_block_size.c > num_work_items.c``, ``block_size = num_work_items.c``
         - otherwise, ``block_size.c = init_block_size.c``
@@ -255,7 +257,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
         When `GpuOptions.assume_warp_aligned_block_size` is set, we ensure warp-alignment by
         rounding the block size dimension that is closest to the next multiple of the warp size up to that multiple.
 
-    - else the default block size is taken, i.e.
+    - otherwise: the default block size is taken, i.e.
 
         ``block_size.c = get_default_block_size(rank=3).c``
 
-- 
GitLab
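
The docstring above speaks of rounding "the block size dimension that is closest to the next multiple of the warp size". One possible reading of that rule is sketched below; it is an illustration only, and the actual `_round_block_sizes_to_warp_size` may pick the dimension differently:

```python
from math import prod

def round_to_warp_size(block_size, warp_size):
    """Round one dimension up so that the total thread count becomes a warp multiple."""
    if prod(block_size) % warp_size == 0:
        return tuple(block_size)  # already aligned

    def distance_to_next_multiple(v):
        return (-v) % warp_size

    sizes = list(block_size)
    # only dimensions that are not yet warp multiples are candidates
    candidates = [d for d, v in enumerate(sizes) if v % warp_size != 0]
    d = min(candidates, key=lambda i: distance_to_next_multiple(sizes[i]))
    sizes[d] += distance_to_next_multiple(sizes[d])
    return tuple(sizes)

assert round_to_warp_size((10, 8, 1), 32) == (32, 8, 1)
```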


From 27ae1f428a94163528d8ee3d4cd282e08f0f5c1a Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Tue, 18 Mar 2025 12:00:36 +0100
Subject: [PATCH 45/54] Adapt warnings

---
 src/pystencils/codegen/gpu_indexing.py | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 16b0826e7..a497411c7 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -139,7 +139,7 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
 
     @block_size.setter
     def block_size(self, val: dim3):
-        warn("Fruitless attempt to set `block_size`.")
+        warn("Setting `block_size` on an automatic launch configuration has no effect.", UserWarning)
 
     @property
     def parameters(self) -> frozenset[Parameter]:
@@ -307,7 +307,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
 
     @block_size.setter
     def block_size(self, val: dim3):
-        warn("Fruitless attempt to set `block_size`.")
+        warn("Setting `block_size` on an dynamic launch configuration has no effect.", UserWarning)
 
     @staticmethod
     def _round_block_sizes_to_warp_size(
@@ -339,9 +339,6 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
         ):
             return self._trim_block_size_to_it_space(it, bs, hw)
 
-        if self._compute_block_size:
-            warn("Overwriting an already existing callback function for computing the block size.")
-
         self._init_block_size = block_size
         self._compute_block_size = call_trimming_factory
 
@@ -375,9 +372,6 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
         ):
             return self._fit_block_size_to_it_space(it, bs, hw)
 
-        if self._compute_block_size:
-            warn("Overwriting an already existing callback function for computing the block size.")
-
         self._init_block_size = block_size
         self._compute_block_size = call_fitting_factory
 
-- 
GitLab


From a6125bc7beb1e69bcfbf06b20d6906e6cd5ede5d Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Tue, 18 Mar 2025 12:01:47 +0100
Subject: [PATCH 46/54] Move block_size_exceeds_hw_limits to HardwareProperties

---
 src/pystencils/codegen/gpu_indexing.py | 36 +++++++++++++-------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index a497411c7..042af70db 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -34,6 +34,20 @@ class HardwareProperties:
     max_threads_per_block: int
     max_block_sizes: dim3
 
+    def block_size_exceeds_hw_limits(
+            self,
+            block_size: tuple[int, ...]
+    ) -> bool:
+        """Checks if provided block size conforms limits given by the hardware."""
+
+        return (
+            any(
+                size > max_size
+                for size, max_size in zip(block_size, self.max_block_sizes)
+            )
+            or prod(block_size) > self.max_threads_per_block
+        )
+
 
 class GpuLaunchConfiguration(ABC):
     """Base class for launch configurations for CUDA and HIP kernels.
@@ -90,20 +104,6 @@ class GpuLaunchConfiguration(ABC):
             case _:
                 assert False, "unreachable code"
 
-    @staticmethod
-    def block_size_exceeds_hw_limits(
-        block_size: tuple[int, ...], hw_props: HardwareProperties
-    ) -> bool:
-        """Checks if provided block size conforms limits given by the hardware."""
-
-        return (
-            any(
-                size > max_size
-                for size, max_size in zip(block_size, hw_props.max_block_sizes)
-            )
-            or prod(block_size) > hw_props.max_threads_per_block
-        )
-
     @staticmethod
     def _excessive_block_size_error_msg(block_size: tuple[int, ...]):
         return f"Unable to determine GPU block size for this kernel. \
@@ -158,7 +158,7 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
             *block_size[1:],
         )
 
-        if self.block_size_exceeds_hw_limits(block_size, self._hw_props):
+        if self._hw_props.block_size_exceeds_hw_limits(block_size):
             raise CodegenError(f"Block size {block_size} exceeds hardware limits.")
 
         grid_size = tuple(int(gs(**kwargs)) for gs in self._grid_size)
@@ -220,7 +220,7 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration):
                 "`assume_warp_aligned_block_size` enabled."
             )
 
-        if self.block_size_exceeds_hw_limits(self._block_size, self._hw_props):
+        if self._hw_props.block_size_exceeds_hw_limits(self._block_size):
             raise CodegenError(self._excessive_block_size_error_msg(self._block_size))
 
         return self._block_size, self._grid_size
@@ -353,7 +353,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
         """
 
         ret = tuple([min(b, i) for b, i in zip(block_size, it_space)])
-        if self.block_size_exceeds_hw_limits(ret, hw_props):
+        if hw_props.block_size_exceeds_hw_limits(ret):
             raise CodegenError(self._excessive_block_size_error_msg(ret))
 
         if (
@@ -389,7 +389,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
             return [min(b, i) for b, i in zip(to_trim, it_space)]
 
         def check_sizes_and_return(ret: tuple[int, ...]) -> tuple[int, ...]:
-            if self.block_size_exceeds_hw_limits(ret, hw_props):
+            if hw_props.block_size_exceeds_hw_limits(ret):
                 raise CodegenError(self._excessive_block_size_error_msg(ret))
             return ret
 
-- 
GitLab
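
The relocated `block_size_exceeds_hw_limits` checks both the per-dimension maxima and the total thread count. A small usage sketch against typical CUDA limits (the limit values here are typical figures, not queried from an actual device):

```python
from dataclasses import dataclass
from math import prod

@dataclass
class HardwareProperties:
    warp_size: int
    max_threads_per_block: int
    max_block_sizes: tuple[int, int, int]

    def block_size_exceeds_hw_limits(self, block_size) -> bool:
        """True if any dimension or the total thread count breaks a hardware limit."""
        return (
            any(size > max_size for size, max_size in zip(block_size, self.max_block_sizes))
            or prod(block_size) > self.max_threads_per_block
        )

props = HardwareProperties(32, 1024, (1024, 1024, 64))
assert not props.block_size_exceeds_hw_limits((32, 8, 4))  # 1024 threads, within limits
assert props.block_size_exceeds_hw_limits((64, 32, 1))     # 2048 threads exceed the limit
```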


From 74b1457af0851606743d6421a61bc0396edfca1f Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Tue, 18 Mar 2025 12:02:12 +0100
Subject: [PATCH 47/54] Remove _default_block_size from CupyJit

---
 src/pystencils/jit/gpu_cupy.py | 15 +--------------
 1 file changed, 1 insertion(+), 14 deletions(-)

diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py
index 7d9db4711..5a078df9f 100644
--- a/src/pystencils/jit/gpu_cupy.py
+++ b/src/pystencils/jit/gpu_cupy.py
@@ -207,22 +207,9 @@ class CupyKernelWrapper(KernelWrapper):
 
 class CupyJit(JitBase):
 
-    def __init__(
-            self,
-            default_block_size: Sequence[int] = GpuLaunchConfiguration.get_default_block_size(rank=3)
-    ):
+    def __init__(self):
         self._runtime_headers = {"<cstdint>"}
 
-        if len(default_block_size) > 3:
-            raise ValueError(
-                f"Invalid block size: {default_block_size}. Must be at most three-dimensional."
-            )
-
-        self._default_block_size: tuple[int, int, int] = cast(
-            tuple[int, int, int],
-            tuple(default_block_size) + (1,) * (3 - len(default_block_size)),
-        )
-
     def compile(self, kernel: Kernel) -> KernelWrapper:
         if not HAVE_CUPY:
             raise JitError(
-- 
GitLab


From 5737f64d2769201388fc1eaa4124e30b0a9c15cd Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Tue, 18 Mar 2025 12:02:42 +0100
Subject: [PATCH 48/54] Adapt docs

---
 docs/source/user_manual/gpu_kernels.md | 2 +-
 src/pystencils/codegen/config.py       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/user_manual/gpu_kernels.md b/docs/source/user_manual/gpu_kernels.md
index 146c2c428..48a24e703 100644
--- a/docs/source/user_manual/gpu_kernels.md
+++ b/docs/source/user_manual/gpu_kernels.md
@@ -133,7 +133,7 @@ kfunc = kernel.compile()
 kfunc.launch_config.fit_block_size((8, 8, 4))
 
 # c) Activate trimming with initial block size
-#kfunc.launch_config.trim_block_size((8, 8, 4))
+kfunc.launch_config.trim_block_size((8, 8, 4))
 
 # finally run the kernel...
 kfunc(f=f_arr, g=g_arr)
diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py
index 04ceaf93c..d6f8e403c 100644
--- a/src/pystencils/codegen/config.py
+++ b/src/pystencils/codegen/config.py
@@ -391,7 +391,7 @@ class GpuOptions(ConfigBase):
     
     If set to `True`, the code generator can employ optimizations that require this assumption, 
     e.g. warp-level reductions.
-    The generator also checks if user-provided block sizes fulfill this criterion.
+    The pystencils Cupy runtime also checks if user-provided block sizes fulfill this criterion.
     """
 
     @staticmethod
-- 
GitLab


From d733102e178bca762284a640d0084a7a889ceceb Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Tue, 18 Mar 2025 12:08:14 +0100
Subject: [PATCH 49/54] Move ceil_to_multiple of block sizes for automatic
 launch configs to factory

---
 src/pystencils/codegen/gpu_indexing.py | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 042af70db..724106ec2 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -146,17 +146,7 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
         return self._params
 
     def evaluate(self, **kwargs) -> tuple[dim3, dim3]:
-        # impossible to use block size determination function since the iteration space is unknown
-        # -> round block size in fastest moving dimension up to multiple of warp size
         block_size = tuple(int(bs(**kwargs)) for bs in self._block_size)
-        block_size = (
-            (
-                ceil_to_multiple(block_size[0], self._hw_props.warp_size)
-                if self._assume_warp_aligned_block_size
-                else block_size[0]
-            ),
-            *block_size[1:],
-        )
 
         if self._hw_props.block_size_exceeds_hw_limits(block_size):
             raise CodegenError(f"Block size {block_size} exceeds hardware limits.")
@@ -610,8 +600,16 @@ class GpuIndexing:
         if rank > 4:
             raise ValueError(f"Iteration space rank is too large: {rank}")
 
+        # impossible to use block size determination function since the iteration space is unknown
+        # -> round block size in fastest moving dimension up to multiple of warp size
+        rounded_block_size: PsExpression
+        if self._assume_warp_aligned_block_size:
+            rounded_block_size = ceil_to_multiple(work_items[0], self._hw_props.warp_size)
+        else:
+            rounded_block_size = work_items[0]
+
         block_size = (
-            self._kernel_factory.create_lambda(work_items[0]),
+            self._kernel_factory.create_lambda(rounded_block_size),
             self._kernel_factory.create_lambda(self._ast_factory.parse_index(1)),
             self._kernel_factory.create_lambda(self._ast_factory.parse_index(1)),
         )
-- 
GitLab


From 3ad5b4a4ea09aa82b1fc4f24771cf4b691e3969e Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Tue, 18 Mar 2025 12:09:25 +0100
Subject: [PATCH 50/54] Fix lint

---
 src/pystencils/jit/gpu_cupy.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py
index 5a078df9f..4e36e369e 100644
--- a/src/pystencils/jit/gpu_cupy.py
+++ b/src/pystencils/jit/gpu_cupy.py
@@ -1,4 +1,4 @@
-from typing import Any, Sequence, cast, Callable
+from typing import Any, Callable
 from dataclasses import dataclass
 
 try:
-- 
GitLab


From a3fd0c243d713b9e22a04779c360af0786bc9732 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Tue, 18 Mar 2025 12:36:06 +0100
Subject: [PATCH 51/54] Fix type for ceil_to_multiple in automatic launch
 config lambda

---
 src/pystencils/codegen/gpu_indexing.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 724106ec2..52efee81d 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -11,6 +11,7 @@ from .parameters import Parameter
 from .errors import CodegenError
 from .config import GpuIndexingScheme
 from .target import Target
+from ..backend.constants import PsConstant
 
 from ..backend.kernelcreation import (
     KernelCreationContext,
@@ -604,7 +605,9 @@ class GpuIndexing:
         # -> round block size in fastest moving dimension up to multiple of warp size
         rounded_block_size: PsExpression
         if self._assume_warp_aligned_block_size:
-            rounded_block_size = ceil_to_multiple(work_items[0], self._hw_props.warp_size)
+            rounded_block_size = ceil_to_multiple(
+                work_items[0],
+                PsExpression.make(PsConstant(self._hw_props.warp_size, work_items[0].dtype)))
         else:
             rounded_block_size = work_items[0]
 
-- 
GitLab


From 1a2b8f74552467455d0dcbd1cf023646ad2a7f41 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Tue, 18 Mar 2025 12:48:53 +0100
Subject: [PATCH 52/54] Throw attribute error for wrongfully used block size
 setter

---
 src/pystencils/codegen/gpu_indexing.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 52efee81d..bcffbbcae 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -140,7 +140,7 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
 
     @block_size.setter
     def block_size(self, val: dim3):
-        warn("Setting `block_size` on an automatic launch configuration has no effect.", UserWarning)
+        AttributeError("Setting `block_size` on an automatic launch configuration has no effect.")
 
     @property
     def parameters(self) -> frozenset[Parameter]:
@@ -298,7 +298,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
 
     @block_size.setter
     def block_size(self, val: dim3):
-        warn("Setting `block_size` on an dynamic launch configuration has no effect.", UserWarning)
+        AttributeError("Setting `block_size` on an dynamic launch configuration has no effect.")
 
     @staticmethod
     def _round_block_sizes_to_warp_size(
-- 
GitLab
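
Note that a property setter only blocks assignment if the exception is actually raised. A minimal sketch of the read-only pattern; the class name and message are illustrative, not the pystencils types:

```python
class FrozenLaunchConfig:
    """Toy launch configuration whose block size cannot be reassigned."""

    def __init__(self, block_size):
        self._block_size = block_size

    @property
    def block_size(self):
        return self._block_size

    @block_size.setter
    def block_size(self, val):
        # merely constructing an AttributeError would be a no-op; it must be raised
        raise AttributeError("block_size cannot be set on this launch configuration")

cfg = FrozenLaunchConfig((32, 8, 1))
try:
    cfg.block_size = (64, 1, 1)
except AttributeError:
    pass
assert cfg.block_size == (32, 8, 1)
```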


From f65cd9bf27082525cfb97cf3958bda75bae4719e Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Tue, 18 Mar 2025 12:50:14 +0100
Subject: [PATCH 53/54] Create expression for ceil_to_multiple manually

---
 src/pystencils/codegen/gpu_indexing.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index bcffbbcae..d5f0aead2 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -20,7 +20,7 @@ from ..backend.kernelcreation import (
 )
 from ..backend.platforms.cuda import ThreadMapping
 
-from ..backend.ast.expressions import PsExpression
+from ..backend.ast.expressions import PsExpression, PsIntDiv
 from math import prod
 
 from ..utils import ceil_to_multiple
@@ -605,9 +605,10 @@ class GpuIndexing:
         # -> round block size in fastest moving dimension up to multiple of warp size
         rounded_block_size: PsExpression
         if self._assume_warp_aligned_block_size:
-            rounded_block_size = ceil_to_multiple(
-                work_items[0],
-                PsExpression.make(PsConstant(self._hw_props.warp_size, work_items[0].dtype)))
+            warp_size = self._ast_factory.parse_index(self._hw_props.warp_size)
+            rounded_block_size = self._ast_factory.parse_index(
+                PsIntDiv(work_items[0].clone() + warp_size.clone() - self._ast_factory.parse_index(1),
+                         warp_size.clone()) * warp_size.clone())
         else:
             rounded_block_size = work_items[0]
 
-- 
GitLab
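
The expression assembled above encodes the usual integer round-up identity, with `PsIntDiv` standing in for the `//` term. A quick plain-Python sanity check of that identity (not pystencils code):

```python
import math

def round_up(x: int, w: int) -> int:
    """Round x up to the next multiple of w, as encoded by the PsIntDiv expression above."""
    return ((x + w - 1) // w) * w

for x in range(1, 200):
    for w in (16, 32, 64):
        assert round_up(x, w) == math.ceil(x / w) * w
```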


From c84ad702852e746a19221cfb80bdc2cefc634918 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Tue, 18 Mar 2025 14:28:13 +0100
Subject: [PATCH 54/54] Fix lint

---
 src/pystencils/codegen/gpu_indexing.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index d5f0aead2..d473e5b4a 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -11,7 +11,6 @@ from .parameters import Parameter
 from .errors import CodegenError
 from .config import GpuIndexingScheme
 from .target import Target
-from ..backend.constants import PsConstant
 
 from ..backend.kernelcreation import (
     KernelCreationContext,
-- 
GitLab