From 9a15d1f77df46864d3d16068c71f99ec07b51628 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Mon, 10 Feb 2025 18:39:26 +0100
Subject: [PATCH 01/17] Implement Lambda and GpuLaunchGridConstraints. Start on
 backend WorkItemMapping.

---
 .../backend/platforms/generic_gpu.py          | 26 ++++++++--
 src/pystencils/codegen/config.py              | 30 +++++++++++
 src/pystencils/codegen/gpu_indexing.py        | 51 +++++++++++++++++++
 src/pystencils/codegen/lambdas.py             | 45 ++++++++++++++++
 4 files changed, 149 insertions(+), 3 deletions(-)
 create mode 100644 src/pystencils/codegen/gpu_indexing.py
 create mode 100644 src/pystencils/codegen/lambdas.py

diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py
index 15df36cdd..f22c3c99b 100644
--- a/src/pystencils/backend/platforms/generic_gpu.py
+++ b/src/pystencils/backend/platforms/generic_gpu.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 from typing import TYPE_CHECKING
-from abc import abstractmethod
+from abc import ABC, abstractmethod
 
 from ..ast.expressions import PsExpression
 from ..ast.structural import PsBlock
@@ -16,6 +16,26 @@ if TYPE_CHECKING:
     from ...codegen.kernel import GpuThreadsRange
 
 
+class WorkItemMapping(ABC):
+    """Signature for work-item mappings used to modify the thread index mapping behavior"""
+
+    @abstractmethod
+    def __call__(
+        self,
+        block_idx: tuple[PsExpression, PsExpression, PsExpression],
+        thread_idx: tuple[PsExpression, PsExpression, PsExpression],
+        ispace_rank: int,
+    ) -> tuple[PsExpression, ...]:
+        """Compute a work item index from the current block index, thread index, and iteration space dimensionality.
+        
+        Implementations of this method must return a tuple with `ispace_rank` entries,
+        containing expressions for the compressed index of the work item identified by the
+        given GPU block and thread index triples.
+        (The *compressed index* is the work item's index before application
+        of the iteration space's lower limits and strides.)
+        """
+
+
 class GenericGpu(Platform):
     @abstractmethod
     def materialize_iteration_space(
@@ -38,13 +58,13 @@ class GenericGpu(Platform):
     @classmethod
     def _threads_from_full_ispace(cls, ispace: FullIterationSpace) -> GpuThreadsRange:
         from ...codegen.kernel import GpuThreadsRange
-        
+
         dimensions = ispace.dimensions_in_loop_order()[::-1]
         if len(dimensions) > 3:
             raise NotImplementedError(
                 f"Cannot create a GPU threads range for an {len(dimensions)}-dimensional iteration space"
             )
-        
+
         from ..ast.analysis import collect_undefined_symbols as collect
 
         for dim in dimensions:
diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py
index bce075731..96ab13ea0 100644
--- a/src/pystencils/codegen/config.py
+++ b/src/pystencils/codegen/config.py
@@ -3,6 +3,7 @@ from __future__ import annotations
 from warnings import warn
 from abc import ABC
 from collections.abc import Collection
+from enum import Enum, auto
 
 from typing import TYPE_CHECKING, Sequence, Generic, TypeVar, Callable, Any, cast
 from dataclasses import dataclass, InitVar, fields
@@ -331,6 +332,35 @@ class CpuOptions(ConfigBase):
     """
 
 
+class GpuIndexingScheme(Enum):
+    """Available index translation schemes for GPU kernels."""
+
+    Linear3D = auto()
+    """Map coordinates to global thread indices.
+
+    Supports up to three-dimensional iteration spaces.
+    For each dimension (with known start, stop and step values), compute the current iteration
+    point as ``start + step * (blockIdx.c * blockDim.c * threadDim.c)``
+    (where c :math:`\\in` (x, y, z)).
+    """
+
+    Blockwise4D = auto()
+    """On a 3D grid of 1D blocks, map the fastest coordinate onto the intra-block thread index,
+    and slower coordinates onto the block index.
+
+    Supports up to four-dimensional iteration spaces.
+    Using this indexing scheme, the iteration counters of up to four dimensions are assigned
+    like follows, from slowest to fastest:
+
+    .. code-block:: C++
+
+        ctr_3 = blockIdx.z;
+        ctr_2 = blockIdx.y;
+        ctr_1 = blockIdx.x;
+        ctr_0 = threadIDx.x;
+    """
+
+
 @dataclass
 class GpuOptions(ConfigBase):
     """Configuration options specific to GPU targets."""
diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
new file mode 100644
index 000000000..1d9bf9c2c
--- /dev/null
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -0,0 +1,51 @@
+from __future__ import annotations
+
+from itertools import chain
+
+from .lambdas import Lambda
+from .parameters import Parameter
+
+
+_ConstraintTriple = tuple[Lambda | None, Lambda | None, Lambda | None]
+
+
+class GpuLaunchGridConstraints:
+    """Constraints on the number of threads and blocks on the GPU launch grid for a given kernel.
+    
+    This constraints set determines all or some of
+    the number of threads on a GPU block as well as the number of blocks on the GPU grid,
+    statically or depending on runtime parameters.
+    """
+
+    def __init__(
+        self,
+        block_size: _ConstraintTriple | None = None,
+        grid_size: _ConstraintTriple | None = None,
+    ) -> None:
+        self._block_size: _ConstraintTriple = (
+            (None, None, None) if block_size is None else block_size
+        )
+        self._grid_size: _ConstraintTriple = (
+            (None, None, None) if grid_size is None else grid_size
+        )
+
+        params = set()
+        for constr in chain(self._block_size, self._grid_size):
+            if constr is not None:
+                params |= set(constr.parameters)
+        self._params = frozenset(params)
+
+    @property
+    def parameters(self) -> frozenset[Parameter]:
+        """Parameters to this set of constraints"""
+        return self._params
+
+    @property
+    def block_size(self) -> _ConstraintTriple:
+        """Constraints on the number of threads per block"""
+        return self._block_size
+
+    @property
+    def grid_size(self) -> _ConstraintTriple:
+        """Constraints on the number of blocks on the grid"""
+        return self._grid_size
diff --git a/src/pystencils/codegen/lambdas.py b/src/pystencils/codegen/lambdas.py
new file mode 100644
index 000000000..dd0fb571d
--- /dev/null
+++ b/src/pystencils/codegen/lambdas.py
@@ -0,0 +1,45 @@
+from __future__ import annotations
+from typing import Sequence
+
+import numpy as np
+
+from .parameters import Parameter
+from ..types import PsType
+
+from ..backend.ast.expressions import PsExpression
+
+
+class Lambda:
+    """A one-line function emitted by the code generator as an auxiliary object."""
+
+    def __init__(self, expr: PsExpression, params: Sequence[Parameter]):
+        self._expr = expr
+        self._params = tuple(params)
+        self._return_type = expr.get_dtype()
+    
+    @property
+    def parameters(self) -> tuple[Parameter, ...]:
+        """Parameters to this lambda"""
+        return self._params
+
+    @property
+    def return_type(self) -> PsType:
+        """Return type of this lambda"""
+        return self._return_type
+
+    def __call__(self, **kwargs) -> np.generic:
+        """Evaluate this lambda with the given arguments.
+        
+        The lambda must receive a value for each parameter listed in `parameters`.
+        """
+        from ..backend.ast.expressions import evaluate_expression
+        return evaluate_expression(self._expr, kwargs)
+
+    def __str__(self) -> str:
+        return str(self._expr)
+
+    def c_code(self) -> str:
+        """Print the C code of this lambda"""
+        from ..backend.emission import CAstPrinter
+        printer = CAstPrinter()
+        return printer(self._expr)
-- 
GitLab


From 998948952e1d7c95f6384c3e3e46024356a4f745 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Tue, 11 Feb 2025 09:58:50 +0100
Subject: [PATCH 02/17] extract thread idx mapping from cuda platform. remove
 threads range computation from cuda and sycl platforms.

---
 src/pystencils/backend/platforms/cuda.py      | 149 +++++++++++++-----
 .../backend/platforms/generic_gpu.py          |  24 +--
 src/pystencils/backend/platforms/sycl.py      |  18 +--
 3 files changed, 114 insertions(+), 77 deletions(-)

diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index ff41d3a68..4f78d344d 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -1,11 +1,11 @@
 from __future__ import annotations
-from warnings import warn
-from typing import TYPE_CHECKING
+from abc import ABC, abstractmethod
 
-from ...types import constify
+from ...types import constify, deconstify
 from ..exceptions import MaterializationError
 from .generic_gpu import GenericGpu
 
+from ..memory import PsSymbol
 from ..kernelcreation import (
     Typifier,
     IterationSpace,
@@ -29,8 +29,6 @@ from ...types import PsSignedIntegerType, PsIeeeFloatType
 from ..literals import PsLiteral
 from ..functions import PsMathFunction, MathFunctions, CFunction
 
-if TYPE_CHECKING:
-    from ...codegen import GpuThreadsRange
 
 int32 = PsSignedIntegerType(width=32, const=False)
 
@@ -48,18 +46,88 @@ GRID_DIM = [
 ]
 
 
+class DenseThreadIdxMapping(ABC):
+
+    @abstractmethod
+    def __call__(self, ispace: FullIterationSpace) -> dict[PsSymbol, PsExpression]:
+        """Map the current thread index onto a point in the given iteration space.
+
+        Implementations of this method must return a declaration for each dimension counter
+        of the given iteration space.
+        """
+
+
+class Linear3DMapping(DenseThreadIdxMapping):
+    """3D globally linearized mapping, where each thread is assigned a work item according to
+    its location in the global launch grid."""
+
+    def __call__(self, ispace: FullIterationSpace) -> dict[PsSymbol, PsExpression]:
+        if ispace.rank > 3:
+            raise MaterializationError(
+                f"Cannot handle {ispace.rank}-dimensional iteration space "
+                "using the Linear3D GPU thread index mapping."
+            )
+
+        dimensions = ispace.dimensions_in_loop_order()
+        idx_map: dict[PsSymbol, PsExpression] = dict()
+
+        for coord, dim in enumerate(dimensions[::-1]):
+            tid = self._linear_thread_idx(coord)
+            idx_map[dim.counter] = dim.start + dim.step * PsCast(
+                deconstify(dim.counter.get_dtype()), tid
+            )
+
+        return idx_map
+
+    def _linear_thread_idx(self, coord: int):
+        block_size = BLOCK_DIM[coord]
+        block_idx = BLOCK_IDX[coord]
+        thread_idx = THREAD_IDX[coord]
+        return block_idx * block_size + thread_idx
+
+
+class Blockwise4DMapping(DenseThreadIdxMapping):
+    """Blockwise index mapping for up to 4D iteration spaces, where the outer three dimensions
+    are mapped to block indices."""
+
+    _indices_in_loop_order = [  # slowest to fastest
+        BLOCK_IDX[2],
+        BLOCK_IDX[1],
+        BLOCK_IDX[0],
+        THREAD_IDX[0],
+    ]
+
+    def __call__(self, ispace: FullIterationSpace) -> dict[PsSymbol, PsExpression]:
+        if ispace.rank > 4:
+            raise MaterializationError(
+                f"Cannot handle {ispace.rank}-dimensional iteration space "
+                "using the Blockwise4D GPU thread index mapping."
+            )
+
+        dimensions = ispace.dimensions_in_loop_order()
+        idx_map: dict[PsSymbol, PsExpression] = dict()
+
+        for dim, tid in zip(dimensions, self._indices_in_loop_order):
+            idx_map[dim.counter] = dim.start + dim.step * PsCast(
+                deconstify(dim.counter.get_dtype()), tid
+            )
+
+        return idx_map
+
+
 class CudaPlatform(GenericGpu):
     """Platform for CUDA-based GPUs."""
 
     def __init__(
-        self, ctx: KernelCreationContext,
+        self,
+        ctx: KernelCreationContext,
         omit_range_check: bool = False,
-        manual_launch_grid: bool = False,
+        dense_idx_mapping: DenseThreadIdxMapping | None = None,
     ) -> None:
         super().__init__(ctx)
 
         self._omit_range_check = omit_range_check
-        self._manual_launch_grid = manual_launch_grid
+        self._dense_idx_mapping = dense_idx_mapping
 
         self._typify = Typifier(ctx)
 
@@ -69,7 +137,7 @@ class CudaPlatform(GenericGpu):
 
     def materialize_iteration_space(
         self, body: PsBlock, ispace: IterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange | None]:
+    ) -> PsBlock:
         if isinstance(ispace, FullIterationSpace):
             return self._prepend_dense_translation(body, ispace)
         elif isinstance(ispace, SparseIterationSpace):
@@ -141,40 +209,41 @@ class CudaPlatform(GenericGpu):
 
     def _prepend_dense_translation(
         self, body: PsBlock, ispace: FullIterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange | None]:
+    ) -> PsBlock:
         dimensions = ispace.dimensions_in_loop_order()
 
-        if not self._manual_launch_grid:
-            try:
-                threads_range = self.threads_from_ispace(ispace)
-            except MaterializationError as e:
-                warn(
-                    str(e.args[0])
-                    + "\nIf this is intended, set `manual_launch_grid=True` in the code generator configuration.",
-                    UserWarning,
-                )
-                threads_range = None
-        else:
-            threads_range = None
+        #   TODO move to codegen
+        # if not self._manual_launch_grid:
+        #     try:
+        #         threads_range = self.threads_from_ispace(ispace)
+        #     except MaterializationError as e:
+        #         warn(
+        #             str(e.args[0])
+        #             + "\nIf this is intended, set `manual_launch_grid=True` in the code generator configuration.",
+        #             UserWarning,
+        #         )
+        #         threads_range = None
+        # else:
+        #     threads_range = None
+
+        idx_mapper = (
+            self._dense_idx_mapping
+            if self._dense_idx_mapping is not None
+            else Linear3DMapping()
+        )
+        ctr_mapping = idx_mapper(ispace)
 
         indexing_decls = []
         conds = []
-        for i, dim in enumerate(dimensions[::-1]):
+        for dim in dimensions:
             dim.counter.dtype = constify(dim.counter.get_dtype())
 
-            ctr = PsExpression.make(dim.counter)
+            ctr_expr = PsExpression.make(dim.counter)
             indexing_decls.append(
-                self._typify(
-                    PsDeclaration(
-                        ctr,
-                        dim.start
-                        + dim.step
-                        * PsCast(ctr.get_dtype(), self._linear_thread_idx(i)),
-                    )
-                )
+                self._typify(PsDeclaration(ctr_expr, ctr_mapping[dim.counter]))
             )
             if not self._omit_range_check:
-                conds.append(PsLt(ctr, dim.stop))
+                conds.append(PsLt(ctr_expr, dim.stop))
 
         indexing_decls = indexing_decls[::-1]
 
@@ -187,16 +256,16 @@ class CudaPlatform(GenericGpu):
             body.statements = indexing_decls + body.statements
             ast = body
 
-        return ast, threads_range
+        return ast
 
     def _prepend_sparse_translation(
         self, body: PsBlock, ispace: SparseIterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange]:
+    ) -> PsBlock:
         factory = AstFactory(self._ctx)
         ispace.sparse_counter.dtype = constify(ispace.sparse_counter.get_dtype())
 
         sparse_ctr = PsExpression.make(ispace.sparse_counter)
-        thread_idx = self._linear_thread_idx(0)
+        thread_idx = BLOCK_IDX[0] * BLOCK_DIM[0] + THREAD_IDX[0]
         sparse_idx_decl = self._typify(
             PsDeclaration(sparse_ctr, PsCast(sparse_ctr.get_dtype(), thread_idx))
         )
@@ -224,10 +293,4 @@ class CudaPlatform(GenericGpu):
             body.statements = [sparse_idx_decl] + body.statements
             ast = body
 
-        return ast, self.threads_from_ispace(ispace)
-
-    def _linear_thread_idx(self, coord: int):
-        block_size = BLOCK_DIM[coord]
-        block_idx = BLOCK_IDX[coord]
-        thread_idx = THREAD_IDX[coord]
-        return block_idx * block_size + thread_idx
+        return ast
diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py
index f22c3c99b..f66f9aa0e 100644
--- a/src/pystencils/backend/platforms/generic_gpu.py
+++ b/src/pystencils/backend/platforms/generic_gpu.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 from typing import TYPE_CHECKING
-from abc import ABC, abstractmethod
+from abc import abstractmethod
 
 from ..ast.expressions import PsExpression
 from ..ast.structural import PsBlock
@@ -16,31 +16,11 @@ if TYPE_CHECKING:
     from ...codegen.kernel import GpuThreadsRange
 
 
-class WorkItemMapping(ABC):
-    """Signature for work-item mappings used to modify the thread index mapping behavior"""
-
-    @abstractmethod
-    def __call__(
-        self,
-        block_idx: tuple[PsExpression, PsExpression, PsExpression],
-        thread_idx: tuple[PsExpression, PsExpression, PsExpression],
-        ispace_rank: int,
-    ) -> tuple[PsExpression, ...]:
-        """Compute a work item index from the current block index, thread index, and iteration space dimensionality.
-        
-        Implementations of this method must return a tuple with `ispace_rank` entries,
-        containing expressions for the compressed index of the work item identified by the
-        given GPU block and thread index triples.
-        (The *compressed index* is the work item's index before application
-        of the iteration space's lower limits and strides.)
-        """
-
-
 class GenericGpu(Platform):
     @abstractmethod
     def materialize_iteration_space(
         self, body: PsBlock, ispace: IterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange | None]:
+    ) -> PsBlock:
         pass
 
     @classmethod
diff --git a/src/pystencils/backend/platforms/sycl.py b/src/pystencils/backend/platforms/sycl.py
index e1da9e223..7f2bbf9f6 100644
--- a/src/pystencils/backend/platforms/sycl.py
+++ b/src/pystencils/backend/platforms/sycl.py
@@ -1,5 +1,4 @@
 from __future__ import annotations
-from typing import TYPE_CHECKING
 
 from ..functions import CFunction, PsMathFunction, MathFunctions
 from ..kernelcreation.iteration_space import (
@@ -29,9 +28,6 @@ from .generic_gpu import GenericGpu
 from ..exceptions import MaterializationError
 from ...types import PsCustomType, PsIeeeFloatType, constify, PsIntegerType
 
-if TYPE_CHECKING:
-    from ...codegen import GpuThreadsRange
-
 
 class SyclPlatform(GenericGpu):
 
@@ -39,7 +35,7 @@ class SyclPlatform(GenericGpu):
         self,
         ctx: KernelCreationContext,
         omit_range_check: bool = False,
-        automatic_block_size: bool = False
+        automatic_block_size: bool = False,
     ):
         super().__init__(ctx)
 
@@ -52,7 +48,7 @@ class SyclPlatform(GenericGpu):
 
     def materialize_iteration_space(
         self, body: PsBlock, ispace: IterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange]:
+    ) -> PsBlock:
         if isinstance(ispace, FullIterationSpace):
             return self._prepend_dense_translation(body, ispace)
         elif isinstance(ispace, SparseIterationSpace):
@@ -113,15 +109,13 @@ class SyclPlatform(GenericGpu):
 
     def _prepend_dense_translation(
         self, body: PsBlock, ispace: FullIterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange]:
+    ) -> PsBlock:
         rank = ispace.rank
         id_type = self._id_type(rank)
         id_symbol = PsExpression.make(self._ctx.get_symbol("id", id_type))
         id_decl = self._id_declaration(rank, id_symbol)
 
         dimensions = ispace.dimensions_in_loop_order()
-        launch_config = self.threads_from_ispace(ispace)
-
         indexing_decls = [id_decl]
         conds = []
 
@@ -153,11 +147,11 @@ class SyclPlatform(GenericGpu):
             body.statements = indexing_decls + body.statements
             ast = body
 
-        return ast, launch_config
+        return ast
 
     def _prepend_sparse_translation(
         self, body: PsBlock, ispace: SparseIterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange]:
+    ) -> PsBlock:
         factory = AstFactory(self._ctx)
 
         id_type = PsCustomType("sycl::id< 1 >", const=True)
@@ -195,7 +189,7 @@ class SyclPlatform(GenericGpu):
             body.statements = [sparse_idx_decl] + body.statements
             ast = body
 
-        return ast, self.threads_from_ispace(ispace)
+        return ast
 
     def _item_type(self, rank: int):
         if not self._automatic_block_size:
-- 
GitLab


From 3d81f0311a1d569760a2aca08d43961abea74459 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Wed, 12 Feb 2025 14:52:13 +0100
Subject: [PATCH 03/17] remove GpuThreadsRange. Introduce GpuIndexing ABC and
 Linear3D implementation. Compute grid constraints for Linear3D. Start
 updating cupy JIT/

---
 src/pystencils/backend/memory.py              |  11 +-
 src/pystencils/backend/platforms/cuda.py      |  14 +--
 .../backend/platforms/generic_gpu.py          |  49 +-------
 src/pystencils/codegen/__init__.py            |   3 +-
 src/pystencils/codegen/config.py              |  57 ++++++++--
 src/pystencils/codegen/driver.py              |  59 +++++++---
 src/pystencils/codegen/errors.py              |   2 +
 .../codegen/{lambdas.py => functions.py}      |  15 ++-
 src/pystencils/codegen/gpu_indexing.py        | 106 +++++++++++++++++-
 src/pystencils/codegen/kernel.py              |  47 +-------
 src/pystencils/codegen/properties.py          |   5 +
 src/pystencils/jit/gpu_cupy.py                |  77 +++++++------
 .../platform/test_gpu_platforms.py            |  43 -------
 13 files changed, 283 insertions(+), 205 deletions(-)
 create mode 100644 src/pystencils/codegen/errors.py
 rename src/pystencils/codegen/{lambdas.py => functions.py} (77%)
 delete mode 100644 tests/nbackend/kernelcreation/platform/test_gpu_platforms.py

diff --git a/src/pystencils/backend/memory.py b/src/pystencils/backend/memory.py
index 7a5d62f69..0e9b21d6c 100644
--- a/src/pystencils/backend/memory.py
+++ b/src/pystencils/backend/memory.py
@@ -89,8 +89,13 @@ class PsSymbol:
         return f"PsSymbol({repr(self._name)}, {repr(self._dtype)})"
 
 
+class BackendPrivateProperty:
+    """Mix-in marker for symbol properties that are private to the backend
+    and should not be exported to parameters"""
+
+
 @dataclass(frozen=True)
-class BufferBasePtr(UniqueSymbolProperty):
+class BufferBasePtr(UniqueSymbolProperty, BackendPrivateProperty):
     """Symbol acts as a base pointer to a buffer."""
 
     buffer: PsBuffer
@@ -120,12 +125,12 @@ class PsBuffer:
         strides: Sequence[PsSymbol | PsConstant],
     ):
         bptr_type = base_ptr.get_dtype()
-        
+
         if not isinstance(bptr_type, PsPointerType):
             raise ValueError(
                 f"Type of buffer base pointer {base_ptr} was not a pointer type: {bptr_type}"
             )
-        
+
         if bptr_type.base_type != element_type:
             raise ValueError(
                 f"Base type of primary buffer base pointer {base_ptr} "
diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index 4f78d344d..cff6f935f 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -46,7 +46,7 @@ GRID_DIM = [
 ]
 
 
-class DenseThreadIdxMapping(ABC):
+class ThreadToIndexMapping(ABC):
 
     @abstractmethod
     def __call__(self, ispace: FullIterationSpace) -> dict[PsSymbol, PsExpression]:
@@ -57,7 +57,7 @@ class DenseThreadIdxMapping(ABC):
         """
 
 
-class Linear3DMapping(DenseThreadIdxMapping):
+class Linear3DMapping(ThreadToIndexMapping):
     """3D globally linearized mapping, where each thread is assigned a work item according to
     its location in the global launch grid."""
 
@@ -86,7 +86,7 @@ class Linear3DMapping(DenseThreadIdxMapping):
         return block_idx * block_size + thread_idx
 
 
-class Blockwise4DMapping(DenseThreadIdxMapping):
+class Blockwise4DMapping(ThreadToIndexMapping):
     """Blockwise index mapping for up to 4D iteration spaces, where the outer three dimensions
     are mapped to block indices."""
 
@@ -122,12 +122,12 @@ class CudaPlatform(GenericGpu):
         self,
         ctx: KernelCreationContext,
         omit_range_check: bool = False,
-        dense_idx_mapping: DenseThreadIdxMapping | None = None,
+        thread_mapping: ThreadToIndexMapping | None = None,
     ) -> None:
         super().__init__(ctx)
 
         self._omit_range_check = omit_range_check
-        self._dense_idx_mapping = dense_idx_mapping
+        self._thread_mapping = thread_mapping
 
         self._typify = Typifier(ctx)
 
@@ -227,8 +227,8 @@ class CudaPlatform(GenericGpu):
         #     threads_range = None
 
         idx_mapper = (
-            self._dense_idx_mapping
-            if self._dense_idx_mapping is not None
+            self._thread_mapping
+            if self._thread_mapping is not None
             else Linear3DMapping()
         )
         ctr_mapping = idx_mapper(ispace)
diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py
index f66f9aa0e..7491ec8e9 100644
--- a/src/pystencils/backend/platforms/generic_gpu.py
+++ b/src/pystencils/backend/platforms/generic_gpu.py
@@ -1,19 +1,9 @@
 from __future__ import annotations
-from typing import TYPE_CHECKING
 from abc import abstractmethod
 
-from ..ast.expressions import PsExpression
 from ..ast.structural import PsBlock
-from ..kernelcreation.iteration_space import (
-    IterationSpace,
-    FullIterationSpace,
-    SparseIterationSpace,
-)
+from ..kernelcreation.iteration_space import IterationSpace
 from .platform import Platform
-from ..exceptions import MaterializationError
-
-if TYPE_CHECKING:
-    from ...codegen.kernel import GpuThreadsRange
 
 
 class GenericGpu(Platform):
@@ -22,40 +12,3 @@ class GenericGpu(Platform):
         self, body: PsBlock, ispace: IterationSpace
     ) -> PsBlock:
         pass
-
-    @classmethod
-    def threads_from_ispace(cls, ispace: IterationSpace) -> GpuThreadsRange:
-        from ...codegen.kernel import GpuThreadsRange
-
-        if isinstance(ispace, FullIterationSpace):
-            return cls._threads_from_full_ispace(ispace)
-        elif isinstance(ispace, SparseIterationSpace):
-            work_items = (PsExpression.make(ispace.index_list.shape[0]),)
-            return GpuThreadsRange(work_items)
-        else:
-            assert False
-
-    @classmethod
-    def _threads_from_full_ispace(cls, ispace: FullIterationSpace) -> GpuThreadsRange:
-        from ...codegen.kernel import GpuThreadsRange
-
-        dimensions = ispace.dimensions_in_loop_order()[::-1]
-        if len(dimensions) > 3:
-            raise NotImplementedError(
-                f"Cannot create a GPU threads range for an {len(dimensions)}-dimensional iteration space"
-            )
-
-        from ..ast.analysis import collect_undefined_symbols as collect
-
-        for dim in dimensions:
-            symbs = collect(dim.start) | collect(dim.stop) | collect(dim.step)
-            for ctr in ispace.counters:
-                if ctr in symbs:
-                    raise MaterializationError(
-                        "Unable to construct GPU threads range for iteration space: "
-                        f"Limits of dimension counter {dim.counter.name} "
-                        f"depend on another dimension's counter {ctr.name}"
-                    )
-
-        work_items = [ispace.actual_iterations(dim) for dim in dimensions]
-        return GpuThreadsRange(work_items)
diff --git a/src/pystencils/codegen/__init__.py b/src/pystencils/codegen/__init__.py
index e13f911dd..fc1b70ca0 100644
--- a/src/pystencils/codegen/__init__.py
+++ b/src/pystencils/codegen/__init__.py
@@ -4,7 +4,7 @@ from .config import (
     AUTO,
 )
 from .parameters import Parameter
-from .kernel import Kernel, GpuKernel, GpuThreadsRange
+from .kernel import Kernel, GpuKernel
 from .driver import create_kernel, get_driver
 
 __all__ = [
@@ -14,7 +14,6 @@ __all__ = [
     "Parameter",
     "Kernel",
     "GpuKernel",
-    "GpuThreadsRange",
     "create_kernel",
     "get_driver",
 ]
diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py
index 96ab13ea0..53a271852 100644
--- a/src/pystencils/codegen/config.py
+++ b/src/pystencils/codegen/config.py
@@ -86,7 +86,9 @@ class Option(Generic[Option_T, Arg_T]):
         self._name = name
         self._lookup = f"_{name}"
 
-    def __get__(self, obj: ConfigBase, objtype: type[ConfigBase] | None = None) -> Option_T | None:
+    def __get__(
+        self, obj: ConfigBase, objtype: type[ConfigBase] | None = None
+    ) -> Option_T | None:
         if obj is None:
             return None
 
@@ -194,7 +196,9 @@ class Category(Generic[Category_T]):
         self._name = name
         self._lookup = f"_{name}"
 
-    def __get__(self, obj: ConfigBase, objtype: type[ConfigBase] | None = None) -> Category_T:
+    def __get__(
+        self, obj: ConfigBase, objtype: type[ConfigBase] | None = None
+    ) -> Category_T:
         if obj is None:
             return None
 
@@ -365,6 +369,9 @@ class GpuIndexingScheme(Enum):
 class GpuOptions(ConfigBase):
     """Configuration options specific to GPU targets."""
 
+    indexing_scheme: Option[GpuIndexingScheme, str] = Option(GpuIndexingScheme.Linear3D)
+    """Thread indexing scheme for dense GPU kernels."""
+
     omit_range_check: BasicOption[bool] = BasicOption(False)
     """If set to `True`, omit the iteration counter range check.
     
@@ -384,6 +391,31 @@ class GpuOptions(ConfigBase):
     The launch grid will then have to be specified manually at runtime.
     """
 
+    @indexing_scheme.validate
+    def _validate_idx_scheme(self, val: str | GpuIndexingScheme):
+        if isinstance(val, GpuIndexingScheme):
+            return val
+
+        match val.lower():
+            case "block":
+                warn(
+                    "GPU indexing scheme name `block` is deprecated and will be removed in pystencils 2.1. "
+                    "Use `Linear3D` instead."
+                )
+                return GpuIndexingScheme.Linear3D
+            case "line":
+                warn(
+                    "GPU indexing scheme name `line` is deprecated and will be removed in pystencils 2.1. "
+                    "Use `Blockwise4D` instead."
+                )
+                return GpuIndexingScheme.Blockwise4D
+            case "linear3d":
+                return GpuIndexingScheme.Linear3D
+            case "blockwise4d":
+                return GpuIndexingScheme.Blockwise4D
+            case _:
+                raise ValueError(f"Invalid GPU indexing scheme: {val}")
+
 
 @dataclass
 class SyclOptions(ConfigBase):
@@ -536,6 +568,9 @@ class CreateKernelConfig(ConfigBase):
     cpu_vectorize_info: InitVar[dict | None] = None
     """Deprecated; use `cpu.vectorize <CpuOptions.vectorize>` instead."""
 
+    gpu_indexing: InitVar[str | None] = None
+    """Deprecated; use `gpu.indexing_scheme` instead."""
+
     gpu_indexing_params: InitVar[dict | None] = None
     """Deprecated; set options in the `gpu` category instead."""
 
@@ -594,6 +629,7 @@ class CreateKernelConfig(ConfigBase):
         data_type: UserTypeSpec | None,
         cpu_openmp: bool | int | None,
         cpu_vectorize_info: dict | None,
+        gpu_indexing: str | None,
         gpu_indexing_params: dict | None,
     ):  # pragma: no cover
         if data_type is not None:
@@ -623,9 +659,7 @@ class CreateKernelConfig(ConfigBase):
                     deprecated_omp.enable = True
                     deprecated_omp.num_threads = cpu_openmp
                 case _:
-                    raise ValueError(
-                        f"Invalid option for `cpu_openmp`: {cpu_openmp}"
-                    )
+                    raise ValueError(f"Invalid option for `cpu_openmp`: {cpu_openmp}")
 
             self.cpu.openmp = deprecated_omp
 
@@ -682,11 +716,20 @@ class CreateKernelConfig(ConfigBase):
 
             self.cpu.vectorize = deprecated_vec_opts
 
+        if gpu_indexing is not None:
+            _deprecated_option("gpu_indexing", "gpu.indexing_scheme")
+            warn(
+                "Setting the deprecated `gpu_indexing` will override the `gpu.indexing_scheme` option",
+                UserWarning,
+            )
+            self.gpu.indexing_scheme = gpu_indexing
+
         if gpu_indexing_params is not None:
-            _deprecated_option("gpu_indexing_params", "gpu_indexing")
+            _deprecated_option("gpu_indexing_params", "gpu")
             warn(
                 "Setting the deprecated `gpu_indexing_params` will override any options "
-                "passed in the `gpu` category."
+                "passed in the `gpu` category.",
+                UserWarning,
             )
 
             self.gpu = GpuOptions(
diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index 6f44e718d..152fceba8 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -10,10 +10,12 @@ from .config import (
     _AUTO_TYPE,
     GhostLayerSpec,
     IterationSliceSpec,
+    GpuIndexingScheme,
 )
-from .kernel import Kernel, GpuKernel, GpuThreadsRange
-from .properties import PsSymbolProperty, FieldShape, FieldStride, FieldBasePtr
+from .kernel import Kernel, GpuKernel
+from .properties import PsSymbolProperty, FieldBasePtr
 from .parameters import Parameter
+from .gpu_indexing import GpuIndexing, GpuLaunchGridConstraints
 
 from ..field import Field
 from ..types import PsIntegerType, PsScalarType
@@ -145,6 +147,7 @@ class DefaultKernelCreationDriver:
         )
 
         self._target = cfg.get_target()
+        self._gpu_indexing: GpuIndexing | None = self._get_gpu_indexing()
         self._platform = self._get_platform()
 
         self._intermediates: CodegenIntermediates | None
@@ -169,9 +172,11 @@ class DefaultKernelCreationDriver:
                     kernel_body, self._ctx.get_iteration_space()
                 )
             case GenericGpu():
-                kernel_ast, gpu_threads = self._platform.materialize_iteration_space(
+                kernel_ast = self._platform.materialize_iteration_space(
                     kernel_body, self._ctx.get_iteration_space()
                 )
+            case _:
+                assert False, "unexpected platform"
 
         if self._intermediates is not None:
             self._intermediates.materialized_ispace = kernel_ast.clone()
@@ -219,7 +224,7 @@ class DefaultKernelCreationDriver:
                 self._ctx,
                 self._platform,
                 kernel_ast,
-                gpu_threads,
+                self._gpu_indexing,
                 self._cfg.get_option("function_name"),
                 self._target,
                 self._cfg.get_jit(),
@@ -395,6 +400,20 @@ class DefaultKernelCreationDriver:
 
         return kernel_ast
 
+    def _get_gpu_indexing(self) -> GpuIndexing | None:
+        if self._target != Target.CUDA:
+            return None
+
+        idx_scheme = self._cfg.gpu.get_option("indexing_scheme")
+
+        match idx_scheme:
+            case None | GpuIndexingScheme.Linear3D:
+                from .gpu_indexing import Linear3DGpuIndexing
+
+                return Linear3DGpuIndexing(self._ctx)
+            case _:
+                raise NotImplementedError()
+
     def _get_platform(self) -> Platform:
         if Target._CPU in self._target:
             if Target._X86 in self._target:
@@ -430,7 +449,9 @@ class DefaultKernelCreationDriver:
                 case Target.SYCL:
                     from ..backend.platforms import SyclPlatform
 
-                    auto_block_size: bool = self._cfg.sycl.get_option("automatic_block_size")
+                    auto_block_size: bool = self._cfg.sycl.get_option(
+                        "automatic_block_size"
+                    )
 
                     return SyclPlatform(
                         self._ctx,
@@ -440,12 +461,16 @@ class DefaultKernelCreationDriver:
                 case Target.CUDA:
                     from ..backend.platforms import CudaPlatform
 
-                    manual_grid = gpu_opts.get_option("manual_launch_grid")
+                    thread_mapping = (
+                        self._gpu_indexing.get_thread_mapping()
+                        if self._gpu_indexing is not None
+                        else None
+                    )
 
                     return CudaPlatform(
                         self._ctx,
                         omit_range_check=omit_range_check,
-                        manual_launch_grid=manual_grid,
+                        thread_mapping=thread_mapping,
                     )
 
         raise NotImplementedError(
@@ -475,23 +500,25 @@ def create_gpu_kernel_function(
     ctx: KernelCreationContext,
     platform: Platform,
     body: PsBlock,
-    threads_range: GpuThreadsRange | None,
+    indexing: GpuIndexing | None,
     function_name: str,
     target_spec: Target,
     jit: JitBase,
 ) -> GpuKernel:
     undef_symbols = collect_undefined_symbols(body)
 
-    if threads_range is not None:
-        for threads in threads_range.num_work_items:
-            undef_symbols |= collect_undefined_symbols(threads)
+    launch_grid_constraints = (
+        indexing.get_launch_grid_constraints()
+        if indexing is not None
+        else GpuLaunchGridConstraints()
+    )
 
     params = _get_function_params(ctx, undef_symbols)
     req_headers = _get_headers(ctx, platform, body)
 
     kfunc = GpuKernel(
         body,
-        threads_range,
+        launch_grid_constraints,
         target_spec,
         function_name,
         params,
@@ -507,17 +534,19 @@ def _get_function_params(
 ) -> list[Parameter]:
     params: list[Parameter] = []
 
-    from pystencils.backend.memory import BufferBasePtr
+    from pystencils.backend.memory import BufferBasePtr, BackendPrivateProperty
 
     for symb in symbols:
         props: set[PsSymbolProperty] = set()
         for prop in symb.properties:
             match prop:
-                case FieldShape() | FieldStride():
-                    props.add(prop)
                 case BufferBasePtr(buf):
                     field = ctx.find_field(buf.name)
                     props.add(FieldBasePtr(field))
+                case BackendPrivateProperty():
+                    pass
+                case _:
+                    props.add(prop)
         params.append(Parameter(symb.name, symb.get_dtype(), props))
 
     params.sort(key=lambda p: p.name)
diff --git a/src/pystencils/codegen/errors.py b/src/pystencils/codegen/errors.py
new file mode 100644
index 000000000..eceb53f61
--- /dev/null
+++ b/src/pystencils/codegen/errors.py
@@ -0,0 +1,2 @@
+class CodegenError(Exception):
+    """Exception that indicates a fatal error in the code generation driver."""
diff --git a/src/pystencils/codegen/lambdas.py b/src/pystencils/codegen/functions.py
similarity index 77%
rename from src/pystencils/codegen/lambdas.py
rename to src/pystencils/codegen/functions.py
index dd0fb571d..2779fa289 100644
--- a/src/pystencils/codegen/lambdas.py
+++ b/src/pystencils/codegen/functions.py
@@ -6,17 +6,26 @@ import numpy as np
 from .parameters import Parameter
 from ..types import PsType
 
+from ..backend.kernelcreation import KernelCreationContext
 from ..backend.ast.expressions import PsExpression
 
 
 class Lambda:
     """A one-line function emitted by the code generator as an auxiliary object."""
 
+    @staticmethod
+    def from_expression(ctx: KernelCreationContext, expr: PsExpression):
+        from ..backend.ast.analysis import collect_undefined_symbols
+        from .driver import _get_function_params
+
+        params = _get_function_params(ctx, collect_undefined_symbols(expr))
+        return Lambda(expr, params)
+
     def __init__(self, expr: PsExpression, params: Sequence[Parameter]):
         self._expr = expr
         self._params = tuple(params)
         self._return_type = expr.get_dtype()
-    
+
     @property
     def parameters(self) -> tuple[Parameter, ...]:
         """Parameters to this lambda"""
@@ -29,10 +38,11 @@ class Lambda:
 
     def __call__(self, **kwargs) -> np.generic:
         """Evaluate this lambda with the given arguments.
-        
+
         The lambda must receive a value for each parameter listed in `parameters`.
         """
         from ..backend.ast.expressions import evaluate_expression
+
         return evaluate_expression(self._expr, kwargs)
 
     def __str__(self) -> str:
@@ -41,5 +51,6 @@ class Lambda:
     def c_code(self) -> str:
         """Print the C code of this lambda"""
         from ..backend.emission import CAstPrinter
+
         printer = CAstPrinter()
         return printer(self._expr)
diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 1d9bf9c2c..2b84ef007 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -1,9 +1,21 @@
 from __future__ import annotations
 
+from abc import ABC, abstractmethod
+from typing import cast
 from itertools import chain
 
-from .lambdas import Lambda
+from .functions import Lambda
 from .parameters import Parameter
+from .properties import GpuBlockSize
+from .errors import CodegenError
+
+from ..backend.kernelcreation import (
+    KernelCreationContext,
+    FullIterationSpace,
+    SparseIterationSpace,
+)
+from ..backend.platforms.cuda import ThreadToIndexMapping
+from ..backend.ast.expressions import PsExpression
 
 
 _ConstraintTriple = tuple[Lambda | None, Lambda | None, Lambda | None]
@@ -11,7 +23,7 @@ _ConstraintTriple = tuple[Lambda | None, Lambda | None, Lambda | None]
 
 class GpuLaunchGridConstraints:
     """Constraints on the number of threads and blocks on the GPU launch grid for a given kernel.
-    
+
     This constraints set determines all or some of
     the number of threads on a GPU block as well as the number of blocks on the GPU grid,
     statically or depending on runtime parameters.
@@ -49,3 +61,93 @@ class GpuLaunchGridConstraints:
     def grid_size(self) -> _ConstraintTriple:
         """Constraints on the number of blocks on the grid"""
         return self._grid_size
+
+
+class GpuIndexing(ABC):
+    @abstractmethod
+    def get_thread_mapping(self) -> ThreadToIndexMapping | None: ...
+
+    @abstractmethod
+    def get_launch_grid_constraints(self) -> GpuLaunchGridConstraints: ...
+
+
+class Linear3DGpuIndexing(GpuIndexing):
+
+    def __init__(self, ctx: KernelCreationContext) -> None:
+        self._ctx = ctx
+
+        from ..backend.kernelcreation import AstFactory
+
+        self._factory = AstFactory(self._ctx)
+
+    def get_thread_mapping(self) -> ThreadToIndexMapping:
+        from ..backend.platforms.cuda import Linear3DMapping
+
+        return Linear3DMapping()
+
+    def get_launch_grid_constraints(self) -> GpuLaunchGridConstraints:
+        work_items = self._get_work_items()
+        rank = len(work_items)
+
+        from ..backend.constants import PsConstant
+        from ..backend.ast.expressions import PsExpression, PsIntDiv
+
+        block_size_constraints = [None] * rank + [
+            Lambda(self._factory.parse_index(1), ()) for _ in range(3 - rank)
+        ]
+
+        block_size_symbols = [
+            self._ctx.get_new_symbol(f"gpuBlockSize_{c}") for c in range(rank)
+        ]
+        for c, bs in enumerate(block_size_symbols):
+            bs.add_property(GpuBlockSize(c))
+
+        def div_ceil(a: PsExpression, b: PsExpression):
+            return self._factory.parse_index(
+                PsIntDiv(a + b - PsExpression.make(PsConstant(1)), b)
+            )
+
+        grid_size_constraints = [
+            Lambda.from_expression(
+                self._ctx, div_ceil(witems, PsExpression.make(bsize))
+            )
+            for witems, bsize in zip(work_items, block_size_symbols)
+        ] + [
+            Lambda.from_expression(self._ctx, self._factory.parse_index(1))
+            for _ in range(3 - rank)
+        ]
+
+        return GpuLaunchGridConstraints(
+            block_size=cast(_ConstraintTriple, tuple(block_size_constraints)),
+            grid_size=cast(_ConstraintTriple, tuple(grid_size_constraints)),
+        )
+
+    def _get_work_items(self) -> tuple[PsExpression, ...]:
+        ispace = self._ctx.get_iteration_space()
+        match ispace:
+            case FullIterationSpace():
+                dimensions = ispace.dimensions_in_loop_order()[::-1]
+                if len(dimensions) > 3:
+                    raise NotImplementedError(
+                        f"Cannot create a GPU threads range for an {len(dimensions)}-dimensional iteration space"
+                    )
+
+                from ..backend.ast.analysis import collect_undefined_symbols as collect
+
+                for i, dim in enumerate(dimensions):
+                    symbs = collect(dim.start) | collect(dim.stop) | collect(dim.step)
+                    for ctr in ispace.counters:
+                        if ctr in symbs:
+                            raise CodegenError(
+                                "Unable to construct GPU launch grid constraints for this kernel: "
+                                f"Limits in dimension {i} "
+                                f"depend on another dimension's counter {ctr.name}"
+                            )
+
+                return tuple(ispace.actual_iterations(dim) for dim in dimensions)
+
+            case SparseIterationSpace():
+                return (self._factory.parse_index(ispace.index_list.shape[0]),)
+
+            case _:
+                assert False, "unexpected iteration space"
diff --git a/src/pystencils/codegen/kernel.py b/src/pystencils/codegen/kernel.py
index 3adc47876..8038f24b0 100644
--- a/src/pystencils/codegen/kernel.py
+++ b/src/pystencils/codegen/kernel.py
@@ -6,8 +6,8 @@ from itertools import chain
 
 from .target import Target
 from .parameters import Parameter
+from .gpu_indexing import GpuLaunchGridConstraints
 from ..backend.ast.structural import PsBlock
-from ..backend.ast.expressions import PsExpression
 from ..field import Field
 
 from .._deprecation import _deprecated
@@ -118,7 +118,7 @@ class GpuKernel(Kernel):
     def __init__(
         self,
         body: PsBlock,
-        threads_range: GpuThreadsRange | None,
+        launch_grid_constraints: GpuLaunchGridConstraints,
         target: Target,
         name: str,
         parameters: Sequence[Parameter],
@@ -126,46 +126,9 @@ class GpuKernel(Kernel):
         jit: JitBase,
     ):
         super().__init__(body, target, name, parameters, required_headers, jit)
-        self._threads_range = threads_range
+        self._launch_grid_constraints = launch_grid_constraints
 
     @property
-    def threads_range(self) -> GpuThreadsRange | None:
+    def launch_grid_constraints(self) -> GpuLaunchGridConstraints:
         """Object exposing the total size of the launch grid this kernel expects to be executed with."""
-        return self._threads_range
-
-
-class GpuThreadsRange:
-    """Number of threads required by a GPU kernel, in order (x, y, z)."""
-
-    def __init__(
-        self,
-        num_work_items: Sequence[PsExpression],
-    ):
-        self._dim = len(num_work_items)
-        self._num_work_items = tuple(num_work_items)
-
-    # @property
-    # def grid_size(self) -> tuple[PsExpression, ...]:
-    #     return self._grid_size
-
-    # @property
-    # def block_size(self) -> tuple[PsExpression, ...]:
-    #     return self._block_size
-
-    @property
-    def num_work_items(self) -> tuple[PsExpression, ...]:
-        """Number of work items in (x, y, z)-order."""
-        return self._num_work_items
-
-    @property
-    def dim(self) -> int:
-        return self._dim
-
-    def __str__(self) -> str:
-        rep = "GpuThreadsRange { "
-        rep += "; ".join(f"{x}: {w}" for x, w in zip("xyz", self._num_work_items))
-        rep += " }"
-        return rep
-
-    def _repr_html_(self) -> str:
-        return str(self)
+        return self._launch_grid_constraints
diff --git a/src/pystencils/codegen/properties.py b/src/pystencils/codegen/properties.py
index d377fb3d3..df76489db 100644
--- a/src/pystencils/codegen/properties.py
+++ b/src/pystencils/codegen/properties.py
@@ -39,3 +39,8 @@ class FieldBasePtr(UniqueSymbolProperty):
 
 FieldProperty = FieldShape | FieldStride | FieldBasePtr
 _FieldProperty = (FieldShape, FieldStride, FieldBasePtr)
+
+
+@dataclass(frozen=True)
+class GpuBlockSize(UniqueSymbolProperty):
+    coordinate: int
diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py
index a407bb75e..afdbd5097 100644
--- a/src/pystencils/jit/gpu_cupy.py
+++ b/src/pystencils/jit/gpu_cupy.py
@@ -40,7 +40,7 @@ class CupyKernelWrapper(KernelWrapper):
         self._kfunc: GpuKernel = kfunc
         self._raw_kernel = raw_kernel
         self._block_size = block_size
-        self._num_blocks: tuple[int, int, int] | None = None
+        self._grid_size: tuple[int, int, int] | None = None
         self._args_cache: dict[Any, tuple] = dict()
 
     @property
@@ -61,11 +61,11 @@ class CupyKernelWrapper(KernelWrapper):
 
     @property
     def num_blocks(self) -> tuple[int, int, int] | None:
-        return self._num_blocks
+        return self._grid_size
 
     @num_blocks.setter
     def num_blocks(self, nb: tuple[int, int, int] | None):
-        self._num_blocks = nb
+        self._grid_size = nb
 
     def __call__(self, **kwargs: Any):
         kernel_args, launch_grid = self._get_cached_args(**kwargs)
@@ -80,7 +80,9 @@ class CupyKernelWrapper(KernelWrapper):
         return devices.pop()
 
     def _get_cached_args(self, **kwargs):
-        key = (self._block_size, self._num_blocks) + tuple((k, id(v)) for k, v in kwargs.items())
+        key = (self._block_size, self._grid_size) + tuple(
+            (k, id(v)) for k, v in kwargs.items()
+        )
 
         if key not in self._args_cache:
             args = self._get_args(**kwargs)
@@ -164,6 +166,7 @@ class CupyKernelWrapper(KernelWrapper):
                             elem_dtype: PsType
 
                             from .. import DynamicType
+
                             if isinstance(field.dtype, DynamicType):
                                 assert isinstance(kparam.dtype, PsPointerType)
                                 elem_dtype = kparam.dtype.base_type
@@ -199,42 +202,48 @@ class CupyKernelWrapper(KernelWrapper):
                 add_arg(kparam.name, val, kparam.dtype)
 
         #   Determine launch grid
-        from ..backend.ast.expressions import evaluate_expression
-
-        symbolic_threads_range = self._kfunc.threads_range
 
-        if self._num_blocks is not None:
-            launch_grid = LaunchGrid(self._num_blocks, self._block_size)
+        from ..codegen.gpu_indexing import GpuBlockSize
 
-        elif symbolic_threads_range is not None:
-            threads_range: list[int] = [
-                evaluate_expression(expr, valuation)
-                for expr in symbolic_threads_range.num_work_items
-            ]
+        constraints = self._kfunc.launch_grid_constraints
 
-            if symbolic_threads_range.dim < 3:
-                threads_range += [1] * (3 - symbolic_threads_range.dim)
-
-            def div_ceil(a, b):
-                return a // b if a % b == 0 else a // b + 1
-
-            #   TODO: Refine this?
-            num_blocks = tuple(
-                div_ceil(threads, tpb)
-                for threads, tpb in zip(threads_range, self._block_size)
+        for cparam in constraints.parameters:
+            for prop in cparam.properties:
+                match prop:
+                    case GpuBlockSize(coord):
+                        valuation[cparam.name] = self._block_size[coord]
+                        break
+            else:
+                valuation[cparam.name] = kwargs[cparam.name]
+
+        # launch_block_size: list[int] = []
+        # for coord, (bsize_constr, user_bsize) in enumerate(
+        #     zip(constraints.block_size, self._block_size)
+        # ):
+        #     if bsize_constr is None:
+        #         launch_grid_size
+
+        launch_block_size = [
+            (
+                int(bsize_constr(**valuation))
+                if bsize_constr is not None
+                else self._block_size[coord]
             )
-            assert len(num_blocks) == 3
-
-            launch_grid = LaunchGrid(num_blocks, self._block_size)
-
-        else:
-            raise JitError(
-                "Unable to determine launch grid for GPU kernel invocation: "
-                "No manual grid size was specified, and the number of threads could not "
-                "be determined automatically."
+            for coord, bsize_constr in enumerate(constraints.block_size)
+        ]
+
+        launch_grid_size = [
+            (
+                int(gsize_constr(**valuation))
+                if gsize_constr is not None
+                else self._grid_size[coord]
             )
+            for coord, gsize_constr in enumerate(constraints.grid_size)
+        ]
 
-        return tuple(args), launch_grid
+        return tuple(args), LaunchGrid(
+            tuple(launch_grid_size), tuple(launch_block_size)
+        )
 
 
 class CupyJit(JitBase):
diff --git a/tests/nbackend/kernelcreation/platform/test_gpu_platforms.py b/tests/nbackend/kernelcreation/platform/test_gpu_platforms.py
deleted file mode 100644
index da2b3a5ad..000000000
--- a/tests/nbackend/kernelcreation/platform/test_gpu_platforms.py
+++ /dev/null
@@ -1,43 +0,0 @@
-import pytest
-
-from pystencils.field import Field
-
-from pystencils.backend.kernelcreation import (
-    KernelCreationContext,
-    FullIterationSpace
-)
-
-from pystencils.backend.ast.structural import PsBlock, PsComment
-
-from pystencils.backend.platforms import CudaPlatform, SyclPlatform
-
-
-@pytest.mark.parametrize("layout", ["fzyx", "zyxf", "c", "f"])
-@pytest.mark.parametrize("platform_class", [CudaPlatform, SyclPlatform])
-def test_thread_range(platform_class, layout):
-    ctx = KernelCreationContext()
-
-    body = PsBlock([PsComment("Kernel body goes here")])
-    platform = platform_class(ctx)
-
-    dim = 3
-    archetype_field = Field.create_generic("field", spatial_dimensions=dim, layout=layout)
-    ispace = FullIterationSpace.create_with_ghost_layers(ctx, 1, archetype_field)
-
-    _, threads_range = platform.materialize_iteration_space(body, ispace)
-
-    assert threads_range.dim == dim
-    
-    match layout:
-        case "fzyx" | "zyxf" | "f":
-            indexing_order = [0, 1, 2]
-        case "c":
-            indexing_order = [2, 1, 0]
-
-    for i in range(dim):
-        #   Slowest to fastest coordinate
-        coordinate = indexing_order[i]
-        dimension = ispace.dimensions[coordinate]
-        witems = threads_range.num_work_items[i]
-        desired = dimension.stop - dimension.start
-        assert witems.structurally_equal(desired)
-- 
GitLab


From d2dd3dfaa23eed86cf5b980cf2a3dfff0b42ae4a Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Thu, 13 Feb 2025 11:05:18 +0100
Subject: [PATCH 04/17] also use ThreadIdxMapping for sparse kernels

---
 src/pystencils/backend/platforms/cuda.py | 88 ++++++++++++++++--------
 1 file changed, 59 insertions(+), 29 deletions(-)

diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index cff6f935f..122011eb0 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -49,7 +49,7 @@ GRID_DIM = [
 class ThreadToIndexMapping(ABC):
 
     @abstractmethod
-    def __call__(self, ispace: FullIterationSpace) -> dict[PsSymbol, PsExpression]:
+    def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]:
         """Map the current thread index onto a point in the given iteration space.
 
         Implementations of this method must return a declaration for each dimension counter
@@ -61,7 +61,18 @@ class Linear3DMapping(ThreadToIndexMapping):
     """3D globally linearized mapping, where each thread is assigned a work item according to
     its location in the global launch grid."""
 
-    def __call__(self, ispace: FullIterationSpace) -> dict[PsSymbol, PsExpression]:
+    def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]:
+        match ispace:
+            case FullIterationSpace():
+                return self._dense_mapping(ispace)
+            case SparseIterationSpace():
+                return self._sparse_mapping(ispace)
+            case _:
+                assert False, "unexpected iteration space"
+
+    def _dense_mapping(
+        self, ispace: FullIterationSpace
+    ) -> dict[PsSymbol, PsExpression]:
         if ispace.rank > 3:
             raise MaterializationError(
                 f"Cannot handle {ispace.rank}-dimensional iteration space "
@@ -79,6 +90,18 @@ class Linear3DMapping(ThreadToIndexMapping):
 
         return idx_map
 
+    def _sparse_mapping(
+        self, ispace: SparseIterationSpace
+    ) -> dict[PsSymbol, PsExpression]:
+        sparse_ctr = PsExpression.make(ispace.sparse_counter)
+        thread_idx = self._linear_thread_idx(0)
+        idx_map: dict[PsSymbol, PsExpression] = {
+            ispace.sparse_counter: PsCast(
+                deconstify(sparse_ctr.get_dtype()), thread_idx
+            )
+        }
+        return idx_map
+
     def _linear_thread_idx(self, coord: int):
         block_size = BLOCK_DIM[coord]
         block_idx = BLOCK_IDX[coord]
@@ -97,7 +120,18 @@ class Blockwise4DMapping(ThreadToIndexMapping):
         THREAD_IDX[0],
     ]
 
-    def __call__(self, ispace: FullIterationSpace) -> dict[PsSymbol, PsExpression]:
+    def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]:
+        match ispace:
+            case FullIterationSpace():
+                return self._dense_mapping(ispace)
+            case SparseIterationSpace():
+                return self._sparse_mapping(ispace)
+            case _:
+                assert False, "unexpected iteration space"
+
+    def _dense_mapping(
+        self, ispace: FullIterationSpace
+    ) -> dict[PsSymbol, PsExpression]:
         if ispace.rank > 4:
             raise MaterializationError(
                 f"Cannot handle {ispace.rank}-dimensional iteration space "
@@ -114,6 +148,18 @@ class Blockwise4DMapping(ThreadToIndexMapping):
 
         return idx_map
 
+    def _sparse_mapping(
+        self, ispace: SparseIterationSpace
+    ) -> dict[PsSymbol, PsExpression]:
+        sparse_ctr = PsExpression.make(ispace.sparse_counter)
+        thread_idx = self._indices_in_loop_order[-1]
+        idx_map: dict[PsSymbol, PsExpression] = {
+            ispace.sparse_counter: PsCast(
+                deconstify(sparse_ctr.get_dtype()), thread_idx
+            )
+        }
+        return idx_map
+
 
 class CudaPlatform(GenericGpu):
     """Platform for CUDA-based GPUs."""
@@ -127,7 +173,9 @@ class CudaPlatform(GenericGpu):
         super().__init__(ctx)
 
         self._omit_range_check = omit_range_check
-        self._thread_mapping = thread_mapping
+        self._thread_mapping = (
+            thread_mapping if thread_mapping is not None else Linear3DMapping()
+        )
 
         self._typify = Typifier(ctx)
 
@@ -212,26 +260,7 @@ class CudaPlatform(GenericGpu):
     ) -> PsBlock:
         dimensions = ispace.dimensions_in_loop_order()
 
-        #   TODO move to codegen
-        # if not self._manual_launch_grid:
-        #     try:
-        #         threads_range = self.threads_from_ispace(ispace)
-        #     except MaterializationError as e:
-        #         warn(
-        #             str(e.args[0])
-        #             + "\nIf this is intended, set `manual_launch_grid=True` in the code generator configuration.",
-        #             UserWarning,
-        #         )
-        #         threads_range = None
-        # else:
-        #     threads_range = None
-
-        idx_mapper = (
-            self._thread_mapping
-            if self._thread_mapping is not None
-            else Linear3DMapping()
-        )
-        ctr_mapping = idx_mapper(ispace)
+        ctr_mapping = self._thread_mapping(ispace)
 
         indexing_decls = []
         conds = []
@@ -264,10 +293,11 @@ class CudaPlatform(GenericGpu):
         factory = AstFactory(self._ctx)
         ispace.sparse_counter.dtype = constify(ispace.sparse_counter.get_dtype())
 
-        sparse_ctr = PsExpression.make(ispace.sparse_counter)
-        thread_idx = BLOCK_IDX[0] * BLOCK_DIM[0] + THREAD_IDX[0]
+        sparse_ctr_expr = PsExpression.make(ispace.sparse_counter)
+        ctr_mapping = self._thread_mapping(ispace)
+
         sparse_idx_decl = self._typify(
-            PsDeclaration(sparse_ctr, PsCast(sparse_ctr.get_dtype(), thread_idx))
+            PsDeclaration(sparse_ctr_expr, ctr_mapping[ispace.sparse_counter])
         )
 
         mappings = [
@@ -276,7 +306,7 @@ class CudaPlatform(GenericGpu):
                 PsLookup(
                     PsBufferAcc(
                         ispace.index_list.base_pointer,
-                        (sparse_ctr, factory.parse_index(0)),
+                        (sparse_ctr_expr.clone(), factory.parse_index(0)),
                     ),
                     coord.name,
                 ),
@@ -287,7 +317,7 @@ class CudaPlatform(GenericGpu):
 
         if not self._omit_range_check:
             stop = PsExpression.make(ispace.index_list.shape[0])
-            condition = PsLt(sparse_ctr, stop)
+            condition = PsLt(sparse_ctr_expr.clone(), stop)
             ast = PsBlock([sparse_idx_decl, PsConditional(condition, body)])
         else:
             body.statements = [sparse_idx_decl] + body.statements
-- 
GitLab


From e67b5e238695099ab05b114054cd31b4ccd07d9e Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Thu, 13 Feb 2025 22:01:16 +0100
Subject: [PATCH 05/17] Introduce launch config factory pattern

 - update GpuKernel to receive a launch config factory.
 - Update gpu-indexing to provide one.
 - Update cupy-jit to expose and evaluate the launch config
---
 src/pystencils/backend/platforms/platform.py |   3 +-
 src/pystencils/codegen/driver.py             |  33 ++----
 src/pystencils/codegen/gpu_indexing.py       |  90 +++++++++-------
 src/pystencils/codegen/kernel.py             |  11 +-
 src/pystencils/codegen/properties.py         |   5 -
 src/pystencils/jit/gpu_cupy.py               | 104 +++++++------------
 6 files changed, 108 insertions(+), 138 deletions(-)

diff --git a/src/pystencils/backend/platforms/platform.py b/src/pystencils/backend/platforms/platform.py
index 2c7ee1c5f..9b7e642b5 100644
--- a/src/pystencils/backend/platforms/platform.py
+++ b/src/pystencils/backend/platforms/platform.py
@@ -1,5 +1,4 @@
 from abc import ABC, abstractmethod
-from typing import Any
 
 from ..ast.structural import PsBlock
 from ..ast.expressions import PsCall, PsExpression
@@ -28,7 +27,7 @@ class Platform(ABC):
     @abstractmethod
     def materialize_iteration_space(
         self, body: PsBlock, ispace: IterationSpace
-    ) -> PsBlock | tuple[PsBlock, Any]:
+    ) -> PsBlock:
         pass
 
     @abstractmethod
diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index 152fceba8..b14fc0272 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -1,5 +1,5 @@
 from __future__ import annotations
-from typing import cast, Sequence, Iterable, TYPE_CHECKING
+from typing import cast, Sequence, Iterable, Callable, TYPE_CHECKING
 from dataclasses import dataclass, replace
 
 from .target import Target
@@ -15,7 +15,7 @@ from .config import (
 from .kernel import Kernel, GpuKernel
 from .properties import PsSymbolProperty, FieldBasePtr
 from .parameters import Parameter
-from .gpu_indexing import GpuIndexing, GpuLaunchGridConstraints
+from .gpu_indexing import GpuIndexing, GpuLaunchConfiguration
 
 from ..field import Field
 from ..types import PsIntegerType, PsScalarType
@@ -40,7 +40,6 @@ from ..backend.platforms import (
     Platform,
     GenericCpu,
     GenericVectorCpu,
-    GenericGpu,
 )
 from ..backend.exceptions import VectorizationError
 
@@ -166,17 +165,9 @@ class DefaultKernelCreationDriver:
     ) -> Kernel:
         kernel_body = self.parse_kernel_body(assignments)
 
-        match self._platform:
-            case GenericCpu():
-                kernel_ast = self._platform.materialize_iteration_space(
-                    kernel_body, self._ctx.get_iteration_space()
-                )
-            case GenericGpu():
-                kernel_ast = self._platform.materialize_iteration_space(
-                    kernel_body, self._ctx.get_iteration_space()
-                )
-            case _:
-                assert False, "unexpected platform"
+        kernel_ast = self._platform.materialize_iteration_space(
+            kernel_body, self._ctx.get_iteration_space()
+        )
 
         if self._intermediates is not None:
             self._intermediates.materialized_ispace = kernel_ast.clone()
@@ -220,14 +211,16 @@ class DefaultKernelCreationDriver:
                 self._cfg.get_jit(),
             )
         else:
+            assert self._gpu_indexing is not None
+
             return create_gpu_kernel_function(
                 self._ctx,
                 self._platform,
                 kernel_ast,
-                self._gpu_indexing,
                 self._cfg.get_option("function_name"),
                 self._target,
                 self._cfg.get_jit(),
+                self._gpu_indexing.get_launch_config,
             )
 
     def parse_kernel_body(
@@ -500,30 +493,24 @@ def create_gpu_kernel_function(
     ctx: KernelCreationContext,
     platform: Platform,
     body: PsBlock,
-    indexing: GpuIndexing | None,
     function_name: str,
     target_spec: Target,
     jit: JitBase,
+    launch_config_factory: Callable[[GpuKernel], GpuLaunchConfiguration],
 ) -> GpuKernel:
     undef_symbols = collect_undefined_symbols(body)
 
-    launch_grid_constraints = (
-        indexing.get_launch_grid_constraints()
-        if indexing is not None
-        else GpuLaunchGridConstraints()
-    )
-
     params = _get_function_params(ctx, undef_symbols)
     req_headers = _get_headers(ctx, platform, body)
 
     kfunc = GpuKernel(
         body,
-        launch_grid_constraints,
         target_spec,
         function_name,
         params,
         req_headers,
         jit,
+        launch_config_factory,
     )
     kfunc.metadata.update(ctx.metadata)
     return kfunc
diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 2b84ef007..08134a622 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -1,12 +1,12 @@
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
-from typing import cast
+from typing import cast, Any
 from itertools import chain
 
 from .functions import Lambda
+from .kernel import GpuKernel
 from .parameters import Parameter
-from .properties import GpuBlockSize
 from .errors import CodegenError
 
 from ..backend.kernelcreation import (
@@ -18,34 +18,30 @@ from ..backend.platforms.cuda import ThreadToIndexMapping
 from ..backend.ast.expressions import PsExpression
 
 
-_ConstraintTriple = tuple[Lambda | None, Lambda | None, Lambda | None]
+_Dim3Lambda = tuple[Lambda, Lambda, Lambda]
 
 
-class GpuLaunchGridConstraints:
-    """Constraints on the number of threads and blocks on the GPU launch grid for a given kernel.
+class GpuLaunchConfiguration:
+    """Base class for launch configurations for CUDA and HIP kernels.
 
-    This constraints set determines all or some of
-    the number of threads on a GPU block as well as the number of blocks on the GPU grid,
-    statically or depending on runtime parameters.
+    Args:
+        block_size: A triple of lambdas determining the GPU block size
+        grid_size: A triple of lambdas determining the GPU grid size
+        config_parameters: Set containing all parameters to the given lambdas that are not also
+            parameters to the associated kernel
     """
 
     def __init__(
         self,
-        block_size: _ConstraintTriple | None = None,
-        grid_size: _ConstraintTriple | None = None,
+        block_size: _Dim3Lambda,
+        grid_size: _Dim3Lambda,
+        config_parameters: set[Parameter],
     ) -> None:
-        self._block_size: _ConstraintTriple = (
-            (None, None, None) if block_size is None else block_size
-        )
-        self._grid_size: _ConstraintTriple = (
-            (None, None, None) if grid_size is None else grid_size
-        )
+        self._block_size = block_size
+        self._grid_size = grid_size
 
-        params = set()
-        for constr in chain(self._block_size, self._grid_size):
-            if constr is not None:
-                params |= set(constr.parameters)
-        self._params = frozenset(params)
+        self._params = frozenset(config_parameters)
+        self._valuation: dict[Parameter, Any] = dict()
 
     @property
     def parameters(self) -> frozenset[Parameter]:
@@ -53,12 +49,18 @@ class GpuLaunchGridConstraints:
         return self._params
 
     @property
-    def block_size(self) -> _ConstraintTriple:
+    def parameter_values(self) -> dict[Parameter, Any]:
+        """Values for all parameters that are specific to the launch grid configuration and not
+        also kernel parameters."""
+        return self._valuation
+
+    @property
+    def block_size(self) -> _Dim3Lambda:
         """Constraints on the number of threads per block"""
         return self._block_size
 
     @property
-    def grid_size(self) -> _ConstraintTriple:
+    def grid_size(self) -> _Dim3Lambda:
         """Constraints on the number of blocks on the grid"""
         return self._grid_size
 
@@ -68,7 +70,7 @@ class GpuIndexing(ABC):
     def get_thread_mapping(self) -> ThreadToIndexMapping | None: ...
 
     @abstractmethod
-    def get_launch_grid_constraints(self) -> GpuLaunchGridConstraints: ...
+    def get_launch_config(self, kernel: GpuKernel) -> GpuLaunchConfiguration: ...
 
 
 class Linear3DGpuIndexing(GpuIndexing):
@@ -85,29 +87,48 @@ class Linear3DGpuIndexing(GpuIndexing):
 
         return Linear3DMapping()
 
-    def get_launch_grid_constraints(self) -> GpuLaunchGridConstraints:
+    def get_launch_config(self, kernel: GpuKernel) -> GpuLaunchConfiguration:
+        block_size, grid_size = self._prepare_launch_grid()
+
+        kernel_params = set(kernel.parameters)
+        launch_config_params = (
+            set().union(
+                *(lb.parameters for lb in chain(block_size, grid_size))
+            )
+            - kernel_params
+        )
+
+        return GpuLaunchConfiguration(
+            block_size=cast(_Dim3Lambda, tuple(block_size)),
+            grid_size=cast(_Dim3Lambda, tuple(grid_size)),
+            config_parameters=launch_config_params,
+        )
+
+    def _prepare_launch_grid(self):
         work_items = self._get_work_items()
         rank = len(work_items)
 
         from ..backend.constants import PsConstant
         from ..backend.ast.expressions import PsExpression, PsIntDiv
 
-        block_size_constraints = [None] * rank + [
-            Lambda(self._factory.parse_index(1), ()) for _ in range(3 - rank)
-        ]
-
         block_size_symbols = [
             self._ctx.get_new_symbol(f"gpuBlockSize_{c}") for c in range(rank)
         ]
-        for c, bs in enumerate(block_size_symbols):
-            bs.add_property(GpuBlockSize(c))
+
+        block_size = [
+            Lambda.from_expression(self._ctx, self._factory.parse_index(bs_symb))
+            for bs_symb in block_size_symbols
+        ] + [
+            Lambda.from_expression(self._ctx, self._factory.parse_index(1))
+            for _ in range(3 - rank)
+        ]
 
         def div_ceil(a: PsExpression, b: PsExpression):
             return self._factory.parse_index(
                 PsIntDiv(a + b - PsExpression.make(PsConstant(1)), b)
             )
 
-        grid_size_constraints = [
+        grid_size = [
             Lambda.from_expression(
                 self._ctx, div_ceil(witems, PsExpression.make(bsize))
             )
@@ -117,10 +138,7 @@ class Linear3DGpuIndexing(GpuIndexing):
             for _ in range(3 - rank)
         ]
 
-        return GpuLaunchGridConstraints(
-            block_size=cast(_ConstraintTriple, tuple(block_size_constraints)),
-            grid_size=cast(_ConstraintTriple, tuple(grid_size_constraints)),
-        )
+        return block_size, grid_size
 
     def _get_work_items(self) -> tuple[PsExpression, ...]:
         ispace = self._ctx.get_iteration_space()
diff --git a/src/pystencils/codegen/kernel.py b/src/pystencils/codegen/kernel.py
index 8038f24b0..67ef6554c 100644
--- a/src/pystencils/codegen/kernel.py
+++ b/src/pystencils/codegen/kernel.py
@@ -6,7 +6,6 @@ from itertools import chain
 
 from .target import Target
 from .parameters import Parameter
-from .gpu_indexing import GpuLaunchGridConstraints
 from ..backend.ast.structural import PsBlock
 from ..field import Field
 
@@ -14,6 +13,7 @@ from .._deprecation import _deprecated
 
 if TYPE_CHECKING:
     from ..jit import JitBase
+    from .gpu_indexing import GpuLaunchConfiguration
 
 
 class Kernel:
@@ -118,17 +118,16 @@ class GpuKernel(Kernel):
     def __init__(
         self,
         body: PsBlock,
-        launch_grid_constraints: GpuLaunchGridConstraints,
         target: Target,
         name: str,
         parameters: Sequence[Parameter],
         required_headers: set[str],
         jit: JitBase,
+        launch_config_factory: Callable[[GpuKernel], GpuLaunchConfiguration],
     ):
         super().__init__(body, target, name, parameters, required_headers, jit)
-        self._launch_grid_constraints = launch_grid_constraints
+        self._launch_config_factory = launch_config_factory
 
-    @property
-    def launch_grid_constraints(self) -> GpuLaunchGridConstraints:
+    def get_launch_configuration(self) -> GpuLaunchConfiguration:
         """Object exposing the total size of the launch grid this kernel expects to be executed with."""
-        return self._launch_grid_constraints
+        return self._launch_config_factory(self)
diff --git a/src/pystencils/codegen/properties.py b/src/pystencils/codegen/properties.py
index df76489db..d377fb3d3 100644
--- a/src/pystencils/codegen/properties.py
+++ b/src/pystencils/codegen/properties.py
@@ -39,8 +39,3 @@ class FieldBasePtr(UniqueSymbolProperty):
 
 FieldProperty = FieldShape | FieldStride | FieldBasePtr
 _FieldProperty = (FieldShape, FieldStride, FieldBasePtr)
-
-
-@dataclass(frozen=True)
-class GpuBlockSize(UniqueSymbolProperty):
-    coordinate: int
diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py
index afdbd5097..f3f834f76 100644
--- a/src/pystencils/jit/gpu_cupy.py
+++ b/src/pystencils/jit/gpu_cupy.py
@@ -18,6 +18,7 @@ from ..codegen import (
     GpuKernel,
     Parameter,
 )
+from ..codegen.gpu_indexing import GpuLaunchConfiguration
 from ..codegen.properties import FieldShape, FieldStride, FieldBasePtr
 from ..types import PsStructType, PsPointerType
 
@@ -35,38 +36,24 @@ class CupyKernelWrapper(KernelWrapper):
         self,
         kfunc: GpuKernel,
         raw_kernel: Any,
-        block_size: tuple[int, int, int],
     ):
         self._kfunc: GpuKernel = kfunc
+        self._launch_config = kfunc.get_launch_configuration()
         self._raw_kernel = raw_kernel
-        self._block_size = block_size
-        self._grid_size: tuple[int, int, int] | None = None
         self._args_cache: dict[Any, tuple] = dict()
 
     @property
     def kernel_function(self) -> GpuKernel:
         return self._kfunc
+    
+    @property
+    def launch_config(self) -> GpuLaunchConfiguration:
+        return self._launch_config
 
     @property
     def raw_kernel(self):
         return self._raw_kernel
 
-    @property
-    def block_size(self) -> tuple[int, int, int]:
-        return self._block_size
-
-    @block_size.setter
-    def block_size(self, bs: tuple[int, int, int]):
-        self._block_size = bs
-
-    @property
-    def num_blocks(self) -> tuple[int, int, int] | None:
-        return self._grid_size
-
-    @num_blocks.setter
-    def num_blocks(self, nb: tuple[int, int, int] | None):
-        self._grid_size = nb
-
     def __call__(self, **kwargs: Any):
         kernel_args, launch_grid = self._get_cached_args(**kwargs)
         device = self._get_device(kernel_args)
@@ -80,9 +67,10 @@ class CupyKernelWrapper(KernelWrapper):
         return devices.pop()
 
     def _get_cached_args(self, **kwargs):
-        key = (self._block_size, self._grid_size) + tuple(
-            (k, id(v)) for k, v in kwargs.items()
-        )
+        launch_config_params = self._launch_config.parameter_values
+        key = tuple(
+            (k, v) for k, v in launch_config_params.items()
+        ) + tuple((k, id(v)) for k, v in kwargs.items())
 
         if key not in self._args_cache:
             args = self._get_args(**kwargs)
@@ -203,48 +191,32 @@ class CupyKernelWrapper(KernelWrapper):
 
         #   Determine launch grid
 
-        from ..codegen.gpu_indexing import GpuBlockSize
-
-        constraints = self._kfunc.launch_grid_constraints
+        launch_cfg_valuation = valuation.copy()
+        launch_cfg_valuation.update(
+            {
+                param.name: value
+                for param, value in self._launch_config.parameter_values.items()
+            }
+        )
 
-        for cparam in constraints.parameters:
-            for prop in cparam.properties:
-                match prop:
-                    case GpuBlockSize(coord):
-                        valuation[cparam.name] = self._block_size[coord]
-                        break
-            else:
-                valuation[cparam.name] = kwargs[cparam.name]
-
-        # launch_block_size: list[int] = []
-        # for coord, (bsize_constr, user_bsize) in enumerate(
-        #     zip(constraints.block_size, self._block_size)
-        # ):
-        #     if bsize_constr is None:
-        #         launch_grid_size
-
-        launch_block_size = [
-            (
-                int(bsize_constr(**valuation))
-                if bsize_constr is not None
-                else self._block_size[coord]
-            )
-            for coord, bsize_constr in enumerate(constraints.block_size)
-        ]
-
-        launch_grid_size = [
-            (
-                int(gsize_constr(**valuation))
-                if gsize_constr is not None
-                else self._grid_size[coord]
-            )
-            for coord, gsize_constr in enumerate(constraints.grid_size)
-        ]
+        block_size = cast(
+            tuple[int, int, int],
+            tuple(
+                int(component(**launch_cfg_valuation))
+                for component in self._launch_config.block_size
+            ),
+        )
 
-        return tuple(args), LaunchGrid(
-            tuple(launch_grid_size), tuple(launch_block_size)
+        grid_size = cast(
+            tuple[int, int, int],
+            tuple(
+                int(component(**launch_cfg_valuation))
+                for component in self._launch_config.grid_size
+            ),
         )
 
+        return tuple(args), LaunchGrid(grid_size, block_size)
+
 
 class CupyJit(JitBase):
 
@@ -261,26 +233,26 @@ class CupyJit(JitBase):
             tuple(default_block_size) + (1,) * (3 - len(default_block_size)),
         )
 
-    def compile(self, kfunc: Kernel) -> KernelWrapper:
+    def compile(self, kernel: Kernel) -> KernelWrapper:
         if not HAVE_CUPY:
             raise JitError(
                 "`cupy` is not installed: just-in-time-compilation of CUDA kernels is unavailable."
             )
 
-        if not isinstance(kfunc, GpuKernel) or kfunc.target != Target.CUDA:
+        if not isinstance(kernel, GpuKernel) or kernel.target != Target.CUDA:
             raise ValueError(
                 "The CupyJit just-in-time compiler only accepts kernels generated for CUDA or HIP"
             )
 
         options = self._compiler_options()
-        prelude = self._prelude(kfunc)
-        kernel_code = self._kernel_code(kfunc)
+        prelude = self._prelude(kernel)
+        kernel_code = self._kernel_code(kernel)
         code = prelude + kernel_code
 
         raw_kernel = cp.RawKernel(
-            code, kfunc.name, options=options, backend="nvrtc", jitify=True
+            code, kernel.name, options=options, backend="nvrtc", jitify=True
         )
-        return CupyKernelWrapper(kfunc, raw_kernel, self._default_block_size)
+        return CupyKernelWrapper(kernel, raw_kernel)
 
     def _compiler_options(self) -> tuple[str, ...]:
         options = ["-w", "-std=c++11"]
-- 
GitLab


From 210f768a5a5ffdab3531361a45bada18ce699358 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Fri, 14 Feb 2025 12:35:42 +0100
Subject: [PATCH 06/17] WIP refactor launch configuration and gpu indexing

---
 src/pystencils/codegen/config.py       |   2 +-
 src/pystencils/codegen/driver.py       |  35 +++--
 src/pystencils/codegen/gpu_indexing.py | 208 +++++++++++++++++++++----
 src/pystencils/jit/gpu_cupy.py         |   8 +-
 4 files changed, 200 insertions(+), 53 deletions(-)

diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py
index 53a271852..47a64df64 100644
--- a/src/pystencils/codegen/config.py
+++ b/src/pystencils/codegen/config.py
@@ -380,7 +380,7 @@ class GpuOptions(ConfigBase):
     This check can be discarded through this option, at your own peril.
     """
 
-    block_size: BasicOption[tuple[int, int, int]] = BasicOption()
+    block_size: BasicOption[tuple[int, int, int] | _AUTO_TYPE] = BasicOption(AUTO)
     """Desired block size for the execution of GPU kernels. May be overridden later by the runtime system."""
 
     manual_launch_grid: BasicOption[bool] = BasicOption(False)
diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index b14fc0272..f7eb8ddb4 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -516,26 +516,27 @@ def create_gpu_kernel_function(
     return kfunc
 
 
-def _get_function_params(
-    ctx: KernelCreationContext, symbols: Iterable[PsSymbol]
-) -> list[Parameter]:
-    params: list[Parameter] = []
-
+def _symbol_to_param(ctx: KernelCreationContext, symbol: PsSymbol):
     from pystencils.backend.memory import BufferBasePtr, BackendPrivateProperty
 
-    for symb in symbols:
-        props: set[PsSymbolProperty] = set()
-        for prop in symb.properties:
-            match prop:
-                case BufferBasePtr(buf):
-                    field = ctx.find_field(buf.name)
-                    props.add(FieldBasePtr(field))
-                case BackendPrivateProperty():
-                    pass
-                case _:
-                    props.add(prop)
-        params.append(Parameter(symb.name, symb.get_dtype(), props))
+    props: set[PsSymbolProperty] = set()
+    for prop in symbol.properties:
+        match prop:
+            case BufferBasePtr(buf):
+                field = ctx.find_field(buf.name)
+                props.add(FieldBasePtr(field))
+            case BackendPrivateProperty():
+                pass
+            case _:
+                props.add(prop)
+
+    return Parameter(symbol.name, symbol.get_dtype(), props)
+
 
+def _get_function_params(
+    ctx: KernelCreationContext, symbols: Iterable[PsSymbol]
+) -> list[Parameter]:
+    params: list[Parameter] = [_symbol_to_param(ctx, s) for s in symbols]
     params.sort(key=lambda p: p.name)
     return params
 
diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 08134a622..24189bf63 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -1,13 +1,14 @@
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
-from typing import cast, Any
+from typing import cast, Any, Callable
 from itertools import chain
 
 from .functions import Lambda
 from .kernel import GpuKernel
 from .parameters import Parameter
 from .errors import CodegenError
+from .config import GpuIndexingScheme, _AUTO_TYPE
 
 from ..backend.kernelcreation import (
     KernelCreationContext,
@@ -18,6 +19,8 @@ from ..backend.platforms.cuda import ThreadToIndexMapping
 from ..backend.ast.expressions import PsExpression
 
 
+dim3 = tuple[int, int, int]
+_Dim3Params = tuple[Parameter, Parameter, Parameter]
 _Dim3Lambda = tuple[Lambda, Lambda, Lambda]
 
 
@@ -48,29 +51,179 @@ class GpuLaunchConfiguration:
         """Parameters to this set of constraints"""
         return self._params
 
-    @property
-    def parameter_values(self) -> dict[Parameter, Any]:
+    def get_valuation(self) -> dict[Parameter, Any]:
         """Values for all parameters that are specific to the launch grid configuration and not
         also kernel parameters."""
         return self._valuation
 
-    @property
-    def block_size(self) -> _Dim3Lambda:
-        """Constraints on the number of threads per block"""
+    def get_block_size(self) -> _Dim3Lambda:
         return self._block_size
 
-    @property
-    def grid_size(self) -> _Dim3Lambda:
-        """Constraints on the number of blocks on the grid"""
+    def get_grid_size(self) -> _Dim3Lambda:
         return self._grid_size
 
 
+class ManualLaunchConfiguration(GpuLaunchConfiguration):
+    """Manual GPU launch configuration.
+
+    This launch configuration requires the user to set block and grid size.
+    """
+
+    def __init__(
+        self,
+        block_size: _Dim3Lambda,
+        grid_size: _Dim3Lambda,
+        block_size_params: _Dim3Params,
+        grid_size_params: _Dim3Params,
+    ):
+        super().__init__(
+            cast(_Dim3Lambda, block_size),
+            cast(_Dim3Lambda, grid_size),
+            set(block_size_params).union(grid_size_params),
+        )
+        self._block_size_params = block_size_params
+        self._grid_size_params = grid_size_params
+
+        self._user_block_size: dim3 | None = None
+        self._user_grid_size: dim3 | None = None
+
+    @property
+    def block_size(self) -> dim3 | None:
+        return self._user_block_size
+
+    @block_size.setter
+    def block_size(self, val: dim3):
+        self._user_block_size = val
+
+    @property
+    def grid_size(self) -> dim3 | None:
+        return self._user_grid_size
+
+    @grid_size.setter
+    def grid_size(self, val: dim3):
+        self._user_grid_size = val
+
+    def get_valuation(self) -> dict[Parameter, Any]:
+        if self._user_block_size is None:
+            raise AttributeError("No GPU block size was specified")
+
+        if self._user_grid_size is None:
+            raise AttributeError("No GPU grid size was specified")
+
+        valuation: dict[Parameter, Any] = dict()
+
+        for bs_param, bs in zip(self._block_size_params, self._user_block_size):
+            valuation[bs_param] = bs
+
+        for gs_param, gs in zip(self._grid_size_params, self._user_grid_size):
+            valuation[gs_param] = gs
+
+        return valuation
+
+
+class GridFromBlockSizeConfiguration(GpuLaunchConfiguration):
+    """GPU launch configuration that computes the grid size from a user-defined block size."""
+
+    def __init__(
+        self,
+        block_size: _Dim3Lambda,
+        grid_size: _Dim3Lambda,
+        block_size_params: _Dim3Params,
+        default_block_size: dim3 | None = None,
+    ) -> None:
+        super().__init__(block_size, grid_size, set(block_size_params))
+
+        self._block_size_params = block_size_params
+        self._user_block_size: dim3 | None = default_block_size
+
+    @property
+    def block_size(self) -> dim3 | None:
+        return self._user_block_size
+
+    @block_size.setter
+    def block_size(self, val: dim3):
+        self._user_block_size = val
+
+    def get_valuation(self) -> dict[Parameter, Any]:
+        if self._user_block_size is None:
+            raise AttributeError("No GPU block size was specified")
+
+        valuation: dict[Parameter, Any] = dict()
+
+        for bs_param, bs in zip(self._block_size_params, self._user_block_size):
+            valuation[bs_param] = bs
+
+        return valuation
+
+
 class GpuIndexing(ABC):
-    @abstractmethod
-    def get_thread_mapping(self) -> ThreadToIndexMapping | None: ...
+    def __init__(
+        self,
+        ctx: KernelCreationContext,
+        scheme: GpuIndexingScheme,
+        block_size: dim3 | _AUTO_TYPE,
+        manual_launch_grid: bool,
+    ) -> None:
+        self._ctx = ctx
+        self._scheme = scheme
+        self._block_size = block_size
+        self._manual_launch_grid = manual_launch_grid
+
+        from ..backend.kernelcreation import AstFactory
 
-    @abstractmethod
-    def get_launch_config(self, kernel: GpuKernel) -> GpuLaunchConfiguration: ...
+        self._factory = AstFactory(self._ctx)
+
+    def get_thread_mapping(self):
+        from ..backend.platforms.cuda import Linear3DMapping, Blockwise4DMapping
+
+        match self._scheme:
+            case GpuIndexingScheme.Linear3D:
+                return Linear3DMapping()
+            case GpuIndexingScheme.Blockwise4D:
+                return Blockwise4DMapping()
+
+    def get_launch_config_factory(
+        self, scheme: GpuIndexingScheme
+    ) -> Callable[[], GpuLaunchConfiguration]:
+        if self._manual_launch_grid:
+            return self._manual_config_factory()
+
+        raise NotImplementedError()
+
+    def _manual_config_factory(self) -> Callable[[], ManualLaunchConfiguration]:
+        ctx = self._ctx
+
+        block_size_symbols = [
+            ctx.get_new_symbol(f"gpuBlockSize_{c}", ctx.index_dtype) for c in range(3)
+        ]
+        grid_size_symbols = [
+            ctx.get_new_symbol(f"gpuGridSize_{c}", ctx.index_dtype) for c in range(3)
+        ]
+
+        block_size = tuple(
+            Lambda.from_expression(ctx, PsExpression.make(bs))
+            for bs in block_size_symbols
+        )
+
+        grid_size = tuple(
+            Lambda.from_expression(ctx, PsExpression.make(gs))
+            for gs in grid_size_symbols
+        )
+
+        from .driver import _symbol_to_param
+
+        bs_params = [_symbol_to_param(ctx, s) for s in block_size_symbols]
+        gs_params = [_symbol_to_param(ctx, s) for s in grid_size_symbols]
+
+        def factory():
+            return ManualLaunchConfiguration(
+                cast(_Dim3Lambda, block_size),
+                cast(_Dim3Lambda, grid_size),
+                cast(_Dim3Params, bs_params),
+                cast(_Dim3Params, gs_params),
+            )
+
+        return factory
 
 
 class Linear3DGpuIndexing(GpuIndexing):
@@ -88,23 +241,6 @@ class Linear3DGpuIndexing(GpuIndexing):
         return Linear3DMapping()
 
     def get_launch_config(self, kernel: GpuKernel) -> GpuLaunchConfiguration:
-        block_size, grid_size = self._prepare_launch_grid()
-
-        kernel_params = set(kernel.parameters)
-        launch_config_params = (
-            set().union(
-                *(lb.parameters for lb in chain(block_size, grid_size))
-            )
-            - kernel_params
-        )
-
-        return GpuLaunchConfiguration(
-            block_size=cast(_Dim3Lambda, tuple(block_size)),
-            grid_size=cast(_Dim3Lambda, tuple(grid_size)),
-            config_parameters=launch_config_params,
-        )
-
-    def _prepare_launch_grid(self):
         work_items = self._get_work_items()
         rank = len(work_items)
 
@@ -138,7 +274,17 @@ class Linear3DGpuIndexing(GpuIndexing):
             for _ in range(3 - rank)
         ]
 
-        return block_size, grid_size
+        from .driver import _symbol_to_param
+
+        block_size_params = tuple(
+            _symbol_to_param(self._ctx, s) for s in block_size_symbols
+        )
+
+        return GridFromBlockSizeConfiguration(
+            cast(_Dim3Lambda, tuple(block_size)),
+            cast(_Dim3Lambda, tuple(grid_size)),
+            cast(tuple[Parameter, Parameter, Parameter], block_size_params),
+        )
 
     def _get_work_items(self) -> tuple[PsExpression, ...]:
         ispace = self._ctx.get_iteration_space()
diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py
index f3f834f76..d4f1c0204 100644
--- a/src/pystencils/jit/gpu_cupy.py
+++ b/src/pystencils/jit/gpu_cupy.py
@@ -67,7 +67,7 @@ class CupyKernelWrapper(KernelWrapper):
         return devices.pop()
 
     def _get_cached_args(self, **kwargs):
-        launch_config_params = self._launch_config.parameter_values
+        launch_config_params = self._launch_config.get_valuation
         key = tuple(
             (k, v) for k, v in launch_config_params.items()
         ) + tuple((k, id(v)) for k, v in kwargs.items())
@@ -195,7 +195,7 @@ class CupyKernelWrapper(KernelWrapper):
         launch_cfg_valuation.update(
             {
                 param.name: value
-                for param, value in self._launch_config.parameter_values.items()
+                for param, value in self._launch_config.get_valuation.items()
             }
         )
 
@@ -203,7 +203,7 @@ class CupyKernelWrapper(KernelWrapper):
             tuple[int, int, int],
             tuple(
                 int(component(**launch_cfg_valuation))
-                for component in self._launch_config.block_size
+                for component in self._launch_config.get_block_size()
             ),
         )
 
@@ -211,7 +211,7 @@ class CupyKernelWrapper(KernelWrapper):
             tuple[int, int, int],
             tuple(
                 int(component(**launch_cfg_valuation))
-                for component in self._launch_config.grid_size
+                for component in self._launch_config.get_grid_size()
             ),
         )
 
-- 
GitLab


From 26eb2d6221d98d883928133080b9f7d02eebdf40 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Fri, 14 Feb 2025 18:23:48 +0100
Subject: [PATCH 07/17] Simplified interface of GpuLaunchConfiguration.
 Implement factories inside GpuIndexing.

---
 src/pystencils/codegen/config.py       |  13 +-
 src/pystencils/codegen/driver.py       |  16 +-
 src/pystencils/codegen/functions.py    |   6 +-
 src/pystencils/codegen/gpu_indexing.py | 279 ++++++++++++++-----------
 src/pystencils/codegen/kernel.py       |   7 +-
 src/pystencils/jit/gpu_cupy.py         |  49 ++---
 6 files changed, 197 insertions(+), 173 deletions(-)

diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py
index 47a64df64..2d62f286b 100644
--- a/src/pystencils/codegen/config.py
+++ b/src/pystencils/codegen/config.py
@@ -381,7 +381,11 @@ class GpuOptions(ConfigBase):
     """
 
     block_size: BasicOption[tuple[int, int, int] | _AUTO_TYPE] = BasicOption(AUTO)
-    """Desired block size for the execution of GPU kernels. May be overridden later by the runtime system."""
+    """Desired block size for the execution of GPU kernels.
+    
+    This option only takes effect if `Linear3D` is chosen as an indexing scheme.
+    The block size may be overridden at runtime.
+    """
 
     manual_launch_grid: BasicOption[bool] = BasicOption(False)
     """Always require a manually specified launch grid when running this kernel.
@@ -596,11 +600,8 @@ class CreateKernelConfig(ConfigBase):
             elif self.get_target() == Target.CUDA:
                 try:
                     from ..jit.gpu_cupy import CupyJit
-
-                    if self.gpu is not None and self.gpu.block_size is not None:
-                        return CupyJit(self.gpu.block_size)
-                    else:
-                        return CupyJit()
+                    
+                    return CupyJit()
 
                 except ImportError:
                     from ..jit import no_jit
diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index f7eb8ddb4..14a95c84d 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -220,7 +220,7 @@ class DefaultKernelCreationDriver:
                 self._cfg.get_option("function_name"),
                 self._target,
                 self._cfg.get_jit(),
-                self._gpu_indexing.get_launch_config,
+                self._gpu_indexing.get_launch_config_factory(),
             )
 
     def parse_kernel_body(
@@ -397,15 +397,13 @@ class DefaultKernelCreationDriver:
         if self._target != Target.CUDA:
             return None
 
-        idx_scheme = self._cfg.gpu.get_option("indexing_scheme")
+        from .gpu_indexing import dim3
 
-        match idx_scheme:
-            case None | GpuIndexingScheme.Linear3D:
-                from .gpu_indexing import Linear3DGpuIndexing
+        idx_scheme: GpuIndexingScheme = self._cfg.gpu.get_option("indexing_scheme")
+        block_size: dim3 | _AUTO_TYPE = self._cfg.gpu.get_option("block_size")
+        manual_launch_grid: bool = self._cfg.gpu.get_option("manual_launch_grid")
 
-                return Linear3DGpuIndexing(self._ctx)
-            case _:
-                raise NotImplementedError()
+        return GpuIndexing(self._ctx, idx_scheme, block_size, manual_launch_grid)
 
     def _get_platform(self) -> Platform:
         if Target._CPU in self._target:
@@ -496,7 +494,7 @@ def create_gpu_kernel_function(
     function_name: str,
     target_spec: Target,
     jit: JitBase,
-    launch_config_factory: Callable[[GpuKernel], GpuLaunchConfiguration],
+    launch_config_factory: Callable[[], GpuLaunchConfiguration],
 ) -> GpuKernel:
     undef_symbols = collect_undefined_symbols(body)
 
diff --git a/src/pystencils/codegen/functions.py b/src/pystencils/codegen/functions.py
index 2779fa289..f6be3b1f3 100644
--- a/src/pystencils/codegen/functions.py
+++ b/src/pystencils/codegen/functions.py
@@ -1,7 +1,5 @@
 from __future__ import annotations
-from typing import Sequence
-
-import numpy as np
+from typing import Sequence, Any
 
 from .parameters import Parameter
 from ..types import PsType
@@ -36,7 +34,7 @@ class Lambda:
         """Return type of this lambda"""
         return self._return_type
 
-    def __call__(self, **kwargs) -> np.generic:
+    def __call__(self, **kwargs) -> Any:
         """Evaluate this lambda with the given arguments.
 
         The lambda must receive a value for each parameter listed in `parameters`.
diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 24189bf63..1e23a820e 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -5,7 +5,6 @@ from typing import cast, Any, Callable
 from itertools import chain
 
 from .functions import Lambda
-from .kernel import GpuKernel
 from .parameters import Parameter
 from .errors import CodegenError
 from .config import GpuIndexingScheme, _AUTO_TYPE
@@ -15,7 +14,7 @@ from ..backend.kernelcreation import (
     FullIterationSpace,
     SparseIterationSpace,
 )
-from ..backend.platforms.cuda import ThreadToIndexMapping
+
 from ..backend.ast.expressions import PsExpression
 
 
@@ -24,7 +23,7 @@ _Dim3Params = tuple[Parameter, Parameter, Parameter]
 _Dim3Lambda = tuple[Lambda, Lambda, Lambda]
 
 
-class GpuLaunchConfiguration:
+class GpuLaunchConfiguration(ABC):
     """Base class for launch configurations for CUDA and HIP kernels.
 
     Args:
@@ -34,33 +33,61 @@ class GpuLaunchConfiguration:
             parameters to the associated kernel
     """
 
+    @property
+    @abstractmethod
+    def parameters(self) -> frozenset[Parameter]:
+        """Parameters of this launch configuration"""
+
+    @abstractmethod
+    def evaluate(self, **kwargs) -> tuple[dim3, dim3]:
+        """Compute block and grid size for a kernel launch.
+
+        Args:
+            kwargs: Valuation providing a value for each parameter listed in `parameters`
+        """
+
+    @abstractmethod
+    def jit_cache_key(self) -> Any:
+        """Return a hashable object that represents any user-configurable options of
+        this launch configuration, such that when the configuration changes, the JIT parameter
+        cache is invalidated."""
+
+
+class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
+    """Launch configuration that is dynamically computed from kernel parameters.
+
+    This launch configuration permits no further user customization
+    """
+
     def __init__(
         self,
         block_size: _Dim3Lambda,
         grid_size: _Dim3Lambda,
-        config_parameters: set[Parameter],
     ) -> None:
         self._block_size = block_size
         self._grid_size = grid_size
 
-        self._params = frozenset(config_parameters)
-        self._valuation: dict[Parameter, Any] = dict()
+        self._params: frozenset[Parameter] = frozenset().union(
+            *(lb.parameters for lb in chain(block_size, grid_size))
+        )
 
     @property
     def parameters(self) -> frozenset[Parameter]:
-        """Parameters to this set of constraints"""
+        """Parameters of this launch configuration"""
         return self._params
 
-    def get_valuation(self) -> dict[Parameter, Any]:
-        """Values for all parameters that are specific to the launch grid configuration and not
-        also kernel parameters."""
-        return self._valuation
+    def evaluate(self, **kwargs) -> tuple[dim3, dim3]:
+        """Compute block and grid size for a kernel launch.
 
-    def get_block_size(self) -> _Dim3Lambda:
-        return self._block_size
+        Args:
+            kwargs: Valuation providing a value for each parameter listed in `parameters`
+        """
+        block_size = tuple(int(bs(**kwargs)) for bs in self._block_size)
+        grid_size = tuple(int(gs(**kwargs)) for gs in self._grid_size)
+        return cast(dim3, block_size), cast(dim3, grid_size)
 
-    def get_grid_size(self) -> _Dim3Lambda:
-        return self._grid_size
+    def jit_cache_key(self) -> Any:
+        return ()
 
 
 class ManualLaunchConfiguration(GpuLaunchConfiguration):
@@ -71,89 +98,93 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration):
 
     def __init__(
         self,
-        block_size: _Dim3Lambda,
-        grid_size: _Dim3Lambda,
-        block_size_params: _Dim3Params,
-        grid_size_params: _Dim3Params,
-    ):
-        super().__init__(
-            cast(_Dim3Lambda, block_size),
-            cast(_Dim3Lambda, grid_size),
-            set(block_size_params).union(grid_size_params),
-        )
-        self._block_size_params = block_size_params
-        self._grid_size_params = grid_size_params
-
-        self._user_block_size: dim3 | None = None
-        self._user_grid_size: dim3 | None = None
+    ) -> None:
+        self._block_size: dim3 | None = None
+        self._grid_size: dim3 | None = None
 
     @property
     def block_size(self) -> dim3 | None:
-        return self._user_block_size
+        return self._block_size
 
     @block_size.setter
     def block_size(self, val: dim3):
-        self._user_block_size = val
+        self._block_size = val
 
     @property
     def grid_size(self) -> dim3 | None:
-        return self._user_grid_size
+        return self._grid_size
 
     @grid_size.setter
     def grid_size(self, val: dim3):
-        self._user_grid_size = val
-
-    def get_valuation(self) -> dict[Parameter, Any]:
-        if self._user_block_size is None:
-            raise AttributeError("No GPU block size was specified")
+        self._grid_size = val
 
-        if self._user_grid_size is None:
-            raise AttributeError("No GPU grid size was specified")
+    @property
+    def parameters(self) -> frozenset[Parameter]:
+        return frozenset()
 
-        valuation: dict[Parameter, Any] = dict()
+    def evaluate(self, **kwargs) -> tuple[dim3, dim3]:
+        if self._block_size is None:
+            raise AttributeError("No GPU block size was set by the user.")
 
-        for bs_param, bs in zip(self._block_size_params, self._user_block_size):
-            valuation[bs_param] = bs
+        if self._grid_size is None:
+            raise AttributeError("No GPU grid size was set by the user.")
 
-        for gs_param, gs in zip(self._grid_size_params, self._user_grid_size):
-            valuation[gs_param] = gs
+        return self._block_size, self._grid_size
 
-        return valuation
+    def jit_cache_key(self) -> Any:
+        return (self._block_size, self._grid_size)
 
 
-class GridFromBlockSizeConfiguration(GpuLaunchConfiguration):
-    """GPU launch configuration that computes the grid size from a user-defined block size."""
+class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
+    """GPU launch configuration that permits the user to set a block size dynamically."""
 
     def __init__(
         self,
-        block_size: _Dim3Lambda,
-        grid_size: _Dim3Lambda,
+        block_size_expr: _Dim3Lambda,
+        grid_size_expr: _Dim3Lambda,
         block_size_params: _Dim3Params,
         default_block_size: dim3 | None = None,
     ) -> None:
-        super().__init__(block_size, grid_size, set(block_size_params))
+        self._block_size_expr = block_size_expr
+        self._grid_size_expr = grid_size_expr
 
         self._block_size_params = block_size_params
-        self._user_block_size: dim3 | None = default_block_size
+        self._block_size: dim3 | None = default_block_size
+
+        self._params: frozenset[Parameter] = frozenset().union(
+            *(lb.parameters for lb in chain(block_size_expr, grid_size_expr))
+        ) - set(self._block_size_params)
 
     @property
     def block_size(self) -> dim3 | None:
-        return self._user_block_size
+        return self._block_size
 
     @block_size.setter
     def block_size(self, val: dim3):
-        self._user_block_size = val
+        self._block_size = val
+
+    @property
+    def parameters(self) -> frozenset[Parameter]:
+        """Parameters of this launch configuration"""
+        return self._params
 
-    def get_valuation(self) -> dict[Parameter, Any]:
-        if self._user_block_size is None:
-            raise AttributeError("No GPU block size was specified")
+    def evaluate(self, **kwargs) -> tuple[dim3, dim3]:
+        if self._block_size is None:
+            raise AttributeError("No GPU block size was specified by the user!")
 
-        valuation: dict[Parameter, Any] = dict()
+        kwargs.update(
+            {
+                param.name: value
+                for param, value in zip(self._block_size_params, self._block_size)
+            }
+        )
 
-        for bs_param, bs in zip(self._block_size_params, self._user_block_size):
-            valuation[bs_param] = bs
+        block_size = tuple(int(bs(**kwargs)) for bs in self._block_size_expr)
+        grid_size = tuple(int(gs(**kwargs)) for gs in self._grid_size_expr)
+        return cast(dim3, block_size), cast(dim3, grid_size)
 
-        return valuation
+    def jit_cache_key(self) -> Any:
+        return self._block_size
 
 
 class GpuIndexing(ABC):
@@ -182,65 +213,19 @@ class GpuIndexing(ABC):
             case GpuIndexingScheme.Blockwise4D:
                 return Blockwise4DMapping()
 
-    def get_launch_config_factory(
-        self, scheme: GpuIndexingScheme
-    ) -> Callable[[], GpuLaunchConfiguration]:
+    def get_launch_config_factory(self) -> Callable[[], GpuLaunchConfiguration]:
         if self._manual_launch_grid:
-            return self._manual_config_factory()
-
-        raise NotImplementedError()
-
-    def _manual_config_factory(self) -> Callable[[], ManualLaunchConfiguration]:
-        ctx = self._ctx
-
-        block_size_symbols = [
-            ctx.get_new_symbol(f"gpuBlockSize_{c}", ctx.index_dtype) for c in range(3)
-        ]
-        grid_size_symbols = [
-            ctx.get_new_symbol(f"gpuGridSize_{c}", ctx.index_dtype) for c in range(3)
-        ]
-
-        block_size = tuple(
-            Lambda.from_expression(ctx, PsExpression.make(bs))
-            for bs in block_size_symbols
-        )
+            return ManualLaunchConfiguration
 
-        grid_size = tuple(
-            Lambda.from_expression(ctx, PsExpression.make(gs))
-            for gs in grid_size_symbols
-        )
-
-        from .driver import _symbol_to_param
-
-        bs_params = [_symbol_to_param(ctx, s) for s in block_size_symbols]
-        gs_params = [_symbol_to_param(ctx, s) for s in grid_size_symbols]
-
-        def factory():
-            return ManualLaunchConfiguration(
-                cast(_Dim3Lambda, block_size),
-                cast(_Dim3Lambda, grid_size),
-                cast(_Dim3Params, bs_params),
-                cast(_Dim3Params, gs_params),
-            )
-
-        return factory
-
-
-class Linear3DGpuIndexing(GpuIndexing):
-
-    def __init__(self, ctx: KernelCreationContext) -> None:
-        self._ctx = ctx
-
-        from ..backend.kernelcreation import AstFactory
-
-        self._factory = AstFactory(self._ctx)
-
-    def get_thread_mapping(self) -> ThreadToIndexMapping:
-        from ..backend.platforms.cuda import Linear3DMapping
-
-        return Linear3DMapping()
+        match self._scheme:
+            case GpuIndexingScheme.Linear3D:
+                return self._get_linear3d_config_factory()
+            case GpuIndexingScheme.Blockwise4D:
+                return self._get_blockwise4d_config_factory()
 
-    def get_launch_config(self, kernel: GpuKernel) -> GpuLaunchConfiguration:
+    def _get_linear3d_config_factory(
+        self,
+    ) -> Callable[[], DynamicBlockSizeLaunchConfiguration]:
         work_items = self._get_work_items()
         rank = len(work_items)
 
@@ -280,13 +265,65 @@ class Linear3DGpuIndexing(GpuIndexing):
             _symbol_to_param(self._ctx, s) for s in block_size_symbols
         )
 
-        return GridFromBlockSizeConfiguration(
-            cast(_Dim3Lambda, tuple(block_size)),
-            cast(_Dim3Lambda, tuple(grid_size)),
-            cast(tuple[Parameter, Parameter, Parameter], block_size_params),
+        def factory():
+            return DynamicBlockSizeLaunchConfiguration(
+                cast(_Dim3Lambda, tuple(block_size)),
+                cast(_Dim3Lambda, tuple(grid_size)),
+                cast(_Dim3Params, block_size_params),
+                self._get_default_block_size(rank),
+            )
+
+        return factory
+
+    def _get_default_block_size(self, rank: int) -> dim3:
+        if isinstance(self._block_size, _AUTO_TYPE):
+            match rank:
+                case 1:
+                    return (256, 1, 1)
+                case 2:
+                    return (128, 2, 1)
+                case 3:
+                    return (128, 2, 2)
+                case _:
+                    assert False, "unreachable code"
+        else:
+            return self._block_size
+
+    def _get_blockwise4d_config_factory(
+        self,
+    ) -> Callable[[], AutomaticLaunchConfiguration]:
+        work_items = self._get_work_items()[::-1]  # Want this ordered fastest first
+        rank = len(work_items)
+
+        if rank > 4:
+            raise ValueError(f"Iteration space rank is too large: {rank}")
+
+        block_size = (
+            Lambda.from_expression(self._ctx, work_items[0]),
+            Lambda.from_expression(self._ctx, self._factory.parse_index(1)),
+            Lambda.from_expression(self._ctx, self._factory.parse_index(1)),
+        )
+
+        grid_size = tuple(
+            Lambda.from_expression(self._ctx, wit) for wit in work_items[1:]
+        ) + tuple(
+            Lambda.from_expression(self._ctx, self._factory.parse_index(1))
+            for _ in range(4 - rank)
         )
 
+        def factory():
+            return AutomaticLaunchConfiguration(
+                block_size,
+                cast(_Dim3Lambda, grid_size),
+            )
+
+        return factory
+
     def _get_work_items(self) -> tuple[PsExpression, ...]:
+        """Return a tuple of expressions representing the number of work items
+        in each dimension of the kernel's iteration space,
+        ordered from slowest to fastest dimension.
+        """
         ispace = self._ctx.get_iteration_space()
         match ispace:
             case FullIterationSpace():
diff --git a/src/pystencils/codegen/kernel.py b/src/pystencils/codegen/kernel.py
index 67ef6554c..181e6ad3b 100644
--- a/src/pystencils/codegen/kernel.py
+++ b/src/pystencils/codegen/kernel.py
@@ -6,6 +6,8 @@ from itertools import chain
 
 from .target import Target
 from .parameters import Parameter
+from .gpu_indexing import GpuLaunchConfiguration
+
 from ..backend.ast.structural import PsBlock
 from ..field import Field
 
@@ -13,7 +15,6 @@ from .._deprecation import _deprecated
 
 if TYPE_CHECKING:
     from ..jit import JitBase
-    from .gpu_indexing import GpuLaunchConfiguration
 
 
 class Kernel:
@@ -123,11 +124,11 @@ class GpuKernel(Kernel):
         parameters: Sequence[Parameter],
         required_headers: set[str],
         jit: JitBase,
-        launch_config_factory: Callable[[GpuKernel], GpuLaunchConfiguration],
+        launch_config_factory: Callable[[], GpuLaunchConfiguration],
     ):
         super().__init__(body, target, name, parameters, required_headers, jit)
         self._launch_config_factory = launch_config_factory
 
     def get_launch_configuration(self) -> GpuLaunchConfiguration:
         """Object exposing the total size of the launch grid this kernel expects to be executed with."""
-        return self._launch_config_factory(self)
+        return self._launch_config_factory()
diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py
index d4f1c0204..42d9a685f 100644
--- a/src/pystencils/jit/gpu_cupy.py
+++ b/src/pystencils/jit/gpu_cupy.py
@@ -45,7 +45,7 @@ class CupyKernelWrapper(KernelWrapper):
     @property
     def kernel_function(self) -> GpuKernel:
         return self._kfunc
-    
+
     @property
     def launch_config(self) -> GpuLaunchConfiguration:
         return self._launch_config
@@ -67,10 +67,9 @@ class CupyKernelWrapper(KernelWrapper):
         return devices.pop()
 
     def _get_cached_args(self, **kwargs):
-        launch_config_params = self._launch_config.get_valuation
-        key = tuple(
-            (k, v) for k, v in launch_config_params.items()
-        ) + tuple((k, id(v)) for k, v in kwargs.items())
+        key = (self._launch_config.jit_cache_key(),) + tuple(
+            (k, id(v)) for k, v in kwargs.items()
+        )
 
         if key not in self._args_cache:
             args = self._get_args(**kwargs)
@@ -80,7 +79,7 @@ class CupyKernelWrapper(KernelWrapper):
             return self._args_cache[key]
 
     def _get_args(self, **kwargs) -> tuple[tuple, LaunchGrid]:
-        args = []
+        kernel_args = []
         valuation: dict[str, Any] = dict()
 
         def add_arg(name: str, arg: Any, dtype: PsType):
@@ -88,7 +87,7 @@ class CupyKernelWrapper(KernelWrapper):
             assert nptype is not None
             typecast = nptype.type
             arg = typecast(arg)
-            args.append(arg)
+            kernel_args.append(arg)
             valuation[name] = arg
 
         field_shapes = set()
@@ -168,7 +167,7 @@ class CupyKernelWrapper(KernelWrapper):
                                     f"Expected {field.dtype}, got {arr.dtype}"
                                 )
                             check_shape(kparam, arr)
-                            args.append(arr)
+                            kernel_args.append(arr)
                             break
 
                         case FieldShape(field, coord):
@@ -191,31 +190,21 @@ class CupyKernelWrapper(KernelWrapper):
 
         #   Determine launch grid
 
-        launch_cfg_valuation = valuation.copy()
-        launch_cfg_valuation.update(
-            {
-                param.name: value
-                for param, value in self._launch_config.get_valuation.items()
-            }
-        )
+        def add_launch_config_arg(name: str, arg: Any, dtype: PsType):
+            nptype = dtype.numpy_dtype
+            assert nptype is not None
+            typecast = nptype.type
+            arg = typecast(arg)
+            valuation[name] = arg
 
-        block_size = cast(
-            tuple[int, int, int],
-            tuple(
-                int(component(**launch_cfg_valuation))
-                for component in self._launch_config.get_block_size()
-            ),
-        )
+        for cparam in self._launch_config.parameters:
+            if cparam.name not in valuation:
+                val = kwargs[cparam.name]
+                add_launch_config_arg(cparam.name, val, cparam.dtype)
 
-        grid_size = cast(
-            tuple[int, int, int],
-            tuple(
-                int(component(**launch_cfg_valuation))
-                for component in self._launch_config.get_grid_size()
-            ),
-        )
+        block_size, grid_size = self._launch_config.evaluate(**valuation)
 
-        return tuple(args), LaunchGrid(grid_size, block_size)
+        return tuple(kernel_args), LaunchGrid(grid_size, block_size)
 
 
 class CupyJit(JitBase):
-- 
GitLab


From d9c8f260133994db177aa42db2f608cd4b418e56 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Fri, 14 Feb 2025 19:29:02 +0100
Subject: [PATCH 08/17] fix testsuite and docs

---
 docs/source/api/codegen.md                    |  2 +-
 docs/source/user_manual/gpu_kernels.md        | 43 ++++++-------------
 src/pystencils/backend/platforms/cuda.py      | 10 +++--
 src/pystencils/codegen/config.py              |  5 ++-
 src/pystencils/codegen/gpu_indexing.py        |  4 +-
 tests/kernelcreation/test_iteration_slices.py | 19 +++++---
 6 files changed, 39 insertions(+), 44 deletions(-)

diff --git a/docs/source/api/codegen.md b/docs/source/api/codegen.md
index b739a4f33..8e374d4e5 100644
--- a/docs/source/api/codegen.md
+++ b/docs/source/api/codegen.md
@@ -108,6 +108,7 @@ The following categories with target-specific options are exposed:
   VectorizationOptions
   GpuOptions
   SyclOptions
+  GpuIndexingScheme
 
 .. autosummary::
   :toctree: generated
@@ -176,5 +177,4 @@ The following categories with target-specific options are exposed:
   Kernel
   GpuKernel
   Parameter
-  GpuThreadsRange
 ```
diff --git a/docs/source/user_manual/gpu_kernels.md b/docs/source/user_manual/gpu_kernels.md
index 4db2d7944..d3a491707 100644
--- a/docs/source/user_manual/gpu_kernels.md
+++ b/docs/source/user_manual/gpu_kernels.md
@@ -51,12 +51,6 @@ ps.inspect(kernel)
 The `kernel` object returned by the code generator in above snippet is an instance
 of the {py:class}`GpuKernel` class.
 It extends {py:class}`Kernel` with some GPU-specific information.
-In particular, it defines the {any}`threads_range <GpuKernel.threads_range>`
-property, which tells us how many threads the kernel is expecting to be executed with:
-
-```{code-cell} ipython3
-kernel.threads_range
-```
 
 If a GPU is available and [CuPy][cupy] is installed in the current environment,
 the kernel can be compiled and run immediately.
@@ -87,7 +81,7 @@ kfunc = kernel.compile()
 kfunc(f=f_arr, g=g_arr)
 ```
 
-### Modifying the Launch Grid
+### Modifying the Launch Configuration
 
 The `kernel.compile()` invocation in the above code produces a {any}`CupyKernelWrapper` callable object.
 This object holds the kernel's launch grid configuration
@@ -150,26 +144,26 @@ assignments = [
 
 ```{code-cell} ipython3
 y = ps.DEFAULTS.spatial_counters[0]
-cfg = ps.CreateKernelConfig(
-    target=ps.Target.CUDA,
-    iteration_slice=ps.make_slice[:, y:]
-)
-    
-kernel = ps.create_kernel(assignments, cfg).compile()
+cfg = ps.CreateKernelConfig()
+cfg.target= ps.Target.CUDA
+cfg.iteration_slice = ps.make_slice[:, y:]
 ```
 
-This warns us that the threads range could not be determined automatically.
-We can disable this warning by setting `manual_launch_grid` in the GPU option category:
+In this case, it is necessary to set the `gpu.manual_launch_grid` option to `True`;
+otherwise, code generation will fail as the code generator cannot figure out
+a GPU grid size on its own:
 
-```{code-cell}
+```{code-cell} ipython3
 cfg.gpu.manual_launch_grid = True
+    
+kernel = ps.create_kernel(assignments, cfg).compile()
 ```
 
 Now, to execute our kernel, we have to manually specify its launch grid:
 
 ```{code-cell} ipython3
-kernel.block_size = (8, 8)
-kernel.num_blocks = (2, 2)
+kernel.launch_config.block_size = (8, 8)
+kernel.launch_config.grid_size = (2, 2)
 ```
 
 This way the kernel will cover this iteration space:
@@ -184,8 +178,8 @@ _draw_ispace(cp.asnumpy(f_arr))
 We can also observe the effect of decreasing the launch grid size:
 
 ```{code-cell} ipython3
-kernel.block_size = (4, 4)
-kernel.num_blocks = (2, 3)
+kernel.launch_config.block_size = (4, 4)
+kernel.launch_config.grid_size = (2, 3)
 ```
 
 ```{code-cell} ipython3
@@ -199,15 +193,6 @@ Here, since there are only eight threads operating in $x$-direction,
 and twelve threads in $y$-direction,
 only a part of the triangle is being processed.
 
-## API Reference
-
-```{eval-rst}
-.. autosummary::
-  :nosignatures:
-
-  pystencils.codegen.GpuKernel
-  pystencils.jit.gpu_cupy.CupyKernelWrapper
-```
 
 :::{admonition} Developers To Do:
 
diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index 122011eb0..537c92db1 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -258,13 +258,17 @@ class CudaPlatform(GenericGpu):
     def _prepend_dense_translation(
         self, body: PsBlock, ispace: FullIterationSpace
     ) -> PsBlock:
-        dimensions = ispace.dimensions_in_loop_order()
-
         ctr_mapping = self._thread_mapping(ispace)
 
         indexing_decls = []
         conds = []
+
+        dimensions = ispace.dimensions_in_loop_order()
+
         for dim in dimensions:
+            # counter declarations must be ordered slowest-to-fastest
+            # such that inner dimensions can depend on outer ones
+
             dim.counter.dtype = constify(dim.counter.get_dtype())
 
             ctr_expr = PsExpression.make(dim.counter)
@@ -274,8 +278,6 @@ class CudaPlatform(GenericGpu):
             if not self._omit_range_check:
                 conds.append(PsLt(ctr_expr, dim.stop))
 
-        indexing_decls = indexing_decls[::-1]
-
         if conds:
             condition: PsExpression = conds[0]
             for cond in conds[1:]:
diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py
index 2d62f286b..0d43b40e3 100644
--- a/src/pystencils/codegen/config.py
+++ b/src/pystencils/codegen/config.py
@@ -383,7 +383,8 @@ class GpuOptions(ConfigBase):
     block_size: BasicOption[tuple[int, int, int] | _AUTO_TYPE] = BasicOption(AUTO)
     """Desired block size for the execution of GPU kernels.
     
-    This option only takes effect if `Linear3D` is chosen as an indexing scheme.
+    This option only takes effect if `Linear3D <GpuIndexingScheme.Linear3D>`
+    is chosen as an indexing scheme.
     The block size may be overridden at runtime.
     """
 
@@ -573,7 +574,7 @@ class CreateKernelConfig(ConfigBase):
     """Deprecated; use `cpu.vectorize <CpuOptions.vectorize>` instead."""
 
     gpu_indexing: InitVar[str | None] = None
-    """Deprecated; use `gpu.indexing_scheme` instead."""
+    """Deprecated; use `gpu.indexing_scheme <GpuOptions.indexing_scheme>` instead."""
 
     gpu_indexing_params: InitVar[dict | None] = None
     """Deprecated; set options in the `gpu` category instead."""
diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 1e23a820e..80af8aba8 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -56,7 +56,7 @@ class GpuLaunchConfiguration(ABC):
 class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
     """Launch configuration that is dynamically computed from kernel parameters.
 
-    This launch configuration permits no further user customization
+    This launch configuration permits no further user customization.
     """
 
     def __init__(
@@ -233,7 +233,7 @@ class GpuIndexing(ABC):
         from ..backend.ast.expressions import PsExpression, PsIntDiv
 
         block_size_symbols = [
-            self._ctx.get_new_symbol(f"gpuBlockSize_{c}") for c in range(rank)
+            self._ctx.get_new_symbol(f"gpuBlockSize_{c}", self._ctx.index_dtype) for c in range(rank)
         ]
 
         block_size = [
diff --git a/tests/kernelcreation/test_iteration_slices.py b/tests/kernelcreation/test_iteration_slices.py
index 02b6b9922..b1f2da576 100644
--- a/tests/kernelcreation/test_iteration_slices.py
+++ b/tests/kernelcreation/test_iteration_slices.py
@@ -19,6 +19,7 @@ from pystencils.sympyextensions.integer_functions import int_rem
 from pystencils.simp import sympy_cse_on_assignment_list
 from pystencils.slicing import normalize_slice
 from pystencils.jit.gpu_cupy import CupyKernelWrapper
+from pystencils.codegen.gpu_indexing import ManualLaunchConfiguration
 
 
 def test_sliced_iteration():
@@ -137,8 +138,10 @@ def test_triangle_pattern(gen_config: CreateKernelConfig, xp):
         expected[r, r:] = 1.0
 
     update = Assignment(f.center(), 1)
-    outer_counter = DEFAULTS.spatial_counters[0]
-    islice = make_slice[:, outer_counter:]
+
+    #   Have NumPy data layout -> X is slowest coordinate, Y is fastest
+    slow_counter = DEFAULTS.spatial_counters[0]
+    islice = make_slice[:, slow_counter:]
     gen_config = replace(gen_config, iteration_slice=islice)
 
     if gen_config.target == Target.CUDA:
@@ -147,8 +150,10 @@ def test_triangle_pattern(gen_config: CreateKernelConfig, xp):
     kernel = create_kernel(update, gen_config).compile()
 
     if isinstance(kernel, CupyKernelWrapper):
-        kernel.block_size = shape + (1,)
-        kernel.num_blocks = (1, 1, 1)
+        assert isinstance(kernel.launch_config, ManualLaunchConfiguration)
+
+        kernel.launch_config.block_size = shape + (1,)
+        kernel.launch_config.grid_size = (1, 1, 1)
 
     kernel(f=f_arr)
 
@@ -182,8 +187,10 @@ def test_red_black_pattern(gen_config: CreateKernelConfig, xp):
             pytest.xfail("Gather/Scatter not implemented yet")
 
     if isinstance(kernel, CupyKernelWrapper):
-        kernel.block_size = (8, 16, 1)
-        kernel.num_blocks = (1, 1, 1)
+        assert isinstance(kernel.launch_config, ManualLaunchConfiguration)
+
+        kernel.launch_config.block_size = (8, 16, 1)
+        kernel.launch_config.grid_size = (1, 1, 1)
 
     kernel(f=f_arr)
 
-- 
GitLab


From c306be593065f988be7386bd1aac6146feff6385 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Sat, 15 Feb 2025 11:21:49 +0100
Subject: [PATCH 09/17] further simplify Python implementation of
 DynamicBlockSizeLaunchConfig

---
 src/pystencils/codegen/gpu_indexing.py        | 104 +++++++-----------
 .../sympyextensions/integer_functions.py      |   4 +-
 src/pystencils/utils.py                       |  47 +++++++-
 3 files changed, 89 insertions(+), 66 deletions(-)

diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 80af8aba8..d32e4112a 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -19,7 +19,6 @@ from ..backend.ast.expressions import PsExpression
 
 
 dim3 = tuple[int, int, int]
-_Dim3Params = tuple[Parameter, Parameter, Parameter]
 _Dim3Lambda = tuple[Lambda, Lambda, Lambda]
 
 
@@ -73,15 +72,9 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
 
     @property
     def parameters(self) -> frozenset[Parameter]:
-        """Parameters of this launch configuration"""
         return self._params
 
     def evaluate(self, **kwargs) -> tuple[dim3, dim3]:
-        """Compute block and grid size for a kernel launch.
-
-        Args:
-            kwargs: Valuation providing a value for each parameter listed in `parameters`
-        """
         block_size = tuple(int(bs(**kwargs)) for bs in self._block_size)
         grid_size = tuple(int(gs(**kwargs)) for gs in self._grid_size)
         return cast(dim3, block_size), cast(dim3, grid_size)
@@ -136,27 +129,38 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration):
 
 
 class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
-    """GPU launch configuration that permits the user to set a block size dynamically."""
+    """GPU launch configuration that permits the user to set a block size and dynamically computes the grid size.
+    
+    The actual launch grid size is computed from the user-defined ``user_block_size`` and the number of work items
+    in the kernel's iteration space as follows.
+    For each dimension :math:`c \\in \\{ x, y, z \\}`,
+
+    - if ``user_block_size.c > num_work_items.c``, ``block_size = num_work_items.c`` and ``grid_size.c = 1``;
+    - otherwise, ``block_size.c = user_block_size.c`` and ``grid_size.c = ceil(num_work_items.c / block_size.c)``.
+    """
 
     def __init__(
         self,
-        block_size_expr: _Dim3Lambda,
-        grid_size_expr: _Dim3Lambda,
-        block_size_params: _Dim3Params,
+        num_work_items: _Dim3Lambda,
         default_block_size: dim3 | None = None,
     ) -> None:
-        self._block_size_expr = block_size_expr
-        self._grid_size_expr = grid_size_expr
+        self._num_work_items = num_work_items
 
-        self._block_size_params = block_size_params
         self._block_size: dim3 | None = default_block_size
 
         self._params: frozenset[Parameter] = frozenset().union(
-            *(lb.parameters for lb in chain(block_size_expr, grid_size_expr))
-        ) - set(self._block_size_params)
+            *(wit.parameters for wit in num_work_items)
+        )
+
+    @property
+    def num_work_items(self) -> _Dim3Lambda:
+        """Lambda expressions that compute the number of work items in each iteration space
+        dimension from kernel parameters."""
+        return self._num_work_items
 
     @property
     def block_size(self) -> dim3 | None:
+        """The desired GPU block size."""
         return self._block_size
 
     @block_size.setter
@@ -172,16 +176,23 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
         if self._block_size is None:
             raise AttributeError("No GPU block size was specified by the user!")
 
-        kwargs.update(
-            {
-                param.name: value
-                for param, value in zip(self._block_size_params, self._block_size)
-            }
+        from ..utils import div_ceil
+
+        num_work_items = cast(
+            dim3, tuple(int(wit(**kwargs)) for wit in self._num_work_items)
+        )
+        reduced_block_size = cast(
+            dim3,
+            tuple(min(wit, bs) for wit, bs in zip(num_work_items, self._block_size)),
+        )
+        grid_size = cast(
+            dim3,
+            tuple(
+                div_ceil(wit, bs) for wit, bs in zip(num_work_items, reduced_block_size)
+            ),
         )
 
-        block_size = tuple(int(bs(**kwargs)) for bs in self._block_size_expr)
-        grid_size = tuple(int(gs(**kwargs)) for gs in self._grid_size_expr)
-        return cast(dim3, block_size), cast(dim3, grid_size)
+        return reduced_block_size, grid_size
 
     def jit_cache_key(self) -> Any:
         return self._block_size
@@ -226,50 +237,17 @@ class GpuIndexing(ABC):
     def _get_linear3d_config_factory(
         self,
     ) -> Callable[[], DynamicBlockSizeLaunchConfiguration]:
-        work_items = self._get_work_items()
-        rank = len(work_items)
-
-        from ..backend.constants import PsConstant
-        from ..backend.ast.expressions import PsExpression, PsIntDiv
-
-        block_size_symbols = [
-            self._ctx.get_new_symbol(f"gpuBlockSize_{c}", self._ctx.index_dtype) for c in range(rank)
-        ]
-
-        block_size = [
-            Lambda.from_expression(self._ctx, self._factory.parse_index(bs_symb))
-            for bs_symb in block_size_symbols
-        ] + [
-            Lambda.from_expression(self._ctx, self._factory.parse_index(1))
-            for _ in range(3 - rank)
-        ]
-
-        def div_ceil(a: PsExpression, b: PsExpression):
-            return self._factory.parse_index(
-                PsIntDiv(a + b - PsExpression.make(PsConstant(1)), b)
-            )
-
-        grid_size = [
-            Lambda.from_expression(
-                self._ctx, div_ceil(witems, PsExpression.make(bsize))
-            )
-            for witems, bsize in zip(work_items, block_size_symbols)
-        ] + [
-            Lambda.from_expression(self._ctx, self._factory.parse_index(1))
-            for _ in range(3 - rank)
-        ]
-
-        from .driver import _symbol_to_param
+        work_items_expr = self._get_work_items()
+        rank = len(work_items_expr)
 
-        block_size_params = tuple(
-            _symbol_to_param(self._ctx, s) for s in block_size_symbols
+        num_work_items = cast(
+            _Dim3Lambda,
+            tuple(Lambda.from_expression(self._ctx, wit) for wit in work_items_expr),
         )
 
         def factory():
             return DynamicBlockSizeLaunchConfiguration(
-                cast(_Dim3Lambda, tuple(block_size)),
-                cast(_Dim3Lambda, tuple(grid_size)),
-                cast(_Dim3Params, block_size_params),
+                num_work_items,
                 self._get_default_block_size(rank),
             )
 
diff --git a/src/pystencils/sympyextensions/integer_functions.py b/src/pystencils/sympyextensions/integer_functions.py
index 42513ef9c..9d2c69502 100644
--- a/src/pystencils/sympyextensions/integer_functions.py
+++ b/src/pystencils/sympyextensions/integer_functions.py
@@ -140,10 +140,10 @@ class div_ceil(IntegerFunctionTwoArgsMixIn):
 
     @classmethod
     def eval(cls, arg1, arg2):
-        from ..utils import c_intdiv
+        from ..utils import div_ceil
 
         if is_integer_sequence((arg1, arg2)):
-            return c_intdiv(arg1 + arg2 - 1, arg2)
+            return div_ceil(arg1, arg2)
 
     def _eval_op(self, arg1, arg2):
         return self.eval(arg1, arg2)
diff --git a/src/pystencils/utils.py b/src/pystencils/utils.py
index a53eb8289..0049d0a2c 100644
--- a/src/pystencils/utils.py
+++ b/src/pystencils/utils.py
@@ -4,11 +4,13 @@ from itertools import groupby
 from collections import Counter
 from contextlib import contextmanager
 from tempfile import NamedTemporaryFile
-from typing import Mapping
+from typing import Mapping, overload
 
 import numpy as np
 import sympy as sp
 
+from numpy.typing import NDArray
+
 
 class DotDict(dict):
     """Normal dict with additional dot access for all keys"""
@@ -254,6 +256,24 @@ class ContextVar:
         return self.stack[-1]
 
 
+@overload
+def c_intdiv(num: int, denom: int) -> int: ...
+
+
+@overload
+def c_intdiv(
+    num: NDArray[np.integer], denom: NDArray[np.integer]
+) -> NDArray[np.integer]: ...
+
+
+@overload
+def c_intdiv(num: int, denom: NDArray[np.integer]) -> NDArray[np.integer]: ...
+
+
+@overload
+def c_intdiv(num: NDArray[np.integer], denom: int) -> NDArray[np.integer]: ...
+
+
 def c_intdiv(num, denom):
     """C-style integer division"""
     if isinstance(num, np.ndarray) or isinstance(denom, np.ndarray):
@@ -271,3 +291,28 @@ def c_rem(num, denom):
     """C-style integer remainder"""
     div = c_intdiv(num, denom)
     return num - div * denom
+
+
+@overload
+def div_ceil(divident: int, divisor: int) -> int: ...
+
+
+@overload
+def div_ceil(
+    divident: NDArray[np.integer], divisor: NDArray[np.integer]
+) -> NDArray[np.integer]: ...
+
+
+@overload
+def div_ceil(divident: int, divisor: NDArray[np.integer]) -> NDArray[np.integer]: ...
+
+
+@overload
+def div_ceil(divident: NDArray[np.integer], divisor: int) -> NDArray[np.integer]: ...
+
+
+def div_ceil(divident, divisor):
+    """For nonnegative integer arguments, compute ``ceil(num / denom)``.
+    The result is unspecified if either argument is negative."""
+
+    return c_intdiv(divident + divisor - 1, divisor)
-- 
GitLab


From 9ba7bd250e1a72df118d1ae33be1729ce1c6bf17 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Sat, 15 Feb 2025 12:00:12 +0100
Subject: [PATCH 10/17] update GPU user guide; fix some other details in docs

---
 docs/source/contributing/index.md      |   1 +
 docs/source/installation.md            |   4 +-
 docs/source/user_manual/gpu_kernels.md | 101 +++++++++++++++++--------
 3 files changed, 74 insertions(+), 32 deletions(-)

diff --git a/docs/source/contributing/index.md b/docs/source/contributing/index.md
index 56c97509c..8be86cdd4 100644
--- a/docs/source/contributing/index.md
+++ b/docs/source/contributing/index.md
@@ -1,3 +1,4 @@
+(contribution_guide)=
 # Contribution Guide
 
 Welcome to the Contributor's Guide to pystencils!
diff --git a/docs/source/installation.md b/docs/source/installation.md
index cea0acd2f..deb2b0613 100644
--- a/docs/source/installation.md
+++ b/docs/source/installation.md
@@ -38,12 +38,14 @@ The following feature sets are available:
 If you are developing pystencils, we recommend you perform an editable install of your
 local clone of the repository, with all optional features:
 ```bash
-pip install -e pystencils[alltrafos,interactive,use_cython,doc,tests]
+pip install -e pystencils[alltrafos,interactive,use_cython,doc,testsuite]
 ```
 
 This includes the additional feature groups `doc`, which contains all dependencies required
 to build this documentation, and `tests`, which adds `flake8` for code style checking,
 `mypy` for static type checking, and `pytest` plus plugins for running the test suite.
+
+For more information on developing pystencils, see the [](#contribution_guide).
 :::
 
 ### For Nvidia GPUs
diff --git a/docs/source/user_manual/gpu_kernels.md b/docs/source/user_manual/gpu_kernels.md
index d3a491707..2fa7cd056 100644
--- a/docs/source/user_manual/gpu_kernels.md
+++ b/docs/source/user_manual/gpu_kernels.md
@@ -28,7 +28,7 @@ import matplotlib.pyplot as plt
 
 Pystencils offers code generation for Nvidia GPUs using the CUDA programming model,
 as well as just-in-time compilation and execution of CUDA kernels from within Python
-based on the [cupy] library.w
+based on the [cupy] library.
 This section's objective is to give a detailed introduction into the creation of
 GPU kernels with pystencils.
 
@@ -62,7 +62,7 @@ which operates much in the same way that [NumPy][numpy] works on CPU arrays.
 Cupy and NumPy expose nearly the same APIs for array operations;
 the difference being that CuPy allocates all its arrays on the GPU
 and performs its operations as CUDA kernels.
-Also, CuPy exposes a just-in-time-compiler for GPU kernels, which internally calls [nvcc].
+Also, CuPy exposes a just-in-time-compiler for GPU kernels, which internally calls [nvrtc].
 In pystencils, we use CuPy both to compile and provide executable kernels on-demand from within Python code,
 and to allocate and manage the data these kernels can be executed on.
 
@@ -81,35 +81,79 @@ kfunc = kernel.compile()
 kfunc(f=f_arr, g=g_arr)
 ```
 
-### Modifying the Launch Configuration
+(indexing_and_launch_config)=
+## Modify the Indexing Scheme and Launch Configuration
+
+There are two key elements to how the work items of a GPU kernel's iteration space
+are mapped onto a GPU launch grid:
+ - The *indexing scheme* defines the relation between thread indices and iteration space points;
+   it can be modified through the {any}`gpu.indexing_scheme <GpuOptions.indexing_scheme>` option
+   and is fixed for the entire kernel.
+ - The *launch configuration* defines the number of threads per block, and the number of blocks on the grid,
+   with which the kernel should be launched.
+   The launch configuration mostly depends on the size of the arrays passed to the kernel,
+   but parts of it may also be modified.
+   The launch configuration may change at each kernel invocation.
+
+(linear3d)=
+### The Default "Linear3D" Indexing Scheme
+
+By default, *pystencils* will employ a 1:1-mapping between threads and iteration space points
+via the global thread indices inside the launch grid; e.g.
+
+```{code-block} C++
+ctr_0 = start_0 + step_0 * (blockSize.x * blockIdx.x + threadIdx.x);
+ctr_1 = start_1 + step_1 * (blockSize.y * blockIdx.y + threadIdx.y);
+ctr_2 = start_2 + step_2 * (blockSize.z * blockIdx.z + threadIdx.z);
+```
+
+For most kernels with an at most three-dimensional iteration space,
+this behavior is sufficient and desired.
+It can be enforced by setting `gpu.indexing_scheme = "Linear3D"`.
+
+If the `Linear3D` indexing scheme is used, you may modifiy the GPU thread block size in two places.
+The default block size for the kernel can be set via the {any}`gpu.block_size <GpuOptions.block_size>` 
+code generator option;
+if none is specified, a default depending on the iteration space's dimensionality will be used.
 
-The `kernel.compile()` invocation in the above code produces a {any}`CupyKernelWrapper` callable object.
-This object holds the kernel's launch grid configuration
-(i.e. the number of thread blocks, and the number of threads per block.)
-Pystencils specifies a default value for the block size and if possible, 
-the number of blocks is automatically inferred in order to cover the entire iteration space.
-In addition, the wrapper's interface allows us to customize the GPU launch grid,
-by manually setting both the number of threads per block, and the number of blocks on the grid:
+The block size can furthermore be modified at the compiled kernel's wrapper object via the
+`launch_config.block_size` attribute:
 
 ```{code-cell} ipython3
-kfunc.block_size = (16, 8, 8)
-kfunc.num_blocks = (1, 2, 2)
+kfunc = kernel.compile()
+kfunc.launch_config.block_size = (256, 2, 1)
+
+# Run the kernel
+kfunc(f=f_arr, g=g_arr)
 ```
 
-For most kernels, setting only the `block_size` is sufficient since pystencils will
-automatically compute the number of blocks;
-for exceptions to this, see [](#manual_launch_grids).
-If `num_blocks` is set manually and the launch grid thus specified is too small, only
-a part of the iteration space will be traversed by the kernel;
-similarily, if it is too large, it will cause any threads working outside of the iteration bounds to idle.
+In any case. pystencils will automatically compute the grid size from the shapes of the kernel's array arguments
+and the given thread block size.
 
 (manual_launch_grids)=
 ### Manual Launch Grids and Non-Cuboid Iteration Patterns
 
-In some cases, it will be unavoidable to set the launch grid size manually;
-especially if the code generator is unable to automatically determine the size of the
-iteration space.
-An example for this is the triangular iteration previously described in the [Kernel Creation Guide](#example_triangular_iteration).
+By default, the above indexing schemes will automatically compute the GPU launch configuration
+from array shapes and optional user input.
+However, it is also possible to override this behavior and instead specify a launch grid manually.
+This will even be unavoidable if the code generator cannot precompute the number of points
+in the kernel's iteration space.
+
+To specify a manual launch configuration, set the {any}`gpu.manual_launch_grid <GpuOptions.manual_launch_grid>`
+option to `True`.
+Then, after compiling the kernel, set its block and grid size via the `launch_config` property:
+
+```{code-cell} ipython3
+cfg.gpu.manual_launch_grid = True
+
+kernel = ps.create_kernel(update, cfg)
+kfunc = kernel.compile()
+kfunc.launch_config.block_size = (64, 2, 1)
+kfunc.launch_config.grid_size = (4, 2, 1)
+```
+
+An example where this is necessary is the triangular iteration
+previously described in the [Kernel Creation Guide](#example_triangular_iteration).
 Let's set it up once more:
 
 ```{code-cell} ipython3
@@ -149,19 +193,14 @@ cfg.target= ps.Target.CUDA
 cfg.iteration_slice = ps.make_slice[:, y:]
 ```
 
-In this case, it is necessary to set the `gpu.manual_launch_grid` option to `True`;
-otherwise, code generation will fail as the code generator cannot figure out
-a GPU grid size on its own:
+For this kernel, the code generator cannot figure out a launch configuration on its own,
+so we need to manually provide one:
 
 ```{code-cell} ipython3
 cfg.gpu.manual_launch_grid = True
     
 kernel = ps.create_kernel(assignments, cfg).compile()
-```
 
-Now, to execute our kernel, we have to manually specify its launch grid:
-
-```{code-cell} ipython3
 kernel.launch_config.block_size = (8, 8)
 kernel.launch_config.grid_size = (2, 2)
 ```
@@ -175,7 +214,7 @@ kernel(f=f_arr)
 _draw_ispace(cp.asnumpy(f_arr))
 ```
 
-We can also observe the effect of decreasing the launch grid size:
+We can also observe the effect of decreasing the launch grid size.
 
 ```{code-cell} ipython3
 kernel.launch_config.block_size = (4, 4)
@@ -203,5 +242,5 @@ only a part of the triangle is being processed.
 
 [cupy]: https://cupy.dev "CuPy Homepage"
 [numpy]: https://numpy.org "NumPy Homepage"
-[nvcc]: https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html "NVIDIA CUDA Compiler Driver"
+[nvrtc]: https://docs.nvidia.com/cuda/nvrtc/index.html "NVIDIA Runtime Compilation Library"
 [cupy-docs]: https://docs.cupy.dev/en/stable/overview.html "CuPy Documentation"
-- 
GitLab


From c0f48e192695fdc4dd3c181243f378ae7e88020c Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Mon, 17 Feb 2025 09:25:41 +0100
Subject: [PATCH 11/17] fix param collection in cupy JIT; fix Blockwise4D
 indexing order

---
 conftest.py                              |   1 -
 src/pystencils/backend/platforms/cuda.py |  12 +--
 src/pystencils/codegen/gpu_indexing.py   |   4 +-
 src/pystencils/jit/gpu_cupy.py           |  50 +++++----
 tests/kernelcreation/test_gpu.py         | 132 +++++++++++++++--------
 5 files changed, 121 insertions(+), 78 deletions(-)

diff --git a/conftest.py b/conftest.py
index 8296641ed..ff0467eff 100644
--- a/conftest.py
+++ b/conftest.py
@@ -55,7 +55,6 @@ try:
     import cupy
 except ImportError:
     collect_ignore += [
-        os.path.join(SCRIPT_FOLDER, "tests/kernelcreation/test_gpu.py"),
         os.path.join(SCRIPT_FOLDER, "src/pystencils/backend/jit/gpu_cupy.py"),
     ]
     add_path_to_ignore("src/pystencils/gpu")
diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index 537c92db1..5f4995a02 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -113,11 +113,11 @@ class Blockwise4DMapping(ThreadToIndexMapping):
     """Blockwise index mapping for up to 4D iteration spaces, where the outer three dimensions
     are mapped to block indices."""
 
-    _indices_in_loop_order = [  # slowest to fastest
-        BLOCK_IDX[2],
-        BLOCK_IDX[1],
-        BLOCK_IDX[0],
+    _indices_fastest_first = [  # slowest to fastest
         THREAD_IDX[0],
+        BLOCK_IDX[0],
+        BLOCK_IDX[1],
+        BLOCK_IDX[2]
     ]
 
     def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]:
@@ -141,7 +141,7 @@ class Blockwise4DMapping(ThreadToIndexMapping):
         dimensions = ispace.dimensions_in_loop_order()
         idx_map: dict[PsSymbol, PsExpression] = dict()
 
-        for dim, tid in zip(dimensions, self._indices_in_loop_order):
+        for dim, tid in zip(dimensions[::-1], self._indices_fastest_first):
             idx_map[dim.counter] = dim.start + dim.step * PsCast(
                 deconstify(dim.counter.get_dtype()), tid
             )
@@ -152,7 +152,7 @@ class Blockwise4DMapping(ThreadToIndexMapping):
         self, ispace: SparseIterationSpace
     ) -> dict[PsSymbol, PsExpression]:
         sparse_ctr = PsExpression.make(ispace.sparse_counter)
-        thread_idx = self._indices_in_loop_order[-1]
+        thread_idx = self._indices_fastest_first[0]
         idx_map: dict[PsSymbol, PsExpression] = {
             ispace.sparse_counter: PsCast(
                 deconstify(sparse_ctr.get_dtype()), thread_idx
diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index d32e4112a..13549f734 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -270,7 +270,7 @@ class GpuIndexing(ABC):
     def _get_blockwise4d_config_factory(
         self,
     ) -> Callable[[], AutomaticLaunchConfiguration]:
-        work_items = self._get_work_items()[::-1]  # Want this ordered fastest first
+        work_items = self._get_work_items()
         rank = len(work_items)
 
         if rank > 4:
@@ -300,7 +300,7 @@ class GpuIndexing(ABC):
     def _get_work_items(self) -> tuple[PsExpression, ...]:
         """Return a tuple of expressions representing the number of work items
         in each dimension of the kernel's iteration space,
-        ordered from slowest to fastest dimension.
+        ordered from fastest to slowest dimension.
         """
         ispace = self._ctx.get_iteration_space()
         match ispace:
diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py
index 42d9a685f..1c771a427 100644
--- a/src/pystencils/jit/gpu_cupy.py
+++ b/src/pystencils/jit/gpu_cupy.py
@@ -1,4 +1,4 @@
-from typing import Any, Sequence, cast
+from typing import Any, Sequence, cast, Callable
 from dataclasses import dataclass
 
 try:
@@ -82,13 +82,16 @@ class CupyKernelWrapper(KernelWrapper):
         kernel_args = []
         valuation: dict[str, Any] = dict()
 
-        def add_arg(name: str, arg: Any, dtype: PsType):
-            nptype = dtype.numpy_dtype
+        def add_arg(param: Parameter, arg: Any):
+            nptype = param.dtype.numpy_dtype
             assert nptype is not None
             typecast = nptype.type
             arg = typecast(arg)
+            valuation[param.name] = arg
+
+        def add_kernel_arg(param: Parameter, arg: Any):
+            add_arg(param, arg)
             kernel_args.append(arg)
-            valuation[name] = arg
 
         field_shapes = set()
         index_shapes = set()
@@ -141,12 +144,13 @@ class CupyKernelWrapper(KernelWrapper):
                         )
 
         #   Collect parameter values
-        arr: cp.ndarray
 
-        for kparam in self._kfunc.parameters:
-            if kparam.is_field_parameter:
+        def process_param(param: Parameter, adder: Callable[[Parameter, Any], None]):
+            arr: cp.ndarray
+
+            if param.is_field_parameter:
                 #   Determine field-associated data to pass in
-                for prop in kparam.properties:
+                for prop in param.properties:
                     match prop:
                         case FieldBasePtr(field):
 
@@ -155,8 +159,8 @@ class CupyKernelWrapper(KernelWrapper):
                             from .. import DynamicType
 
                             if isinstance(field.dtype, DynamicType):
-                                assert isinstance(kparam.dtype, PsPointerType)
-                                elem_dtype = kparam.dtype.base_type
+                                assert isinstance(param.dtype, PsPointerType)
+                                elem_dtype = param.dtype.base_type
                             else:
                                 elem_dtype = field.dtype
 
@@ -166,41 +170,35 @@ class CupyKernelWrapper(KernelWrapper):
                                     f"Data type mismatch at array argument {field.name}:"
                                     f"Expected {field.dtype}, got {arr.dtype}"
                                 )
-                            check_shape(kparam, arr)
+                            check_shape(param, arr)
                             kernel_args.append(arr)
                             break
 
                         case FieldShape(field, coord):
                             arr = kwargs[field.name]
-                            add_arg(kparam.name, arr.shape[coord], kparam.dtype)
+                            adder(param, arr.shape[coord])
                             break
 
                         case FieldStride(field, coord):
                             arr = kwargs[field.name]
-                            add_arg(
-                                kparam.name,
+                            adder(
+                                param,
                                 arr.strides[coord] // arr.dtype.itemsize,
-                                kparam.dtype,
                             )
                             break
             else:
                 #   scalar parameter
-                val: Any = kwargs[kparam.name]
-                add_arg(kparam.name, val, kparam.dtype)
+                val: Any = kwargs[param.name]
+                adder(param, val)
 
-        #   Determine launch grid
+        #   Process Arguments
 
-        def add_launch_config_arg(name: str, arg: Any, dtype: PsType):
-            nptype = dtype.numpy_dtype
-            assert nptype is not None
-            typecast = nptype.type
-            arg = typecast(arg)
-            valuation[name] = arg
+        for kparam in self._kfunc.parameters:
+            process_param(kparam, add_kernel_arg)
 
         for cparam in self._launch_config.parameters:
             if cparam.name not in valuation:
-                val = kwargs[cparam.name]
-                add_launch_config_arg(cparam.name, val, cparam.dtype)
+                process_param(cparam, add_arg)
 
         block_size, grid_size = self._launch_config.evaluate(**valuation)
 
diff --git a/tests/kernelcreation/test_gpu.py b/tests/kernelcreation/test_gpu.py
index 97f0c0fa9..621e4c251 100644
--- a/tests/kernelcreation/test_gpu.py
+++ b/tests/kernelcreation/test_gpu.py
@@ -4,16 +4,30 @@ import numpy as np
 import sympy as sp
 from scipy.ndimage import convolve
 
-from pystencils import Assignment, Field, fields, CreateKernelConfig, create_kernel, Target
+from pystencils import (
+    Assignment,
+    Field,
+    fields,
+    CreateKernelConfig,
+    create_kernel,
+    Target,
+)
+
 # from pystencils.gpu import BlockIndexing
 from pystencils.simp import sympy_cse_on_assignment_list
-from pystencils.slicing import add_ghost_layers, make_slice, remove_ghost_layers, normalize_slice
+from pystencils.slicing import (
+    add_ghost_layers,
+    make_slice,
+    remove_ghost_layers,
+    normalize_slice,
+)
 
 try:
     import cupy as cp
+
     device_numbers = range(cp.cuda.runtime.getDeviceCount())
 except ImportError:
-    device_numbers = []
+    pytest.skip(reason="CuPy is not available", allow_module_level=True)
 
 
 def test_averaging_kernel():
@@ -21,11 +35,13 @@ def test_averaging_kernel():
     src_arr = np.random.rand(*size)
     src_arr = add_ghost_layers(src_arr)
     dst_arr = np.zeros_like(src_arr)
-    src_field = Field.create_from_numpy_array('src', src_arr)
-    dst_field = Field.create_from_numpy_array('dst', dst_arr)
+    src_field = Field.create_from_numpy_array("src", src_arr)
+    dst_field = Field.create_from_numpy_array("dst", dst_arr)
 
-    update_rule = Assignment(dst_field[0, 0],
-                             (src_field[0, 1] + src_field[0, -1] + src_field[1, 0] + src_field[-1, 0]) / 4)
+    update_rule = Assignment(
+        dst_field[0, 0],
+        (src_field[0, 1] + src_field[0, -1] + src_field[1, 0] + src_field[-1, 0]) / 4,
+    )
 
     config = CreateKernelConfig(target=Target.GPU)
     ast = create_kernel(sympy_cse_on_assignment_list([update_rule]), config=config)
@@ -37,17 +53,21 @@ def test_averaging_kernel():
     dst_arr = gpu_dst_arr.get()
 
     stencil = np.array([[0, 1, 0], [1, 0, 1], [0, 1, 0]]) / 4.0
-    reference = convolve(remove_ghost_layers(src_arr), stencil, mode='constant', cval=0.0)
+    reference = convolve(
+        remove_ghost_layers(src_arr), stencil, mode="constant", cval=0.0
+    )
     reference = add_ghost_layers(reference)
     np.testing.assert_almost_equal(reference, dst_arr)
 
 
 def test_variable_sized_fields():
-    src_field = Field.create_generic('src', spatial_dimensions=2)
-    dst_field = Field.create_generic('dst', spatial_dimensions=2)
+    src_field = Field.create_generic("src", spatial_dimensions=2)
+    dst_field = Field.create_generic("dst", spatial_dimensions=2)
 
-    update_rule = Assignment(dst_field[0, 0],
-                             (src_field[0, 1] + src_field[0, -1] + src_field[1, 0] + src_field[-1, 0]) / 4)
+    update_rule = Assignment(
+        dst_field[0, 0],
+        (src_field[0, 1] + src_field[0, -1] + src_field[1, 0] + src_field[-1, 0]) / 4,
+    )
 
     config = CreateKernelConfig(target=Target.GPU)
     ast = create_kernel(sympy_cse_on_assignment_list([update_rule]), config=config)
@@ -64,7 +84,9 @@ def test_variable_sized_fields():
     dst_arr = gpu_dst_arr.get()
 
     stencil = np.array([[0, 1, 0], [1, 0, 1], [0, 1, 0]]) / 4.0
-    reference = convolve(remove_ghost_layers(src_arr), stencil, mode='constant', cval=0.0)
+    reference = convolve(
+        remove_ghost_layers(src_arr), stencil, mode="constant", cval=0.0
+    )
     reference = add_ghost_layers(reference)
     np.testing.assert_almost_equal(reference, dst_arr)
 
@@ -76,12 +98,14 @@ def test_multiple_index_dimensions():
     src_arr = np.array(np.random.rand(*src_size))
     dst_arr = np.zeros(dst_size)
 
-    src_field = Field.create_from_numpy_array('src', src_arr, index_dimensions=1)
-    dst_field = Field.create_from_numpy_array('dst', dst_arr, index_dimensions=0)
+    src_field = Field.create_from_numpy_array("src", src_arr, index_dimensions=1)
+    dst_field = Field.create_from_numpy_array("dst", dst_arr, index_dimensions=0)
 
     offset = (-2, -1)
-    update_rule = Assignment(dst_field[0, 0],
-                             sum([src_field[offset[0], offset[1]](i) for i in range(src_size[-1])]))
+    update_rule = Assignment(
+        dst_field[0, 0],
+        sum([src_field[offset[0], offset[1]](i) for i in range(src_size[-1])]),
+    )
 
     config = CreateKernelConfig(target=Target.GPU)
     ast = create_kernel([update_rule], config=config)
@@ -94,25 +118,30 @@ def test_multiple_index_dimensions():
 
     reference = np.zeros_like(dst_arr)
     gl = np.max(np.abs(np.array(offset, dtype=int)))
-    for x in range(gl, src_size[0]-gl):
-        for y in range(gl, src_size[1]-gl):
-            reference[x, y] = sum([src_arr[x+offset[0], y+offset[1], i] for i in range(src_size[2])])
+    for x in range(gl, src_size[0] - gl):
+        for y in range(gl, src_size[1] - gl):
+            reference[x, y] = sum(
+                [src_arr[x + offset[0], y + offset[1], i] for i in range(src_size[2])]
+            )
 
     np.testing.assert_almost_equal(reference, dst_arr)
 
 
-@pytest.mark.xfail(reason="Line indexing not available yet")
 def test_ghost_layer():
     size = (6, 5)
     src_arr = np.ones(size)
     dst_arr = np.zeros_like(src_arr)
-    src_field = Field.create_from_numpy_array('src', src_arr, index_dimensions=0)
-    dst_field = Field.create_from_numpy_array('dst', dst_arr, index_dimensions=0)
+    src_field = Field.create_from_numpy_array("src", src_arr, index_dimensions=0)
+    dst_field = Field.create_from_numpy_array("dst", dst_arr, index_dimensions=0)
 
     update_rule = Assignment(dst_field[0, 0], src_field[0, 0])
     ghost_layers = [(1, 2), (2, 1)]
 
-    config = CreateKernelConfig(target=Target.GPU, ghost_layers=ghost_layers, gpu="line")
+    config = CreateKernelConfig()
+    config.target = Target.CUDA
+    config.ghost_layers = ghost_layers
+    config.gpu.indexing_scheme = "blockwise4d"
+
     ast = create_kernel(sympy_cse_on_assignment_list([update_rule]), config=config)
     kernel = ast.compile()
 
@@ -122,11 +151,13 @@ def test_ghost_layer():
     dst_arr = gpu_dst_arr.get()
 
     reference = np.zeros_like(src_arr)
-    reference[ghost_layers[0][0]:-ghost_layers[0][1], ghost_layers[1][0]:-ghost_layers[1][1]] = 1
+    reference[
+        ghost_layers[0][0] : -ghost_layers[0][1],
+        ghost_layers[1][0] : -ghost_layers[1][1],
+    ] = 1
     np.testing.assert_equal(reference, dst_arr)
 
 
-@pytest.mark.xfail(reason="Line indexing not available yet")
 def test_setting_value():
     arr_cpu = np.arange(25, dtype=np.float64).reshape(5, 5)
     arr_gpu = cp.asarray(arr_cpu)
@@ -135,7 +166,11 @@ def test_setting_value():
     f = Field.create_generic("f", 2)
     update_rule = [Assignment(f(0), sp.Symbol("value"))]
 
-    config = CreateKernelConfig(target=Target.GPU, gpu="line", iteration_slice=iteration_slice)
+    config = CreateKernelConfig()
+    config.target = Target.CUDA
+    config.iteration_slice = iteration_slice
+    config.gpu.indexing_scheme = "blockwise4d"
+    
     ast = create_kernel(sympy_cse_on_assignment_list(update_rule), config=config)
     kernel = ast.compile()
 
@@ -167,21 +202,30 @@ def test_periodicity():
 def test_block_indexing(device_number):
     f = fields("f: [3D]")
     s = normalize_slice(make_slice[:, :, :], f.spatial_shape)
-    bi = BlockIndexing(s, f.layout, block_size=(16, 8, 2),
-                       permute_block_size_dependent_on_layout=False)
-    assert bi.call_parameters((3, 2, 32))['block'] == (3, 2, 32)
-    assert bi.call_parameters((32, 2, 32))['block'] == (16, 2, 8)
-
-    bi = BlockIndexing(s, f.layout, block_size=(32, 1, 1),
-                       permute_block_size_dependent_on_layout=False)
-    assert bi.call_parameters((1, 16, 16))['block'] == (1, 16, 2)
-
-    bi = BlockIndexing(s, f.layout, block_size=(16, 8, 2),
-                       maximum_block_size="auto", device_number=device_number)
+    bi = BlockIndexing(
+        s, f.layout, block_size=(16, 8, 2), permute_block_size_dependent_on_layout=False
+    )
+    assert bi.call_parameters((3, 2, 32))["block"] == (3, 2, 32)
+    assert bi.call_parameters((32, 2, 32))["block"] == (16, 2, 8)
+
+    bi = BlockIndexing(
+        s, f.layout, block_size=(32, 1, 1), permute_block_size_dependent_on_layout=False
+    )
+    assert bi.call_parameters((1, 16, 16))["block"] == (1, 16, 2)
+
+    bi = BlockIndexing(
+        s,
+        f.layout,
+        block_size=(16, 8, 2),
+        maximum_block_size="auto",
+        device_number=device_number,
+    )
 
     # This function should be used if number of needed registers is known. Can be determined with func.num_regs
     registers_per_thread = 1000
-    blocks = bi.limit_block_size_by_register_restriction([1024, 1024, 1], registers_per_thread)
+    blocks = bi.limit_block_size_by_register_restriction(
+        [1024, 1024, 1], registers_per_thread
+    )
 
     if cp.cuda.runtime.is_hip:
         max_registers_per_block = cp.cuda.runtime.deviceGetAttribute(71, device_number)
@@ -193,9 +237,9 @@ def test_block_indexing(device_number):
     assert np.prod(blocks) * registers_per_thread < max_registers_per_block
 
 
-@pytest.mark.parametrize('gpu_indexing', ("block", "line"))
-@pytest.mark.parametrize('layout', ("C", "F"))
-@pytest.mark.parametrize('shape', ((5, 5, 5, 5), (3, 17, 387, 4), (23, 44, 21, 11)))
+@pytest.mark.parametrize("gpu_indexing", ("block", "line"))
+@pytest.mark.parametrize("layout", ("C", "F"))
+@pytest.mark.parametrize("shape", ((5, 5, 5, 5), (3, 17, 387, 4), (23, 44, 21, 11)))
 @pytest.mark.xfail(reason="4D kernels not available yet")
 def test_four_dimensional_kernel(gpu_indexing, layout, shape):
     n_elements = np.prod(shape)
@@ -207,7 +251,9 @@ def test_four_dimensional_kernel(gpu_indexing, layout, shape):
     f = Field.create_from_numpy_array("f", arr_cpu)
     update_rule = [Assignment(f.center, sp.Symbol("value"))]
 
-    config = CreateKernelConfig(target=Target.GPU, gpu=gpu_indexing, iteration_slice=iteration_slice)
+    config = CreateKernelConfig(
+        target=Target.GPU, gpu=gpu_indexing, iteration_slice=iteration_slice
+    )
     ast = create_kernel(update_rule, config=config)
     kernel = ast.compile()
 
-- 
GitLab


From 727939605e85b2c981e3fd37c8441142db7272e1 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Mon, 17 Feb 2025 10:20:24 +0100
Subject: [PATCH 12/17] add all-round test case for indexing options

---
 pytest.ini                                  |  1 +
 tests/kernelcreation/test_domain_kernels.py | 18 +-----
 tests/kernelcreation/test_gpu.py            | 65 ++++++++++++++++++---
 3 files changed, 61 insertions(+), 23 deletions(-)

diff --git a/pytest.ini b/pytest.ini
index 707a43b45..744a74bc7 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -64,6 +64,7 @@ exclude_lines =
        if 0:
        if False:
        if __name__ == .__main__.:
+       assert False
 
        # Don't cover type checking imports
        if TYPE_CHECKING:
diff --git a/tests/kernelcreation/test_domain_kernels.py b/tests/kernelcreation/test_domain_kernels.py
index da261faec..0d71dbe1a 100644
--- a/tests/kernelcreation/test_domain_kernels.py
+++ b/tests/kernelcreation/test_domain_kernels.py
@@ -32,14 +32,7 @@ def inspect_dp_kernel(kernel: Kernel, gen_config: CreateKernelConfig):
             assert "_mm512_storeu_pd" in code
 
 
-def test_filter_kernel(gen_config):
-    if gen_config.target == Target.CUDA:
-        import cupy as cp
-
-        xp = cp
-    else:
-        xp = np
-
+def test_filter_kernel(gen_config, xp):
     weight = sp.Symbol("weight")
     stencil = [[1, 1, 1], [1, 1, 1], [1, 1, 1]]
 
@@ -62,14 +55,7 @@ def test_filter_kernel(gen_config):
     xp.testing.assert_allclose(dst_arr, expected)
 
 
-def test_filter_kernel_fixedsize(gen_config):
-    if gen_config.target == Target.CUDA:
-        import cupy as cp
-
-        xp = cp
-    else:
-        xp = np
-
+def test_filter_kernel_fixedsize(gen_config, xp):
     weight = sp.Symbol("weight")
     stencil = [[1, 1, 1], [1, 1, 1], [1, 1, 1]]
 
diff --git a/tests/kernelcreation/test_gpu.py b/tests/kernelcreation/test_gpu.py
index 621e4c251..d80647fb6 100644
--- a/tests/kernelcreation/test_gpu.py
+++ b/tests/kernelcreation/test_gpu.py
@@ -11,10 +11,9 @@ from pystencils import (
     CreateKernelConfig,
     create_kernel,
     Target,
+    assignment_from_stencil,
 )
 
-# from pystencils.gpu import BlockIndexing
-from pystencils.simp import sympy_cse_on_assignment_list
 from pystencils.slicing import (
     add_ghost_layers,
     make_slice,
@@ -30,6 +29,58 @@ except ImportError:
     pytest.skip(reason="CuPy is not available", allow_module_level=True)
 
 
+@pytest.mark.parametrize("indexing_scheme", ["linear3d", "blockwise4d"])
+@pytest.mark.parametrize("omit_range_check", [False, True])
+@pytest.mark.parametrize("manual_grid", [False, True])
+def test_indexing_options(
+    indexing_scheme: str, omit_range_check: bool, manual_grid: bool
+):
+    src, dst = fields("src, dst: [3D]")
+    asm = Assignment(
+        dst.center(),
+        src[-1, 0, 0]
+        + src[1, 0, 0]
+        + src[0, -1, 0]
+        + src[0, 1, 0]
+        + src[0, 0, -1]
+        + src[0, 0, 1],
+    )
+
+    cfg = CreateKernelConfig(target=Target.CUDA)
+    cfg.gpu.indexing_scheme = indexing_scheme
+    cfg.gpu.omit_range_check = omit_range_check
+    cfg.gpu.manual_launch_grid = manual_grid
+
+    ast = create_kernel(asm, cfg)
+    kernel = ast.compile()
+
+    src_arr = cp.ones((18, 34, 42))
+    dst_arr = cp.zeros_like(src_arr)
+
+    if manual_grid:
+        match indexing_scheme:
+            case "linear3d":
+                kernel.launch_config.block_size = (10, 8, 8)
+                kernel.launch_config.grid_size = (4, 4, 2)
+            case "blockwise4d":
+                kernel.launch_config.block_size = (40, 1, 1)
+                kernel.launch_config.grid_size = (32, 16, 1)
+
+    elif indexing_scheme == "linear3d":
+        kernel.launch_config.block_size = (
+            10,
+            8,
+            8,
+        )  # must fit the src_arr shape (without ghost layers)
+
+    kernel(src=src_arr, dst=dst_arr)
+
+    expected = cp.zeros_like(src_arr)
+    expected[1:-1, 1:-1, 1:-1].fill(6.0)
+
+    cp.testing.assert_allclose(dst_arr, expected)
+
+
 def test_averaging_kernel():
     size = (40, 55)
     src_arr = np.random.rand(*size)
@@ -44,7 +95,7 @@ def test_averaging_kernel():
     )
 
     config = CreateKernelConfig(target=Target.GPU)
-    ast = create_kernel(sympy_cse_on_assignment_list([update_rule]), config=config)
+    ast = create_kernel(update_rule, config=config)
     kernel = ast.compile()
 
     gpu_src_arr = cp.asarray(src_arr)
@@ -70,7 +121,7 @@ def test_variable_sized_fields():
     )
 
     config = CreateKernelConfig(target=Target.GPU)
-    ast = create_kernel(sympy_cse_on_assignment_list([update_rule]), config=config)
+    ast = create_kernel(update_rule, config=config)
     kernel = ast.compile()
 
     size = (3, 3)
@@ -142,7 +193,7 @@ def test_ghost_layer():
     config.ghost_layers = ghost_layers
     config.gpu.indexing_scheme = "blockwise4d"
 
-    ast = create_kernel(sympy_cse_on_assignment_list([update_rule]), config=config)
+    ast = create_kernel(update_rule, config=config)
     kernel = ast.compile()
 
     gpu_src_arr = cp.asarray(src_arr)
@@ -170,8 +221,8 @@ def test_setting_value():
     config.target = Target.CUDA
     config.iteration_slice = iteration_slice
     config.gpu.indexing_scheme = "blockwise4d"
-    
-    ast = create_kernel(sympy_cse_on_assignment_list(update_rule), config=config)
+
+    ast = create_kernel(update_rule, config=config)
     kernel = ast.compile()
 
     kernel(f=arr_gpu, value=np.float64(42.0))
-- 
GitLab


From 6b3f5288dcdc074325667d40682b5e2610962348 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Mon, 17 Feb 2025 11:47:40 +0100
Subject: [PATCH 13/17] small extension to the user guide

---
 docs/source/user_manual/gpu_kernels.md | 45 ++++++++++++++++++--------
 tests/kernelcreation/test_gpu.py       |  6 +---
 2 files changed, 33 insertions(+), 18 deletions(-)

diff --git a/docs/source/user_manual/gpu_kernels.md b/docs/source/user_manual/gpu_kernels.md
index 2fa7cd056..610c61ddf 100644
--- a/docs/source/user_manual/gpu_kernels.md
+++ b/docs/source/user_manual/gpu_kernels.md
@@ -54,7 +54,19 @@ It extends {py:class}`Kernel` with some GPU-specific information.
 
 If a GPU is available and [CuPy][cupy] is installed in the current environment,
 the kernel can be compiled and run immediately.
-To execute the kernel, a {any}`cupy.ndarray` has to be passed for each field.
+To execute the kernel, a {any}`cupy.ndarray` has to be passed for each field:
+
+```{code-cell} ipython3
+:tags: [raises-exception]
+import cupy as cp
+
+rng = cp.random.default_rng(seed=42)
+f_arr = rng.random((16, 16, 16))
+g_arr = cp.zeros_like(f_arr)
+
+kfunc = kernel.compile()
+kfunc(f=f_arr, g=g_arr)
+```
 
 :::{note}
 [CuPy][cupy] is a Python library for numerical computations on GPU arrays,
@@ -69,18 +81,6 @@ and to allocate and manage the data these kernels can be executed on.
 For more information on CuPy, refer to [their documentation][cupy-docs].
 :::
 
-```{code-cell} ipython3
-:tags: [raises-exception]
-import cupy as cp
-
-rng = cp.random.default_rng(seed=42)
-f_arr = rng.random((16, 16, 16))
-g_arr = cp.zeros_like(f_arr)
-
-kfunc = kernel.compile()
-kfunc(f=f_arr, g=g_arr)
-```
-
 (indexing_and_launch_config)=
 ## Modify the Indexing Scheme and Launch Configuration
 
@@ -130,6 +130,25 @@ kfunc(f=f_arr, g=g_arr)
 In any case. pystencils will automatically compute the grid size from the shapes of the kernel's array arguments
 and the given thread block size.
 
+:::{attention}
+
+According to the way GPU architecture splits thread blocks into warps,
+pystencils will map the kernel's *fastest* spatial coordinate onto the `x` block and thread
+indices, the second-fastest to `y`, and the slowest coordiante to `z`.
+
+This can mean that, when using `cupy` arrays with the default memory layout
+(corresponding to the `"numpy"` field layout specifier),
+the *thread coordinates* and the *spatial coordinates*
+map to each other in *opposite order*; e.g.
+
+| Spatial Coordinate | Thread Index  |
+|--------------------|---------------|
+| `x` (slowest)      | `threadIdx.z` |
+| `y`                | `threadIdx.y` |
+| `z` (fastest)      | `threadIdx.x` |
+
+:::
+
 (manual_launch_grids)=
 ### Manual Launch Grids and Non-Cuboid Iteration Patterns
 
diff --git a/tests/kernelcreation/test_gpu.py b/tests/kernelcreation/test_gpu.py
index d80647fb6..75239c9b1 100644
--- a/tests/kernelcreation/test_gpu.py
+++ b/tests/kernelcreation/test_gpu.py
@@ -67,11 +67,7 @@ def test_indexing_options(
                 kernel.launch_config.grid_size = (32, 16, 1)
 
     elif indexing_scheme == "linear3d":
-        kernel.launch_config.block_size = (
-            10,
-            8,
-            8,
-        )  # must fit the src_arr shape (without ghost layers)
+        kernel.launch_config.block_size = (10, 8, 8)
 
     kernel(src=src_arr, dst=dst_arr)
 
-- 
GitLab


From 2116e907dc0a8480cb42485b77fa653d2f19bd92 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Tue, 18 Feb 2025 10:06:20 +0100
Subject: [PATCH 14/17] Add design document on CUDA codegen to the docs

---
 .gitlab-ci.yml                                |  1 +
 docs/source/backend/gpu_codegen.md            | 92 +++++++++++++++++++
 docs/source/backend/index.rst                 |  1 +
 docs/source/backend/platforms.md              | 54 +++++++++++
 docs/source/backend/platforms.rst             |  6 --
 src/pystencils/backend/platforms/cuda.py      | 17 +++-
 .../backend/platforms/generic_gpu.py          |  9 +-
 src/pystencils/backend/platforms/platform.py  | 10 +-
 src/pystencils/codegen/gpu_indexing.py        | 36 ++++++--
 9 files changed, 197 insertions(+), 29 deletions(-)
 create mode 100644 docs/source/backend/gpu_codegen.md
 create mode 100644 docs/source/backend/platforms.md
 delete mode 100644 docs/source/backend/platforms.rst

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 474afbb11..a2ec00d16 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -337,6 +337,7 @@ build-documentation:
   artifacts:
     paths:
       - docs/build/html
+    when: always
 
 
 pages:
diff --git a/docs/source/backend/gpu_codegen.md b/docs/source/backend/gpu_codegen.md
new file mode 100644
index 000000000..2e3709b24
--- /dev/null
+++ b/docs/source/backend/gpu_codegen.md
@@ -0,0 +1,92 @@
+# GPU Code Generation
+
+The code generation infrastructure for Nvidia and AMD GPUs using CUDA and HIP comprises the following components:
+
+ - The {any}`CudaPlatform` at `backend.platforms` which performs materialization of a kernel's iteration
+   space by mapping GPU block and thread indices to iteration space points. To perform this task,
+   it depends on a {any}`ThreadMapping` instance which defines the nature of that mapping.
+   The platform also takes care of lowering mathematical functions to their CUDA runtime library implementation.
+ - In the code generation driver, the strings are drawn by the `GpuIndexing` helper class.
+   It provides both the {any}`ThreadMapping` for the codegen backend, as well as the launch configuration
+   for the runtime system.
+
+:::{attention}
+
+Code generation for HIP through the `CudaPlatform` is experimental and not tested at the moment.
+:::
+
+## The CUDA Platform and Thread Mappings
+
+```{eval-rst}
+.. module:: pystencils.backend.platforms.cuda
+
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+    :template: autosummary/entire_class.rst
+
+    ThreadMapping
+    Linear3DMapping
+    Blockwise4DMapping
+```
+
+## Thread Indexing In The Driver
+
+With regard to GPU thread indexing, the code generation driver has two tasks:
+it must provide the Cuda platform object with a valid thread mapping,
+and must also provide the runtime system with a [launch configuration](#gpu_launch_config)
+which defines the shape of the GPU block grid.
+Both of these are produced by the {any}`GpuIndexing` class.
+It is instantiated with the GPU indexing scheme and indexing options given by the user.
+
+At this time, the backend and code generation driver support two indexing schemes:
+"Linear3D" (see {any}`Linear3DMapping`) and "Blockwise4D" (see {any}`Blockwise4DMapping`).
+These are mostly reimplemented from the pystencils 1.3.x `"block"` and `"line"` indexing options.
+The GPU indexing system may be extended in the future.
+
+
+```{eval-rst}
+.. module:: pystencils.codegen.gpu_indexing
+
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+    :template: autosummary/entire_class.rst
+
+    GpuIndexing
+```
+
+(gpu_launch_config)=
+## The Launch Configuration
+
+The launch configuration is attached to the `GpuKernel` and thus returned to the runtime system.
+Since a concrete launch configuration is not specific to the kernel itself, but to the kernels'
+invocation site, the code generator only attaches a *factory function* for launch configurations
+to `GpuKernel`. It is up to the runtime system to locally instantiate and configure a launch configuration.
+To determine the actual launch grid, the launch configuration must be evaluated at the kernel's call site
+by passing the required parameters to `GpuLaunchConfiguration.evaluate`
+
+The {any}`CupyJit`, for instance, will create the launch configuration object while preparing the JIT-compiled
+kernel wrapper object. The launch config is there exposed to the user, who may modify some of its properties.
+These depend on the type of the launch configuration:
+while the `AutomaticLaunchConfiguration` permits no modification and computes grid and block size directly from kernel
+parameters,
+the `ManualLaunchConfiguration` requires the user to manually specifiy both grid and block size.
+
+The `evaluate` method can only be used from within a Python runtime environment.
+When exporting pystencils CUDA kernels for external use in C++ projects,
+equivalent C++ code evaluating the launch config must be generated.
+This is the task of, e..g., [pystencils-sfg](https://pycodegen.pages.i10git.cs.fau.de/pystencils-sfg/).
+
+
+```{eval-rst}
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+    :template: autosummary/entire_class.rst
+
+    GpuLaunchConfiguration
+    AutomaticLaunchConfiguration
+    ManualLaunchConfiguration
+    DynamicBlockSizeLaunchConfiguration
+```
diff --git a/docs/source/backend/index.rst b/docs/source/backend/index.rst
index 5ab8dbd34..0d384c55b 100644
--- a/docs/source/backend/index.rst
+++ b/docs/source/backend/index.rst
@@ -17,6 +17,7 @@ who wish to customize or extend the behaviour of the code generator in their app
     translation
     platforms
     transformations
+    gpu_codegen
     errors
     extensions
 
diff --git a/docs/source/backend/platforms.md b/docs/source/backend/platforms.md
new file mode 100644
index 000000000..2a0df48ed
--- /dev/null
+++ b/docs/source/backend/platforms.md
@@ -0,0 +1,54 @@
+# Platforms
+
+All target-specific code generation in the pystencils backend is facilitated
+through  the *platform classes*.
+This includes:
+
+ - Materialization of the iteration space, meaning the mapping of iteration space points to some indexing structure
+ - Lowering of mathematical functions to their implementation in some runtime environment
+ - Selection of vector intrinsics for SIMD-capable CPU targets
+
+Encapsulation of hardware- and environment-specific details into platform objects allows
+us to implement most of the code generator in a generic and hardware-agnostic way.
+It also makes it easier to extend pystencils with support for additional code generation
+targets in the future.
+
+## Base Classes
+
+```{eval-rst}
+.. module:: pystencils.backend.platforms
+
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+    :template: autosummary/entire_class.rst
+
+    Platform
+    GenericCpu
+    GenericVectorCpu
+    GenericGpu
+```
+
+## CPU Platforms
+
+```{eval-rst}
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+    :template: autosummary/entire_class.rst
+
+    X86VectorCpu
+    X86VectorArch
+```
+
+## GPU Platforms
+
+```{eval-rst}
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+    :template: autosummary/entire_class.rst
+
+    CudaPlatform
+    SyclPlatform
+```
diff --git a/docs/source/backend/platforms.rst b/docs/source/backend/platforms.rst
deleted file mode 100644
index 68b74504c..000000000
--- a/docs/source/backend/platforms.rst
+++ /dev/null
@@ -1,6 +0,0 @@
-*********
-Platforms
-*********
-
-.. automodule:: pystencils.backend.platforms
-    :members:
\ No newline at end of file
diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index 5f4995a02..e896fc2bb 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -46,7 +46,7 @@ GRID_DIM = [
 ]
 
 
-class ThreadToIndexMapping(ABC):
+class ThreadMapping(ABC):
 
     @abstractmethod
     def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]:
@@ -57,7 +57,7 @@ class ThreadToIndexMapping(ABC):
         """
 
 
-class Linear3DMapping(ThreadToIndexMapping):
+class Linear3DMapping(ThreadMapping):
     """3D globally linearized mapping, where each thread is assigned a work item according to
     its location in the global launch grid."""
 
@@ -109,7 +109,7 @@ class Linear3DMapping(ThreadToIndexMapping):
         return block_idx * block_size + thread_idx
 
 
-class Blockwise4DMapping(ThreadToIndexMapping):
+class Blockwise4DMapping(ThreadMapping):
     """Blockwise index mapping for up to 4D iteration spaces, where the outer three dimensions
     are mapped to block indices."""
 
@@ -162,13 +162,20 @@ class Blockwise4DMapping(ThreadToIndexMapping):
 
 
 class CudaPlatform(GenericGpu):
-    """Platform for CUDA-based GPUs."""
+    """Platform for CUDA-based GPUs.
+    
+    Args:
+        ctx: The kernel creation context
+        omit_range_check: If `True`, generated index translation code will not check if the point identified
+            by block and thread indices is actually contained in the iteration space
+        thread_mapping: Callback object which defines the mapping of thread indices onto iteration space points
+    """
 
     def __init__(
         self,
         ctx: KernelCreationContext,
         omit_range_check: bool = False,
-        thread_mapping: ThreadToIndexMapping | None = None,
+        thread_mapping: ThreadMapping | None = None,
     ) -> None:
         super().__init__(ctx)
 
diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py
index 7491ec8e9..b5b35c8b0 100644
--- a/src/pystencils/backend/platforms/generic_gpu.py
+++ b/src/pystencils/backend/platforms/generic_gpu.py
@@ -1,14 +1,7 @@
 from __future__ import annotations
-from abc import abstractmethod
 
-from ..ast.structural import PsBlock
-from ..kernelcreation.iteration_space import IterationSpace
 from .platform import Platform
 
 
 class GenericGpu(Platform):
-    @abstractmethod
-    def materialize_iteration_space(
-        self, body: PsBlock, ispace: IterationSpace
-    ) -> PsBlock:
-        pass
+    """Base class for GPU platforms."""
diff --git a/src/pystencils/backend/platforms/platform.py b/src/pystencils/backend/platforms/platform.py
index 9b7e642b5..8ed4729a2 100644
--- a/src/pystencils/backend/platforms/platform.py
+++ b/src/pystencils/backend/platforms/platform.py
@@ -11,9 +11,9 @@ class Platform(ABC):
     """Abstract base class for all supported platforms.
 
     The platform performs all target-dependent tasks during code generation:
-
-     - Translation of the iteration space to an index source (loop nest, GPU indexing, ...)
-     - Platform-specific optimizations (e.g. vectorization, OpenMP)
+    
+    - Translation of the iteration space to an index source (loop nest, GPU indexing, ...)
+    - Platform-specific optimizations (e.g. vectorization, OpenMP)
     """
 
     def __init__(self, ctx: KernelCreationContext) -> None:
@@ -22,12 +22,16 @@ class Platform(ABC):
     @property
     @abstractmethod
     def required_headers(self) -> set[str]:
+        """Set of header files that must be included at the point of definition of a kernel
+        running on this platform."""
         pass
 
     @abstractmethod
     def materialize_iteration_space(
         self, body: PsBlock, ispace: IterationSpace
     ) -> PsBlock:
+        """Materialize the given iteration space as an indexing structure and embed the given
+        kernel body into that structure."""
         pass
 
     @abstractmethod
diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 13549f734..afd2958c1 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -14,6 +14,7 @@ from ..backend.kernelcreation import (
     FullIterationSpace,
     SparseIterationSpace,
 )
+from ..backend.platforms.cuda import ThreadMapping
 
 from ..backend.ast.expressions import PsExpression
 
@@ -198,24 +199,41 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
         return self._block_size
 
 
-class GpuIndexing(ABC):
+class GpuIndexing:
+    """Factory for GPU indexing objects required during code generation.
+    
+    This class acts as a helper class for the code generation driver.
+    It produces both the `ThreadMapping` required by the backend,
+    as well as factories for the launch configuration required later by the runtime system.
+
+    Args:
+        ctx: The kernel creation context
+        scheme: The desired GPU indexing scheme
+        block_size: A user-defined default block size, required only if the indexing scheme permits
+            modification of the block size
+        manual_launch_grid: If `True`, always emit a `ManualLaunchConfiguration` to force the runtime system
+            to set the launch configuration explicitly
+    """
+
     def __init__(
         self,
         ctx: KernelCreationContext,
         scheme: GpuIndexingScheme,
-        block_size: dim3 | _AUTO_TYPE,
-        manual_launch_grid: bool,
+        default_block_size: dim3 | _AUTO_TYPE | None = None,
+        manual_launch_grid: bool = False,
     ) -> None:
         self._ctx = ctx
         self._scheme = scheme
-        self._block_size = block_size
+        self._default_block_size = default_block_size
         self._manual_launch_grid = manual_launch_grid
 
         from ..backend.kernelcreation import AstFactory
 
         self._factory = AstFactory(self._ctx)
 
-    def get_thread_mapping(self):
+    def get_thread_mapping(self) -> ThreadMapping:
+        """Retrieve a thread mapping object for use by the backend"""
+
         from ..backend.platforms.cuda import Linear3DMapping, Blockwise4DMapping
 
         match self._scheme:
@@ -225,6 +243,7 @@ class GpuIndexing(ABC):
                 return Blockwise4DMapping()
 
     def get_launch_config_factory(self) -> Callable[[], GpuLaunchConfiguration]:
+        """Retrieve a factory for the launch configuration for later consumption by the runtime system"""
         if self._manual_launch_grid:
             return ManualLaunchConfiguration
 
@@ -254,7 +273,10 @@ class GpuIndexing(ABC):
         return factory
 
     def _get_default_block_size(self, rank: int) -> dim3:
-        if isinstance(self._block_size, _AUTO_TYPE):
+        if self._default_block_size is None:
+            raise CodegenError("The default block size option was not set")
+
+        if isinstance(self._default_block_size, _AUTO_TYPE):
             match rank:
                 case 1:
                     return (256, 1, 1)
@@ -265,7 +287,7 @@ class GpuIndexing(ABC):
                 case _:
                     assert False, "unreachable code"
         else:
-            return self._block_size
+            return self._default_block_size
 
     def _get_blockwise4d_config_factory(
         self,
-- 
GitLab


From 01886e87c39febcb9da1f059715bbf9f9fa37f91 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Tue, 18 Feb 2025 11:19:52 +0100
Subject: [PATCH 15/17] add Lambda to API reference

---
 docs/source/api/codegen.md         | 1 +
 src/pystencils/codegen/__init__.py | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/docs/source/api/codegen.md b/docs/source/api/codegen.md
index 8e374d4e5..c52f28f51 100644
--- a/docs/source/api/codegen.md
+++ b/docs/source/api/codegen.md
@@ -177,4 +177,5 @@ The following categories with target-specific options are exposed:
   Kernel
   GpuKernel
   Parameter
+  Lambda
 ```
diff --git a/src/pystencils/codegen/__init__.py b/src/pystencils/codegen/__init__.py
index fc1b70ca0..d06c18382 100644
--- a/src/pystencils/codegen/__init__.py
+++ b/src/pystencils/codegen/__init__.py
@@ -6,6 +6,7 @@ from .config import (
 from .parameters import Parameter
 from .kernel import Kernel, GpuKernel
 from .driver import create_kernel, get_driver
+from .functions import Lambda
 
 __all__ = [
     "Target",
@@ -14,6 +15,7 @@ __all__ = [
     "Parameter",
     "Kernel",
     "GpuKernel",
+    "Lambda",
     "create_kernel",
     "get_driver",
 ]
-- 
GitLab


From 28859ef6c4e34c7b22b45fb0fd42785f6ad092c7 Mon Sep 17 00:00:00 2001
From: Richard Angersbach <richard.angersbach@fau.de>
Date: Tue, 18 Feb 2025 15:50:08 +0100
Subject: [PATCH 16/17] Apply 2 suggestion(s) to 2 file(s)

Co-authored-by: Richard Angersbach <richard.angersbach@fau.de>
---
 docs/source/backend/gpu_codegen.md | 2 +-
 docs/source/backend/platforms.md   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/backend/gpu_codegen.md b/docs/source/backend/gpu_codegen.md
index 2e3709b24..1082669e6 100644
--- a/docs/source/backend/gpu_codegen.md
+++ b/docs/source/backend/gpu_codegen.md
@@ -76,7 +76,7 @@ the `ManualLaunchConfiguration` requires the user to manually specifiy both grid
 The `evaluate` method can only be used from within a Python runtime environment.
 When exporting pystencils CUDA kernels for external use in C++ projects,
 equivalent C++ code evaluating the launch config must be generated.
-This is the task of, e..g., [pystencils-sfg](https://pycodegen.pages.i10git.cs.fau.de/pystencils-sfg/).
+This is the task of, e.g., [pystencils-sfg](https://pycodegen.pages.i10git.cs.fau.de/pystencils-sfg/).
 
 
 ```{eval-rst}
diff --git a/docs/source/backend/platforms.md b/docs/source/backend/platforms.md
index 2a0df48ed..e7ffc6f15 100644
--- a/docs/source/backend/platforms.md
+++ b/docs/source/backend/platforms.md
@@ -1,7 +1,7 @@
 # Platforms
 
 All target-specific code generation in the pystencils backend is facilitated
-through  the *platform classes*.
+through the *platform classes*.
 This includes:
 
  - Materialization of the iteration space, meaning the mapping of iteration space points to some indexing structure
-- 
GitLab


From aa384a5b1461e930619d178aecf81542ec422573 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Tue, 18 Feb 2025 15:59:17 +0100
Subject: [PATCH 17/17] fix rank check for linear3d indexing

---
 src/pystencils/codegen/__init__.py     |  2 ++
 src/pystencils/codegen/gpu_indexing.py | 14 ++++++++------
 tests/kernelcreation/test_gpu.py       | 12 +++++++++++-
 3 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/src/pystencils/codegen/__init__.py b/src/pystencils/codegen/__init__.py
index d06c18382..1b2cd2ffb 100644
--- a/src/pystencils/codegen/__init__.py
+++ b/src/pystencils/codegen/__init__.py
@@ -7,6 +7,7 @@ from .parameters import Parameter
 from .kernel import Kernel, GpuKernel
 from .driver import create_kernel, get_driver
 from .functions import Lambda
+from .errors import CodegenError
 
 __all__ = [
     "Target",
@@ -18,4 +19,5 @@ __all__ = [
     "Lambda",
     "create_kernel",
     "get_driver",
+    "CodegenError",
 ]
diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index afd2958c1..2d22ec624 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -131,7 +131,7 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration):
 
 class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
     """GPU launch configuration that permits the user to set a block size and dynamically computes the grid size.
-    
+
     The actual launch grid size is computed from the user-defined ``user_block_size`` and the number of work items
     in the kernel's iteration space as follows.
     For each dimension :math:`c \\in \\{ x, y, z \\}`,
@@ -201,7 +201,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
 
 class GpuIndexing:
     """Factory for GPU indexing objects required during code generation.
-    
+
     This class acts as a helper class for the code generation driver.
     It produces both the `ThreadMapping` required by the backend,
     as well as factories for the launch configuration required later by the runtime system.
@@ -259,6 +259,12 @@ class GpuIndexing:
         work_items_expr = self._get_work_items()
         rank = len(work_items_expr)
 
+        if rank > 3:
+            raise CodegenError(
+                "Cannot create a launch grid configuration using the Linear3D indexing scheme"
+                f" for a {rank}-dimensional kernel."
+            )
+
         num_work_items = cast(
             _Dim3Lambda,
             tuple(Lambda.from_expression(self._ctx, wit) for wit in work_items_expr),
@@ -328,10 +334,6 @@ class GpuIndexing:
         match ispace:
             case FullIterationSpace():
                 dimensions = ispace.dimensions_in_loop_order()[::-1]
-                if len(dimensions) > 3:
-                    raise NotImplementedError(
-                        f"Cannot create a GPU threads range for an {len(dimensions)}-dimensional iteration space"
-                    )
 
                 from ..backend.ast.analysis import collect_undefined_symbols as collect
 
diff --git a/tests/kernelcreation/test_gpu.py b/tests/kernelcreation/test_gpu.py
index 75239c9b1..10b37e610 100644
--- a/tests/kernelcreation/test_gpu.py
+++ b/tests/kernelcreation/test_gpu.py
@@ -11,7 +11,6 @@ from pystencils import (
     CreateKernelConfig,
     create_kernel,
     Target,
-    assignment_from_stencil,
 )
 
 from pystencils.slicing import (
@@ -77,6 +76,17 @@ def test_indexing_options(
     cp.testing.assert_allclose(dst_arr, expected)
 
 
+def test_invalid_indexing_schemes():
+    src, dst = fields("src, dst: [4D]")
+    asm = Assignment(src.center(0), dst.center(0))
+
+    cfg = CreateKernelConfig(target=Target.CUDA)
+    cfg.gpu.indexing_scheme = "linear3d"
+
+    with pytest.raises(Exception):
+        create_kernel(asm, cfg)
+
+
 def test_averaging_kernel():
     size = (40, 55)
     src_arr = np.random.rand(*size)
-- 
GitLab