From 3d81f0311a1d569760a2aca08d43961abea74459 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Wed, 12 Feb 2025 14:52:13 +0100
Subject: [PATCH] Remove GpuThreadsRange. Introduce GpuIndexing ABC and
 Linear3D implementation. Compute grid constraints for Linear3D. Start
 updating cupy JIT.

---
 src/pystencils/backend/memory.py              |  11 +-
 src/pystencils/backend/platforms/cuda.py      |  14 +--
 .../backend/platforms/generic_gpu.py          |  49 +-------
 src/pystencils/codegen/__init__.py            |   3 +-
 src/pystencils/codegen/config.py              |  57 ++++++++--
 src/pystencils/codegen/driver.py              |  59 +++++++---
 src/pystencils/codegen/errors.py              |   2 +
 .../codegen/{lambdas.py => functions.py}      |  15 ++-
 src/pystencils/codegen/gpu_indexing.py        | 106 +++++++++++++++++-
 src/pystencils/codegen/kernel.py              |  47 +-------
 src/pystencils/codegen/properties.py          |   5 +
 src/pystencils/jit/gpu_cupy.py                |  77 +++++++------
 .../platform/test_gpu_platforms.py            |  43 -------
 13 files changed, 283 insertions(+), 205 deletions(-)
 create mode 100644 src/pystencils/codegen/errors.py
 rename src/pystencils/codegen/{lambdas.py => functions.py} (77%)
 delete mode 100644 tests/nbackend/kernelcreation/platform/test_gpu_platforms.py

diff --git a/src/pystencils/backend/memory.py b/src/pystencils/backend/memory.py
index 7a5d62f69..0e9b21d6c 100644
--- a/src/pystencils/backend/memory.py
+++ b/src/pystencils/backend/memory.py
@@ -89,8 +89,13 @@ class PsSymbol:
         return f"PsSymbol({repr(self._name)}, {repr(self._dtype)})"
 
 
+class BackendPrivateProperty:
+    """Mix-in marker for symbol properties that are private to the backend
+    and should not be exported as properties of kernel parameters."""
+
+
 @dataclass(frozen=True)
-class BufferBasePtr(UniqueSymbolProperty):
+class BufferBasePtr(UniqueSymbolProperty, BackendPrivateProperty):
     """Symbol acts as a base pointer to a buffer."""
 
     buffer: PsBuffer
@@ -120,12 +125,12 @@ class PsBuffer:
         strides: Sequence[PsSymbol | PsConstant],
     ):
         bptr_type = base_ptr.get_dtype()
-        
+
         if not isinstance(bptr_type, PsPointerType):
             raise ValueError(
                 f"Type of buffer base pointer {base_ptr} was not a pointer type: {bptr_type}"
             )
-        
+
         if bptr_type.base_type != element_type:
             raise ValueError(
                 f"Base type of primary buffer base pointer {base_ptr} "
diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index 4f78d344d..cff6f935f 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -46,7 +46,7 @@ GRID_DIM = [
 ]
 
 
-class DenseThreadIdxMapping(ABC):
+class ThreadToIndexMapping(ABC):
 
     @abstractmethod
     def __call__(self, ispace: FullIterationSpace) -> dict[PsSymbol, PsExpression]:
@@ -57,7 +57,7 @@ class DenseThreadIdxMapping(ABC):
         """
 
 
-class Linear3DMapping(DenseThreadIdxMapping):
+class Linear3DMapping(ThreadToIndexMapping):
     """3D globally linearized mapping, where each thread is assigned a work item according to
     its location in the global launch grid."""
 
@@ -86,7 +86,7 @@ class Linear3DMapping(DenseThreadIdxMapping):
         return block_idx * block_size + thread_idx
 
 
-class Blockwise4DMapping(DenseThreadIdxMapping):
+class Blockwise4DMapping(ThreadToIndexMapping):
     """Blockwise index mapping for up to 4D iteration spaces, where the outer three dimensions
     are mapped to block indices."""
 
@@ -122,12 +122,12 @@ class CudaPlatform(GenericGpu):
         self,
         ctx: KernelCreationContext,
         omit_range_check: bool = False,
-        dense_idx_mapping: DenseThreadIdxMapping | None = None,
+        thread_mapping: ThreadToIndexMapping | None = None,
     ) -> None:
         super().__init__(ctx)
 
         self._omit_range_check = omit_range_check
-        self._dense_idx_mapping = dense_idx_mapping
+        self._thread_mapping = thread_mapping
 
         self._typify = Typifier(ctx)
 
@@ -227,8 +227,8 @@ class CudaPlatform(GenericGpu):
         #     threads_range = None
 
         idx_mapper = (
-            self._dense_idx_mapping
-            if self._dense_idx_mapping is not None
+            self._thread_mapping
+            if self._thread_mapping is not None
             else Linear3DMapping()
         )
         ctr_mapping = idx_mapper(ispace)
diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py
index f66f9aa0e..7491ec8e9 100644
--- a/src/pystencils/backend/platforms/generic_gpu.py
+++ b/src/pystencils/backend/platforms/generic_gpu.py
@@ -1,19 +1,9 @@
 from __future__ import annotations
-from typing import TYPE_CHECKING
 from abc import abstractmethod
 
-from ..ast.expressions import PsExpression
 from ..ast.structural import PsBlock
-from ..kernelcreation.iteration_space import (
-    IterationSpace,
-    FullIterationSpace,
-    SparseIterationSpace,
-)
+from ..kernelcreation.iteration_space import IterationSpace
 from .platform import Platform
-from ..exceptions import MaterializationError
-
-if TYPE_CHECKING:
-    from ...codegen.kernel import GpuThreadsRange
 
 
 class GenericGpu(Platform):
@@ -22,40 +12,3 @@ class GenericGpu(Platform):
         self, body: PsBlock, ispace: IterationSpace
     ) -> PsBlock:
         pass
-
-    @classmethod
-    def threads_from_ispace(cls, ispace: IterationSpace) -> GpuThreadsRange:
-        from ...codegen.kernel import GpuThreadsRange
-
-        if isinstance(ispace, FullIterationSpace):
-            return cls._threads_from_full_ispace(ispace)
-        elif isinstance(ispace, SparseIterationSpace):
-            work_items = (PsExpression.make(ispace.index_list.shape[0]),)
-            return GpuThreadsRange(work_items)
-        else:
-            assert False
-
-    @classmethod
-    def _threads_from_full_ispace(cls, ispace: FullIterationSpace) -> GpuThreadsRange:
-        from ...codegen.kernel import GpuThreadsRange
-
-        dimensions = ispace.dimensions_in_loop_order()[::-1]
-        if len(dimensions) > 3:
-            raise NotImplementedError(
-                f"Cannot create a GPU threads range for an {len(dimensions)}-dimensional iteration space"
-            )
-
-        from ..ast.analysis import collect_undefined_symbols as collect
-
-        for dim in dimensions:
-            symbs = collect(dim.start) | collect(dim.stop) | collect(dim.step)
-            for ctr in ispace.counters:
-                if ctr in symbs:
-                    raise MaterializationError(
-                        "Unable to construct GPU threads range for iteration space: "
-                        f"Limits of dimension counter {dim.counter.name} "
-                        f"depend on another dimension's counter {ctr.name}"
-                    )
-
-        work_items = [ispace.actual_iterations(dim) for dim in dimensions]
-        return GpuThreadsRange(work_items)
diff --git a/src/pystencils/codegen/__init__.py b/src/pystencils/codegen/__init__.py
index e13f911dd..fc1b70ca0 100644
--- a/src/pystencils/codegen/__init__.py
+++ b/src/pystencils/codegen/__init__.py
@@ -4,7 +4,7 @@ from .config import (
     AUTO,
 )
 from .parameters import Parameter
-from .kernel import Kernel, GpuKernel, GpuThreadsRange
+from .kernel import Kernel, GpuKernel
 from .driver import create_kernel, get_driver
 
 __all__ = [
@@ -14,7 +14,6 @@ __all__ = [
     "Parameter",
     "Kernel",
     "GpuKernel",
-    "GpuThreadsRange",
     "create_kernel",
     "get_driver",
 ]
diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py
index 96ab13ea0..53a271852 100644
--- a/src/pystencils/codegen/config.py
+++ b/src/pystencils/codegen/config.py
@@ -86,7 +86,9 @@ class Option(Generic[Option_T, Arg_T]):
         self._name = name
         self._lookup = f"_{name}"
 
-    def __get__(self, obj: ConfigBase, objtype: type[ConfigBase] | None = None) -> Option_T | None:
+    def __get__(
+        self, obj: ConfigBase, objtype: type[ConfigBase] | None = None
+    ) -> Option_T | None:
         if obj is None:
             return None
 
@@ -194,7 +196,9 @@ class Category(Generic[Category_T]):
         self._name = name
         self._lookup = f"_{name}"
 
-    def __get__(self, obj: ConfigBase, objtype: type[ConfigBase] | None = None) -> Category_T:
+    def __get__(
+        self, obj: ConfigBase, objtype: type[ConfigBase] | None = None
+    ) -> Category_T:
         if obj is None:
             return None
 
@@ -365,6 +369,9 @@ class GpuIndexingScheme(Enum):
 class GpuOptions(ConfigBase):
     """Configuration options specific to GPU targets."""
 
+    indexing_scheme: Option[GpuIndexingScheme, str] = Option(GpuIndexingScheme.Linear3D)
+    """Thread indexing scheme for dense GPU kernels."""
+
     omit_range_check: BasicOption[bool] = BasicOption(False)
     """If set to `True`, omit the iteration counter range check.
     
@@ -384,6 +391,31 @@ class GpuOptions(ConfigBase):
     The launch grid will then have to be specified manually at runtime.
     """
 
+    @indexing_scheme.validate
+    def _validate_idx_scheme(self, val: str | GpuIndexingScheme):
+        """Turn `val` into a `GpuIndexingScheme`; names are matched ignoring case."""
+        if isinstance(val, GpuIndexingScheme):
+            return val
+
+        name = val.lower()
+        if name == "linear3d":
+            return GpuIndexingScheme.Linear3D
+        if name == "blockwise4d":
+            return GpuIndexingScheme.Blockwise4D
+        if name == "block":
+            warn(
+                "GPU indexing scheme name `block` is deprecated and will be removed in pystencils 2.1. "
+                "Use `Linear3D` instead."
+            )
+            return GpuIndexingScheme.Linear3D
+        if name == "line":
+            warn(
+                "GPU indexing scheme name `line` is deprecated and will be removed in pystencils 2.1. "
+                "Use `Blockwise4D` instead."
+            )
+            return GpuIndexingScheme.Blockwise4D
+        raise ValueError(f"Invalid GPU indexing scheme: {val}")
+
 
 @dataclass
 class SyclOptions(ConfigBase):
@@ -536,6 +568,9 @@ class CreateKernelConfig(ConfigBase):
     cpu_vectorize_info: InitVar[dict | None] = None
     """Deprecated; use `cpu.vectorize <CpuOptions.vectorize>` instead."""
 
+    gpu_indexing: InitVar[str | None] = None
+    """Deprecated; use `gpu.indexing_scheme` instead."""
+
     gpu_indexing_params: InitVar[dict | None] = None
     """Deprecated; set options in the `gpu` category instead."""
 
@@ -594,6 +629,7 @@ class CreateKernelConfig(ConfigBase):
         data_type: UserTypeSpec | None,
         cpu_openmp: bool | int | None,
         cpu_vectorize_info: dict | None,
+        gpu_indexing: str | None,
         gpu_indexing_params: dict | None,
     ):  # pragma: no cover
         if data_type is not None:
@@ -623,9 +659,7 @@ class CreateKernelConfig(ConfigBase):
                     deprecated_omp.enable = True
                     deprecated_omp.num_threads = cpu_openmp
                 case _:
-                    raise ValueError(
-                        f"Invalid option for `cpu_openmp`: {cpu_openmp}"
-                    )
+                    raise ValueError(f"Invalid option for `cpu_openmp`: {cpu_openmp}")
 
             self.cpu.openmp = deprecated_omp
 
@@ -682,11 +716,20 @@ class CreateKernelConfig(ConfigBase):
 
             self.cpu.vectorize = deprecated_vec_opts
 
+        if gpu_indexing is not None:
+            _deprecated_option("gpu_indexing", "gpu.indexing_scheme")
+            warn(
+                "Setting the deprecated `gpu_indexing` will override the `gpu.indexing_scheme` option",
+                UserWarning,
+            )
+            self.gpu.indexing_scheme = gpu_indexing
+
         if gpu_indexing_params is not None:
-            _deprecated_option("gpu_indexing_params", "gpu_indexing")
+            _deprecated_option("gpu_indexing_params", "gpu")
             warn(
                 "Setting the deprecated `gpu_indexing_params` will override any options "
-                "passed in the `gpu` category."
+                "passed in the `gpu` category.",
+                UserWarning,
             )
 
             self.gpu = GpuOptions(
diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index 6f44e718d..152fceba8 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -10,10 +10,12 @@ from .config import (
     _AUTO_TYPE,
     GhostLayerSpec,
     IterationSliceSpec,
+    GpuIndexingScheme,
 )
-from .kernel import Kernel, GpuKernel, GpuThreadsRange
-from .properties import PsSymbolProperty, FieldShape, FieldStride, FieldBasePtr
+from .kernel import Kernel, GpuKernel
+from .properties import PsSymbolProperty, FieldBasePtr
 from .parameters import Parameter
+from .gpu_indexing import GpuIndexing, GpuLaunchGridConstraints
 
 from ..field import Field
 from ..types import PsIntegerType, PsScalarType
@@ -145,6 +147,7 @@ class DefaultKernelCreationDriver:
         )
 
         self._target = cfg.get_target()
+        self._gpu_indexing: GpuIndexing | None = self._get_gpu_indexing()
         self._platform = self._get_platform()
 
         self._intermediates: CodegenIntermediates | None
@@ -169,9 +172,11 @@ class DefaultKernelCreationDriver:
                     kernel_body, self._ctx.get_iteration_space()
                 )
             case GenericGpu():
-                kernel_ast, gpu_threads = self._platform.materialize_iteration_space(
+                kernel_ast = self._platform.materialize_iteration_space(
                     kernel_body, self._ctx.get_iteration_space()
                 )
+            case _:
+                assert False, "unexpected platform"
 
         if self._intermediates is not None:
             self._intermediates.materialized_ispace = kernel_ast.clone()
@@ -219,7 +224,7 @@ class DefaultKernelCreationDriver:
                 self._ctx,
                 self._platform,
                 kernel_ast,
-                gpu_threads,
+                self._gpu_indexing,
                 self._cfg.get_option("function_name"),
                 self._target,
                 self._cfg.get_jit(),
@@ -395,6 +400,20 @@ class DefaultKernelCreationDriver:
 
         return kernel_ast
 
+    def _get_gpu_indexing(self) -> GpuIndexing | None:
+        """Create the GPU indexing object for CUDA targets; `None` otherwise."""
+        if self._target != Target.CUDA:
+            return None
+
+        from .gpu_indexing import Linear3DGpuIndexing
+
+        idx_scheme = self._cfg.gpu.get_option("indexing_scheme")
+        #   An unset option selects the default scheme, Linear3D
+        if idx_scheme is None or idx_scheme == GpuIndexingScheme.Linear3D:
+            return Linear3DGpuIndexing(self._ctx)
+
+        raise NotImplementedError(f"GPU indexing scheme not implemented: {idx_scheme}")
+
     def _get_platform(self) -> Platform:
         if Target._CPU in self._target:
             if Target._X86 in self._target:
@@ -430,7 +449,9 @@ class DefaultKernelCreationDriver:
                 case Target.SYCL:
                     from ..backend.platforms import SyclPlatform
 
-                    auto_block_size: bool = self._cfg.sycl.get_option("automatic_block_size")
+                    auto_block_size: bool = self._cfg.sycl.get_option(
+                        "automatic_block_size"
+                    )
 
                     return SyclPlatform(
                         self._ctx,
@@ -440,12 +461,16 @@ class DefaultKernelCreationDriver:
                 case Target.CUDA:
                     from ..backend.platforms import CudaPlatform
 
-                    manual_grid = gpu_opts.get_option("manual_launch_grid")
+                    thread_mapping = (
+                        self._gpu_indexing.get_thread_mapping()
+                        if self._gpu_indexing is not None
+                        else None
+                    )
 
                     return CudaPlatform(
                         self._ctx,
                         omit_range_check=omit_range_check,
-                        manual_launch_grid=manual_grid,
+                        thread_mapping=thread_mapping,
                     )
 
         raise NotImplementedError(
@@ -475,23 +500,25 @@ def create_gpu_kernel_function(
     ctx: KernelCreationContext,
     platform: Platform,
     body: PsBlock,
-    threads_range: GpuThreadsRange | None,
+    indexing: GpuIndexing | None,
     function_name: str,
     target_spec: Target,
     jit: JitBase,
 ) -> GpuKernel:
     undef_symbols = collect_undefined_symbols(body)
 
-    if threads_range is not None:
-        for threads in threads_range.num_work_items:
-            undef_symbols |= collect_undefined_symbols(threads)
+    launch_grid_constraints = (
+        indexing.get_launch_grid_constraints()
+        if indexing is not None
+        else GpuLaunchGridConstraints()
+    )
 
     params = _get_function_params(ctx, undef_symbols)
     req_headers = _get_headers(ctx, platform, body)
 
     kfunc = GpuKernel(
         body,
-        threads_range,
+        launch_grid_constraints,
         target_spec,
         function_name,
         params,
@@ -507,17 +534,19 @@ def _get_function_params(
 ) -> list[Parameter]:
     params: list[Parameter] = []
 
-    from pystencils.backend.memory import BufferBasePtr
+    from pystencils.backend.memory import BufferBasePtr, BackendPrivateProperty
 
     for symb in symbols:
         props: set[PsSymbolProperty] = set()
         for prop in symb.properties:
             match prop:
-                case FieldShape() | FieldStride():
-                    props.add(prop)
                 case BufferBasePtr(buf):
                     field = ctx.find_field(buf.name)
                     props.add(FieldBasePtr(field))
+                case BackendPrivateProperty():
+                    pass
+                case _:
+                    props.add(prop)
         params.append(Parameter(symb.name, symb.get_dtype(), props))
 
     params.sort(key=lambda p: p.name)
diff --git a/src/pystencils/codegen/errors.py b/src/pystencils/codegen/errors.py
new file mode 100644
index 000000000..eceb53f61
--- /dev/null
+++ b/src/pystencils/codegen/errors.py
@@ -0,0 +1,2 @@
+class CodegenError(Exception):
+    """Exception that indicates a fatal error in the code generation driver."""
diff --git a/src/pystencils/codegen/lambdas.py b/src/pystencils/codegen/functions.py
similarity index 77%
rename from src/pystencils/codegen/lambdas.py
rename to src/pystencils/codegen/functions.py
index dd0fb571d..2779fa289 100644
--- a/src/pystencils/codegen/lambdas.py
+++ b/src/pystencils/codegen/functions.py
@@ -6,17 +6,26 @@ import numpy as np
 from .parameters import Parameter
 from ..types import PsType
 
+from ..backend.kernelcreation import KernelCreationContext
 from ..backend.ast.expressions import PsExpression
 
 
 class Lambda:
     """A one-line function emitted by the code generator as an auxiliary object."""
 
+    @classmethod
+    def from_expression(cls, ctx: KernelCreationContext, expr: PsExpression):
+        """Build a lambda from `expr`; its undefined symbols become the parameters."""
+        from ..backend.ast.analysis import collect_undefined_symbols
+        from .driver import _get_function_params
+
+        return cls(expr, _get_function_params(ctx, collect_undefined_symbols(expr)))
+
     def __init__(self, expr: PsExpression, params: Sequence[Parameter]):
         self._expr = expr
         self._params = tuple(params)
         self._return_type = expr.get_dtype()
-    
+
     @property
     def parameters(self) -> tuple[Parameter, ...]:
         """Parameters to this lambda"""
@@ -29,10 +38,11 @@ class Lambda:
 
     def __call__(self, **kwargs) -> np.generic:
         """Evaluate this lambda with the given arguments.
-        
+
         The lambda must receive a value for each parameter listed in `parameters`.
         """
         from ..backend.ast.expressions import evaluate_expression
+
         return evaluate_expression(self._expr, kwargs)
 
     def __str__(self) -> str:
@@ -41,5 +51,6 @@ class Lambda:
     def c_code(self) -> str:
         """Print the C code of this lambda"""
         from ..backend.emission import CAstPrinter
+
         printer = CAstPrinter()
         return printer(self._expr)
diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 1d9bf9c2c..2b84ef007 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -1,9 +1,21 @@
 from __future__ import annotations
 
+from abc import ABC, abstractmethod
+from typing import cast
 from itertools import chain
 
-from .lambdas import Lambda
+from .functions import Lambda
 from .parameters import Parameter
+from .properties import GpuBlockSize
+from .errors import CodegenError
+
+from ..backend.kernelcreation import (
+    KernelCreationContext,
+    FullIterationSpace,
+    SparseIterationSpace,
+)
+from ..backend.platforms.cuda import ThreadToIndexMapping
+from ..backend.ast.expressions import PsExpression
 
 
 _ConstraintTriple = tuple[Lambda | None, Lambda | None, Lambda | None]
@@ -11,7 +23,7 @@ _ConstraintTriple = tuple[Lambda | None, Lambda | None, Lambda | None]
 
 class GpuLaunchGridConstraints:
     """Constraints on the number of threads and blocks on the GPU launch grid for a given kernel.
-    
+
     This constraints set determines all or some of
     the number of threads on a GPU block as well as the number of blocks on the GPU grid,
     statically or depending on runtime parameters.
@@ -49,3 +61,126 @@ class GpuLaunchGridConstraints:
     def grid_size(self) -> _ConstraintTriple:
         """Constraints on the number of blocks on the grid"""
         return self._grid_size
+
+
+class GpuIndexing(ABC):
+    """Abstract base class for GPU thread indexing schemes.
+
+    An indexing scheme supplies both the thread-to-index mapping used by the
+    GPU platform and the launch grid constraints matching that mapping.
+    """
+
+    @abstractmethod
+    def get_thread_mapping(self) -> ThreadToIndexMapping | None:
+        """Return the thread mapping to be passed to the GPU platform, if any."""
+
+    @abstractmethod
+    def get_launch_grid_constraints(self) -> GpuLaunchGridConstraints:
+        """Return the launch grid constraints of kernels using this scheme."""
+
+
+class Linear3DGpuIndexing(GpuIndexing):
+    """Indexing scheme for the `Linear3DMapping` thread mapping.
+
+    The block size is left unconstrained in each work item dimension; the grid
+    size is derived from it such that the grid covers all work items.
+    """
+
+    def __init__(self, ctx: KernelCreationContext) -> None:
+        self._ctx = ctx
+
+        from ..backend.kernelcreation import AstFactory
+
+        self._factory = AstFactory(self._ctx)
+
+    def get_thread_mapping(self) -> ThreadToIndexMapping:
+        from ..backend.platforms.cuda import Linear3DMapping
+
+        return Linear3DMapping()
+
+    def get_launch_grid_constraints(self) -> GpuLaunchGridConstraints:
+        """Compute block and grid size constraints from the iteration space.
+
+        With ``rank`` the number of work item dimensions, the block size is
+        unconstrained in the first ``rank`` coordinates and fixed to one in the
+        others. The grid size is ``ceil(work_items / block_size)`` in the first
+        ``rank`` coordinates, with the block size represented by symbols carrying
+        the `GpuBlockSize` property, and one in the others.
+        """
+        work_items = self._get_work_items()
+        rank = len(work_items)
+
+        from ..backend.constants import PsConstant
+        from ..backend.ast.expressions import PsExpression, PsIntDiv
+
+        block_size_constraints = [None] * rank + [
+            Lambda(self._factory.parse_index(1), ()) for _ in range(3 - rank)
+        ]
+
+        block_size_symbols = [
+            self._ctx.get_new_symbol(f"gpuBlockSize_{c}") for c in range(rank)
+        ]
+        for c, bs in enumerate(block_size_symbols):
+            bs.add_property(GpuBlockSize(c))
+
+        def div_ceil(a: PsExpression, b: PsExpression):
+            #   `b` occurs twice below; clone it so that no AST node is shared
+            return self._factory.parse_index(
+                PsIntDiv(a + b - PsExpression.make(PsConstant(1)), b.clone())
+            )
+
+        grid_size_constraints = [
+            Lambda.from_expression(
+                self._ctx, div_ceil(witems, PsExpression.make(bsize))
+            )
+            for witems, bsize in zip(work_items, block_size_symbols)
+        ] + [
+            Lambda.from_expression(self._ctx, self._factory.parse_index(1))
+            for _ in range(3 - rank)
+        ]
+
+        return GpuLaunchGridConstraints(
+            block_size=cast(_ConstraintTriple, tuple(block_size_constraints)),
+            grid_size=cast(_ConstraintTriple, tuple(grid_size_constraints)),
+        )
+
+    def _get_work_items(self) -> tuple[PsExpression, ...]:
+        """Return the number of work items in each dimension.
+
+        For a full iteration space, these are the iteration counts of its
+        dimensions in reversed loop order; for a sparse iteration space, it is
+        the length of the index list.
+
+        Raises:
+            NotImplementedError: If a full iteration space has more than 3 dimensions
+            CodegenError: If the limits of a dimension depend on an iteration counter
+        """
+        ispace = self._ctx.get_iteration_space()
+        match ispace:
+            case FullIterationSpace():
+                dimensions = ispace.dimensions_in_loop_order()[::-1]
+                if len(dimensions) > 3:
+                    raise NotImplementedError(
+                        "Cannot create GPU launch grid constraints "
+                        f"for an {len(dimensions)}-dimensional iteration space"
+                    )
+
+                from ..backend.ast.analysis import collect_undefined_symbols as collect
+
+                for i, dim in enumerate(dimensions):
+                    symbs = collect(dim.start) | collect(dim.stop) | collect(dim.step)
+                    for ctr in ispace.counters:
+                        if ctr in symbs:
+                            raise CodegenError(
+                                "Unable to construct GPU launch grid constraints for this kernel: "
+                                f"Limits in dimension {i} "
+                                f"depend on another dimension's counter {ctr.name}"
+                            )
+
+                return tuple(ispace.actual_iterations(dim) for dim in dimensions)
+
+            case SparseIterationSpace():
+                return (self._factory.parse_index(ispace.index_list.shape[0]),)
+
+            case _:
+                assert False, "unexpected iteration space"
diff --git a/src/pystencils/codegen/kernel.py b/src/pystencils/codegen/kernel.py
index 3adc47876..8038f24b0 100644
--- a/src/pystencils/codegen/kernel.py
+++ b/src/pystencils/codegen/kernel.py
@@ -6,8 +6,8 @@ from itertools import chain
 
 from .target import Target
 from .parameters import Parameter
+from .gpu_indexing import GpuLaunchGridConstraints
 from ..backend.ast.structural import PsBlock
-from ..backend.ast.expressions import PsExpression
 from ..field import Field
 
 from .._deprecation import _deprecated
@@ -118,7 +118,7 @@ class GpuKernel(Kernel):
     def __init__(
         self,
         body: PsBlock,
-        threads_range: GpuThreadsRange | None,
+        launch_grid_constraints: GpuLaunchGridConstraints,
         target: Target,
         name: str,
         parameters: Sequence[Parameter],
@@ -126,46 +126,9 @@ class GpuKernel(Kernel):
         jit: JitBase,
     ):
         super().__init__(body, target, name, parameters, required_headers, jit)
-        self._threads_range = threads_range
+        self._launch_grid_constraints = launch_grid_constraints
 
     @property
-    def threads_range(self) -> GpuThreadsRange | None:
+    def launch_grid_constraints(self) -> GpuLaunchGridConstraints:
         """Object exposing the total size of the launch grid this kernel expects to be executed with."""
-        return self._threads_range
-
-
-class GpuThreadsRange:
-    """Number of threads required by a GPU kernel, in order (x, y, z)."""
-
-    def __init__(
-        self,
-        num_work_items: Sequence[PsExpression],
-    ):
-        self._dim = len(num_work_items)
-        self._num_work_items = tuple(num_work_items)
-
-    # @property
-    # def grid_size(self) -> tuple[PsExpression, ...]:
-    #     return self._grid_size
-
-    # @property
-    # def block_size(self) -> tuple[PsExpression, ...]:
-    #     return self._block_size
-
-    @property
-    def num_work_items(self) -> tuple[PsExpression, ...]:
-        """Number of work items in (x, y, z)-order."""
-        return self._num_work_items
-
-    @property
-    def dim(self) -> int:
-        return self._dim
-
-    def __str__(self) -> str:
-        rep = "GpuThreadsRange { "
-        rep += "; ".join(f"{x}: {w}" for x, w in zip("xyz", self._num_work_items))
-        rep += " }"
-        return rep
-
-    def _repr_html_(self) -> str:
-        return str(self)
+        return self._launch_grid_constraints
diff --git a/src/pystencils/codegen/properties.py b/src/pystencils/codegen/properties.py
index d377fb3d3..df76489db 100644
--- a/src/pystencils/codegen/properties.py
+++ b/src/pystencils/codegen/properties.py
@@ -39,3 +39,8 @@ class FieldBasePtr(UniqueSymbolProperty):
 
 FieldProperty = FieldShape | FieldStride | FieldBasePtr
 _FieldProperty = (FieldShape, FieldStride, FieldBasePtr)
+
+
+@dataclass(frozen=True)
+class GpuBlockSize(UniqueSymbolProperty):
+    coordinate: int  # Entry of the launch block size this symbol stands for
diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py
index a407bb75e..afdbd5097 100644
--- a/src/pystencils/jit/gpu_cupy.py
+++ b/src/pystencils/jit/gpu_cupy.py
@@ -40,7 +40,7 @@ class CupyKernelWrapper(KernelWrapper):
         self._kfunc: GpuKernel = kfunc
         self._raw_kernel = raw_kernel
         self._block_size = block_size
-        self._num_blocks: tuple[int, int, int] | None = None
+        self._grid_size: tuple[int, int, int] | None = None
         self._args_cache: dict[Any, tuple] = dict()
 
     @property
@@ -61,11 +61,11 @@ class CupyKernelWrapper(KernelWrapper):
 
     @property
     def num_blocks(self) -> tuple[int, int, int] | None:
-        return self._num_blocks
+        return self._grid_size
 
     @num_blocks.setter
     def num_blocks(self, nb: tuple[int, int, int] | None):
-        self._num_blocks = nb
+        self._grid_size = nb
 
     def __call__(self, **kwargs: Any):
         kernel_args, launch_grid = self._get_cached_args(**kwargs)
@@ -80,7 +80,9 @@ class CupyKernelWrapper(KernelWrapper):
         return devices.pop()
 
     def _get_cached_args(self, **kwargs):
-        key = (self._block_size, self._num_blocks) + tuple((k, id(v)) for k, v in kwargs.items())
+        key = (self._block_size, self._grid_size) + tuple(
+            (k, id(v)) for k, v in kwargs.items()
+        )
 
         if key not in self._args_cache:
             args = self._get_args(**kwargs)
@@ -164,6 +166,7 @@ class CupyKernelWrapper(KernelWrapper):
                             elem_dtype: PsType
 
                             from .. import DynamicType
+
                             if isinstance(field.dtype, DynamicType):
                                 assert isinstance(kparam.dtype, PsPointerType)
                                 elem_dtype = kparam.dtype.base_type
@@ -199,42 +202,48 @@ class CupyKernelWrapper(KernelWrapper):
                 add_arg(kparam.name, val, kparam.dtype)
 
         #   Determine launch grid
-        from ..backend.ast.expressions import evaluate_expression
-
-        symbolic_threads_range = self._kfunc.threads_range
 
-        if self._num_blocks is not None:
-            launch_grid = LaunchGrid(self._num_blocks, self._block_size)
+        from ..codegen.gpu_indexing import GpuBlockSize
 
-        elif symbolic_threads_range is not None:
-            threads_range: list[int] = [
-                evaluate_expression(expr, valuation)
-                for expr in symbolic_threads_range.num_work_items
-            ]
+        constraints = self._kfunc.launch_grid_constraints
 
-            if symbolic_threads_range.dim < 3:
-                threads_range += [1] * (3 - symbolic_threads_range.dim)
-
-            def div_ceil(a, b):
-                return a // b if a % b == 0 else a // b + 1
-
-            #   TODO: Refine this?
-            num_blocks = tuple(
-                div_ceil(threads, tpb)
-                for threads, tpb in zip(threads_range, self._block_size)
+        for cparam in constraints.parameters:
+            for prop in cparam.properties:
+                match prop:
+                    case GpuBlockSize(coord):
+                        valuation[cparam.name] = self._block_size[coord]
+                        break
+            else:
+                valuation[cparam.name] = kwargs[cparam.name]
+
+        if self._grid_size is None and None in constraints.grid_size:
+            raise JitError(
+                "Unable to determine launch grid for GPU kernel invocation: "
+                "No manual grid size was specified, and the launch grid constraints "
+                "do not determine the grid size in every dimension."
+            )
+
+        launch_block_size = [
+            (
+                int(bsize_constr(**valuation))
+                if bsize_constr is not None
+                else self._block_size[coord]
             )
-            assert len(num_blocks) == 3
-
-            launch_grid = LaunchGrid(num_blocks, self._block_size)
-
-        else:
-            raise JitError(
-                "Unable to determine launch grid for GPU kernel invocation: "
-                "No manual grid size was specified, and the number of threads could not "
-                "be determined automatically."
+            for coord, bsize_constr in enumerate(constraints.block_size)
+        ]
+
+        launch_grid_size = [
+            (
+                int(gsize_constr(**valuation))
+                if gsize_constr is not None
+                else self._grid_size[coord]
             )
+            for coord, gsize_constr in enumerate(constraints.grid_size)
+        ]
 
-        return tuple(args), launch_grid
+        return tuple(args), LaunchGrid(
+            tuple(launch_grid_size), tuple(launch_block_size)
+        )
 
 
 class CupyJit(JitBase):
diff --git a/tests/nbackend/kernelcreation/platform/test_gpu_platforms.py b/tests/nbackend/kernelcreation/platform/test_gpu_platforms.py
deleted file mode 100644
index da2b3a5ad..000000000
--- a/tests/nbackend/kernelcreation/platform/test_gpu_platforms.py
+++ /dev/null
@@ -1,43 +0,0 @@
-import pytest
-
-from pystencils.field import Field
-
-from pystencils.backend.kernelcreation import (
-    KernelCreationContext,
-    FullIterationSpace
-)
-
-from pystencils.backend.ast.structural import PsBlock, PsComment
-
-from pystencils.backend.platforms import CudaPlatform, SyclPlatform
-
-
-@pytest.mark.parametrize("layout", ["fzyx", "zyxf", "c", "f"])
-@pytest.mark.parametrize("platform_class", [CudaPlatform, SyclPlatform])
-def test_thread_range(platform_class, layout):
-    ctx = KernelCreationContext()
-
-    body = PsBlock([PsComment("Kernel body goes here")])
-    platform = platform_class(ctx)
-
-    dim = 3
-    archetype_field = Field.create_generic("field", spatial_dimensions=dim, layout=layout)
-    ispace = FullIterationSpace.create_with_ghost_layers(ctx, 1, archetype_field)
-
-    _, threads_range = platform.materialize_iteration_space(body, ispace)
-
-    assert threads_range.dim == dim
-    
-    match layout:
-        case "fzyx" | "zyxf" | "f":
-            indexing_order = [0, 1, 2]
-        case "c":
-            indexing_order = [2, 1, 0]
-
-    for i in range(dim):
-        #   Slowest to fastest coordinate
-        coordinate = indexing_order[i]
-        dimension = ispace.dimensions[coordinate]
-        witems = threads_range.num_work_items[i]
-        desired = dimension.stop - dimension.start
-        assert witems.structurally_equal(desired)
-- 
GitLab