From 9a15d1f77df46864d3d16068c71f99ec07b51628 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Mon, 10 Feb 2025 18:39:26 +0100
Subject: [PATCH] Implement Lambda and GpuLaunchGridConstraints. Start on
 backend WorkItemMapping.

---
 .../backend/platforms/generic_gpu.py          | 26 ++++++++--
 src/pystencils/codegen/config.py              | 30 +++++++++++
 src/pystencils/codegen/gpu_indexing.py        | 51 +++++++++++++++++++
 src/pystencils/codegen/lambdas.py             | 45 ++++++++++++++++
 4 files changed, 149 insertions(+), 3 deletions(-)
 create mode 100644 src/pystencils/codegen/gpu_indexing.py
 create mode 100644 src/pystencils/codegen/lambdas.py

diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py
index 15df36cdd..f22c3c99b 100644
--- a/src/pystencils/backend/platforms/generic_gpu.py
+++ b/src/pystencils/backend/platforms/generic_gpu.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 from typing import TYPE_CHECKING
-from abc import abstractmethod
+from abc import ABC, abstractmethod
 
 from ..ast.expressions import PsExpression
 from ..ast.structural import PsBlock
@@ -16,6 +16,26 @@ if TYPE_CHECKING:
     from ...codegen.kernel import GpuThreadsRange
 
 
+class WorkItemMapping(ABC):
+    """Signature for work-item mappings used to modify the thread index mapping behavior"""
+
+    @abstractmethod
+    def __call__(
+        self,
+        block_idx: tuple[PsExpression, PsExpression, PsExpression],
+        thread_idx: tuple[PsExpression, PsExpression, PsExpression],
+        ispace_rank: int,
+    ) -> tuple[PsExpression, ...]:
+        """Compute a work item index from the current block index, thread index, and iteration space dimensionality.
+        
+        Implementations of this method must return a tuple with `ispace_rank` entries,
+        containing expressions for the compressed index of the work item identified by the
+        given GPU block and thread index triples.
+        (The *compressed index* is the work item's index before application
+        of the iteration space's lower limits and strides.)
+        """
+
+
 class GenericGpu(Platform):
     @abstractmethod
     def materialize_iteration_space(
@@ -38,13 +58,13 @@ class GenericGpu(Platform):
     @classmethod
     def _threads_from_full_ispace(cls, ispace: FullIterationSpace) -> GpuThreadsRange:
         from ...codegen.kernel import GpuThreadsRange
-        
+
         dimensions = ispace.dimensions_in_loop_order()[::-1]
         if len(dimensions) > 3:
             raise NotImplementedError(
                 f"Cannot create a GPU threads range for an {len(dimensions)}-dimensional iteration space"
             )
-        
+
         from ..ast.analysis import collect_undefined_symbols as collect
 
         for dim in dimensions:
diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py
index bce075731..96ab13ea0 100644
--- a/src/pystencils/codegen/config.py
+++ b/src/pystencils/codegen/config.py
@@ -3,6 +3,7 @@ from __future__ import annotations
 from warnings import warn
 from abc import ABC
 from collections.abc import Collection
+from enum import Enum, auto
 
 from typing import TYPE_CHECKING, Sequence, Generic, TypeVar, Callable, Any, cast
 from dataclasses import dataclass, InitVar, fields
@@ -331,6 +332,35 @@ class CpuOptions(ConfigBase):
     """
 
 
+class GpuIndexingScheme(Enum):
+    """Available index translation schemes for GPU kernels."""
+
+    Linear3D = auto()
+    """Map coordinates to global thread indices.
+
+    Supports up to three-dimensional iteration spaces.
+    For each dimension (with known start, stop and step values), compute the current iteration
+    point as ``start + step * (blockIdx.c * blockDim.c + threadIdx.c)``
+    (where c :math:`\\in` (x, y, z)).
+    """
+
+    Blockwise4D = auto()
+    """On a 3D grid of 1D blocks, map the fastest coordinate onto the intra-block thread index,
+    and slower coordinates onto the block index.
+
+    Supports up to four-dimensional iteration spaces.
+    Using this indexing scheme, the iteration counters of up to four dimensions are assigned
+    as follows, from slowest to fastest:
+
+    .. code-block:: C++
+
+        ctr_3 = blockIdx.z;
+        ctr_2 = blockIdx.y;
+        ctr_1 = blockIdx.x;
+        ctr_0 = threadIdx.x;
+    """
+
+
 @dataclass
 class GpuOptions(ConfigBase):
     """Configuration options specific to GPU targets."""
diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
new file mode 100644
index 000000000..1d9bf9c2c
--- /dev/null
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -0,0 +1,51 @@
+from __future__ import annotations
+
+from itertools import chain
+
+from .lambdas import Lambda
+from .parameters import Parameter
+
+
+_ConstraintTriple = tuple[Lambda | None, Lambda | None, Lambda | None]
+
+
+class GpuLaunchGridConstraints:
+    """Constraints on the number of threads and blocks on the GPU launch grid for a given kernel.
+    
+    This set of constraints determines all or some of
+    the number of threads per GPU block and the number of blocks on the GPU grid,
+    either statically or depending on runtime parameters.
+    """
+
+    def __init__(
+        self,
+        block_size: _ConstraintTriple | None = None,
+        grid_size: _ConstraintTriple | None = None,
+    ) -> None:
+        self._block_size: _ConstraintTriple = (
+            (None, None, None) if block_size is None else block_size
+        )
+        self._grid_size: _ConstraintTriple = (
+            (None, None, None) if grid_size is None else grid_size
+        )
+
+        params = set()
+        for constr in chain(self._block_size, self._grid_size):
+            if constr is not None:
+                params |= set(constr.parameters)
+        self._params = frozenset(params)
+
+    @property
+    def parameters(self) -> frozenset[Parameter]:
+        """Parameters to this set of constraints"""
+        return self._params
+
+    @property
+    def block_size(self) -> _ConstraintTriple:
+        """Constraints on the number of threads per block"""
+        return self._block_size
+
+    @property
+    def grid_size(self) -> _ConstraintTriple:
+        """Constraints on the number of blocks on the grid"""
+        return self._grid_size
diff --git a/src/pystencils/codegen/lambdas.py b/src/pystencils/codegen/lambdas.py
new file mode 100644
index 000000000..dd0fb571d
--- /dev/null
+++ b/src/pystencils/codegen/lambdas.py
@@ -0,0 +1,45 @@
+from __future__ import annotations
+from typing import Sequence
+
+import numpy as np
+
+from .parameters import Parameter
+from ..types import PsType
+
+from ..backend.ast.expressions import PsExpression
+
+
+class Lambda:
+    """A one-line function emitted by the code generator as an auxiliary object."""
+
+    def __init__(self, expr: PsExpression, params: Sequence[Parameter]):
+        self._expr = expr
+        self._params = tuple(params)
+        self._return_type = expr.get_dtype()
+    
+    @property
+    def parameters(self) -> tuple[Parameter, ...]:
+        """Parameters to this lambda"""
+        return self._params
+
+    @property
+    def return_type(self) -> PsType:
+        """Return type of this lambda"""
+        return self._return_type
+
+    def __call__(self, **kwargs) -> np.generic:
+        """Evaluate this lambda with the given arguments.
+        
+        The lambda must receive a value for each parameter listed in `parameters`.
+        """
+        from ..backend.ast.expressions import evaluate_expression
+        return evaluate_expression(self._expr, kwargs)
+
+    def __str__(self) -> str:
+        return str(self._expr)
+
+    def c_code(self) -> str:
+        """Print the C code of this lambda"""
+        from ..backend.emission import CAstPrinter
+        printer = CAstPrinter()
+        return printer(self._expr)
-- 
GitLab