Introduce launch config factory pattern

- Update GpuKernel to receive a launch config factory.
- Update gpu-indexing to provide one.
- Update cupy-jit to expose and evaluate the launch config.

Introduce launch config factory pattern
e67b5e23 · Frederik Hennig · d2dd3dfa · e67b5e23 · e67b5e23 · e67b5e23
Commit e67b5e23 authored 4 months ago by Frederik Hennig
--- a/src/pystencils/backend/platforms/platform.py
+++ b/src/pystencils/backend/platforms/platform.py
 from abc import ABC, abstractmethod
-from typing import Any

 from ..ast.structural import PsBlock
 from ..ast.expressions import PsCall, PsExpression
@@ -28,7 +27,7 @@ class Platform(ABC):
    @abstractmethod
    def materialize_iteration_space(
        self, body: PsBlock, ispace: IterationSpace
-    ) -> PsBlock | tuple[PsBlock, Any]:
+    ) -> PsBlock:
        pass

    @abstractmethod

--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
 from __future__ import annotations
-from typing import cast, Sequence, Iterable, TYPE_CHECKING
+from typing import cast, Sequence, Iterable, Callable, TYPE_CHECKING
 from dataclasses import dataclass, replace

 from .target import Target
@@ -15,7 +15,7 @@ from .config import (
 from .kernel import Kernel, GpuKernel
 from .properties import PsSymbolProperty, FieldBasePtr
 from .parameters import Parameter
-from .gpu_indexing import GpuIndexing, GpuLaunchGridConstraints
+from .gpu_indexing import GpuIndexing, GpuLaunchConfiguration

 from ..field import Field
 from ..types import PsIntegerType, PsScalarType
@@ -40,7 +40,6 @@ from ..backend.platforms import (
    Platform,
    GenericCpu,
    GenericVectorCpu,
-    GenericGpu,
 )
 from ..backend.exceptions import VectorizationError

@@ -166,17 +165,9 @@ class DefaultKernelCreationDriver:
    ) -> Kernel:
        kernel_body = self.parse_kernel_body(assignments)

-        match self._platform:
-            case GenericCpu():
-                kernel_ast = self._platform.materialize_iteration_space(
-                    kernel_body, self._ctx.get_iteration_space()
-                )
-            case GenericGpu():
-                kernel_ast = self._platform.materialize_iteration_space(
-                    kernel_body, self._ctx.get_iteration_space()
-                )
-            case _:
-                assert False, "unexpected platform"
+        kernel_ast = self._platform.materialize_iteration_space(
+            kernel_body, self._ctx.get_iteration_space()
+        )

        if self._intermediates is not None:
            self._intermediates.materialized_ispace = kernel_ast.clone()
@@ -220,14 +211,16 @@ class DefaultKernelCreationDriver:
                self._cfg.get_jit(),
            )
        else:
+            assert self._gpu_indexing is not None
+
            return create_gpu_kernel_function(
                self._ctx,
                self._platform,
                kernel_ast,
-                self._gpu_indexing,
                self._cfg.get_option("function_name"),
                self._target,
                self._cfg.get_jit(),
+                self._gpu_indexing.get_launch_config,
            )

    def parse_kernel_body(
@@ -500,30 +493,24 @@ def create_gpu_kernel_function(
    ctx: KernelCreationContext,
    platform: Platform,
    body: PsBlock,
-    indexing: GpuIndexing | None,
    function_name: str,
    target_spec: Target,
    jit: JitBase,
+    launch_config_factory: Callable[[GpuKernel], GpuLaunchConfiguration],
 ) -> GpuKernel:
    undef_symbols = collect_undefined_symbols(body)

-    launch_grid_constraints = (
-        indexing.get_launch_grid_constraints()
-        if indexing is not None
-        else GpuLaunchGridConstraints()
-    )
-
    params = _get_function_params(ctx, undef_symbols)
    req_headers = _get_headers(ctx, platform, body)

    kfunc = GpuKernel(
        body,
-        launch_grid_constraints,
        target_spec,
        function_name,
        params,
        req_headers,
        jit,
+        launch_config_factory,
    )
    kfunc.metadata.update(ctx.metadata)
    return kfunc

--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
 from __future__ import annotations

 from abc import ABC, abstractmethod
-from typing import cast
+from typing import cast, Any
 from itertools import chain

 from .functions import Lambda
+from .kernel import GpuKernel
 from .parameters import Parameter
-from .properties import GpuBlockSize
 from .errors import CodegenError

 from ..backend.kernelcreation import (
@@ -18,34 +18,30 @@ from ..backend.platforms.cuda import ThreadToIndexMapping
 from ..backend.ast.expressions import PsExpression


-_ConstraintTriple = tuple[Lambda | None, Lambda | None, Lambda | None]
+_Dim3Lambda = tuple[Lambda, Lambda, Lambda]


-class GpuLaunchGridConstraints:
-    """Constraints on the number of threads and blocks on the GPU launch grid for a given kernel.
+class GpuLaunchConfiguration:
+    """Base class for launch configurations for CUDA and HIP kernels.

-    This constraints set determines all or some of
-    the number of threads on a GPU block as well as the number of blocks on the GPU grid,
-    statically or depending on runtime parameters.
+    Args:
+        block_size: A triple of lambdas determining the GPU block size
+        grid_size: A triple of lambdas determining the GPU grid size
+        config_parameters: Set containing all parameters to the given lambdas that are not also
+            parameters to the associated kernel
    """

    def __init__(
        self,
-        block_size: _ConstraintTriple | None = None,
-        grid_size: _ConstraintTriple | None = None,
+        block_size: _Dim3Lambda,
+        grid_size: _Dim3Lambda,
+        config_parameters: set[Parameter],
    ) -> None:
-        self._block_size: _ConstraintTriple = (
-            (None, None, None) if block_size is None else block_size
-        )
-        self._grid_size: _ConstraintTriple = (
-            (None, None, None) if grid_size is None else grid_size
-        )
+        self._block_size = block_size
+        self._grid_size = grid_size

-        params = set()
-        for constr in chain(self._block_size, self._grid_size):
-            if constr is not None:
-                params |= set(constr.parameters)
-        self._params = frozenset(params)
+        self._params = frozenset(config_parameters)
+        self._valuation: dict[Parameter, Any] = dict()

    @property
    def parameters(self) -> frozenset[Parameter]:
@@ -53,12 +49,18 @@ class GpuLaunchGridConstraints:
        return self._params

    @property
-    def block_size(self) -> _ConstraintTriple:
+    def parameter_values(self) -> dict[Parameter, Any]:
+        """Values for all parameters that are specific to the launch grid configuration and not
+        also kernel parameters."""
+        return self._valuation
+
+    @property
+    def block_size(self) -> _Dim3Lambda:
        """Constraints on the number of threads per block"""
        return self._block_size

    @property
-    def grid_size(self) -> _ConstraintTriple:
+    def grid_size(self) -> _Dim3Lambda:
        """Constraints on the number of blocks on the grid"""
        return self._grid_size

@@ -68,7 +70,7 @@ class GpuIndexing(ABC):
    def get_thread_mapping(self) -> ThreadToIndexMapping | None: ...

    @abstractmethod
-    def get_launch_grid_constraints(self) -> GpuLaunchGridConstraints: ...
+    def get_launch_config(self, kernel: GpuKernel) -> GpuLaunchConfiguration: ...


 class Linear3DGpuIndexing(GpuIndexing):
@@ -85,29 +87,48 @@ class Linear3DGpuIndexing(GpuIndexing):

        return Linear3DMapping()

-    def get_launch_grid_constraints(self) -> GpuLaunchGridConstraints:
+    def get_launch_config(self, kernel: GpuKernel) -> GpuLaunchConfiguration:
+        block_size, grid_size = self._prepare_launch_grid()
+
+        kernel_params = set(kernel.parameters)
+        launch_config_params = (
+            set().union(
+                *(lb.parameters for lb in chain(block_size, grid_size))
+            )
+            - kernel_params
+        )
+
+        return GpuLaunchConfiguration(
+            block_size=cast(_Dim3Lambda, tuple(block_size)),
+            grid_size=cast(_Dim3Lambda, tuple(grid_size)),
+            config_parameters=launch_config_params,
+        )
+
+    def _prepare_launch_grid(self):
        work_items = self._get_work_items()
        rank = len(work_items)

        from ..backend.constants import PsConstant
        from ..backend.ast.expressions import PsExpression, PsIntDiv

-        block_size_constraints = [None] * rank + [
-            Lambda(self._factory.parse_index(1), ()) for _ in range(3 - rank)
-        ]
-
        block_size_symbols = [
            self._ctx.get_new_symbol(f"gpuBlockSize_{c}") for c in range(rank)
        ]
-        for c, bs in enumerate(block_size_symbols):
-            bs.add_property(GpuBlockSize(c))
+
+        block_size = [
+            Lambda.from_expression(self._ctx, self._factory.parse_index(bs_symb))
+            for bs_symb in block_size_symbols
+        ] + [
+            Lambda.from_expression(self._ctx, self._factory.parse_index(1))
+            for _ in range(3 - rank)
+        ]

        def div_ceil(a: PsExpression, b: PsExpression):
            return self._factory.parse_index(
                PsIntDiv(a + b - PsExpression.make(PsConstant(1)), b)
            )

-        grid_size_constraints = [
+        grid_size = [
            Lambda.from_expression(
                self._ctx, div_ceil(witems, PsExpression.make(bsize))
            )
@@ -117,10 +138,7 @@ class Linear3DGpuIndexing(GpuIndexing):
            for _ in range(3 - rank)
        ]

-        return GpuLaunchGridConstraints(
-            block_size=cast(_ConstraintTriple, tuple(block_size_constraints)),
-            grid_size=cast(_ConstraintTriple, tuple(grid_size_constraints)),
-        )
+        return block_size, grid_size

    def _get_work_items(self) -> tuple[PsExpression, ...]:
        ispace = self._ctx.get_iteration_space()

--- a/src/pystencils/codegen/kernel.py
+++ b/src/pystencils/codegen/kernel.py
@@ -6,7 +6,6 @@ from itertools import chain

 from .target import Target
 from .parameters import Parameter
-from .gpu_indexing import GpuLaunchGridConstraints
 from ..backend.ast.structural import PsBlock
 from ..field import Field

@@ -14,6 +13,7 @@ from .._deprecation import _deprecated

 if TYPE_CHECKING:
    from ..jit import JitBase
+    from .gpu_indexing import GpuLaunchConfiguration


 class Kernel:
@@ -118,17 +118,16 @@ class GpuKernel(Kernel):
    def __init__(
        self,
        body: PsBlock,
-        launch_grid_constraints: GpuLaunchGridConstraints,
        target: Target,
        name: str,
        parameters: Sequence[Parameter],
        required_headers: set[str],
        jit: JitBase,
+        launch_config_factory: Callable[[GpuKernel], GpuLaunchConfiguration],
    ):
        super().__init__(body, target, name, parameters, required_headers, jit)
-        self._launch_grid_constraints = launch_grid_constraints
+        self._launch_config_factory = launch_config_factory

-    @property
-    def launch_grid_constraints(self) -> GpuLaunchGridConstraints:
+    def get_launch_configuration(self) -> GpuLaunchConfiguration:
        """Object exposing the total size of the launch grid this kernel expects to be executed with."""
-        return self._launch_grid_constraints
+        return self._launch_config_factory(self)
--- a/src/pystencils/codegen/properties.py
+++ b/src/pystencils/codegen/properties.py
@@ -39,8 +39,3 @@ class FieldBasePtr(UniqueSymbolProperty):

 FieldProperty = FieldShape | FieldStride | FieldBasePtr
 _FieldProperty = (FieldShape, FieldStride, FieldBasePtr)
-
-
-@dataclass(frozen=True)
-class GpuBlockSize(UniqueSymbolProperty):
-    coordinate: int
--- a/src/pystencils/jit/gpu_cupy.py
+++ b/src/pystencils/jit/gpu_cupy.py
@@ -18,6 +18,7 @@ from ..codegen import (
    GpuKernel,
    Parameter,
 )
+from ..codegen.gpu_indexing import GpuLaunchConfiguration
 from ..codegen.properties import FieldShape, FieldStride, FieldBasePtr
 from ..types import PsStructType, PsPointerType

@@ -35,38 +36,24 @@ class CupyKernelWrapper(KernelWrapper):
        self,
        kfunc: GpuKernel,
        raw_kernel: Any,
-        block_size: tuple[int, int, int],
    ):
        self._kfunc: GpuKernel = kfunc
+        self._launch_config = kfunc.get_launch_configuration()
        self._raw_kernel = raw_kernel
-        self._block_size = block_size
-        self._grid_size: tuple[int, int, int] | None = None
        self._args_cache: dict[Any, tuple] = dict()

    @property
    def kernel_function(self) -> GpuKernel:
        return self._kfunc
+    
+    @property
+    def launch_config(self) -> GpuLaunchConfiguration:
+        return self._launch_config

    @property
    def raw_kernel(self):
        return self._raw_kernel

-    @property
-    def block_size(self) -> tuple[int, int, int]:
-        return self._block_size
-
-    @block_size.setter
-    def block_size(self, bs: tuple[int, int, int]):
-        self._block_size = bs
-
-    @property
-    def num_blocks(self) -> tuple[int, int, int] | None:
-        return self._grid_size
-
-    @num_blocks.setter
-    def num_blocks(self, nb: tuple[int, int, int] | None):
-        self._grid_size = nb
-
    def __call__(self, **kwargs: Any):
        kernel_args, launch_grid = self._get_cached_args(**kwargs)
        device = self._get_device(kernel_args)
@@ -80,9 +67,10 @@ class CupyKernelWrapper(KernelWrapper):
        return devices.pop()

    def _get_cached_args(self, **kwargs):
-        key = (self._block_size, self._grid_size) + tuple(
-            (k, id(v)) for k, v in kwargs.items()
-        )
+        launch_config_params = self._launch_config.parameter_values
+        key = tuple(
+            (k, v) for k, v in launch_config_params.items()
+        ) + tuple((k, id(v)) for k, v in kwargs.items())

        if key not in self._args_cache:
            args = self._get_args(**kwargs)
@@ -203,48 +191,32 @@ class CupyKernelWrapper(KernelWrapper):

        #   Determine launch grid

-        from ..codegen.gpu_indexing import GpuBlockSize
-
-        constraints = self._kfunc.launch_grid_constraints
+        launch_cfg_valuation = valuation.copy()
+        launch_cfg_valuation.update(
+            {
+                param.name: value
+                for param, value in self._launch_config.parameter_values.items()
+            }
+        )

-        for cparam in constraints.parameters:
-            for prop in cparam.properties:
-                match prop:
-                    case GpuBlockSize(coord):
-                        valuation[cparam.name] = self._block_size[coord]
-                        break
-            else:
-                valuation[cparam.name] = kwargs[cparam.name]
-
-        # launch_block_size: list[int] = []
-        # for coord, (bsize_constr, user_bsize) in enumerate(
-        #     zip(constraints.block_size, self._block_size)
-        # ):
-        #     if bsize_constr is None:
-        #         launch_grid_size
-
-        launch_block_size = [
-            (
-                int(bsize_constr(**valuation))
-                if bsize_constr is not None
-                else self._block_size[coord]
-            )
-            for coord, bsize_constr in enumerate(constraints.block_size)
-        ]
-
-        launch_grid_size = [
-            (
-                int(gsize_constr(**valuation))
-                if gsize_constr is not None
-                else self._grid_size[coord]
-            )
-            for coord, gsize_constr in enumerate(constraints.grid_size)
-        ]
+        block_size = cast(
+            tuple[int, int, int],
+            tuple(
+                int(component(**launch_cfg_valuation))
+                for component in self._launch_config.block_size
+            ),
+        )

-        return tuple(args), LaunchGrid(
-            tuple(launch_grid_size), tuple(launch_block_size)
+        grid_size = cast(
+            tuple[int, int, int],
+            tuple(
+                int(component(**launch_cfg_valuation))
+                for component in self._launch_config.grid_size
+            ),
        )

+        return tuple(args), LaunchGrid(grid_size, block_size)
+

 class CupyJit(JitBase):

@@ -261,26 +233,26 @@ class CupyJit(JitBase):
            tuple(default_block_size) + (1,) * (3 - len(default_block_size)),
        )

-    def compile(self, kfunc: Kernel) -> KernelWrapper:
+    def compile(self, kernel: Kernel) -> KernelWrapper:
        if not HAVE_CUPY:
            raise JitError(
                "`cupy` is not installed: just-in-time-compilation of CUDA kernels is unavailable."
            )

-        if not isinstance(kfunc, GpuKernel) or kfunc.target != Target.CUDA:
+        if not isinstance(kernel, GpuKernel) or kernel.target != Target.CUDA:
            raise ValueError(
                "The CupyJit just-in-time compiler only accepts kernels generated for CUDA or HIP"
            )

        options = self._compiler_options()
-        prelude = self._prelude(kfunc)
-        kernel_code = self._kernel_code(kfunc)
+        prelude = self._prelude(kernel)
+        kernel_code = self._kernel_code(kernel)
        code = prelude + kernel_code

        raw_kernel = cp.RawKernel(
-            code, kfunc.name, options=options, backend="nvrtc", jitify=True
+            code, kernel.name, options=options, backend="nvrtc", jitify=True
        )
-        return CupyKernelWrapper(kfunc, raw_kernel, self._default_block_size)
+        return CupyKernelWrapper(kernel, raw_kernel)

    def _compiler_options(self) -> tuple[str, ...]:
        options = ["-w", "-std=c++11"]