Commit 26eb2d62 authored by Frederik Hennig

Simplified interface of GpuLaunchConfiguration. Implement factories inside GpuIndexing.

parent 210f768a
Merge request !449: GPU Indexing Schemes and Launch Configurations
@@ -381,7 +381,11 @@ class GpuOptions(ConfigBase):
"""
block_size: BasicOption[tuple[int, int, int] | _AUTO_TYPE] = BasicOption(AUTO)
"""Desired block size for the execution of GPU kernels. May be overridden later by the runtime system."""
"""Desired block size for the execution of GPU kernels.
This option only takes effect if `Linear3D` is chosen as an indexing scheme.
The block size may be overridden at runtime.
"""
manual_launch_grid: BasicOption[bool] = BasicOption(False)
"""Always require a manually specified launch grid when running this kernel.
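A hedged illustration of how these options are meant to be combined after this change (a sketch only; the import locations, the attribute-style assignment on cfg.gpu, and the string spelling of the indexing scheme are assumptions, not part of this diff):

from pystencils import CreateKernelConfig, Target  # import locations assumed

cfg = CreateKernelConfig(target=Target.CUDA)
cfg.gpu.indexing_scheme = "Linear3D"    # block_size only takes effect with the Linear3D scheme
cfg.gpu.block_size = (128, 2, 1)        # may still be overridden at runtime
cfg.gpu.manual_launch_grid = False      # True would require the user to set block and grid size by hand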
@@ -596,11 +600,8 @@ class CreateKernelConfig(ConfigBase):
elif self.get_target() == Target.CUDA:
try:
from ..jit.gpu_cupy import CupyJit
if self.gpu is not None and self.gpu.block_size is not None:
return CupyJit(self.gpu.block_size)
else:
return CupyJit()
return CupyJit()
except ImportError:
from ..jit import no_jit
......
@@ -220,7 +220,7 @@ class DefaultKernelCreationDriver:
self._cfg.get_option("function_name"),
self._target,
self._cfg.get_jit(),
self._gpu_indexing.get_launch_config,
self._gpu_indexing.get_launch_config_factory(),
)
def parse_kernel_body(
@@ -397,15 +397,13 @@ class DefaultKernelCreationDriver:
if self._target != Target.CUDA:
return None
idx_scheme = self._cfg.gpu.get_option("indexing_scheme")
from .gpu_indexing import dim3
match idx_scheme:
case None | GpuIndexingScheme.Linear3D:
from .gpu_indexing import Linear3DGpuIndexing
idx_scheme: GpuIndexingScheme = self._cfg.gpu.get_option("indexing_scheme")
block_size: dim3 | _AUTO_TYPE = self._cfg.gpu.get_option("block_size")
manual_launch_grid: bool = self._cfg.gpu.get_option("manual_launch_grid")
return Linear3DGpuIndexing(self._ctx)
case _:
raise NotImplementedError()
return GpuIndexing(self._ctx, idx_scheme, block_size, manual_launch_grid)
def _get_platform(self) -> Platform:
if Target._CPU in self._target:
@@ -496,7 +494,7 @@ def create_gpu_kernel_function(
function_name: str,
target_spec: Target,
jit: JitBase,
launch_config_factory: Callable[[GpuKernel], GpuLaunchConfiguration],
launch_config_factory: Callable[[], GpuLaunchConfiguration],
) -> GpuKernel:
undef_symbols = collect_undefined_symbols(body)
......
from __future__ import annotations
from typing import Sequence
import numpy as np
from typing import Sequence, Any
from .parameters import Parameter
from ..types import PsType
@@ -36,7 +34,7 @@ class Lambda:
"""Return type of this lambda"""
return self._return_type
def __call__(self, **kwargs) -> np.generic:
def __call__(self, **kwargs) -> Any:
"""Evaluate this lambda with the given arguments.
The lambda must receive a value for each parameter listed in `parameters`.
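A hedged illustration of that call contract (the wrapped expression and the parameter names below are made up):

# Suppose a Lambda was built from the index expression nx * ny; its `parameters`
# then contain the symbols nx and ny, and it is evaluated via keyword arguments:
num_work_items = extent_lambda(nx=32, ny=16)  # -> 512; the result type varies, hence the `Any` annotation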
......
@@ -5,7 +5,6 @@ from typing import cast, Any, Callable
from itertools import chain
from .functions import Lambda
from .kernel import GpuKernel
from .parameters import Parameter
from .errors import CodegenError
from .config import GpuIndexingScheme, _AUTO_TYPE
@@ -15,7 +14,7 @@ from ..backend.kernelcreation import (
FullIterationSpace,
SparseIterationSpace,
)
from ..backend.platforms.cuda import ThreadToIndexMapping
from ..backend.ast.expressions import PsExpression
@@ -24,7 +23,7 @@ _Dim3Params = tuple[Parameter, Parameter, Parameter]
_Dim3Lambda = tuple[Lambda, Lambda, Lambda]
class GpuLaunchConfiguration:
class GpuLaunchConfiguration(ABC):
"""Base class for launch configurations for CUDA and HIP kernels.
Args:
@@ -34,33 +33,61 @@ class GpuLaunchConfiguration:
parameters to the associated kernel
"""
@property
@abstractmethod
def parameters(self) -> frozenset[Parameter]:
"""Parameters of this launch configuration"""
@abstractmethod
def evaluate(self, **kwargs) -> tuple[dim3, dim3]:
"""Compute block and grid size for a kernel launch.
Args:
kwargs: Valuation providing a value for each parameter listed in `parameters`
"""
@abstractmethod
def jit_cache_key(self) -> Any:
"""Return a hashable object that represents any user-configurable options of
this launch configuration, such that when the configuration changes, the JIT parameter
cache is invalidated."""
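The narrowed interface is small enough to sketch its intended use from the runtime side (a hedged example; gpu_kernel stands for a GpuKernel created elsewhere, and the value 64 is an arbitrary placeholder):

launch_cfg = gpu_kernel.get_launch_configuration()         # fresh instance from the zero-argument factory
extra_args = {p.name: 64 for p in launch_cfg.parameters}   # values for config-only parameters, if any
block, grid = launch_cfg.evaluate(**extra_args)            # two (int, int, int) tuples
key = launch_cfg.jit_cache_key()                           # hashable; changes whenever user-facing settings change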
class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
"""Launch configuration that is dynamically computed from kernel parameters.
This launch configuration permits no further user customization
"""
def __init__(
self,
block_size: _Dim3Lambda,
grid_size: _Dim3Lambda,
config_parameters: set[Parameter],
) -> None:
self._block_size = block_size
self._grid_size = grid_size
self._params = frozenset(config_parameters)
self._valuation: dict[Parameter, Any] = dict()
self._params: frozenset[Parameter] = frozenset().union(
*(lb.parameters for lb in chain(block_size, grid_size))
)
@property
def parameters(self) -> frozenset[Parameter]:
"""Parameters to this set of constraints"""
"""Parameters of this launch configuration"""
return self._params
def get_valuation(self) -> dict[Parameter, Any]:
"""Values for all parameters that are specific to the launch grid configuration and not
also kernel parameters."""
return self._valuation
def evaluate(self, **kwargs) -> tuple[dim3, dim3]:
"""Compute block and grid size for a kernel launch.
def get_block_size(self) -> _Dim3Lambda:
return self._block_size
Args:
kwargs: Valuation providing a value for each parameter listed in `parameters`
"""
block_size = tuple(int(bs(**kwargs)) for bs in self._block_size)
grid_size = tuple(int(gs(**kwargs)) for gs in self._grid_size)
return cast(dim3, block_size), cast(dim3, grid_size)
def get_grid_size(self) -> _Dim3Lambda:
return self._grid_size
def jit_cache_key(self) -> Any:
return ()
class ManualLaunchConfiguration(GpuLaunchConfiguration):
@@ -71,89 +98,93 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration):
def __init__(
self,
block_size: _Dim3Lambda,
grid_size: _Dim3Lambda,
block_size_params: _Dim3Params,
grid_size_params: _Dim3Params,
):
super().__init__(
cast(_Dim3Lambda, block_size),
cast(_Dim3Lambda, grid_size),
set(block_size_params).union(grid_size_params),
)
self._block_size_params = block_size_params
self._grid_size_params = grid_size_params
self._user_block_size: dim3 | None = None
self._user_grid_size: dim3 | None = None
) -> None:
self._block_size: dim3 | None = None
self._grid_size: dim3 | None = None
@property
def block_size(self) -> dim3 | None:
return self._user_block_size
return self._block_size
@block_size.setter
def block_size(self, val: dim3):
self._user_block_size = val
self._block_size = val
@property
def grid_size(self) -> dim3 | None:
return self._user_grid_size
return self._grid_size
@grid_size.setter
def grid_size(self, val: dim3):
self._user_grid_size = val
def get_valuation(self) -> dict[Parameter, Any]:
if self._user_block_size is None:
raise AttributeError("No GPU block size was specified")
self._grid_size = val
if self._user_grid_size is None:
raise AttributeError("No GPU grid size was specified")
@property
def parameters(self) -> frozenset[Parameter]:
return frozenset()
valuation: dict[Parameter, Any] = dict()
def evaluate(self, **kwargs) -> tuple[dim3, dim3]:
if self._block_size is None:
raise AttributeError("No GPU block size was set by the user.")
for bs_param, bs in zip(self._block_size_params, self._user_block_size):
valuation[bs_param] = bs
if self._grid_size is None:
raise AttributeError("No GPU grid size was set by the user.")
for gs_param, gs in zip(self._grid_size_params, self._user_grid_size):
valuation[gs_param] = gs
return self._block_size, self._grid_size
return valuation
def jit_cache_key(self) -> Any:
return (self._block_size, self._grid_size)
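A minimal usage sketch of the manual variant, matching the class as shown above:

manual_cfg = ManualLaunchConfiguration()
manual_cfg.block_size = (64, 1, 1)
manual_cfg.grid_size = (16, 16, 1)
block, grid = manual_cfg.evaluate()   # simply returns the user-provided tuples
# Leaving either size unset makes evaluate() raise an AttributeError.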
class GridFromBlockSizeConfiguration(GpuLaunchConfiguration):
"""GPU launch configuration that computes the grid size from a user-defined block size."""
class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
"""GPU launch configuration that permits the user to set a block size dynamically."""
def __init__(
self,
block_size: _Dim3Lambda,
grid_size: _Dim3Lambda,
block_size_expr: _Dim3Lambda,
grid_size_expr: _Dim3Lambda,
block_size_params: _Dim3Params,
default_block_size: dim3 | None = None,
) -> None:
super().__init__(block_size, grid_size, set(block_size_params))
self._block_size_expr = block_size_expr
self._grid_size_expr = grid_size_expr
self._block_size_params = block_size_params
self._user_block_size: dim3 | None = default_block_size
self._block_size: dim3 | None = default_block_size
self._params: frozenset[Parameter] = frozenset().union(
*(lb.parameters for lb in chain(block_size_expr, grid_size_expr))
) - set(self._block_size_params)
@property
def block_size(self) -> dim3 | None:
return self._user_block_size
return self._block_size
@block_size.setter
def block_size(self, val: dim3):
self._user_block_size = val
self._block_size = val
@property
def parameters(self) -> frozenset[Parameter]:
"""Parameters of this launch configuration"""
return self._params
def get_valuation(self) -> dict[Parameter, Any]:
if self._user_block_size is None:
raise AttributeError("No GPU block size was specified")
def evaluate(self, **kwargs) -> tuple[dim3, dim3]:
if self._block_size is None:
raise AttributeError("No GPU block size was specified by the user!")
valuation: dict[Parameter, Any] = dict()
kwargs.update(
{
param.name: value
for param, value in zip(self._block_size_params, self._block_size)
}
)
for bs_param, bs in zip(self._block_size_params, self._user_block_size):
valuation[bs_param] = bs
block_size = tuple(int(bs(**kwargs)) for bs in self._block_size_expr)
grid_size = tuple(int(gs(**kwargs)) for gs in self._grid_size_expr)
return cast(dim3, block_size), cast(dim3, grid_size)
return valuation
def jit_cache_key(self) -> Any:
return self._block_size
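A hedged sketch of the dynamic variant as produced by the Linear3D factory (gpu_kernel and the parameter name _size_0 are hypothetical):

dyn_cfg = gpu_kernel.get_launch_configuration()   # a DynamicBlockSizeLaunchConfiguration
dyn_cfg.block_size = (256, 1, 1)                  # overrides the rank-dependent default
block, grid = dyn_cfg.evaluate(_size_0=1024)      # grid size is derived from the block size and the iteration space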
class GpuIndexing(ABC):
@@ -182,65 +213,19 @@ class GpuIndexing(ABC):
case GpuIndexingScheme.Blockwise4D:
return Blockwise4DMapping()
def get_launch_config_factory(
self, scheme: GpuIndexingScheme
) -> Callable[[], GpuLaunchConfiguration]:
def get_launch_config_factory(self) -> Callable[[], GpuLaunchConfiguration]:
if self._manual_launch_grid:
return self._manual_config_factory()
raise NotImplementedError()
def _manual_config_factory(self) -> Callable[[], ManualLaunchConfiguration]:
ctx = self._ctx
block_size_symbols = [
ctx.get_new_symbol(f"gpuBlockSize_{c}", ctx.index_dtype) for c in range(3)
]
grid_size_symbols = [
ctx.get_new_symbol(f"gpuGridSize_{c}", ctx.index_dtype) for c in range(3)
]
block_size = tuple(
Lambda.from_expression(ctx, PsExpression.make(bs))
for bs in block_size_symbols
)
return ManualLaunchConfiguration
grid_size = tuple(
Lambda.from_expression(ctx, PsExpression.make(gs))
for gs in grid_size_symbols
)
from .driver import _symbol_to_param
bs_params = [_symbol_to_param(ctx, s) for s in block_size_symbols]
gs_params = [_symbol_to_param(ctx, s) for s in grid_size_symbols]
def factory():
return ManualLaunchConfiguration(
cast(_Dim3Lambda, block_size),
cast(_Dim3Lambda, grid_size),
cast(_Dim3Params, bs_params),
cast(_Dim3Params, gs_params),
)
return factory
class Linear3DGpuIndexing(GpuIndexing):
def __init__(self, ctx: KernelCreationContext) -> None:
self._ctx = ctx
from ..backend.kernelcreation import AstFactory
self._factory = AstFactory(self._ctx)
def get_thread_mapping(self) -> ThreadToIndexMapping:
from ..backend.platforms.cuda import Linear3DMapping
return Linear3DMapping()
match self._scheme:
case GpuIndexingScheme.Linear3D:
return self._get_linear3d_config_factory()
case GpuIndexingScheme.Blockwise4D:
return self._get_blockwise4d_config_factory()
def get_launch_config(self, kernel: GpuKernel) -> GpuLaunchConfiguration:
def _get_linear3d_config_factory(
self,
) -> Callable[[], DynamicBlockSizeLaunchConfiguration]:
work_items = self._get_work_items()
rank = len(work_items)
@@ -280,13 +265,65 @@ class Linear3DGpuIndexing(GpuIndexing):
_symbol_to_param(self._ctx, s) for s in block_size_symbols
)
return GridFromBlockSizeConfiguration(
cast(_Dim3Lambda, tuple(block_size)),
cast(_Dim3Lambda, tuple(grid_size)),
cast(tuple[Parameter, Parameter, Parameter], block_size_params),
def factory():
return DynamicBlockSizeLaunchConfiguration(
cast(_Dim3Lambda, tuple(block_size)),
cast(_Dim3Lambda, tuple(grid_size)),
cast(_Dim3Params, block_size_params),
self._get_default_block_size(rank),
)
return factory
def _get_default_block_size(self, rank: int) -> dim3:
if isinstance(self._block_size, _AUTO_TYPE):
match rank:
case 1:
return (256, 1, 1)
case 2:
return (128, 2, 1)
case 3:
return (128, 2, 2)
case _:
assert False, "unreachable code"
else:
return self._block_size
def _get_blockwise4d_config_factory(
self,
) -> Callable[[], AutomaticLaunchConfiguration]:
work_items = self._get_work_items()[::-1] # Want this ordered fastest first
rank = len(work_items)
if rank > 4:
raise ValueError(f"Iteration space rank is too large: {rank}")
block_size = (
Lambda.from_expression(self._ctx, work_items[0]),
Lambda.from_expression(self._ctx, self._factory.parse_index(1)),
Lambda.from_expression(self._ctx, self._factory.parse_index(1)),
)
grid_size = tuple(
Lambda.from_expression(self._ctx, wit) for wit in work_items[1:]
) + tuple(
Lambda.from_expression(self._ctx, self._factory.parse_index(1))
for _ in range(4 - rank)
)
def factory():
return AutomaticLaunchConfiguration(
block_size,
cast(_Dim3Lambda, grid_size),
)
return factory
def _get_work_items(self) -> tuple[PsExpression, ...]:
"""Return a tuple of expressions representing the number of work items
in each dimension of the kernel's iteration space,
ordered from slowest to fastest dimension.
"""
ispace = self._ctx.get_iteration_space()
match ispace:
case FullIterationSpace():
......
@@ -6,6 +6,8 @@ from itertools import chain
from .target import Target
from .parameters import Parameter
from .gpu_indexing import GpuLaunchConfiguration
from ..backend.ast.structural import PsBlock
from ..field import Field
@@ -13,7 +15,6 @@ from .._deprecation import _deprecated
if TYPE_CHECKING:
from ..jit import JitBase
from .gpu_indexing import GpuLaunchConfiguration
class Kernel:
@@ -123,11 +124,11 @@ class GpuKernel(Kernel):
parameters: Sequence[Parameter],
required_headers: set[str],
jit: JitBase,
launch_config_factory: Callable[[GpuKernel], GpuLaunchConfiguration],
launch_config_factory: Callable[[], GpuLaunchConfiguration],
):
super().__init__(body, target, name, parameters, required_headers, jit)
self._launch_config_factory = launch_config_factory
def get_launch_configuration(self) -> GpuLaunchConfiguration:
"""Object exposing the total size of the launch grid this kernel expects to be executed with."""
return self._launch_config_factory(self)
return self._launch_config_factory()
@@ -45,7 +45,7 @@ class CupyKernelWrapper(KernelWrapper):
@property
def kernel_function(self) -> GpuKernel:
return self._kfunc
@property
def launch_config(self) -> GpuLaunchConfiguration:
return self._launch_config
@@ -67,10 +67,9 @@ class CupyKernelWrapper(KernelWrapper):
return devices.pop()
def _get_cached_args(self, **kwargs):
launch_config_params = self._launch_config.get_valuation
key = tuple(
(k, v) for k, v in launch_config_params.items()
) + tuple((k, id(v)) for k, v in kwargs.items())
key = (self._launch_config.jit_cache_key(),) + tuple(
(k, id(v)) for k, v in kwargs.items()
)
if key not in self._args_cache:
args = self._get_args(**kwargs)
@@ -80,7 +79,7 @@ class CupyKernelWrapper(KernelWrapper):
return self._args_cache[key]
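To illustrate why the cache key now includes jit_cache_key() (hedged; the wrapper instance and the field argument name f are assumed):

wrapper.launch_config.block_size = (128, 1, 1)
wrapper(f=src_arr)   # argument tuple cached under a key containing (128, 1, 1)
wrapper.launch_config.block_size = (64, 2, 1)
wrapper(f=src_arr)   # jit_cache_key() changed, so a new cache entry is built instead of reusing stale sizes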
def _get_args(self, **kwargs) -> tuple[tuple, LaunchGrid]:
args = []
kernel_args = []
valuation: dict[str, Any] = dict()
def add_arg(name: str, arg: Any, dtype: PsType):
@@ -88,7 +87,7 @@ class CupyKernelWrapper(KernelWrapper):
assert nptype is not None
typecast = nptype.type
arg = typecast(arg)
args.append(arg)
kernel_args.append(arg)
valuation[name] = arg
field_shapes = set()
@@ -168,7 +167,7 @@ class CupyKernelWrapper(KernelWrapper):
f"Expected {field.dtype}, got {arr.dtype}"
)
check_shape(kparam, arr)
args.append(arr)
kernel_args.append(arr)
break
case FieldShape(field, coord):
@@ -191,31 +190,21 @@ class CupyKernelWrapper(KernelWrapper):
# Determine launch grid
launch_cfg_valuation = valuation.copy()
launch_cfg_valuation.update(
{
param.name: value
for param, value in self._launch_config.get_valuation.items()
}
)
def add_launch_config_arg(name: str, arg: Any, dtype: PsType):
nptype = dtype.numpy_dtype
assert nptype is not None
typecast = nptype.type
arg = typecast(arg)
valuation[name] = arg
block_size = cast(
tuple[int, int, int],
tuple(
int(component(**launch_cfg_valuation))
for component in self._launch_config.get_block_size()
),
)
for cparam in self._launch_config.parameters:
if cparam.name not in valuation:
val = kwargs[cparam.name]
add_launch_config_arg(cparam.name, val, cparam.dtype)
grid_size = cast(
tuple[int, int, int],
tuple(
int(component(**launch_cfg_valuation))
for component in self._launch_config.get_grid_size()
),
)
block_size, grid_size = self._launch_config.evaluate(**valuation)
return tuple(args), LaunchGrid(grid_size, block_size)
return tuple(kernel_args), LaunchGrid(grid_size, block_size)
class CupyJit(JitBase):
......