Commit 26eb2d62 authored by Frederik Hennig

Simplified interface of GpuLaunchConfiguration. Implement factories inside GpuIndexing.

parent 210f768a
Merge request !449: GPU Indexing Schemes and Launch Configurations
@@ -381,7 +381,11 @@ class GpuOptions(ConfigBase):
"""
block_size: BasicOption[tuple[int, int, int] | _AUTO_TYPE] = BasicOption(AUTO)
"""Desired block size for the execution of GPU kernels. May be overridden later by the runtime system."""
"""Desired block size for the execution of GPU kernels.
This option only takes effect if `Linear3D` is chosen as an indexing scheme.
The block size may be overridden at runtime.
"""
manual_launch_grid: BasicOption[bool] = BasicOption(False)
"""Always require a manually specified launch grid when running this kernel.
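A hedged illustration of how these options are meant to be combined after this change (a sketch only; the import locations, the attribute-style assignment on cfg.gpu, and the string spelling of the indexing scheme are assumptions, not part of this diff):

from pystencils import CreateKernelConfig, Target  # import locations assumed

cfg = CreateKernelConfig(target=Target.CUDA)
cfg.gpu.indexing_scheme = "Linear3D"    # block_size only takes effect with the Linear3D scheme
cfg.gpu.block_size = (128, 2, 1)        # may still be overridden at runtime
cfg.gpu.manual_launch_grid = False      # True would require the user to set block and grid size by hand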
@@ -596,11 +600,8 @@ class CreateKernelConfig(ConfigBase):
elif self.get_target() == Target.CUDA:
try:
from ..jit.gpu_cupy import CupyJit
if self.gpu is not None and self.gpu.block_size is not None:
return CupyJit(self.gpu.block_size)
else:
return CupyJit()
return CupyJit()
except ImportError:
from ..jit import no_jit
......
@@ -220,7 +220,7 @@ class DefaultKernelCreationDriver:
self._cfg.get_option("function_name"),
self._target,
self._cfg.get_jit(),
self._gpu_indexing.get_launch_config,
self._gpu_indexing.get_launch_config_factory(),
)
def parse_kernel_body(
@@ -397,15 +397,13 @@ class DefaultKernelCreationDriver:
if self._target != Target.CUDA:
return None
idx_scheme = self._cfg.gpu.get_option("indexing_scheme")
from .gpu_indexing import dim3
match idx_scheme:
case None | GpuIndexingScheme.Linear3D:
from .gpu_indexing import Linear3DGpuIndexing
idx_scheme: GpuIndexingScheme = self._cfg.gpu.get_option("indexing_scheme")
block_size: dim3 | _AUTO_TYPE = self._cfg.gpu.get_option("block_size")
manual_launch_grid: bool = self._cfg.gpu.get_option("manual_launch_grid")
return Linear3DGpuIndexing(self._ctx)
case _:
raise NotImplementedError()
return GpuIndexing(self._ctx, idx_scheme, block_size, manual_launch_grid)
def _get_platform(self) -> Platform:
if Target._CPU in self._target:
@@ -496,7 +494,7 @@ def create_gpu_kernel_function(
function_name: str,
target_spec: Target,
jit: JitBase,
launch_config_factory: Callable[[GpuKernel], GpuLaunchConfiguration],
launch_config_factory: Callable[[], GpuLaunchConfiguration],
) -> GpuKernel:
undef_symbols = collect_undefined_symbols(body)
......
from __future__ import annotations
from typing import Sequence
import numpy as np
from typing import Sequence, Any
from .parameters import Parameter
from ..types import PsType
@@ -36,7 +34,7 @@ class Lambda:
"""Return type of this lambda"""
return self._return_type
def __call__(self, **kwargs) -> np.generic:
def __call__(self, **kwargs) -> Any:
"""Evaluate this lambda with the given arguments.
The lambda must receive a value for each parameter listed in `parameters`.
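A hedged illustration of that call contract (the wrapped expression and the parameter names below are made up):

# Suppose a Lambda was built from the index expression nx * ny; its `parameters`
# then contain the symbols nx and ny, and it is evaluated via keyword arguments:
num_work_items = extent_lambda(nx=32, ny=16)  # -> 512; the result type varies, hence the `Any` annotation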
......
@@ -5,7 +5,6 @@ from typing import cast, Any, Callable
from itertools import chain
from .functions import Lambda
from .kernel import GpuKernel
from .parameters import Parameter
from .errors import CodegenError
from .config import GpuIndexingScheme, _AUTO_TYPE
@@ -15,7 +14,7 @@ from ..backend.kernelcreation import (
FullIterationSpace,
SparseIterationSpace,
)
from ..backend.platforms.cuda import ThreadToIndexMapping
from ..backend.ast.expressions import PsExpression
@@ -24,7 +23,7 @@ _Dim3Params = tuple[Parameter, Parameter, Parameter]
_Dim3Lambda = tuple[Lambda, Lambda, Lambda]
class GpuLaunchConfiguration:
class GpuLaunchConfiguration(ABC):
"""Base class for launch configurations for CUDA and HIP kernels.
Args:
@@ -34,33 +33,61 @@ class GpuLaunchConfiguration:
parameters to the associated kernel
"""
@property
@abstractmethod
def parameters(self) -> frozenset[Parameter]:
"""Parameters of this launch configuration"""
@abstractmethod
def evaluate(self, **kwargs) -> tuple[dim3, dim3]:
"""Compute block and grid size for a kernel launch.
Args:
kwargs: Valuation providing a value for each parameter listed in `parameters`
"""
@abstractmethod
def jit_cache_key(self) -> Any:
"""Return a hashable object that represents any user-configurable options of
this launch configuration, such that when the configuration changes, the JIT parameter
cache is invalidated."""
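The narrowed interface is small enough to sketch its intended use from the runtime side (a hedged example; gpu_kernel stands for a GpuKernel created elsewhere, and the value 64 is an arbitrary placeholder):

launch_cfg = gpu_kernel.get_launch_configuration()         # fresh instance from the zero-argument factory
extra_args = {p.name: 64 for p in launch_cfg.parameters}   # values for config-only parameters, if any
block, grid = launch_cfg.evaluate(**extra_args)            # two (int, int, int) tuples
key = launch_cfg.jit_cache_key()                           # hashable; changes whenever user-facing settings change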
class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
"""Launch configuration that is dynamically computed from kernel parameters.
This launch configuration permits no further user customization
"""
def __init__(
self,
block_size: _Dim3Lambda,
grid_size: _Dim3Lambda,
config_parameters: set[Parameter],
) -> None:
self._block_size = block_size
self._grid_size = grid_size
self._params = frozenset(config_parameters)
self._valuation: dict[Parameter, Any] = dict()
self._params: frozenset[Parameter] = frozenset().union(
*(lb.parameters for lb in chain(block_size, grid_size))
)
@property
def parameters(self) -> frozenset[Parameter]:
"""Parameters to this set of constraints"""
"""Parameters of this launch configuration"""
return self._params
def get_valuation(self) -> dict[Parameter, Any]:
"""Values for all parameters that are specific to the launch grid configuration and not
also kernel parameters."""
return self._valuation
def evaluate(self, **kwargs) -> tuple[dim3, dim3]:
"""Compute block and grid size for a kernel launch.
def get_block_size(self) -> _Dim3Lambda:
return self._block_size
Args:
kwargs: Valuation providing a value for each parameter listed in `parameters`
"""
block_size = tuple(int(bs(**kwargs)) for bs in self._block_size)
grid_size = tuple(int(gs(**kwargs)) for gs in self._grid_size)
return cast(dim3, block_size), cast(dim3, grid_size)
def get_grid_size(self) -> _Dim3Lambda:
return self._grid_size
def jit_cache_key(self) -> Any:
return ()
class ManualLaunchConfiguration(GpuLaunchConfiguration):
@@ -71,89 +98,93 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration):
def __init__(
self,
block_size: _Dim3Lambda,
grid_size: _Dim3Lambda,
block_size_params: _Dim3Params,
grid_size_params: _Dim3Params,
):
super().__init__(
cast(_Dim3Lambda, block_size),
cast(_Dim3Lambda, grid_size),
set(block_size_params).union(grid_size_params),
)
self._block_size_params = block_size_params
self._grid_size_params = grid_size_params
self._user_block_size: dim3 | None = None
self._user_grid_size: dim3 | None = None
) -> None:
self._block_size: dim3 | None = None
self._grid_size: dim3 | None = None
@property
def block_size(self) -> dim3 | None:
return self._user_block_size
return self._block_size
@block_size.setter
def block_size(self, val: dim3):
self._user_block_size = val
self._block_size = val
@property
def grid_size(self) -> dim3 | None:
return self._user_grid_size
return self._grid_size
@grid_size.setter
def grid_size(self, val: dim3):
self._user_grid_size = val
def get_valuation(self) -> dict[Parameter, Any]:
if self._user_block_size is None:
raise AttributeError("No GPU block size was specified")
self._grid_size = val
if self._user_grid_size is None:
raise AttributeError("No GPU grid size was specified")
@property
def parameters(self) -> frozenset[Parameter]:
return frozenset()
valuation: dict[Parameter, Any] = dict()
def evaluate(self, **kwargs) -> tuple[dim3, dim3]:
if self._block_size is None:
raise AttributeError("No GPU block size was set by the user.")
for bs_param, bs in zip(self._block_size_params, self._user_block_size):
valuation[bs_param] = bs
if self._grid_size is None:
raise AttributeError("No GPU grid size was set by the user.")
for gs_param, gs in zip(self._grid_size_params, self._user_grid_size):
valuation[gs_param] = gs
return self._block_size, self._grid_size
return valuation
def jit_cache_key(self) -> Any:
return (self._block_size, self._grid_size)
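A minimal usage sketch of the manual variant, matching the class as shown above:

manual_cfg = ManualLaunchConfiguration()
manual_cfg.block_size = (64, 1, 1)
manual_cfg.grid_size = (16, 16, 1)
block, grid = manual_cfg.evaluate()   # simply returns the user-provided tuples
# Leaving either size unset makes evaluate() raise an AttributeError.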
class GridFromBlockSizeConfiguration(GpuLaunchConfiguration):
"""GPU launch configuration that computes the grid size from a user-defined block size."""
class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
"""GPU launch configuration that permits the user to set a block size dynamically."""
def __init__(
self,
block_size: _Dim3Lambda,
grid_size: _Dim3Lambda,
block_size_expr: _Dim3Lambda,
grid_size_expr: _Dim3Lambda,
block_size_params: _Dim3Params,
default_block_size: dim3 | None = None,
) -> None:
super().__init__(block_size, grid_size, set(block_size_params))
self._block_size_expr = block_size_expr
self._grid_size_expr = grid_size_expr
self._block_size_params = block_size_params
self._user_block_size: dim3 | None = default_block_size
self._block_size: dim3 | None = default_block_size
self._params: frozenset[Parameter] = frozenset().union(
*(lb.parameters for lb in chain(block_size_expr, grid_size_expr))
) - set(self._block_size_params)
@property
def block_size(self) -> dim3 | None:
return self._user_block_size
return self._block_size
@block_size.setter
def block_size(self, val: dim3):
self._user_block_size = val
self._block_size = val
@property
def parameters(self) -> frozenset[Parameter]:
"""Parameters of this launch configuration"""
return self._params
def get_valuation(self) -> dict[Parameter, Any]:
if self._user_block_size is None:
raise AttributeError("No GPU block size was specified")
def evaluate(self, **kwargs) -> tuple[dim3, dim3]:
if self._block_size is None:
raise AttributeError("No GPU block size was specified by the user!")
valuation: dict[Parameter, Any] = dict()
kwargs.update(
{
param.name: value
for param, value in zip(self._block_size_params, self._block_size)
}
)
for bs_param, bs in zip(self._block_size_params, self._user_block_size):
valuation[bs_param] = bs
block_size = tuple(int(bs(**kwargs)) for bs in self._block_size_expr)
grid_size = tuple(int(gs(**kwargs)) for gs in self._grid_size_expr)
return cast(dim3, block_size), cast(dim3, grid_size)
return valuation
def jit_cache_key(self) -> Any:
return self._block_size
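A hedged sketch of the dynamic variant as produced by the Linear3D factory (gpu_kernel and the parameter name _size_0 are hypothetical):

dyn_cfg = gpu_kernel.get_launch_configuration()   # a DynamicBlockSizeLaunchConfiguration
dyn_cfg.block_size = (256, 1, 1)                  # overrides the rank-dependent default
block, grid = dyn_cfg.evaluate(_size_0=1024)      # grid size is derived from the block size and the iteration space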
class GpuIndexing(ABC):
@@ -182,65 +213,19 @@ class GpuIndexing(ABC):
case GpuIndexingScheme.Blockwise4D:
return Blockwise4DMapping()
def get_launch_config_factory(
self, scheme: GpuIndexingScheme
) -> Callable[[], GpuLaunchConfiguration]:
def get_launch_config_factory(self) -> Callable[[], GpuLaunchConfiguration]:
if self._manual_launch_grid:
return self._manual_config_factory()
raise NotImplementedError()
def _manual_config_factory(self) -> Callable[[], ManualLaunchConfiguration]:
ctx = self._ctx
block_size_symbols = [
ctx.get_new_symbol(f"gpuBlockSize_{c}", ctx.index_dtype) for c in range(3)
]
grid_size_symbols = [
ctx.get_new_symbol(f"gpuGridSize_{c}", ctx.index_dtype) for c in range(3)
]
block_size = tuple(
Lambda.from_expression(ctx, PsExpression.make(bs))
for bs in block_size_symbols
)
return ManualLaunchConfiguration
grid_size = tuple(
Lambda.from_expression(ctx, PsExpression.make(gs))
for gs in grid_size_symbols
)
from .driver import _symbol_to_param
bs_params = [_symbol_to_param(ctx, s) for s in block_size_symbols]
gs_params = [_symbol_to_param(ctx, s) for s in grid_size_symbols]
def factory():
return ManualLaunchConfiguration(
cast(_Dim3Lambda, block_size),
cast(_Dim3Lambda, grid_size),
cast(_Dim3Params, bs_params),
cast(_Dim3Params, gs_params),
)
return factory
class Linear3DGpuIndexing(GpuIndexing):
def __init__(self, ctx: KernelCreationContext) -> None:
self._ctx = ctx
from ..backend.kernelcreation import AstFactory
self._factory = AstFactory(self._ctx)
def get_thread_mapping(self) -> ThreadToIndexMapping:
from ..backend.platforms.cuda import Linear3DMapping
return Linear3DMapping()
match self._scheme:
case GpuIndexingScheme.Linear3D:
return self._get_linear3d_config_factory()
case GpuIndexingScheme.Blockwise4D:
return self._get_blockwise4d_config_factory()
def get_launch_config(self, kernel: GpuKernel) -> GpuLaunchConfiguration:
def _get_linear3d_config_factory(
self,
) -> Callable[[], DynamicBlockSizeLaunchConfiguration]:
work_items = self._get_work_items()
rank = len(work_items)
@@ -280,13 +265,65 @@ class Linear3DGpuIndexing(GpuIndexing):
_symbol_to_param(self._ctx, s) for s in block_size_symbols
)
return GridFromBlockSizeConfiguration(
cast(_Dim3Lambda, tuple(block_size)),
cast(_Dim3Lambda, tuple(grid_size)),
cast(tuple[Parameter, Parameter, Parameter], block_size_params),
def factory():
return DynamicBlockSizeLaunchConfiguration(
cast(_Dim3Lambda, tuple(block_size)),
cast(_Dim3Lambda, tuple(grid_size)),
cast(_Dim3Params, block_size_params),
self._get_default_block_size(rank),
)
return factory
def _get_default_block_size(self, rank: int) -> dim3:
if isinstance(self._block_size, _AUTO_TYPE):
match rank:
case 1:
return (256, 1, 1)
case 2:
return (128, 2, 1)
case 3:
return (128, 2, 2)
case _:
assert False, "unreachable code"
else:
return self._block_size
def _get_blockwise4d_config_factory(
self,
) -> Callable[[], AutomaticLaunchConfiguration]:
work_items = self._get_work_items()[::-1] # Want this ordered fastest first
rank = len(work_items)
if rank > 4:
raise ValueError(f"Iteration space rank is too large: {rank}")
block_size = (
Lambda.from_expression(self._ctx, work_items[0]),
Lambda.from_expression(self._ctx, self._factory.parse_index(1)),
Lambda.from_expression(self._ctx, self._factory.parse_index(1)),
)
grid_size = tuple(
Lambda.from_expression(self._ctx, wit) for wit in work_items[1:]
) + tuple(
Lambda.from_expression(self._ctx, self._factory.parse_index(1))
for _ in range(4 - rank)
)
def factory():
return AutomaticLaunchConfiguration(
block_size,
cast(_Dim3Lambda, grid_size),
)
return factory
def _get_work_items(self) -> tuple[PsExpression, ...]:
"""Return a tuple of expressions representing the number of work items
in each dimension of the kernel's iteration space,
ordered from slowest to fastest dimension.
"""
ispace = self._ctx.get_iteration_space()
match ispace:
case FullIterationSpace():
......
@@ -6,6 +6,8 @@ from itertools import chain
from .target import Target
from .parameters import Parameter
from .gpu_indexing import GpuLaunchConfiguration
from ..backend.ast.structural import PsBlock
from ..field import Field
@@ -13,7 +15,6 @@ from .._deprecation import _deprecated
if TYPE_CHECKING:
from ..jit import JitBase
from .gpu_indexing import GpuLaunchConfiguration
class Kernel:
@@ -123,11 +124,11 @@ class GpuKernel(Kernel):
parameters: Sequence[Parameter],
required_headers: set[str],
jit: JitBase,
launch_config_factory: Callable[[GpuKernel], GpuLaunchConfiguration],
launch_config_factory: Callable[[], GpuLaunchConfiguration],
):
super().__init__(body, target, name, parameters, required_headers, jit)
self._launch_config_factory = launch_config_factory
def get_launch_configuration(self) -> GpuLaunchConfiguration:
"""Object exposing the total size of the launch grid this kernel expects to be executed with."""
return self._launch_config_factory(self)
return self._launch_config_factory()
@@ -45,7 +45,7 @@ class CupyKernelWrapper(KernelWrapper):
@property
def kernel_function(self) -> GpuKernel:
return self._kfunc
@property
def launch_config(self) -> GpuLaunchConfiguration:
return self._launch_config
@@ -67,10 +67,9 @@ class CupyKernelWrapper(KernelWrapper):
return devices.pop()
def _get_cached_args(self, **kwargs):
launch_config_params = self._launch_config.get_valuation
key = tuple(
(k, v) for k, v in launch_config_params.items()
) + tuple((k, id(v)) for k, v in kwargs.items())
key = (self._launch_config.jit_cache_key(),) + tuple(
(k, id(v)) for k, v in kwargs.items()
)
if key not in self._args_cache:
args = self._get_args(**kwargs)
@@ -80,7 +79,7 @@ class CupyKernelWrapper(KernelWrapper):
return self._args_cache[key]
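To illustrate why the cache key now includes jit_cache_key() (hedged; the wrapper instance and the field argument name f are assumed):

wrapper.launch_config.block_size = (128, 1, 1)
wrapper(f=src_arr)   # argument tuple cached under a key containing (128, 1, 1)
wrapper.launch_config.block_size = (64, 2, 1)
wrapper(f=src_arr)   # jit_cache_key() changed, so a new cache entry is built instead of reusing stale sizes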
def _get_args(self, **kwargs) -> tuple[tuple, LaunchGrid]:
args = []
kernel_args = []
valuation: dict[str, Any] = dict()
def add_arg(name: str, arg: Any, dtype: PsType):
@@ -88,7 +87,7 @@ class CupyKernelWrapper(KernelWrapper):
assert nptype is not None
typecast = nptype.type
arg = typecast(arg)
args.append(arg)
kernel_args.append(arg)
valuation[name] = arg
field_shapes = set()
@@ -168,7 +167,7 @@ class CupyKernelWrapper(KernelWrapper):
f"Expected {field.dtype}, got {arr.dtype}"
)
check_shape(kparam, arr)
args.append(arr)
kernel_args.append(arr)
break
case FieldShape(field, coord):
@@ -191,31 +190,21 @@ class CupyKernelWrapper(KernelWrapper):
# Determine launch grid
launch_cfg_valuation = valuation.copy()
launch_cfg_valuation.update(
{
param.name: value
for param, value in self._launch_config.get_valuation.items()
}
)
def add_launch_config_arg(name: str, arg: Any, dtype: PsType):
nptype = dtype.numpy_dtype
assert nptype is not None
typecast = nptype.type
arg = typecast(arg)
valuation[name] = arg
block_size = cast(
tuple[int, int, int],
tuple(
int(component(**launch_cfg_valuation))
for component in self._launch_config.get_block_size()
),
)
for cparam in self._launch_config.parameters:
if cparam.name not in valuation:
val = kwargs[cparam.name]
add_launch_config_arg(cparam.name, val, cparam.dtype)
grid_size = cast(
tuple[int, int, int],
tuple(
int(component(**launch_cfg_valuation))
for component in self._launch_config.get_grid_size()
),
)
block_size, grid_size = self._launch_config.evaluate(**valuation)
return tuple(args), LaunchGrid(grid_size, block_size)
return tuple(kernel_args), LaunchGrid(grid_size, block_size)
class CupyJit(JitBase):
......