Skip to content
Snippets Groups Projects
Commit e67b5e23 authored by Frederik Hennig
Browse files

Introduce launch config factory pattern

 - Update GpuKernel to receive a launch config factory.
 - Update gpu-indexing to provide one.
 - Update cupy-jit to expose and evaluate the launch config.
parent d2dd3dfa
No related branches found
No related tags found
1 merge request: !449 GPU Indexing Schemes and Launch Configurations
Pipeline #74017 failed
from abc import ABC, abstractmethod
from typing import Any
from ..ast.structural import PsBlock
from ..ast.expressions import PsCall, PsExpression
......@@ -28,7 +27,7 @@ class Platform(ABC):
@abstractmethod
def materialize_iteration_space(
self, body: PsBlock, ispace: IterationSpace
) -> PsBlock | tuple[PsBlock, Any]:
) -> PsBlock:
pass
@abstractmethod
......
from __future__ import annotations
from typing import cast, Sequence, Iterable, TYPE_CHECKING
from typing import cast, Sequence, Iterable, Callable, TYPE_CHECKING
from dataclasses import dataclass, replace
from .target import Target
......@@ -15,7 +15,7 @@ from .config import (
from .kernel import Kernel, GpuKernel
from .properties import PsSymbolProperty, FieldBasePtr
from .parameters import Parameter
from .gpu_indexing import GpuIndexing, GpuLaunchGridConstraints
from .gpu_indexing import GpuIndexing, GpuLaunchConfiguration
from ..field import Field
from ..types import PsIntegerType, PsScalarType
......@@ -40,7 +40,6 @@ from ..backend.platforms import (
Platform,
GenericCpu,
GenericVectorCpu,
GenericGpu,
)
from ..backend.exceptions import VectorizationError
......@@ -166,17 +165,9 @@ class DefaultKernelCreationDriver:
) -> Kernel:
kernel_body = self.parse_kernel_body(assignments)
match self._platform:
case GenericCpu():
kernel_ast = self._platform.materialize_iteration_space(
kernel_body, self._ctx.get_iteration_space()
)
case GenericGpu():
kernel_ast = self._platform.materialize_iteration_space(
kernel_body, self._ctx.get_iteration_space()
)
case _:
assert False, "unexpected platform"
kernel_ast = self._platform.materialize_iteration_space(
kernel_body, self._ctx.get_iteration_space()
)
if self._intermediates is not None:
self._intermediates.materialized_ispace = kernel_ast.clone()
......@@ -220,14 +211,16 @@ class DefaultKernelCreationDriver:
self._cfg.get_jit(),
)
else:
assert self._gpu_indexing is not None
return create_gpu_kernel_function(
self._ctx,
self._platform,
kernel_ast,
self._gpu_indexing,
self._cfg.get_option("function_name"),
self._target,
self._cfg.get_jit(),
self._gpu_indexing.get_launch_config,
)
def parse_kernel_body(
......@@ -500,30 +493,24 @@ def create_gpu_kernel_function(
ctx: KernelCreationContext,
platform: Platform,
body: PsBlock,
indexing: GpuIndexing | None,
function_name: str,
target_spec: Target,
jit: JitBase,
launch_config_factory: Callable[[GpuKernel], GpuLaunchConfiguration],
) -> GpuKernel:
undef_symbols = collect_undefined_symbols(body)
launch_grid_constraints = (
indexing.get_launch_grid_constraints()
if indexing is not None
else GpuLaunchGridConstraints()
)
params = _get_function_params(ctx, undef_symbols)
req_headers = _get_headers(ctx, platform, body)
kfunc = GpuKernel(
body,
launch_grid_constraints,
target_spec,
function_name,
params,
req_headers,
jit,
launch_config_factory,
)
kfunc.metadata.update(ctx.metadata)
return kfunc
......
from __future__ import annotations
from abc import ABC, abstractmethod
from typing import cast
from typing import cast, Any
from itertools import chain
from .functions import Lambda
from .kernel import GpuKernel
from .parameters import Parameter
from .properties import GpuBlockSize
from .errors import CodegenError
from ..backend.kernelcreation import (
......@@ -18,34 +18,30 @@ from ..backend.platforms.cuda import ThreadToIndexMapping
from ..backend.ast.expressions import PsExpression
_ConstraintTriple = tuple[Lambda | None, Lambda | None, Lambda | None]
_Dim3Lambda = tuple[Lambda, Lambda, Lambda]
class GpuLaunchGridConstraints:
"""Constraints on the number of threads and blocks on the GPU launch grid for a given kernel.
class GpuLaunchConfiguration:
"""Base class for launch configurations for CUDA and HIP kernels.
This constraints set determines all or some of
the number of threads on a GPU block as well as the number of blocks on the GPU grid,
statically or depending on runtime parameters.
Args:
block_size: A triple of lambdas determining the GPU block size
grid_size: A triple of lambdas determining the GPU grid size
config_parameters: Set containing all parameters to the given lambdas that are not also
parameters to the associated kernel
"""
def __init__(
self,
block_size: _ConstraintTriple | None = None,
grid_size: _ConstraintTriple | None = None,
block_size: _Dim3Lambda,
grid_size: _Dim3Lambda,
config_parameters: set[Parameter],
) -> None:
self._block_size: _ConstraintTriple = (
(None, None, None) if block_size is None else block_size
)
self._grid_size: _ConstraintTriple = (
(None, None, None) if grid_size is None else grid_size
)
self._block_size = block_size
self._grid_size = grid_size
params = set()
for constr in chain(self._block_size, self._grid_size):
if constr is not None:
params |= set(constr.parameters)
self._params = frozenset(params)
self._params = frozenset(config_parameters)
self._valuation: dict[Parameter, Any] = dict()
@property
def parameters(self) -> frozenset[Parameter]:
......@@ -53,12 +49,18 @@ class GpuLaunchGridConstraints:
return self._params
@property
def block_size(self) -> _ConstraintTriple:
def parameter_values(self) -> dict[Parameter, Any]:
"""Values for all parameters that are specific to the launch grid configuration and not
also kernel parameters."""
return self._valuation
@property
def block_size(self) -> _Dim3Lambda:
"""Constraints on the number of threads per block"""
return self._block_size
@property
def grid_size(self) -> _ConstraintTriple:
def grid_size(self) -> _Dim3Lambda:
"""Constraints on the number of blocks on the grid"""
return self._grid_size
......@@ -68,7 +70,7 @@ class GpuIndexing(ABC):
def get_thread_mapping(self) -> ThreadToIndexMapping | None: ...
@abstractmethod
def get_launch_grid_constraints(self) -> GpuLaunchGridConstraints: ...
def get_launch_config(self, kernel: GpuKernel) -> GpuLaunchConfiguration: ...
class Linear3DGpuIndexing(GpuIndexing):
......@@ -85,29 +87,48 @@ class Linear3DGpuIndexing(GpuIndexing):
return Linear3DMapping()
def get_launch_grid_constraints(self) -> GpuLaunchGridConstraints:
def get_launch_config(self, kernel: GpuKernel) -> GpuLaunchConfiguration:
block_size, grid_size = self._prepare_launch_grid()
kernel_params = set(kernel.parameters)
launch_config_params = (
set().union(
*(lb.parameters for lb in chain(block_size, grid_size))
)
- kernel_params
)
return GpuLaunchConfiguration(
block_size=cast(_Dim3Lambda, tuple(block_size)),
grid_size=cast(_Dim3Lambda, tuple(grid_size)),
config_parameters=launch_config_params,
)
def _prepare_launch_grid(self):
work_items = self._get_work_items()
rank = len(work_items)
from ..backend.constants import PsConstant
from ..backend.ast.expressions import PsExpression, PsIntDiv
block_size_constraints = [None] * rank + [
Lambda(self._factory.parse_index(1), ()) for _ in range(3 - rank)
]
block_size_symbols = [
self._ctx.get_new_symbol(f"gpuBlockSize_{c}") for c in range(rank)
]
for c, bs in enumerate(block_size_symbols):
bs.add_property(GpuBlockSize(c))
block_size = [
Lambda.from_expression(self._ctx, self._factory.parse_index(bs_symb))
for bs_symb in block_size_symbols
] + [
Lambda.from_expression(self._ctx, self._factory.parse_index(1))
for _ in range(3 - rank)
]
def div_ceil(a: PsExpression, b: PsExpression):
return self._factory.parse_index(
PsIntDiv(a + b - PsExpression.make(PsConstant(1)), b)
)
grid_size_constraints = [
grid_size = [
Lambda.from_expression(
self._ctx, div_ceil(witems, PsExpression.make(bsize))
)
......@@ -117,10 +138,7 @@ class Linear3DGpuIndexing(GpuIndexing):
for _ in range(3 - rank)
]
return GpuLaunchGridConstraints(
block_size=cast(_ConstraintTriple, tuple(block_size_constraints)),
grid_size=cast(_ConstraintTriple, tuple(grid_size_constraints)),
)
return block_size, grid_size
def _get_work_items(self) -> tuple[PsExpression, ...]:
ispace = self._ctx.get_iteration_space()
......
......@@ -6,7 +6,6 @@ from itertools import chain
from .target import Target
from .parameters import Parameter
from .gpu_indexing import GpuLaunchGridConstraints
from ..backend.ast.structural import PsBlock
from ..field import Field
......@@ -14,6 +13,7 @@ from .._deprecation import _deprecated
if TYPE_CHECKING:
from ..jit import JitBase
from .gpu_indexing import GpuLaunchConfiguration
class Kernel:
......@@ -118,17 +118,16 @@ class GpuKernel(Kernel):
def __init__(
self,
body: PsBlock,
launch_grid_constraints: GpuLaunchGridConstraints,
target: Target,
name: str,
parameters: Sequence[Parameter],
required_headers: set[str],
jit: JitBase,
launch_config_factory: Callable[[GpuKernel], GpuLaunchConfiguration],
):
super().__init__(body, target, name, parameters, required_headers, jit)
self._launch_grid_constraints = launch_grid_constraints
self._launch_config_factory = launch_config_factory
@property
def launch_grid_constraints(self) -> GpuLaunchGridConstraints:
def get_launch_configuration(self) -> GpuLaunchConfiguration:
"""Object exposing the total size of the launch grid this kernel expects to be executed with."""
return self._launch_grid_constraints
return self._launch_config_factory(self)
......@@ -39,8 +39,3 @@ class FieldBasePtr(UniqueSymbolProperty):
FieldProperty = FieldShape | FieldStride | FieldBasePtr
_FieldProperty = (FieldShape, FieldStride, FieldBasePtr)
@dataclass(frozen=True)
class GpuBlockSize(UniqueSymbolProperty):
coordinate: int
......@@ -18,6 +18,7 @@ from ..codegen import (
GpuKernel,
Parameter,
)
from ..codegen.gpu_indexing import GpuLaunchConfiguration
from ..codegen.properties import FieldShape, FieldStride, FieldBasePtr
from ..types import PsStructType, PsPointerType
......@@ -35,38 +36,24 @@ class CupyKernelWrapper(KernelWrapper):
self,
kfunc: GpuKernel,
raw_kernel: Any,
block_size: tuple[int, int, int],
):
self._kfunc: GpuKernel = kfunc
self._launch_config = kfunc.get_launch_configuration()
self._raw_kernel = raw_kernel
self._block_size = block_size
self._grid_size: tuple[int, int, int] | None = None
self._args_cache: dict[Any, tuple] = dict()
@property
def kernel_function(self) -> GpuKernel:
return self._kfunc
@property
def launch_config(self) -> GpuLaunchConfiguration:
return self._launch_config
@property
def raw_kernel(self):
return self._raw_kernel
@property
def block_size(self) -> tuple[int, int, int]:
return self._block_size
@block_size.setter
def block_size(self, bs: tuple[int, int, int]):
self._block_size = bs
@property
def num_blocks(self) -> tuple[int, int, int] | None:
return self._grid_size
@num_blocks.setter
def num_blocks(self, nb: tuple[int, int, int] | None):
self._grid_size = nb
def __call__(self, **kwargs: Any):
kernel_args, launch_grid = self._get_cached_args(**kwargs)
device = self._get_device(kernel_args)
......@@ -80,9 +67,10 @@ class CupyKernelWrapper(KernelWrapper):
return devices.pop()
def _get_cached_args(self, **kwargs):
key = (self._block_size, self._grid_size) + tuple(
(k, id(v)) for k, v in kwargs.items()
)
launch_config_params = self._launch_config.parameter_values
key = tuple(
(k, v) for k, v in launch_config_params.items()
) + tuple((k, id(v)) for k, v in kwargs.items())
if key not in self._args_cache:
args = self._get_args(**kwargs)
......@@ -203,48 +191,32 @@ class CupyKernelWrapper(KernelWrapper):
# Determine launch grid
from ..codegen.gpu_indexing import GpuBlockSize
constraints = self._kfunc.launch_grid_constraints
launch_cfg_valuation = valuation.copy()
launch_cfg_valuation.update(
{
param.name: value
for param, value in self._launch_config.parameter_values.items()
}
)
for cparam in constraints.parameters:
for prop in cparam.properties:
match prop:
case GpuBlockSize(coord):
valuation[cparam.name] = self._block_size[coord]
break
else:
valuation[cparam.name] = kwargs[cparam.name]
# launch_block_size: list[int] = []
# for coord, (bsize_constr, user_bsize) in enumerate(
# zip(constraints.block_size, self._block_size)
# ):
# if bsize_constr is None:
# launch_grid_size
launch_block_size = [
(
int(bsize_constr(**valuation))
if bsize_constr is not None
else self._block_size[coord]
)
for coord, bsize_constr in enumerate(constraints.block_size)
]
launch_grid_size = [
(
int(gsize_constr(**valuation))
if gsize_constr is not None
else self._grid_size[coord]
)
for coord, gsize_constr in enumerate(constraints.grid_size)
]
block_size = cast(
tuple[int, int, int],
tuple(
int(component(**launch_cfg_valuation))
for component in self._launch_config.block_size
),
)
return tuple(args), LaunchGrid(
tuple(launch_grid_size), tuple(launch_block_size)
grid_size = cast(
tuple[int, int, int],
tuple(
int(component(**launch_cfg_valuation))
for component in self._launch_config.grid_size
),
)
return tuple(args), LaunchGrid(grid_size, block_size)
class CupyJit(JitBase):
......@@ -261,26 +233,26 @@ class CupyJit(JitBase):
tuple(default_block_size) + (1,) * (3 - len(default_block_size)),
)
def compile(self, kfunc: Kernel) -> KernelWrapper:
def compile(self, kernel: Kernel) -> KernelWrapper:
if not HAVE_CUPY:
raise JitError(
"`cupy` is not installed: just-in-time-compilation of CUDA kernels is unavailable."
)
if not isinstance(kfunc, GpuKernel) or kfunc.target != Target.CUDA:
if not isinstance(kernel, GpuKernel) or kernel.target != Target.CUDA:
raise ValueError(
"The CupyJit just-in-time compiler only accepts kernels generated for CUDA or HIP"
)
options = self._compiler_options()
prelude = self._prelude(kfunc)
kernel_code = self._kernel_code(kfunc)
prelude = self._prelude(kernel)
kernel_code = self._kernel_code(kernel)
code = prelude + kernel_code
raw_kernel = cp.RawKernel(
code, kfunc.name, options=options, backend="nvrtc", jitify=True
code, kernel.name, options=options, backend="nvrtc", jitify=True
)
return CupyKernelWrapper(kfunc, raw_kernel, self._default_block_size)
return CupyKernelWrapper(kernel, raw_kernel)
def _compiler_options(self) -> tuple[str, ...]:
options = ["-w", "-std=c++11"]
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment