Skip to content
Snippets Groups Projects

GPU Indexing Schemes and Launch Configurations

Merged Frederik Hennig requested to merge fhennig/lambdas into v2.0-dev
3 files
+ 89
- 66
Compare changes
  • Side-by-side
  • Inline
Files
3
@@ -19,7 +19,6 @@ from ..backend.ast.expressions import PsExpression
@@ -19,7 +19,6 @@ from ..backend.ast.expressions import PsExpression
dim3 = tuple[int, int, int]
dim3 = tuple[int, int, int]
_Dim3Params = tuple[Parameter, Parameter, Parameter]
_Dim3Lambda = tuple[Lambda, Lambda, Lambda]
_Dim3Lambda = tuple[Lambda, Lambda, Lambda]
@@ -73,15 +72,9 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
@@ -73,15 +72,9 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
@property
@property
def parameters(self) -> frozenset[Parameter]:
def parameters(self) -> frozenset[Parameter]:
"""Parameters of this launch configuration"""
return self._params
return self._params
def evaluate(self, **kwargs) -> tuple[dim3, dim3]:
def evaluate(self, **kwargs) -> tuple[dim3, dim3]:
"""Compute block and grid size for a kernel launch.
Args:
kwargs: Valuation providing a value for each parameter listed in `parameters`
"""
block_size = tuple(int(bs(**kwargs)) for bs in self._block_size)
block_size = tuple(int(bs(**kwargs)) for bs in self._block_size)
grid_size = tuple(int(gs(**kwargs)) for gs in self._grid_size)
grid_size = tuple(int(gs(**kwargs)) for gs in self._grid_size)
return cast(dim3, block_size), cast(dim3, grid_size)
return cast(dim3, block_size), cast(dim3, grid_size)
@@ -136,27 +129,38 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration):
@@ -136,27 +129,38 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration):
class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
"""GPU launch configuration that permits the user to set a block size dynamically."""
"""GPU launch configuration that permits the user to set a block size and dynamically computes the grid size.
 
 
The actual launch grid size is computed from the user-defined ``user_block_size`` and the number of work items
 
in the kernel's iteration space as follows.
 
For each dimension :math:`c \\in \\{ x, y, z \\}`,
 
 
- if ``user_block_size.c > num_work_items.c``, ``block_size.c = num_work_items.c`` and ``grid_size.c = 1``;
 
- otherwise, ``block_size.c = user_block_size.c`` and ``grid_size.c = ceil(num_work_items.c / block_size.c)``.
 
"""
def __init__(
def __init__(
self,
self,
block_size_expr: _Dim3Lambda,
num_work_items: _Dim3Lambda,
grid_size_expr: _Dim3Lambda,
block_size_params: _Dim3Params,
default_block_size: dim3 | None = None,
default_block_size: dim3 | None = None,
) -> None:
) -> None:
self._block_size_expr = block_size_expr
self._num_work_items = num_work_items
self._grid_size_expr = grid_size_expr
self._block_size_params = block_size_params
self._block_size: dim3 | None = default_block_size
self._block_size: dim3 | None = default_block_size
self._params: frozenset[Parameter] = frozenset().union(
self._params: frozenset[Parameter] = frozenset().union(
*(lb.parameters for lb in chain(block_size_expr, grid_size_expr))
*(wit.parameters for wit in num_work_items)
) - set(self._block_size_params)
)
 
 
@property
 
def num_work_items(self) -> _Dim3Lambda:
 
"""Lambda expressions that compute the number of work items in each iteration space
 
dimension from kernel parameters."""
 
return self._num_work_items
@property
@property
def block_size(self) -> dim3 | None:
def block_size(self) -> dim3 | None:
 
"""The desired GPU block size."""
return self._block_size
return self._block_size
@block_size.setter
@block_size.setter
@@ -172,16 +176,23 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
@@ -172,16 +176,23 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
if self._block_size is None:
if self._block_size is None:
raise AttributeError("No GPU block size was specified by the user!")
raise AttributeError("No GPU block size was specified by the user!")
kwargs.update(
from ..utils import div_ceil
{
param.name: value
num_work_items = cast(
for param, value in zip(self._block_size_params, self._block_size)
dim3, tuple(int(wit(**kwargs)) for wit in self._num_work_items)
}
)
 
reduced_block_size = cast(
 
dim3,
 
tuple(min(wit, bs) for wit, bs in zip(num_work_items, self._block_size)),
 
)
 
grid_size = cast(
 
dim3,
 
tuple(
 
div_ceil(wit, bs) for wit, bs in zip(num_work_items, reduced_block_size)
 
),
)
)
block_size = tuple(int(bs(**kwargs)) for bs in self._block_size_expr)
return reduced_block_size, grid_size
grid_size = tuple(int(gs(**kwargs)) for gs in self._grid_size_expr)
return cast(dim3, block_size), cast(dim3, grid_size)
def jit_cache_key(self) -> Any:
def jit_cache_key(self) -> Any:
return self._block_size
return self._block_size
@@ -226,50 +237,17 @@ class GpuIndexing(ABC):
@@ -226,50 +237,17 @@ class GpuIndexing(ABC):
def _get_linear3d_config_factory(
def _get_linear3d_config_factory(
self,
self,
) -> Callable[[], DynamicBlockSizeLaunchConfiguration]:
) -> Callable[[], DynamicBlockSizeLaunchConfiguration]:
work_items = self._get_work_items()
work_items_expr = self._get_work_items()
rank = len(work_items)
rank = len(work_items_expr)
from ..backend.constants import PsConstant
from ..backend.ast.expressions import PsExpression, PsIntDiv
block_size_symbols = [
self._ctx.get_new_symbol(f"gpuBlockSize_{c}", self._ctx.index_dtype) for c in range(rank)
]
block_size = [
Lambda.from_expression(self._ctx, self._factory.parse_index(bs_symb))
for bs_symb in block_size_symbols
] + [
Lambda.from_expression(self._ctx, self._factory.parse_index(1))
for _ in range(3 - rank)
]
def div_ceil(a: PsExpression, b: PsExpression):
return self._factory.parse_index(
PsIntDiv(a + b - PsExpression.make(PsConstant(1)), b)
)
grid_size = [
Lambda.from_expression(
self._ctx, div_ceil(witems, PsExpression.make(bsize))
)
for witems, bsize in zip(work_items, block_size_symbols)
] + [
Lambda.from_expression(self._ctx, self._factory.parse_index(1))
for _ in range(3 - rank)
]
from .driver import _symbol_to_param
block_size_params = tuple(
num_work_items = cast(
_symbol_to_param(self._ctx, s) for s in block_size_symbols
_Dim3Lambda,
 
tuple(Lambda.from_expression(self._ctx, wit) for wit in work_items_expr),
)
)
def factory():
def factory():
return DynamicBlockSizeLaunchConfiguration(
return DynamicBlockSizeLaunchConfiguration(
cast(_Dim3Lambda, tuple(block_size)),
num_work_items,
cast(_Dim3Lambda, tuple(grid_size)),
cast(_Dim3Params, block_size_params),
self._get_default_block_size(rank),
self._get_default_block_size(rank),
)
)
Loading