Skip to content
Snippets Groups Projects

Extend Support for CUDA and HIP kernel invocations

Merged Frederik Hennig requested to merge fhennig/cuda-invoke into master
All threads resolved!
Viewing commit 6c120a84
Show latest version
8 files
+ 345
− 186
Preferences
Compare changes
Files
8
from __future__ import annotations
from typing import Sequence, TypeAlias, overload
from typing import Sequence, TypeAlias
from abc import ABC, abstractmethod
import sympy as sp
from functools import reduce
@@ -13,12 +13,7 @@ from pystencils import (
Assignment,
AssignmentCollection,
)
from pystencils.codegen import Kernel, GpuKernel, Lambda
from pystencils.codegen.gpu_indexing import (
ManualLaunchConfiguration,
AutomaticLaunchConfiguration,
DynamicBlockSizeLaunchConfiguration,
)
from pystencils.codegen import Kernel, Lambda
from pystencils.types import create_type, UserTypeSpec, PsType
from ..context import SfgContext, SfgCursor
@@ -26,7 +21,6 @@ from .custom import CustomGenerator
from ..ir import (
SfgCallTreeNode,
SfgKernelCallNode,
SfgCudaKernelInvocation,
SfgStatements,
SfgFunctionParams,
SfgRequireIncludes,
@@ -404,166 +398,6 @@ class SfgBasicComposer(SfgIComposer):
"""
return SfgKernelCallNode(kernel_handle)
@overload
def cuda_invoke(
    self,
    kernel_handle: SfgKernelHandle,
    *,
    grid_size: ExprLike,
    block_size: ExprLike,
    stream: ExprLike | None = None,
) -> SfgCallTreeNode:
    """Invoke a CUDA kernel with a manual launch grid.

    Requires that the kernel was generated with `manual_launch_grid <GpuOptions.manual_launch_grid>`
    set to `True`.
    """

@overload
def cuda_invoke(
    self,
    kernel_handle: SfgKernelHandle,
    *,
    stream: ExprLike | None = None,
) -> SfgCallTreeNode:
    """Invoke a CUDA kernel with an automatic launch grid.

    This signature accepts kernels generated with an indexing scheme that permits
    the automatic inference of the launch grid, such as `Blockwise4D <IndexingScheme.Blockwise4D>`
    """

@overload
def cuda_invoke(
    self,
    kernel_handle: SfgKernelHandle,
    *,
    block_size: ExprLike | None = None,
    stream: ExprLike | None = None,
) -> SfgCallTreeNode:
    """Invoke a CUDA kernel with a dynamic launch grid.

    This signature accepts kernels generated with an indexing scheme that permits a user-defined
    block size, such as `Linear3D <IndexingScheme.Linear3D>`.
    The grid size is calculated automatically.
    """
def cuda_invoke(self, kernel_handle: SfgKernelHandle, **kwargs) -> SfgCallTreeNode:
    """Render a CUDA kernel invocation for the given kernel handle.

    Dispatches on the kernel's launch configuration (see the `@overload` stubs for the
    three supported calling signatures):

    - `ManualLaunchConfiguration`: caller must supply ``grid_size`` and ``block_size``.
    - `AutomaticLaunchConfiguration`: grid and block size are taken from the
      configuration's lambdas; no size arguments accepted.
    - `DynamicBlockSizeLaunchConfiguration`: caller may supply ``block_size``
      (falls back to the configuration's default); the grid size is computed
      from the number of work items.

    An optional ``stream`` expression is forwarded to the invocation in all cases.

    Args:
        kernel_handle: Handle to a previously registered GPU kernel.
        **kwargs: ``grid_size``, ``block_size``, ``stream`` as permitted by the
            kernel's launch configuration (see overloads).

    Returns:
        A call-tree node (or block of nodes) emitting the kernel launch.

    Raises:
        ValueError: If the kernel is not a `GpuKernel`, if no block size is
            available for a dynamic launch configuration, or if the launch
            configuration type is not recognized.
        KeyError: If a manual launch configuration is used without passing
            both ``grid_size`` and ``block_size``.
    """
    ker = kernel_handle.kernel

    if not isinstance(ker, GpuKernel):
        raise ValueError(f"Non-GPU kernel was passed to `cuda_invoke`: {ker}")

    launch_config = ker.get_launch_configuration()

    # Imported locally — presumably to avoid importing CUDA-specific language
    # bindings when no CUDA kernels are used. TODO confirm.
    from ..lang.cuda import dim3

    def _render_invocation(
        grid_size: ExprLike, block_size: ExprLike, stream: ExprLike | None
    ):
        # Wrap the raw size/stream expressions into statement nodes so the
        # invocation node can track their dependencies.
        stmt_grid_size = make_statements(grid_size)
        stmt_block_size = make_statements(block_size)
        stmt_stream = make_statements(stream) if stream is not None else None

        return SfgCudaKernelInvocation(
            kernel_handle, stmt_grid_size, stmt_block_size, stmt_stream
        )

    grid_size: ExprLike
    block_size: ExprLike
    stream: ExprLike | None = kwargs.get("stream", None)

    match launch_config:
        case ManualLaunchConfiguration():
            # User must provide both sizes explicitly (enforced by KeyError here).
            grid_size = kwargs["grid_size"]
            block_size = kwargs["block_size"]
            return _render_invocation(grid_size, block_size, stream)

        case AutomaticLaunchConfiguration():
            # Both sizes come from the launch configuration's lambdas.
            # NOTE(review): reads the private `_grid_size` / `_block_size`
            # attributes — consider a public accessor on the config class.
            grid_size_entries = [
                self.expr_from_lambda(gs) for gs in launch_config._grid_size
            ]
            grid_size_var = dim3(const=True).var("__grid_size")

            block_size_entries = [
                self.expr_from_lambda(bs) for bs in launch_config._block_size
            ]
            block_size_var = dim3(const=True).var("__block_size")

            # Emit `const dim3` initializations followed by the launch, wrapped
            # in a block to keep the helper variables scoped.
            nodes = [
                self.init(grid_size_var)(*grid_size_entries),
                self.init(block_size_var)(*block_size_entries),
                _render_invocation(grid_size_var, block_size_var, stream),
            ]
            return SfgBlock(SfgSequence(nodes))

        case DynamicBlockSizeLaunchConfiguration():
            # Block size: user-provided expression wins; otherwise fall back to
            # the configuration's default block size.
            user_block_size: ExprLike | None = kwargs.get("block_size", None)

            block_size_init_args: tuple[ExprLike, ...]
            if user_block_size is None:
                if launch_config.block_size is None:
                    raise ValueError(
                        "Neither a user-defined nor a default block size was defined."
                    )
                # Default block size entries are stringified to become dim3
                # constructor arguments.
                block_size_init_args = tuple(
                    str(bs) for bs in launch_config.block_size
                )
            else:
                # A user expression is passed through as a single dim3 argument.
                block_size_init_args = (user_block_size,)

            block_size_var = dim3(const=True).var("__block_size")

            from ..lang.cpp import std

            # Number of work items per dimension, captured in a const std::tuple.
            work_items_entries = [
                self.expr_from_lambda(wit) for wit in launch_config.num_work_items
            ]
            work_items_var = std.tuple(
                "uint32_t", "uint32_t", "uint32_t", const=True
            ).var("__work_items")

            def _min(a: ExprLike, b: ExprLike):
                # Renders a C++ ternary min; `a` and `b` appear twice in the
                # generated code, so they should be side-effect-free variables.
                return AugExpr.format("{a} < {b} ? {a} : {b}", a=a, b=b)

            def _div_ceil(a: ExprLike, b: ExprLike):
                # Renders integer ceiling division in C++.
                return AugExpr.format("({a} + {b} - 1) / {b}", a=a, b=b)

            # Clamp the block size to the work item count per dimension so a
            # block never exceeds the iteration space.
            reduced_block_size_entries = [
                _min(work_items_var.get(i), bs)
                for i, bs in enumerate(
                    [block_size_var.x, block_size_var.y, block_size_var.z]
                )
            ]
            reduced_block_size_var = dim3(const=True).var("__reduced_block_size")

            # Grid size = ceil(work_items / reduced_block_size) per dimension.
            grid_size_entries = [
                _div_ceil(work_items_var.get(i), bs)
                for i, bs in enumerate(
                    [
                        reduced_block_size_var.x,
                        reduced_block_size_var.y,
                        reduced_block_size_var.z,
                    ]
                )
            ]
            grid_size_var = dim3(const=True).var("__grid_size")

            nodes = [
                self.init(block_size_var)(*block_size_init_args),
                self.init(work_items_var)(*work_items_entries),
                self.init(reduced_block_size_var)(*reduced_block_size_entries),
                self.init(grid_size_var)(*grid_size_entries),
                _render_invocation(grid_size_var, reduced_block_size_var, stream),
            ]
            return SfgBlock(SfgSequence(nodes))

        case _:
            raise ValueError(f"Unexpected launch configuration: {launch_config}")
def seq(self, *args: tuple | str | SfgCallTreeNode | SfgNodeBuilder) -> SfgSequence:
    """Build a syntax sequence from the given arguments.

    For details on the accepted argument forms, see `make_sequence`.
    """
    sequence = make_sequence(*args)
    return sequence