Skip to content
Snippets Groups Projects
Commit 6c120a84 authored by Frederik Hennig's avatar Frederik Hennig
Browse files

Add CUDA and HIP API provider protocols. Factor out GPU stuff into separate Gpu Composer.

parent e93ca92c
Branches
No related tags found
1 merge request!24Extend Support for CUDA and HIP kernel invocations
Pipeline #75198 passed
from __future__ import annotations
from typing import Sequence, TypeAlias, overload
from typing import Sequence, TypeAlias
from abc import ABC, abstractmethod
import sympy as sp
from functools import reduce
......@@ -13,12 +13,7 @@ from pystencils import (
Assignment,
AssignmentCollection,
)
from pystencils.codegen import Kernel, GpuKernel, Lambda
from pystencils.codegen.gpu_indexing import (
ManualLaunchConfiguration,
AutomaticLaunchConfiguration,
DynamicBlockSizeLaunchConfiguration,
)
from pystencils.codegen import Kernel, Lambda
from pystencils.types import create_type, UserTypeSpec, PsType
from ..context import SfgContext, SfgCursor
......@@ -26,7 +21,6 @@ from .custom import CustomGenerator
from ..ir import (
SfgCallTreeNode,
SfgKernelCallNode,
SfgCudaKernelInvocation,
SfgStatements,
SfgFunctionParams,
SfgRequireIncludes,
......@@ -404,166 +398,6 @@ class SfgBasicComposer(SfgIComposer):
"""
return SfgKernelCallNode(kernel_handle)
@overload
def cuda_invoke(
self,
kernel_handle: SfgKernelHandle,
*,
grid_size: ExprLike,
block_size: ExprLike,
stream: ExprLike | None = None,
) -> SfgCallTreeNode:
"""Invoke a CUDA kernel with a manual launch grid.
Requires that the kernel was generated with `manual_launch_grid <GpuOptions.manual_launch_grid>`
set to `True`.
"""
@overload
def cuda_invoke(
self,
kernel_handle: SfgKernelHandle,
*,
stream: ExprLike | None = None,
) -> SfgCallTreeNode:
"""Invoke a CUDA kernel with an automatic launch grid.
This signature accepts kernels generated with an indexing scheme that permits
the automatic inferrence of the launch grid, such as `Blockwise4D <IndexingScheme.Blockwise4D>`
"""
@overload
def cuda_invoke(
self,
kernel_handle: SfgKernelHandle,
*,
block_size: ExprLike | None = None,
stream: ExprLike | None = None,
) -> SfgCallTreeNode:
"""Invoke a CUDA kernel with a dynamic launch grid.
This signature accepts kernels generated with an indexing scheme that permits a user-defined
blocks size, such as `Linear3D <IndexingScheme.Linear3D>`.
The grid size is calculated automatically.
"""
def cuda_invoke(self, kernel_handle: SfgKernelHandle, **kwargs) -> SfgCallTreeNode:
ker = kernel_handle.kernel
if not isinstance(ker, GpuKernel):
raise ValueError(f"Non-GPU kernel was passed to `cuda_invoke`: {ker}")
launch_config = ker.get_launch_configuration()
from ..lang.cuda import dim3
def _render_invocation(
grid_size: ExprLike, block_size: ExprLike, stream: ExprLike | None
):
stmt_grid_size = make_statements(grid_size)
stmt_block_size = make_statements(block_size)
stmt_stream = make_statements(stream) if stream is not None else None
return SfgCudaKernelInvocation(
kernel_handle, stmt_grid_size, stmt_block_size, stmt_stream
)
grid_size: ExprLike
block_size: ExprLike
stream: ExprLike | None = kwargs.get("stream", None)
match launch_config:
case ManualLaunchConfiguration():
grid_size = kwargs["grid_size"]
block_size = kwargs["block_size"]
return _render_invocation(grid_size, block_size, stream)
case AutomaticLaunchConfiguration():
grid_size_entries = [
self.expr_from_lambda(gs) for gs in launch_config._grid_size
]
grid_size_var = dim3(const=True).var("__grid_size")
block_size_entries = [
self.expr_from_lambda(bs) for bs in launch_config._block_size
]
block_size_var = dim3(const=True).var("__block_size")
nodes = [
self.init(grid_size_var)(*grid_size_entries),
self.init(block_size_var)(*block_size_entries),
_render_invocation(grid_size_var, block_size_var, stream),
]
return SfgBlock(SfgSequence(nodes))
case DynamicBlockSizeLaunchConfiguration():
user_block_size: ExprLike | None = kwargs.get("block_size", None)
block_size_init_args: tuple[ExprLike, ...]
if user_block_size is None:
if launch_config.block_size is None:
raise ValueError(
"Neither a user-defined nor a default block size was defined."
)
block_size_init_args = tuple(
str(bs) for bs in launch_config.block_size
)
else:
block_size_init_args = (user_block_size,)
block_size_var = dim3(const=True).var("__block_size")
from ..lang.cpp import std
work_items_entries = [
self.expr_from_lambda(wit) for wit in launch_config.num_work_items
]
work_items_var = std.tuple(
"uint32_t", "uint32_t", "uint32_t", const=True
).var("__work_items")
def _min(a: ExprLike, b: ExprLike):
return AugExpr.format("{a} < {b} ? {a} : {b}", a=a, b=b)
def _div_ceil(a: ExprLike, b: ExprLike):
return AugExpr.format("({a} + {b} - 1) / {b}", a=a, b=b)
reduced_block_size_entries = [
_min(work_items_var.get(i), bs)
for i, bs in enumerate(
[block_size_var.x, block_size_var.y, block_size_var.z]
)
]
reduced_block_size_var = dim3(const=True).var("__reduced_block_size")
grid_size_entries = [
_div_ceil(work_items_var.get(i), bs)
for i, bs in enumerate(
[
reduced_block_size_var.x,
reduced_block_size_var.y,
reduced_block_size_var.z,
]
)
]
grid_size_var = dim3(const=True).var("__grid_size")
nodes = [
self.init(block_size_var)(*block_size_init_args),
self.init(work_items_var)(*work_items_entries),
self.init(reduced_block_size_var)(*reduced_block_size_entries),
self.init(grid_size_var)(*grid_size_entries),
_render_invocation(grid_size_var, reduced_block_size_var, stream),
]
return SfgBlock(SfgSequence(nodes))
case _:
raise ValueError(f"Unexpected launch configuration: {launch_config}")
def seq(self, *args: tuple | str | SfgCallTreeNode | SfgNodeBuilder) -> SfgSequence:
"""Syntax sequencing. For details, see `make_sequence`"""
return make_sequence(*args)
......
......@@ -3,12 +3,13 @@ from typing import TYPE_CHECKING
from .basic_composer import SfgBasicComposer
from .class_composer import SfgClassComposer
from .gpu_composer import SfgGpuComposer
if TYPE_CHECKING:
from ..context import SfgContext
class SfgComposer(SfgBasicComposer, SfgClassComposer):
class SfgComposer(SfgBasicComposer, SfgClassComposer, SfgGpuComposer):
"""Primary interface for constructing source files in pystencils-sfg.
The SfgComposer combines the `SfgBasicComposer`
......@@ -19,3 +20,4 @@ class SfgComposer(SfgBasicComposer, SfgClassComposer):
def __init__(self, sfg: SfgContext | SfgBasicComposer):
SfgBasicComposer.__init__(self, sfg)
SfgClassComposer.__init__(self)
SfgGpuComposer.__init__(self)
from __future__ import annotations
from typing import overload
from pystencils.codegen import GpuKernel
from pystencils.codegen.gpu_indexing import (
ManualLaunchConfiguration,
AutomaticLaunchConfiguration,
DynamicBlockSizeLaunchConfiguration,
)
from .mixin import SfgComposerMixIn
from .basic_composer import SfgBasicComposer, make_statements
from ..ir import (
SfgKernelHandle,
SfgCallTreeNode,
SfgCudaKernelInvocation,
SfgBlock,
SfgSequence,
)
from ..lang import ExprLike, AugExpr
from ..lang.gpu import ProvidesGpuRuntimeAPI
class SfgGpuComposer(SfgComposerMixIn):
def __init__(self) -> None:
self._gpu_api_provider: ProvidesGpuRuntimeAPI | None = None
def use_cuda(self):
from ..lang.gpu import CudaAPI
if self._gpu_api_provider is not None and not isinstance(
self._gpu_api_provider, CudaAPI
):
raise ValueError(
"Cannot select CUDA GPU API since another API was already chosen"
)
self._gpu_api_provider = CudaAPI()
def use_hip(self):
from ..lang.gpu import HipAPI
if self._gpu_api_provider is not None and not isinstance(
self._gpu_api_provider, HipAPI
):
raise ValueError(
"Cannot select HIP GPU API since another API was already chosen"
)
self._gpu_api_provider = HipAPI()
@property
def gpu_api(self) -> ProvidesGpuRuntimeAPI:
if self._gpu_api_provider is None:
raise AttributeError(
"No GPU API was selected - call `use_cuda()` or `use_hip()` first."
)
return self._gpu_api_provider
@overload
def gpu_invoke(
self,
kernel_handle: SfgKernelHandle,
*,
grid_size: ExprLike,
block_size: ExprLike,
stream: ExprLike | None = None,
) -> SfgCallTreeNode:
"""Invoke a CUDA kernel with a manual launch grid.
Requires that the kernel was generated with `manual_launch_grid <GpuOptions.manual_launch_grid>`
set to `True`.
"""
@overload
def gpu_invoke(
self,
kernel_handle: SfgKernelHandle,
*,
stream: ExprLike | None = None,
) -> SfgCallTreeNode:
"""Invoke a CUDA kernel with an automatic launch grid.
This signature accepts kernels generated with an indexing scheme that permits
the automatic inferrence of the launch grid, such as `Blockwise4D <IndexingScheme.Blockwise4D>`
"""
@overload
def gpu_invoke(
self,
kernel_handle: SfgKernelHandle,
*,
block_size: ExprLike | None = None,
stream: ExprLike | None = None,
) -> SfgCallTreeNode:
"""Invoke a CUDA kernel with a dynamic launch grid.
This signature accepts kernels generated with an indexing scheme that permits a user-defined
blocks size, such as `Linear3D <IndexingScheme.Linear3D>`.
The grid size is calculated automatically.
"""
def gpu_invoke(self, kernel_handle: SfgKernelHandle, **kwargs) -> SfgCallTreeNode:
assert isinstance(
self, SfgBasicComposer
) # for type checking this function body
ker = kernel_handle.kernel
if not isinstance(ker, GpuKernel):
raise ValueError(f"Non-GPU kernel was passed to `cuda_invoke`: {ker}")
launch_config = ker.get_launch_configuration()
dim3 = self.gpu_api.dim3
def _render_invocation(
grid_size: ExprLike, block_size: ExprLike, stream: ExprLike | None
):
stmt_grid_size = make_statements(grid_size)
stmt_block_size = make_statements(block_size)
stmt_stream = make_statements(stream) if stream is not None else None
return SfgCudaKernelInvocation(
kernel_handle, stmt_grid_size, stmt_block_size, stmt_stream
)
grid_size: ExprLike
block_size: ExprLike
stream: ExprLike | None = kwargs.get("stream", None)
match launch_config:
case ManualLaunchConfiguration():
grid_size = kwargs["grid_size"]
block_size = kwargs["block_size"]
return _render_invocation(grid_size, block_size, stream)
case AutomaticLaunchConfiguration():
grid_size_entries = [
self.expr_from_lambda(gs) for gs in launch_config._grid_size
]
grid_size_var = dim3(const=True).var("__grid_size")
block_size_entries = [
self.expr_from_lambda(bs) for bs in launch_config._block_size
]
block_size_var = dim3(const=True).var("__block_size")
nodes = [
self.init(grid_size_var)(*grid_size_entries),
self.init(block_size_var)(*block_size_entries),
_render_invocation(grid_size_var, block_size_var, stream),
]
return SfgBlock(SfgSequence(nodes))
case DynamicBlockSizeLaunchConfiguration():
user_block_size: ExprLike | None = kwargs.get("block_size", None)
block_size_init_args: tuple[ExprLike, ...]
if user_block_size is None:
if launch_config.block_size is None:
raise ValueError(
"Neither a user-defined nor a default block size was defined."
)
block_size_init_args = tuple(
str(bs) for bs in launch_config.block_size
)
else:
block_size_init_args = (user_block_size,)
block_size_var = dim3(const=True).var("__block_size")
from ..lang.cpp import std
work_items_entries = [
self.expr_from_lambda(wit) for wit in launch_config.num_work_items
]
work_items_var = std.tuple(
"uint32_t", "uint32_t", "uint32_t", const=True
).var("__work_items")
def _min(a: ExprLike, b: ExprLike):
return AugExpr.format("{a} < {b} ? {a} : {b}", a=a, b=b)
def _div_ceil(a: ExprLike, b: ExprLike):
return AugExpr.format("({a} + {b} - 1) / {b}", a=a, b=b)
reduced_block_size_entries = [
_min(work_items_var.get(i), bs)
for i, bs in enumerate(
[block_size_var.x, block_size_var.y, block_size_var.z]
)
]
reduced_block_size_var = dim3(const=True).var("__reduced_block_size")
grid_size_entries = [
_div_ceil(work_items_var.get(i), bs)
for i, bs in enumerate(
[
reduced_block_size_var.x,
reduced_block_size_var.y,
reduced_block_size_var.z,
]
)
]
grid_size_var = dim3(const=True).var("__grid_size")
nodes = [
self.init(block_size_var)(*block_size_init_args),
self.init(work_items_var)(*work_items_entries),
self.init(reduced_block_size_var)(*reduced_block_size_entries),
self.init(grid_size_var)(*grid_size_entries),
_render_invocation(grid_size_var, reduced_block_size_var, stream),
]
return SfgBlock(SfgSequence(nodes))
case _:
raise ValueError(f"Unexpected launch configuration: {launch_config}")
def cuda_invoke(
self,
kernel_handle: SfgKernelHandle,
num_blocks: ExprLike,
threads_per_block: ExprLike,
stream: ExprLike | None,
):
from warnings import warn
warn(
"cuda_invoke is deprecated and will be removed before version 0.1. "
"Call `use_cuda()` and use `gpu_invoke` instead.",
FutureWarning,
)
return self.gpu_invoke(
kernel_handle,
grid_size=num_blocks,
block_size=threads_per_block,
stream=stream,
)
from .expressions import CppClass, cpptype
from ..extensions.gpu import dim3class
dim3 = dim3class(
"<cuda_runtime.h>"
)
"""Reflection of CUDA's `dim3`."""
class cudaStream_t(CppClass):
template = cpptype("cudaStream_t", "<cuda_runtime.h>")
from __future__ import annotations
from typing import Protocol
from .expressions import CppClass, cpptype, AugExpr
class _Dim3Base(CppClass):
def ctor(self, dim0=1, dim1=1, dim2=1):
return self.ctor_bind(dim0, dim1, dim2)
@property
def x(self):
return AugExpr.format("{}.x", self)
@property
def y(self):
return AugExpr.format("{}.y", self)
@property
def z(self):
return AugExpr.format("{}.z", self)
@property
def dims(self):
"""The dims property."""
return [self.x, self.y, self.z]
class ProvidesGpuRuntimeAPI(Protocol):
dim3: type[_Dim3Base]
stream_t: type[AugExpr]
class CudaAPI(ProvidesGpuRuntimeAPI):
class dim3(_Dim3Base):
template = cpptype("dim3", "<cuda_runtime.h>")
class stream_t(CppClass):
template = cpptype("cudaStream_t", "<cuda_runtime.h>")
class HipAPI(ProvidesGpuRuntimeAPI):
class dim3(_Dim3Base):
template = cpptype("dim3", "<hip/hip_runtime.h>")
class stream_t(CppClass):
template = cpptype("hipStream_t", "<hip/hip_runtime.h>")
......@@ -104,6 +104,19 @@ CudaKernels:
- --expt-relaxed-constexpr
skip-if-not-found: true
# HIP
HipKernels:
sfg-args:
file-extensions: ["h++", "hip"]
compile:
cxx: hipcc
cxx-flags:
- -std=c++20
- -Wall
- -Werror
skip-if-not-found: true
# SYCL
SyclKernels:
......
from pystencilssfg import SourceFileGenerator
from pystencilssfg.lang.cuda import dim3
from pystencilssfg.lang.cpp import std
import pystencils as ps
......@@ -7,6 +6,7 @@ import pystencils as ps
std.mdspan.configure(namespace="std::experimental", header="<experimental/mdspan>")
with SourceFileGenerator() as sfg:
sfg.use_cuda()
src, dst = ps.fields("src, dst: double[3D]", layout="c")
asm = ps.Assignment(dst(0), 2 * src(0))
......@@ -14,10 +14,10 @@ with SourceFileGenerator() as sfg:
khandle = sfg.kernels.create(asm, "scale", cfg)
block_size = dim3().var("blockSize")
block_size = sfg.gpu_api.dim3().var("blockSize")
sfg.function("invoke")(
sfg.map_field(src, std.mdspan.from_field(src)),
sfg.map_field(dst, std.mdspan.from_field(dst)),
sfg.cuda_invoke(khandle, block_size=block_size)
sfg.gpu_invoke(khandle, block_size=block_size),
)
from pystencilssfg import SourceFileGenerator
from pystencilssfg.lang.cpp import std
import pystencils as ps
std.mdspan.configure(namespace="std::experimental", header="<experimental/mdspan>")
with SourceFileGenerator() as sfg:
sfg.use_hip()
src, dst = ps.fields("src, dst: double[3D]", layout="c")
asm = ps.Assignment(dst(0), 2 * src(0))
cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
khandle = sfg.kernels.create(asm, "scale", cfg)
block_size = sfg.gpu_api.dim3().var("blockSize")
sfg.function("invoke")(
sfg.map_field(src, std.mdspan.from_field(src)),
sfg.map_field(dst, std.mdspan.from_field(dst)),
sfg.gpu_invoke(khandle, block_size=block_size),
)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment