From 6c120a849a18bb8c5b6e60dad50c0ea5611bf1ba Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Thu, 6 Mar 2025 09:55:43 +0100 Subject: [PATCH] Add CUDA and HIP API provider protocols. Factor out GPU stuff into separate Gpu Composer. --- src/pystencilssfg/composer/basic_composer.py | 170 +----------- src/pystencilssfg/composer/composer.py | 4 +- src/pystencilssfg/composer/gpu_composer.py | 248 ++++++++++++++++++ src/pystencilssfg/lang/cuda.py | 13 - src/pystencilssfg/lang/gpu.py | 52 ++++ tests/generator_scripts/index.yaml | 13 + tests/generator_scripts/source/CudaKernels.py | 8 +- tests/generator_scripts/source/HipKernels.py | 23 ++ 8 files changed, 345 insertions(+), 186 deletions(-) create mode 100644 src/pystencilssfg/composer/gpu_composer.py delete mode 100644 src/pystencilssfg/lang/cuda.py create mode 100644 src/pystencilssfg/lang/gpu.py create mode 100644 tests/generator_scripts/source/HipKernels.py diff --git a/src/pystencilssfg/composer/basic_composer.py b/src/pystencilssfg/composer/basic_composer.py index 0466e6c..97334db 100644 --- a/src/pystencilssfg/composer/basic_composer.py +++ b/src/pystencilssfg/composer/basic_composer.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Sequence, TypeAlias, overload +from typing import Sequence, TypeAlias from abc import ABC, abstractmethod import sympy as sp from functools import reduce @@ -13,12 +13,7 @@ from pystencils import ( Assignment, AssignmentCollection, ) -from pystencils.codegen import Kernel, GpuKernel, Lambda -from pystencils.codegen.gpu_indexing import ( - ManualLaunchConfiguration, - AutomaticLaunchConfiguration, - DynamicBlockSizeLaunchConfiguration, -) +from pystencils.codegen import Kernel, Lambda from pystencils.types import create_type, UserTypeSpec, PsType from ..context import SfgContext, SfgCursor @@ -26,7 +21,6 @@ from .custom import CustomGenerator from ..ir import ( SfgCallTreeNode, SfgKernelCallNode, - SfgCudaKernelInvocation, SfgStatements, SfgFunctionParams, SfgRequireIncludes, @@ -404,166 +398,6 @@ class SfgBasicComposer(SfgIComposer): """ return SfgKernelCallNode(kernel_handle) - @overload - def cuda_invoke( - self, - kernel_handle: SfgKernelHandle, - *, - grid_size: ExprLike, - block_size: ExprLike, - stream: ExprLike | None = None, - ) -> SfgCallTreeNode: - """Invoke a CUDA kernel with a manual launch grid. - - Requires that the kernel was generated with `manual_launch_grid <GpuOptions.manual_launch_grid>` - set to `True`. - """ - - @overload - def cuda_invoke( - self, - kernel_handle: SfgKernelHandle, - *, - stream: ExprLike | None = None, - ) -> SfgCallTreeNode: - """Invoke a CUDA kernel with an automatic launch grid. - - This signature accepts kernels generated with an indexing scheme that permits - the automatic inferrence of the launch grid, such as `Blockwise4D <IndexingScheme.Blockwise4D>` - """ - - @overload - def cuda_invoke( - self, - kernel_handle: SfgKernelHandle, - *, - block_size: ExprLike | None = None, - stream: ExprLike | None = None, - ) -> SfgCallTreeNode: - """Invoke a CUDA kernel with a dynamic launch grid. - - This signature accepts kernels generated with an indexing scheme that permits a user-defined - blocks size, such as `Linear3D <IndexingScheme.Linear3D>`. - The grid size is calculated automatically. - """ - - def cuda_invoke(self, kernel_handle: SfgKernelHandle, **kwargs) -> SfgCallTreeNode: - ker = kernel_handle.kernel - - if not isinstance(ker, GpuKernel): - raise ValueError(f"Non-GPU kernel was passed to `cuda_invoke`: {ker}") - - launch_config = ker.get_launch_configuration() - - from ..lang.cuda import dim3 - - def _render_invocation( - grid_size: ExprLike, block_size: ExprLike, stream: ExprLike | None - ): - stmt_grid_size = make_statements(grid_size) - stmt_block_size = make_statements(block_size) - stmt_stream = make_statements(stream) if stream is not None else None - - return SfgCudaKernelInvocation( - kernel_handle, stmt_grid_size, stmt_block_size, stmt_stream - ) - - grid_size: ExprLike - block_size: ExprLike - stream: ExprLike | None = kwargs.get("stream", None) - - match launch_config: - case ManualLaunchConfiguration(): - grid_size = kwargs["grid_size"] - block_size = kwargs["block_size"] - - return _render_invocation(grid_size, block_size, stream) - - case AutomaticLaunchConfiguration(): - grid_size_entries = [ - self.expr_from_lambda(gs) for gs in launch_config._grid_size - ] - grid_size_var = dim3(const=True).var("__grid_size") - - block_size_entries = [ - self.expr_from_lambda(bs) for bs in launch_config._block_size - ] - block_size_var = dim3(const=True).var("__block_size") - - nodes = [ - self.init(grid_size_var)(*grid_size_entries), - self.init(block_size_var)(*block_size_entries), - _render_invocation(grid_size_var, block_size_var, stream), - ] - - return SfgBlock(SfgSequence(nodes)) - - case DynamicBlockSizeLaunchConfiguration(): - user_block_size: ExprLike | None = kwargs.get("block_size", None) - - block_size_init_args: tuple[ExprLike, ...] - if user_block_size is None: - if launch_config.block_size is None: - raise ValueError( - "Neither a user-defined nor a default block size was defined." - ) - - block_size_init_args = tuple( - str(bs) for bs in launch_config.block_size - ) - else: - block_size_init_args = (user_block_size,) - - block_size_var = dim3(const=True).var("__block_size") - - from ..lang.cpp import std - - work_items_entries = [ - self.expr_from_lambda(wit) for wit in launch_config.num_work_items - ] - work_items_var = std.tuple( - "uint32_t", "uint32_t", "uint32_t", const=True - ).var("__work_items") - - def _min(a: ExprLike, b: ExprLike): - return AugExpr.format("{a} < {b} ? {a} : {b}", a=a, b=b) - - def _div_ceil(a: ExprLike, b: ExprLike): - return AugExpr.format("({a} + {b} - 1) / {b}", a=a, b=b) - - reduced_block_size_entries = [ - _min(work_items_var.get(i), bs) - for i, bs in enumerate( - [block_size_var.x, block_size_var.y, block_size_var.z] - ) - ] - reduced_block_size_var = dim3(const=True).var("__reduced_block_size") - - grid_size_entries = [ - _div_ceil(work_items_var.get(i), bs) - for i, bs in enumerate( - [ - reduced_block_size_var.x, - reduced_block_size_var.y, - reduced_block_size_var.z, - ] - ) - ] - grid_size_var = dim3(const=True).var("__grid_size") - - nodes = [ - self.init(block_size_var)(*block_size_init_args), - self.init(work_items_var)(*work_items_entries), - self.init(reduced_block_size_var)(*reduced_block_size_entries), - self.init(grid_size_var)(*grid_size_entries), - _render_invocation(grid_size_var, reduced_block_size_var, stream), - ] - - return SfgBlock(SfgSequence(nodes)) - - case _: - raise ValueError(f"Unexpected launch configuration: {launch_config}") - def seq(self, *args: tuple | str | SfgCallTreeNode | SfgNodeBuilder) -> SfgSequence: """Syntax sequencing. For details, see `make_sequence`""" return make_sequence(*args) diff --git a/src/pystencilssfg/composer/composer.py b/src/pystencilssfg/composer/composer.py index bba479e..b1cfc4b 100644 --- a/src/pystencilssfg/composer/composer.py +++ b/src/pystencilssfg/composer/composer.py @@ -3,12 +3,13 @@ from typing import TYPE_CHECKING from .basic_composer import SfgBasicComposer from .class_composer import SfgClassComposer +from .gpu_composer import SfgGpuComposer if TYPE_CHECKING: from ..context import SfgContext -class SfgComposer(SfgBasicComposer, SfgClassComposer): +class SfgComposer(SfgBasicComposer, SfgClassComposer, SfgGpuComposer): """Primary interface for constructing source files in pystencils-sfg. The SfgComposer combines the `SfgBasicComposer` @@ -19,3 +20,4 @@ class SfgComposer(SfgBasicComposer, SfgClassComposer): def __init__(self, sfg: SfgContext | SfgBasicComposer): SfgBasicComposer.__init__(self, sfg) SfgClassComposer.__init__(self) + SfgGpuComposer.__init__(self) diff --git a/src/pystencilssfg/composer/gpu_composer.py b/src/pystencilssfg/composer/gpu_composer.py new file mode 100644 index 0000000..bedc206 --- /dev/null +++ b/src/pystencilssfg/composer/gpu_composer.py @@ -0,0 +1,248 @@ +from __future__ import annotations + +from typing import overload + +from pystencils.codegen import GpuKernel +from pystencils.codegen.gpu_indexing import ( + ManualLaunchConfiguration, + AutomaticLaunchConfiguration, + DynamicBlockSizeLaunchConfiguration, +) + +from .mixin import SfgComposerMixIn +from .basic_composer import SfgBasicComposer, make_statements + +from ..ir import ( + SfgKernelHandle, + SfgCallTreeNode, + SfgCudaKernelInvocation, + SfgBlock, + SfgSequence, +) +from ..lang import ExprLike, AugExpr +from ..lang.gpu import ProvidesGpuRuntimeAPI + + +class SfgGpuComposer(SfgComposerMixIn): + + def __init__(self) -> None: + self._gpu_api_provider: ProvidesGpuRuntimeAPI | None = None + + def use_cuda(self): + from ..lang.gpu import CudaAPI + + if self._gpu_api_provider is not None and not isinstance( + self._gpu_api_provider, CudaAPI + ): + raise ValueError( + "Cannot select CUDA GPU API since another API was already chosen" + ) + + self._gpu_api_provider = CudaAPI() + + def use_hip(self): + from ..lang.gpu import HipAPI + + if self._gpu_api_provider is not None and not isinstance( + self._gpu_api_provider, HipAPI + ): + raise ValueError( + "Cannot select HIP GPU API since another API was already chosen" + ) + + self._gpu_api_provider = HipAPI() + + @property + def gpu_api(self) -> ProvidesGpuRuntimeAPI: + if self._gpu_api_provider is None: + raise AttributeError( + "No GPU API was selected - call `use_cuda()` or `use_hip()` first." + ) + + return self._gpu_api_provider + + @overload + def gpu_invoke( + self, + kernel_handle: SfgKernelHandle, + *, + grid_size: ExprLike, + block_size: ExprLike, + stream: ExprLike | None = None, + ) -> SfgCallTreeNode: + """Invoke a CUDA kernel with a manual launch grid. + + Requires that the kernel was generated with `manual_launch_grid <GpuOptions.manual_launch_grid>` + set to `True`. + """ + + @overload + def gpu_invoke( + self, + kernel_handle: SfgKernelHandle, + *, + stream: ExprLike | None = None, + ) -> SfgCallTreeNode: + """Invoke a CUDA kernel with an automatic launch grid. + + This signature accepts kernels generated with an indexing scheme that permits + the automatic inferrence of the launch grid, such as `Blockwise4D <IndexingScheme.Blockwise4D>` + """ + + @overload + def gpu_invoke( + self, + kernel_handle: SfgKernelHandle, + *, + block_size: ExprLike | None = None, + stream: ExprLike | None = None, + ) -> SfgCallTreeNode: + """Invoke a CUDA kernel with a dynamic launch grid. + + This signature accepts kernels generated with an indexing scheme that permits a user-defined + blocks size, such as `Linear3D <IndexingScheme.Linear3D>`. + The grid size is calculated automatically. + """ + + def gpu_invoke(self, kernel_handle: SfgKernelHandle, **kwargs) -> SfgCallTreeNode: + assert isinstance( + self, SfgBasicComposer + ) # for type checking this function body + + ker = kernel_handle.kernel + + if not isinstance(ker, GpuKernel): + raise ValueError(f"Non-GPU kernel was passed to `cuda_invoke`: {ker}") + + launch_config = ker.get_launch_configuration() + + dim3 = self.gpu_api.dim3 + + def _render_invocation( + grid_size: ExprLike, block_size: ExprLike, stream: ExprLike | None + ): + stmt_grid_size = make_statements(grid_size) + stmt_block_size = make_statements(block_size) + stmt_stream = make_statements(stream) if stream is not None else None + + return SfgCudaKernelInvocation( + kernel_handle, stmt_grid_size, stmt_block_size, stmt_stream + ) + + grid_size: ExprLike + block_size: ExprLike + stream: ExprLike | None = kwargs.get("stream", None) + + match launch_config: + case ManualLaunchConfiguration(): + grid_size = kwargs["grid_size"] + block_size = kwargs["block_size"] + + return _render_invocation(grid_size, block_size, stream) + + case AutomaticLaunchConfiguration(): + grid_size_entries = [ + self.expr_from_lambda(gs) for gs in launch_config._grid_size + ] + grid_size_var = dim3(const=True).var("__grid_size") + + block_size_entries = [ + self.expr_from_lambda(bs) for bs in launch_config._block_size + ] + block_size_var = dim3(const=True).var("__block_size") + + nodes = [ + self.init(grid_size_var)(*grid_size_entries), + self.init(block_size_var)(*block_size_entries), + _render_invocation(grid_size_var, block_size_var, stream), + ] + + return SfgBlock(SfgSequence(nodes)) + + case DynamicBlockSizeLaunchConfiguration(): + user_block_size: ExprLike | None = kwargs.get("block_size", None) + + block_size_init_args: tuple[ExprLike, ...] + if user_block_size is None: + if launch_config.block_size is None: + raise ValueError( + "Neither a user-defined nor a default block size was defined." + ) + + block_size_init_args = tuple( + str(bs) for bs in launch_config.block_size + ) + else: + block_size_init_args = (user_block_size,) + + block_size_var = dim3(const=True).var("__block_size") + + from ..lang.cpp import std + + work_items_entries = [ + self.expr_from_lambda(wit) for wit in launch_config.num_work_items + ] + work_items_var = std.tuple( + "uint32_t", "uint32_t", "uint32_t", const=True + ).var("__work_items") + + def _min(a: ExprLike, b: ExprLike): + return AugExpr.format("{a} < {b} ? {a} : {b}", a=a, b=b) + + def _div_ceil(a: ExprLike, b: ExprLike): + return AugExpr.format("({a} + {b} - 1) / {b}", a=a, b=b) + + reduced_block_size_entries = [ + _min(work_items_var.get(i), bs) + for i, bs in enumerate( + [block_size_var.x, block_size_var.y, block_size_var.z] + ) + ] + reduced_block_size_var = dim3(const=True).var("__reduced_block_size") + + grid_size_entries = [ + _div_ceil(work_items_var.get(i), bs) + for i, bs in enumerate( + [ + reduced_block_size_var.x, + reduced_block_size_var.y, + reduced_block_size_var.z, + ] + ) + ] + grid_size_var = dim3(const=True).var("__grid_size") + + nodes = [ + self.init(block_size_var)(*block_size_init_args), + self.init(work_items_var)(*work_items_entries), + self.init(reduced_block_size_var)(*reduced_block_size_entries), + self.init(grid_size_var)(*grid_size_entries), + _render_invocation(grid_size_var, reduced_block_size_var, stream), + ] + + return SfgBlock(SfgSequence(nodes)) + + case _: + raise ValueError(f"Unexpected launch configuration: {launch_config}") + + def cuda_invoke( + self, + kernel_handle: SfgKernelHandle, + num_blocks: ExprLike, + threads_per_block: ExprLike, + stream: ExprLike | None, + ): + from warnings import warn + + warn( + "cuda_invoke is deprecated and will be removed before version 0.1. " + "Call `use_cuda()` and use `gpu_invoke` instead.", + FutureWarning, + ) + + return self.gpu_invoke( + kernel_handle, + grid_size=num_blocks, + block_size=threads_per_block, + stream=stream, + ) diff --git a/src/pystencilssfg/lang/cuda.py b/src/pystencilssfg/lang/cuda.py deleted file mode 100644 index 28794da..0000000 --- a/src/pystencilssfg/lang/cuda.py +++ /dev/null @@ -1,13 +0,0 @@ -from .expressions import CppClass, cpptype - - -from ..extensions.gpu import dim3class - -dim3 = dim3class( - "<cuda_runtime.h>" -) -"""Reflection of CUDA's `dim3`.""" - - -class cudaStream_t(CppClass): - template = cpptype("cudaStream_t", "<cuda_runtime.h>") diff --git a/src/pystencilssfg/lang/gpu.py b/src/pystencilssfg/lang/gpu.py new file mode 100644 index 0000000..ccf86d9 --- /dev/null +++ b/src/pystencilssfg/lang/gpu.py @@ -0,0 +1,52 @@ +from __future__ import annotations + +from typing import Protocol + +from .expressions import CppClass, cpptype, AugExpr + + +class _Dim3Base(CppClass): + def ctor(self, dim0=1, dim1=1, dim2=1): + return self.ctor_bind(dim0, dim1, dim2) + + @property + def x(self): + return AugExpr.format("{}.x", self) + + @property + def y(self): + return AugExpr.format("{}.y", self) + + @property + def z(self): + return AugExpr.format("{}.z", self) + + @property + def dims(self): + """The dims property.""" + return [self.x, self.y, self.z] + + +class ProvidesGpuRuntimeAPI(Protocol): + + dim3: type[_Dim3Base] + + stream_t: type[AugExpr] + + +class CudaAPI(ProvidesGpuRuntimeAPI): + + class dim3(_Dim3Base): + template = cpptype("dim3", "<cuda_runtime.h>") + + class stream_t(CppClass): + template = cpptype("cudaStream_t", "<cuda_runtime.h>") + + +class HipAPI(ProvidesGpuRuntimeAPI): + + class dim3(_Dim3Base): + template = cpptype("dim3", "<hip/hip_runtime.h>") + + class stream_t(CppClass): + template = cpptype("hipStream_t", "<hip/hip_runtime.h>") diff --git a/tests/generator_scripts/index.yaml b/tests/generator_scripts/index.yaml index bfbedda..837ea10 100644 --- a/tests/generator_scripts/index.yaml +++ b/tests/generator_scripts/index.yaml @@ -104,6 +104,19 @@ CudaKernels: - --expt-relaxed-constexpr skip-if-not-found: true +# HIP + +HipKernels: + sfg-args: + file-extensions: ["h++", "hip"] + compile: + cxx: hipcc + cxx-flags: + - -std=c++20 + - -Wall + - -Werror + skip-if-not-found: true + # SYCL SyclKernels: diff --git a/tests/generator_scripts/source/CudaKernels.py b/tests/generator_scripts/source/CudaKernels.py index 9bd37a5..21064f6 100644 --- a/tests/generator_scripts/source/CudaKernels.py +++ b/tests/generator_scripts/source/CudaKernels.py @@ -1,5 +1,4 @@ from pystencilssfg import SourceFileGenerator -from pystencilssfg.lang.cuda import dim3 from pystencilssfg.lang.cpp import std import pystencils as ps @@ -7,6 +6,7 @@ import pystencils as ps std.mdspan.configure(namespace="std::experimental", header="<experimental/mdspan>") with SourceFileGenerator() as sfg: + sfg.use_cuda() src, dst = ps.fields("src, dst: double[3D]", layout="c") asm = ps.Assignment(dst(0), 2 * src(0)) @@ -14,10 +14,10 @@ with SourceFileGenerator() as sfg: khandle = sfg.kernels.create(asm, "scale", cfg) - block_size = dim3().var("blockSize") - + block_size = sfg.gpu_api.dim3().var("blockSize") + sfg.function("invoke")( sfg.map_field(src, std.mdspan.from_field(src)), sfg.map_field(dst, std.mdspan.from_field(dst)), - sfg.cuda_invoke(khandle, block_size=block_size) + sfg.gpu_invoke(khandle, block_size=block_size), ) diff --git a/tests/generator_scripts/source/HipKernels.py b/tests/generator_scripts/source/HipKernels.py new file mode 100644 index 0000000..16508d2 --- /dev/null +++ b/tests/generator_scripts/source/HipKernels.py @@ -0,0 +1,23 @@ +from pystencilssfg import SourceFileGenerator +from pystencilssfg.lang.cpp import std + +import pystencils as ps + +std.mdspan.configure(namespace="std::experimental", header="<experimental/mdspan>") + +with SourceFileGenerator() as sfg: + sfg.use_hip() + + src, dst = ps.fields("src, dst: double[3D]", layout="c") + asm = ps.Assignment(dst(0), 2 * src(0)) + cfg = ps.CreateKernelConfig(target=ps.Target.CUDA) + + khandle = sfg.kernels.create(asm, "scale", cfg) + + block_size = sfg.gpu_api.dim3().var("blockSize") + + sfg.function("invoke")( + sfg.map_field(src, std.mdspan.from_field(src)), + sfg.map_field(dst, std.mdspan.from_field(dst)), + sfg.gpu_invoke(khandle, block_size=block_size), + ) -- GitLab