diff --git a/src/pystencilssfg/composer/gpu_composer.py b/src/pystencilssfg/composer/gpu_composer.py index bedc2067eff11760936f04cd740027143211e639..63949800188e7b4f66c7bb2568d369f34135271d 100644 --- a/src/pystencilssfg/composer/gpu_composer.py +++ b/src/pystencilssfg/composer/gpu_composer.py @@ -15,7 +15,7 @@ from .basic_composer import SfgBasicComposer, make_statements from ..ir import ( SfgKernelHandle, SfgCallTreeNode, - SfgCudaKernelInvocation, + SfgGpuKernelInvocation, SfgBlock, SfgSequence, ) @@ -68,6 +68,7 @@ class SfgGpuComposer(SfgComposerMixIn): *, grid_size: ExprLike, block_size: ExprLike, + shared_memory_bytes: ExprLike = "0", stream: ExprLike | None = None, ) -> SfgCallTreeNode: """Invoke a CUDA kernel with a manual launch grid. @@ -81,6 +82,7 @@ class SfgGpuComposer(SfgComposerMixIn): self, kernel_handle: SfgKernelHandle, *, + shared_memory_bytes: ExprLike = "0", stream: ExprLike | None = None, ) -> SfgCallTreeNode: """Invoke a CUDA kernel with an automatic launch grid. @@ -95,6 +97,7 @@ class SfgGpuComposer(SfgComposerMixIn): kernel_handle: SfgKernelHandle, *, block_size: ExprLike | None = None, + shared_memory_bytes: ExprLike = "0", stream: ExprLike | None = None, ) -> SfgCallTreeNode: """Invoke a CUDA kernel with a dynamic launch grid. @@ -118,27 +121,35 @@ class SfgGpuComposer(SfgComposerMixIn): dim3 = self.gpu_api.dim3 - def _render_invocation( - grid_size: ExprLike, block_size: ExprLike, stream: ExprLike | None - ): + grid_size: ExprLike + block_size: ExprLike + shared_memory_bytes: ExprLike = kwargs.get("shared_memory_bytes", "0") + stream: ExprLike | None = kwargs.get("stream", None) + + def _render_invocation(grid_size: ExprLike, block_size: ExprLike): stmt_grid_size = make_statements(grid_size) stmt_block_size = make_statements(block_size) + stmt_smem = ( + make_statements(shared_memory_bytes) + if shared_memory_bytes is not None + else None + ) stmt_stream = make_statements(stream) if stream is not None else None - return SfgCudaKernelInvocation( - kernel_handle, stmt_grid_size, stmt_block_size, stmt_stream + return SfgGpuKernelInvocation( + kernel_handle, + stmt_grid_size, + stmt_block_size, + shared_memory_bytes=stmt_smem, + stream=stmt_stream, ) - grid_size: ExprLike - block_size: ExprLike - stream: ExprLike | None = kwargs.get("stream", None) - match launch_config: case ManualLaunchConfiguration(): grid_size = kwargs["grid_size"] block_size = kwargs["block_size"] - return _render_invocation(grid_size, block_size, stream) + return _render_invocation(grid_size, block_size) case AutomaticLaunchConfiguration(): grid_size_entries = [ @@ -154,7 +165,7 @@ class SfgGpuComposer(SfgComposerMixIn): nodes = [ self.init(grid_size_var)(*grid_size_entries), self.init(block_size_var)(*block_size_entries), - _render_invocation(grid_size_var, block_size_var, stream), + _render_invocation(grid_size_var, block_size_var), ] return SfgBlock(SfgSequence(nodes)) @@ -217,7 +228,7 @@ class SfgGpuComposer(SfgComposerMixIn): self.init(work_items_var)(*work_items_entries), self.init(reduced_block_size_var)(*reduced_block_size_entries), self.init(grid_size_var)(*grid_size_entries), - _render_invocation(grid_size_var, reduced_block_size_var, stream), + _render_invocation(grid_size_var, reduced_block_size_var), ] return SfgBlock(SfgSequence(nodes)) diff --git a/src/pystencilssfg/ir/__init__.py b/src/pystencilssfg/ir/__init__.py index 8f03fed0d4c2467377cdaab6cf100a13f7ded9fb..0d93fb148333921d60f25cf68ace89f61873e383 100644 --- a/src/pystencilssfg/ir/__init__.py +++ b/src/pystencilssfg/ir/__init__.py @@ -3,7 +3,7 @@ from .call_tree import ( SfgCallTreeLeaf, SfgEmptyNode, SfgKernelCallNode, - SfgCudaKernelInvocation, + SfgGpuKernelInvocation, SfgBlock, SfgSequence, SfgStatements, @@ -47,7 +47,7 @@ __all__ = [ "SfgCallTreeLeaf", "SfgEmptyNode", "SfgKernelCallNode", - "SfgCudaKernelInvocation", + "SfgGpuKernelInvocation", "SfgSequence", "SfgBlock", "SfgStatements", diff --git a/src/pystencilssfg/ir/call_tree.py b/src/pystencilssfg/ir/call_tree.py index 78ba84117cadf9f83f8d0cfdd4afc2ff2e169e81..ab84db4fe8452e57b3b3a85b0fb529865e84c5f9 100644 --- a/src/pystencilssfg/ir/call_tree.py +++ b/src/pystencilssfg/ir/call_tree.py @@ -210,12 +210,20 @@ class SfgKernelCallNode(SfgCallTreeLeaf): return f"{fnc_name}({call_parameters});" -class SfgCudaKernelInvocation(SfgCallTreeNode): +class SfgGpuKernelInvocation(SfgCallTreeNode): + """A CUDA or HIP kernel invocation. + + See https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#execution-configuration + or https://rocmdocs.amd.com/projects/HIP/en/latest/how-to/hip_cpp_language_extensions.html#calling-global-functions + for the syntax. + """ + def __init__( self, kernel_handle: SfgKernelHandle, grid_size: SfgStatements, block_size: SfgStatements, + shared_memory_bytes: SfgStatements | None, stream: SfgStatements | None, ): from pystencils import Target @@ -231,14 +239,23 @@ class SfgCudaKernelInvocation(SfgCallTreeNode): self._kernel_handle = kernel_handle self._grid_size = grid_size self._block_size = block_size + self._shared_memory_bytes = shared_memory_bytes self._stream = stream @property def children(self) -> Sequence[SfgCallTreeNode]: return ( - self._grid_size, - self._block_size, - ) + ((self._stream,) if self._stream is not None else ()) + ( + self._grid_size, + self._block_size, + ) + + ( + (self._shared_memory_bytes,) + if self._shared_memory_bytes is not None + else () + ) + + ((self._stream,) if self._stream is not None else ()) + ) @property def depends(self) -> set[SfgVar]: @@ -250,6 +267,9 @@ class SfgCudaKernelInvocation(SfgCallTreeNode): call_parameters = ", ".join([p.name for p in kparams]) grid_args = [self._grid_size, self._block_size] + if self._shared_memory_bytes is not None: + grid_args += [self._shared_memory_bytes] + if self._stream is not None: grid_args += [self._stream] diff --git a/tests/generator_scripts/index.yaml b/tests/generator_scripts/index.yaml index 837ea10bb526774bea5457c2bb437ea9a4939910..788c8bd58efd077ea3a47884b260a65ad2dc49dc 100644 --- a/tests/generator_scripts/index.yaml +++ b/tests/generator_scripts/index.yaml @@ -108,7 +108,7 @@ CudaKernels: HipKernels: sfg-args: - file-extensions: ["h++", "hip"] + file-extensions: ["hpp", "hip"] compile: cxx: hipcc cxx-flags: diff --git a/tests/generator_scripts/source/HipKernels.harness.cpp b/tests/generator_scripts/source/HipKernels.harness.cpp new file mode 100644 index 0000000000000000000000000000000000000000..495f100456dd07dd0be246a0437338f737c997fe --- /dev/null +++ b/tests/generator_scripts/source/HipKernels.harness.cpp @@ -0,0 +1,72 @@ +#include "HipKernels.hpp" + +#include <hip/hip_runtime.h> + +#include <experimental/mdspan> +#include <random> +#include <iostream> + +#undef NDEBUG +#include <cassert> + +namespace stdex = std::experimental; + +using extents_t = stdex::dextents<uint64_t, 3>; +using field_t = stdex::mdspan<double, extents_t, stdex::layout_right>; + +void checkHipError(hipError_t err) +{ + if (err != hipSuccess) + { + std::cerr << "HIP Error: " << err << std::endl; + exit(2); + } +} + +int main(void) +{ + + extents_t extents{23, 25, 132}; + size_t items{extents.extent(0) * extents.extent(1) * extents.extent(2)}; + + double *data_src; + checkHipError(hipMallocManaged<double>(&data_src, sizeof(double) * items)); + field_t src{data_src, extents}; + + double *data_dst; + checkHipError(hipMallocManaged<double>(&data_dst, sizeof(double) * items)); + field_t dst{data_dst, extents}; + + std::random_device rd; + std::mt19937 gen{rd()}; + std::uniform_real_distribution<double> distrib{-1.0, 1.0}; + + for (size_t i = 0; i < items; ++i) + { + data_src[i] = distrib(gen); + } + + dim3 blockSize{64, 8, 1}; + + hipStream_t stream; + checkHipError(hipStreamCreate(&stream)); + + gen::gpuScaleKernel(blockSize, dst, src, stream); + + checkHipError(hipStreamSynchronize(stream)); + + for (size_t i = 0; i < items; ++i) + { + const double desired = 2.0 * data_src[i]; + if (std::abs(desired - data_dst[i]) >= 1e-12) + { + std::cerr << "Mismatch at element " << i << "; Desired: " << desired << "; Actual: " << data_dst[i] << std::endl; + exit(EXIT_FAILURE); + } + } + + checkHipError(hipFree(data_src)); + checkHipError(hipFree(data_dst)); + + return EXIT_SUCCESS; +} diff --git a/tests/generator_scripts/source/HipKernels.py b/tests/generator_scripts/source/HipKernels.py index 16508d2a93f1aaf75c4b6dfb1b3fea26ecda62e7..ed229337ceb6efe51378483eb83752abd0d89c52 100644 --- a/tests/generator_scripts/source/HipKernels.py +++ b/tests/generator_scripts/source/HipKernels.py @@ -7,6 +7,7 @@ std.mdspan.configure(namespace="std::experimental", header="<experimental/mdspan with SourceFileGenerator() as sfg: sfg.use_hip() + sfg.namespace("gen") src, dst = ps.fields("src, dst: double[3D]", layout="c") asm = ps.Assignment(dst(0), 2 * src(0)) @@ -15,9 +16,10 @@ with SourceFileGenerator() as sfg: khandle = sfg.kernels.create(asm, "scale", cfg) block_size = sfg.gpu_api.dim3().var("blockSize") + stream = sfg.gpu_api.stream_t().var("stream") - sfg.function("invoke")( - sfg.map_field(src, std.mdspan.from_field(src)), - sfg.map_field(dst, std.mdspan.from_field(dst)), - sfg.gpu_invoke(khandle, block_size=block_size), + sfg.function("gpuScaleKernel")( + sfg.map_field(src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right")), + sfg.map_field(dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right")), + sfg.gpu_invoke(khandle, block_size=block_size, stream=stream), )