From b1b71c327617e57eee03c532f74c38116439a39d Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Mon, 10 Mar 2025 16:24:03 +0100 Subject: [PATCH] extend HIP test case. Do not alter block sizes in composer. --- src/pystencilssfg/composer/gpu_composer.py | 53 +++++-------- src/pystencilssfg/context.py | 4 + .../source/HipKernels.harness.cpp | 73 ++++++++++++----- tests/generator_scripts/source/HipKernels.py | 78 ++++++++++++++++--- 4 files changed, 146 insertions(+), 62 deletions(-) diff --git a/src/pystencilssfg/composer/gpu_composer.py b/src/pystencilssfg/composer/gpu_composer.py index 6394980..b24afcd 100644 --- a/src/pystencilssfg/composer/gpu_composer.py +++ b/src/pystencilssfg/composer/gpu_composer.py @@ -29,31 +29,24 @@ class SfgGpuComposer(SfgComposerMixIn): self._gpu_api_provider: ProvidesGpuRuntimeAPI | None = None def use_cuda(self): + """Instruct the GPU composer to use the CUDA runtime API""" from ..lang.gpu import CudaAPI - if self._gpu_api_provider is not None and not isinstance( - self._gpu_api_provider, CudaAPI - ): - raise ValueError( - "Cannot select CUDA GPU API since another API was already chosen" - ) - self._gpu_api_provider = CudaAPI() def use_hip(self): + """Instruct the GPU composer to use the HIP runtime API""" from ..lang.gpu import HipAPI - if self._gpu_api_provider is not None and not isinstance( - self._gpu_api_provider, HipAPI - ): - raise ValueError( - "Cannot select HIP GPU API since another API was already chosen" - ) - self._gpu_api_provider = HipAPI() @property def gpu_api(self) -> ProvidesGpuRuntimeAPI: + """GPU runtime API wrapper currently used by this GPU composer. + + Raises: + AttributeError: If no runtime API was set yet (see `use_cuda`, `use_hip`) + """ if self._gpu_api_provider is None: raise AttributeError( "No GPU API was selected - call `use_cuda()` or `use_hip()` first." @@ -104,7 +97,8 @@ class SfgGpuComposer(SfgComposerMixIn): This signature accepts kernels generated with an indexing scheme that permits a user-defined blocks size, such as `Linear3D <IndexingScheme.Linear3D>`. - The grid size is calculated automatically. + The grid size is calculated automatically by dividing the number of work items in each + dimension by the block size, rounding up. """ def gpu_invoke(self, kernel_handle: SfgKernelHandle, **kwargs) -> SfgCallTreeNode: @@ -144,6 +138,9 @@ class SfgGpuComposer(SfgComposerMixIn): stream=stmt_stream, ) + def to_uint32_t(expr: AugExpr) -> AugExpr: + return AugExpr("uint32_t").format("uint32_t({})", expr) + match launch_config: case ManualLaunchConfiguration(): grid_size = kwargs["grid_size"] @@ -153,12 +150,14 @@ class SfgGpuComposer(SfgComposerMixIn): case AutomaticLaunchConfiguration(): grid_size_entries = [ - self.expr_from_lambda(gs) for gs in launch_config._grid_size + to_uint32_t(self.expr_from_lambda(gs)) + for gs in launch_config._grid_size ] grid_size_var = dim3(const=True).var("__grid_size") block_size_entries = [ - self.expr_from_lambda(bs) for bs in launch_config._block_size + to_uint32_t(self.expr_from_lambda(bs)) + for bs in launch_config._block_size ] block_size_var = dim3(const=True).var("__block_size") @@ -197,27 +196,16 @@ class SfgGpuComposer(SfgComposerMixIn): "uint32_t", "uint32_t", "uint32_t", const=True ).var("__work_items") - def _min(a: ExprLike, b: ExprLike): - return AugExpr.format("{a} < {b} ? {a} : {b}", a=a, b=b) - def _div_ceil(a: ExprLike, b: ExprLike): return AugExpr.format("({a} + {b} - 1) / {b}", a=a, b=b) - reduced_block_size_entries = [ - _min(work_items_var.get(i), bs) - for i, bs in enumerate( - [block_size_var.x, block_size_var.y, block_size_var.z] - ) - ] - reduced_block_size_var = dim3(const=True).var("__reduced_block_size") - grid_size_entries = [ _div_ceil(work_items_var.get(i), bs) for i, bs in enumerate( [ - reduced_block_size_var.x, - reduced_block_size_var.y, - reduced_block_size_var.z, + block_size_var.x, + block_size_var.y, + block_size_var.z, ] ) ] @@ -226,9 +214,8 @@ class SfgGpuComposer(SfgComposerMixIn): nodes = [ self.init(block_size_var)(*block_size_init_args), self.init(work_items_var)(*work_items_entries), - self.init(reduced_block_size_var)(*reduced_block_size_entries), self.init(grid_size_var)(*grid_size_entries), - _render_invocation(grid_size_var, reduced_block_size_var), + _render_invocation(grid_size_var, block_size_var), ] return SfgBlock(SfgSequence(nodes)) diff --git a/src/pystencilssfg/context.py b/src/pystencilssfg/context.py index 1622a1e..3ea82f2 100644 --- a/src/pystencilssfg/context.py +++ b/src/pystencilssfg/context.py @@ -150,6 +150,9 @@ class SfgCursor: self._loc[f].append(block) self._loc[f] = block.elements + outer_namespace = self._cur_namespace + self._cur_namespace = namespace + @contextmanager def ctxmgr(): try: @@ -157,5 +160,6 @@ class SfgCursor: finally: # Have the cursor step back out of the nested namespace blocks self._loc = outer_locs + self._cur_namespace = outer_namespace return ctxmgr() diff --git a/tests/generator_scripts/source/HipKernels.harness.cpp b/tests/generator_scripts/source/HipKernels.harness.cpp index 495f100..b6d2d2d 100644 --- a/tests/generator_scripts/source/HipKernels.harness.cpp +++ b/tests/generator_scripts/source/HipKernels.harness.cpp @@ -5,6 +5,7 @@ #include <experimental/mdspan> #include <random> #include <iostream> +#include <functional> #undef NDEBUG #include <cassert> @@ -41,29 +42,63 @@ int main(void) std::mt19937 gen{rd()}; std::uniform_real_distribution<double> distrib{-1.0, 1.0}; - for (size_t i = 0; i < items; ++i) - { - data_src[i] = distrib(gen); - } - - dim3 blockSize{64, 8, 1}; - - hipStream_t stream; - checkHipError(hipStreamCreate(&stream)); - - gen::gpuScaleKernel(blockSize, dst, src, stream); + auto check = [&](std::function< void () > invoke) { + for (size_t i = 0; i < items; ++i) + { + data_src[i] = distrib(gen); + data_dst[i] = NAN; + } - checkHipError(hipStreamSynchronize(stream)); + invoke(); - for (size_t i = 0; i < items; ++i) - { - const double desired = 2.0 * data_src[i]; - if (std::abs(desired - data_dst[i]) >= 1e-12) + for (size_t i = 0; i < items; ++i) { - std::cerr << "Mismatch at element " << i << "; Desired: " << desired << "; Actual: " << data_dst[i] << std::endl; - exit(EXIT_FAILURE); + const double desired = 2.0 * data_src[i]; + if (std::abs(desired - data_dst[i]) >= 1e-12) + { + std::cerr << "Mismatch at element " << i << "; Desired: " << desired << "; Actual: " << data_dst[i] << std::endl; + exit(EXIT_FAILURE); + } } - } + }; + + check([&]() { + /* Linear3D Dynamic */ + dim3 blockSize{64, 8, 1}; + hipStream_t stream; + checkHipError(hipStreamCreate(&stream)); + gen::linear3d::scaleKernel(blockSize, dst, src, stream); + checkHipError(hipStreamSynchronize(stream)); + }); + + check([&]() { + /* Blockwise4D Automatic */ + hipStream_t stream; + checkHipError(hipStreamCreate(&stream)); + gen::blockwise4d::scaleKernel(dst, src, stream); + checkHipError(hipStreamSynchronize(stream)); + }); + + check([&]() { + /* Linear3D Manual */ + dim3 blockSize{32, 8, 1}; + dim3 gridSize{5, 4, 23}; + + hipStream_t stream; + checkHipError(hipStreamCreate(&stream)); + gen::linear3d_manual::scaleKernel(blockSize, dst, gridSize, src, stream); + checkHipError(hipStreamSynchronize(stream)); + }); + + check([&]() { + /* Blockwise4D Manual */ + dim3 blockSize{132, 1, 1}; + dim3 gridSize{25, 23, 1}; + hipStream_t stream; + checkHipError(hipStreamCreate(&stream)); + gen::blockwise4d_manual::scaleKernel(blockSize, dst, gridSize, src, stream); + checkHipError(hipStreamSynchronize(stream)); + }); checkHipError(hipFree(data_src)); checkHipError(hipFree(data_dst)); diff --git a/tests/generator_scripts/source/HipKernels.py b/tests/generator_scripts/source/HipKernels.py index ed22933..35315b8 100644 --- a/tests/generator_scripts/source/HipKernels.py +++ b/tests/generator_scripts/source/HipKernels.py @@ -5,21 +5,79 @@ import pystencils as ps std.mdspan.configure(namespace="std::experimental", header="<experimental/mdspan>") + +src, dst = ps.fields("src, dst: double[3D]", layout="c") +asm = ps.Assignment(dst(0), 2 * src(0)) + + with SourceFileGenerator() as sfg: sfg.use_hip() sfg.namespace("gen") - src, dst = ps.fields("src, dst: double[3D]", layout="c") - asm = ps.Assignment(dst(0), 2 * src(0)) - cfg = ps.CreateKernelConfig(target=ps.Target.CUDA) - - khandle = sfg.kernels.create(asm, "scale", cfg) + base_config = ps.CreateKernelConfig(target=ps.Target.CUDA) block_size = sfg.gpu_api.dim3().var("blockSize") + grid_size = sfg.gpu_api.dim3().var("gridSize") stream = sfg.gpu_api.stream_t().var("stream") - sfg.function("gpuScaleKernel")( - sfg.map_field(src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right")), - sfg.map_field(dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right")), - sfg.gpu_invoke(khandle, block_size=block_size, stream=stream), - ) + with sfg.namespace("linear3d"): + cfg = base_config.copy() + cfg.gpu.indexing_scheme = "linear3d" + khandle = sfg.kernels.create(asm, "scale", cfg) + + sfg.function("scaleKernel")( + sfg.map_field( + src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right") + ), + sfg.map_field( + dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right") + ), + sfg.gpu_invoke(khandle, block_size=block_size, stream=stream), + ) + + with sfg.namespace("blockwise4d"): + cfg = base_config.copy() + cfg.gpu.indexing_scheme = "blockwise4d" + khandle = sfg.kernels.create(asm, "scale", cfg) + + sfg.function("scaleKernel")( + sfg.map_field( + src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right") + ), + sfg.map_field( + dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right") + ), + sfg.gpu_invoke(khandle, stream=stream), + ) + + with sfg.namespace("linear3d_manual"): + cfg = base_config.copy() + cfg.gpu.indexing_scheme = "linear3d" + cfg.gpu.manual_launch_grid = True + khandle = sfg.kernels.create(asm, "scale", cfg) + + sfg.function("scaleKernel")( + sfg.map_field( + src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right") + ), + sfg.map_field( + dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right") + ), + sfg.gpu_invoke(khandle, block_size=block_size, grid_size=grid_size, stream=stream), + ) + + with sfg.namespace("blockwise4d_manual"): + cfg = base_config.copy() + cfg.gpu.indexing_scheme = "blockwise4d" + cfg.gpu.manual_launch_grid = True + khandle = sfg.kernels.create(asm, "scale", cfg) + + sfg.function("scaleKernel")( + sfg.map_field( + src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right") + ), + sfg.map_field( + dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right") + ), + sfg.gpu_invoke(khandle, block_size=block_size, grid_size=grid_size, stream=stream), + ) -- GitLab