From 4bc5ff44dd2076e36411ee037bf1fc48a1efaf57 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Mon, 10 Mar 2025 16:43:00 +0100
Subject: [PATCH] add cuda tests; add CUDA requirement to CI

---
 .gitlab-ci.yml                                |   5 +-
 .../source/CudaKernels.harness.cpp            | 107 ++++++++++++++++++
 tests/generator_scripts/source/CudaKernels.py |  80 +++++++++++--
 3 files changed, 181 insertions(+), 11 deletions(-)
 create mode 100644 tests/generator_scripts/source/CudaKernels.harness.cpp

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 1420bd2..9a6e7b5 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -38,10 +38,13 @@ typechecker:
         coverage_format: cobertura
         path: coverage.xml
 
-"testsuite-py3.10":
+"testsuite-py3.10+cuda":
   extends: .testsuite-base
   script:
     - nox --session testsuite-3.10
+  tags:
+    - docker
+    - cuda11
 
 "testsuite-py3.13":
   extends: .testsuite-base
diff --git a/tests/generator_scripts/source/CudaKernels.harness.cpp b/tests/generator_scripts/source/CudaKernels.harness.cpp
new file mode 100644
index 0000000..b86a7c2
--- /dev/null
+++ b/tests/generator_scripts/source/CudaKernels.harness.cpp
@@ -0,0 +1,107 @@
+#include "CudaKernels.hpp"
+
+#include <cuda_runtime.h>
+
+#include <experimental/mdspan>
+#include <random>
+#include <iostream>
+#include <functional>
+
+#undef NDEBUG
+#include <cassert>
+
+namespace stdex = std::experimental;
+
+using extents_t = stdex::dextents<uint64_t, 3>;
+using field_t = stdex::mdspan<double, extents_t, stdex::layout_right>;
+
+void checkCudaError(cudaError_t err)
+{
+    if (err != cudaSuccess)
+    {
+        std::cerr << "CUDA Error: " << err << std::endl;
+        exit(2);
+    }
+}
+
+int main(void)
+{
+
+    extents_t extents{23, 25, 132};
+    size_t items{extents.extent(0) * extents.extent(1) * extents.extent(2)};
+
+    double *data_src;
+    checkCudaError(cudaMallocManaged<double>(&data_src, sizeof(double) * items));
+    field_t src{data_src, extents};
+
+    double *data_dst;
+    checkCudaError(cudaMallocManaged<double>(&data_dst, sizeof(double) * items));
+    field_t dst{data_dst, extents};
+
+    std::random_device rd;
+    std::mt19937 gen{rd()};
+    std::uniform_real_distribution<double> distrib{-1.0, 1.0};
+
+    auto check = [&](std::function< void () > invoke) {
+        for (size_t i = 0; i < items; ++i)
+        {
+            data_src[i] = distrib(gen);
+            data_dst[i] = NAN;
+        }
+
+        invoke();
+
+        for (size_t i = 0; i < items; ++i)
+        {
+            const double desired = 2.0 * data_src[i];
+            if (std::abs(desired - data_dst[i]) >= 1e-12)
+            {
+                std::cerr << "Mismatch at element " << i << "; Desired: " << desired << "; Actual: " << data_dst[i] << std::endl;
+                exit(EXIT_FAILURE);
+            }
+        }
+    };
+
+    check([&]() {
+        /* Linear3D Dynamic */
+        dim3 blockSize{64, 8, 1};
+        cudaStream_t stream;
+        checkCudaError(cudaStreamCreate(&stream));
+        gen::linear3d::scaleKernel(blockSize, dst, src, stream);
+        checkCudaError(cudaStreamSynchronize(stream));
+    });
+
+    check([&]() {
+        /* Blockwise4D Automatic */
+        cudaStream_t stream;
+        checkCudaError(cudaStreamCreate(&stream));
+        gen::blockwise4d::scaleKernel(dst, src, stream);
+        checkCudaError(cudaStreamSynchronize(stream));
+    });
+
+    check([&]() {
+        /* Linear3D Manual */
+        dim3 blockSize{32, 8, 1};
+        dim3 gridSize{5, 4, 23};
+
+        cudaStream_t stream;
+        checkCudaError(cudaStreamCreate(&stream));
+        gen::linear3d_manual::scaleKernel(blockSize, dst, gridSize, src, stream);
+        checkCudaError(cudaStreamSynchronize(stream));
+    });
+
+    check([&]() {
+        /* Blockwise4D Manual */
+        dim3 blockSize{132, 1, 1};
+        dim3 gridSize{25, 23, 1};
+        cudaStream_t stream;
+        checkCudaError(cudaStreamCreate(&stream));
+        gen::blockwise4d_manual::scaleKernel(blockSize, dst, gridSize, src, stream);
+        checkCudaError(cudaStreamSynchronize(stream));
+    });
+
+    checkCudaError(cudaFree(data_src));
+    checkCudaError(cudaFree(data_dst));
+
+    return EXIT_SUCCESS;
+}
diff --git a/tests/generator_scripts/source/CudaKernels.py b/tests/generator_scripts/source/CudaKernels.py
index 21064f6..dc7e643 100644
--- a/tests/generator_scripts/source/CudaKernels.py
+++ b/tests/generator_scripts/source/CudaKernels.py
@@ -5,19 +5,79 @@ import pystencils as ps
 
 std.mdspan.configure(namespace="std::experimental", header="<experimental/mdspan>")
 
+
+src, dst = ps.fields("src, dst: double[3D]", layout="c")
+asm = ps.Assignment(dst(0), 2 * src(0))
+
+
 with SourceFileGenerator() as sfg:
     sfg.use_cuda()
+    sfg.namespace("gen")
 
-    src, dst = ps.fields("src, dst: double[3D]", layout="c")
-    asm = ps.Assignment(dst(0), 2 * src(0))
-    cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
-
-    khandle = sfg.kernels.create(asm, "scale", cfg)
+    base_config = ps.CreateKernelConfig(target=ps.Target.CUDA)
 
     block_size = sfg.gpu_api.dim3().var("blockSize")
+    grid_size = sfg.gpu_api.dim3().var("gridSize")
+    stream = sfg.gpu_api.stream_t().var("stream")
+
+    with sfg.namespace("linear3d"):
+        cfg = base_config.copy()
+        cfg.gpu.indexing_scheme = "linear3d"
+        khandle = sfg.kernels.create(asm, "scale", cfg)
+
+        sfg.function("scaleKernel")(
+            sfg.map_field(
+                src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right")
+            ),
+            sfg.map_field(
+                dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right")
+            ),
+            sfg.gpu_invoke(khandle, block_size=block_size, stream=stream),
+        )
+
+    with sfg.namespace("blockwise4d"):
+        cfg = base_config.copy()
+        cfg.gpu.indexing_scheme = "blockwise4d"
+        khandle = sfg.kernels.create(asm, "scale", cfg)
+
+        sfg.function("scaleKernel")(
+            sfg.map_field(
+                src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right")
+            ),
+            sfg.map_field(
+                dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right")
+            ),
+            sfg.gpu_invoke(khandle, stream=stream),
+        )
+
+    with sfg.namespace("linear3d_manual"):
+        cfg = base_config.copy()
+        cfg.gpu.indexing_scheme = "linear3d"
+        cfg.gpu.manual_launch_grid = True
+        khandle = sfg.kernels.create(asm, "scale", cfg)
+
+        sfg.function("scaleKernel")(
+            sfg.map_field(
+                src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right")
+            ),
+            sfg.map_field(
+                dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right")
+            ),
+            sfg.gpu_invoke(khandle, block_size=block_size, grid_size=grid_size, stream=stream),
+        )
+
+    with sfg.namespace("blockwise4d_manual"):
+        cfg = base_config.copy()
+        cfg.gpu.indexing_scheme = "blockwise4d"
+        cfg.gpu.manual_launch_grid = True
+        khandle = sfg.kernels.create(asm, "scale", cfg)
 
-    sfg.function("invoke")(
-        sfg.map_field(src, std.mdspan.from_field(src)),
-        sfg.map_field(dst, std.mdspan.from_field(dst)),
-        sfg.gpu_invoke(khandle, block_size=block_size),
-    )
+        sfg.function("scaleKernel")(
+            sfg.map_field(
+                src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right")
+            ),
+            sfg.map_field(
+                dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right")
+            ),
+            sfg.gpu_invoke(khandle, block_size=block_size, grid_size=grid_size, stream=stream),
+        )
--
GitLab