diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 1420bd2b53e918cfce8e6b5491e94888ad36a0f9..9a6e7b5155f7aade1c8c5c0bbdac73f854a56df9 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -38,10 +38,13 @@ typechecker:
         coverage_format: cobertura
         path: coverage.xml
 
-"testsuite-py3.10":
+"testsuite-py3.10+cuda":
   extends: .testsuite-base
   script:
     - nox --session testsuite-3.10
+  tags:
+    - docker
+    - cuda11
 
 "testsuite-py3.13":
   extends: .testsuite-base
diff --git a/tests/generator_scripts/source/CudaKernels.harness.cpp b/tests/generator_scripts/source/CudaKernels.harness.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b86a7c2146d9bd2cb30a8b91b03b7dd738208e33
--- /dev/null
+++ b/tests/generator_scripts/source/CudaKernels.harness.cpp
@@ -0,0 +1,116 @@
+#include "CudaKernels.hpp"
+
+#include <cuda_runtime.h>
+
+#include <experimental/mdspan>
+#include <random>
+#include <iostream>
+#include <functional>
+#include <cmath>
+#include <cstdint>
+#include <cstdlib>
+
+#undef NDEBUG
+#include <cassert>
+
+namespace stdex = std::experimental;
+
+using extents_t = stdex::dextents<uint64_t, 3>;
+using field_t = stdex::mdspan<double, extents_t, stdex::layout_right>;
+
+void checkCudaError(cudaError_t err)
+{
+    if (err != cudaSuccess)
+    {
+        std::cerr << "HIP Error: " << err << std::endl;
+        exit(2);
+    }
+}
+
+int main(void)
+{
+
+    extents_t extents{23, 25, 132};
+    size_t items{extents.extent(0) * extents.extent(1) * extents.extent(2)};
+
+    double *data_src;
+    checkCudaError(cudaMallocManaged<double>(&data_src, sizeof(double) * items));
+    field_t src{data_src, extents};
+
+    double *data_dst;
+    checkCudaError(cudaMallocManaged<double>(&data_dst, sizeof(double) * items));
+    field_t dst{data_dst, extents};
+
+    std::random_device rd;
+    std::mt19937 gen{rd()};
+    std::uniform_real_distribution<double> distrib{-1.0, 1.0};
+
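+    /* Fill src with random values, reset dst to NaN, run the given kernel invocation, then check dst == 2 * src. */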
+    auto check = [&](std::function< void () > invoke) {
+        for (size_t i = 0; i < items; ++i)
+        {
+            data_src[i] = distrib(gen);
+            data_dst[i] = NAN;
+        }
+
+        invoke();
+
+        for (size_t i = 0; i < items; ++i)
+        {
+            const double desired = 2.0 * data_src[i];
+            /* Negated comparison so that a NaN left in dst (i.e. the kernel never wrote) is also reported */
+            if (!(std::abs(desired - data_dst[i]) < 1e-12))
+            {
+                std::cerr << "Mismatch at element " << i << "; Desired: " << desired << "; Actual: " << data_dst[i] << std::endl;
+                exit(EXIT_FAILURE);
+            }
+        }
+    };
+
+    check([&]() {
+        /* Linear3D Dynamic */
+        dim3 blockSize{64, 8, 1};
+        cudaStream_t stream;
+        checkCudaError(cudaStreamCreate(&stream));
+        gen::linear3d::scaleKernel(blockSize, dst, src, stream);
+        checkCudaError(cudaStreamSynchronize(stream));
+        checkCudaError(cudaStreamDestroy(stream));
+    });
+
+    check([&]() {
+        /* Blockwise4D Automatic */
+        cudaStream_t stream;
+        checkCudaError(cudaStreamCreate(&stream));
+        gen::blockwise4d::scaleKernel(dst, src, stream);
+        checkCudaError(cudaStreamSynchronize(stream));
+        checkCudaError(cudaStreamDestroy(stream));
+    });
+
+    check([&]() {
+        /* Linear3D Manual */
+        dim3 blockSize{32, 8, 1};
+        dim3 gridSize{5, 4, 23};
+
+        cudaStream_t stream;
+        checkCudaError(cudaStreamCreate(&stream));
+        gen::linear3d_manual::scaleKernel(blockSize, dst, gridSize, src, stream);
+        checkCudaError(cudaStreamSynchronize(stream));
+        checkCudaError(cudaStreamDestroy(stream));
+    });
+
+    check([&]() {
+        /* Blockwise4D Manual */
+        dim3 blockSize{132, 1, 1};
+        dim3 gridSize{25, 23, 1};
+        cudaStream_t stream;
+        checkCudaError(cudaStreamCreate(&stream));
+        gen::blockwise4d_manual::scaleKernel(blockSize, dst, gridSize, src, stream);
+        checkCudaError(cudaStreamSynchronize(stream));
+        checkCudaError(cudaStreamDestroy(stream));
+    });
+
+    checkCudaError(cudaFree(data_src));
+    checkCudaError(cudaFree(data_dst));
+
+    return EXIT_SUCCESS;
+}
diff --git a/tests/generator_scripts/source/CudaKernels.py b/tests/generator_scripts/source/CudaKernels.py
index 21064f69a2bb5ce014ce0208807dd1c1a05fd2cf..dc7e643bab0d33aa957d3d7957f679c0d9729086 100644
--- a/tests/generator_scripts/source/CudaKernels.py
+++ b/tests/generator_scripts/source/CudaKernels.py
@@ -5,19 +5,84 @@ import pystencils as ps
 
 std.mdspan.configure(namespace="std::experimental", header="<experimental/mdspan>")
 
+
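+# Scale kernel (dst = 2 * src), generated below in four launch-configuration variants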
+src, dst = ps.fields("src, dst: double[3D]", layout="c")
+asm = ps.Assignment(dst(0), 2 * src(0))
+
+
 with SourceFileGenerator() as sfg:
     sfg.use_cuda()
+    sfg.namespace("gen")
 
-    src, dst = ps.fields("src, dst: double[3D]", layout="c")
-    asm = ps.Assignment(dst(0), 2 * src(0))
-    cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
-
-    khandle = sfg.kernels.create(asm, "scale", cfg)
+    base_config = ps.CreateKernelConfig(target=ps.Target.CUDA)
 
     block_size = sfg.gpu_api.dim3().var("blockSize")
+    grid_size = sfg.gpu_api.dim3().var("gridSize")
+    stream = sfg.gpu_api.stream_t().var("stream")
+
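+    # linear3d indexing scheme; the block size is passed in by the caller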
+    with sfg.namespace("linear3d"):
+        cfg = base_config.copy()
+        cfg.gpu.indexing_scheme = "linear3d"
+        khandle = sfg.kernels.create(asm, "scale", cfg)
+
+        sfg.function("scaleKernel")(
+            sfg.map_field(
+                src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right")
+            ),
+            sfg.map_field(
+                dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right")
+            ),
+            sfg.gpu_invoke(khandle, block_size=block_size, stream=stream),
+        )
+
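+    # blockwise4d indexing scheme; no explicit launch dimensions, only a stream is passed to gpu_invoke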
+    with sfg.namespace("blockwise4d"):
+        cfg = base_config.copy()
+        cfg.gpu.indexing_scheme = "blockwise4d"
+        khandle = sfg.kernels.create(asm, "scale", cfg)
+
+        sfg.function("scaleKernel")(
+            sfg.map_field(
+                src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right")
+            ),
+            sfg.map_field(
+                dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right")
+            ),
+            sfg.gpu_invoke(khandle, stream=stream),
+        )
+
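+    # linear3d indexing with manual_launch_grid: both block and grid size are passed in by the caller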
+    with sfg.namespace("linear3d_manual"):
+        cfg = base_config.copy()
+        cfg.gpu.indexing_scheme = "linear3d"
+        cfg.gpu.manual_launch_grid = True
+        khandle = sfg.kernels.create(asm, "scale", cfg)
+
+        sfg.function("scaleKernel")(
+            sfg.map_field(
+                src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right")
+            ),
+            sfg.map_field(
+                dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right")
+            ),
+            sfg.gpu_invoke(khandle, block_size=block_size, grid_size=grid_size, stream=stream),
+        )
+
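+    # blockwise4d indexing with manual_launch_grid: both block and grid size are passed in by the caller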
+    with sfg.namespace("blockwise4d_manual"):
+        cfg = base_config.copy()
+        cfg.gpu.indexing_scheme = "blockwise4d"
+        cfg.gpu.manual_launch_grid = True
+        khandle = sfg.kernels.create(asm, "scale", cfg)
 
-    sfg.function("invoke")(
-        sfg.map_field(src, std.mdspan.from_field(src)),
-        sfg.map_field(dst, std.mdspan.from_field(dst)),
-        sfg.gpu_invoke(khandle, block_size=block_size),
-    )
+        sfg.function("scaleKernel")(
+            sfg.map_field(
+                src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right")
+            ),
+            sfg.map_field(
+                dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right")
+            ),
+            sfg.gpu_invoke(khandle, block_size=block_size, grid_size=grid_size, stream=stream),
+        )