Commit 6c635e09 authored by Richard Angersbach

Merge branch 'devel/fix_reduction_docs' into 'v2.0-dev'

Fix output of warp-level kernel in reduction user guide

See merge request !465
parents 1cea8375 ac5bc2ce
@@ -88,8 +88,8 @@ create the kernel object via the {any}`create_kernel` function.
For this example, we assume a kernel configuration for CPU platforms with no optimizations explicitly enabled.

```{code-cell} ipython3
-cfg = ps.CreateKernelConfig(target=ps.Target.CurrentCPU)
-kernel = ps.create_kernel(assign_sum, cfg)
+cpu_cfg = ps.CreateKernelConfig(target=ps.Target.CurrentCPU)
+kernel = ps.create_kernel(assign_sum, cpu_cfg)
ps.inspect(kernel)
```
@@ -111,14 +111,14 @@ but will be incorporated in the reduction computation.
Since our reduction result is a single scalar value, it is sufficient to set up an array comprising a single value.

```{code-cell} ipython3
kernel_func = kernel.compile()

x_array = np.ones((4, 4, 4), dtype="float64")
reduction_result = np.zeros((1,), dtype="float64")

kernel_func(x=x_array, r=reduction_result)
reduction_result[0]
```
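A quick cross-check of the expected output, assuming the write-back array's initial value of zero is folded into the reduction as the text above states:

```python
import numpy as np

# 4 * 4 * 4 ones sum to 64.0; the initial 0.0 in reduction_result is included.
np.ones((4, 4, 4), dtype="float64").sum()  # 64.0
```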
### GPU Platforms
@@ -128,11 +128,11 @@ Similar to the CPU section, a base variant for NVIDIA GPUs without
explicitly employing any optimizations is shown:

```{code-cell} ipython3
-cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
-kernel_gpu = ps.create_kernel(assign_sum, cfg)
+gpu_cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
+kernel_gpu = ps.create_kernel(assign_sum, gpu_cfg)
ps.inspect(kernel_gpu)
```

The steps for running the generated code on NVIDIA GPUs are identical but the fields and the write-back pointer
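(The sentence presumably continues that fields and the write-back pointer must reside in device memory.) A minimal sketch of such an invocation, assuming `cupy` is available and following the same pattern the GPU test further below uses (`cp.asarray`, `.get()`); the variable names are illustrative, not part of the diff:

```python
import cupy as cp

# Fields and the write-back pointer live in device memory.
x_gpu = cp.ones((4, 4, 4), dtype="float64")
reduction_result_gpu = cp.zeros((1,), dtype="float64")

kernel_gpu_func = kernel_gpu.compile()
kernel_gpu_func(x=x_gpu, r=reduction_result_gpu)

# Copy the scalar result back to the host.
reduction_result_gpu.get()[0]
```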
@@ -159,17 +159,17 @@ which are not supported yet.
```{code-cell} ipython3
# configure SIMD vectorization
-cfg = ps.CreateKernelConfig(
+cpu_cfg_opt = ps.CreateKernelConfig(
    target=ps.Target.X86_AVX,
)
-cfg.cpu.vectorize.enable = True
-cfg.cpu.vectorize.assume_inner_stride_one = True
+cpu_cfg_opt.cpu.vectorize.enable = True
+cpu_cfg_opt.cpu.vectorize.assume_inner_stride_one = True

# configure OpenMP parallelization
-cfg.cpu.openmp.enable = True
-cfg.cpu.openmp.num_threads = 8
+cpu_cfg_opt.cpu.openmp.enable = True
+cpu_cfg_opt.cpu.openmp.num_threads = 8

-kernel_cpu_opt = ps.create_kernel(assign_sum, cfg)
+kernel_cpu_opt = ps.create_kernel(assign_sum, cpu_cfg_opt)
ps.inspect(kernel_cpu_opt)
```
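Running the optimized kernel would presumably follow the same pattern as the base CPU run above, assuming the compiled kernel keeps the parameter names `x` and `r`; a hypothetical invocation:

```python
kernel_cpu_opt_func = kernel_cpu_opt.compile()

# A larger array gives the SIMD lanes and the 8 OpenMP threads actual work.
x_array = np.ones((64, 64, 64), dtype="float64")
reduction_result = np.zeros((1,), dtype="float64")

kernel_cpu_opt_func(x=x_array, r=reduction_result)
reduction_result[0]  # expected: 64**3 = 262144.0
```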
@@ -190,13 +190,14 @@ we employ a block fitting algorithm to obtain a block size that is also optimized
You can find more detailed information about warp size alignment in {ref}`gpu_codegen`.

```{code-cell} ipython3
-cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
-cfg.gpu.assume_warp_aligned_block_size = True
-kernel_gpu_opt = ps.create_kernel(assign_sum, cfg)
+gpu_cfg_opt = ps.CreateKernelConfig(target=ps.Target.CUDA)
+gpu_cfg_opt.gpu.assume_warp_aligned_block_size = True
+gpu_cfg_opt.gpu.warp_size = 32
+kernel_gpu_opt = ps.create_kernel(assign_sum, gpu_cfg_opt)

kernel_func = kernel_gpu_opt.compile()
kernel_func.launch_config.fit_block_size((32, 1, 1))

ps.inspect(kernel_gpu_opt)
```
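The warp-alignment constraint behind `assume_warp_aligned_block_size` is plain arithmetic: the number of threads per block must be a multiple of the warp size. A hypothetical helper illustrating the rounding involved (not the pystencils implementation):

```python
def round_up_to_warp_multiple(threads_per_block: int, warp_size: int = 32) -> int:
    # Smallest multiple of warp_size that is >= threads_per_block.
    return ((threads_per_block + warp_size - 1) // warp_size) * warp_size

assert round_up_to_warp_multiple(48) == 64
assert round_up_to_warp_multiple(64) == 64
```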
@@ -256,21 +256,18 @@ class GenericGpu(Platform):
        raise MaterializationError(f"Unknown type of iteration space: {ispace}")

    @staticmethod
-    def _thread_index_per_dim(ispace: IterationSpace) -> tuple[PsExpression, ...]:
+    def _block_local_thread_index_per_dim(ispace: IterationSpace) -> tuple[PsExpression, ...]:
        """Returns thread indices multiplied with block dimension strides per dimension."""
        return tuple(
-            idx
-            * PsConstantExpr(
-                PsConstant(reduce(operator.mul, BLOCK_DIM[:i], 1), SInt(32))
-            )
+            idx * reduce(operator.mul, BLOCK_DIM[:i]) if i > 0 else idx
            for i, idx in enumerate(THREAD_IDX[: ispace.rank])
        )

    def _first_thread_in_warp(self, ispace: IterationSpace) -> PsExpression:
        """Returns expression that determines whether a thread is the first within a warp."""
-        tids_per_dim = GenericGpu._thread_index_per_dim(ispace)
+        tids_per_dim = GenericGpu._block_local_thread_index_per_dim(ispace)
        tid: PsExpression = tids_per_dim[0]
        for t in tids_per_dim[1:]:
            tid = PsAdd(tid, t)
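To see what the renamed helper computes: the block-local linear thread ID is the thread index dotted with the block-dimension strides, and `_first_thread_in_warp` presumably checks whether that ID is a multiple of the warp size. A plain-Python analogue of the arithmetic (illustrative only, not the pystencils expression tree):

```python
import operator
from functools import reduce

def block_local_linear_tid(thread_idx: tuple[int, ...], block_dim: tuple[int, ...]) -> int:
    # Stride of dimension i is the product of all block dimensions below i,
    # e.g. (1, block_dim[0], block_dim[0] * block_dim[1]) for rank 3.
    return sum(
        idx * reduce(operator.mul, block_dim[:i], 1)
        for i, idx in enumerate(thread_idx)
    )

# Thread (0, 2, 0) in a (16, 8, 1) block has linear ID 2 * 16 = 32,
# i.e. the first thread of the second warp for warp size 32.
assert block_local_linear_tid((0, 2, 0), (16, 8, 1)) % 32 == 0
```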
@@ -5,21 +5,35 @@ import pystencils as ps
from pystencils import Target
from pystencils.sympyextensions import ReductionOp, reduction_assignment

-INIT_W = 5
-INIT_ARR = 2
-SIZE = 15
-SOLUTION = {
-    ReductionOp.Add: INIT_W + INIT_ARR * SIZE,
-    ReductionOp.Sub: INIT_W - INIT_ARR * SIZE,
-    ReductionOp.Mul: INIT_W * INIT_ARR**SIZE,
-    ReductionOp.Min: min(INIT_W, INIT_ARR),
-    ReductionOp.Max: max(INIT_W, INIT_ARR),
-}
+init_w = 5.0
+arr_size = 16
+
+
+def init_arr(op):
+    match op:
+        case ReductionOp.Mul:
+            return 0.99  # avoid value overflow for large array sizes
+        case _:
+            return 2.0
+
+
+def get_expected_solution(op, array):
+    match op:
+        case ReductionOp.Add:
+            return init_w + np.sum(array)
+        case ReductionOp.Sub:
+            return init_w - np.sum(array)
+        case ReductionOp.Mul:
+            return init_w * np.prod(array)
+        case ReductionOp.Min:
+            return min(init_w, np.min(array))
+        case ReductionOp.Max:
+            return max(init_w, np.max(array))


# get AST for kernel with reduction assignment
-def get_reduction_assign_ast(dtype, op, config):
-    x = ps.fields(f"x: {dtype}[1d]")
+def get_reduction_assign_ast(dtype, op, dims, config):
+    x = ps.fields(f"x: {dtype}[{dims}d]")
    w = ps.TypedSymbol("w", dtype)

    red_assign = reduction_assignment(w, op, x.center())
@@ -27,15 +41,22 @@ def get_reduction_assign_ast(dtype, op, config):
    return ps.create_kernel([red_assign], config, default_dtype=dtype)


-def get_cpu_array(op, dtype):
+def get_cpu_array(dtype, op, dims):
+    shape = (arr_size,) * dims
+
    # increase difficulty of min/max tests by using range of values
    match op:
-        case ReductionOp.Min:
-            return np.linspace(INIT_ARR, INIT_ARR + SIZE, SIZE, dtype=dtype)
-        case ReductionOp.Max:
-            return np.linspace(INIT_ARR - SIZE, INIT_ARR, SIZE, dtype=dtype)
+        case ReductionOp.Min | ReductionOp.Max:
+            lo = init_arr(op) - arr_size
+            mi = init_arr(op)
+            hi = init_arr(op) + arr_size
+
+            if op is ReductionOp.Min:
+                return np.random.randint(mi, hi, size=shape).astype(dtype)
+            else:
+                return np.random.randint(lo, mi, size=shape).astype(dtype)
        case _:
-            return np.full((SIZE,), INIT_ARR, dtype=dtype)
+            return np.full(shape, init_arr(op), dtype=dtype)


@pytest.mark.parametrize(
@@ -52,7 +73,12 @@ def get_cpu_array(op, dtype):
        ReductionOp.Max,
    ],
)
-def test_reduction_cpu(target, dtype, op):
+@pytest.mark.parametrize("dims", [1, 2, 3])
+def test_reduction_cpu(
+    target: ps.Target,
+    dtype: str,
+    op: str,
+    dims: int):
    config = ps.CreateKernelConfig(target=target)
    config.cpu.openmp.enable = True
@@ -60,14 +86,14 @@ def test_reduction_cpu(target, dtype, op):
        config.cpu.vectorize.enable = True
        config.cpu.vectorize.assume_inner_stride_one = True

-    ast_reduction = get_reduction_assign_ast(dtype, op, config)
+    ast_reduction = get_reduction_assign_ast(dtype, op, dims, config)
    kernel_reduction = ast_reduction.compile()

-    array = get_cpu_array(op, dtype)
-    reduction_array = np.full((1,), INIT_W, dtype=dtype)
+    array = get_cpu_array(dtype, op, dims)
+    reduction_array = np.full((1,), init_w, dtype=dtype)

    kernel_reduction(x=array, w=reduction_array)

-    assert np.allclose(reduction_array, SOLUTION[op])
+    assert np.allclose(reduction_array, get_expected_solution(op, array))

@pytest.mark.parametrize("dtype", ["float64", "float32"])
@@ -81,6 +107,7 @@ def test_reduction_cpu(target, dtype, op):
        ReductionOp.Max,
    ],
)
+@pytest.mark.parametrize("dims", [1, 2, 3])
@pytest.mark.parametrize("assume_warp_aligned_block_size", [True, False])
@pytest.mark.parametrize("use_block_fitting", [True, False])
@pytest.mark.parametrize("warp_size", [32, None])
@@ -90,6 +117,7 @@ def test_reduction_cpu(target, dtype, op):
def test_reduction_gpu(
    dtype: str,
    op: str,
+    dims: int,
    assume_warp_aligned_block_size: bool,
    use_block_fitting: bool,
    warp_size: int | None,
@@ -101,17 +129,17 @@ def test_reduction_gpu(
    if warp_size:
        cfg.gpu.warp_size = warp_size

-    ast_reduction = get_reduction_assign_ast(dtype, op, cfg)
+    ast_reduction = get_reduction_assign_ast(dtype, op, dims, cfg)
    kernel_reduction = ast_reduction.compile()

    if use_block_fitting and warp_size:
        kernel_reduction.launch_config.fit_block_size((warp_size, 1, 1))

-    array = get_cpu_array(op, dtype)
-    reduction_array = np.full((1,), INIT_W, dtype=dtype)
+    array = get_cpu_array(dtype, op, dims)
+    reduction_array = np.full((1,), init_w, dtype=dtype)

    array_gpu = cp.asarray(array)
    reduction_array_gpu = cp.asarray(reduction_array)

    kernel_reduction(x=array_gpu, w=reduction_array_gpu)

-    assert np.allclose(reduction_array_gpu.get(), SOLUTION[op])
+    assert np.allclose(reduction_array_gpu.get(), get_expected_solution(op, array))
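A quick sanity check of the `init_arr` choice for multiplicative reductions under the new 3-D test sizes (16³ = 4096 elements): with the old constant 2.0, the product would overflow float64, while 0.99⁴⁰⁹⁶ ≈ 1.3e-18 stays representable:

```python
import numpy as np

n = 16 ** 3  # elements of the largest (3-D) test array
with np.errstate(over="ignore"):
    print(np.float64(2.0) ** n)   # inf: the old INIT_ARR = 2 would overflow
print(np.float64(0.99) ** n)      # ~1.3e-18, still representable
```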