diff --git a/docs/source/user_manual/reductions.md b/docs/source/user_manual/reductions.md
index 5b45a921cc9cdb61a369733b007573e61f70c0eb..c3bb6b9012fe736bd93ba4d3ac13e086cc582476 100644
--- a/docs/source/user_manual/reductions.md
+++ b/docs/source/user_manual/reductions.md
@@ -88,8 +88,8 @@ create the kernel object via the {any}`create_kernel` function.
 For this example, we assume a kernel configuration for CPU platforms with no optimizations explicitly enabled.
 
 ```{code-cell} ipython3
-cfg = ps.CreateKernelConfig(target=ps.Target.CurrentCPU)
-kernel = ps.create_kernel(assign_sum, cfg)
+cpu_cfg = ps.CreateKernelConfig(target=ps.Target.CurrentCPU)
+kernel = ps.create_kernel(assign_sum, cpu_cfg)
 
 ps.inspect(kernel)
 ```
@@ -111,14 +111,14 @@ but will be incorporated in the reduction computation.
 Since our reduction result is a single scalar value, it is sufficient to set up a single-element array.
 
 ```{code-cell} ipython3
-    kernel_func = kernel.compile()
+kernel_func = kernel.compile()
 
-    x_array = np.ones((4, 4, 4), dtype="float64")
-    reduction_result = np.zeros((1,), dtype="float64")
+x_array = np.ones((4, 4, 4), dtype="float64")
+reduction_result = np.zeros((1,), dtype="float64")
 
-    kernel_func(x=x_array, r=reduction_result)
-    
-    reduction_result[0]
+kernel_func(x=x_array, r=reduction_result)
+
+reduction_result[0]
 ```
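+
+Since `x_array` holds 4 × 4 × 4 = 64 unit entries and `reduction_result` was
+initialized to zero, the reduction should yield exactly 64. A quick sanity
+check (assuming the iteration space spans the whole array):
+
+```{code-cell} ipython3
+assert np.isclose(reduction_result[0], 64.0)
+```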
 
 ### GPU Platforms
@@ -128,11 +128,11 @@ Similar to the CPU section, a base variant for NVIDIA GPUs without
 explicitly employing any optimizations is shown:
 
 ```{code-cell} ipython3
-    cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
+gpu_cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
 
-    kernel_gpu = ps.create_kernel(assign_sum, cfg)
+kernel_gpu = ps.create_kernel(assign_sum, gpu_cfg)
 
-    ps.inspect(kernel_gpu)
+ps.inspect(kernel_gpu)
 ```
 
 The steps for running the generated code on NVIDIA GPUs are identical but the fields and the write-back pointer 
@@ -159,17 +159,17 @@ which are not supported yet.
 
 ```{code-cell} ipython3
 # configure SIMD vectorization
-cfg = ps.CreateKernelConfig(
+cpu_cfg_opt = ps.CreateKernelConfig(
   target=ps.Target.X86_AVX,
 )
-cfg.cpu.vectorize.enable = True
-cfg.cpu.vectorize.assume_inner_stride_one = True
+cpu_cfg_opt.cpu.vectorize.enable = True
+cpu_cfg_opt.cpu.vectorize.assume_inner_stride_one = True
 
 # configure OpenMP parallelization
-cfg.cpu.openmp.enable = True
-cfg.cpu.openmp.num_threads = 8
+cpu_cfg_opt.cpu.openmp.enable = True
+cpu_cfg_opt.cpu.openmp.num_threads = 8
 
-kernel_cpu_opt = ps.create_kernel(assign_sum, cfg)
+kernel_cpu_opt = ps.create_kernel(assign_sum, cpu_cfg_opt)
 
 ps.inspect(kernel_cpu_opt)
 ```
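+
+Invoking the optimized kernel is identical to the base variant; a sketch,
+assuming the arrays from the CPU example above are still defined and the
+executing machine supports AVX:
+
+```{code-cell} ipython3
+kernel_cpu_opt_func = kernel_cpu_opt.compile()
+
+# reset the write-back value, since it is incorporated in the reduction
+reduction_result[0] = 0.0
+kernel_cpu_opt_func(x=x_array, r=reduction_result)
+
+reduction_result[0]
+```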
@@ -190,13 +190,14 @@ we employ a block fitting algorithm to obtain a block size that is also optimize
 You can find more detailed information about warp size alignment in {ref}`gpu_codegen`.
 
 ```{code-cell} ipython3
-    cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
-    cfg.gpu.assume_warp_aligned_block_size = True
+gpu_cfg_opt = ps.CreateKernelConfig(target=ps.Target.CUDA)
+gpu_cfg_opt.gpu.assume_warp_aligned_block_size = True
+gpu_cfg_opt.gpu.warp_size = 32
+
+kernel_gpu_opt = ps.create_kernel(assign_sum, gpu_cfg_opt)
 
-    kernel_gpu_opt = ps.create_kernel(assign_sum, cfg)
-    
-    kernel_func = kernel_gpu_opt.compile()
-    kernel_func.launch_config.fit_block_size((32, 1, 1))
+kernel_func = kernel_gpu_opt.compile()
+kernel_func.launch_config.fit_block_size((32, 1, 1))
 
-    ps.inspect(kernel_gpu_opt)
+ps.inspect(kernel_gpu_opt)
 ```
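+
+The fitted kernel is launched just like the base GPU variant; a minimal
+sketch, assuming CuPy is available and both the field and the write-back
+value reside on the device:
+
+```{code-cell} ipython3
+import cupy as cp
+
+x_gpu = cp.ones((4, 4, 4), dtype="float64")
+r_gpu = cp.zeros((1,), dtype="float64")
+
+kernel_func(x=x_gpu, r=r_gpu)
+
+r_gpu.get()[0]
+```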
diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py
index 44c44b2f390d7a345bdb0f795181c82a6a590206..18dab8e1fa3deea42a250860341135bdd4bee856 100644
--- a/src/pystencils/backend/platforms/generic_gpu.py
+++ b/src/pystencils/backend/platforms/generic_gpu.py
@@ -256,21 +256,18 @@ class GenericGpu(Platform):
             raise MaterializationError(f"Unknown type of iteration space: {ispace}")
 
     @staticmethod
-    def _thread_index_per_dim(ispace: IterationSpace) -> tuple[PsExpression, ...]:
+    def _block_local_thread_index_per_dim(ispace: IterationSpace) -> tuple[PsExpression, ...]:
         """Returns thread indices multiplied with block dimension strides per dimension."""
 
         return tuple(
-            idx
-            * PsConstantExpr(
-                PsConstant(reduce(operator.mul, BLOCK_DIM[:i], 1), SInt(32))
-            )
+            # dim 0 contributes the plain thread index; higher dims are
+            # scaled by the product of all lower block dimensions
+            (idx * reduce(operator.mul, BLOCK_DIM[:i])) if i > 0 else idx
             for i, idx in enumerate(THREAD_IDX[: ispace.rank])
         )
 
     def _first_thread_in_warp(self, ispace: IterationSpace) -> PsExpression:
         """Returns expression that determines whether a thread is the first within a warp."""
 
-        tids_per_dim = GenericGpu._thread_index_per_dim(ispace)
+        tids_per_dim = GenericGpu._block_local_thread_index_per_dim(ispace)
         tid: PsExpression = tids_per_dim[0]
         for t in tids_per_dim[1:]:
             tid = PsAdd(tid, t)
diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py
index 171e5406f2a2f4a34f58e3131b28a29ca6e60e7c..d5c798b3240f48ad2ad662fc672812b5064743e1 100644
--- a/tests/kernelcreation/test_reduction.py
+++ b/tests/kernelcreation/test_reduction.py
@@ -5,21 +5,35 @@ import pystencils as ps
 from pystencils import Target
 from pystencils.sympyextensions import ReductionOp, reduction_assignment
 
-INIT_W = 5
-INIT_ARR = 2
-SIZE = 15
-SOLUTION = {
-    ReductionOp.Add: INIT_W + INIT_ARR * SIZE,
-    ReductionOp.Sub: INIT_W - INIT_ARR * SIZE,
-    ReductionOp.Mul: INIT_W * INIT_ARR**SIZE,
-    ReductionOp.Min: min(INIT_W, INIT_ARR),
-    ReductionOp.Max: max(INIT_W, INIT_ARR),
-}
+init_w = 5.0
+arr_size = 16
+
+
+def init_arr(op):
+    match op:
+        case ReductionOp.Mul:
+            return 0.99  # avoid value overflow for large array sizes
+        case _:
+            return 2.0
+
+
+def get_expected_solution(op, array):
+    match op:
+        case ReductionOp.Add:
+            return init_w + np.sum(array)
+        case ReductionOp.Sub:
+            return init_w - np.sum(array)
+        case ReductionOp.Mul:
+            return init_w * np.prod(array)
+        case ReductionOp.Min:
+            return min(init_w, np.min(array))
+        case ReductionOp.Max:
+            return max(init_w, np.max(array))
+        case _:
+            raise ValueError(f"Unsupported reduction operation: {op}")
 
 
 # get AST for kernel with reduction assignment
-def get_reduction_assign_ast(dtype, op, config):
-    x = ps.fields(f"x: {dtype}[1d]")
+def get_reduction_assign_ast(dtype, op, dims, config):
+    x = ps.fields(f"x: {dtype}[{dims}d]")
     w = ps.TypedSymbol("w", dtype)
 
     red_assign = reduction_assignment(w, op, x.center())
@@ -27,15 +41,22 @@ def get_reduction_assign_ast(dtype, op, config):
     return ps.create_kernel([red_assign], config, default_dtype=dtype)
 
 
-def get_cpu_array(op, dtype):
+def get_cpu_array(dtype, op, dims):
+    shape = (arr_size,) * dims
+
     # increase difficulty of min/max tests by using range of values
     match op:
-        case ReductionOp.Min:
-            return np.linspace(INIT_ARR, INIT_ARR + SIZE, SIZE, dtype=dtype)
-        case ReductionOp.Max:
-            return np.linspace(INIT_ARR - SIZE, INIT_ARR, SIZE, dtype=dtype)
+        case ReductionOp.Min | ReductionOp.Max:
+            # integer bounds for np.random.randint; the expected solution is
+            # computed from the generated array, so no fixed seed is required
+            lo = int(init_arr(op)) - arr_size
+            mi = int(init_arr(op))
+            hi = int(init_arr(op)) + arr_size
+
+            if op is ReductionOp.Min:
+                return np.random.randint(mi, hi, size=shape).astype(dtype)
+            else:
+                return np.random.randint(lo, mi, size=shape).astype(dtype)
         case _:
-            return np.full((SIZE,), INIT_ARR, dtype=dtype)
+            return np.full(shape, init_arr(op), dtype=dtype)
 
 
 @pytest.mark.parametrize(
@@ -52,7 +73,12 @@ def get_cpu_array(op, dtype):
         ReductionOp.Max,
     ],
 )
-def test_reduction_cpu(target, dtype, op):
+@pytest.mark.parametrize("dims", [1, 2, 3])
+def test_reduction_cpu(
+    target: ps.Target,
+    dtype: str,
+    op: ReductionOp,
+    dims: int,
+):
     config = ps.CreateKernelConfig(target=target)
     config.cpu.openmp.enable = True
 
@@ -60,14 +86,14 @@ def test_reduction_cpu(target, dtype, op):
         config.cpu.vectorize.enable = True
         config.cpu.vectorize.assume_inner_stride_one = True
 
-    ast_reduction = get_reduction_assign_ast(dtype, op, config)
+    ast_reduction = get_reduction_assign_ast(dtype, op, dims, config)
     kernel_reduction = ast_reduction.compile()
 
-    array = get_cpu_array(op, dtype)
-    reduction_array = np.full((1,), INIT_W, dtype=dtype)
+    array = get_cpu_array(dtype, op, dims)
+    reduction_array = np.full((1,), init_w, dtype=dtype)
 
     kernel_reduction(x=array, w=reduction_array)
-    assert np.allclose(reduction_array, SOLUTION[op])
+    assert np.allclose(reduction_array, get_expected_solution(op, array))
 
 
 @pytest.mark.parametrize("dtype", ["float64", "float32"])
@@ -81,6 +107,7 @@ def test_reduction_cpu(target, dtype, op):
         ReductionOp.Max,
     ],
 )
+@pytest.mark.parametrize("dims", [1, 2, 3])
 @pytest.mark.parametrize("assume_warp_aligned_block_size", [True, False])
 @pytest.mark.parametrize("use_block_fitting", [True, False])
 @pytest.mark.parametrize("warp_size", [32, None])
@@ -90,6 +117,7 @@ def test_reduction_cpu(target, dtype, op):
 def test_reduction_gpu(
     dtype: str,
     op: ReductionOp,
+    dims: int,
     assume_warp_aligned_block_size: bool,
     use_block_fitting: bool,
     warp_size: int | None,
@@ -101,17 +129,17 @@ def test_reduction_gpu(
     if warp_size:
         cfg.gpu.warp_size = warp_size
 
-    ast_reduction = get_reduction_assign_ast(dtype, op, cfg)
+    ast_reduction = get_reduction_assign_ast(dtype, op, dims, cfg)
     kernel_reduction = ast_reduction.compile()
 
     if use_block_fitting and warp_size:
         kernel_reduction.launch_config.fit_block_size((warp_size, 1, 1))
 
-    array = get_cpu_array(op, dtype)
-    reduction_array = np.full((1,), INIT_W, dtype=dtype)
+    array = get_cpu_array(dtype, op, dims)
+    reduction_array = np.full((1,), init_w, dtype=dtype)
 
     array_gpu = cp.asarray(array)
     reduction_array_gpu = cp.asarray(reduction_array)
 
     kernel_reduction(x=array_gpu, w=reduction_array_gpu)
-    assert np.allclose(reduction_array_gpu.get(), SOLUTION[op])
+    assert np.allclose(reduction_array_gpu.get(), get_expected_solution(op, array))