diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py
index 6e2b2f3fed3a00061d4289e1d2f0742c0ff2fdeb..cd1710cf5ffe8e5ced135867be2c9e947c7e6b64 100644
--- a/tests/kernelcreation/test_reduction.py
+++ b/tests/kernelcreation/test_reduction.py
@@ -52,7 +52,14 @@ def test_reduction_cpu(instruction_set, dtype, op):
 
 @pytest.mark.parametrize("dtype", ["float64", "float32"])
 @pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"])
-def test_reduction_gpu(dtype, op):
+@pytest.mark.parametrize("assume_warp_aligned_block_size", [True, False])
+@pytest.mark.parametrize("use_block_fitting", [True, False])
+def test_reduction_gpu(
+        dtype: str,
+        op: str,
+        assume_warp_aligned_block_size: bool,
+        use_block_fitting: bool,
+):
     try:
         import cupy as cp
         from cupy_backends.cuda.api.runtime import CUDARuntimeError
@@ -66,12 +73,16 @@ def test_reduction_gpu(dtype, op):
             reason="No CUDA capable device is detected", allow_module_level=True
         )
 
-    config = ps.CreateKernelConfig(target=ps.Target.GPU)
+    cfg = ps.CreateKernelConfig(target=ps.Target.GPU)
+    cfg.gpu.assume_warp_aligned_block_size = assume_warp_aligned_block_size
 
-    ast_reduction = get_reduction_assign_ast(dtype, op, config)
+    ast_reduction = get_reduction_assign_ast(dtype, op, cfg)
     ps.show_code(ast_reduction)
     kernel_reduction = ast_reduction.compile()
 
+    if use_block_fitting:
+        kernel_reduction.launch_config.fit_block_size((32, 1, 1))
+
     array = np.full((SIZE,), INIT_ARR, dtype=dtype)
     reduction_array = np.full((1,), INIT_W, dtype=dtype)