diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py index 6e2b2f3fed3a00061d4289e1d2f0742c0ff2fdeb..cd1710cf5ffe8e5ced135867be2c9e947c7e6b64 100644 --- a/tests/kernelcreation/test_reduction.py +++ b/tests/kernelcreation/test_reduction.py @@ -52,7 +52,14 @@ def test_reduction_cpu(instruction_set, dtype, op): @pytest.mark.parametrize("dtype", ["float64", "float32"]) @pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"]) -def test_reduction_gpu(dtype, op): +@pytest.mark.parametrize("assume_warp_aligned_block_size", [True, False]) +@pytest.mark.parametrize("use_block_fitting", [True, False]) +def test_reduction_gpu( + dtype: str, + op: str, + assume_warp_aligned_block_size: bool, + use_block_fitting: bool, +): try: import cupy as cp from cupy_backends.cuda.api.runtime import CUDARuntimeError @@ -66,12 +73,16 @@ def test_reduction_gpu(dtype, op): reason="No CUDA capable device is detected", allow_module_level=True ) - config = ps.CreateKernelConfig(target=ps.Target.GPU) + cfg = ps.CreateKernelConfig(target=ps.Target.GPU) + cfg.gpu.assume_warp_aligned_block_size = assume_warp_aligned_block_size - ast_reduction = get_reduction_assign_ast(dtype, op, config) + ast_reduction = get_reduction_assign_ast(dtype, op, cfg) ps.show_code(ast_reduction) kernel_reduction = ast_reduction.compile() + if use_block_fitting: + kernel_reduction.launch_config.fit_block_size((32, 1, 1)) + array = np.full((SIZE,), INIT_ARR, dtype=dtype) reduction_array = np.full((1,), INIT_W, dtype=dtype)