diff --git a/docs/source/user_manual/reductions.md b/docs/source/user_manual/reductions.md index 5b45a921cc9cdb61a369733b007573e61f70c0eb..c3bb6b9012fe736bd93ba4d3ac13e086cc582476 100644 --- a/docs/source/user_manual/reductions.md +++ b/docs/source/user_manual/reductions.md @@ -88,8 +88,8 @@ create the kernel object via the {any}`create_kernel` function. For this example, we assume a kernel configuration for CPU platforms with no optimizations explicitly enabled. ```{code-cell} ipython3 -cfg = ps.CreateKernelConfig(target=ps.Target.CurrentCPU) -kernel = ps.create_kernel(assign_sum, cfg) +cpu_cfg = ps.CreateKernelConfig(target=ps.Target.CurrentCPU) +kernel = ps.create_kernel(assign_sum, cpu_cfg) ps.inspect(kernel) ``` @@ -111,14 +111,14 @@ but will be incorporated in the reduction computation. Since our reduction result is a single scalar value, it is sufficient to set up an array comprising a singular value. ```{code-cell} ipython3 - kernel_func = kernel.compile() +kernel_func = kernel.compile() - x_array = np.ones((4, 4, 4), dtype="float64") - reduction_result = np.zeros((1,), dtype="float64") +x_array = np.ones((4, 4, 4), dtype="float64") +reduction_result = np.zeros((1,), dtype="float64") - kernel_func(x=x_array, r=reduction_result) - - reduction_result[0] +kernel_func(x=x_array, r=reduction_result) + +reduction_result[0] ``` ### GPU Platforms @@ -128,11 +128,11 @@ Similar to the CPU section, a base variant for NVIDIA GPUs without explicitly employing any optimizations is shown: ```{code-cell} ipython3 - cfg = ps.CreateKernelConfig(target=ps.Target.CUDA) +gpu_cfg = ps.CreateKernelConfig(target=ps.Target.CUDA) - kernel_gpu = ps.create_kernel(assign_sum, cfg) +kernel_gpu = ps.create_kernel(assign_sum, gpu_cfg) - ps.inspect(kernel_gpu) +ps.inspect(kernel_gpu) ``` The steps for running the generated code on NVIDIA GPUs are identical but the fields and the write-back pointer @@ -159,17 +159,17 @@ which are not supported yet. ```{code-cell} ipython3 # configure SIMD vectorization -cfg = ps.CreateKernelConfig( +cpu_cfg_opt = ps.CreateKernelConfig( target=ps.Target.X86_AVX, ) -cfg.cpu.vectorize.enable = True -cfg.cpu.vectorize.assume_inner_stride_one = True +cpu_cfg_opt.cpu.vectorize.enable = True +cpu_cfg_opt.cpu.vectorize.assume_inner_stride_one = True # configure OpenMP parallelization -cfg.cpu.openmp.enable = True -cfg.cpu.openmp.num_threads = 8 +cpu_cfg_opt.cpu.openmp.enable = True +cpu_cfg_opt.cpu.openmp.num_threads = 8 -kernel_cpu_opt = ps.create_kernel(assign_sum, cfg) +kernel_cpu_opt = ps.create_kernel(assign_sum, cpu_cfg_opt) ps.inspect(kernel_cpu_opt) ``` @@ -190,13 +190,14 @@ we employ a block fitting algorithm to obtain a block size that is also optimize You can find more detailed information about warp size alignment in {ref}`gpu_codegen`. ```{code-cell} ipython3 - cfg = ps.CreateKernelConfig(target=ps.Target.CUDA) - cfg.gpu.assume_warp_aligned_block_size = True +gpu_cfg_opt = ps.CreateKernelConfig(target=ps.Target.CUDA) +gpu_cfg_opt.gpu.assume_warp_aligned_block_size = True +gpu_cfg_opt.gpu.warp_size = 32 + +kernel_gpu_opt = ps.create_kernel(assign_sum, gpu_cfg_opt) - kernel_gpu_opt = ps.create_kernel(assign_sum, cfg) - - kernel_func = kernel_gpu_opt.compile() - kernel_func.launch_config.fit_block_size((32, 1, 1)) +kernel_func = kernel_gpu_opt.compile() +kernel_func.launch_config.fit_block_size((32, 1, 1)) - ps.inspect(kernel_gpu_opt) +ps.inspect(kernel_gpu_opt) ``` diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py index 44c44b2f390d7a345bdb0f795181c82a6a590206..18dab8e1fa3deea42a250860341135bdd4bee856 100644 --- a/src/pystencils/backend/platforms/generic_gpu.py +++ b/src/pystencils/backend/platforms/generic_gpu.py @@ -256,21 +256,18 @@ class GenericGpu(Platform): raise MaterializationError(f"Unknown type of iteration space: {ispace}") @staticmethod - def _thread_index_per_dim(ispace: IterationSpace) -> tuple[PsExpression, ...]: + def _block_local_thread_index_per_dim(ispace: IterationSpace) -> tuple[PsExpression, ...]: """Returns thread indices multiplied with block dimension strides per dimension.""" return tuple( - idx - * PsConstantExpr( - PsConstant(reduce(operator.mul, BLOCK_DIM[:i], 1), SInt(32)) - ) + idx * reduce(operator.mul, BLOCK_DIM[:i]) if i > 0 else idx for i, idx in enumerate(THREAD_IDX[: ispace.rank]) ) def _first_thread_in_warp(self, ispace: IterationSpace) -> PsExpression: """Returns expression that determines whether a thread is the first within a warp.""" - tids_per_dim = GenericGpu._thread_index_per_dim(ispace) + tids_per_dim = GenericGpu._block_local_thread_index_per_dim(ispace) tid: PsExpression = tids_per_dim[0] for t in tids_per_dim[1:]: tid = PsAdd(tid, t) diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py index 171e5406f2a2f4a34f58e3131b28a29ca6e60e7c..d5c798b3240f48ad2ad662fc672812b5064743e1 100644 --- a/tests/kernelcreation/test_reduction.py +++ b/tests/kernelcreation/test_reduction.py @@ -5,21 +5,35 @@ import pystencils as ps from pystencils import Target from pystencils.sympyextensions import ReductionOp, reduction_assignment -INIT_W = 5 -INIT_ARR = 2 -SIZE = 15 -SOLUTION = { - ReductionOp.Add: INIT_W + INIT_ARR * SIZE, - ReductionOp.Sub: INIT_W - INIT_ARR * SIZE, - ReductionOp.Mul: INIT_W * INIT_ARR**SIZE, - ReductionOp.Min: min(INIT_W, INIT_ARR), - ReductionOp.Max: max(INIT_W, INIT_ARR), -} +init_w = 5.0 +arr_size = 16 + + +def init_arr(op): + match op: + case ReductionOp.Mul: + return 0.99 # avoid value overflow for large array sizes + case _: + return 2.0 + + +def get_expected_solution(op, array): + match op: + case ReductionOp.Add: + return init_w + np.sum(array) + case ReductionOp.Sub: + return init_w - np.sum(array) + case ReductionOp.Mul: + return init_w * np.prod(array) + case ReductionOp.Min: + return min(init_w, np.min(array)) + case ReductionOp.Max: + return max(init_w, np.max(array)) # get AST for kernel with reduction assignment -def get_reduction_assign_ast(dtype, op, config): - x = ps.fields(f"x: {dtype}[1d]") +def get_reduction_assign_ast(dtype, op, dims, config): + x = ps.fields(f"x: {dtype}[{dims}d]") w = ps.TypedSymbol("w", dtype) red_assign = reduction_assignment(w, op, x.center()) @@ -27,15 +41,22 @@ def get_reduction_assign_ast(dtype, op, config): return ps.create_kernel([red_assign], config, default_dtype=dtype) -def get_cpu_array(op, dtype): +def get_cpu_array(dtype, op, dims): + shape = (arr_size,) * dims + # increase difficulty of min/max tests by using range of values match op: - case ReductionOp.Min: - return np.linspace(INIT_ARR, INIT_ARR + SIZE, SIZE, dtype=dtype) - case ReductionOp.Max: - return np.linspace(INIT_ARR - SIZE, INIT_ARR, SIZE, dtype=dtype) + case ReductionOp.Min | ReductionOp.Max: + lo = init_arr(op) - arr_size + mi = init_arr(op) + hi = init_arr(op) + arr_size + + if op is ReductionOp.Min: + return np.random.randint(mi, hi, size=shape).astype(dtype) + else: + return np.random.randint(lo, mi, size=shape).astype(dtype) case _: - return np.full((SIZE,), INIT_ARR, dtype=dtype) + return np.full(shape, init_arr(op), dtype=dtype) @pytest.mark.parametrize( @@ -52,7 +73,12 @@ def get_cpu_array(op, dtype): ReductionOp.Max, ], ) -def test_reduction_cpu(target, dtype, op): +@pytest.mark.parametrize("dims", [1, 2, 3]) +def test_reduction_cpu( + target: ps.Target, + dtype: str, + op: str, + dims: int): config = ps.CreateKernelConfig(target=target) config.cpu.openmp.enable = True @@ -60,14 +86,14 @@ def test_reduction_cpu(target, dtype, op): config.cpu.vectorize.enable = True config.cpu.vectorize.assume_inner_stride_one = True - ast_reduction = get_reduction_assign_ast(dtype, op, config) + ast_reduction = get_reduction_assign_ast(dtype, op, dims, config) kernel_reduction = ast_reduction.compile() - array = get_cpu_array(op, dtype) - reduction_array = np.full((1,), INIT_W, dtype=dtype) + array = get_cpu_array(dtype, op, dims) + reduction_array = np.full((1,), init_w, dtype=dtype) kernel_reduction(x=array, w=reduction_array) - assert np.allclose(reduction_array, SOLUTION[op]) + assert np.allclose(reduction_array, get_expected_solution(op, array)) @pytest.mark.parametrize("dtype", ["float64", "float32"]) @@ -81,6 +107,7 @@ def test_reduction_cpu(target, dtype, op): ReductionOp.Max, ], ) +@pytest.mark.parametrize("dims", [1, 2, 3]) @pytest.mark.parametrize("assume_warp_aligned_block_size", [True, False]) @pytest.mark.parametrize("use_block_fitting", [True, False]) @pytest.mark.parametrize("warp_size", [32, None]) @@ -90,6 +117,7 @@ def test_reduction_cpu(target, dtype, op): def test_reduction_gpu( dtype: str, op: str, + dims: int, assume_warp_aligned_block_size: bool, use_block_fitting: bool, warp_size: int | None, @@ -101,17 +129,17 @@ def test_reduction_gpu( if warp_size: cfg.gpu.warp_size = warp_size - ast_reduction = get_reduction_assign_ast(dtype, op, cfg) + ast_reduction = get_reduction_assign_ast(dtype, op, dims, cfg) kernel_reduction = ast_reduction.compile() if use_block_fitting and warp_size: kernel_reduction.launch_config.fit_block_size((warp_size, 1, 1)) - array = get_cpu_array(op, dtype) - reduction_array = np.full((1,), INIT_W, dtype=dtype) + array = get_cpu_array(dtype, op, dims) + reduction_array = np.full((1,), init_w, dtype=dtype) array_gpu = cp.asarray(array) reduction_array_gpu = cp.asarray(reduction_array) kernel_reduction(x=array_gpu, w=reduction_array_gpu) - assert np.allclose(reduction_array_gpu.get(), SOLUTION[op]) + assert np.allclose(reduction_array_gpu.get(), get_expected_solution(op, array))