From 5bef84a8e28279c46cd73b67352201622d9aa89e Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 2 Apr 2025 19:47:49 +0200
Subject: [PATCH] Write howto guide for reductions

---
 docs/source/backend/gpu_codegen.md       |   1 +
 docs/source/backend/index.rst            |   1 +
 docs/source/backend/reduction_codegen.md | 122 ++++++++++++++++++++
 docs/source/index.rst                    |   1 +
 docs/source/user_manual/reductions.md    | 132 +++++++++++++++++++++++
 5 files changed, 257 insertions(+)
 create mode 100644 docs/source/backend/reduction_codegen.md
 create mode 100644 docs/source/user_manual/reductions.md

diff --git a/docs/source/backend/gpu_codegen.md b/docs/source/backend/gpu_codegen.md
index 3fe00840e..a95c36566 100644
--- a/docs/source/backend/gpu_codegen.md
+++ b/docs/source/backend/gpu_codegen.md
@@ -1,3 +1,4 @@
+(gpu_codegen)=
 # GPU Code Generation
 
 The code generation infrastructure for Nvidia and AMD GPUs using CUDA and HIP comprises the following components:

diff --git a/docs/source/backend/index.rst b/docs/source/backend/index.rst
index 0d384c55b..b9b400544 100644
--- a/docs/source/backend/index.rst
+++ b/docs/source/backend/index.rst
@@ -16,6 +16,7 @@ who wish to customize or extend the behaviour of the code generator in their app
    iteration_space
    translation
    platforms
+   reduction_codegen
    transformations
    gpu_codegen
    errors

diff --git a/docs/source/backend/reduction_codegen.md b/docs/source/backend/reduction_codegen.md
new file mode 100644
index 000000000..360c69256
--- /dev/null
+++ b/docs/source/backend/reduction_codegen.md
@@ -0,0 +1,122 @@
---
jupytext:
  formats: md:myst
  text_representation:
    extension: .md
    format_name: myst
    format_version: 0.13
    jupytext_version: 1.16.4
kernelspec:
  display_name: Python 3 (ipykernel)
  language: python
  name: python3
mystnb:
  execution_mode: cache
---

```{code-cell} ipython3
:tags: [remove-cell, raises-exception]

import pystencils as ps
import numpy as np
import cupy as cp
```

(codegen_reductions)=
# Code Generation for Reductions

In this guide, we demonstrate how reduction kernels can be generated for different platforms
and what impact certain optimization strategies have.
For this, we set up the update rule for a simple dot-product kernel:

```{code-cell} ipython3
r = ps.TypedSymbol("r", "double")
x, y = ps.fields("x, y: double[3D]", layout="fzyx")

assign_dot_prod = ps.AddReductionAssignment(r, x.center() * y.center())
```

## CPU Platforms

We first consider a baseline variant for CPUs that does not employ any optimizations.
The generated code for this variant looks as follows:

```{code-cell} ipython3
cfg = ps.CreateKernelConfig(target=ps.Target.CurrentCPU)
kernel_cpu = ps.create_kernel(assign_dot_prod, cfg)

ps.inspect(kernel_cpu)
```

Next, we want the reduction kernel to be SIMD-vectorized and to employ shared-memory parallelism using OpenMP.
The SIMD instruction sets supported for reductions are:
* SSE3
* AVX/AVX2
* AVX512

Below, an AVX vectorization is employed by selecting the target `Target.X86_AVX`.
**Note that reductions require `assume_inner_stride_one` to be enabled.**
This is because any other inner stride would require masked SIMD operations, which are not supported yet.

```{code-cell} ipython3
# configure SIMD vectorization
cfg = ps.CreateKernelConfig(
    target=ps.Target.X86_AVX,
)
cfg.cpu.vectorize.enable = True
cfg.cpu.vectorize.assume_inner_stride_one = True

# configure OpenMP parallelization
cfg.cpu.openmp.enable = True
cfg.cpu.openmp.num_threads = 8

kernel_cpu_opt = ps.create_kernel(assign_dot_prod, cfg)

ps.inspect(kernel_cpu_opt)
```

## GPU Platforms

Reductions are currently only supported for CUDA platforms.
As in the CPU section, we first show a baseline variant for GPUs that does not explicitly employ any optimizations:

```{code-cell} ipython3
cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)

kernel_gpu = ps.create_kernel(assign_dot_prod, cfg)

ps.inspect(kernel_gpu)
```

As evident from the code, the generated kernel employs atomic operations for updating the memory location
that holds the reduction result.
Using the explicit warp-level instructions provided by CUDA allows us to achieve higher performance than
using atomic operations alone.
To generate kernels with warp-level reductions, the generator expects CUDA block sizes to be divisible by
the hardware's warp size.
**As with the SIMD configuration, we assure the code generator that the configured block size fulfills this
criterion by enabling `assume_warp_aligned_block_size`.**
While the default block sizes provided by the code generator already fulfill this criterion,
we additionally employ a block-fitting algorithm to obtain a block size that is also optimized for the
kernel's iteration space.

You can find more detailed information about warp-size alignment in {ref}`gpu_codegen`.

```{code-cell} ipython3
cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
cfg.gpu.assume_warp_aligned_block_size = True

kernel_gpu_opt = ps.create_kernel(assign_dot_prod, cfg)

kernel_func = kernel_gpu_opt.compile()
kernel_func.launch_config.fit_block_size((32, 1, 1))

ps.inspect(kernel_gpu_opt)
```

:::{admonition} Developers To Do:

- Support for HIP platforms
- Support vectorization using NEON intrinsics
:::

diff --git a/docs/source/index.rst b/docs/source/index.rst
index 6dba50af1..4e1070979 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -81,6 +81,7 @@ Topics
    user_manual/symbolic_language
    user_manual/kernelcreation
+   user_manual/reductions
    user_manual/gpu_kernels
    user_manual/WorkingWithTypes

diff --git a/docs/source/user_manual/reductions.md b/docs/source/user_manual/reductions.md
new file mode 100644
index 000000000..454fd7df4
--- /dev/null
+++ b/docs/source/user_manual/reductions.md
@@ -0,0 +1,132 @@
---
jupytext:
  formats: md:myst
  text_representation:
    extension: .md
    format_name: myst
    format_version: 0.13
    jupytext_version: 1.16.4
kernelspec:
  display_name: Python 3 (ipykernel)
  language: python
  name: python3
mystnb:
  execution_mode: cache
---

```{code-cell} ipython3
:tags: [remove-cell, raises-exception]

import sympy as sp
import pystencils as ps
import numpy as np
import cupy as cp

from enum import Enum
```

(guide_reductions)=
# Reductions in Pystencils

Reductions play a vital role in numerical simulations, as they allow aggregating data across multiple elements,
such as computing sums or products over an array, or finding its minimum or maximum.

## Specifying Assignments with Reductions

In pystencils, reductions are made available via specialized assignments, namely `ReductionAssignment`.
Here is a snippet creating a reduction assignment for adding up all elements of a field:

```{code-cell} ipython3
r = ps.TypedSymbol("r", "double")
x = ps.fields("x: double[3D]", layout="fzyx")

assign_sum = ps.AddReductionAssignment(r, x.center())
```

For each point in the iteration space, the left-hand-side symbol `r` accumulates the contents of the
right-hand side `x.center()`. In our case, the `AddReductionAssignment` denotes an accumulation via addition.

**Pystencils requires type information about the reduction symbols and thus requires `r` to be a `TypedSymbol`.**

The following reduction assignment classes are available in pystencils:
* `AddReductionAssignment`: Builds sum over elements
* `SubReductionAssignment`: Builds difference over elements
* `MulReductionAssignment`: Builds product over elements
* `MinReductionAssignment`: Finds minimum element
* `MaxReductionAssignment`: Finds maximum element

:::{note}
Alternatively, you can make use of the `reduction_assignment` or `reduction_assignment_from_str` functions
to specify reduction assignments:
:::

```{code-cell} ipython3
from pystencils.sympyextensions import reduction_assignment, reduction_assignment_from_str
from pystencils.sympyextensions.reduction import ReductionOp

assign_sum = reduction_assignment(r, ReductionOp.Add, x.center())

assign_sum = reduction_assignment_from_str(r, "+", x.center())
```

For the other reduction operations, the following `ReductionOp` members can be passed to `reduction_assignment`,
or the corresponding strings to `reduction_assignment_from_str`:

```{code-cell} ipython3
# the ReductionOp enum and the string representation of each operation
class ReductionOp(Enum):
    Add = "+"
    Sub = "-"
    Mul = "*"
    Min = "min"
    Max = "max"
```

## Generating Reduction Kernels

With the assignment fully assembled, we can finally invoke the code generator and
create the kernel object via the {any}`create_kernel` function.
For this example, we assume a kernel configuration where no optimizations are explicitly enabled.

```{code-cell} ipython3
cfg = ps.CreateKernelConfig(target=ps.Target.CurrentCPU)
kernel = ps.create_kernel(assign_sum, cfg)

ps.inspect(kernel)
```

:::{note}
The generated reduction kernels may vary considerably across platforms and optimizations.
For the sake of compactness, the impact of different backend or optimization choices is not shown here.

A detailed description of configuration choices and their impact on the generated code can be found in
{ref}`codegen_reductions`.
:::

The kernel can be compiled and run immediately.

To execute the kernel on CPUs, a {any}`numpy.ndarray` must be passed not only for each field
but also for exporting the reduction result.
This export mechanism can be seen in the generated code above:
the kernel obtains a pointer named after the reduction symbol (here: `r`).
This pointer not only provides the initial value for the reduction but is also used for writing back the
reduction result.
Since our reduction result is a single scalar value, a one-element array is sufficient.

```{code-cell} ipython3
kernel_func = kernel.compile()

x_array = np.ones((4, 4, 4), dtype="float64")
reduction_result = np.zeros((1,), dtype="float64")

kernel_func(x=x_array, r=reduction_result)

reduction_result[0]
```

For GPU platforms, the concepts remain the same, but the fields and the write-back pointer must now reside
in device memory, i.e. they must be instances of {any}`cupy.ndarray`, as sketched below.
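
The following cell is a minimal sketch of that GPU path, assuming a CUDA-capable device is available.
The names `gpu_cfg`, `x_gpu`, and `reduction_result_gpu` are illustrative; the configuration and call
conventions mirror the CPU example above.

```{code-cell} ipython3
:tags: [raises-exception]

# a minimal sketch of the GPU path (assumes a CUDA-capable device);
# the kernel is regenerated for the CUDA target and called with device arrays
gpu_cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
gpu_kernel_func = ps.create_kernel(assign_sum, gpu_cfg).compile()

# the field and the write-back array are allocated in device memory
x_gpu = cp.ones((4, 4, 4), dtype="float64")
reduction_result_gpu = cp.zeros((1,), dtype="float64")

gpu_kernel_func(x=x_gpu, r=reduction_result_gpu)

# copy the reduction result back to the host
reduction_result_gpu.get()[0]
```

Apart from where the arrays reside, the call signature is identical to the CPU case.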

:::{admonition} Developers To Do:

- Support for higher-order data types for reductions, e.g. vector/matrix reductions
:::

-- 
GitLab