From 51c03215aedefc3b36ebea07355fbe164e09709a Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Sat, 15 Mar 2025 13:17:36 +0100
Subject: [PATCH] extend GPU section of composer guide. Fix and clarify some
 aspects in the API docs and user guide.

---
 docs/source/api/composer.rst         |   7 +
 docs/source/usage/config_and_cli.md  |  30 ++--
 docs/source/usage/how_to_composer.md | 209 +++++++++++++++++++++------
 src/pystencilssfg/lang/gpu.py        |   4 +-
 4 files changed, 196 insertions(+), 54 deletions(-)

diff --git a/docs/source/api/composer.rst b/docs/source/api/composer.rst
index 078e0eb..8b470b0 100644
--- a/docs/source/api/composer.rst
+++ b/docs/source/api/composer.rst
@@ -40,6 +40,7 @@ Helper Methods and Builders
 
 .. autoclass:: SfgFunctionSequencer
     :members:
+    :inherited-members:
 
 .. autoclass:: SfgNodeBuilder
     :members:
@@ -50,6 +51,12 @@ Helper Methods and Builders
 .. autoclass:: SfgSwitchBuilder
     :members:
 
+.. module:: pystencilssfg.composer.class_composer
+
+.. autoclass:: SfgMethodSequencer
+    :members:
+    :inherited-members:
+
 Context and Cursor
 ==================
 
diff --git a/docs/source/usage/config_and_cli.md b/docs/source/usage/config_and_cli.md
index 785ff52..b6060c0 100644
--- a/docs/source/usage/config_and_cli.md
+++ b/docs/source/usage/config_and_cli.md
@@ -12,7 +12,7 @@ different configuration sources:
   the generator script to set some of its configuration options; see [Command-Line Options](#cmdline_options)
 - **Project Configuration:** When embedded into a larger project, using a build system such as CMake, generator scripts
   may be configured globally within that project by the use of a *configuration module*.
-  Settings specified inside that configuration module are always overridden by the former to configuration sources.
+  Settings specified inside that configuration module are always overridden by the other two configuration sources listed above.
   For details on configuration modules, refer to the guide on [Project and Build System Integration](#guide_project_integration).
 
 (inline_config)=
@@ -60,14 +60,26 @@ set {any}`cfg.outer_namespace <SfgConfig.outer_namespace>`.
 
 ### Code Style and Formatting
 
- - Modify the values in the {any}`cfg.code_style <CodeStyle>` category to affect
-   certain formatting aspects of the generated code.
- - To change, enforce, or disable auto-formatting of generated code through `clang-format`,
-   take a look at the {any}`cfg.clang_format <ClangFormatOptions>` category.
- - Clang-format will, by default, sort `#include` statements alphabetically and separate
-   local and system header includes.
-   To override this, you can set a custom sorting key for `#include` sorting via
-   {any}`cfg.code_style.includes_sorting_key <CodeStyle.includes_sorting_key>`.
+Pystencils-sfg gives you some control over the style of its generated code.
+The relevant options are collected in the {any}`cfg.code_style <CodeStyle>` category.
+
+Furthermore, pystencils-sfg uses `clang-format` to beautify generated code.
+The behavior of the clang-format integration is managed by the
+{any}`cfg.clang_format <ClangFormatOptions>` category,
+where you can set options to skip or enforce formatting,
+or change the formatter binary.
+To set the code style used by `clang-format`, either create a `.clang-format` file
+in any of the parent folders of your generator script,
+or modify the {any}`cfg.clang_format.code_style <ClangFormatOptions.code_style>` option.
+
+:::{seealso}
+[Clang-Format Style Options](https://clang.llvm.org/docs/ClangFormatStyleOptions.html)
+:::
+
+Clang-format will, by default, sort `#include` statements alphabetically and separate
+local and system header includes.
+To override this, you can set a custom sorting key for `#include` sorting via
+{any}`cfg.code_style.includes_sorting_key <CodeStyle.includes_sorting_key>`.
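+
+For illustration, here is a minimal sketch of an inline configuration affecting both of these
+aspects. The style name and the sorting key shown here are only examples;
+refer to the API reference of {any}`ClangFormatOptions` and {any}`CodeStyle`
+for the precise semantics of these options:
+
+```python
+from pystencilssfg import SourceFileGenerator, SfgConfig
+
+cfg = SfgConfig()
+
+#   Select a clang-format style preset instead of relying on a .clang-format file
+cfg.clang_format.code_style = "llvm"
+
+#   Illustrative sorting key: order #include directives by their literal spelling
+cfg.code_style.includes_sorting_key = lambda header: str(header)
+
+with SourceFileGenerator(cfg) as sfg:
+    ...  # emit code as usual
+```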
 
 (cmdline_options)=
 ## Command-Line Options
diff --git a/docs/source/usage/how_to_composer.md b/docs/source/usage/how_to_composer.md
index 12a8435..849de18 100644
--- a/docs/source/usage/how_to_composer.md
+++ b/docs/source/usage/how_to_composer.md
@@ -283,7 +283,7 @@ The composer gives us access to the default kernel namespace (`<current_namespac
 via `sfg.kernels`.
 
 To add a kernel,
- - either pass its assignments and the pystencils code generator configuration directly to {any}`kernels.reate() <KernelsAdder.create>`,
+ - either pass its assignments and the pystencils code generator configuration directly to {any}`kernels.create() <KernelsAdder.create>`,
  - or create the kernel separately through {any}`pystencils.create_kernel <pystencils.codegen.create_kernel>` and register it using
    {any}`kernels.add() <KernelsAdder.add>`.
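+
+Both registration paths are sketched below. Note that the call to
+{any}`kernels.add() <KernelsAdder.add>` assumes it accepts the kernel object together with an
+optional name; consult its API documentation for the exact signature:
+
+```python
+import pystencils as ps
+from pystencilssfg import SourceFileGenerator
+
+with SourceFileGenerator() as sfg:
+    f, g = ps.fields("f, g: double[2D]")
+    asm = ps.Assignment(f(0), g(0))
+
+    #   Path 1: assemble and register the kernel in a single call
+    khandle_direct = sfg.kernels.create(asm, "kernel_direct")
+
+    #   Path 2: create the kernel through pystencils first, then register it
+    kernel = ps.create_kernel(asm)
+    khandle_added = sfg.kernels.add(kernel, "kernel_added")
+```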
 
@@ -344,9 +344,63 @@ cause them to be added to its signature.
 We don't want to expose this complexity, but instead hide it by using appropriate data structures.
 The next section explains how that is achieved in pystencils-sfg.
 
-#### Invoking GPU Kernels
+#### Mapping Fields to Data Structures
+
+Pystencils kernels operate on n-dimensional contiguous or strided arrays.
+There exist many classes with diverse APIs modelling such arrays throughout the scientific
+computing landscape, including [Kokkos Views][kokkos_view], [C++ std::mdspan][mdspan],
+[SYCL buffers][sycl_buffer], and many framework-specific custom-built classes.
+Using the protocols behind {any}`sfg.map_field <SfgBasicComposer.map_field>`,
+it is possible to automatically emit code
+that extracts the indexing information required by a kernel from any of these classes,
+as long as a suitable API reflection is available.
+
+:::{seealso}
+[](#field_data_structure_reflection) for instructions on how to set up field API
+reflection for a custom nd-array data structure.
+:::
+
+Pystencils-sfg natively provides field extraction for a number of C++ STL classes,
+such as `std::vector` and `std::span` (for 1D fields), as well as `std::mdspan`.
+Import any of them from `pystencilssfg.lang.cpp.std` and create an instance for a given
+field using `.from_field()`.
+Then, inside the wrapper function, pass the symbolic field and its associated data structure to
+{any}`sfg.map_field <SfgBasicComposer.map_field>`
+before calling the kernel:
+
+```{code-cell} ipython3
+import pystencils as ps
+from pystencilssfg.lang.cpp import std
+
+with SourceFileGenerator() as sfg:
+    #   Create symbolic fields
+    f, g = ps.fields("f, g: double[1D]")
+
+    #   Create data structure reflections
+    f_vec = std.vector.from_field(f)
+    g_span = std.span.from_field(g)
+
+    #   Create the kernel
+    asm = ps.Assignment(f(0), g(0))
+    khandle = sfg.kernels.create(asm, "my_kernel")
+
+    #   Create the wrapper function
+    sfg.function("call_my_kernel")(
+        sfg.map_field(f, f_vec),
+        sfg.map_field(g, g_span),
+        sfg.call(khandle)
+    )
+```
+
+## GPU Kernels
 
 Pystencils also allows us to generate kernels for the CUDA and HIP GPU programming models.
+This section describes how to generate GPU kernels through pystencils-sfg,
+how to invoke them with various launch configurations,
+and how GPU execution streams are reflected.
+
+### Generate and Invoke CUDA and HIP Kernels
+
 To generate a kernel targetting either of these, set the
 {any}`target <pystencils.codegen.config.CreateKernelConfig.target>`
 code generator option to either `Target.CUDA` or `Target.HIP`.
@@ -364,7 +418,7 @@ with SourceFileGenerator(sfg_config) as sfg:
     cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
 
     #   Create fields, assemble assignments
-    f, g = ps.fields("f, g: double[2D]")
+    f, g = ps.fields("f, g: double[128, 128]")
     asm = ps.Assignment(f(0), g(0))
 
     #   Register kernel
@@ -384,61 +438,130 @@ When investigating the generated `.cu` file, you can see that the GPU launch con
 This behavior can be changed by modifying options in the {any}`gpu <pystencils.codegen.config.GpuOptions>`
 category of the `CreateKernelConfig`.
 
-#### Mapping Fields to Data Structures
+### Adapting the Launch Configuration
 
-Pystencils kernels operate on n-dimensional contiguous or strided arrays,
-There exist many classes with diverse APIs modelling such arrays throughout the scientific
-computing landscape, including [Kokkos Views][kokkos_view], [C++ std::mdspan][mdspan],
-[SYCL buffers][sycl_buffer], and many framework-specific custom-built classes.
-Using the protocols behind {any}`sfg.map_field <SfgBasicComposer.map_field>`,
-it is possible to automatically emit code
-that extracts the indexing information required by a kernel from any of these classes,
-as long as a suitable API reflection is available.
+GPU kernel invocations usually require the user to provide a launch grid, defined
+by the GPU thread block size and the number of blocks in the grid.
+In the simplest case (as seen above), pystencils-sfg will emit code that automatically
+computes these parameters from the size of the arrays passed to the kernel,
+using a default block size defined by pystencils.
 
-:::{seealso}
-[](#field_data_structure_reflection) for instructions on how to set up field API
-reflection for a custom nd-array data structure.
-:::
+The code generator also permits customization of the launch configuration.
+You may provide a custom block size to override the default, in which case the
+grid size will still be computed by dividing the array sizes by your block size.
+Alternatively, you can take full control of both the block and the grid size.
+Instructions for both cases are given in the following subsections.
 
-Pystencils-sfg natively provides field extraction for a number of C++ STL-classes,
-such as `std::vector` and `std::span` (for 1D fields) and `std::mdspan`.
-Import any of them from `pystencilssfg.lang.cpp.std` and create an instance for a given
-field using `.from_field()`.
-Then, inside the wrapper function, pass the symbolic field and its associated data structure to
-{any}`sfg.map_field <SfgBasicComposer.map_field>`.
-before calling the kernel:
+#### User-Defined Block Size for Auto-Computed Grid Size
+
+To merely modify the block size argument while still automatically inferring the grid size,
+pass a variable or expression of type `dim3` to the `block_size` parameter of `gpu_invoke`.
+Pystencils-sfg exposes two versions of `dim3`, which differ primarily in their associated
+runtime headers:
+
+ - {any}`pystencilssfg.lang.gpu.cuda.dim3 <CudaAPI.dim3>` for CUDA, and
+ - {any}`pystencilssfg.lang.gpu.hip.dim3 <HipAPI.dim3>` for HIP.
+
+The following snippet selects the correct `dim3` type according to the kernel target;
+it then creates a variable of that type and passes it as an argument to the kernel invocation:
 
 ```{code-cell} ipython3
-import pystencils as ps
-from pystencilssfg.lang.cpp import std
+:tags: [remove-cell]
+target = ps.Target.HIP
+cfg = ps.CreateKernelConfig(target=target)
+f, g = ps.fields("f, g: double[128, 128]")
+asm = ps.Assignment(f(0), g(0))
+```
 
-with SourceFileGenerator() as sfg:
-    #   Create symbolic fields
-    f, g = ps.fields("f, g: double[1D]")
+```{code-cell} ipython3
 
-    #   Create data structure reflections
-    f_vec = std.vector.from_field(f)
-    g_span = std.span.from_field(g)
+with SourceFileGenerator(sfg_config) as sfg:
+    # ... define kernel ...
+    khandle = sfg.kernels.create(asm, "gpu_kernel", cfg)
 
-    #   Create the kernel
-    asm = ps.Assignment(f(0), g(0))
-    khandle = sfg.kernels.create(asm, "my_kernel")
+    #   Select dim3 reflection
+    match target:
+        case ps.Target.CUDA:
+            from pystencilssfg.lang.gpu import cuda as gpu_api
+        case ps.Target.HIP:
+            from pystencilssfg.lang.gpu import hip as gpu_api
+
+    #   Create dim3 variable and pass it to kernel invocation
+    block_size = gpu_api.dim3(const=True).var("block_size")
 
-    #   Create the wrapper function
-    sfg.function("call_my_kernel")(
-        sfg.map_field(f, f_vec),
-        sfg.map_field(g, g_span),
-        sfg.call(khandle)
+    sfg.function("kernel_wrapper")(
+        sfg.gpu_invoke(khandle, block_size=block_size)
     )
 ```
 
-(exposed_inline_kernels)=
-### Exposed and Inline Kernels
+#### Manual Launch Configurations
+
+To take full control of the launch configuration, we must disable its automatic inference
+by setting the {any}`gpu.manual_launch_grid <pystencils.codegen.config.GpuOptions.manual_launch_grid>`
+code generator option to `True`.
+Then, we must pass `dim3` arguments for both `block_size` and `grid_size` to the kernel invocation:
+
+```{code-cell} ipython3
+from pystencilssfg.lang.gpu import cuda
+
+with SourceFileGenerator(sfg_config) as sfg:
+    # ... define kernel ...
+
+    #   Configure for manual launch config
+    cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
+    cfg.gpu.manual_launch_grid = True
+
+    #   Register kernel
+    khandle = sfg.kernels.create(asm, "gpu_kernel", cfg)
+
+    #   Create dim3 variables
+    block_size = cuda.dim3(const=True).var("block_size")
+    grid_size = cuda.dim3(const=True).var("grid_size")
+
+    sfg.function("kernel_wrapper")(
+        sfg.gpu_invoke(khandle, block_size=block_size, grid_size=grid_size)
+    )
+```
+
+### Using Streams
+
+CUDA and HIP kernels can be enqueued into streams for concurrent execution.
+This is mirrored in pystencils-sfg;
+all overloads of `gpu_invoke` take an optional `stream` argument.
+The `stream_t` data types of both CUDA and HIP are made available
+through the respective API reflections:
+
+ - {any}`lang.gpu.cuda.stream_t <CudaAPI.stream_t>` reflects `cudaStream_t`, and
+ - {any}`lang.gpu.hip.stream_t <HipAPI.stream_t>` reflects `hipStream_t`.
+
+Here is an example that creates a variable of the HIP stream type
+and passes it to `gpu_invoke`:
+
+```{code-cell} ipython3
+:tags: [remove-cell]
+cfg = ps.CreateKernelConfig(target=ps.Target.HIP)
+f, g = ps.fields("f, g: double[128, 128]")
+asm = ps.Assignment(f(0), g(0))
+```
+
+```{code-cell} ipython3
+from pystencilssfg.lang.gpu import hip
+
+with SourceFileGenerator(sfg_config) as sfg:
+    # ... define kernel ...
+    khandle = sfg.kernels.create(asm, "gpu_kernel", cfg)
+
+    stream = hip.stream_t(const=True).var("stream")
+
+    sfg.function("kernel_wrapper")(
+        sfg.gpu_invoke(khandle, stream=stream)
+    )
+```
 
 :::{admonition} To Do
 
- - Modifying GPU kernel launch configs
- - GPU API Reflections
  - Defining classes, their fields constructors, and methods
 
 :::
diff --git a/src/pystencilssfg/lang/gpu.py b/src/pystencilssfg/lang/gpu.py
index c9736fb..e3b5516 100644
--- a/src/pystencilssfg/lang/gpu.py
+++ b/src/pystencilssfg/lang/gpu.py
@@ -56,7 +56,7 @@ class CudaAPI(ProvidesGpuRuntimeAPI):
 
 
 cuda = CudaAPI
-"""Reflection of the CUDA runtime API"""
+"""Alias for `CudaAPI`"""
 
 
 class HipAPI(ProvidesGpuRuntimeAPI):
@@ -72,4 +72,4 @@ class HipAPI(ProvidesGpuRuntimeAPI):
 
 
 hip = HipAPI
-"""Reflection of the HIP runtime API"""
+"""Alias for `HipAPI`"""
-- 
GitLab