From 21c0ba7fd47f41c7b38852e9ba59e6f255858d91 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Tue, 11 Mar 2025 09:54:54 +0100
Subject: [PATCH] documentation on GPU invocation

---
 docs/source/_util/sfg_monkeypatch.py         | 17 ++---
 docs/source/api/composer.rst                 |  3 +
 docs/source/usage/how_to_composer.md         | 38 +++++++++++
 src/pystencilssfg/composer/__init__.py       |  2 +
 src/pystencilssfg/composer/basic_composer.py |  4 +-
 src/pystencilssfg/composer/gpu_composer.py   | 67 ++++++++++++++------
 6 files changed, 101 insertions(+), 30 deletions(-)

diff --git a/docs/source/_util/sfg_monkeypatch.py b/docs/source/_util/sfg_monkeypatch.py
index 0269d40..1277603 100644
--- a/docs/source/_util/sfg_monkeypatch.py
+++ b/docs/source/_util/sfg_monkeypatch.py
@@ -1,6 +1,8 @@
 import pystencilssfg
 from pystencilssfg.config import SfgConfig
 
+from os.path import splitext
+
 
 class DocsPatchedGenerator(pystencilssfg.SourceFileGenerator):
     """Mockup wrapper around SourceFileGenerator for use in documentation
@@ -30,21 +32,20 @@ class DocsPatchedGenerator(pystencilssfg.SourceFileGenerator):
             self._finish_files()
 
             header_code = self._emitter.dumps(self._header_file)
-            impl_code = (
-                None
-                if self._impl_file is None
-                else self._emitter.dumps(self._impl_file)
-            )
+            header_ext = splitext(self._header_file.name)[1]
 
             mdcode = ":::::{tab-set}\n"
 
-            mdcode += "::::{tab-item} Generated Header (.hpp)\n"
+            mdcode += f"::::{{tab-item}} Generated Header ({header_ext})\n"
             mdcode += ":::{code-block} C++\n\n"
             mdcode += header_code
             mdcode += "\n:::\n::::\n"
 
-            if impl_code:
-                mdcode += "::::{tab-item} Generated Implementation (.cpp)\n"
+            if self._impl_file is not None:
+                impl_code = self._emitter.dumps(self._impl_file)
+                impl_ext = splitext(self._impl_file.name)[1]
+
+                mdcode += f"::::{{tab-item}} Generated Implementation ({impl_ext})\n"
                 mdcode += ":::{code-block} C++\n\n"
                 mdcode += impl_code
                 mdcode += "\n:::\n::::\n"
diff --git a/docs/source/api/composer.rst b/docs/source/api/composer.rst
index 124d0fb..078e0eb 100644
--- a/docs/source/api/composer.rst
+++ b/docs/source/api/composer.rst
@@ -16,6 +16,9 @@ Composer API (``pystencilssfg.composer``)
 .. autoclass:: SfgClassComposer
     :members:
 
+.. autoclass:: SfgGpuComposer
+    :members:
+
 Custom Generators
 =================
 
diff --git a/docs/source/usage/how_to_composer.md b/docs/source/usage/how_to_composer.md
index 966a9a6..4610d07 100644
--- a/docs/source/usage/how_to_composer.md
+++ b/docs/source/usage/how_to_composer.md
@@ -344,6 +344,44 @@ cause them to be added to its signature.
 We don't want to expose this complexity, but instead hide it by using appropriate data structures.
 The next section explains how that is achieved in pystencils-sfg.
 
+#### Invoking GPU Kernels
+
+Pystencils also allows us to generate kernels for the CUDA and HIP GPU platforms.
+First, we need to decide for one of the two systems by calling either
+{any}`sfg.use_cuda <SfgGpuComposer.use_cuda>` or {any}`sfg.use_hip <SfgGpuComposer.use_hip>`.
+After registering a GPU kernel,
+you can render its invocation using {any}`sfg.gpu_invoke <SfgGpuComposer.gpu_invoke>`.
+
+Here is a basic example:
+
+```{code-cell} ipython3
+:tags: [remove-cell]
+
+f, g = ps.fields("f, g: double[2D]")
+asm = ps.Assignment(f(0), g(0))
+```
+
+```{code-cell} ipython3
+from pystencilssfg import SfgConfig
+sfg_config = SfgConfig()
+sfg_config.extensions.impl = "cu"
+
+with SourceFileGenerator(sfg_config) as sfg:
+    #   Activate CUDA
+    sfg.use_cuda()
+
+    #   Register the GPU kernel
+    cfg = ps.CreateKernelConfig()
+    cfg.target = ps.Target.CUDA
+    khandle = sfg.kernels.create(asm, "gpu_kernel", cfg)
+
+    #   Invoke it
+    sfg.function("kernel_wrapper")(
+        sfg.gpu_invoke(khandle)
+    )
+
+```
+
 #### Mapping Fields to Data Structures
 
 Pystencils kernels operate on n-dimensional contiguous or strided arrays,
diff --git a/src/pystencilssfg/composer/__init__.py b/src/pystencilssfg/composer/__init__.py
index f6af76b..c8f279e 100644
--- a/src/pystencilssfg/composer/__init__.py
+++ b/src/pystencilssfg/composer/__init__.py
@@ -9,6 +9,7 @@ from .basic_composer import (
 )
 from .mixin import SfgComposerMixIn
 from .class_composer import SfgClassComposer
+from .gpu_composer import SfgGpuComposer
 
 __all__ = [
     "SfgIComposer",
@@ -20,4 +21,5 @@ __all__ = [
     "ExprLike",
     "SfgBasicComposer",
     "SfgClassComposer",
+    "SfgGpuComposer",
 ]
diff --git a/src/pystencilssfg/composer/basic_composer.py b/src/pystencilssfg/composer/basic_composer.py
index 97334db..d78e43d 100644
--- a/src/pystencilssfg/composer/basic_composer.py
+++ b/src/pystencilssfg/composer/basic_composer.py
@@ -390,8 +390,8 @@ class SfgBasicComposer(SfgIComposer):
         """Use inside a function body to directly call a kernel.
 
         When using `call`, the given kernel will simply be called as a function.
-        To invoke a GPU kernel on a specified launch grid, use `cuda_invoke`
-        or the interfaces of ``pystencilssfg.extensions.sycl`` instead.
+        To invoke a GPU kernel on a specified launch grid,
+        use `gpu_invoke <SfgGpuComposer.gpu_invoke>` instead.
 
         Args:
             kernel_handle: Handle to a kernel previously added to some kernel namespace.
diff --git a/src/pystencilssfg/composer/gpu_composer.py b/src/pystencilssfg/composer/gpu_composer.py
index b24afcd..274c81c 100644
--- a/src/pystencilssfg/composer/gpu_composer.py
+++ b/src/pystencilssfg/composer/gpu_composer.py
@@ -24,6 +24,50 @@ from ..lang.gpu import ProvidesGpuRuntimeAPI
 
 
 class SfgGpuComposer(SfgComposerMixIn):
+    """Composer mix-in providing methods to generate GPU kernel invocations.
+
+    .. function:: gpu_invoke(kernel_handle: SfgKernelHandle, **kwargs)
+
+        Invoke a GPU kernel with launch configuration parameters depending on its code generator configuration.
+
+        The overloads of this method are listed below.
+        They all (partially) mirror the CUDA and HIP ``kernel<<< Gs, Bs, Sm, St >>>()`` syntax;
+        for details on the launch configuration arguments,
+        refer to `Launch Configurations in CUDA`_
+        or `Launch Configurations in HIP`_.
+
+    .. function:: gpu_invoke(kernel_handle: SfgKernelHandle, *, grid_size: ExprLike, block_size: ExprLike, shared_memory_bytes: ExprLike = "0", stream: ExprLike | None = None, ) -> SfgCallTreeNode
+        :noindex:
+
+        Invoke a GPU kernel with a manual launch grid.
+
+        Requires that the kernel was generated
+        with `manual_launch_grid <pystencils.codegen.config.GpuOptions.manual_launch_grid>`
+        set to `True`.
+
+    .. function:: gpu_invoke(self, kernel_handle: SfgKernelHandle, *, shared_memory_bytes: ExprLike = "0", stream: ExprLike | None = None, ) -> SfgCallTreeNode
+        :noindex:
+
+        Invoke a GPU kernel with an automatic launch grid.
+
+        This signature accepts kernels generated with an indexing scheme that
+        causes the launch grid to be determined automatically,
+        such as `Blockwise4D <pystencils.codegen.config.GpuIndexingScheme.Blockwise4D>`.
+
+    .. function:: gpu_invoke(self, kernel_handle: SfgKernelHandle, *, block_size: ExprLike | None = None, shared_memory_bytes: ExprLike = "0", stream: ExprLike | None = None, ) -> SfgCallTreeNode
+        :noindex:
+
+        Invoke a GPU kernel with a dynamic launch grid.
+
+        This signature accepts kernels generated with an indexing scheme that permits a user-defined
+        blocks size, such as `Linear3D <pystencils.codegen.config.GpuIndexingScheme.Linear3D>`.
+        The grid size is calculated automatically by dividing the number of work items in each
+        dimension by the block size, rounding up.
+
+    .. _Launch Configurations in CUDA: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#execution-configuration
+
+    .. _Launch Configurations in HIP: https://rocmdocs.amd.com/projects/HIP/en/latest/how-to/hip_cpp_language_extensions.html#calling-global-functions
+    """  # NOQA: E501
 
     def __init__(self) -> None:
         self._gpu_api_provider: ProvidesGpuRuntimeAPI | None = None
@@ -63,12 +107,7 @@ class SfgGpuComposer(SfgComposerMixIn):
         block_size: ExprLike,
         shared_memory_bytes: ExprLike = "0",
         stream: ExprLike | None = None,
-    ) -> SfgCallTreeNode:
-        """Invoke a CUDA kernel with a manual launch grid.
-
-        Requires that the kernel was generated with `manual_launch_grid <GpuOptions.manual_launch_grid>`
-        set to `True`.
-        """
+    ) -> SfgCallTreeNode: ...
 
     @overload
     def gpu_invoke(
@@ -77,12 +116,7 @@ class SfgGpuComposer(SfgComposerMixIn):
         *,
         shared_memory_bytes: ExprLike = "0",
         stream: ExprLike | None = None,
-    ) -> SfgCallTreeNode:
-        """Invoke a CUDA kernel with an automatic launch grid.
-
-        This signature accepts kernels generated with an indexing scheme that permits
-        the automatic inferrence of the launch grid, such as `Blockwise4D <IndexingScheme.Blockwise4D>`
-        """
+    ) -> SfgCallTreeNode: ...
 
     @overload
     def gpu_invoke(
@@ -92,14 +126,7 @@ class SfgGpuComposer(SfgComposerMixIn):
         block_size: ExprLike | None = None,
         shared_memory_bytes: ExprLike = "0",
         stream: ExprLike | None = None,
-    ) -> SfgCallTreeNode:
-        """Invoke a CUDA kernel with a dynamic launch grid.
-
-        This signature accepts kernels generated with an indexing scheme that permits a user-defined
-        blocks size, such as `Linear3D <IndexingScheme.Linear3D>`.
-        The grid size is calculated automatically by dividing the number of work items in each
-        dimension by the block size, rounding up.
-        """
+    ) -> SfgCallTreeNode: ...
 
     def gpu_invoke(self, kernel_handle: SfgKernelHandle, **kwargs) -> SfgCallTreeNode:
         assert isinstance(
-- 
GitLab