Commit 21c0ba7f authored by Frederik Hennig

documentation on GPU invocation

parent 3ff729e9
Merge request !24: Extend Support for CUDA and HIP kernel invocations

 import pystencilssfg
 from pystencilssfg.config import SfgConfig
+from os.path import splitext

 class DocsPatchedGenerator(pystencilssfg.SourceFileGenerator):
     """Mockup wrapper around SourceFileGenerator for use in documentation
@@ -30,21 +32,20 @@ class DocsPatchedGenerator(pystencilssfg.SourceFileGenerator):
         self._finish_files()

         header_code = self._emitter.dumps(self._header_file)
-        impl_code = (
-            None
-            if self._impl_file is None
-            else self._emitter.dumps(self._impl_file)
-        )
+        header_ext = splitext(self._header_file.name)[1]

         mdcode = ":::::{tab-set}\n"
-        mdcode += "::::{tab-item} Generated Header (.hpp)\n"
+        mdcode += f"::::{{tab-item}} Generated Header ({header_ext})\n"
         mdcode += ":::{code-block} C++\n\n"
         mdcode += header_code
         mdcode += "\n:::\n::::\n"

-        if impl_code:
-            mdcode += "::::{tab-item} Generated Implementation (.cpp)\n"
+        if self._impl_file is not None:
+            impl_code = self._emitter.dumps(self._impl_file)
+            impl_ext = splitext(self._impl_file.name)[1]
+            mdcode += f"::::{{tab-item}} Generated Implementation ({impl_ext})\n"
             mdcode += ":::{code-block} C++\n\n"
             mdcode += impl_code
             mdcode += "\n:::\n::::\n"
...
@@ -16,6 +16,9 @@ Composer API (``pystencilssfg.composer``)
 .. autoclass:: SfgClassComposer
     :members:

+.. autoclass:: SfgGpuComposer
+    :members:

 Custom Generators
 =================
...
@@ -344,6 +344,44 @@ cause them to be added to its signature.
We don't want to expose this complexity, but instead hide it by using appropriate data structures.
The next section explains how that is achieved in pystencils-sfg.

#### Invoking GPU Kernels

Pystencils also allows us to generate kernels for the CUDA and HIP GPU platforms.
First, we need to select one of the two platforms by calling either
{any}`sfg.use_cuda <SfgGpuComposer.use_cuda>` or {any}`sfg.use_hip <SfgGpuComposer.use_hip>`.
After registering a GPU kernel,
you can render its invocation using {any}`sfg.gpu_invoke <SfgGpuComposer.gpu_invoke>`.
Here is a basic example:

```{code-cell} ipython3
:tags: [remove-cell]

import pystencils as ps

f, g = ps.fields("f, g: double[2D]")
asm = ps.Assignment(f(0), g(0))
```

```{code-cell} ipython3
from pystencilssfg import SourceFileGenerator, SfgConfig

sfg_config = SfgConfig()
sfg_config.extensions.impl = "cu"

with SourceFileGenerator(sfg_config) as sfg:
    # Activate CUDA
    sfg.use_cuda()

    # Register the GPU kernel
    cfg = ps.CreateKernelConfig()
    cfg.target = ps.Target.CUDA
    khandle = sfg.kernels.create(asm, "gpu_kernel", cfg)

    # Invoke it
    sfg.function("kernel_wrapper")(
        sfg.gpu_invoke(khandle)
    )
```
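
Beyond this basic form, `gpu_invoke` also accepts launch configuration arguments when the kernel's indexing scheme permits a user-defined block size (see the `SfgGpuComposer` docstring further down in this commit). The following is only a sketch, reusing `sfg_config`, `cfg`, and `asm` from the example above; the wrapper name is made up, and it assumes that a plain string is accepted as an `ExprLike` launch configuration expression.

```python
with SourceFileGenerator(sfg_config) as sfg:
    sfg.use_cuda()

    # Register the same kernel as above
    khandle = sfg.kernels.create(asm, "gpu_kernel", cfg)

    # Sketch: pass an explicit block size as a C++ expression string;
    # the grid size is then derived by rounding up the work items per dimension.
    sfg.function("kernel_wrapper_custom_block")(
        sfg.gpu_invoke(khandle, block_size="dim3{64, 8, 1}")
    )
```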
#### Mapping Fields to Data Structures

Pystencils kernels operate on n-dimensional contiguous or strided arrays,
...
@@ -9,6 +9,7 @@ from .basic_composer import (
 )
 from .mixin import SfgComposerMixIn
 from .class_composer import SfgClassComposer
+from .gpu_composer import SfgGpuComposer

 __all__ = [
     "SfgIComposer",
@@ -20,4 +21,5 @@ __all__ = [
     "ExprLike",
     "SfgBasicComposer",
     "SfgClassComposer",
+    "SfgGpuComposer",
 ]
@@ -390,8 +390,8 @@ class SfgBasicComposer(SfgIComposer):
         """Use inside a function body to directly call a kernel.

         When using `call`, the given kernel will simply be called as a function.
-        To invoke a GPU kernel on a specified launch grid, use `cuda_invoke`
-        or the interfaces of ``pystencilssfg.extensions.sycl`` instead.
+        To invoke a GPU kernel on a specified launch grid,
+        use `gpu_invoke <SfgGpuComposer.gpu_invoke>` instead.

         Args:
             kernel_handle: Handle to a kernel previously added to some kernel namespace.
...
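
To make the distinction drawn in that docstring concrete, here is a minimal sketch (not part of this commit): `sfg.call` emits a plain function call, while `sfg.gpu_invoke` emits the GPU launch. The handle and function names below are hypothetical.

```python
# Hypothetical handles: cpu_khandle for a CPU kernel, gpu_khandle for a CUDA/HIP kernel,
# both registered earlier via sfg.kernels.create(...).
sfg.function("run_cpu_kernel")(
    sfg.call(cpu_khandle)        # generated C++: plain function call
)
sfg.function("run_gpu_kernel")(
    sfg.gpu_invoke(gpu_khandle)  # generated C++: GPU kernel launch
)
```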
@@ -24,6 +24,50 @@ from ..lang.gpu import ProvidesGpuRuntimeAPI
 class SfgGpuComposer(SfgComposerMixIn):
"""Composer mix-in providing methods to generate GPU kernel invocations.
.. function:: gpu_invoke(kernel_handle: SfgKernelHandle, **kwargs)
Invoke a GPU kernel with launch configuration parameters depending on its code generator configuration.
The overloads of this method are listed below.
They all (partially) mirror the CUDA and HIP ``kernel<<< Gs, Bs, Sm, St >>>()`` syntax;
for details on the launch configuration arguments,
refer to `Launch Configurations in CUDA`_
or `Launch Configurations in HIP`_.
.. function:: gpu_invoke(kernel_handle: SfgKernelHandle, *, grid_size: ExprLike, block_size: ExprLike, shared_memory_bytes: ExprLike = "0", stream: ExprLike | None = None, ) -> SfgCallTreeNode
:noindex:
Invoke a GPU kernel with a manual launch grid.
Requires that the kernel was generated
with `manual_launch_grid <pystencils.codegen.config.GpuOptions.manual_launch_grid>`
set to `True`.
.. function:: gpu_invoke(self, kernel_handle: SfgKernelHandle, *, shared_memory_bytes: ExprLike = "0", stream: ExprLike | None = None, ) -> SfgCallTreeNode
:noindex:
Invoke a GPU kernel with an automatic launch grid.
This signature accepts kernels generated with an indexing scheme that
causes the launch grid to be determined automatically,
such as `Blockwise4D <pystencils.codegen.config.GpuIndexingScheme.Blockwise4D>`.
.. function:: gpu_invoke(self, kernel_handle: SfgKernelHandle, *, block_size: ExprLike | None = None, shared_memory_bytes: ExprLike = "0", stream: ExprLike | None = None, ) -> SfgCallTreeNode
:noindex:
Invoke a GPU kernel with a dynamic launch grid.
This signature accepts kernels generated with an indexing scheme that permits a user-defined
blocks size, such as `Linear3D <pystencils.codegen.config.GpuIndexingScheme.Linear3D>`.
The grid size is calculated automatically by dividing the number of work items in each
dimension by the block size, rounding up.
.. _Launch Configurations in CUDA: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#execution-configuration
.. _Launch Configurations in HIP: https://rocmdocs.amd.com/projects/HIP/en/latest/how-to/hip_cpp_language_extensions.html#calling-global-functions
""" # NOQA: E501
     def __init__(self) -> None:
         self._gpu_api_provider: ProvidesGpuRuntimeAPI | None = None
@@ -63,12 +107,7 @@ class SfgGpuComposer(SfgComposerMixIn):
         block_size: ExprLike,
         shared_memory_bytes: ExprLike = "0",
         stream: ExprLike | None = None,
-    ) -> SfgCallTreeNode:
-        """Invoke a CUDA kernel with a manual launch grid.
-
-        Requires that the kernel was generated with `manual_launch_grid <GpuOptions.manual_launch_grid>`
-        set to `True`.
-        """
+    ) -> SfgCallTreeNode: ...

     @overload
     def gpu_invoke(
@@ -77,12 +116,7 @@ class SfgGpuComposer(SfgComposerMixIn):
         *,
         shared_memory_bytes: ExprLike = "0",
         stream: ExprLike | None = None,
-    ) -> SfgCallTreeNode:
-        """Invoke a CUDA kernel with an automatic launch grid.
-
-        This signature accepts kernels generated with an indexing scheme that permits
-        the automatic inferrence of the launch grid, such as `Blockwise4D <IndexingScheme.Blockwise4D>`
-        """
+    ) -> SfgCallTreeNode: ...

     @overload
     def gpu_invoke(
@@ -92,14 +126,7 @@ class SfgGpuComposer(SfgComposerMixIn):
         block_size: ExprLike | None = None,
         shared_memory_bytes: ExprLike = "0",
         stream: ExprLike | None = None,
-    ) -> SfgCallTreeNode:
-        """Invoke a CUDA kernel with a dynamic launch grid.
-
-        This signature accepts kernels generated with an indexing scheme that permits a user-defined
-        blocks size, such as `Linear3D <IndexingScheme.Linear3D>`.
-        The grid size is calculated automatically by dividing the number of work items in each
-        dimension by the block size, rounding up.
-        """
+    ) -> SfgCallTreeNode: ...

     def gpu_invoke(self, kernel_handle: SfgKernelHandle, **kwargs) -> SfgCallTreeNode:
         assert isinstance(
...
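
For orientation, the three overload shapes documented above could be exercised roughly as follows. This is a sketch only, not part of the commit: the handle names are hypothetical, and it assumes plain strings are accepted for the `ExprLike` launch configuration arguments.

```python
# Manual launch grid: kernel generated with the manual_launch_grid GPU option enabled
sfg.gpu_invoke(
    khandle_manual,
    grid_size="dim3{16, 16, 1}",
    block_size="dim3{32, 8, 1}",
)

# Automatic launch grid (e.g. Blockwise4D indexing): no launch grid arguments needed
sfg.gpu_invoke(khandle_auto)

# Dynamic launch grid (e.g. Linear3D indexing): optional block size and stream;
# the grid size is derived by dividing the work items by the block size, rounding up
sfg.gpu_invoke(khandle_linear3d, block_size="dim3{128, 1, 1}", stream="stream")
```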