diff --git a/docs/source/_util/sfg_monkeypatch.py b/docs/source/_util/sfg_monkeypatch.py
index 0269d40f43492ea1540f51f49c8e78c5ebebf37d..127760338002be1edd0490a2887c819290fdbcf3 100644
--- a/docs/source/_util/sfg_monkeypatch.py
+++ b/docs/source/_util/sfg_monkeypatch.py
@@ -1,6 +1,8 @@
 import pystencilssfg
 from pystencilssfg.config import SfgConfig
 
+from os.path import splitext
+
 
 class DocsPatchedGenerator(pystencilssfg.SourceFileGenerator):
     """Mockup wrapper around SourceFileGenerator for use in documentation
@@ -30,21 +32,20 @@ class DocsPatchedGenerator(pystencilssfg.SourceFileGenerator):
             self._finish_files()
 
             header_code = self._emitter.dumps(self._header_file)
-            impl_code = (
-                None
-                if self._impl_file is None
-                else self._emitter.dumps(self._impl_file)
-            )
+            header_ext = splitext(self._header_file.name)[1]
 
             mdcode = ":::::{tab-set}\n"
 
-            mdcode += "::::{tab-item} Generated Header (.hpp)\n"
+            mdcode += f"::::{{tab-item}} Generated Header ({header_ext})\n"
             mdcode += ":::{code-block} C++\n\n"
             mdcode += header_code
             mdcode += "\n:::\n::::\n"
 
-            if impl_code:
-                mdcode += "::::{tab-item} Generated Implementation (.cpp)\n"
+            if self._impl_file is not None:
+                impl_code = self._emitter.dumps(self._impl_file)
+                impl_ext = splitext(self._impl_file.name)[1]
+
+                mdcode += f"::::{{tab-item}} Generated Implementation ({impl_ext})\n"
                 mdcode += ":::{code-block} C++\n\n"
                 mdcode += impl_code
                 mdcode += "\n:::\n::::\n"
diff --git a/docs/source/api/composer.rst b/docs/source/api/composer.rst
index 124d0fb97ac8f94bc2a6d4c38815edee8403c65b..078e0ebfa368aed242f48da505a20b2634767496 100644
--- a/docs/source/api/composer.rst
+++ b/docs/source/api/composer.rst
@@ -16,6 +16,9 @@ Composer API (``pystencilssfg.composer``)
 .. autoclass:: SfgClassComposer
     :members:
 
+.. autoclass:: SfgGpuComposer
+    :members:
+
 Custom Generators
 =================
 
diff --git a/docs/source/usage/how_to_composer.md b/docs/source/usage/how_to_composer.md
index 966a9a661b8f7c5d4d863b07c2a9549a95032591..4610d07b9614e17481506b40e822cdd9dfe6bcdc 100644
--- a/docs/source/usage/how_to_composer.md
+++ b/docs/source/usage/how_to_composer.md
@@ -344,6 +344,44 @@ cause them to be added to its signature.
 We don't want to expose this complexity, but instead hide it by using appropriate data structures.
 The next section explains how that is achieved in pystencils-sfg.
 
+#### Invoking GPU Kernels
+
+Pystencils also allows us to generate kernels for the CUDA and HIP GPU platforms.
+First, we need to decide for one of the two systems by calling either
+{any}`sfg.use_cuda <SfgGpuComposer.use_cuda>` or {any}`sfg.use_hip <SfgGpuComposer.use_hip>`.
+After registering a GPU kernel,
+you can render its invocation using {any}`sfg.gpu_invoke <SfgGpuComposer.gpu_invoke>`.
+
+Here is a basic example:
+
+```{code-cell} ipython3
+:tags: [remove-cell]
+
+f, g = ps.fields("f, g: double[2D]")
+asm = ps.Assignment(f(0), g(0))
+```
+
+```{code-cell} ipython3
+from pystencilssfg import SfgConfig
+sfg_config = SfgConfig()
+sfg_config.extensions.impl = "cu"
+
+with SourceFileGenerator(sfg_config) as sfg:
+    #   Activate CUDA
+    sfg.use_cuda()
+
+    #   Register the GPU kernel
+    cfg = ps.CreateKernelConfig()
+    cfg.target = ps.Target.CUDA
+    khandle = sfg.kernels.create(asm, "gpu_kernel", cfg)
+
+    #   Invoke it
+    sfg.function("kernel_wrapper")(
+        sfg.gpu_invoke(khandle)
+    )
+
+```
+
 #### Mapping Fields to Data Structures
 
 Pystencils kernels operate on n-dimensional contiguous or strided arrays,
diff --git a/src/pystencilssfg/composer/__init__.py b/src/pystencilssfg/composer/__init__.py
index f6af76b8b9c36445990fc451983fec5c14a4cf34..c8f279ecd43c9e7809e8f7796c5ad4ad36ba7a76 100644
--- a/src/pystencilssfg/composer/__init__.py
+++ b/src/pystencilssfg/composer/__init__.py
@@ -9,6 +9,7 @@ from .basic_composer import (
 )
 from .mixin import SfgComposerMixIn
 from .class_composer import SfgClassComposer
+from .gpu_composer import SfgGpuComposer
 
 __all__ = [
     "SfgIComposer",
@@ -20,4 +21,5 @@ __all__ = [
     "ExprLike",
     "SfgBasicComposer",
     "SfgClassComposer",
+    "SfgGpuComposer",
 ]
diff --git a/src/pystencilssfg/composer/basic_composer.py b/src/pystencilssfg/composer/basic_composer.py
index 97334db7cca63228cd0462d13152ad65d4762a6b..d78e43deb53345aceca56baa6cf63f07b3a1d8de 100644
--- a/src/pystencilssfg/composer/basic_composer.py
+++ b/src/pystencilssfg/composer/basic_composer.py
@@ -390,8 +390,8 @@ class SfgBasicComposer(SfgIComposer):
         """Use inside a function body to directly call a kernel.
 
         When using `call`, the given kernel will simply be called as a function.
-        To invoke a GPU kernel on a specified launch grid, use `cuda_invoke`
-        or the interfaces of ``pystencilssfg.extensions.sycl`` instead.
+        To invoke a GPU kernel on a specified launch grid,
+        use `gpu_invoke <SfgGpuComposer.gpu_invoke>` instead.
 
         Args:
             kernel_handle: Handle to a kernel previously added to some kernel namespace.
diff --git a/src/pystencilssfg/composer/gpu_composer.py b/src/pystencilssfg/composer/gpu_composer.py
index b24afcdc9b18151febb0df16d1d9b3c52b06d6ef..274c81ccb6c1e537996630f5d1b83f3b0ea7cf1b 100644
--- a/src/pystencilssfg/composer/gpu_composer.py
+++ b/src/pystencilssfg/composer/gpu_composer.py
@@ -24,6 +24,50 @@ from ..lang.gpu import ProvidesGpuRuntimeAPI
 
 
 class SfgGpuComposer(SfgComposerMixIn):
+    """Composer mix-in providing methods to generate GPU kernel invocations.
+
+    .. function:: gpu_invoke(kernel_handle: SfgKernelHandle, **kwargs)
+
+        Invoke a GPU kernel with launch configuration parameters depending on its code generator configuration.
+
+        The overloads of this method are listed below.
+        They all (partially) mirror the CUDA and HIP ``kernel<<< Gs, Bs, Sm, St >>>()`` syntax;
+        for details on the launch configuration arguments,
+        refer to `Launch Configurations in CUDA`_
+        or `Launch Configurations in HIP`_.
+
+    .. function:: gpu_invoke(kernel_handle: SfgKernelHandle, *, grid_size: ExprLike, block_size: ExprLike, shared_memory_bytes: ExprLike = "0", stream: ExprLike | None = None, ) -> SfgCallTreeNode
+        :noindex:
+
+        Invoke a GPU kernel with a manual launch grid.
+
+        Requires that the kernel was generated
+        with `manual_launch_grid <pystencils.codegen.config.GpuOptions.manual_launch_grid>`
+        set to `True`.
+
+    .. function:: gpu_invoke(self, kernel_handle: SfgKernelHandle, *, shared_memory_bytes: ExprLike = "0", stream: ExprLike | None = None, ) -> SfgCallTreeNode
+        :noindex:
+
+        Invoke a GPU kernel with an automatic launch grid.
+
+        This signature accepts kernels generated with an indexing scheme that
+        causes the launch grid to be determined automatically,
+        such as `Blockwise4D <pystencils.codegen.config.GpuIndexingScheme.Blockwise4D>`.
+
+    .. function:: gpu_invoke(self, kernel_handle: SfgKernelHandle, *, block_size: ExprLike | None = None, shared_memory_bytes: ExprLike = "0", stream: ExprLike | None = None, ) -> SfgCallTreeNode
+        :noindex:
+
+        Invoke a GPU kernel with a dynamic launch grid.
+
+        This signature accepts kernels generated with an indexing scheme that permits a user-defined
+        blocks size, such as `Linear3D <pystencils.codegen.config.GpuIndexingScheme.Linear3D>`.
+        The grid size is calculated automatically by dividing the number of work items in each
+        dimension by the block size, rounding up.
+
+    .. _Launch Configurations in CUDA: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#execution-configuration
+
+    .. _Launch Configurations in HIP: https://rocmdocs.amd.com/projects/HIP/en/latest/how-to/hip_cpp_language_extensions.html#calling-global-functions
+    """  # NOQA: E501
 
     def __init__(self) -> None:
         self._gpu_api_provider: ProvidesGpuRuntimeAPI | None = None
@@ -63,12 +107,7 @@ class SfgGpuComposer(SfgComposerMixIn):
         block_size: ExprLike,
         shared_memory_bytes: ExprLike = "0",
         stream: ExprLike | None = None,
-    ) -> SfgCallTreeNode:
-        """Invoke a CUDA kernel with a manual launch grid.
-
-        Requires that the kernel was generated with `manual_launch_grid <GpuOptions.manual_launch_grid>`
-        set to `True`.
-        """
+    ) -> SfgCallTreeNode: ...
 
     @overload
     def gpu_invoke(
@@ -77,12 +116,7 @@ class SfgGpuComposer(SfgComposerMixIn):
         *,
         shared_memory_bytes: ExprLike = "0",
         stream: ExprLike | None = None,
-    ) -> SfgCallTreeNode:
-        """Invoke a CUDA kernel with an automatic launch grid.
-
-        This signature accepts kernels generated with an indexing scheme that permits
-        the automatic inferrence of the launch grid, such as `Blockwise4D <IndexingScheme.Blockwise4D>`
-        """
+    ) -> SfgCallTreeNode: ...
 
     @overload
     def gpu_invoke(
@@ -92,14 +126,7 @@ class SfgGpuComposer(SfgComposerMixIn):
         block_size: ExprLike | None = None,
         shared_memory_bytes: ExprLike = "0",
         stream: ExprLike | None = None,
-    ) -> SfgCallTreeNode:
-        """Invoke a CUDA kernel with a dynamic launch grid.
-
-        This signature accepts kernels generated with an indexing scheme that permits a user-defined
-        blocks size, such as `Linear3D <IndexingScheme.Linear3D>`.
-        The grid size is calculated automatically by dividing the number of work items in each
-        dimension by the block size, rounding up.
-        """
+    ) -> SfgCallTreeNode: ...
 
     def gpu_invoke(self, kernel_handle: SfgKernelHandle, **kwargs) -> SfgCallTreeNode:
         assert isinstance(