Frederik Hennig · Christoph Alt · Christoph Alt · 020347e7 · 020347e7 · 020347e7
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -24,7 +24,6 @@ typechecker:
    - nox --session typecheck

 .testsuite-base:
-  extends: .nox-base
  stage: "Tests"
  needs: []
  coverage: '/TOTAL.*\s+(\d+%)$/'
@@ -38,13 +37,18 @@ typechecker:
        coverage_format: cobertura
        path: coverage.xml

-"testsuite-py3.10":
+"testsuite-py3.10+cuda":
  extends: .testsuite-base
+  image: i10git.cs.fau.de:5005/pycodegen/pycodegen/nox:ubuntu24.04-cuda12.6
  script:
    - nox --session testsuite-3.10
+  tags:
+    - docker
+    - cuda11

 "testsuite-py3.13":
  extends: .testsuite-base
+  image: i10git.cs.fau.de:5005/pycodegen/pycodegen/nox:alpine
  script:
    - nox --session testsuite-3.13


--- a/docs/source/_util/sfg_monkeypatch.py
+++ b/docs/source/_util/sfg_monkeypatch.py
 import pystencilssfg
 from pystencilssfg.config import SfgConfig

+from os.path import splitext
+

 class DocsPatchedGenerator(pystencilssfg.SourceFileGenerator):
    """Mockup wrapper around SourceFileGenerator for use in documentation
@@ -28,23 +30,23 @@ class DocsPatchedGenerator(pystencilssfg.SourceFileGenerator):
    def __exit__(self, exc_type, exc_value, traceback):
        if exc_type is None:
            self._finish_files()
+            emitter = self._get_emitter()

-            header_code = self._emitter.dumps(self._header_file)
-            impl_code = (
-                None
-                if self._impl_file is None
-                else self._emitter.dumps(self._impl_file)
-            )
+            header_code = emitter.dumps(self._header_file)
+            header_ext = splitext(self._header_file.name)[1]

            mdcode = ":::::{tab-set}\n"

-            mdcode += "::::{tab-item} Generated Header (.hpp)\n"
+            mdcode += f"::::{{tab-item}} Generated Header ({header_ext})\n"
            mdcode += ":::{code-block} C++\n\n"
            mdcode += header_code
            mdcode += "\n:::\n::::\n"

-            if impl_code:
-                mdcode += "::::{tab-item} Generated Implementation (.cpp)\n"
+            if self._impl_file is not None:
+                impl_code = emitter.dumps(self._impl_file)
+                impl_ext = splitext(self._impl_file.name)[1]
+
+                mdcode += f"::::{{tab-item}} Generated Implementation ({impl_ext})\n"
                mdcode += ":::{code-block} C++\n\n"
                mdcode += impl_code
                mdcode += "\n:::\n::::\n"

--- a/docs/source/api/composer.rst
+++ b/docs/source/api/composer.rst
@@ -16,6 +16,9 @@ Composer API (``pystencilssfg.composer``)
 .. autoclass:: SfgClassComposer
    :members:

+.. autoclass:: SfgGpuComposer
+    :members:
+
 Custom Generators
 =================

@@ -37,6 +40,7 @@ Helper Methods and Builders

 .. autoclass:: SfgFunctionSequencer
    :members:
+    :inherited-members:

 .. autoclass:: SfgNodeBuilder
    :members:
@@ -47,6 +51,12 @@ Helper Methods and Builders
 .. autoclass:: SfgSwitchBuilder
    :members:

+.. module:: pystencilssfg.composer.class_composer
+
+.. autoclass:: SfgMethodSequencer
+    :members:
+    :inherited-members:
+
 Context and Cursor
 ==================


--- a/docs/source/api/lang.rst
+++ b/docs/source/api/lang.rst
@@ -41,3 +41,9 @@ Implementation

 .. automodule:: pystencilssfg.lang.cpp
    :members:
+
+GPU Runtime APIs
+----------------
+
+.. automodule:: pystencilssfg.lang.gpu
+    :members:
--- a/docs/source/usage/config_and_cli.md
+++ b/docs/source/usage/config_and_cli.md
@@ -12,7 +12,7 @@ different configuration sources:
  the generator script to set some of its configuration options; see [Command-Line Options](#cmdline_options)
 - **Project Configuration:** When embedded into a larger project, using a build system such as CMake, generator scripts
  may be configured globally within that project by the use of a *configuration module*.
-  Settings specified inside that configuration module are always overridden by the former to configuration sources.
+  Settings specified inside that configuration module are always overridden by the two other configuration sources listed above.
  For details on configuration modules, refer to the guide on [Project and Build System Integration](#guide_project_integration).

 (inline_config)=
@@ -60,14 +60,26 @@ set {any}`cfg.outer_namespace <SfgConfig.outer_namespace>`.

 ### Code Style and Formatting

- - Modify the values in the {any}`cfg.code_style <CodeStyle>` category to affect
-   certain formatting aspects of the generated code.
- - To change, enforce, or disable auto-formatting of generated code through `clang-format`,
-   take a look at the {any}`cfg.clang_format <ClangFormatOptions>` category.
- - Clang-format will, by default, sort `#include` statements alphabetically and separate
-   local and system header includes.
-   To override this, you can set a custom sorting key for `#include` sorting via
-   {any}`cfg.code_style.includes_sorting_key <CodeStyle.includes_sorting_key>`.
+Pystencils-sfg gives you some options to affect its output code style.
+These are controlled by the options in the {any}`cfg.code_style <CodeStyle>` category.
+
+Furthermore, pystencils-sfg uses `clang-format` to beautify generated code.
+The behaviour of the clang-format integration is managed by the
+the {any}`cfg.clang_format <ClangFormatOptions>` category,
+where you can set options to skip or enforce formatting,
+or change the formatter binary.
+To set the code style used by `clang-format` either create a `.clang-format` file
+in any of the parent folders of your generator script,
+or modify the {any}`cfg.clang_format.code_style <ClangFormatOptions.code_style>` option.
+
+:::{seealso}
+[Clang-Format Style Options](https://clang.llvm.org/docs/ClangFormatStyleOptions.html)
+:::
+
+Clang-format will, by default, sort `#include` statements alphabetically and separate
+local and system header includes.
+To override this, you can set a custom sorting key for `#include` sorting via
+{any}`cfg.code_style.includes_sorting_key <CodeStyle.includes_sorting_key>`.

 (cmdline_options)=
 ## Command-Line Options

--- a/docs/source/usage/how_to_composer.md
+++ b/docs/source/usage/how_to_composer.md
@@ -283,7 +283,7 @@ The composer gives us access to the default kernel namespace (`<current_namespac
 via `sfg.kernels`.

 To add a kernel,
- - either pass its assignments and the pystencils code generator configuration directly to {any}`kernels.reate() <KernelsAdder.create>`,
+ - either pass its assignments and the pystencils code generator configuration directly to {any}`kernels.create() <KernelsAdder.create>`,
 - or create the kernel separately through {any}`pystencils.create_kernel <pystencils.codegen.create_kernel>` and register it using
   {any}`kernels.add() <KernelsAdder.add>`.

@@ -392,13 +392,176 @@ with SourceFileGenerator() as sfg:
    )
 ```

-(exposed_inline_kernels)=
-### Exposed and Inline Kernels
+## GPU Kernels
+
+Pystencils also allows us to generate kernels for the CUDA and HIP GPU programming models.
+This section describes how to generate GPU kernels through pystencils-sfg;
+how to invoke them with various launch configurations,
+and how GPU execution streams are reflected.
+
+### Generate and Invoke CUDA and HIP Kernels
+
+To generate a kernel targetting either of these, set the
+{any}`target <pystencils.codegen.config.CreateKernelConfig.target>`
+code generator option to either `Target.CUDA` or `Target.HIP`.
+After registering a GPU kernel,
+its invocation can be rendered using {any}`sfg.gpu_invoke <SfgGpuComposer.gpu_invoke>`.
+Here is an example using CUDA:
+
+```{code-cell} ipython3
+from pystencilssfg import SfgConfig
+sfg_config = SfgConfig()
+sfg_config.extensions.impl = "cu"
+
+with SourceFileGenerator(sfg_config) as sfg:
+    #   Configure the code generator to use CUDA
+    cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
+
+    #   Create fields, assemble assignments
+    f, g = ps.fields("f, g: double[128, 128]")
+    asm = ps.Assignment(f(0), g(0))
+
+    #   Register kernel
+    khandle = sfg.kernels.create(asm, "gpu_kernel", cfg)
+
+    #   Invoke it
+    sfg.function("kernel_wrapper")(
+        sfg.gpu_invoke(khandle)
+    )
+```
+
+In this snippet, we used the [generator configuration](#how_to_generator_scripts_config)
+to change the suffix of the generated implementation file to `.cu`.
+
+When investigating the generated `.cu` file, you can see that the GPU launch configuration parameters
+*grid size* and *block size* are being computed automatically from the array sizes.
+This behavior can be changed by modifying options in the {any}`gpu <pystencils.codegen.config.GpuOptions>`
+category of the `CreateKernelConfig`.
+
+### Adapting the Launch Configuration
+
+GPU kernel invocations usually require the user to provide a launch grid, defined
+by the GPU thread block size and the number of blocks on the grid.
+In the simplest case (seen above), pystencils-sfg will emit code that automatically
+computes these parameters from the size of the arrays passed to the kernel,
+using a default block size defined by pystencils.
+
+The code generator also permits customization of the launch configuration.
+You may provide a custom block size to override the default, in which case the
+grid size will still be computed by dividing the array sizes by your block size.
+Otherwise, you can also fully take over control of both block and grid size.
+For both cases, instructions are given in the following.
+
+#### User-Defined Block Size for Auto-Computed Grid Size
+
+To merely modify the block size argument while still automatically inferring the grid size,
+pass a variable or expression of type `dim3` to the `block_size` parameter of `gpu_invoke`.
+Pystencils-sfg exposes two versions of `dim3`, which differ primarily in their associated
+runtime headers:
+
+ - {any}`pystencilssfg.lang.gpu.cuda.dim3 <CudaAPI.dim3>` for CUDA, and
+ - {any}`pystencilssfg.lang.gpu.hip.dim3 <HipAPI.dim3>` for HIP.
+
+The following snippet selects the correct `dim3` type according to the kernel target;
+it then creates a variable of that type and turns that into an argument to the kernel invocation:
+
+```{code-cell} ipython3
+:tags: [remove-cell]
+target = ps.Target.HIP
+cfg = ps.CreateKernelConfig(target=target)
+f, g = ps.fields("f, g: double[128, 128]")
+asm = ps.Assignment(f(0), g(0))
+```
+
+```{code-cell} ipython3
+from pystencilssfg.lang.gpu import hip
+
+with SourceFileGenerator(sfg_config) as sfg:
+    # ... define kernel ...
+    khandle = sfg.kernels.create(asm, "gpu_kernel", cfg)
+
+    #   Select dim3 reflection
+    match target:
+        case ps.Target.CUDA:
+            from pystencilssfg.lang.gpu import cuda as gpu_api
+        case ps.Target.HIP:
+            from pystencilssfg.lang.gpu import hip as gpu_api
+    
+    #   Create dim3 variable and pass it to kernel invocation
+    block_size = gpu_api.dim3(const=True).var("block_size")
+
+    sfg.function("kernel_wrapper")(
+        sfg.gpu_invoke(khandle, block_size=block_size)
+    )
+```
+
+#### Manual Launch Configurations
+
+To take full control of the launch configuration, we must disable its automatic inferrence
+by setting the {any}`gpu.manual_launch_grid <pystencils.codegen.config.GpuOptions.manual_launch_grid>`
+code generator option to `True`.
+Then, we must pass `dim3` arguments for both `block_size` and `grid_size` to the kernel invocation:
+
+```{code-cell} ipython3
+from pystencilssfg.lang.gpu import hip
+
+with SourceFileGenerator(sfg_config) as sfg:
+    # ... define kernel ...
+
+    #   Configure for manual launch config
+    cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
+    cfg.gpu.manual_launch_grid = True
+
+    #   Register kernel
+    khandle = sfg.kernels.create(asm, "gpu_kernel", cfg)
+    
+    #   Create dim3 variables
+    from pystencilssfg.lang.gpu import cuda
+    block_size = cuda.dim3(const=True).var("block_size")
+    grid_size = cuda.dim3(const=True).var("grid_size")
+
+    sfg.function("kernel_wrapper")(
+        sfg.gpu_invoke(khandle, block_size=block_size, grid_size=grid_size)
+    )
+```
+
+### Using Streams
+
+CUDA and HIP kernels can be enqueued into streams for concurrent execution.
+This is mirrored in pystencils-sfg;
+all overloads of `gpu_invoke` take an optional `stream` argument.
+The `stream_t` data types of both CUDA and HIP are made available
+through the respective API reflections:
+
+ - {any}`lang.gpu.cuda.stream_t <CudaAPI.stream_t>` reflects `cudaStream_t`, and
+ - {any}`lang.gpu.hip.stream_t <HipAPI.stream_t>` reflects `hipStream_t`.
+
+Here is an example that creates a variable of the HIP stream type
+and passes it to `gpu_invoke`:
+
+```{code-cell} ipython3
+:tags: [remove-cell]
+cfg = ps.CreateKernelConfig(target=ps.Target.HIP)
+f, g = ps.fields("f, g: double[128, 128]")
+asm = ps.Assignment(f(0), g(0))
+```
+
+```{code-cell} ipython3
+from pystencilssfg.lang.gpu import hip
+
+with SourceFileGenerator(sfg_config) as sfg:
+    # ... define kernel ...
+    khandle = sfg.kernels.create(asm, "gpu_kernel", cfg)
+
+    stream = hip.stream_t(const=True).var("stream")
+
+    sfg.function("kernel_wrapper")(
+        sfg.gpu_invoke(khandle, stream=stream)
+    )
+```

 :::{admonition} To Do

- - Creating and calling kernels
- - Invoking GPU kernels and the CUDA API Mirror
 - Defining classes, their fields constructors, and methods

 :::

--- a/src/pystencilssfg/composer/__init__.py
+++ b/src/pystencilssfg/composer/__init__.py
@@ -9,6 +9,7 @@ from .basic_composer import (
 )
 from .mixin import SfgComposerMixIn
 from .class_composer import SfgClassComposer
+from .gpu_composer import SfgGpuComposer

 __all__ = [
    "SfgIComposer",
@@ -20,4 +21,5 @@ __all__ = [
    "ExprLike",
    "SfgBasicComposer",
    "SfgClassComposer",
+    "SfgGpuComposer",
 ]
--- a/src/pystencilssfg/composer/basic_composer.py
+++ b/src/pystencilssfg/composer/basic_composer.py
@@ -13,7 +13,7 @@ from pystencils import (
    Assignment,
    AssignmentCollection,
 )
-from pystencils.codegen import Kernel
+from pystencils.codegen import Kernel, Lambda
 from pystencils.types import create_type, UserTypeSpec, PsType

 from ..context import SfgContext, SfgCursor
@@ -21,7 +21,6 @@ from .custom import CustomGenerator
 from ..ir import (
    SfgCallTreeNode,
    SfgKernelCallNode,
-    SfgCudaKernelInvocation,
    SfgStatements,
    SfgFunctionParams,
    SfgRequireIncludes,
@@ -53,6 +52,7 @@ from ..lang import (
    HeaderFile,
    includes,
    SfgVar,
+    SfgKernelParamVar,
    AugExpr,
    SupportsFieldExtraction,
    SupportsVectorExtraction,
@@ -390,34 +390,14 @@ class SfgBasicComposer(SfgIComposer):
        """Use inside a function body to directly call a kernel.

        When using `call`, the given kernel will simply be called as a function.
-        To invoke a GPU kernel on a specified launch grid, use `cuda_invoke`
-        or the interfaces of ``pystencilssfg.extensions.sycl`` instead.
+        To invoke a GPU kernel on a specified launch grid,
+        use `gpu_invoke <SfgGpuComposer.gpu_invoke>` instead.

        Args:
            kernel_handle: Handle to a kernel previously added to some kernel namespace.
        """
        return SfgKernelCallNode(kernel_handle)

-    def cuda_invoke(
-        self,
-        kernel_handle: SfgKernelHandle,
-        num_blocks: ExprLike,
-        threads_per_block: ExprLike,
-        stream: ExprLike | None,
-    ):
-        """Dispatch a CUDA kernel to the device."""
-        num_blocks_str = str(num_blocks)
-        tpb_str = str(threads_per_block)
-        stream_str = str(stream) if stream is not None else None
-
-        deps = depends(num_blocks) | depends(threads_per_block)
-        if stream is not None:
-            deps |= depends(stream)
-
-        return SfgCudaKernelInvocation(
-            kernel_handle, num_blocks_str, tpb_str, stream_str, deps
-        )
-
    def seq(self, *args: tuple | str | SfgCallTreeNode | SfgNodeBuilder) -> SfgSequence:
        """Syntax sequencing. For details, see `make_sequence`"""
        return make_sequence(*args)
@@ -511,6 +491,11 @@ class SfgBasicComposer(SfgIComposer):
        """
        return AugExpr.format(fmt, *deps, **kwdeps)

+    def expr_from_lambda(self, lamb: Lambda) -> AugExpr:
+        depends = set(SfgKernelParamVar(p) for p in lamb.parameters)
+        code = lamb.c_code()
+        return AugExpr.make(code, depends, dtype=lamb.return_type)
+
    @property
    def branch(self) -> SfgBranchBuilder:
        """Use inside a function body to create an if/else conditonal branch.
@@ -564,7 +549,11 @@ class SfgBasicComposer(SfgIComposer):
        var: SfgVar | sp.Symbol = asvar(param) if isinstance(param, _VarLike) else param
        return SfgDeferredParamSetter(var, expr)

-    def map_vector(self, lhs_components: Sequence[VarLike | sp.Symbol], rhs: SupportsVectorExtraction):
+    def map_vector(
+        self,
+        lhs_components: Sequence[VarLike | sp.Symbol],
+        rhs: SupportsVectorExtraction,
+    ):
        """Extracts scalar numerical values from a vector data type.

        Args:

--- a/src/pystencilssfg/composer/composer.py
+++ b/src/pystencilssfg/composer/composer.py
@@ -3,12 +3,13 @@ from typing import TYPE_CHECKING

 from .basic_composer import SfgBasicComposer
 from .class_composer import SfgClassComposer
+from .gpu_composer import SfgGpuComposer

 if TYPE_CHECKING:
    from ..context import SfgContext


-class SfgComposer(SfgBasicComposer, SfgClassComposer):
+class SfgComposer(SfgBasicComposer, SfgClassComposer, SfgGpuComposer):
    """Primary interface for constructing source files in pystencils-sfg.

    The SfgComposer combines the `SfgBasicComposer`
@@ -19,3 +20,4 @@ class SfgComposer(SfgBasicComposer, SfgClassComposer):
    def __init__(self, sfg: SfgContext | SfgBasicComposer):
        SfgBasicComposer.__init__(self, sfg)
        SfgClassComposer.__init__(self)
+        SfgGpuComposer.__init__(self)
--- a/src/pystencilssfg/composer/gpu_composer.py
+++ b/src/pystencilssfg/composer/gpu_composer.py
+from __future__ import annotations
+
+from typing import overload
+
+from pystencils.codegen import GpuKernel, Target
+from pystencils.codegen.gpu_indexing import (
+    ManualLaunchConfiguration,
+    AutomaticLaunchConfiguration,
+    DynamicBlockSizeLaunchConfiguration,
+)
+
+from .mixin import SfgComposerMixIn
+from .basic_composer import make_statements, make_sequence
+
+from ..context import SfgContext
+from ..ir import (
+    SfgKernelHandle,
+    SfgCallTreeNode,
+    SfgGpuKernelInvocation,
+    SfgBlock,
+    SfgSequence,
+)
+from ..lang import ExprLike, AugExpr
+from ..lang.gpu import CudaAPI, HipAPI, ProvidesGpuRuntimeAPI
+
+
+class SfgGpuComposer(SfgComposerMixIn):
+    """Composer mix-in providing methods to generate GPU kernel invocations.
+
+    .. function:: gpu_invoke(kernel_handle: SfgKernelHandle, **kwargs)
+
+        Invoke a GPU kernel with launch configuration parameters depending on its code generator configuration.
+
+        The overloads of this method are listed below.
+        They all (partially) mirror the CUDA and HIP ``kernel<<< Gs, Bs, Sm, St >>>()`` syntax;
+        for details on the launch configuration arguments,
+        refer to `Launch Configurations in CUDA`_
+        or `Launch Configurations in HIP`_.
+
+    .. function:: gpu_invoke(kernel_handle: SfgKernelHandle, *, grid_size: ExprLike, block_size: ExprLike, shared_memory_bytes: ExprLike = "0", stream: ExprLike | None = None, ) -> SfgCallTreeNode
+        :noindex:
+
+        Invoke a GPU kernel with a manual launch grid.
+
+        Requires that the kernel was generated
+        with `manual_launch_grid <pystencils.codegen.config.GpuOptions.manual_launch_grid>`
+        set to `True`.
+
+    .. function:: gpu_invoke(self, kernel_handle: SfgKernelHandle, *, shared_memory_bytes: ExprLike = "0", stream: ExprLike | None = None, ) -> SfgCallTreeNode
+        :noindex:
+
+        Invoke a GPU kernel with an automatic launch grid.
+
+        This signature accepts kernels generated with an indexing scheme that
+        causes the launch grid to be determined automatically,
+        such as `Blockwise4D <pystencils.codegen.config.GpuIndexingScheme.Blockwise4D>`.
+
+    .. function:: gpu_invoke(self, kernel_handle: SfgKernelHandle, *, block_size: ExprLike | None = None, shared_memory_bytes: ExprLike = "0", stream: ExprLike | None = None, ) -> SfgCallTreeNode
+        :noindex:
+
+        Invoke a GPU kernel with a dynamic launch grid.
+
+        This signature accepts kernels generated with an indexing scheme that permits a user-defined
+        blocks size, such as `Linear3D <pystencils.codegen.config.GpuIndexingScheme.Linear3D>`.
+        The grid size is calculated automatically by dividing the number of work items in each
+        dimension by the block size, rounding up.
+
+    .. _Launch Configurations in CUDA: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#execution-configuration
+
+    .. _Launch Configurations in HIP: https://rocmdocs.amd.com/projects/HIP/en/latest/how-to/hip_cpp_language_extensions.html#calling-global-functions
+    """  # NOQA: E501
+
+    @overload
+    def gpu_invoke(
+        self,
+        kernel_handle: SfgKernelHandle,
+        *,
+        grid_size: ExprLike,
+        block_size: ExprLike,
+        shared_memory_bytes: ExprLike = "0",
+        stream: ExprLike | None = None,
+    ) -> SfgCallTreeNode: ...
+
+    @overload
+    def gpu_invoke(
+        self,
+        kernel_handle: SfgKernelHandle,
+        *,
+        shared_memory_bytes: ExprLike = "0",
+        stream: ExprLike | None = None,
+    ) -> SfgCallTreeNode: ...
+
+    @overload
+    def gpu_invoke(
+        self,
+        kernel_handle: SfgKernelHandle,
+        *,
+        block_size: ExprLike | None = None,
+        shared_memory_bytes: ExprLike = "0",
+        stream: ExprLike | None = None,
+    ) -> SfgCallTreeNode: ...
+
+    def gpu_invoke(
+        self,
+        kernel_handle: SfgKernelHandle,
+        shared_memory_bytes: ExprLike = "0",
+        stream: ExprLike | None = None,
+        **kwargs,
+    ) -> SfgCallTreeNode:
+        builder = GpuInvocationBuilder(self._ctx, kernel_handle)
+        builder.shared_memory_bytes = shared_memory_bytes
+        builder.stream = stream
+
+        return builder(**kwargs)
+
+    def cuda_invoke(
+        self,
+        kernel_handle: SfgKernelHandle,
+        num_blocks: ExprLike,
+        threads_per_block: ExprLike,
+        stream: ExprLike | None,
+    ):
+        from warnings import warn
+
+        warn(
+            "cuda_invoke is deprecated and will be removed before version 0.1. "
+            "Use `gpu_invoke` instead.",
+            FutureWarning,
+        )
+
+        return self.gpu_invoke(
+            kernel_handle,
+            grid_size=num_blocks,
+            block_size=threads_per_block,
+            stream=stream,
+        )
+
+
+class GpuInvocationBuilder:
+    def __init__(
+        self,
+        ctx: SfgContext,
+        kernel_handle: SfgKernelHandle,
+    ):
+        self._ctx = ctx
+        self._kernel_handle = kernel_handle
+
+        ker = kernel_handle.kernel
+
+        if not isinstance(ker, GpuKernel):
+            raise ValueError(f"Non-GPU kernel was passed to `gpu_invoke`: {ker}")
+
+        launch_config = ker.get_launch_configuration()
+
+        self._launch_config = launch_config
+
+        gpu_api: type[ProvidesGpuRuntimeAPI]
+        match ker.target:
+            case Target.CUDA:
+                gpu_api = CudaAPI
+            case Target.HIP:
+                gpu_api = HipAPI
+            case _:
+                assert False, "unexpected GPU target"
+
+        self._gpu_api = gpu_api
+        self._dim3 = gpu_api.dim3
+
+        self._shared_memory_bytes: ExprLike = "0"
+        self._stream: ExprLike | None = None
+
+    @property
+    def shared_memory_bytes(self) -> ExprLike:
+        return self._shared_memory_bytes
+
+    @shared_memory_bytes.setter
+    def shared_memory_bytes(self, bs: ExprLike):
+        self._shared_memory_bytes = bs
+
+    @property
+    def stream(self) -> ExprLike | None:
+        return self._stream
+
+    @stream.setter
+    def stream(self, s: ExprLike | None):
+        self._stream = s
+
+    def _render_invocation(
+        self, grid_size: ExprLike, block_size: ExprLike
+    ) -> SfgSequence:
+        stmt_grid_size = make_statements(grid_size)
+        stmt_block_size = make_statements(block_size)
+        stmt_smem = make_statements(self._shared_memory_bytes)
+        stmt_stream = (
+            make_statements(self._stream) if self._stream is not None else None
+        )
+
+        return make_sequence(
+            "// clang-format off: "
+            "[pystencils-sfg] Formatting may add illegal spaces between angular brackets in `<<< >>>`.",
+            SfgGpuKernelInvocation(
+                self._kernel_handle,
+                stmt_grid_size,
+                stmt_block_size,
+                shared_memory_bytes=stmt_smem,
+                stream=stmt_stream,
+            ),
+            "// clang-format on",
+        )
+
+    def __call__(self, **kwargs: ExprLike) -> SfgCallTreeNode:
+        match self._launch_config:
+            case ManualLaunchConfiguration():
+                return self._invoke_manual(**kwargs)
+            case AutomaticLaunchConfiguration():
+                return self._invoke_automatic(**kwargs)
+            case DynamicBlockSizeLaunchConfiguration():
+                return self._invoke_dynamic(**kwargs)
+            case _:
+                raise ValueError(
+                    f"Unexpected launch configuration: {self._launch_config}"
+                )
+
+    def _invoke_manual(self, grid_size: ExprLike, block_size: ExprLike):
+        assert isinstance(self._launch_config, ManualLaunchConfiguration)
+        return self._render_invocation(grid_size, block_size)
+
+    def _invoke_automatic(self):
+        assert isinstance(self._launch_config, AutomaticLaunchConfiguration)
+
+        from .composer import SfgComposer
+
+        sfg = SfgComposer(self._ctx)
+
+        grid_size_entries = [
+            self._to_uint32_t(sfg.expr_from_lambda(gs))
+            for gs in self._launch_config._grid_size
+        ]
+        grid_size_var = self._dim3(const=True).var("__grid_size")
+
+        block_size_entries = [
+            self._to_uint32_t(sfg.expr_from_lambda(bs))
+            for bs in self._launch_config._block_size
+        ]
+        block_size_var = self._dim3(const=True).var("__block_size")
+
+        nodes = [
+            sfg.init(grid_size_var)(*grid_size_entries),
+            sfg.init(block_size_var)(*block_size_entries),
+            self._render_invocation(grid_size_var, block_size_var),
+        ]
+
+        return SfgBlock(SfgSequence(nodes))
+
+    def _invoke_dynamic(self, block_size: ExprLike | None = None):
+        assert isinstance(self._launch_config, DynamicBlockSizeLaunchConfiguration)
+
+        from .composer import SfgComposer
+
+        sfg = SfgComposer(self._ctx)
+
+        block_size_init_args: tuple[ExprLike, ...]
+        if block_size is None:
+            block_size_init_args = tuple(
+                str(bs) for bs in self._launch_config.default_block_size
+            )
+        else:
+            block_size_init_args = (block_size,)
+
+        block_size_var = self._dim3(const=True).var("__block_size")
+
+        from ..lang.cpp import std
+
+        work_items_entries = [
+            sfg.expr_from_lambda(wit) for wit in self._launch_config.num_work_items
+        ]
+        work_items_var = std.tuple("uint32_t", "uint32_t", "uint32_t", const=True).var(
+            "__work_items"
+        )
+
+        def _div_ceil(a: ExprLike, b: ExprLike):
+            return AugExpr.format("({a} + {b} - 1) / {b}", a=a, b=b)
+
+        grid_size_entries = [
+            _div_ceil(work_items_var.get(i), bs)
+            for i, bs in enumerate(
+                [
+                    block_size_var.x,
+                    block_size_var.y,
+                    block_size_var.z,
+                ]
+            )
+        ]
+        grid_size_var = self._dim3(const=True).var("__grid_size")
+
+        nodes = [
+            sfg.init(block_size_var)(*block_size_init_args),
+            sfg.init(work_items_var)(*work_items_entries),
+            sfg.init(grid_size_var)(*grid_size_entries),
+            self._render_invocation(grid_size_var, block_size_var),
+        ]
+
+        return SfgBlock(SfgSequence(nodes))
+
+    @staticmethod
+    def _to_uint32_t(expr: AugExpr) -> AugExpr:
+        return AugExpr("uint32_t").format("uint32_t({})", expr)
--- a/src/pystencilssfg/context.py
+++ b/src/pystencilssfg/context.py
@@ -2,7 +2,7 @@ from __future__ import annotations
 from typing import Sequence, Any, Generator
 from contextlib import contextmanager

-from .config import CodeStyle
+from .config import CodeStyle, ClangFormatOptions
 from .ir import (
    SfgSourceFile,
    SfgNamespace,
@@ -23,6 +23,7 @@ class SfgContext:
        impl_file: SfgSourceFile | None,
        namespace: str | None = None,
        codestyle: CodeStyle | None = None,
+        clang_format_opts: ClangFormatOptions | None = None,
        argv: Sequence[str] | None = None,
        project_info: Any = None,
    ):
@@ -33,6 +34,9 @@ class SfgContext:
        self._inner_namespace: str | None = None

        self._codestyle = codestyle if codestyle is not None else CodeStyle()
+        self._clang_format: ClangFormatOptions = (
+            clang_format_opts if clang_format_opts is not None else ClangFormatOptions()
+        )

        self._header_file = header_file
        self._impl_file = impl_file
@@ -73,6 +77,10 @@ class SfgContext:
        """The code style object for this generation context."""
        return self._codestyle

+    @property
+    def clang_format(self) -> ClangFormatOptions:
+        return self._clang_format
+
    @property
    def header_file(self) -> SfgSourceFile:
        return self._header_file
@@ -150,6 +158,9 @@ class SfgCursor:
            self._loc[f].append(block)
            self._loc[f] = block.elements

+        outer_namespace = self._cur_namespace
+        self._cur_namespace = namespace
+
        @contextmanager
        def ctxmgr():
            try:
@@ -157,5 +168,6 @@ class SfgCursor:
            finally:
                #   Have the cursor step back out of the nested namespace blocks
                self._loc = outer_locs
+                self._cur_namespace = outer_namespace

        return ctxmgr()
--- a/src/pystencilssfg/extensions/gpu.py
+++ b/src/pystencilssfg/extensions/gpu.py
-from pystencilssfg import lang
-
-
-def dim3class(gpu_runtime_header: str, *, cls_name: str = "dim3"):
-    """
-    >>> dim3 = dim3class("<hip/hip_runtime.h>")
-    >>> dim3().ctor(64, 1, 1)
-    dim3{64, 1, 1}
-
-    Args:
-        gpu_runtime_header: String with the name of the gpu runtime header
-        cls_name: String with the acutal name (default "dim3")
-    Returns:
-        Dim3Class: A `lang.CppClass` that mimics cuda's/hip's `dim3`
-    """
-    @lang.cppclass(cls_name, gpu_runtime_header)
-    class Dim3Class:
-        def ctor(self, dim0=1, dim1=1, dim2=1):
-            return self.ctor_bind(dim0, dim1, dim2)
-
-        @property
-        def x(self):
-            return lang.AugExpr.format("{}.x", self)
-
-        @property
-        def y(self):
-            return lang.AugExpr.format("{}.y", self)
-
-        @property
-        def z(self):
-            return lang.AugExpr.format("{}.z", self)
-
-        @property
-        def dims(self):
-            """The dims property."""
-            return [self.x, self.y, self.z]
-
-    return Dim3Class
--- a/src/pystencilssfg/generator.py
+++ b/src/pystencilssfg/generator.py
@@ -95,9 +95,7 @@ class SourceFileGenerator:
            self._impl_file = SfgSourceFile(
                output_files[1].name, SfgSourceFileType.TRANSLATION_UNIT
            )
-            self._impl_file.includes.append(
-                HeaderFile.parse(self._header_file.name)
-            )
+            self._impl_file.includes.append(HeaderFile.parse(self._header_file.name))

        #   TODO: Find a way to not hard-code the restrict qualifier in pystencils
        self._header_file.elements.append("#define RESTRICT __restrict__")
@@ -115,14 +113,11 @@ class SourceFileGenerator:
            self._impl_file,
            namespace,
            config.codestyle,
+            config.clang_format,
            argv=script_args,
            project_info=cli_params.get_project_info(),
        )

-        self._emitter = SfgCodeEmitter(
-            self._output_dir, config.codestyle, config.clang_format
-        )
-
        sort_key = config.codestyle.get_option("includes_sorting_key")
        if sort_key is None:

@@ -161,6 +156,13 @@ class SourceFileGenerator:
            )
            self._impl_file.includes.sort(key=self._include_sort_key)

+    def _get_emitter(self):
+        return SfgCodeEmitter(
+            self._output_dir,
+            self._context.codestyle,
+            self._context.clang_format,
+        )
+
    def __enter__(self) -> SfgComposer:
        self.clean_files()
        return SfgComposer(self._context)
@@ -169,6 +171,7 @@ class SourceFileGenerator:
        if exc_type is None:
            self._finish_files()

-            self._emitter.emit(self._header_file)
+            emitter = self._get_emitter()
+            emitter.emit(self._header_file)
            if self._impl_file is not None:
-                self._emitter.emit(self._impl_file)
+                emitter.emit(self._impl_file)
--- a/src/pystencilssfg/ir/__init__.py
+++ b/src/pystencilssfg/ir/__init__.py
@@ -3,7 +3,7 @@ from .call_tree import (
    SfgCallTreeLeaf,
    SfgEmptyNode,
    SfgKernelCallNode,
-    SfgCudaKernelInvocation,
+    SfgGpuKernelInvocation,
    SfgBlock,
    SfgSequence,
    SfgStatements,
@@ -47,7 +47,7 @@ __all__ = [
    "SfgCallTreeLeaf",
    "SfgEmptyNode",
    "SfgKernelCallNode",
-    "SfgCudaKernelInvocation",
+    "SfgGpuKernelInvocation",
    "SfgSequence",
    "SfgBlock",
    "SfgStatements",

--- a/src/pystencilssfg/ir/call_tree.py
+++ b/src/pystencilssfg/ir/call_tree.py
@@ -19,6 +19,7 @@ class SfgCallTreeNode(ABC):
    Therefore, every instantiable call tree node must implement the method `get_code`.
    By convention, the string returned by `get_code` should not contain a trailing newline.
    """
+
    def __init__(self) -> None:
        self._includes: set[HeaderFile] = set()

@@ -34,6 +35,11 @@ class SfgCallTreeNode(ABC):
        By convention, the code block emitted by this function should not contain a trailing newline.
        """

+    @property
+    def depends(self) -> set[SfgVar]:
+        """Set of objects this leaf depends on"""
+        return set()
+
    @property
    def required_includes(self) -> set[HeaderFile]:
        """Return a set of header includes required by this node"""
@@ -53,11 +59,6 @@ class SfgCallTreeLeaf(SfgCallTreeNode, ABC):
    def children(self) -> Sequence[SfgCallTreeNode]:
        return ()

-    @property
-    @abstractmethod
-    def depends(self) -> set[SfgVar]:
-        """Set of objects this leaf depends on"""
-

 class SfgEmptyNode(SfgCallTreeLeaf):
    """A leaf node that does not emit any code.
@@ -202,52 +203,76 @@ class SfgKernelCallNode(SfgCallTreeLeaf):
        return set(self._kernel_handle.parameters)

    def get_code(self, cstyle: CodeStyle) -> str:
-        ast_params = self._kernel_handle.parameters
+        kparams = self._kernel_handle.parameters
        fnc_name = self._kernel_handle.fqname
-        call_parameters = ", ".join([p.name for p in ast_params])
+        call_parameters = ", ".join([p.name for p in kparams])

        return f"{fnc_name}({call_parameters});"


-class SfgCudaKernelInvocation(SfgCallTreeLeaf):
+class SfgGpuKernelInvocation(SfgCallTreeNode):
+    """A CUDA or HIP kernel invocation.
+
+    See https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#execution-configuration
+    or https://rocmdocs.amd.com/projects/HIP/en/latest/how-to/hip_cpp_language_extensions.html#calling-global-functions
+    for the syntax.
+    """
+
    def __init__(
        self,
        kernel_handle: SfgKernelHandle,
-        num_blocks_code: str,
-        threads_per_block_code: str,
-        stream_code: str | None,
-        depends: set[SfgVar],
+        grid_size: SfgStatements,
+        block_size: SfgStatements,
+        shared_memory_bytes: SfgStatements | None,
+        stream: SfgStatements | None,
    ):
-        from pystencils import Target
        from pystencils.codegen import GpuKernel

        kernel = kernel_handle.kernel
-        if not (isinstance(kernel, GpuKernel) and kernel.target == Target.CUDA):
+        if not isinstance(kernel, GpuKernel):
            raise ValueError(
-                "An `SfgCudaKernelInvocation` node can only call a CUDA kernel."
+                "An `SfgGpuKernelInvocation` node can only call GPU kernels."
            )

        super().__init__()
        self._kernel_handle = kernel_handle
-        self._num_blocks = num_blocks_code
-        self._threads_per_block = threads_per_block_code
-        self._stream = stream_code
-        self._depends = depends
+        self._grid_size = grid_size
+        self._block_size = block_size
+        self._shared_memory_bytes = shared_memory_bytes
+        self._stream = stream
+
+    @property
+    def children(self) -> Sequence[SfgCallTreeNode]:
+        return (
+            (
+                self._grid_size,
+                self._block_size,
+            )
+            + (
+                (self._shared_memory_bytes,)
+                if self._shared_memory_bytes is not None
+                else ()
+            )
+            + ((self._stream,) if self._stream is not None else ())
+        )

    @property
    def depends(self) -> set[SfgVar]:
-        return set(self._kernel_handle.parameters) | self._depends
+        return set(self._kernel_handle.parameters)

    def get_code(self, cstyle: CodeStyle) -> str:
-        ast_params = self._kernel_handle.parameters
+        kparams = self._kernel_handle.parameters
        fnc_name = self._kernel_handle.fqname
-        call_parameters = ", ".join([p.name for p in ast_params])
+        call_parameters = ", ".join([p.name for p in kparams])
+
+        grid_args = [self._grid_size, self._block_size]
+        if self._shared_memory_bytes is not None:
+            grid_args += [self._shared_memory_bytes]

-        grid_args = [self._num_blocks, self._threads_per_block]
        if self._stream is not None:
            grid_args += [self._stream]

-        grid = "<<< " + ", ".join(grid_args) + " >>>"
+        grid = "<<< " + ", ".join(arg.get_code(cstyle) for arg in grid_args) + " >>>"
        return f"{fnc_name}{grid}({call_parameters});"



--- a/src/pystencilssfg/ir/postprocessing.py
+++ b/src/pystencilssfg/ir/postprocessing.py
 from __future__ import annotations
 from typing import Sequence, Iterable
 import warnings
-from functools import reduce
 from dataclasses import dataclass

 from abc import ABC, abstractmethod
@@ -15,7 +14,7 @@ from pystencils.codegen.properties import FieldBasePtr, FieldShape, FieldStride
 from ..exceptions import SfgException
 from ..config import CodeStyle

-from .call_tree import SfgCallTreeNode, SfgCallTreeLeaf, SfgSequence, SfgStatements
+from .call_tree import SfgCallTreeNode, SfgSequence, SfgStatements
 from ..lang.expressions import SfgKernelParamVar
 from ..lang import (
    SfgVar,
@@ -163,17 +162,12 @@ class CallTreePostProcessing:
                self.handle_sequence(node, ppc)
                return ppc.live_variables

-            case SfgCallTreeLeaf():
-                return node.depends
-
            case SfgDeferredNode():
                raise SfgException("Deferred nodes can only occur inside a sequence.")

            case _:
-                return reduce(
-                    lambda x, y: x | y,
-                    (self.get_live_variables(c) for c in node.children),
-                    set(),
+                return node.depends.union(
+                    *(self.get_live_variables(c) for c in node.children)
                )



--- a/src/pystencilssfg/lang/cpp/std_tuple.py
+++ b/src/pystencilssfg/lang/cpp/std_tuple.py
@@ -19,10 +19,13 @@ class StdTuple(AugExpr, SupportsVectorExtraction):
        dtype = self._template(ts=", ".join(elt_type_strings), const=const, ref=ref)
        super().__init__(dtype)

+    def get(self, idx: int | str) -> AugExpr:
+        return AugExpr.format("std::get< {} >({})", idx, self)
+
    def _extract_component(self, coordinate: int) -> AugExpr:
        if coordinate < 0 or coordinate >= self._length:
            raise ValueError(
                f"Index {coordinate} out-of-bounds for std::tuple with {self._length} entries."
            )

-        return AugExpr.format("std::get< {} >({})", coordinate, self)
+        return self.get(coordinate)
--- a/src/pystencilssfg/lang/expressions.py
+++ b/src/pystencilssfg/lang/expressions.py
@@ -218,8 +218,12 @@ class AugExpr:
        return self._bind(expr)

    @staticmethod
-    def make(code: str, depends: Iterable[SfgVar | AugExpr]):
-        return AugExpr()._bind(DependentExpression(code, depends))
+    def make(
+        code: str,
+        depends: Iterable[SfgVar | AugExpr],
+        dtype: UserTypeSpec | None = None,
+    ):
+        return AugExpr(dtype)._bind(DependentExpression(code, depends))

    @staticmethod
    def format(fmt: str, *deps, **kwdeps) -> AugExpr:

--- a/src/pystencilssfg/lang/gpu.py
+++ b/src/pystencilssfg/lang/gpu.py
+from __future__ import annotations
+
+from typing import Protocol
+
+from .expressions import CppClass, cpptype, AugExpr
+
+
+class Dim3Interface(CppClass):
+    """Interface definition for the ``dim3`` struct of Cuda and HIP."""
+
+    def ctor(self, dim0=1, dim1=1, dim2=1):
+        """Constructor invocation of ``dim3``"""
+        return self.ctor_bind(dim0, dim1, dim2)
+
+    @property
+    def x(self) -> AugExpr:
+        """The `x` coordinate member."""
+        return AugExpr.format("{}.x", self)
+
+    @property
+    def y(self) -> AugExpr:
+        """The `y` coordinate member."""
+        return AugExpr.format("{}.y", self)
+
+    @property
+    def z(self) -> AugExpr:
+        """The `z` coordinate member."""
+        return AugExpr.format("{}.z", self)
+
+    @property
+    def dims(self) -> tuple[AugExpr, AugExpr, AugExpr]:
+        """`x`, `y`, and `z` as a tuple."""
+        return (self.x, self.y, self.z)
+
+
+class ProvidesGpuRuntimeAPI(Protocol):
+    """Protocol definition for a GPU runtime API provider."""
+
+    dim3: type[Dim3Interface]
+    """The ``dim3`` struct type for this GPU runtime"""
+
+    stream_t: type[AugExpr]
+    """The ``stream_t`` type for this GPU runtime"""
+
+
+class CudaAPI(ProvidesGpuRuntimeAPI):
+    """Reflection of the CUDA runtime API"""
+
+    class dim3(Dim3Interface):
+        """Implements `Dim3Interface` for CUDA"""
+
+        template = cpptype("dim3", "<cuda_runtime.h>")
+
+    class stream_t(CppClass):
+        template = cpptype("cudaStream_t", "<cuda_runtime.h>")
+
+
+cuda = CudaAPI
+"""Alias for `CudaAPI`"""
+
+
+class HipAPI(ProvidesGpuRuntimeAPI):
+    """Reflection of the HIP runtime API"""
+
+    class dim3(Dim3Interface):
+        """Implements `Dim3Interface` for HIP"""
+
+        template = cpptype("dim3", "<hip/hip_runtime.h>")
+
+    class stream_t(CppClass):
+        template = cpptype("hipStream_t", "<hip/hip_runtime.h>")
+
+
+hip = HipAPI
+"""Alias for `HipAPI`"""
--- a/tests/extensions/test_gpu.py
+++ b/tests/extensions/test_gpu.py
-from pystencilssfg.extensions.gpu import dim3class
-from pystencilssfg.lang import HeaderFile, AugExpr
-
-
-def test_dim3():
-    cuda_runtime = "<cuda_runtime.h>"
-    dim3 = dim3class(cuda_runtime, cls_name="dim3")
-    assert HeaderFile.parse(cuda_runtime) in dim3.template.includes
-    assert str(dim3().ctor(128, 1, 1)) == "dim3{128, 1, 1}"
-    assert str(dim3().ctor()) == "dim3{1, 1, 1}"
-    assert str(dim3().ctor(1, 1, 128)) == "dim3{1, 1, 128}"
-
-    block = dim3(ref=True, const=True).var("block")
-
-    dims = [
-        AugExpr.format(
-            "uint32_t(({} + {} - 1)/ {})",
-            1024,
-            block.dims[i],
-            block.dims[i],
-        )
-        for i in range(3)
-    ]
-
-    grid = dim3().ctor(*dims)
-    assert str(grid) == f"dim3{{{', '.join((str(d) for d in dims))}}}"
No results found