diff --git a/src/pystencilssfg/composer/basic_composer.py b/src/pystencilssfg/composer/basic_composer.py
index 31337a6282932420f8d5b6d9093deec5c2caea1a..8a076745eac0d2cb98b88490469a58975de1e896 100644
--- a/src/pystencilssfg/composer/basic_composer.py
+++ b/src/pystencilssfg/composer/basic_composer.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from typing import Sequence, TypeAlias
+from typing import Sequence, TypeAlias, overload
 from abc import ABC, abstractmethod
 import sympy as sp
 from functools import reduce
@@ -13,7 +13,12 @@ from pystencils import (
     Assignment,
     AssignmentCollection,
 )
-from pystencils.codegen import Kernel
+from pystencils.codegen import Kernel, GpuKernel, Lambda
+from pystencils.codegen.gpu_indexing import (
+    ManualLaunchConfiguration,
+    AutomaticLaunchConfiguration,
+    DynamicBlockSizeLaunchConfiguration,
+)
 from pystencils.types import create_type, UserTypeSpec, PsType
 
 from ..context import SfgContext, SfgCursor
@@ -53,6 +58,7 @@ from ..lang import (
     HeaderFile,
     includes,
     SfgVar,
+    SfgKernelParamVar,
     AugExpr,
     SupportsFieldExtraction,
     SupportsVectorExtraction,
@@ -398,25 +404,155 @@ class SfgBasicComposer(SfgIComposer):
         """
         return SfgKernelCallNode(kernel_handle)
 
+    @overload
     def cuda_invoke(
         self,
         kernel_handle: SfgKernelHandle,
-        num_blocks: ExprLike,
-        threads_per_block: ExprLike,
-        stream: ExprLike | None,
-    ):
-        """Dispatch a CUDA kernel to the device."""
-        num_blocks_str = str(num_blocks)
-        tpb_str = str(threads_per_block)
-        stream_str = str(stream) if stream is not None else None
+        *,
+        grid_size: ExprLike,
+        block_size: ExprLike,
+        stream: ExprLike | None = None,
+    ) -> SfgCallTreeNode:
+        """Invoke a CUDA kernel with a manual launch grid.
+
+        Requires that the kernel was generated with `manual_launch_grid <GpuOptions.manual_launch_grid>`
+        set to `True`.
+        """
 
-        deps = depends(num_blocks) | depends(threads_per_block)
-        if stream is not None:
-            deps |= depends(stream)
+    @overload
+    def cuda_invoke(
+        self,
+        kernel_handle: SfgKernelHandle,
+        *,
+        stream: ExprLike | None = None,
+    ) -> SfgCallTreeNode:
+        """Invoke a CUDA kernel with an automatic launch grid.
 
-        return SfgCudaKernelInvocation(
-            kernel_handle, num_blocks_str, tpb_str, stream_str, deps
-        )
+        This signature accepts kernels generated with an indexing scheme that permits
+        the automatic inferrence of the launch grid, such as `Blockwise4D <IndexingScheme.Blockwise4D>`
+        """
+
+    @overload
+    def cuda_invoke(
+        self,
+        kernel_handle: SfgKernelHandle,
+        *,
+        block_size: ExprLike | None = None,
+        stream: ExprLike | None = None,
+    ) -> SfgCallTreeNode:
+        """Invoke a CUDA kernel with a dynamic launch grid.
+
+        This signature accepts kernels generated with an indexing scheme that permits a user-defined
+        blocks size, such as `Linear3D <IndexingScheme.Linear3D>`.
+        The grid size is calculated automatically.
+        """
+
+    def cuda_invoke(self, kernel_handle: SfgKernelHandle, **kwargs) -> SfgCallTreeNode:
+        ker = kernel_handle.kernel
+
+        if not isinstance(ker, GpuKernel):
+            raise ValueError(f"Non-GPU kernel was passed to `cuda_invoke`: {ker}")
+
+        launch_config = ker.get_launch_configuration()
+
+        from ..lang.cuda import dim3
+
+        def _render_invocation(
+            grid_size: ExprLike, block_size: ExprLike, stream: ExprLike | None
+        ):
+            stmt_grid_size = make_statements(grid_size)
+            stmt_block_size = make_statements(block_size)
+            stmt_stream = make_statements(stream) if stream is not None else None
+
+            return SfgCudaKernelInvocation(
+                kernel_handle, stmt_grid_size, stmt_block_size, stmt_stream
+            )
+
+        grid_size: ExprLike
+        block_size: ExprLike
+        stream: ExprLike | None
+
+        match launch_config:
+            case ManualLaunchConfiguration():
+                grid_size = kwargs["grid_size"]
+                block_size = kwargs["block_size"]
+                stream = kwargs["stream"]
+
+                return _render_invocation(grid_size, block_size, stream)
+
+            case AutomaticLaunchConfiguration():
+                stream = kwargs["stream"]
+
+                grid_size_entries = [
+                    self.expr_from_lambda(gs) for gs in launch_config._grid_size
+                ]
+                grid_size_var = dim3(const=True).var("__grid_size")
+
+                block_size_entries = [
+                    self.expr_from_lambda(bs) for bs in launch_config._block_size
+                ]
+                block_size_var = dim3(const=True).var("__block_size")
+
+                nodes: list[SfgCallTreeNode] = [
+                    self.init(grid_size_var)(*grid_size_entries),
+                    self.init(block_size_var)(*block_size_entries),
+                    _render_invocation(grid_size_var, block_size_var, stream),
+                ]
+
+                return SfgBlock(SfgSequence(nodes))
+
+            case DynamicBlockSizeLaunchConfiguration():
+                block_size = kwargs["block_size"]
+                stream = kwargs["stream"]
+
+                from ..lang.cpp import std
+
+                witem_types = [lmb.return_type for lmb in launch_config.num_work_items]
+                work_items_entries = [
+                    self.expr_from_lambda(wit) for wit in launch_config.num_work_items
+                ]
+                work_items_var = std.tuple(*witem_types, const=True).var("__work_items")
+
+                def _min(a: ExprLike, b: ExprLike):
+                    return AugExpr.format("{a} < {b} ? {a} : {b}", a=a, b=b)
+
+                def _div_ceil(a: ExprLike, b: ExprLike):
+                    return AugExpr.format("({a} + {b} - 1) / {b}", a=a, b=b)
+
+                block_size_var = dim3(const=True).var("__block_size")
+
+                reduced_block_size_entries = [
+                    _min(work_items_var.get(i), bs)
+                    for i, bs in enumerate(
+                        [block_size_var.x(), block_size_var.y(), block_size_var.z()]
+                    )
+                ]
+                reduced_block_size_var = dim3(const=True).var("__reduced_block_size")
+
+                grid_size_entries = [
+                    _div_ceil(work_items_var.get(i), bs)
+                    for i, bs in enumerate(
+                        [
+                            reduced_block_size_var.x(),
+                            reduced_block_size_var.y(),
+                            reduced_block_size_var.z(),
+                        ]
+                    )
+                ]
+                grid_size_var = dim3(const=True).var("__grid_size")
+
+                nodes = [
+                    self.init(block_size_var)(block_size),
+                    self.init(work_items_var)(*work_items_entries),
+                    self.init(reduced_block_size_var)(*reduced_block_size_entries),
+                    self.init(grid_size_var)(*grid_size_entries),
+                    _render_invocation(grid_size_var, reduced_block_size_var, stream),
+                ]
+
+                return SfgBlock(SfgSequence(nodes))
+
+            case _:
+                raise ValueError(f"Unexpected launch configuration: {launch_config}")
 
     def seq(self, *args: tuple | str | SfgCallTreeNode | SfgNodeBuilder) -> SfgSequence:
         """Syntax sequencing. For details, see `make_sequence`"""
@@ -511,6 +647,11 @@ class SfgBasicComposer(SfgIComposer):
         """
         return AugExpr.format(fmt, *deps, **kwdeps)
 
+    def expr_from_lambda(self, lamb: Lambda) -> AugExpr:
+        depends = set(SfgKernelParamVar(p) for p in lamb.parameters)
+        code = lamb.c_code()
+        return AugExpr.make(code, depends, dtype=lamb.return_type)
+
     @property
     def branch(self) -> SfgBranchBuilder:
         """Use inside a function body to create an if/else conditonal branch.
@@ -564,7 +705,11 @@ class SfgBasicComposer(SfgIComposer):
         var: SfgVar | sp.Symbol = asvar(param) if isinstance(param, _VarLike) else param
         return SfgDeferredParamSetter(var, expr)
 
-    def map_vector(self, lhs_components: Sequence[VarLike | sp.Symbol], rhs: SupportsVectorExtraction):
+    def map_vector(
+        self,
+        lhs_components: Sequence[VarLike | sp.Symbol],
+        rhs: SupportsVectorExtraction,
+    ):
         """Extracts scalar numerical values from a vector data type.
 
         Args:
diff --git a/src/pystencilssfg/ir/call_tree.py b/src/pystencilssfg/ir/call_tree.py
index 24a315d5a0ae0319cbc1b906f98deacc72828176..78ba84117cadf9f83f8d0cfdd4afc2ff2e169e81 100644
--- a/src/pystencilssfg/ir/call_tree.py
+++ b/src/pystencilssfg/ir/call_tree.py
@@ -19,6 +19,7 @@ class SfgCallTreeNode(ABC):
     Therefore, every instantiable call tree node must implement the method `get_code`.
     By convention, the string returned by `get_code` should not contain a trailing newline.
     """
+
     def __init__(self) -> None:
         self._includes: set[HeaderFile] = set()
 
@@ -34,6 +35,11 @@ class SfgCallTreeNode(ABC):
         By convention, the code block emitted by this function should not contain a trailing newline.
         """
 
+    @property
+    def depends(self) -> set[SfgVar]:
+        """Set of objects this leaf depends on"""
+        return set()
+
     @property
     def required_includes(self) -> set[HeaderFile]:
         """Return a set of header includes required by this node"""
@@ -53,11 +59,6 @@ class SfgCallTreeLeaf(SfgCallTreeNode, ABC):
     def children(self) -> Sequence[SfgCallTreeNode]:
         return ()
 
-    @property
-    @abstractmethod
-    def depends(self) -> set[SfgVar]:
-        """Set of objects this leaf depends on"""
-
 
 class SfgEmptyNode(SfgCallTreeLeaf):
     """A leaf node that does not emit any code.
@@ -202,21 +203,20 @@ class SfgKernelCallNode(SfgCallTreeLeaf):
         return set(self._kernel_handle.parameters)
 
     def get_code(self, cstyle: CodeStyle) -> str:
-        ast_params = self._kernel_handle.parameters
+        kparams = self._kernel_handle.parameters
         fnc_name = self._kernel_handle.fqname
-        call_parameters = ", ".join([p.name for p in ast_params])
+        call_parameters = ", ".join([p.name for p in kparams])
 
         return f"{fnc_name}({call_parameters});"
 
 
-class SfgCudaKernelInvocation(SfgCallTreeLeaf):
+class SfgCudaKernelInvocation(SfgCallTreeNode):
     def __init__(
         self,
         kernel_handle: SfgKernelHandle,
-        num_blocks_code: str,
-        threads_per_block_code: str,
-        stream_code: str | None,
-        depends: set[SfgVar],
+        grid_size: SfgStatements,
+        block_size: SfgStatements,
+        stream: SfgStatements | None,
     ):
         from pystencils import Target
         from pystencils.codegen import GpuKernel
@@ -229,25 +229,31 @@ class SfgCudaKernelInvocation(SfgCallTreeLeaf):
 
         super().__init__()
         self._kernel_handle = kernel_handle
-        self._num_blocks = num_blocks_code
-        self._threads_per_block = threads_per_block_code
-        self._stream = stream_code
-        self._depends = depends
+        self._grid_size = grid_size
+        self._block_size = block_size
+        self._stream = stream
+
+    @property
+    def children(self) -> Sequence[SfgCallTreeNode]:
+        return (
+            self._grid_size,
+            self._block_size,
+        ) + ((self._stream,) if self._stream is not None else ())
 
     @property
     def depends(self) -> set[SfgVar]:
-        return set(self._kernel_handle.parameters) | self._depends
+        return set(self._kernel_handle.parameters)
 
     def get_code(self, cstyle: CodeStyle) -> str:
-        ast_params = self._kernel_handle.parameters
+        kparams = self._kernel_handle.parameters
         fnc_name = self._kernel_handle.fqname
-        call_parameters = ", ".join([p.name for p in ast_params])
+        call_parameters = ", ".join([p.name for p in kparams])
 
-        grid_args = [self._num_blocks, self._threads_per_block]
+        grid_args = [self._grid_size, self._block_size]
         if self._stream is not None:
             grid_args += [self._stream]
 
-        grid = "<<< " + ", ".join(grid_args) + " >>>"
+        grid = "<<< " + ", ".join(arg.get_code(cstyle) for arg in grid_args) + " >>>"
         return f"{fnc_name}{grid}({call_parameters});"
 
 
diff --git a/src/pystencilssfg/ir/postprocessing.py b/src/pystencilssfg/ir/postprocessing.py
index 1e692b0aa9f37368da1688ec2d3bac6892c5ac60..896693317c02dee67302221f810d64c01b5eb233 100644
--- a/src/pystencilssfg/ir/postprocessing.py
+++ b/src/pystencilssfg/ir/postprocessing.py
@@ -1,7 +1,6 @@
 from __future__ import annotations
 from typing import Sequence, Iterable
 import warnings
-from functools import reduce
 from dataclasses import dataclass
 
 from abc import ABC, abstractmethod
@@ -15,7 +14,7 @@ from pystencils.codegen.properties import FieldBasePtr, FieldShape, FieldStride
 from ..exceptions import SfgException
 from ..config import CodeStyle
 
-from .call_tree import SfgCallTreeNode, SfgCallTreeLeaf, SfgSequence, SfgStatements
+from .call_tree import SfgCallTreeNode, SfgSequence, SfgStatements
 from ..lang.expressions import SfgKernelParamVar
 from ..lang import (
     SfgVar,
@@ -163,17 +162,12 @@ class CallTreePostProcessing:
                 self.handle_sequence(node, ppc)
                 return ppc.live_variables
 
-            case SfgCallTreeLeaf():
-                return node.depends
-
             case SfgDeferredNode():
                 raise SfgException("Deferred nodes can only occur inside a sequence.")
 
             case _:
-                return reduce(
-                    lambda x, y: x | y,
-                    (self.get_live_variables(c) for c in node.children),
-                    set(),
+                return node.depends.union(
+                    *(self.get_live_variables(c) for c in node.children)
                 )
 
 
diff --git a/src/pystencilssfg/lang/cpp/std_tuple.py b/src/pystencilssfg/lang/cpp/std_tuple.py
index 645b6b56fbeb515d6324feafdb8588f7ca22e992..6d1e1c0da0987be055006ce696779a1e2ebaa56b 100644
--- a/src/pystencilssfg/lang/cpp/std_tuple.py
+++ b/src/pystencilssfg/lang/cpp/std_tuple.py
@@ -19,10 +19,13 @@ class StdTuple(AugExpr, SupportsVectorExtraction):
         dtype = self._template(ts=", ".join(elt_type_strings), const=const, ref=ref)
         super().__init__(dtype)
 
+    def get(self, idx: int | str) -> AugExpr:
+        return AugExpr.format("std::get< {} >({})", idx, self)
+
     def _extract_component(self, coordinate: int) -> AugExpr:
         if coordinate < 0 or coordinate >= self._length:
             raise ValueError(
                 f"Index {coordinate} out-of-bounds for std::tuple with {self._length} entries."
             )
 
-        return AugExpr.format("std::get< {} >({})", coordinate, self)
+        return self.get(coordinate)
diff --git a/src/pystencilssfg/lang/cuda.py b/src/pystencilssfg/lang/cuda.py
new file mode 100644
index 0000000000000000000000000000000000000000..28794dab927def14cd083900a1fedf895e693c3a
--- /dev/null
+++ b/src/pystencilssfg/lang/cuda.py
@@ -0,0 +1,13 @@
+from .expressions import CppClass, cpptype
+
+
+from ..extensions.gpu import dim3class
+
+dim3 = dim3class(
+    "<cuda_runtime.h>"
+)
+"""Reflection of CUDA's `dim3`."""
+
+
+class cudaStream_t(CppClass):
+    template = cpptype("cudaStream_t", "<cuda_runtime.h>")
diff --git a/src/pystencilssfg/lang/expressions.py b/src/pystencilssfg/lang/expressions.py
index 135a54eed92e4ba214244c8f46323ea81f6610db..8be59b0b37d72113814fa1489a44e7816f0e6f75 100644
--- a/src/pystencilssfg/lang/expressions.py
+++ b/src/pystencilssfg/lang/expressions.py
@@ -218,8 +218,12 @@ class AugExpr:
         return self._bind(expr)
 
     @staticmethod
-    def make(code: str, depends: Iterable[SfgVar | AugExpr]):
-        return AugExpr()._bind(DependentExpression(code, depends))
+    def make(
+        code: str,
+        depends: Iterable[SfgVar | AugExpr],
+        dtype: UserTypeSpec | None = None,
+    ):
+        return AugExpr(dtype)._bind(DependentExpression(code, depends))
 
     @staticmethod
     def format(fmt: str, *deps, **kwdeps) -> AugExpr: