Compare revisions

Changes are shown as if the source revision was being merged into the target revision.
Showing 261 additions and 123 deletions
......@@ -4,14 +4,6 @@
Symbolic Language
*****************
.. toctree::
:maxdepth: 2
:hidden:
field
sympyextensions
Pystencils allows you to define near-arbitrarily complex numerical kernels in its symbolic
language, which is based on the computer algebra system `SymPy <https://www.sympy.org>`_.
The pystencils code generator is able to parse and translate a large portion of SymPy's
......@@ -64,7 +56,7 @@ An assignment collection contains two separate lists of assignments:
into fields.
.. autosummary::
:toctree: autoapi
:toctree: generated
:nosignatures:
:template: autosummary/recursive_class.rst
......
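The symbolic-language page excerpted above builds on SymPy; for reference, a minimal kernel definition in that language looks roughly like this (a sketch using the public `ps.fields` / `ps.Assignment` / `ps.create_kernel` entry points, not part of this diff):

import pystencils as ps

# Two 2D double-precision fields; the assignment averages the four neighbours of src into dst.
src, dst = ps.fields("src, dst: float64[2D]")
update = ps.Assignment(dst.center(), (src[1, 0] + src[-1, 0] + src[0, 1] + src[0, -1]) / 4)

kernel = ps.create_kernel(update).compile()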
......@@ -11,7 +11,7 @@ Type Creation and Conversion
----------------------------
.. autosummary::
:toctree: autoapi
:toctree: generated
:nosignatures:
create_type
......@@ -34,7 +34,7 @@ unless you have very particular needs.
:parts: 1
.. autosummary::
:toctree: autoapi
:toctree: generated
:nosignatures:
:template: autosummary/entire_class.rst
......@@ -82,10 +82,10 @@ Exceptions
.. currentmodule:: pystencils.types
.. autosummary::
:toctree: autoapi
:toctree: generated
:nosignatures:
pystencils.types.PsTypeError
PsTypeError
Implementation Details
......
......@@ -46,12 +46,13 @@ use_cython = [
]
doc = [
'sphinx',
'furo',
'nbsphinx',
'pydata-sphinx-theme==0.15.4',
'sphinx-book-theme==1.1.3', # workaround for https://github.com/executablebooks/sphinx-book-theme/issues/865
'sphinxcontrib-bibtex',
'sphinx_autodoc_typehints',
'pandoc',
'sphinx_design',
'myst-nb'
]
tests = [
'pytest',
......
......@@ -5,6 +5,7 @@ from .defaults import DEFAULTS
from . import fd
from . import stencil as stencil
from .display_utils import get_code_obj, get_code_str, show_code, to_dot
from .inspection import inspect
from .field import Field, FieldType, fields
from .types import create_type, create_numeric_type
from .cache import clear_cache
......@@ -37,7 +38,6 @@ from .sympyextensions.typed_sympy import TypedSymbol, DynamicType
from .sympyextensions import SymbolCreator
from .datahandling import create_data_handling
__all__ = [
"Field",
"FieldType",
......@@ -63,6 +63,7 @@ __all__ = [
"to_dot",
"get_code_obj",
"get_code_str",
"inspect",
"AssignmentCollection",
"Assignment",
"AddAugmentedAssignment",
......
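The new top-level export makes the inspection utility reachable as `ps.inspect`; a hedged sketch of its use (the exact argument it accepts is an assumption here):

import pystencils as ps

src, dst = ps.fields("src, dst: float64[2D]")
kernel = ps.create_kernel(ps.Assignment(dst.center(), 2 * src.center()))
ps.inspect(kernel)  # assumption: renders the kernel's intermediate representation / generated code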
from .base_printer import EmissionError
from .c_printer import emit_code, CAstPrinter
from .ir_printer import emit_ir, IRAstPrinter
__all__ = ["emit_code", "CAstPrinter", "emit_ir", "IRAstPrinter"]
__all__ = ["emit_code", "CAstPrinter", "emit_ir", "IRAstPrinter", "EmissionError"]
......@@ -189,7 +189,7 @@ class BasePrinter(ABC):
pc.indent_level += self._indent_width
interior = "\n".join(self.visit(stmt, pc) for stmt in statements) + "\n"
pc.indent_level -= self._indent_width
return pc.indent("{\n") + interior + pc.indent("}\n")
return pc.indent("{\n") + interior + pc.indent("}")
case PsStatement(expr):
return pc.indent(f"{self.visit(expr, pc)};")
......
......@@ -5,7 +5,7 @@ from pystencils.backend.memory import PsSymbol
from .base_printer import BasePrinter
from ..kernelfunction import KernelFunction
from ...types import PsType, PsArrayType, PsScalarType
from ...types import PsType, PsArrayType, PsScalarType, PsTypeError
from ..ast.expressions import PsBufferAcc
from ..ast.vector import PsVecMemAcc
......@@ -23,7 +23,10 @@ class CAstPrinter(BasePrinter):
def visit(self, node: PsAstNode, pc: PrinterCtx) -> str:
match node:
case PsVecMemAcc():
raise EmissionError("Cannot print vectorized array accesses to C code.")
raise EmissionError(
f"Unable to print C code for vector memory access {node}.\n"
f"Vectorized memory accesses must be mapped to intrinsics before emission."
)
case PsBufferAcc():
raise EmissionError(
......@@ -33,7 +36,7 @@ class CAstPrinter(BasePrinter):
case _:
return super().visit(node, pc)
def _symbol_decl(self, symb: PsSymbol):
dtype = symb.get_dtype()
......@@ -52,11 +55,12 @@ class CAstPrinter(BasePrinter):
def _constant_literal(self, constant: PsConstant):
dtype = constant.get_dtype()
if not isinstance(dtype, PsScalarType):
raise EmissionError(
"Cannot print literals for non-scalar constants."
)
raise EmissionError("Cannot print literals for non-scalar constants.")
return dtype.create_literal(constant.value)
def _type_str(self, dtype: PsType):
return dtype.c_string()
try:
return dtype.c_string()
except PsTypeError:
raise EmissionError(f"Unable to print type {dtype} as a C data type.")
......@@ -59,7 +59,7 @@ class IRAstPrinter(BasePrinter):
stride_code = "" if stride is None else f", stride={stride}"
code = f"vec_load< {lanes}{stride_code} >({ptr_code}, {offset_code})"
code = f"vec_memacc< {lanes}{stride_code} >({ptr_code}, {offset_code})"
return pc.parenthesize(code, Ops.Subscript)
case PsVecBroadcast(lanes, operand):
......
......@@ -41,6 +41,7 @@ class CupyKernelWrapper(KernelWrapper):
self._kfunc: GpuKernelFunction = kfunc
self._raw_kernel = raw_kernel
self._block_size = block_size
self._num_blocks: tuple[int, int, int] | None = None
self._args_cache: dict[Any, tuple] = dict()
@property
......@@ -59,6 +60,14 @@ class CupyKernelWrapper(KernelWrapper):
def block_size(self, bs: tuple[int, int, int]):
self._block_size = bs
@property
def num_blocks(self) -> tuple[int, int, int] | None:
return self._num_blocks
@num_blocks.setter
def num_blocks(self, nb: tuple[int, int, int] | None):
self._num_blocks = nb
def __call__(self, **kwargs: Any):
kernel_args, launch_grid = self._get_cached_args(**kwargs)
device = self._get_device(kernel_args)
......@@ -72,7 +81,7 @@ class CupyKernelWrapper(KernelWrapper):
return devices.pop()
def _get_cached_args(self, **kwargs):
key = (self._block_size,) + tuple((k, id(v)) for k, v in kwargs.items())
key = (self._block_size, self._num_blocks) + tuple((k, id(v)) for k, v in kwargs.items())
if key not in self._args_cache:
args = self._get_args(**kwargs)
......@@ -185,25 +194,36 @@ class CupyKernelWrapper(KernelWrapper):
symbolic_threads_range = self._kfunc.threads_range
threads_range: list[int] = [
evaluate_expression(expr, valuation)
for expr in symbolic_threads_range.num_work_items
]
if self._num_blocks is not None:
launch_grid = LaunchGrid(self._num_blocks, self._block_size)
if symbolic_threads_range.dim < 3:
threads_range += [1] * (3 - symbolic_threads_range.dim)
elif symbolic_threads_range is not None:
threads_range: list[int] = [
evaluate_expression(expr, valuation)
for expr in symbolic_threads_range.num_work_items
]
def div_ceil(a, b):
return a // b if a % b == 0 else a // b + 1
if symbolic_threads_range.dim < 3:
threads_range += [1] * (3 - symbolic_threads_range.dim)
# TODO: Refine this?
grid_size = tuple(
div_ceil(threads, tpb)
for threads, tpb in zip(threads_range, self._block_size)
)
assert len(grid_size) == 3
def div_ceil(a, b):
return a // b if a % b == 0 else a // b + 1
# TODO: Refine this?
num_blocks = tuple(
div_ceil(threads, tpb)
for threads, tpb in zip(threads_range, self._block_size)
)
assert len(num_blocks) == 3
launch_grid = LaunchGrid(num_blocks, self._block_size)
launch_grid = LaunchGrid(grid_size, self._block_size)
else:
raise JitError(
"Unable to determine launch grid for GPU kernel invocation: "
"No manual grid size was specified, and the number of threads could not "
"be determined automatically."
)
return tuple(args), launch_grid
......
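With the `num_blocks` property added above, the launch grid of a compiled GPU kernel can be pinned manually instead of being derived from the iteration space; a rough usage sketch (only `block_size` and `num_blocks` are taken from this diff, the surrounding calls are illustrative):

kernel = ps.create_kernel(update, config=gpu_cfg).compile()  # assumed: a CUDA-targeted config
kernel.block_size = (128, 2, 1)   # threads per block
kernel.num_blocks = (16, 8, 1)    # blocks per grid; bypasses the automatic grid-size computation
kernel(src=src_arr, dst=dst_arr)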
......@@ -139,6 +139,13 @@ class AstFactory:
self._typify(self.parse_index(iter_slice) + self.parse_index(1))
)
step = self.parse_index(1)
if normalize_to is not None:
upper_limit = self.parse_index(normalize_to)
if isinstance(start, PsConstantExpr) and start.constant.value < 0:
start = fold(self._typify(upper_limit.clone() + start))
stop = fold(self._typify(upper_limit.clone() + stop))
else:
start = self._parse_any_index(
iter_slice.start if iter_slice.start is not None else 0
......@@ -157,21 +164,21 @@ class AstFactory:
f"Invalid value for `slice.step`: {step.constant.value}"
)
if normalize_to is not None:
upper_limit = self.parse_index(normalize_to)
if isinstance(start, PsConstantExpr) and start.constant.value < 0:
start = fold(self._typify(upper_limit.clone() + start))
if normalize_to is not None:
upper_limit = self.parse_index(normalize_to)
if isinstance(start, PsConstantExpr) and start.constant.value < 0:
start = fold(self._typify(upper_limit.clone() + start))
if stop is None:
stop = upper_limit
elif isinstance(stop, PsConstantExpr) and stop.constant.value < 0:
stop = fold(self._typify(upper_limit.clone() + stop))
if stop is None:
stop = upper_limit
elif isinstance(stop, PsConstantExpr) and stop.constant.value < 0:
stop = fold(self._typify(upper_limit.clone() + stop))
elif stop is None:
raise ValueError(
"Cannot parse a slice with `stop == None` if no normalization limit is given"
)
elif stop is None:
raise ValueError(
"Cannot parse a slice with `stop == None` if no normalization limit is given"
)
assert stop is not None # for mypy
return start, stop, step
......
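The reordered branches above normalise negative (and missing) slice bounds against the given upper limit before the step is validated; in plain-Python terms, the intended arithmetic for constant bounds is roughly:

def normalize_slice(slc: slice, upper_limit: int) -> tuple[int, int, int]:
    # Illustrative only: mirrors what the AstFactory does for constant start/stop values.
    start = slc.start if slc.start is not None else 0
    stop = slc.stop if slc.stop is not None else upper_limit
    step = slc.step if slc.step is not None else 1
    if start < 0:
        start += upper_limit
    if stop < 0:
        stop += upper_limit
    return start, stop, step

assert normalize_slice(slice(2, -2), 64) == (2, 62, 1)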
......@@ -6,6 +6,7 @@ from functools import reduce
from operator import mul
from ...defaults import DEFAULTS
from ...config import _AUTO_TYPE, AUTO
from ...simp import AssignmentCollection
from ...field import Field, FieldType
......@@ -195,21 +196,25 @@ class FullIterationSpace(IterationSpace):
def dimensions(self):
"""The dimensions of this iteration space"""
return self._dimensions
@property
def counters(self) -> tuple[PsSymbol, ...]:
return tuple(dim.counter for dim in self._dimensions)
@property
def lower(self):
def lower(self) -> tuple[PsExpression, ...]:
"""Lower limits of each dimension"""
return (dim.start for dim in self._dimensions)
return tuple(dim.start for dim in self._dimensions)
@property
def upper(self):
def upper(self) -> tuple[PsExpression, ...]:
"""Upper limits of each dimension"""
return (dim.stop for dim in self._dimensions)
return tuple(dim.stop for dim in self._dimensions)
@property
def steps(self):
def steps(self) -> tuple[PsExpression, ...]:
"""Iteration steps of each dimension"""
return (dim.step for dim in self._dimensions)
return tuple(dim.step for dim in self._dimensions)
@property
def archetype_field(self) -> Field | None:
......@@ -412,7 +417,7 @@ def create_sparse_iteration_space(
def create_full_iteration_space(
ctx: KernelCreationContext,
assignments: AssignmentCollection,
ghost_layers: None | int | Sequence[int | tuple[int, int]] = None,
ghost_layers: None | _AUTO_TYPE | int | Sequence[int | tuple[int, int]] = None,
iteration_slice: None | int | slice | tuple[int | slice, ...] = None,
) -> IterationSpace:
assert not ctx.fields.index_fields
......@@ -452,16 +457,7 @@ def create_full_iteration_space(
# Otherwise, if an iteration slice was specified, use that
# Otherwise, use the inferred ghost layers
if ghost_layers is not None:
ctx.metadata["ghost_layers"] = ghost_layers
return FullIterationSpace.create_with_ghost_layers(
ctx, ghost_layers, archetype_field
)
elif iteration_slice is not None:
return FullIterationSpace.create_from_slice(
ctx, iteration_slice, archetype_field
)
else:
if ghost_layers is AUTO:
if len(domain_field_accesses) > 0:
inferred_gls = max(
[fa.required_ghost_layers for fa in domain_field_accesses]
......@@ -473,3 +469,15 @@ def create_full_iteration_space(
return FullIterationSpace.create_with_ghost_layers(
ctx, inferred_gls, archetype_field
)
elif ghost_layers is not None:
assert not isinstance(ghost_layers, _AUTO_TYPE)
ctx.metadata["ghost_layers"] = ghost_layers
return FullIterationSpace.create_with_ghost_layers(
ctx, ghost_layers, archetype_field
)
elif iteration_slice is not None:
return FullIterationSpace.create_from_slice(
ctx, iteration_slice, archetype_field
)
else:
assert False, "unreachable code"
......@@ -259,10 +259,12 @@ def create_cpu_kernel_function(
class GpuKernelFunction(KernelFunction):
"""Internal representation of a kernel function targeted at CUDA GPUs."""
def __init__(
self,
body: PsBlock,
threads_range: GpuThreadsRange,
threads_range: GpuThreadsRange | None,
target: Target,
name: str,
parameters: Sequence[KernelParameter],
......@@ -276,7 +278,8 @@ class GpuKernelFunction(KernelFunction):
self._threads_range = threads_range
@property
def threads_range(self) -> GpuThreadsRange:
def threads_range(self) -> GpuThreadsRange | None:
"""Object exposing the total size of the launch grid this kernel expects to be executed with."""
return self._threads_range
......@@ -284,14 +287,16 @@ def create_gpu_kernel_function(
ctx: KernelCreationContext,
platform: Platform,
body: PsBlock,
threads_range: GpuThreadsRange,
threads_range: GpuThreadsRange | None,
function_name: str,
target_spec: Target,
jit: JitBase,
):
undef_symbols = collect_undefined_symbols(body)
for threads in threads_range.num_work_items:
undef_symbols |= collect_undefined_symbols(threads)
if threads_range is not None:
for threads in threads_range.num_work_items:
undef_symbols |= collect_undefined_symbols(threads)
params = _get_function_params(ctx, undef_symbols)
req_headers = _get_headers(ctx, platform, body)
......
from warnings import warn
from ...types import constify
from ..exceptions import MaterializationError
from .generic_gpu import GenericGpu, GpuThreadsRange
......@@ -7,7 +9,7 @@ from ..kernelcreation import (
IterationSpace,
FullIterationSpace,
SparseIterationSpace,
AstFactory
AstFactory,
)
from ..kernelcreation.context import KernelCreationContext
......@@ -43,6 +45,7 @@ GRID_DIM = [
class CudaPlatform(GenericGpu):
"""Platform for CUDA-based GPUs."""
def __init__(
self, ctx: KernelCreationContext, indexing_cfg: GpuIndexingConfig | None = None
......@@ -57,7 +60,7 @@ class CudaPlatform(GenericGpu):
def materialize_iteration_space(
self, body: PsBlock, ispace: IterationSpace
) -> tuple[PsBlock, GpuThreadsRange]:
) -> tuple[PsBlock, GpuThreadsRange | None]:
if isinstance(ispace, FullIterationSpace):
return self._prepend_dense_translation(body, ispace)
elif isinstance(ispace, SparseIterationSpace):
......@@ -112,6 +115,11 @@ class CudaPlatform(GenericGpu):
case MathFunctions.Abs if dtype.width == 16:
cfunc = CFunction(" __habs", arg_types, dtype)
case _:
raise MaterializationError(
f"Cannot materialize call to function {func}"
)
call.function = cfunc
return call
......@@ -123,9 +131,21 @@ class CudaPlatform(GenericGpu):
def _prepend_dense_translation(
self, body: PsBlock, ispace: FullIterationSpace
) -> tuple[PsBlock, GpuThreadsRange]:
) -> tuple[PsBlock, GpuThreadsRange | None]:
dimensions = ispace.dimensions_in_loop_order()
launch_config = GpuThreadsRange.from_ispace(ispace)
if not self._cfg.manual_launch_grid:
try:
threads_range = GpuThreadsRange.from_ispace(ispace)
except MaterializationError as e:
warn(
str(e.args[0])
+ "\nIf this is intended, set `manual_launch_grid=True` in the code generator configuration.",
UserWarning,
)
threads_range = None
else:
threads_range = None
indexing_decls = []
conds = []
......@@ -146,6 +166,8 @@ class CudaPlatform(GenericGpu):
if not self._cfg.omit_range_check:
conds.append(PsLt(ctr, dim.stop))
indexing_decls = indexing_decls[::-1]
if conds:
condition: PsExpression = conds[0]
for cond in conds[1:]:
......@@ -155,7 +177,7 @@ class CudaPlatform(GenericGpu):
body.statements = indexing_decls + body.statements
ast = body
return ast, launch_config
return ast, threads_range
def _prepend_sparse_translation(
self, body: PsBlock, ispace: SparseIterationSpace
......
......@@ -10,6 +10,7 @@ from ..kernelcreation.iteration_space import (
SparseIterationSpace,
)
from .platform import Platform
from ..exceptions import MaterializationError
class GpuThreadsRange:
......@@ -48,6 +49,15 @@ class GpuThreadsRange:
@property
def dim(self) -> int:
return self._dim
def __str__(self) -> str:
rep = "GpuThreadsRange { "
rep += "; ".join(f"{x}: {w}" for x, w in zip("xyz", self._num_work_items))
rep += " }"
return rep
def _repr_html_(self) -> str:
return str(self)
@staticmethod
def _from_full_ispace(ispace: FullIterationSpace) -> GpuThreadsRange:
......@@ -56,6 +66,19 @@ class GpuThreadsRange:
raise NotImplementedError(
f"Cannot create a GPU threads range for an {len(dimensions)}-dimensional iteration space"
)
from ..ast.analysis import collect_undefined_symbols as collect
for dim in dimensions:
symbs = collect(dim.start) | collect(dim.stop) | collect(dim.step)
for ctr in ispace.counters:
if ctr in symbs:
raise MaterializationError(
"Unable to construct GPU threads range for iteration space: "
f"Limits of dimension counter {dim.counter.name} "
f"depend on another dimension's counter {ctr.name}"
)
work_items = [ispace.actual_iterations(dim) for dim in dimensions]
return GpuThreadsRange(work_items)
......@@ -63,6 +86,6 @@ class GpuThreadsRange:
class GenericGpu(Platform):
@abstractmethod
def materialize_iteration_space(
self, block: PsBlock, ispace: IterationSpace
) -> tuple[PsBlock, GpuThreadsRange]:
self, body: PsBlock, ispace: IterationSpace
) -> tuple[PsBlock, GpuThreadsRange | None]:
pass
......@@ -27,7 +27,7 @@ class Platform(ABC):
@abstractmethod
def materialize_iteration_space(
self, block: PsBlock, ispace: IterationSpace
self, body: PsBlock, ispace: IterationSpace
) -> PsBlock | tuple[PsBlock, Any]:
pass
......
......@@ -8,7 +8,7 @@ from ..kernelcreation import KernelCreationContext
from ..constants import PsConstant
from ..ast import PsAstNode
from ..ast.structural import PsLoop, PsBlock, PsDeclaration
from ..ast.expressions import PsExpression
from ..ast.expressions import PsExpression, PsTernary, PsGt
from ..ast.vector import PsVecBroadcast
from ..ast.analysis import collect_undefined_symbols
......@@ -18,7 +18,7 @@ from .rewrite import substitute_symbols
class LoopVectorizer:
"""Vectorize loops.
The loop vectorizer provides methods to vectorize single loops inside an AST
using a given number of vector lanes.
During vectorization, the loop body is transformed using the `AstVectorizer`,
......@@ -64,29 +64,26 @@ class LoopVectorizer:
@overload
def vectorize_select_loops(
self, node: PsBlock, predicate: Callable[[PsLoop], bool]
) -> PsBlock:
...
) -> PsBlock: ...
@overload
def vectorize_select_loops(
self, node: PsLoop, predicate: Callable[[PsLoop], bool]
) -> PsLoop | PsBlock:
...
) -> PsLoop | PsBlock: ...
@overload
def vectorize_select_loops(
self, node: PsAstNode, predicate: Callable[[PsLoop], bool]
) -> PsAstNode:
...
) -> PsAstNode: ...
def vectorize_select_loops(
self, node: PsAstNode, predicate: Callable[[PsLoop], bool]
) -> PsAstNode:
"""Select and vectorize loops from a syntax tree according to a predicate.
Finds each loop inside the given subtree and evaluates ``predicate`` on it.
If ``predicate(loop)`` evaluates to `True`, the loop is vectorized.
Loops nested inside a vectorized loop will not be processed.
Args:
......@@ -139,7 +136,7 @@ class LoopVectorizer:
# Generate vectorized loop body
simd_body = self._vectorize_ast(loop.body, vc)
if vector_ctr in collect_undefined_symbols(simd_body):
simd_body.statements.insert(0, vector_counter_decl)
......@@ -186,20 +183,31 @@ class LoopVectorizer:
trailing_start = self._ctx.get_new_symbol(
f"__{scalar_ctr.name}_trailing_start", scalar_ctr.get_dtype()
)
trailing_start_decl = self._type_fold(
PsDeclaration(
PsExpression.make(trailing_start),
(
PsTernary(
# If at least one vectorized iteration took place...
PsGt(
PsExpression.make(simd_stop),
simd_start.clone(),
),
# start from the smallest non-valid multiple of simd_step, offset from simd_start
(
PsExpression.make(simd_stop)
- simd_start.clone()
- PsExpression.make(PsConstant(1))
(
PsExpression.make(simd_stop)
- simd_start.clone()
- PsExpression.make(PsConstant(1))
)
/ PsExpression.make(simd_step)
+ PsExpression.make(PsConstant(1))
)
/ PsExpression.make(simd_step)
+ PsExpression.make(PsConstant(1))
)
* PsExpression.make(simd_step)
+ simd_start.clone(),
* PsExpression.make(simd_step)
+ simd_start.clone(),
# otherwise start at zero
simd_start.clone(),
),
)
)
......
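The cleaned-up overloads above all funnel into the predicate-based `vectorize_select_loops`; a hedged sketch of driving it (the constructor signature and the attribute path used in the predicate are assumptions, only the predicate-based selection itself appears in this diff):

vectorizer = LoopVectorizer(ctx, 4)  # assumption: kernel creation context and number of SIMD lanes
ast = vectorizer.vectorize_select_loops(
    ast,
    # assumption: the predicate can identify the innermost loop via its counter symbol's name
    lambda loop: loop.counter.symbol.name == "ctr_0",
)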
......@@ -314,7 +314,7 @@ class BoundaryHandling:
def _create_boundary_kernel(self, symbolic_field, symbolic_index_field, boundary_obj):
return create_boundary_kernel(symbolic_field, symbolic_index_field, self.stencil, boundary_obj,
target=self._target,) # cpu_openmp=self._openmp) TODO: replace
target=self._target, cpu_openmp=self._openmp)
def _create_index_fields(self):
dh = self._data_handling
......
......@@ -28,6 +28,19 @@ class PsOptionsError(Exception):
"""Indicates an option clash in the `CreateKernelConfig`."""
class _AUTO_TYPE:
...
AUTO = _AUTO_TYPE()
"""Special value that can be passed to some options for invoking automatic behaviour.
Currently, these options permit `AUTO`:
- `ghost_layers <CreateKernelConfig.ghost_layers>`
"""
@dataclass
class OpenMpConfig:
"""Parameters controlling kernel parallelization using OpenMP."""
......@@ -182,6 +195,14 @@ class GpuIndexingConfig:
block_size: tuple[int, int, int] | None = None
"""Desired block size for the execution of GPU kernels. May be overridden later by the runtime system."""
manual_launch_grid: bool = False
"""Always require a manually specified launch grid when running this kernel.
If set to `True`, the code generator will not attempt to infer the size of
the launch grid from the kernel.
The launch grid will then have to be specified manually at runtime.
"""
sycl_automatic_block_size: bool = True
"""If set to `True` while generating for `Target.SYCL`, let the SYCL runtime decide on the block size.
......@@ -213,32 +234,43 @@ class CreateKernelConfig:
function_name: str = "kernel"
"""Name of the generated function"""
ghost_layers: None | int | Sequence[int | tuple[int, int]] = None
ghost_layers: None | _AUTO_TYPE | int | Sequence[int | tuple[int, int]] = None
"""Specifies the number of ghost layers of the iteration region.
Options:
- `None`: Required ghost layers are inferred from field accesses
- :py:data:`AUTO <pystencils.config.AUTO>`: Required ghost layers are inferred from field accesses
- `int`: A uniform number of ghost layers in each spatial coordinate is applied
- ``Sequence[int, tuple[int, int]]``: Ghost layers are specified for each spatial coordinate.
In each coordinate, a single integer specifies the ghost layers at both the lower and upper iteration limit,
while a pair of integers specifies the lower and upper ghost layers separately.
When manually specifying ghost layers, it is the user's responsibility to avoid out-of-bounds memory accesses.
If ``ghost_layers=None`` is specified, the iteration region may otherwise be set using the `iteration_slice` option.
.. note::
At most one of `ghost_layers`, `iteration_slice`, and `index_field` may be set.
"""
iteration_slice: None | Sequence[slice] = None
iteration_slice: None | int | slice | tuple[int | slice] = None
"""Specifies the kernel's iteration slice.
`iteration_slice` may only be set if ``ghost_layers=None``.
If it is set, a slice must be specified for each spatial coordinate.
TODO: Specification of valid slices and their behaviour
Example:
>>> cfg = CreateKernelConfig(
... iteration_slice=ps.make_slice[3:14, 2:-2]
... )
>>> cfg.iteration_slice
(slice(3, 14, None), slice(2, -2, None))
.. note::
At most one of `ghost_layers`, `iteration_slice`, and `index_field` may be set.
"""
index_field: Field | None = None
"""Index field for a sparse kernel.
If this option is set, a sparse kernel with the given field as index field will be generated.
.. note::
At most one of `ghost_layers`, `iteration_slice`, and `index_field` may be set.
"""
"""Data Types"""
......
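Taken together, the new configuration options shown above can be combined roughly like this (a sketch; `AUTO`, `CreateKernelConfig`, and `GpuIndexingConfig` are taken from this diff, while the import path and the concrete values are assumptions):

import pystencils as ps
from pystencils.config import CreateKernelConfig, GpuIndexingConfig, AUTO

# Infer the required ghost layers from the field accesses:
cfg = CreateKernelConfig(ghost_layers=AUTO)

# ... or restrict the iteration region explicitly instead:
cfg = CreateKernelConfig(ghost_layers=None, iteration_slice=ps.make_slice[3:14, 2:-2])

# Require the launch grid to be specified manually at call time (see the cupy wrapper above):
gpu_cfg = GpuIndexingConfig(block_size=(64, 2, 1), manual_launch_grid=True)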
......@@ -291,7 +291,10 @@ class SerialDataHandling(DataHandling):
def synchronization_function(self, names, stencil=None, target=None, functor=None, **_):
if target is None:
target = self.default_target
assert target in (Target.CPU, Target.GPU)
if not (target.is_cpu() or target == Target.CUDA):
raise ValueError(f"Unsupported target: {target}")
if not hasattr(names, '__len__') or type(names) is str:
names = [names]
......@@ -325,7 +328,7 @@ class SerialDataHandling(DataHandling):
values_per_cell = values_per_cell[0]
if len(filtered_stencil) > 0:
if target == Target.CPU:
if target.is_cpu():
if functor is None:
from pystencils.slicing import get_periodic_boundary_functor
functor = get_periodic_boundary_functor
......
......@@ -988,24 +988,35 @@ def create_numpy_array_with_layout(shape, layout, alignment=False, byte_offset=0
def spatial_layout_string_to_tuple(layout_str: str, dim: int) -> Tuple[int, ...]:
if layout_str in ('fzyx', 'zyxf'):
assert dim <= 3
return tuple(reversed(range(dim)))
if dim <= 0:
raise ValueError("Dimensionality must be positive")
layout_str = layout_str.lower()
if layout_str in ('fzyx', 'f', 'reverse_numpy', 'SoA'):
if layout_str in ('fzyx', 'zyxf', 'soa', 'aos'):
if dim > 3:
raise ValueError(f"Invalid spatial dimensionality for layout descriptor {layout_str}: May be at most 3.")
return tuple(reversed(range(dim)))
if layout_str in ('f', 'reverse_numpy'):
return tuple(reversed(range(dim)))
elif layout_str in ('c', 'numpy', 'AoS'):
elif layout_str in ('c', 'numpy'):
return tuple(range(dim))
raise ValueError("Unknown layout descriptor " + layout_str)
def layout_string_to_tuple(layout_str, dim):
if dim <= 0:
raise ValueError("Dimensionality must be positive")
layout_str = layout_str.lower()
if layout_str == 'fzyx' or layout_str == 'soa':
assert dim <= 4
if dim > 4:
raise ValueError(f"Invalid total dimensionality for layout descriptor {layout_str}: May be at most 4.")
return tuple(reversed(range(dim)))
elif layout_str == 'zyxf' or layout_str == 'aos':
assert dim <= 4
if dim > 4:
raise ValueError(f"Invalid total dimensionality for layout descriptor {layout_str}: May be at most 4.")
return tuple(reversed(range(dim - 1))) + (dim - 1,)
elif layout_str == 'f' or layout_str == 'reverse_numpy':
return tuple(reversed(range(dim)))
......
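The tightened checks above now raise `ValueError` for oversized dimensionalities instead of asserting; the layout mappings themselves are unchanged and follow directly from the code (assuming the functions remain importable from `pystencils.field`):

spatial_layout_string_to_tuple('fzyx', dim=3)  # -> (2, 1, 0)
spatial_layout_string_to_tuple('c', dim=3)     # -> (0, 1, 2)
layout_string_to_tuple('fzyx', dim=4)          # -> (3, 2, 1, 0)
layout_string_to_tuple('zyxf', dim=4)          # -> (2, 1, 0, 3)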