diff --git a/.flake8 b/.flake8
index 3f946922a5f68e39bc02c65d4a5f4ffdb10e05e1..05758405a65664809b3a0a71893554a1af8e3381 100644
--- a/.flake8
+++ b/.flake8
@@ -4,4 +4,4 @@ exclude=src/pystencils/jupyter.py,
         src/pystencils/plot.py
         src/pystencils/session.py
         src/pystencils/old
-ignore = W293 W503 W291 C901 E741
+ignore = W293 W503 W291 C901 E741 E704
diff --git a/conftest.py b/conftest.py
index 0c6c49153770bc40c90cb09e990d12561f7bad7a..4e8e2b73ab49ff1f881e755b19f9a46e07c0837e 100644
--- a/conftest.py
+++ b/conftest.py
@@ -203,3 +203,8 @@ else:
                 return IPyNbFile.from_parent(fspath=path, parent=parent)
             else:
                 return IPyNbFile(path, parent)
+
+
+#   Fixtures
+
+from tests.fixtures import *
diff --git a/src/pystencils/backend/jit/gpu_cupy.py b/src/pystencils/backend/jit/gpu_cupy.py
index 563a9c06a7261343e6435737c47854a55d54a05e..1dd18767160a626ff7972ebb78f83bb3e64a1efc 100644
--- a/src/pystencils/backend/jit/gpu_cupy.py
+++ b/src/pystencils/backend/jit/gpu_cupy.py
@@ -41,6 +41,7 @@ class CupyKernelWrapper(KernelWrapper):
         self._kfunc: GpuKernelFunction = kfunc
         self._raw_kernel = raw_kernel
         self._block_size = block_size
+        self._num_blocks: tuple[int, int, int] | None = None
         self._args_cache: dict[Any, tuple] = dict()
 
     @property
@@ -59,6 +60,14 @@ class CupyKernelWrapper(KernelWrapper):
     def block_size(self, bs: tuple[int, int, int]):
         self._block_size = bs
 
+    @property
+    def num_blocks(self) -> tuple[int, int, int] | None:
+        return self._num_blocks
+
+    @num_blocks.setter
+    def num_blocks(self, nb: tuple[int, int, int] | None):
+        self._num_blocks = nb
+
     def __call__(self, **kwargs: Any):
         kernel_args, launch_grid = self._get_cached_args(**kwargs)
         device = self._get_device(kernel_args)
@@ -72,7 +81,7 @@ class CupyKernelWrapper(KernelWrapper):
         return devices.pop()
 
     def _get_cached_args(self, **kwargs):
-        key = (self._block_size,) + tuple((k, id(v)) for k, v in kwargs.items())
+        key = (self._block_size, self._num_blocks) + tuple((k, id(v)) for k, v in kwargs.items())
 
         if key not in self._args_cache:
             args = self._get_args(**kwargs)
@@ -185,25 +194,36 @@ class CupyKernelWrapper(KernelWrapper):
 
         symbolic_threads_range = self._kfunc.threads_range
 
-        threads_range: list[int] = [
-            evaluate_expression(expr, valuation)
-            for expr in symbolic_threads_range.num_work_items
-        ]
+        if self._num_blocks is not None:
+            launch_grid = LaunchGrid(self._num_blocks, self._block_size)
 
-        if symbolic_threads_range.dim < 3:
-            threads_range += [1] * (3 - symbolic_threads_range.dim)
+        elif symbolic_threads_range is not None:
+            threads_range: list[int] = [
+                evaluate_expression(expr, valuation)
+                for expr in symbolic_threads_range.num_work_items
+            ]
 
-        def div_ceil(a, b):
-            return a // b if a % b == 0 else a // b + 1
+            if symbolic_threads_range.dim < 3:
+                threads_range += [1] * (3 - symbolic_threads_range.dim)
 
-        #   TODO: Refine this?
-        grid_size = tuple(
-            div_ceil(threads, tpb)
-            for threads, tpb in zip(threads_range, self._block_size)
-        )
-        assert len(grid_size) == 3
+            def div_ceil(a, b):
+                return a // b if a % b == 0 else a // b + 1
+
+            #   TODO: Refine this?
+            num_blocks = tuple(
+                div_ceil(threads, tpb)
+                for threads, tpb in zip(threads_range, self._block_size)
+            )
+            assert len(num_blocks) == 3
+
+            launch_grid = LaunchGrid(num_blocks, self._block_size)
 
-        launch_grid = LaunchGrid(grid_size, self._block_size)
+        else:
+            raise JitError(
+                "Unable to determine launch grid for GPU kernel invocation: "
+                "No manual grid size was specified, and the number of threads could not "
+                "be determined automatically."
+            )
 
         return tuple(args), launch_grid
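
For context, a minimal usage sketch of the new `num_blocks` property together with the `manual_launch_grid` option introduced in the config.py hunk below (this mirrors the new `test_triangle_pattern` test); it assumes cupy and a CUDA device are available, and the field name `f` and the sizes are illustrative:

import cupy as cp
import pystencils as ps

f = ps.fields("f: float64[2D]")
cfg = ps.CreateKernelConfig(
    target=ps.Target.CUDA,
    gpu_indexing=ps.GpuIndexingConfig(manual_launch_grid=True),
)
kernel = ps.create_kernel(ps.Assignment(f.center(), 1.0), cfg).compile()

# With a manual launch grid, block size and grid size must be set on the wrapper before calling it
kernel.block_size = (16, 16, 1)
kernel.num_blocks = (1, 1, 1)

f_arr = cp.zeros((16, 16))
kernel(f=f_arr)
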
 
diff --git a/src/pystencils/backend/kernelcreation/ast_factory.py b/src/pystencils/backend/kernelcreation/ast_factory.py
index 56bcea11db40aa4b71d9570952958791e7ef35da..3b07ef862d80bcc5058975cd2083fbe1704f2d82 100644
--- a/src/pystencils/backend/kernelcreation/ast_factory.py
+++ b/src/pystencils/backend/kernelcreation/ast_factory.py
@@ -139,6 +139,13 @@ class AstFactory:
                 self._typify(self.parse_index(iter_slice) + self.parse_index(1))
             )
             step = self.parse_index(1)
+
+            if normalize_to is not None:
+                upper_limit = self.parse_index(normalize_to)
+                if isinstance(start, PsConstantExpr) and start.constant.value < 0:
+                    start = fold(self._typify(upper_limit.clone() + start))
+                    stop = fold(self._typify(upper_limit.clone() + stop))
+
         else:
             start = self._parse_any_index(
                 iter_slice.start if iter_slice.start is not None else 0
@@ -157,21 +164,21 @@ class AstFactory:
                     f"Invalid value for `slice.step`: {step.constant.value}"
                 )
 
-        if normalize_to is not None:
-            upper_limit = self.parse_index(normalize_to)
-            if isinstance(start, PsConstantExpr) and start.constant.value < 0:
-                start = fold(self._typify(upper_limit.clone() + start))
+            if normalize_to is not None:
+                upper_limit = self.parse_index(normalize_to)
+                if isinstance(start, PsConstantExpr) and start.constant.value < 0:
+                    start = fold(self._typify(upper_limit.clone() + start))
 
-            if stop is None:
-                stop = upper_limit
-            elif isinstance(stop, PsConstantExpr) and stop.constant.value < 0:
-                stop = fold(self._typify(upper_limit.clone() + stop))
+                if stop is None:
+                    stop = upper_limit
+                elif isinstance(stop, PsConstantExpr) and stop.constant.value < 0:
+                    stop = fold(self._typify(upper_limit.clone() + stop))
+
+            elif stop is None:
+                raise ValueError(
+                    "Cannot parse a slice with `stop == None` if no normalization limit is given"
+                )
 
-        elif stop is None:
-            raise ValueError(
-                "Cannot parse a slice with `stop == None` if no normalization limit is given"
-            )
-        
         assert stop is not None  # for mypy
 
         return start, stop, step
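
To illustrate the relocated normalization, a sketch of what negative singular indices now resolve to, mirroring the new `test_negative_singular_slices` test further down; the field `f` is illustrative:

from pystencils import Field
from pystencils.backend.kernelcreation import KernelCreationContext, FullIterationSpace

ctx = KernelCreationContext()
f = Field.create_generic("f", spatial_dimensions=2, layout="fzyx")
ctx.add_field(f)

# Singular negative indices are normalized against the archetype field's shape:
# dimension 0 covers the single cell shape[0] - 2 (stop is shape[0] - 1),
# dimension 1 covers the last cell shape[1] - 1 (stop is shape[1]).
ispace = FullIterationSpace.create_from_slice(ctx, (-2, -1), f)
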
diff --git a/src/pystencils/backend/kernelcreation/iteration_space.py b/src/pystencils/backend/kernelcreation/iteration_space.py
index 50a0e31400e2c6793c7b2bd09108394373cc4052..9df9883ce67d6d856335a7a7a9537f829b7df11e 100644
--- a/src/pystencils/backend/kernelcreation/iteration_space.py
+++ b/src/pystencils/backend/kernelcreation/iteration_space.py
@@ -6,6 +6,7 @@ from functools import reduce
 from operator import mul
 
 from ...defaults import DEFAULTS
+from ...config import _AUTO_TYPE, AUTO
 from ...simp import AssignmentCollection
 from ...field import Field, FieldType
 
@@ -195,21 +196,25 @@ class FullIterationSpace(IterationSpace):
     def dimensions(self):
         """The dimensions of this iteration space"""
         return self._dimensions
+    
+    @property
+    def counters(self) -> tuple[PsSymbol, ...]:
+        return tuple(dim.counter for dim in self._dimensions)
 
     @property
-    def lower(self):
+    def lower(self) -> tuple[PsExpression, ...]:
         """Lower limits of each dimension"""
-        return (dim.start for dim in self._dimensions)
+        return tuple(dim.start for dim in self._dimensions)
 
     @property
-    def upper(self):
+    def upper(self) -> tuple[PsExpression, ...]:
         """Upper limits of each dimension"""
-        return (dim.stop for dim in self._dimensions)
+        return tuple(dim.stop for dim in self._dimensions)
 
     @property
-    def steps(self):
+    def steps(self) -> tuple[PsExpression, ...]:
         """Iteration steps of each dimension"""
-        return (dim.step for dim in self._dimensions)
+        return tuple(dim.step for dim in self._dimensions)
 
     @property
     def archetype_field(self) -> Field | None:
@@ -412,7 +417,7 @@ def create_sparse_iteration_space(
 def create_full_iteration_space(
     ctx: KernelCreationContext,
     assignments: AssignmentCollection,
-    ghost_layers: None | int | Sequence[int | tuple[int, int]] = None,
+    ghost_layers: None | _AUTO_TYPE | int | Sequence[int | tuple[int, int]] = None,
     iteration_slice: None | int | slice | tuple[int | slice, ...] = None,
 ) -> IterationSpace:
     assert not ctx.fields.index_fields
@@ -452,16 +457,7 @@ def create_full_iteration_space(
     # Otherwise, if an iteration slice was specified, use that
     # Otherwise, use the inferred ghost layers
 
-    if ghost_layers is not None:
-        ctx.metadata["ghost_layers"] = ghost_layers
-        return FullIterationSpace.create_with_ghost_layers(
-            ctx, ghost_layers, archetype_field
-        )
-    elif iteration_slice is not None:
-        return FullIterationSpace.create_from_slice(
-            ctx, iteration_slice, archetype_field
-        )
-    else:
+    if ghost_layers is AUTO:
         if len(domain_field_accesses) > 0:
             inferred_gls = max(
                 [fa.required_ghost_layers for fa in domain_field_accesses]
@@ -473,3 +469,15 @@ def create_full_iteration_space(
         return FullIterationSpace.create_with_ghost_layers(
             ctx, inferred_gls, archetype_field
         )
+    elif ghost_layers is not None:
+        assert not isinstance(ghost_layers, _AUTO_TYPE)
+        ctx.metadata["ghost_layers"] = ghost_layers
+        return FullIterationSpace.create_with_ghost_layers(
+            ctx, ghost_layers, archetype_field
+        )
+    elif iteration_slice is not None:
+        return FullIterationSpace.create_from_slice(
+            ctx, iteration_slice, archetype_field
+        )
+    else:
+        assert False, "unreachable code"
diff --git a/src/pystencils/backend/kernelfunction.py b/src/pystencils/backend/kernelfunction.py
index 2e08978fb239c72d86da44f28d81532bbc6e917f..afd54c1107e13e73d2141cd9b0d7da2e97f0ef3e 100644
--- a/src/pystencils/backend/kernelfunction.py
+++ b/src/pystencils/backend/kernelfunction.py
@@ -262,7 +262,7 @@ class GpuKernelFunction(KernelFunction):
     def __init__(
         self,
         body: PsBlock,
-        threads_range: GpuThreadsRange,
+        threads_range: GpuThreadsRange | None,
         target: Target,
         name: str,
         parameters: Sequence[KernelParameter],
@@ -276,7 +276,7 @@ class GpuKernelFunction(KernelFunction):
         self._threads_range = threads_range
 
     @property
-    def threads_range(self) -> GpuThreadsRange:
+    def threads_range(self) -> GpuThreadsRange | None:
         return self._threads_range
 
 
@@ -284,14 +284,16 @@ def create_gpu_kernel_function(
     ctx: KernelCreationContext,
     platform: Platform,
     body: PsBlock,
-    threads_range: GpuThreadsRange,
+    threads_range: GpuThreadsRange | None,
     function_name: str,
     target_spec: Target,
     jit: JitBase,
 ):
     undef_symbols = collect_undefined_symbols(body)
-    for threads in threads_range.num_work_items:
-        undef_symbols |= collect_undefined_symbols(threads)
+
+    if threads_range is not None:
+        for threads in threads_range.num_work_items:
+            undef_symbols |= collect_undefined_symbols(threads)
 
     params = _get_function_params(ctx, undef_symbols)
     req_headers = _get_headers(ctx, platform, body)
diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index 323dcc5a9b306990a11f85037f305794c8729625..7ebbd4fd4762821d3beb6f0cbbc04a33f775ecdf 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -1,3 +1,5 @@
+from warnings import warn
+
 from ...types import constify
 from ..exceptions import MaterializationError
 from .generic_gpu import GenericGpu, GpuThreadsRange
@@ -7,7 +9,7 @@ from ..kernelcreation import (
     IterationSpace,
     FullIterationSpace,
     SparseIterationSpace,
-    AstFactory
+    AstFactory,
 )
 
 from ..kernelcreation.context import KernelCreationContext
@@ -43,6 +45,7 @@ GRID_DIM = [
 
 
 class CudaPlatform(GenericGpu):
+    """Platform for CUDA-based GPUs."""
 
     def __init__(
         self, ctx: KernelCreationContext, indexing_cfg: GpuIndexingConfig | None = None
@@ -57,7 +60,7 @@ class CudaPlatform(GenericGpu):
 
     def materialize_iteration_space(
         self, body: PsBlock, ispace: IterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange]:
+    ) -> tuple[PsBlock, GpuThreadsRange | None]:
         if isinstance(ispace, FullIterationSpace):
             return self._prepend_dense_translation(body, ispace)
         elif isinstance(ispace, SparseIterationSpace):
@@ -112,6 +115,11 @@ class CudaPlatform(GenericGpu):
                 case MathFunctions.Abs if dtype.width == 16:
                     cfunc = CFunction(" __habs", arg_types, dtype)
 
+                case _:
+                    raise MaterializationError(
+                        f"Cannot materialize call to function {func}"
+                    )
+
             call.function = cfunc
             return call
 
@@ -123,9 +131,21 @@ class CudaPlatform(GenericGpu):
 
     def _prepend_dense_translation(
         self, body: PsBlock, ispace: FullIterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange]:
+    ) -> tuple[PsBlock, GpuThreadsRange | None]:
         dimensions = ispace.dimensions_in_loop_order()
-        launch_config = GpuThreadsRange.from_ispace(ispace)
+
+        if not self._cfg.manual_launch_grid:
+            try:
+                threads_range = GpuThreadsRange.from_ispace(ispace)
+            except MaterializationError as e:
+                warn(
+                    str(e.args[0])
+                    + "\nIf this is intended, set `manual_launch_grid=True` in the code generator configuration.",
+                    UserWarning,
+                )
+                threads_range = None
+        else:
+            threads_range = None
 
         indexing_decls = []
         conds = []
@@ -146,6 +166,8 @@ class CudaPlatform(GenericGpu):
             if not self._cfg.omit_range_check:
                 conds.append(PsLt(ctr, dim.stop))
 
+        indexing_decls = indexing_decls[::-1]
+
         if conds:
             condition: PsExpression = conds[0]
             for cond in conds[1:]:
@@ -155,7 +177,7 @@ class CudaPlatform(GenericGpu):
             body.statements = indexing_decls + body.statements
             ast = body
 
-        return ast, launch_config
+        return ast, threads_range
 
     def _prepend_sparse_translation(
         self, body: PsBlock, ispace: SparseIterationSpace
diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py
index 774b9405cd04b8dc6489cd6b6ae36e4aa563f157..4c990429d8fd33f800dbf4f1f7797b7520c93b39 100644
--- a/src/pystencils/backend/platforms/generic_gpu.py
+++ b/src/pystencils/backend/platforms/generic_gpu.py
@@ -10,6 +10,7 @@ from ..kernelcreation.iteration_space import (
     SparseIterationSpace,
 )
 from .platform import Platform
+from ..exceptions import MaterializationError
 
 
 class GpuThreadsRange:
@@ -56,6 +57,19 @@ class GpuThreadsRange:
             raise NotImplementedError(
                 f"Cannot create a GPU threads range for an {len(dimensions)}-dimensional iteration space"
             )
+        
+        from ..ast.analysis import collect_undefined_symbols as collect
+
+        for dim in dimensions:
+            symbs = collect(dim.start) | collect(dim.stop) | collect(dim.step)
+            for ctr in ispace.counters:
+                if ctr in symbs:
+                    raise MaterializationError(
+                        "Unable to construct GPU threads range for iteration space: "
+                        f"Limits of dimension counter {dim.counter.name} "
+                        f"depend on another dimension's counter {ctr.name}"
+                    )
+
         work_items = [ispace.actual_iterations(dim) for dim in dimensions]
         return GpuThreadsRange(work_items)
 
@@ -63,6 +77,6 @@ class GpuThreadsRange:
 class GenericGpu(Platform):
     @abstractmethod
     def materialize_iteration_space(
-        self, block: PsBlock, ispace: IterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange]:
+        self, body: PsBlock, ispace: IterationSpace
+    ) -> tuple[PsBlock, GpuThreadsRange | None]:
         pass
diff --git a/src/pystencils/backend/platforms/platform.py b/src/pystencils/backend/platforms/platform.py
index cab4d0a7b0143eabd4e40602b45ef2f66592ee8c..2c7ee1c5f4750eac0375bc31a3f44b9eea50642b 100644
--- a/src/pystencils/backend/platforms/platform.py
+++ b/src/pystencils/backend/platforms/platform.py
@@ -27,7 +27,7 @@ class Platform(ABC):
 
     @abstractmethod
     def materialize_iteration_space(
-        self, block: PsBlock, ispace: IterationSpace
+        self, body: PsBlock, ispace: IterationSpace
     ) -> PsBlock | tuple[PsBlock, Any]:
         pass
 
diff --git a/src/pystencils/backend/transformations/loop_vectorizer.py b/src/pystencils/backend/transformations/loop_vectorizer.py
index c89698193d9764996cfab7a5c5fd7e70358fbac6..e1e4fea502c08de86e13de5e3c251f1b7a7d0ee6 100644
--- a/src/pystencils/backend/transformations/loop_vectorizer.py
+++ b/src/pystencils/backend/transformations/loop_vectorizer.py
@@ -8,7 +8,7 @@ from ..kernelcreation import KernelCreationContext
 from ..constants import PsConstant
 from ..ast import PsAstNode
 from ..ast.structural import PsLoop, PsBlock, PsDeclaration
-from ..ast.expressions import PsExpression
+from ..ast.expressions import PsExpression, PsTernary, PsGt
 from ..ast.vector import PsVecBroadcast
 from ..ast.analysis import collect_undefined_symbols
 
@@ -18,7 +18,7 @@ from .rewrite import substitute_symbols
 
 class LoopVectorizer:
     """Vectorize loops.
-    
+
     The loop vectorizer provides methods to vectorize single loops inside an AST
     using a given number of vector lanes.
     During vectorization, the loop body is transformed using the `AstVectorizer`,
@@ -64,29 +64,26 @@ class LoopVectorizer:
     @overload
     def vectorize_select_loops(
         self, node: PsBlock, predicate: Callable[[PsLoop], bool]
-    ) -> PsBlock:
-        ...
+    ) -> PsBlock: ...
 
     @overload
     def vectorize_select_loops(
         self, node: PsLoop, predicate: Callable[[PsLoop], bool]
-    ) -> PsLoop | PsBlock:
-        ...
+    ) -> PsLoop | PsBlock: ...
 
     @overload
     def vectorize_select_loops(
         self, node: PsAstNode, predicate: Callable[[PsLoop], bool]
-    ) -> PsAstNode:
-        ...
+    ) -> PsAstNode: ...
 
     def vectorize_select_loops(
         self, node: PsAstNode, predicate: Callable[[PsLoop], bool]
     ) -> PsAstNode:
         """Select and vectorize loops from a syntax tree according to a predicate.
-        
+
         Finds each loop inside a subtree and evaluates ``predicate`` on them.
         If ``predicate(loop)`` evaluates to `True`, the loop is vectorized.
-        
+
         Loops nested inside a vectorized loop will not be processed.
 
         Args:
@@ -139,7 +136,7 @@ class LoopVectorizer:
 
         #   Generate vectorized loop body
         simd_body = self._vectorize_ast(loop.body, vc)
-        
+
         if vector_ctr in collect_undefined_symbols(simd_body):
             simd_body.statements.insert(0, vector_counter_decl)
 
@@ -186,20 +183,31 @@ class LoopVectorizer:
                 trailing_start = self._ctx.get_new_symbol(
                     f"__{scalar_ctr.name}_trailing_start", scalar_ctr.get_dtype()
                 )
+
                 trailing_start_decl = self._type_fold(
                     PsDeclaration(
                         PsExpression.make(trailing_start),
-                        (
+                        PsTernary(
+                            #   If at least one vectorized iteration took place...
+                            PsGt(
+                                PsExpression.make(simd_stop),
+                                simd_start.clone(),
+                            ),
+                            #   start from the smallest non-valid multiple of simd_step, offset from simd_start
                             (
-                                PsExpression.make(simd_stop)
-                                - simd_start.clone()
-                                - PsExpression.make(PsConstant(1))
+                                (
+                                    PsExpression.make(simd_stop)
+                                    - simd_start.clone()
+                                    - PsExpression.make(PsConstant(1))
+                                )
+                                / PsExpression.make(simd_step)
+                                + PsExpression.make(PsConstant(1))
                             )
-                            / PsExpression.make(simd_step)
-                            + PsExpression.make(PsConstant(1))
-                        )
-                        * PsExpression.make(simd_step)
-                        + simd_start.clone(),
+                            * PsExpression.make(simd_step)
+                            + simd_start.clone(),
+                            #   otherwise, start at simd_start (no vectorized iterations took place)
+                            simd_start.clone(),
+                        ),
                     )
                 )
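
As a plain-integer cross-check of the expression constructed above (an illustration, not part of the patch): the ternary yields the first counter value past the vectorized iterations, or falls back to `simd_start` when none took place.

def trailing_start(simd_start: int, simd_stop: int, simd_step: int) -> int:
    # Same arithmetic as the PsTernary above, written with Python integers
    if simd_stop > simd_start:
        return ((simd_stop - simd_start - 1) // simd_step + 1) * simd_step + simd_start
    return simd_start

assert trailing_start(0, 10, 4) == 12  # ceil(10 / 4) = 3 vector strides beyond 0
assert trailing_start(5, 5, 4) == 5    # no vectorized iteration: start at simd_start
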
 
diff --git a/src/pystencils/boundaries/boundaryhandling.py b/src/pystencils/boundaries/boundaryhandling.py
index 5c0869c2923a5091471fe60b4a32ee9a83aee1fa..fe8dd7d0059940841277f954cc322a42d2d744b6 100644
--- a/src/pystencils/boundaries/boundaryhandling.py
+++ b/src/pystencils/boundaries/boundaryhandling.py
@@ -314,7 +314,7 @@ class BoundaryHandling:
 
     def _create_boundary_kernel(self, symbolic_field, symbolic_index_field, boundary_obj):
         return create_boundary_kernel(symbolic_field, symbolic_index_field, self.stencil, boundary_obj,
-                                      target=self._target,)  # cpu_openmp=self._openmp) TODO: replace
+                                      target=self._target, cpu_openmp=self._openmp)
 
     def _create_index_fields(self):
         dh = self._data_handling
diff --git a/src/pystencils/config.py b/src/pystencils/config.py
index 43918d386696c6c1f2a6287554a7520643557377..c08ddc16198627adba97169e02724371eca62158 100644
--- a/src/pystencils/config.py
+++ b/src/pystencils/config.py
@@ -28,6 +28,14 @@ class PsOptionsError(Exception):
     """Indicates an option clash in the `CreateKernelConfig`."""
 
 
+class _AUTO_TYPE:
+    ...
+
+
+AUTO = _AUTO_TYPE()
+"""Special value that can be passed to some options for invoking automatic behaviour."""
+
+
 @dataclass
 class OpenMpConfig:
     """Parameters controlling kernel parallelization using OpenMP."""
@@ -182,6 +190,14 @@ class GpuIndexingConfig:
     block_size: tuple[int, int, int] | None = None
     """Desired block size for the execution of GPU kernels. May be overridden later by the runtime system."""
 
+    manual_launch_grid: bool = False
+    """Always require a manually specified launch grid when running this kernel.
+    
+    If set to `True`, the code generator will not attempt to infer the size of
+    the launch grid from the kernel.
+    The launch grid will then have to be specified manually at runtime.
+    """
+
     sycl_automatic_block_size: bool = True
     """If set to `True` while generating for `Target.SYCL`, let the SYCL runtime decide on the block size.
 
@@ -213,32 +229,43 @@ class CreateKernelConfig:
     function_name: str = "kernel"
     """Name of the generated function"""
 
-    ghost_layers: None | int | Sequence[int | tuple[int, int]] = None
+    ghost_layers: None | _AUTO_TYPE | int | Sequence[int | tuple[int, int]] = None
     """Specifies the number of ghost layers of the iteration region.
     
     Options:
-     - `None`: Required ghost layers are inferred from field accesses
+     - :py:data:`AUTO <pystencils.config.AUTO>`: Required ghost layers are inferred from field accesses
      - `int`:  A uniform number of ghost layers in each spatial coordinate is applied
      - ``Sequence[int, tuple[int, int]]``: Ghost layers are specified for each spatial coordinate.
         In each coordinate, a single integer specifies the ghost layers at both the lower and upper iteration limit,
         while a pair of integers specifies the lower and upper ghost layers separately.
 
     When manually specifying ghost layers, it is the user's responsibility to avoid out-of-bounds memory accesses.
-    If ``ghost_layers=None`` is specified, the iteration region may otherwise be set using the `iteration_slice` option.
+
+    .. note::
+        At most one of `ghost_layers`, `iteration_slice`, and `index_field` may be set.
     """
 
-    iteration_slice: None | Sequence[slice] = None
+    iteration_slice: None | int | slice | tuple[int | slice, ...] = None
     """Specifies the kernel's iteration slice.
-    
-    `iteration_slice` may only be set if ``ghost_layers=None``.
-    If it is set, a slice must be specified for each spatial coordinate.
-    TODO: Specification of valid slices and their behaviour
+
+    Example:
+        >>> cfg = CreateKernelConfig(
+        ...     iteration_slice=ps.make_slice[3:14, 2:-2]
+        ... )
+        >>> cfg.iteration_slice
+        (slice(3, 14, None), slice(2, -2, None))
+
+    .. note::
+        At most one of `ghost_layers`, `iteration_slice`, and `index_field` may be set.
     """
 
     index_field: Field | None = None
     """Index field for a sparse kernel.
     
     If this option is set, a sparse kernel with the given field as index field will be generated.
+
+    .. note::
+        At most one of `ghost_layers`, `iteration_slice`, and `index_field` may be set.
     """
 
     """Data Types"""
diff --git a/src/pystencils/datahandling/serial_datahandling.py b/src/pystencils/datahandling/serial_datahandling.py
index 8521dda1014ab4e031706a1b21f4ae3c28259a05..6a5ce573085b0380196e208c7d19ec16cf5fbb37 100644
--- a/src/pystencils/datahandling/serial_datahandling.py
+++ b/src/pystencils/datahandling/serial_datahandling.py
@@ -291,7 +291,10 @@ class SerialDataHandling(DataHandling):
     def synchronization_function(self, names, stencil=None, target=None, functor=None, **_):
         if target is None:
             target = self.default_target
-        assert target in (Target.CPU, Target.GPU)
+            
+        if not (target.is_cpu() or target == Target.CUDA):
+            raise ValueError(f"Unsupported target: {target}")
+
         if not hasattr(names, '__len__') or type(names) is str:
             names = [names]
 
@@ -325,7 +328,7 @@ class SerialDataHandling(DataHandling):
                 values_per_cell = values_per_cell[0]
 
             if len(filtered_stencil) > 0:
-                if target == Target.CPU:
+                if target.is_cpu():
                     if functor is None:
                         from pystencils.slicing import get_periodic_boundary_functor
                         functor = get_periodic_boundary_functor
diff --git a/src/pystencils/kernelcreation.py b/src/pystencils/kernelcreation.py
index 651a67cf2092a93e9e1cab3f393c2edd1baf15a9..548fbc9bba8c1606fbba2929324e9cea273b73b3 100644
--- a/src/pystencils/kernelcreation.py
+++ b/src/pystencils/kernelcreation.py
@@ -6,6 +6,7 @@ from .config import (
     CreateKernelConfig,
     OpenMpConfig,
     VectorizationConfig,
+    AUTO
 )
 from .backend import KernelFunction
 from .types import create_numeric_type, PsIntegerType, PsScalarType
@@ -91,49 +92,18 @@ class DefaultKernelCreationDriver:
         self,
         assignments: AssignmentCollection | Sequence[AssignmentBase] | AssignmentBase,
     ):
-        if isinstance(assignments, AssignmentBase):
-            assignments = [assignments]
-
-        if not isinstance(assignments, AssignmentCollection):
-            assignments = AssignmentCollection(assignments)  # type: ignore
-
-        _ = _parse_simplification_hints(assignments)
-
-        analysis = KernelAnalysis(
-            self._ctx,
-            not self._cfg.skip_independence_check,
-            not self._cfg.allow_double_writes,
+        kernel_body = self.parse_kernel_body(
+            assignments
         )
-        analysis(assignments)
-
-        if len(self._ctx.fields.index_fields) > 0 or self._cfg.index_field is not None:
-            ispace = create_sparse_iteration_space(
-                self._ctx, assignments, index_field=self._cfg.index_field
-            )
-        else:
-            ispace = create_full_iteration_space(
-                self._ctx,
-                assignments,
-                ghost_layers=self._cfg.ghost_layers,
-                iteration_slice=self._cfg.iteration_slice,
-            )
-
-        self._ctx.set_iteration_space(ispace)
-
-        freeze = FreezeExpressions(self._ctx)
-        kernel_body = freeze(assignments)
-
-        typify = Typifier(self._ctx)
-        kernel_body = typify(kernel_body)
 
         match self._platform:
             case GenericCpu():
                 kernel_ast = self._platform.materialize_iteration_space(
-                    kernel_body, ispace
+                    kernel_body, self._ctx.get_iteration_space()
                 )
             case GenericGpu():
                 kernel_ast, gpu_threads = self._platform.materialize_iteration_space(
-                    kernel_body, ispace
+                    kernel_body, self._ctx.get_iteration_space()
                 )
 
         #   Fold and extract constants
@@ -179,6 +149,53 @@ class DefaultKernelCreationDriver:
                 self._cfg.get_jit(),
             )
 
+    def parse_kernel_body(
+        self,
+        assignments: AssignmentCollection | Sequence[AssignmentBase] | AssignmentBase,
+    ) -> PsBlock:
+        if isinstance(assignments, AssignmentBase):
+            assignments = [assignments]
+
+        if not isinstance(assignments, AssignmentCollection):
+            assignments = AssignmentCollection(assignments)  # type: ignore
+
+        _ = _parse_simplification_hints(assignments)
+
+        analysis = KernelAnalysis(
+            self._ctx,
+            not self._cfg.skip_independence_check,
+            not self._cfg.allow_double_writes,
+        )
+        analysis(assignments)
+
+        if self._cfg.index_field is not None:
+            ispace = create_sparse_iteration_space(
+                self._ctx, assignments, index_field=self._cfg.index_field
+            )
+        else:
+            gls = self._cfg.ghost_layers
+            islice = self._cfg.iteration_slice
+
+            if gls is None and islice is None:
+                gls = AUTO
+
+            ispace = create_full_iteration_space(
+                self._ctx,
+                assignments,
+                ghost_layers=gls,
+                iteration_slice=islice,
+            )
+
+        self._ctx.set_iteration_space(ispace)
+
+        freeze = FreezeExpressions(self._ctx)
+        kernel_body = freeze(assignments)
+
+        typify = Typifier(self._ctx)
+        kernel_body = typify(kernel_body)
+
+        return kernel_body
+
     def _transform_for_cpu(self, kernel_ast: PsBlock):
         canonicalize = CanonicalizeSymbols(self._ctx, True)
         kernel_ast = cast(PsBlock, canonicalize(kernel_ast))
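
A brief sketch of how the mutually exclusive iteration-region options documented in the config.py hunk above are intended to be used with this driver (assuming `import pystencils as ps`); only one of them may be given per config, and omitting both `ghost_layers` and `iteration_slice` now maps to the AUTO behaviour:

import pystencils as ps

# Default: ghost layers are inferred from field accesses (the AUTO behaviour)
cfg_default = ps.CreateKernelConfig()

# Explicit ghost layers: one layer at each border of every spatial coordinate
cfg_gls = ps.CreateKernelConfig(ghost_layers=1)

# Iteration slice instead of ghost layers
cfg_slice = ps.CreateKernelConfig(iteration_slice=ps.make_slice[1:-1, 1:-1])
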
diff --git a/src/pystencils/simp/simplificationstrategy.py b/src/pystencils/simp/simplificationstrategy.py
index 22ffa34d04bc2731f615bd685137c8abebf9d58b..7cba94f8bb80e69afe039a1bc88822e627781a23 100644
--- a/src/pystencils/simp/simplificationstrategy.py
+++ b/src/pystencils/simp/simplificationstrategy.py
@@ -57,7 +57,7 @@ class SimplificationStrategy:
 
             def __str__(self):
                 try:
-                    import tabulate
+                    from tabulate import tabulate
                     return tabulate(self.elements, headers=['Name', 'Runtime', 'Adds', 'Muls', 'Divs', 'Total'])
                 except ImportError:
                     result = "Name, Adds, Muls, Divs, Runtime\n"
diff --git a/src/pystencils/types/types.py b/src/pystencils/types/types.py
index ae751992db61c7bcb2230c5c4dec780eb7ed6d4c..7645a452ffaad83de802da90e10b93fa43e1d3dc 100644
--- a/src/pystencils/types/types.py
+++ b/src/pystencils/types/types.py
@@ -683,7 +683,7 @@ class PsIeeeFloatType(PsScalarType):
     def create_constant(self, value: Any) -> Any:
         np_type = self.NUMPY_TYPES[self._width]
 
-        if isinstance(value, (int, float, np.floating)):
+        if isinstance(value, (int, float, np.integer, np.floating)):
             finfo = np.finfo(np_type)  # type: ignore
             if value < finfo.min or value > finfo.max:
                 raise PsTypeError(
diff --git a/tests/fixtures.py b/tests/fixtures.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c95216147d02b7943b0c218af6fbb5b5175559b
--- /dev/null
+++ b/tests/fixtures.py
@@ -0,0 +1,71 @@
+"""Fixtures for the pystencils test suite
+
+This module provides a number of fixtures used by the pystencils test suite.
+Use these fixtures wherever applicable to extend the code surface area covered
+by your tests:
+
+- All tests that should work for every target should use the `target` fixture
+- All tests that should work with the highest optimization level for every target
+  should use the `gen_config` fixture
+- Use the `xp` fixture to access the correct array module (numpy or cupy) depending
+  on the target
+
+"""
+
+import pytest
+
+from types import ModuleType
+from dataclasses import replace
+
+import pystencils as ps
+
+AVAILABLE_TARGETS = [ps.Target.GenericCPU]
+
+try:
+    import cupy
+
+    AVAILABLE_TARGETS += [ps.Target.CUDA]
+except ImportError:
+    pass
+
+AVAILABLE_TARGETS += ps.Target.available_vector_cpu_targets()
+TARGET_IDS = [t.name for t in AVAILABLE_TARGETS]
+
+@pytest.fixture(params=AVAILABLE_TARGETS, ids=TARGET_IDS)
+def target(request) -> ps.Target:
+    """Provides all code generation targets available on the current hardware"""
+    return request.param
+
+@pytest.fixture
+def gen_config(target: ps.Target):
+    """Default codegen configuration for the current target.
+    
+    For vector-CPU targets, a default vectorization config with
+    ``assume_inner_stride_one=True`` is applied; other targets use the unmodified defaults.
+    """
+
+    gen_config = ps.CreateKernelConfig(target=target)
+
+    if target.is_vector_cpu():
+        gen_config = replace(
+            gen_config,
+            cpu_optim=ps.CpuOptimConfig(
+                vectorize=ps.VectorizationConfig(assume_inner_stride_one=True)
+            ),
+        )
+
+    return gen_config
+
+@pytest.fixture()
+def xp(target: ps.Target) -> ModuleType:
+    """Primary array module for the current target.
+    
+    Returns:
+        `cupy` if `target == Target.CUDA`, and `numpy` otherwise
+    """
+    if target == ps.Target.CUDA:
+        import cupy as xp
+        return xp
+    else:
+        import numpy as np
+        return np
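
A sketch of how a test would consume these fixtures (the test body and names are illustrative); pytest parametrizes it over all available targets and injects the matching array module:

import pystencils as ps


def test_set_to_one(gen_config, xp):
    # xp is numpy for CPU targets and cupy for Target.CUDA
    f_arr = xp.zeros((8, 8))
    f = ps.Field.create_from_numpy_array("f", f_arr)

    kernel = ps.create_kernel(ps.Assignment(f.center(), 1.0), gen_config).compile()
    kernel(f=f_arr)

    xp.testing.assert_array_equal(f_arr, xp.ones_like(f_arr))
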
diff --git a/tests/kernelcreation/test_domain_kernels.py b/tests/kernelcreation/test_domain_kernels.py
index b9cebb3548851af29406acb93a3a4ea14127e35e..d02bfd8e46e8fc8c8f19bcebffc0db52787ff1bd 100644
--- a/tests/kernelcreation/test_domain_kernels.py
+++ b/tests/kernelcreation/test_domain_kernels.py
@@ -18,35 +18,6 @@ from pystencils.assignment import assignment_from_stencil
 from pystencils.kernelcreation import create_kernel, KernelFunction
 from pystencils.backend.emission import emit_code
 
-AVAILABLE_TARGETS = [Target.GenericCPU]
-
-try:
-    import cupy
-
-    AVAILABLE_TARGETS += [Target.CUDA]
-except ImportError:
-    pass
-
-AVAILABLE_TARGETS += Target.available_vector_cpu_targets()
-TEST_IDS = [t.name for t in AVAILABLE_TARGETS]
-
-
-@pytest.fixture(params=AVAILABLE_TARGETS, ids=TEST_IDS)
-def gen_config(request):
-    target: Target = request.param
-
-    gen_config = CreateKernelConfig(target=target)
-
-    if Target._VECTOR in target:
-        gen_config = replace(
-            gen_config,
-            cpu_optim=CpuOptimConfig(
-                vectorize=VectorizationConfig(assume_inner_stride_one=True)
-            ),
-        )
-
-    return gen_config
-
 
 def inspect_dp_kernel(kernel: KernelFunction, gen_config: CreateKernelConfig):
     code = emit_code(kernel)
diff --git a/tests/kernelcreation/test_functions.py b/tests/kernelcreation/test_functions.py
index e16201f819161fc611ec16b09dc98a7d7abf9445..cab1affcb3a344033458829ba28765a9a1ff8c48 100644
--- a/tests/kernelcreation/test_functions.py
+++ b/tests/kernelcreation/test_functions.py
@@ -7,7 +7,6 @@ from pystencils.backend.ast import dfs_preorder
 from pystencils.backend.ast.expressions import PsCall
 
 
-
 def unary_function(name, xp):
     return {
         "exp": (sp.exp, xp.exp),
diff --git a/tests/kernelcreation/test_iteration_slices.py b/tests/kernelcreation/test_iteration_slices.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb7f37eba99e2625a0eeb050a49575f657342ca5
--- /dev/null
+++ b/tests/kernelcreation/test_iteration_slices.py
@@ -0,0 +1,192 @@
+import numpy as np
+import sympy as sp
+import pytest
+
+from dataclasses import replace
+
+from pystencils import (
+    DEFAULTS,
+    Assignment,
+    Field,
+    TypedSymbol,
+    create_kernel,
+    make_slice,
+    Target,
+    CreateKernelConfig,
+    GpuIndexingConfig,
+    DynamicType,
+)
+from pystencils.sympyextensions.integer_functions import int_rem
+from pystencils.simp import sympy_cse_on_assignment_list
+from pystencils.slicing import normalize_slice
+from pystencils.backend.jit.gpu_cupy import CupyKernelWrapper
+
+
+def test_sliced_iteration():
+    size = (4, 4)
+    src_arr = np.ones(size)
+    dst_arr = np.zeros_like(src_arr)
+    src_field = Field.create_from_numpy_array("src", src_arr)
+    dst_field = Field.create_from_numpy_array("dst", dst_arr)
+
+    a, b = sp.symbols("a b")
+    update_rule = Assignment(
+        dst_field[0, 0],
+        (
+            a * src_field[0, 1]
+            + a * src_field[0, -1]
+            + b * src_field[1, 0]
+            + b * src_field[-1, 0]
+        )
+        / 4,
+    )
+
+    x_end = TypedSymbol("x_end", "int")
+    s = make_slice[1:x_end, 1]
+    x_end_value = size[1] - 1
+    kernel = create_kernel(
+        sympy_cse_on_assignment_list([update_rule]), iteration_slice=s
+    ).compile()
+
+    kernel(src=src_arr, dst=dst_arr, a=1.0, b=1.0, x_end=x_end_value)
+
+    expected_result = np.zeros(size)
+    expected_result[1:x_end_value, 1] = 1
+    np.testing.assert_almost_equal(expected_result, dst_arr)
+
+
+@pytest.mark.parametrize(
+    "islice",
+    [
+        make_slice[1:-1, 1:-1],
+        make_slice[3, 2:-2],
+        make_slice[2:-2:2, ::3],
+        make_slice[10:, :-5:2],
+        make_slice[-5:-1, -1],
+        make_slice[-3, -1]
+    ],
+)
+def test_numerical_slices(gen_config: CreateKernelConfig, xp, islice):
+    shape = (16, 16)
+
+    f_arr = xp.zeros(shape)
+    expected = xp.zeros_like(f_arr)
+    expected[islice] = 1.0
+
+    f = Field.create_from_numpy_array("f", f_arr)
+
+    update = Assignment(f.center(), 1)
+    gen_config = replace(gen_config, iteration_slice=islice)
+
+    try:
+        kernel = create_kernel(update, gen_config).compile()
+    except NotImplementedError:
+        if gen_config.target.is_vector_cpu():
+            #   TODO Gather/Scatter not implemented yet
+            pytest.xfail("Gather/Scatter not available yet")
+
+    kernel(f=f_arr)
+
+    xp.testing.assert_array_equal(f_arr, expected)
+
+
+def test_symbolic_slice(gen_config: CreateKernelConfig, xp):
+    shape = (16, 16)
+
+    sx, sy, ex, ey = [
+        TypedSymbol(n, DynamicType.INDEX_TYPE) for n in ("sx", "sy", "ex", "ey")
+    ]
+
+    f_arr = xp.zeros(shape)
+
+    f = Field.create_from_numpy_array("f", f_arr)
+
+    update = Assignment(f.center(), 1)
+    islice = make_slice[sy:ey, sx:ex]
+    gen_config = replace(gen_config, iteration_slice=islice)
+    kernel = create_kernel(update, gen_config).compile()
+
+    for slic in [make_slice[:, :], make_slice[1:-1, 2:-2], make_slice[8:14, 7:11]]:
+        slic = normalize_slice(slic, shape)
+        expected = xp.zeros_like(f_arr)
+        expected[slic] = 1.0
+
+        f_arr[:] = 0.0
+
+        kernel(
+            f=f_arr,
+            sy=slic[0].start,
+            ey=slic[0].stop,
+            sx=slic[1].start,
+            ex=slic[1].stop,
+        )
+
+        xp.testing.assert_array_equal(f_arr, expected)
+
+
+def test_triangle_pattern(gen_config: CreateKernelConfig, xp):
+    shape = (16, 16)
+
+    f_arr = xp.zeros(shape)
+    f = Field.create_from_numpy_array("f", f_arr)
+
+    expected = xp.zeros_like(f_arr)
+    for r in range(shape[0]):
+        expected[r, r:] = 1.0
+
+    update = Assignment(f.center(), 1)
+    outer_counter = DEFAULTS.spatial_counters[0]
+    islice = make_slice[:, outer_counter:]
+    gen_config = replace(gen_config, iteration_slice=islice)
+
+    if gen_config.target == Target.CUDA:
+        gen_config = replace(
+            gen_config, gpu_indexing=GpuIndexingConfig(manual_launch_grid=True)
+        )
+
+    kernel = create_kernel(update, gen_config).compile()
+
+    if isinstance(kernel, CupyKernelWrapper):
+        kernel.block_size = shape + (1,)
+        kernel.num_blocks = (1, 1, 1)
+
+    kernel(f=f_arr)
+
+    xp.testing.assert_array_equal(f_arr, expected)
+
+
+def test_red_black_pattern(gen_config: CreateKernelConfig, xp):
+    shape = (16, 16)
+
+    f_arr = xp.zeros(shape)
+    f = Field.create_from_numpy_array("f", f_arr)
+
+    expected = xp.zeros_like(f_arr)
+    for r in range(shape[0]):
+        start = 0 if r % 2 == 0 else 1
+        expected[r, start::2] = 1.0
+
+    update = Assignment(f.center(), 1)
+    outer_counter = DEFAULTS.spatial_counters[0]
+    start = sp.Piecewise((0, sp.Eq(int_rem(outer_counter, 2), 0)), (1, True))
+    islice = make_slice[:, start::2]
+    gen_config = replace(gen_config, iteration_slice=islice)
+
+    if gen_config.target == Target.CUDA:
+        gen_config = replace(
+            gen_config, gpu_indexing=GpuIndexingConfig(manual_launch_grid=True)
+        )
+
+    try:
+        kernel = create_kernel(update, gen_config).compile()
+    except NotImplementedError:
+        if gen_config.target.is_vector_cpu():
+            pytest.xfail("Gather/Scatter not implemented yet")
+
+    if isinstance(kernel, CupyKernelWrapper):
+        kernel.block_size = (8, 16, 1)
+        kernel.num_blocks = (1, 1, 1)
+
+    kernel(f=f_arr)
+
+    xp.testing.assert_array_equal(f_arr, expected)
diff --git a/tests/kernelcreation/test_sliced_iteration.py b/tests/kernelcreation/test_sliced_iteration.py
deleted file mode 100644
index 5eff0a89d4d23386d00e7408a19ece93453d7d9d..0000000000000000000000000000000000000000
--- a/tests/kernelcreation/test_sliced_iteration.py
+++ /dev/null
@@ -1,29 +0,0 @@
-import numpy as np
-import sympy as sp
-
-from pystencils import Assignment, Field, TypedSymbol, create_kernel, make_slice
-from pystencils.simp import sympy_cse_on_assignment_list
-
-
-def test_sliced_iteration():
-    size = (4, 4)
-    src_arr = np.ones(size)
-    dst_arr = np.zeros_like(src_arr)
-    src_field = Field.create_from_numpy_array('src', src_arr)
-    dst_field = Field.create_from_numpy_array('dst', dst_arr)
-
-    a, b = sp.symbols("a b")
-    update_rule = Assignment(dst_field[0, 0],
-                             (a * src_field[0, 1] + a * src_field[0, -1] +
-                              b * src_field[1, 0] + b * src_field[-1, 0]) / 4)
-
-    x_end = TypedSymbol("x_end", "int")
-    s = make_slice[1:x_end, 1]
-    x_end_value = size[1] - 1
-    kernel = create_kernel(sympy_cse_on_assignment_list([update_rule]), iteration_slice=s).compile()
-
-    kernel(src=src_arr, dst=dst_arr, a=1.0, b=1.0, x_end=x_end_value)
-
-    expected_result = np.zeros(size)
-    expected_result[1:x_end_value, 1] = 1
-    np.testing.assert_almost_equal(expected_result, dst_arr)
diff --git a/tests/nbackend/kernelcreation/test_iteration_space.py b/tests/nbackend/kernelcreation/test_iteration_space.py
index 5d56abd2b818fa74fbd48aac0216d472112f8c64..abc1c9820002eb08454ef5bac7c0e1ba2bfca3ba 100644
--- a/tests/nbackend/kernelcreation/test_iteration_space.py
+++ b/tests/nbackend/kernelcreation/test_iteration_space.py
@@ -129,6 +129,36 @@ def test_slices_with_negative_start():
     )
 
 
+def test_negative_singular_slices():
+    ctx = KernelCreationContext()
+    factory = AstFactory(ctx)
+
+    archetype_field = Field.create_generic("f", spatial_dimensions=2, layout="fzyx")
+    ctx.add_field(archetype_field)
+    archetype_arr = ctx.get_buffer(archetype_field)
+
+    islice = (-2, -1)
+    ispace = FullIterationSpace.create_from_slice(ctx, islice, archetype_field)
+
+    dims = ispace.dimensions
+
+    assert dims[0].start.structurally_equal(
+        PsExpression.make(archetype_arr.shape[0]) + factory.parse_index(-2)
+    )
+
+    assert dims[0].stop.structurally_equal(
+        PsExpression.make(archetype_arr.shape[0]) + factory.parse_index(-1)
+    )
+
+    assert dims[1].start.structurally_equal(
+        PsExpression.make(archetype_arr.shape[1]) + factory.parse_index(-1)
+    )
+
+    assert dims[1].stop.structurally_equal(
+        PsExpression.make(archetype_arr.shape[1])
+    )
+
+
 def test_field_independent_slices():
     ctx = KernelCreationContext()
 
diff --git a/tests/nbackend/test_vectorization.py b/tests/nbackend/test_vectorization.py
index aeb0ebaa0601537331d4daaee1bfcbfdcccb2e6a..55330c9ee8d5d675379418748aa085ab4ce3ae73 100644
--- a/tests/nbackend/test_vectorization.py
+++ b/tests/nbackend/test_vectorization.py
@@ -89,6 +89,11 @@ TEST_SETUPS: list[VectorTestSetup] = list(
 TEST_IDS = [t.name for t in TEST_SETUPS]
 
 
+@pytest.fixture(params=TEST_SETUPS, ids=TEST_IDS)
+def vectorization_setup(request) -> VectorTestSetup:
+    return request.param
+
+
 def create_vector_kernel(
     assignments: list[Assignment],
     field: Field,
@@ -139,9 +144,9 @@ def create_vector_kernel(
     return kernel
 
 
-@pytest.mark.parametrize("setup", TEST_SETUPS, ids=TEST_IDS)
 @pytest.mark.parametrize("ghost_layers", [0, 2])
-def test_update_kernel(setup: VectorTestSetup, ghost_layers: int):
+def test_update_kernel(vectorization_setup: VectorTestSetup, ghost_layers: int):
+    setup = vectorization_setup
     src, dst = fields(f"src(2), dst(4): {setup.numeric_dtype}[2D]", layout="fzyx")
 
     x = sp.symbols("x_:4")
@@ -197,8 +202,8 @@ def test_update_kernel(setup: VectorTestSetup, ghost_layers: int):
             np.testing.assert_equal(dst_arr[:, -i, :], 0.0)
 
 
-@pytest.mark.parametrize("setup", TEST_SETUPS, ids=TEST_IDS)
-def test_trailing_iterations(setup: VectorTestSetup):
+def test_trailing_iterations(vectorization_setup: VectorTestSetup):
+    setup = vectorization_setup
     f = fields(f"f(1): {setup.numeric_dtype}[1D]", layout="fzyx")
 
     update = [Assignment(f(0), 2 * f(0))]
@@ -216,3 +221,24 @@ def test_trailing_iterations(setup: VectorTestSetup):
         kernel(f=f_arr)
 
         np.testing.assert_equal(f_arr, 2.0)
+
+
+def test_only_trailing_iterations(vectorization_setup: VectorTestSetup):
+    setup = vectorization_setup
+    f = fields(f"f(1): {setup.numeric_dtype}[1D]", layout="fzyx")
+
+    update = [Assignment(f(0), 2 * f(0))]
+
+    kernel = create_vector_kernel(update, f, setup)
+
+    for trailing_iters in range(1, setup.lanes):
+        shape = (trailing_iters, 1)
+        f_arr = create_numpy_array_with_layout(
+            shape, layout=(1, 0), dtype=setup.numeric_dtype.numpy_dtype
+        )
+
+        f_arr[:] = 1.0
+
+        kernel(f=f_arr)
+
+        np.testing.assert_equal(f_arr, 2.0)