diff --git a/.flake8 b/.flake8
index 3f946922a5f68e39bc02c65d4a5f4ffdb10e05e1..05758405a65664809b3a0a71893554a1af8e3381 100644
--- a/.flake8
+++ b/.flake8
@@ -4,4 +4,4 @@ exclude=src/pystencils/jupyter.py,
         src/pystencils/plot.py
         src/pystencils/session.py
         src/pystencils/old
-ignore = W293 W503 W291 C901 E741
+ignore = W293 W503 W291 C901 E741 E704
diff --git a/conftest.py b/conftest.py
index 0c6c49153770bc40c90cb09e990d12561f7bad7a..4e8e2b73ab49ff1f881e755b19f9a46e07c0837e 100644
--- a/conftest.py
+++ b/conftest.py
@@ -203,3 +203,8 @@ else:
                 return IPyNbFile.from_parent(fspath=path, parent=parent)
             else:
                 return IPyNbFile(path, parent)
+
+
+#   Fixtures
+
+from tests.fixtures import *
diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index 5c8b5b5045b5f889123cdf86686ecd696c6d8bf7..dbade47d1b8fafcb55c12be0b07d674f6edb3ce3 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -144,7 +144,7 @@ class CudaPlatform(GenericGpu):
 
         indexing_decls = []
         conds = []
-        for i, dim in enumerate(dimensions):
+        for i, dim in enumerate(dimensions[::-1]):
             dim.counter.dtype = constify(dim.counter.get_dtype())
 
             ctr = PsExpression.make(dim.counter)
@@ -161,6 +161,8 @@ class CudaPlatform(GenericGpu):
             if not self._cfg.omit_range_check:
                 conds.append(PsLt(ctr, dim.stop))
 
+        indexing_decls = indexing_decls[::-1]
+
         if conds:
             condition: PsExpression = conds[0]
             for cond in conds[1:]:
@@ -214,72 +216,3 @@ class CudaPlatform(GenericGpu):
         block_idx = BLOCK_IDX[coord]
         thread_idx = THREAD_IDX[coord]
         return block_idx * block_size + thread_idx
-
-
-# class LinearIndexing:
-#     """Linear GPU thread indexing.
-
-#     This indexing scheme maps GPU threads to iteration space points in the following way:
-#     - Starting from the slowest coordinate, each coordinate is mapped to a dimension
-#       of the GPU grid until just one dimension is left
-#     - All remaining dimensions of the iteration space are linearly mapped
-#       onto the fastest launch grid dimension
-#     """
-
-#     def __init__(
-#         self,
-#         ctx: KernelCreationContext,
-#         launch_grid_dimensions: int,
-#         ispace: FullIterationSpace,
-#     ) -> None:
-#         if not (0 < launch_grid_dimensions <= 3):
-#             raise ValueError(
-#                 f"Invalid number of launch grid dimensions: {launch_grid_dimensions}"
-#             )
-
-#         self._ctx = ctx
-
-#         self._grid_dims = launch_grid_dimensions
-#         self._ispace = ispace
-#         self._ispace_dims = len(ispace.dimensions)
-
-#         self._typify = Typifier(ctx)
-
-#     def get_counter_declarations(self) -> Sequence[PsDeclaration]:
-#         num_slower_dimensions = min(self._grid_dims, self._ispace_dims) - 1
-#         num_fast_dimensions = self._ispace_dims - num_slower_dimensions
-
-#         decls = []
-
-#         #   Slower n dimensions
-#         for i in range(num_slower_dimensions, 0, -1):
-#             thread_idx = BLOCK_IDX[i] * BLOCK_DIM[i] + THREAD_IDX[i]
-#             decls.append(self._make_ctr_decl(self._ispace.dimensions[num_fast_dimensions + i], thread_idx))
-
-#         #   Fastest dimensions
-#         thread_idx = BLOCK_IDX[0] * BLOCK_DIM[0] + THREAD_IDX[0]
-
-#         if num_fast_dimensions == 1:
-#             decls.append(self._make_ctr_decl(self._ispace.dimensions[0], thread_idx))
-#         else:
-#             for i in range(num_fast_dimensions, 0, -1):
-#                 decls.append(
-#                     self._make_ctr_decl(
-#                         self._ispace.dimensions[i],
-#                         #   ergh... need actual iterations here...
-#                     )
-#                 )
-
-
-#     def _make_ctr_decl(
-#         self, dim: FullIterationSpace.Dimension, thread_idx: PsExpression
-#     ):
-#         dim.counter.dtype = constify(dim.counter.get_dtype())
-
-#         ctr = PsExpression.make(dim.counter)
-#         return self._typify(
-#             PsDeclaration(
-#                 ctr,
-#                 dim.start + dim.step * PsCast(ctr.get_dtype(), thread_idx),
-#             )
-#         )
diff --git a/src/pystencils/backend/transformations/loop_vectorizer.py b/src/pystencils/backend/transformations/loop_vectorizer.py
index c89698193d9764996cfab7a5c5fd7e70358fbac6..e1e4fea502c08de86e13de5e3c251f1b7a7d0ee6 100644
--- a/src/pystencils/backend/transformations/loop_vectorizer.py
+++ b/src/pystencils/backend/transformations/loop_vectorizer.py
@@ -8,7 +8,7 @@ from ..kernelcreation import KernelCreationContext
 from ..constants import PsConstant
 from ..ast import PsAstNode
 from ..ast.structural import PsLoop, PsBlock, PsDeclaration
-from ..ast.expressions import PsExpression
+from ..ast.expressions import PsExpression, PsTernary, PsGt
 from ..ast.vector import PsVecBroadcast
 from ..ast.analysis import collect_undefined_symbols
 
@@ -18,7 +18,7 @@ from .rewrite import substitute_symbols
 
 class LoopVectorizer:
     """Vectorize loops.
-    
+
     The loop vectorizer provides methods to vectorize single loops inside an AST
     using a given number of vector lanes.
     During vectorization, the loop body is transformed using the `AstVectorizer`,
@@ -64,29 +64,26 @@ class LoopVectorizer:
     @overload
     def vectorize_select_loops(
         self, node: PsBlock, predicate: Callable[[PsLoop], bool]
-    ) -> PsBlock:
-        ...
+    ) -> PsBlock: ...
 
     @overload
     def vectorize_select_loops(
         self, node: PsLoop, predicate: Callable[[PsLoop], bool]
-    ) -> PsLoop | PsBlock:
-        ...
+    ) -> PsLoop | PsBlock: ...
 
     @overload
     def vectorize_select_loops(
         self, node: PsAstNode, predicate: Callable[[PsLoop], bool]
-    ) -> PsAstNode:
-        ...
+    ) -> PsAstNode: ...
 
     def vectorize_select_loops(
         self, node: PsAstNode, predicate: Callable[[PsLoop], bool]
     ) -> PsAstNode:
         """Select and vectorize loops from a syntax tree according to a predicate.
-        
+
         Finds each loop inside a subtree and evaluates ``predicate`` on them.
         If ``predicate(loop)`` evaluates to `True`, the loop is vectorized.
-        
+
         Loops nested inside a vectorized loop will not be processed.
 
         Args:
@@ -139,7 +136,7 @@ class LoopVectorizer:
 
         #   Generate vectorized loop body
         simd_body = self._vectorize_ast(loop.body, vc)
-        
+
         if vector_ctr in collect_undefined_symbols(simd_body):
             simd_body.statements.insert(0, vector_counter_decl)
 
@@ -186,20 +183,31 @@ class LoopVectorizer:
                 trailing_start = self._ctx.get_new_symbol(
                     f"__{scalar_ctr.name}_trailing_start", scalar_ctr.get_dtype()
                 )
+
                 trailing_start_decl = self._type_fold(
                     PsDeclaration(
                         PsExpression.make(trailing_start),
-                        (
+                        PsTernary(
+                            #   If at least one vectorized iteration took place...
+                            PsGt(
+                                PsExpression.make(simd_stop),
+                                simd_start.clone(),
+                            ),
+                            #   start from the smallest non-valid multiple of simd_step, offset from simd_start
                             (
-                                PsExpression.make(simd_stop)
-                                - simd_start.clone()
-                                - PsExpression.make(PsConstant(1))
+                                (
+                                    PsExpression.make(simd_stop)
+                                    - simd_start.clone()
+                                    - PsExpression.make(PsConstant(1))
+                                )
+                                / PsExpression.make(simd_step)
+                                + PsExpression.make(PsConstant(1))
                             )
-                            / PsExpression.make(simd_step)
-                            + PsExpression.make(PsConstant(1))
-                        )
-                        * PsExpression.make(simd_step)
-                        + simd_start.clone(),
+                            * PsExpression.make(simd_step)
+                            + simd_start.clone(),
+                            #   otherwise start at zero
+                            simd_start.clone(),
+                        ),
                     )
                 )
 
diff --git a/tests/fixtures.py b/tests/fixtures.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c95216147d02b7943b0c218af6fbb5b5175559b
--- /dev/null
+++ b/tests/fixtures.py
@@ -0,0 +1,71 @@
+"""Fixtures for the pystencils test suite
+
+This module provides a number of fixtures used by the pystencils test suite.
+Use these fixtures wherever applicable to extend the code surface area covered
+by your tests:
+
+- All tests that should work for every target should use the `target` fixture
+- All tests that should work with the highest optimization level for every target
+  should use the `gen_config` fixture
+- Use the `xp` fixture to access the correct array module (numpy or cupy) depending
+  on the target
+
+"""
+
+import pytest
+
+from types import ModuleType
+from dataclasses import replace
+
+import pystencils as ps
+
+AVAILABLE_TARGETS = [ps.Target.GenericCPU]
+
+try:
+    import cupy
+
+    AVAILABLE_TARGETS += [ps.Target.CUDA]
+except ImportError:
+    pass
+
+AVAILABLE_TARGETS += ps.Target.available_vector_cpu_targets()
+TARGET_IDS = [t.name for t in AVAILABLE_TARGETS]
+
+@pytest.fixture(params=AVAILABLE_TARGETS, ids=TARGET_IDS)
+def target(request) -> ps.Target:
+    """Provides all code generation targets available on the current hardware"""
+    return request.param
+
+@pytest.fixture
+def gen_config(target: ps.Target):
+    """Default codegen configuration for the current target.
+    
+    For GPU targets, set default indexing options.
+    For vector-CPU targets, set default vectorization config.
+    """
+
+    gen_config = ps.CreateKernelConfig(target=target)
+
+    if target.is_vector_cpu():
+        gen_config = replace(
+            gen_config,
+            cpu_optim=ps.CpuOptimConfig(
+                vectorize=ps.VectorizationConfig(assume_inner_stride_one=True)
+            ),
+        )
+
+    return gen_config
+
+@pytest.fixture()
+def xp(target: ps.Target) -> ModuleType:
+    """Primary array module for the current target.
+    
+    Returns:
+        `cupy` if `target == Target.CUDA`, and `numpy` otherwise
+    """
+    if target == ps.Target.CUDA:
+        import cupy as xp
+        return xp
+    else:
+        import numpy as np
+        return np
diff --git a/tests/kernelcreation/test_domain_kernels.py b/tests/kernelcreation/test_domain_kernels.py
index b9cebb3548851af29406acb93a3a4ea14127e35e..d02bfd8e46e8fc8c8f19bcebffc0db52787ff1bd 100644
--- a/tests/kernelcreation/test_domain_kernels.py
+++ b/tests/kernelcreation/test_domain_kernels.py
@@ -18,35 +18,6 @@ from pystencils.assignment import assignment_from_stencil
 from pystencils.kernelcreation import create_kernel, KernelFunction
 from pystencils.backend.emission import emit_code
 
-AVAILABLE_TARGETS = [Target.GenericCPU]
-
-try:
-    import cupy
-
-    AVAILABLE_TARGETS += [Target.CUDA]
-except ImportError:
-    pass
-
-AVAILABLE_TARGETS += Target.available_vector_cpu_targets()
-TEST_IDS = [t.name for t in AVAILABLE_TARGETS]
-
-
-@pytest.fixture(params=AVAILABLE_TARGETS, ids=TEST_IDS)
-def gen_config(request):
-    target: Target = request.param
-
-    gen_config = CreateKernelConfig(target=target)
-
-    if Target._VECTOR in target:
-        gen_config = replace(
-            gen_config,
-            cpu_optim=CpuOptimConfig(
-                vectorize=VectorizationConfig(assume_inner_stride_one=True)
-            ),
-        )
-
-    return gen_config
-
 
 def inspect_dp_kernel(kernel: KernelFunction, gen_config: CreateKernelConfig):
     code = emit_code(kernel)
diff --git a/tests/kernelcreation/test_iteration_slices.py b/tests/kernelcreation/test_iteration_slices.py
new file mode 100644
index 0000000000000000000000000000000000000000..94ed0295441885cb4ffc87855181659383b1fde7
--- /dev/null
+++ b/tests/kernelcreation/test_iteration_slices.py
@@ -0,0 +1,190 @@
+import numpy as np
+import sympy as sp
+import pytest
+
+from dataclasses import replace
+
+from pystencils import (
+    DEFAULTS,
+    Assignment,
+    Field,
+    TypedSymbol,
+    create_kernel,
+    make_slice,
+    Target,
+    CreateKernelConfig,
+    GpuIndexingConfig,
+    DynamicType,
+)
+from pystencils.sympyextensions.integer_functions import int_rem
+from pystencils.simp import sympy_cse_on_assignment_list
+from pystencils.slicing import normalize_slice
+from pystencils.backend.jit.gpu_cupy import CupyKernelWrapper
+
+
+def test_sliced_iteration():
+    size = (4, 4)
+    src_arr = np.ones(size)
+    dst_arr = np.zeros_like(src_arr)
+    src_field = Field.create_from_numpy_array("src", src_arr)
+    dst_field = Field.create_from_numpy_array("dst", dst_arr)
+
+    a, b = sp.symbols("a b")
+    update_rule = Assignment(
+        dst_field[0, 0],
+        (
+            a * src_field[0, 1]
+            + a * src_field[0, -1]
+            + b * src_field[1, 0]
+            + b * src_field[-1, 0]
+        )
+        / 4,
+    )
+
+    x_end = TypedSymbol("x_end", "int")
+    s = make_slice[1:x_end, 1]
+    x_end_value = size[1] - 1
+    kernel = create_kernel(
+        sympy_cse_on_assignment_list([update_rule]), iteration_slice=s
+    ).compile()
+
+    kernel(src=src_arr, dst=dst_arr, a=1.0, b=1.0, x_end=x_end_value)
+
+    expected_result = np.zeros(size)
+    expected_result[1:x_end_value, 1] = 1
+    np.testing.assert_almost_equal(expected_result, dst_arr)
+
+
+@pytest.mark.parametrize(
+    "islice",
+    [
+        make_slice[1:-1, 1:-1],
+        make_slice[3, 2:-2],
+        make_slice[2:-2:2, ::3],
+        make_slice[10:, :-5:2],
+    ],
+)
+def test_numerical_slices(gen_config: CreateKernelConfig, xp, islice):
+    shape = (16, 16)
+
+    f_arr = xp.zeros(shape)
+    expected = xp.zeros_like(f_arr)
+    expected[islice] = 1.0
+
+    f = Field.create_from_numpy_array("f", f_arr)
+
+    update = Assignment(f.center(), 1)
+    gen_config = replace(gen_config, iteration_slice=islice)
+
+    try:
+        kernel = create_kernel(update, gen_config).compile()
+    except NotImplementedError:
+        if gen_config.target.is_vector_cpu():
+            #   TODO Gather/Scatter not implemented yet
+            pytest.xfail("Gather/Scatter not available yet")
+
+    kernel(f=f_arr)
+
+    xp.testing.assert_array_equal(f_arr, expected)
+
+
+def test_symbolic_slice(gen_config: CreateKernelConfig, xp):
+    shape = (16, 16)
+
+    sx, sy, ex, ey = [
+        TypedSymbol(n, DynamicType.INDEX_TYPE) for n in ("sx", "sy", "ex", "ey")
+    ]
+
+    f_arr = xp.zeros(shape)
+
+    f = Field.create_from_numpy_array("f", f_arr)
+
+    update = Assignment(f.center(), 1)
+    islice = make_slice[sy:ey, sx:ex]
+    gen_config = replace(gen_config, iteration_slice=islice)
+    kernel = create_kernel(update, gen_config).compile()
+
+    for slic in [make_slice[:, :], make_slice[1:-1, 2:-2], make_slice[8:14, 7:11]]:
+        slic = normalize_slice(slic, shape)
+        expected = xp.zeros_like(f_arr)
+        expected[slic] = 1.0
+
+        f_arr[:] = 0.0
+
+        kernel(
+            f=f_arr,
+            sy=slic[0].start,
+            ey=slic[0].stop,
+            sx=slic[1].start,
+            ex=slic[1].stop,
+        )
+
+        xp.testing.assert_array_equal(f_arr, expected)
+
+
+def test_triangle_pattern(gen_config: CreateKernelConfig, xp):
+    shape = (16, 16)
+
+    f_arr = xp.zeros(shape)
+    f = Field.create_from_numpy_array("f", f_arr)
+
+    expected = xp.zeros_like(f_arr)
+    for r in range(shape[0]):
+        expected[r, r:] = 1.0
+
+    update = Assignment(f.center(), 1)
+    outer_counter = DEFAULTS.spatial_counters[0]
+    islice = make_slice[:, outer_counter:]
+    gen_config = replace(gen_config, iteration_slice=islice)
+
+    if gen_config.target == Target.CUDA:
+        gen_config = replace(
+            gen_config, gpu_indexing=GpuIndexingConfig(manual_launch_grid=True)
+        )
+
+    kernel = create_kernel(update, gen_config).compile()
+
+    if isinstance(kernel, CupyKernelWrapper):
+        kernel.block_size = shape + (1,)
+        kernel.num_blocks = (1, 1, 1)
+
+    kernel(f=f_arr)
+
+    xp.testing.assert_array_equal(f_arr, expected)
+
+
+def test_red_black_pattern(gen_config: CreateKernelConfig, xp):
+    shape = (16, 16)
+
+    f_arr = xp.zeros(shape)
+    f = Field.create_from_numpy_array("f", f_arr)
+
+    expected = xp.zeros_like(f_arr)
+    for r in range(shape[0]):
+        start = 0 if r % 2 == 0 else 1
+        expected[r, start::2] = 1.0
+
+    update = Assignment(f.center(), 1)
+    outer_counter = DEFAULTS.spatial_counters[0]
+    start = sp.Piecewise((0, sp.Eq(int_rem(outer_counter, 2), 0)), (1, True))
+    islice = make_slice[:, start::2]
+    gen_config = replace(gen_config, iteration_slice=islice)
+
+    if gen_config.target == Target.CUDA:
+        gen_config = replace(
+            gen_config, gpu_indexing=GpuIndexingConfig(manual_launch_grid=True)
+        )
+
+    try:
+        kernel = create_kernel(update, gen_config).compile()
+    except NotImplementedError:
+        if gen_config.target.is_vector_cpu():
+            pytest.xfail("Gather/Scatter not implemented yet")
+
+    if isinstance(kernel, CupyKernelWrapper):
+        kernel.block_size = (8, 16, 1)
+        kernel.num_blocks = (1, 1, 1)
+
+    kernel(f=f_arr)
+
+    xp.testing.assert_array_equal(f_arr, expected)
diff --git a/tests/kernelcreation/test_sliced_iteration.py b/tests/kernelcreation/test_sliced_iteration.py
deleted file mode 100644
index 5eff0a89d4d23386d00e7408a19ece93453d7d9d..0000000000000000000000000000000000000000
--- a/tests/kernelcreation/test_sliced_iteration.py
+++ /dev/null
@@ -1,29 +0,0 @@
-import numpy as np
-import sympy as sp
-
-from pystencils import Assignment, Field, TypedSymbol, create_kernel, make_slice
-from pystencils.simp import sympy_cse_on_assignment_list
-
-
-def test_sliced_iteration():
-    size = (4, 4)
-    src_arr = np.ones(size)
-    dst_arr = np.zeros_like(src_arr)
-    src_field = Field.create_from_numpy_array('src', src_arr)
-    dst_field = Field.create_from_numpy_array('dst', dst_arr)
-
-    a, b = sp.symbols("a b")
-    update_rule = Assignment(dst_field[0, 0],
-                             (a * src_field[0, 1] + a * src_field[0, -1] +
-                              b * src_field[1, 0] + b * src_field[-1, 0]) / 4)
-
-    x_end = TypedSymbol("x_end", "int")
-    s = make_slice[1:x_end, 1]
-    x_end_value = size[1] - 1
-    kernel = create_kernel(sympy_cse_on_assignment_list([update_rule]), iteration_slice=s).compile()
-
-    kernel(src=src_arr, dst=dst_arr, a=1.0, b=1.0, x_end=x_end_value)
-
-    expected_result = np.zeros(size)
-    expected_result[1:x_end_value, 1] = 1
-    np.testing.assert_almost_equal(expected_result, dst_arr)
diff --git a/tests/nbackend/test_vectorization.py b/tests/nbackend/test_vectorization.py
index aeb0ebaa0601537331d4daaee1bfcbfdcccb2e6a..55330c9ee8d5d675379418748aa085ab4ce3ae73 100644
--- a/tests/nbackend/test_vectorization.py
+++ b/tests/nbackend/test_vectorization.py
@@ -89,6 +89,11 @@ TEST_SETUPS: list[VectorTestSetup] = list(
 TEST_IDS = [t.name for t in TEST_SETUPS]
 
 
+@pytest.fixture(params=TEST_SETUPS, ids=TEST_IDS)
+def vectorization_setup(request) -> VectorTestSetup:
+    return request.param
+
+
 def create_vector_kernel(
     assignments: list[Assignment],
     field: Field,
@@ -139,9 +144,9 @@ def create_vector_kernel(
     return kernel
 
 
-@pytest.mark.parametrize("setup", TEST_SETUPS, ids=TEST_IDS)
 @pytest.mark.parametrize("ghost_layers", [0, 2])
-def test_update_kernel(setup: VectorTestSetup, ghost_layers: int):
+def test_update_kernel(vectorization_setup: VectorTestSetup, ghost_layers: int):
+    setup = vectorization_setup
     src, dst = fields(f"src(2), dst(4): {setup.numeric_dtype}[2D]", layout="fzyx")
 
     x = sp.symbols("x_:4")
@@ -197,8 +202,8 @@ def test_update_kernel(setup: VectorTestSetup, ghost_layers: int):
             np.testing.assert_equal(dst_arr[:, -i, :], 0.0)
 
 
-@pytest.mark.parametrize("setup", TEST_SETUPS, ids=TEST_IDS)
-def test_trailing_iterations(setup: VectorTestSetup):
+def test_trailing_iterations(vectorization_setup: VectorTestSetup):
+    setup = vectorization_setup
     f = fields(f"f(1): {setup.numeric_dtype}[1D]", layout="fzyx")
 
     update = [Assignment(f(0), 2 * f(0))]
@@ -216,3 +221,24 @@ def test_trailing_iterations(setup: VectorTestSetup):
         kernel(f=f_arr)
 
         np.testing.assert_equal(f_arr, 2.0)
+
+
+def test_only_trailing_iterations(vectorization_setup: VectorTestSetup):
+    setup = vectorization_setup
+    f = fields(f"f(1): {setup.numeric_dtype}[1D]", layout="fzyx")
+
+    update = [Assignment(f(0), 2 * f(0))]
+
+    kernel = create_vector_kernel(update, f, setup)
+
+    for trailing_iters in range(1, setup.lanes):
+        shape = (trailing_iters, 1)
+        f_arr = create_numpy_array_with_layout(
+            shape, layout=(1, 0), dtype=setup.numeric_dtype.numpy_dtype
+        )
+
+        f_arr[:] = 1.0
+
+        kernel(f=f_arr)
+
+        np.testing.assert_equal(f_arr, 2.0)