diff --git a/pytest.ini b/pytest.ini
index 707a43b4548e99e8e6862e1b48a1844e4318b55e..744a74bc781b3e03568e3c3a67cefbe9395bd713 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -64,6 +64,7 @@ exclude_lines =
        if 0:
        if False:
        if __name__ == .__main__.:
+       assert False
 
        # Don't cover type checking imports
        if TYPE_CHECKING:
diff --git a/tests/kernelcreation/test_domain_kernels.py b/tests/kernelcreation/test_domain_kernels.py
index da261faec49940df31d59f44651956e2012b113a..0d71dbe1a250c865c0f637aa3a125837abfe39e7 100644
--- a/tests/kernelcreation/test_domain_kernels.py
+++ b/tests/kernelcreation/test_domain_kernels.py
@@ -32,14 +32,7 @@ def inspect_dp_kernel(kernel: Kernel, gen_config: CreateKernelConfig):
             assert "_mm512_storeu_pd" in code
 
 
-def test_filter_kernel(gen_config):
-    if gen_config.target == Target.CUDA:
-        import cupy as cp
-
-        xp = cp
-    else:
-        xp = np
-
+def test_filter_kernel(gen_config, xp):
     weight = sp.Symbol("weight")
     stencil = [[1, 1, 1], [1, 1, 1], [1, 1, 1]]
 
@@ -62,14 +55,7 @@ def test_filter_kernel(gen_config):
     xp.testing.assert_allclose(dst_arr, expected)
 
 
-def test_filter_kernel_fixedsize(gen_config):
-    if gen_config.target == Target.CUDA:
-        import cupy as cp
-
-        xp = cp
-    else:
-        xp = np
-
+def test_filter_kernel_fixedsize(gen_config, xp):
     weight = sp.Symbol("weight")
     stencil = [[1, 1, 1], [1, 1, 1], [1, 1, 1]]
 
diff --git a/tests/kernelcreation/test_gpu.py b/tests/kernelcreation/test_gpu.py
index 621e4c2514736f2dc2bb9b163931cba4af63bddc..d80647fb6a620182c008a9cb9a65adb8f16b0b4e 100644
--- a/tests/kernelcreation/test_gpu.py
+++ b/tests/kernelcreation/test_gpu.py
@@ -11,10 +11,9 @@ from pystencils import (
     CreateKernelConfig,
     create_kernel,
     Target,
+    assignment_from_stencil,
 )
 
-# from pystencils.gpu import BlockIndexing
-from pystencils.simp import sympy_cse_on_assignment_list
 from pystencils.slicing import (
     add_ghost_layers,
     make_slice,
@@ -30,6 +29,58 @@ except ImportError:
     pytest.skip(reason="CuPy is not available", allow_module_level=True)
 
 
+@pytest.mark.parametrize("indexing_scheme", ["linear3d", "blockwise4d"])
+@pytest.mark.parametrize("omit_range_check", [False, True])
+@pytest.mark.parametrize("manual_grid", [False, True])
+def test_indexing_options(
+    indexing_scheme: str, omit_range_check: bool, manual_grid: bool
+):  # one test case per combination of the three parametrized options above
+    src, dst = fields("src, dst: [3D]")
+    asm = Assignment(  # dst at the center = sum of the six axis-aligned neighbours of src
+        dst.center(),
+        src[-1, 0, 0]
+        + src[1, 0, 0]
+        + src[0, -1, 0]
+        + src[0, 1, 0]
+        + src[0, 0, -1]
+        + src[0, 0, 1],
+    )
+
+    cfg = CreateKernelConfig(target=Target.CUDA)
+    cfg.gpu.indexing_scheme = indexing_scheme
+    cfg.gpu.omit_range_check = omit_range_check
+    cfg.gpu.manual_launch_grid = manual_grid
+
+    ast = create_kernel(asm, cfg)
+    kernel = ast.compile()
+
+    src_arr = cp.ones((18, 34, 42))  # all ones, so each computed cell sums to 6.0
+    dst_arr = cp.zeros_like(src_arr)
+
+    if manual_grid:  # manual launch grid: block size and grid size are both set by hand
+        match indexing_scheme:
+            case "linear3d":
+                kernel.launch_config.block_size = (10, 8, 8)
+                kernel.launch_config.grid_size = (4, 4, 2)  # block * grid = (40, 32, 16)
+            case "blockwise4d":
+                kernel.launch_config.block_size = (40, 1, 1)
+                kernel.launch_config.grid_size = (32, 16, 1)  # 40 threads/block, 32 x 16 blocks
+
+    elif indexing_scheme == "linear3d":  # non-manual grid: only the block size is set here
+        kernel.launch_config.block_size = (
+            10,
+            8,
+            8,
+        )  # must fit the src_arr shape without ghost layers, (16, 32, 40): 40/10, 32/8, 16/8
+
+    kernel(src=src_arr, dst=dst_arr)
+
+    expected = cp.zeros_like(src_arr)  # the outermost cell layer is expected to stay 0
+    expected[1:-1, 1:-1, 1:-1].fill(6.0)  # six neighbours, each 1.0, at every interior cell
+
+    cp.testing.assert_allclose(dst_arr, expected)
+
+
 def test_averaging_kernel():
     size = (40, 55)
     src_arr = np.random.rand(*size)
@@ -44,7 +95,7 @@ def test_averaging_kernel():
     )
 
     config = CreateKernelConfig(target=Target.GPU)
-    ast = create_kernel(sympy_cse_on_assignment_list([update_rule]), config=config)
+    ast = create_kernel(update_rule, config=config)
     kernel = ast.compile()
 
     gpu_src_arr = cp.asarray(src_arr)
@@ -70,7 +121,7 @@ def test_variable_sized_fields():
     )
 
     config = CreateKernelConfig(target=Target.GPU)
-    ast = create_kernel(sympy_cse_on_assignment_list([update_rule]), config=config)
+    ast = create_kernel(update_rule, config=config)
     kernel = ast.compile()
 
     size = (3, 3)
@@ -142,7 +193,7 @@ def test_ghost_layer():
     config.ghost_layers = ghost_layers
     config.gpu.indexing_scheme = "blockwise4d"
 
-    ast = create_kernel(sympy_cse_on_assignment_list([update_rule]), config=config)
+    ast = create_kernel(update_rule, config=config)
     kernel = ast.compile()
 
     gpu_src_arr = cp.asarray(src_arr)
@@ -170,8 +221,8 @@ def test_setting_value():
     config.target = Target.CUDA
     config.iteration_slice = iteration_slice
     config.gpu.indexing_scheme = "blockwise4d"
-    
-    ast = create_kernel(sympy_cse_on_assignment_list(update_rule), config=config)
+
+    ast = create_kernel(update_rule, config=config)
     kernel = ast.compile()
 
     kernel(f=arr_gpu, value=np.float64(42.0))