diff --git a/src/pystencils/backend/jit/gpu_cupy.py b/src/pystencils/backend/jit/gpu_cupy.py
index 563a9c06a7261343e6435737c47854a55d54a05e..1dd18767160a626ff7972ebb78f83bb3e64a1efc 100644
--- a/src/pystencils/backend/jit/gpu_cupy.py
+++ b/src/pystencils/backend/jit/gpu_cupy.py
@@ -41,6 +41,7 @@ class CupyKernelWrapper(KernelWrapper):
         self._kfunc: GpuKernelFunction = kfunc
         self._raw_kernel = raw_kernel
         self._block_size = block_size
+        self._num_blocks: tuple[int, int, int] | None = None
         self._args_cache: dict[Any, tuple] = dict()
 
     @property
@@ -59,6 +60,14 @@ class CupyKernelWrapper(KernelWrapper):
     def block_size(self, bs: tuple[int, int, int]):
         self._block_size = bs
 
+    @property
+    def num_blocks(self) -> tuple[int, int, int] | None:
+        return self._num_blocks
+
+    @num_blocks.setter
+    def num_blocks(self, nb: tuple[int, int, int] | None):
+        self._num_blocks = nb
+
     def __call__(self, **kwargs: Any):
         kernel_args, launch_grid = self._get_cached_args(**kwargs)
         device = self._get_device(kernel_args)
@@ -72,7 +81,7 @@ class CupyKernelWrapper(KernelWrapper):
         return devices.pop()
 
     def _get_cached_args(self, **kwargs):
-        key = (self._block_size,) + tuple((k, id(v)) for k, v in kwargs.items())
+        key = (self._block_size, self._num_blocks) + tuple((k, id(v)) for k, v in kwargs.items())
 
         if key not in self._args_cache:
             args = self._get_args(**kwargs)
@@ -185,25 +194,36 @@ class CupyKernelWrapper(KernelWrapper):
 
         symbolic_threads_range = self._kfunc.threads_range
 
-        threads_range: list[int] = [
-            evaluate_expression(expr, valuation)
-            for expr in symbolic_threads_range.num_work_items
-        ]
+        if self._num_blocks is not None:
+            launch_grid = LaunchGrid(self._num_blocks, self._block_size)
 
-        if symbolic_threads_range.dim < 3:
-            threads_range += [1] * (3 - symbolic_threads_range.dim)
+        elif symbolic_threads_range is not None:
+            threads_range: list[int] = [
+                evaluate_expression(expr, valuation)
+                for expr in symbolic_threads_range.num_work_items
+            ]
 
-        def div_ceil(a, b):
-            return a // b if a % b == 0 else a // b + 1
+            if symbolic_threads_range.dim < 3:
+                threads_range += [1] * (3 - symbolic_threads_range.dim)
 
-        #   TODO: Refine this?
-        grid_size = tuple(
-            div_ceil(threads, tpb)
-            for threads, tpb in zip(threads_range, self._block_size)
-        )
-        assert len(grid_size) == 3
+            def div_ceil(a, b):
+                return a // b if a % b == 0 else a // b + 1
+
+            #   TODO: Refine this?
+            num_blocks = tuple(
+                div_ceil(threads, tpb)
+                for threads, tpb in zip(threads_range, self._block_size)
+            )
+            assert len(num_blocks) == 3
+
+            launch_grid = LaunchGrid(num_blocks, self._block_size)
 
-        launch_grid = LaunchGrid(grid_size, self._block_size)
+        else:
+            raise JitError(
+                "Unable to determine launch grid for GPU kernel invocation: "
+                "No manual grid size was specified, and the number of threads could not "
+                "be determined automatically."
+            )
 
         return tuple(args), launch_grid
 
diff --git a/src/pystencils/backend/kernelcreation/iteration_space.py b/src/pystencils/backend/kernelcreation/iteration_space.py
index c3c9eaa7a8825ed036cda92b4926d9abbb29a0a9..9df9883ce67d6d856335a7a7a9537f829b7df11e 100644
--- a/src/pystencils/backend/kernelcreation/iteration_space.py
+++ b/src/pystencils/backend/kernelcreation/iteration_space.py
@@ -196,21 +196,25 @@ class FullIterationSpace(IterationSpace):
     def dimensions(self):
         """The dimensions of this iteration space"""
         return self._dimensions
+
+    @property
+    def counters(self) -> tuple[PsSymbol, ...]:
+        return tuple(dim.counter for dim in self._dimensions)
 
     @property
-    def lower(self):
+    def lower(self) -> tuple[PsExpression, ...]:
         """Lower limits of each dimension"""
-        return (dim.start for dim in self._dimensions)
+        return tuple(dim.start for dim in self._dimensions)
 
     @property
-    def upper(self):
+    def upper(self) -> tuple[PsExpression, ...]:
         """Upper limits of each dimension"""
-        return (dim.stop for dim in self._dimensions)
+        return tuple(dim.stop for dim in self._dimensions)
 
     @property
-    def steps(self):
+    def steps(self) -> tuple[PsExpression, ...]:
         """Iteration steps of each dimension"""
-        return (dim.step for dim in self._dimensions)
+        return tuple(dim.step for dim in self._dimensions)
 
     @property
     def archetype_field(self) -> Field | None:
diff --git a/src/pystencils/backend/kernelfunction.py b/src/pystencils/backend/kernelfunction.py
index 0118c4f40a2b0702c51f39e9bc75fe4dc29cd67e..8f599d57e27ac45c713ab8878602140107e6e628 100644
--- a/src/pystencils/backend/kernelfunction.py
+++ b/src/pystencils/backend/kernelfunction.py
@@ -261,7 +261,7 @@ class GpuKernelFunction(KernelFunction):
     def __init__(
         self,
         body: PsBlock,
-        threads_range: GpuThreadsRange,
+        threads_range: GpuThreadsRange | None,
         target: Target,
         name: str,
         parameters: Sequence[KernelParameter],
@@ -275,7 +275,7 @@ class GpuKernelFunction(KernelFunction):
         self._threads_range = threads_range
 
     @property
-    def threads_range(self) -> GpuThreadsRange:
+    def threads_range(self) -> GpuThreadsRange | None:
         return self._threads_range
 
 
@@ -283,14 +283,16 @@ def create_gpu_kernel_function(
     ctx: KernelCreationContext,
     platform: Platform,
     body: PsBlock,
-    threads_range: GpuThreadsRange,
+    threads_range: GpuThreadsRange | None,
     function_name: str,
     target_spec: Target,
     jit: JitBase,
 ):
     undef_symbols = collect_undefined_symbols(body)
-    for threads in threads_range.num_work_items:
-        undef_symbols |= collect_undefined_symbols(threads)
+
+    if threads_range is not None:
+        for threads in threads_range.num_work_items:
+            undef_symbols |= collect_undefined_symbols(threads)
 
     params = _get_function_params(ctx, undef_symbols)
     req_headers = _get_headers(ctx, platform, body)
diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index 323dcc5a9b306990a11f85037f305794c8729625..5c8b5b5045b5f889123cdf86686ecd696c6d8bf7 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -1,3 +1,5 @@
+from warnings import warn
+
 from ...types import constify
 from ..exceptions import MaterializationError
 from .generic_gpu import GenericGpu, GpuThreadsRange
@@ -7,7 +9,7 @@ from ..kernelcreation import (
     IterationSpace,
     FullIterationSpace,
     SparseIterationSpace,
-    AstFactory
+    AstFactory,
 )
 
 from ..kernelcreation.context import KernelCreationContext
@@ -43,6 +45,7 @@ GRID_DIM = [
 
 
 class CudaPlatform(GenericGpu):
+    """Platform for CUDA-based GPUs."""
 
     def __init__(
         self, ctx: KernelCreationContext, indexing_cfg: GpuIndexingConfig | None = None
@@ -57,7 +60,7 @@ class CudaPlatform(GenericGpu):
 
     def materialize_iteration_space(
         self, body: PsBlock, ispace: IterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange]:
+    ) -> tuple[PsBlock, GpuThreadsRange | None]:
         if isinstance(ispace, FullIterationSpace):
             return self._prepend_dense_translation(body, ispace)
         elif isinstance(ispace, SparseIterationSpace):
@@ -123,13 +126,25 @@ class CudaPlatform(GenericGpu):
 
     def _prepend_dense_translation(
         self, body: PsBlock, ispace: FullIterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange]:
+    ) -> tuple[PsBlock, GpuThreadsRange | None]:
         dimensions = ispace.dimensions_in_loop_order()
-        launch_config = GpuThreadsRange.from_ispace(ispace)
+
+        if not self._cfg.manual_launch_grid:
+            try:
+                threads_range = GpuThreadsRange.from_ispace(ispace)
+            except MaterializationError as e:
+                warn(
+                    str(e.args[0])
+                    + "\nIf this is intended, set `manual_launch_grid=True` in the code generator configuration.",
+                    UserWarning,
+                )
+                threads_range = None
+        else:
+            threads_range = None
 
         indexing_decls = []
         conds = []
-        for i, dim in enumerate(dimensions[::-1]):
+        for i, dim in enumerate(dimensions):
             dim.counter.dtype = constify(dim.counter.get_dtype())
 
             ctr = PsExpression.make(dim.counter)
@@ -155,7 +170,7 @@ class CudaPlatform(GenericGpu):
             body.statements = indexing_decls + body.statements
             ast = body
 
-        return ast, launch_config
+        return ast, threads_range
 
     def _prepend_sparse_translation(
         self, body: PsBlock, ispace: SparseIterationSpace
@@ -199,3 +214,72 @@ class CudaPlatform(GenericGpu):
         block_idx = BLOCK_IDX[coord]
         thread_idx = THREAD_IDX[coord]
         return block_idx * block_size + thread_idx
+
+
+# class LinearIndexing:
+#     """Linear GPU thread indexing.
+
+#     This indexing scheme maps GPU threads to iteration space points in the following way:
+#     - Starting from the slowest coordinate, each coordinate is mapped to a dimension
+#       of the GPU grid until just one dimension is left
+#     - All remaining dimensions of the iteration space are linearly mapped
+#       onto the fastest launch grid dimension
+#     """
+
+#     def __init__(
+#         self,
+#         ctx: KernelCreationContext,
+#         launch_grid_dimensions: int,
+#         ispace: FullIterationSpace,
+#     ) -> None:
+#         if not (0 < launch_grid_dimensions <= 3):
+#             raise ValueError(
+#                 f"Invalid number of launch grid dimensions: {launch_grid_dimensions}"
+#             )
+
+#         self._ctx = ctx
+
+#         self._grid_dims = launch_grid_dimensions
+#         self._ispace = ispace
+#         self._ispace_dims = len(ispace.dimensions)
+
+#         self._typify = Typifier(ctx)
+
+#     def get_counter_declarations(self) -> Sequence[PsDeclaration]:
+#         num_slower_dimensions = min(self._grid_dims, self._ispace_dims) - 1
+#         num_fast_dimensions = self._ispace_dims - num_slower_dimensions
+
+#         decls = []
+
+#         #   Slower n dimensions
+#         for i in range(num_slower_dimensions, 0, -1):
+#             thread_idx = BLOCK_IDX[i] * BLOCK_DIM[i] + THREAD_IDX[i]
+#             decls.append(self._make_ctr_decl(self._ispace.dimensions[num_fast_dimensions + i], thread_idx))
+
+#         #   Fastest dimensions
+#         thread_idx = BLOCK_IDX[0] * BLOCK_DIM[0] + THREAD_IDX[0]
+
+#         if num_fast_dimensions == 1:
+#             decls.append(self._make_ctr_decl(self._ispace.dimensions[0], thread_idx))
+#         else:
+#             for i in range(num_fast_dimensions, 0, -1):
+#                 decls.append(
+#                     self._make_ctr_decl(
+#                         self._ispace.dimensions[i],
+#                         #   ergh... need actual iterations here...
+#                     )
+#                 )
+
+
+#     def _make_ctr_decl(
+#         self, dim: FullIterationSpace.Dimension, thread_idx: PsExpression
+#     ):
+#         dim.counter.dtype = constify(dim.counter.get_dtype())
+
+#         ctr = PsExpression.make(dim.counter)
+#         return self._typify(
+#             PsDeclaration(
+#                 ctr,
+#                 dim.start + dim.step * PsCast(ctr.get_dtype(), thread_idx),
+#             )
+#         )
diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py
index 774b9405cd04b8dc6489cd6b6ae36e4aa563f157..f6b888a4941ee657b41c3dd8837c9b1f8a5059e7 100644
--- a/src/pystencils/backend/platforms/generic_gpu.py
+++ b/src/pystencils/backend/platforms/generic_gpu.py
@@ -10,6 +10,7 @@ from ..kernelcreation.iteration_space import (
     SparseIterationSpace,
 )
 from .platform import Platform
+from ..exceptions import MaterializationError
 
 
 class GpuThreadsRange:
@@ -56,6 +57,19 @@ class GpuThreadsRange:
             raise NotImplementedError(
                 f"Cannot create a GPU threads range for an {len(dimensions)}-dimensional iteration space"
             )
+
+        from ..ast.analysis import collect_undefined_symbols as collect
+
+        for dim in dimensions:
+            symbs = collect(dim.start) | collect(dim.stop) | collect(dim.step)
+            for ctr in ispace.counters:
+                if ctr in symbs:
+                    raise MaterializationError(
+                        "Unable to construct GPU threads range for iteration space: "
+                        f"Limits of dimension counter {dim.counter.name} "
+                        f"depend on another dimension's counter {ctr.name}"
+                    )
+
         work_items = [ispace.actual_iterations(dim) for dim in dimensions]
         return GpuThreadsRange(work_items)
 
@@ -64,5 +78,5 @@ class GenericGpu(Platform):
     @abstractmethod
     def materialize_iteration_space(
         self, block: PsBlock, ispace: IterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange]:
+    ) -> tuple[PsBlock, GpuThreadsRange | None]:
         pass
diff --git a/src/pystencils/config.py b/src/pystencils/config.py
index c688530aecd497ddced30cc47e83bf4e314f9ac8..c08ddc16198627adba97169e02724371eca62158 100644
--- a/src/pystencils/config.py
+++ b/src/pystencils/config.py
@@ -190,6 +190,14 @@ class GpuIndexingConfig:
     block_size: tuple[int, int, int] | None = None
     """Desired block size for the execution of GPU kernels. May be overridden later by the runtime system."""
 
+    manual_launch_grid: bool = False
+    """Always require a manually specified launch grid when running this kernel.
+
+    If set to `True`, the code generator will not attempt to infer the size of
+    the launch grid from the kernel.
+    The launch grid will then have to be specified manually at runtime.
+    """
+
     sycl_automatic_block_size: bool = True
     """If set to `True` while generating for `Target.SYCL`, let the SYCL runtime decide on the block size.