diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py
index 47a64df647e79fb931224b226550b5b68b6415ce..2d62f286b5a8c30799c408175f0dfa0d10c83740 100644
--- a/src/pystencils/codegen/config.py
+++ b/src/pystencils/codegen/config.py
@@ -381,7 +381,11 @@ class GpuOptions(ConfigBase):
     """

     block_size: BasicOption[tuple[int, int, int] | _AUTO_TYPE] = BasicOption(AUTO)
-    """Desired block size for the execution of GPU kernels. May be overridden later by the runtime system."""
+    """Desired block size for the execution of GPU kernels.
+
+    This option only takes effect if `Linear3D` is chosen as an indexing scheme.
+    The block size may be overridden at runtime.
+    """

     manual_launch_grid: BasicOption[bool] = BasicOption(False)
     """Always require a manually specified launch grid when running this kernel.
@@ -596,11 +600,8 @@ class CreateKernelConfig(ConfigBase):
         elif self.get_target() == Target.CUDA:
             try:
                 from ..jit.gpu_cupy import CupyJit
-
-                if self.gpu is not None and self.gpu.block_size is not None:
-                    return CupyJit(self.gpu.block_size)
-                else:
-                    return CupyJit()
+
+                return CupyJit()
             except ImportError:
                 from ..jit import no_jit

diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index f7eb8ddb41fe869cbd5b86358d31fdababca7afb..14a95c84d899638ea796d13cfddf7dd4e7ccd04f 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -220,7 +220,7 @@ class DefaultKernelCreationDriver:
             self._cfg.get_option("function_name"),
             self._target,
             self._cfg.get_jit(),
-            self._gpu_indexing.get_launch_config,
+            self._gpu_indexing.get_launch_config_factory(),
         )

     def parse_kernel_body(
@@ -397,15 +397,13 @@ class DefaultKernelCreationDriver:
         if self._target != Target.CUDA:
             return None

-        idx_scheme = self._cfg.gpu.get_option("indexing_scheme")
+        from .gpu_indexing import dim3

-        match idx_scheme:
-            case None | GpuIndexingScheme.Linear3D:
-                from .gpu_indexing import Linear3DGpuIndexing
+        idx_scheme: GpuIndexingScheme = self._cfg.gpu.get_option("indexing_scheme")
+        block_size: dim3 | _AUTO_TYPE = self._cfg.gpu.get_option("block_size")
+        manual_launch_grid: bool = self._cfg.gpu.get_option("manual_launch_grid")

-                return Linear3DGpuIndexing(self._ctx)
-            case _:
-                raise NotImplementedError()
+        return GpuIndexing(self._ctx, idx_scheme, block_size, manual_launch_grid)

     def _get_platform(self) -> Platform:
         if Target._CPU in self._target:
@@ -496,7 +494,7 @@ def create_gpu_kernel_function(
     function_name: str,
     target_spec: Target,
     jit: JitBase,
-    launch_config_factory: Callable[[GpuKernel], GpuLaunchConfiguration],
+    launch_config_factory: Callable[[], GpuLaunchConfiguration],
 ) -> GpuKernel:

     undef_symbols = collect_undefined_symbols(body)
diff --git a/src/pystencils/codegen/functions.py b/src/pystencils/codegen/functions.py
index 2779fa289e04cda9bc47fd46e48ff0ada9a98ad1..f6be3b1f3446c6b9a25a0013f0e06d099edf5bed 100644
--- a/src/pystencils/codegen/functions.py
+++ b/src/pystencils/codegen/functions.py
@@ -1,7 +1,5 @@
 from __future__ import annotations
-from typing import Sequence
-
-import numpy as np
+from typing import Sequence, Any

 from .parameters import Parameter
 from ..types import PsType
@@ -36,7 +34,7 @@ class Lambda:
         """Return type of this lambda"""
         return self._return_type

-    def __call__(self, **kwargs) -> np.generic:
+    def __call__(self, **kwargs) -> Any:
         """Evaluate this lambda with the given arguments.

         The lambda must receive a value for each parameter listed in `parameters`.
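Taken together, the three file diffs above move the GPU block size out of the JIT layer and into the code generation options: `CupyJit` no longer accepts a block size, and `GpuOptions.block_size` is only honored by the `Linear3D` indexing scheme. A minimal usage sketch, not part of the patch; it assumes the usual top-level pystencils API, and `asms` is a placeholder for an assignment collection defined elsewhere:

```python
# Hedged sketch: route the block size through the code generator options
# instead of constructing CupyJit(block_size) as before this patch.
from pystencils import CreateKernelConfig, Target, create_kernel

cfg = CreateKernelConfig(target=Target.CUDA)
cfg.gpu.block_size = (128, 2, 1)  # only takes effect with the Linear3D scheme

kernel = create_kernel(asms, cfg)  # the driver now always builds a plain CupyJit()
```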
diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 24189bf635b0cd8a5c64b8ceb3d65a14a4121d36..1e23a820e85fc7250b9d49196ba5d11ee0526f4d 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -5,7 +5,6 @@ from typing import cast, Any, Callable
 from itertools import chain

 from .functions import Lambda
-from .kernel import GpuKernel
 from .parameters import Parameter
 from .errors import CodegenError
 from .config import GpuIndexingScheme, _AUTO_TYPE
@@ -15,7 +14,7 @@ from ..backend.kernelcreation import (
     FullIterationSpace,
     SparseIterationSpace,
 )

-from ..backend.platforms.cuda import ThreadToIndexMapping
+

 from ..backend.ast.expressions import PsExpression
@@ -24,7 +23,7 @@
 _Dim3Params = tuple[Parameter, Parameter, Parameter]
 _Dim3Lambda = tuple[Lambda, Lambda, Lambda]

-class GpuLaunchConfiguration:
+class GpuLaunchConfiguration(ABC):
     """Base class for launch configurations for CUDA and HIP kernels.

     Args:
@@ -34,33 +33,61 @@ class GpuLaunchConfiguration:
             parameters to the associated kernel
     """

+    @property
+    @abstractmethod
+    def parameters(self) -> frozenset[Parameter]:
+        """Parameters of this launch configuration"""
+
+    @abstractmethod
+    def evaluate(self, **kwargs) -> tuple[dim3, dim3]:
+        """Compute block and grid size for a kernel launch.
+
+        Args:
+            kwargs: Valuation providing a value for each parameter listed in `parameters`
+        """
+
+    @abstractmethod
+    def jit_cache_key(self) -> Any:
+        """Return a hashable object that represents any user-configurable options of
+        this launch configuration, such that when the configuration changes, the JIT parameter
+        cache is invalidated."""
+
+
+class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
+    """Launch configuration that is dynamically computed from kernel parameters.
+
+    This launch configuration permits no further user customization.
+    """
+
     def __init__(
         self,
         block_size: _Dim3Lambda,
         grid_size: _Dim3Lambda,
-        config_parameters: set[Parameter],
     ) -> None:
         self._block_size = block_size
         self._grid_size = grid_size
-        self._params = frozenset(config_parameters)
-        self._valuation: dict[Parameter, Any] = dict()
+        self._params: frozenset[Parameter] = frozenset().union(
+            *(lb.parameters for lb in chain(block_size, grid_size))
+        )

     @property
     def parameters(self) -> frozenset[Parameter]:
-        """Parameters to this set of constraints"""
+        """Parameters of this launch configuration"""
         return self._params

-    def get_valuation(self) -> dict[Parameter, Any]:
-        """Values for all parameters that are specific to the launch grid configuration and not
-        also kernel parameters."""
-        return self._valuation
+    def evaluate(self, **kwargs) -> tuple[dim3, dim3]:
+        """Compute block and grid size for a kernel launch.

-    def get_block_size(self) -> _Dim3Lambda:
-        return self._block_size
+        Args:
+            kwargs: Valuation providing a value for each parameter listed in `parameters`
+        """
+        block_size = tuple(int(bs(**kwargs)) for bs in self._block_size)
+        grid_size = tuple(int(gs(**kwargs)) for gs in self._grid_size)
+        return cast(dim3, block_size), cast(dim3, grid_size)

-    def get_grid_size(self) -> _Dim3Lambda:
-        return self._grid_size
+    def jit_cache_key(self) -> Any:
+        return ()


 class ManualLaunchConfiguration(GpuLaunchConfiguration):
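The hunk above turns `GpuLaunchConfiguration` into a proper interface: `parameters` exposes the free launch parameters, `evaluate` maps a valuation of those parameters to a `(block_size, grid_size)` pair, and `jit_cache_key` tells the JIT when cached kernel arguments must be discarded. A purely illustrative subclass showing the contract (not part of the patch; import paths follow this patch's module layout):

```python
from typing import Any

from pystencils.codegen.gpu_indexing import GpuLaunchConfiguration, dim3
from pystencils.codegen.parameters import Parameter


class SingleBlockLaunchConfiguration(GpuLaunchConfiguration):
    """Toy example: always launch exactly one 256-thread block."""

    @property
    def parameters(self) -> frozenset[Parameter]:
        return frozenset()  # the launch grid depends on no kernel parameters

    def evaluate(self, **kwargs) -> tuple[dim3, dim3]:
        return (256, 1, 1), (1, 1, 1)

    def jit_cache_key(self) -> Any:
        return ()  # nothing user-configurable, so the key never changes
```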
@@ -71,89 +98,93 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration):

     def __init__(
         self,
-        block_size: _Dim3Lambda,
-        grid_size: _Dim3Lambda,
-        block_size_params: _Dim3Params,
-        grid_size_params: _Dim3Params,
-    ):
-        super().__init__(
-            cast(_Dim3Lambda, block_size),
-            cast(_Dim3Lambda, grid_size),
-            set(block_size_params).union(grid_size_params),
-        )
-        self._block_size_params = block_size_params
-        self._grid_size_params = grid_size_params
-
-        self._user_block_size: dim3 | None = None
-        self._user_grid_size: dim3 | None = None
+    ) -> None:
+        self._block_size: dim3 | None = None
+        self._grid_size: dim3 | None = None

     @property
     def block_size(self) -> dim3 | None:
-        return self._user_block_size
+        return self._block_size

     @block_size.setter
     def block_size(self, val: dim3):
-        self._user_block_size = val
+        self._block_size = val

     @property
     def grid_size(self) -> dim3 | None:
-        return self._user_grid_size
+        return self._grid_size

     @grid_size.setter
     def grid_size(self, val: dim3):
-        self._user_grid_size = val
-
-    def get_valuation(self) -> dict[Parameter, Any]:
-        if self._user_block_size is None:
-            raise AttributeError("No GPU block size was specified")
+        self._grid_size = val

-        if self._user_grid_size is None:
-            raise AttributeError("No GPU grid size was specified")
+    @property
+    def parameters(self) -> frozenset[Parameter]:
+        return frozenset()

-        valuation: dict[Parameter, Any] = dict()
+    def evaluate(self, **kwargs) -> tuple[dim3, dim3]:
+        if self._block_size is None:
+            raise AttributeError("No GPU block size was set by the user.")

-        for bs_param, bs in zip(self._block_size_params, self._user_block_size):
-            valuation[bs_param] = bs
+        if self._grid_size is None:
+            raise AttributeError("No GPU grid size was set by the user.")

-        for gs_param, gs in zip(self._grid_size_params, self._user_grid_size):
-            valuation[gs_param] = gs
+        return self._block_size, self._grid_size

-        return valuation
+    def jit_cache_key(self) -> Any:
+        return (self._block_size, self._grid_size)


-class GridFromBlockSizeConfiguration(GpuLaunchConfiguration):
-    """GPU launch configuration that computes the grid size from a user-defined block size."""
+class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
+    """GPU launch configuration that permits the user to set a block size dynamically."""

     def __init__(
         self,
-        block_size: _Dim3Lambda,
-        grid_size: _Dim3Lambda,
+        block_size_expr: _Dim3Lambda,
+        grid_size_expr: _Dim3Lambda,
         block_size_params: _Dim3Params,
         default_block_size: dim3 | None = None,
     ) -> None:
-        super().__init__(block_size, grid_size, set(block_size_params))
+        self._block_size_expr = block_size_expr
+        self._grid_size_expr = grid_size_expr
         self._block_size_params = block_size_params

-        self._user_block_size: dim3 | None = default_block_size
+        self._block_size: dim3 | None = default_block_size
+
+        self._params: frozenset[Parameter] = frozenset().union(
+            *(lb.parameters for lb in chain(block_size_expr, grid_size_expr))
+        ) - set(self._block_size_params)

     @property
     def block_size(self) -> dim3 | None:
-        return self._user_block_size
+        return self._block_size

     @block_size.setter
     def block_size(self, val: dim3):
-        self._user_block_size = val
+        self._block_size = val
+
+    @property
+    def parameters(self) -> frozenset[Parameter]:
+        """Parameters of this launch configuration"""
+        return self._params

-    def get_valuation(self) -> dict[Parameter, Any]:
-        if self._user_block_size is None:
-            raise AttributeError("No GPU block size was specified")
+    def evaluate(self, **kwargs) -> tuple[dim3, dim3]:
+        if self._block_size is None:
+            raise AttributeError("No GPU block size was specified by the user.")

-        valuation: dict[Parameter, Any] = dict()
+        kwargs.update(
+            {
+                param.name: value
+                for param, value in zip(self._block_size_params, self._block_size)
+            }
+        )

-        for bs_param, bs in zip(self._block_size_params, self._user_block_size):
-            valuation[bs_param] = bs
+        block_size = tuple(int(bs(**kwargs)) for bs in self._block_size_expr)
+        grid_size = tuple(int(gs(**kwargs)) for gs in self._grid_size_expr)
+        return cast(dim3, block_size), cast(dim3, grid_size)

-        return valuation
+    def jit_cache_key(self) -> Any:
+        return self._block_size


 class GpuIndexing(ABC):
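Of the two concrete configurations above, `ManualLaunchConfiguration` backs the `manual_launch_grid` option and refuses to evaluate until the user has supplied both sizes, while `DynamicBlockSizeLaunchConfiguration` recomputes the grid size from a user-adjustable block size. A short, illustrative sketch of the manual variant's behavior:

```python
from pystencils.codegen.gpu_indexing import ManualLaunchConfiguration

manual = ManualLaunchConfiguration()

try:
    manual.evaluate()  # neither size has been set yet
except AttributeError as e:
    print(e)  # "No GPU block size was set by the user."

manual.block_size = (64, 4, 1)
manual.grid_size = (16, 16, 1)
block, grid = manual.evaluate()  # ((64, 4, 1), (16, 16, 1))
```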
@@ -182,65 +213,19 @@ class GpuIndexing(ABC):
             case GpuIndexingScheme.Blockwise4D:
                 return Blockwise4DMapping()

-    def get_launch_config_factory(
-        self, scheme: GpuIndexingScheme
-    ) -> Callable[[], GpuLaunchConfiguration]:
+    def get_launch_config_factory(self) -> Callable[[], GpuLaunchConfiguration]:
         if self._manual_launch_grid:
-            return self._manual_config_factory()
-
-        raise NotImplementedError()
-
-    def _manual_config_factory(self) -> Callable[[], ManualLaunchConfiguration]:
-        ctx = self._ctx
-
-        block_size_symbols = [
-            ctx.get_new_symbol(f"gpuBlockSize_{c}", ctx.index_dtype) for c in range(3)
-        ]
-        grid_size_symbols = [
-            ctx.get_new_symbol(f"gpuGridSize_{c}", ctx.index_dtype) for c in range(3)
-        ]
-
-        block_size = tuple(
-            Lambda.from_expression(ctx, PsExpression.make(bs))
-            for bs in block_size_symbols
-        )
+            return ManualLaunchConfiguration

-        grid_size = tuple(
-            Lambda.from_expression(ctx, PsExpression.make(gs))
-            for gs in grid_size_symbols
-        )
-
-        from .driver import _symbol_to_param
-
-        bs_params = [_symbol_to_param(ctx, s) for s in block_size_symbols]
-        gs_params = [_symbol_to_param(ctx, s) for s in grid_size_symbols]
-
-        def factory():
-            return ManualLaunchConfiguration(
-                cast(_Dim3Lambda, block_size),
-                cast(_Dim3Lambda, grid_size),
-                cast(_Dim3Params, bs_params),
-                cast(_Dim3Params, gs_params),
-            )
-
-        return factory
-
-
-class Linear3DGpuIndexing(GpuIndexing):
-
-    def __init__(self, ctx: KernelCreationContext) -> None:
-        self._ctx = ctx
-
-        from ..backend.kernelcreation import AstFactory
-
-        self._factory = AstFactory(self._ctx)
-
-    def get_thread_mapping(self) -> ThreadToIndexMapping:
-        from ..backend.platforms.cuda import Linear3DMapping
-
-        return Linear3DMapping()
+        match self._scheme:
+            case GpuIndexingScheme.Linear3D:
+                return self._get_linear3d_config_factory()
+            case GpuIndexingScheme.Blockwise4D:
+                return self._get_blockwise4d_config_factory()

-    def get_launch_config(self, kernel: GpuKernel) -> GpuLaunchConfiguration:
+    def _get_linear3d_config_factory(
+        self,
+    ) -> Callable[[], DynamicBlockSizeLaunchConfiguration]:
         work_items = self._get_work_items()
         rank = len(work_items)
@@ -280,13 +265,65 @@ class Linear3DGpuIndexing(GpuIndexing):
             _symbol_to_param(self._ctx, s) for s in block_size_symbols
         )

-        return GridFromBlockSizeConfiguration(
-            cast(_Dim3Lambda, tuple(block_size)),
-            cast(_Dim3Lambda, tuple(grid_size)),
-            cast(tuple[Parameter, Parameter, Parameter], block_size_params),
-        )
+        def factory():
+            return DynamicBlockSizeLaunchConfiguration(
+                cast(_Dim3Lambda, tuple(block_size)),
+                cast(_Dim3Lambda, tuple(grid_size)),
+                cast(_Dim3Params, block_size_params),
+                self._get_default_block_size(rank),
+            )
+
+        return factory
+
+    def _get_default_block_size(self, rank: int) -> dim3:
+        if isinstance(self._block_size, _AUTO_TYPE):
+            match rank:
+                case 1:
+                    return (256, 1, 1)
+                case 2:
+                    return (128, 2, 1)
+                case 3:
+                    return (128, 2, 2)
+                case _:
+                    assert False, "unreachable code"
+        else:
+            return self._block_size
+
+    def _get_blockwise4d_config_factory(
+        self,
+    ) -> Callable[[], AutomaticLaunchConfiguration]:
+        work_items = self._get_work_items()[::-1]  # Want this ordered fastest first
+        rank = len(work_items)
+
+        if rank > 4:
+            raise ValueError(f"Iteration space rank is too large: {rank}")
+
+        block_size = (
+            Lambda.from_expression(self._ctx, work_items[0]),
+            Lambda.from_expression(self._ctx, self._factory.parse_index(1)),
+            Lambda.from_expression(self._ctx, self._factory.parse_index(1)),
+        )
+
+        grid_size = tuple(
+            Lambda.from_expression(self._ctx, wit) for wit in work_items[1:]
+        ) + tuple(
+            Lambda.from_expression(self._ctx, self._factory.parse_index(1))
+            for _ in range(4 - rank)
+        )
+
+        def factory():
+            return AutomaticLaunchConfiguration(
+                block_size,
+                cast(_Dim3Lambda, grid_size),
+            )
+
+        return factory
+
     def _get_work_items(self) -> tuple[PsExpression, ...]:
+        """Return a tuple of expressions representing the number of work items
+        in each dimension of the kernel's iteration space,
+        ordered from slowest to fastest dimension.
+        """
         ispace = self._ctx.get_iteration_space()
         match ispace:
             case FullIterationSpace():
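`GpuIndexing` now dispatches on the configured scheme: `Linear3D` yields a `DynamicBlockSizeLaunchConfiguration` seeded with a rank-dependent default block size ((256, 1, 1), (128, 2, 1) or (128, 2, 2)), while `Blockwise4D` maps the fastest coordinate to threads and the remaining ones to blocks, producing a fully automatic configuration that ignores `gpu.block_size`. Selecting the scheme might look as follows (sketch only; the import path for `GpuIndexingScheme` is assumed from this patch's module layout):

```python
from pystencils import CreateKernelConfig, Target
from pystencils.codegen.config import GpuIndexingScheme  # assumed import path

cfg = CreateKernelConfig(target=Target.CUDA)
cfg.gpu.indexing_scheme = GpuIndexingScheme.Blockwise4D  # launch grid derived automatically
```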
diff --git a/src/pystencils/codegen/kernel.py b/src/pystencils/codegen/kernel.py
index 67ef6554cb8cfb00c19ad3bd9b7609076d47a8e8..181e6ad3b5d1cb1f1835d8b4e656c39f65a1316b 100644
--- a/src/pystencils/codegen/kernel.py
+++ b/src/pystencils/codegen/kernel.py
@@ -6,6 +6,8 @@ from itertools import chain
 from .target import Target
 from .parameters import Parameter

+from .gpu_indexing import GpuLaunchConfiguration
+
 from ..backend.ast.structural import PsBlock
 from ..field import Field

@@ -13,7 +15,6 @@ from .._deprecation import _deprecated

 if TYPE_CHECKING:
     from ..jit import JitBase
-    from .gpu_indexing import GpuLaunchConfiguration


 class Kernel:
@@ -123,11 +124,11 @@ class GpuKernel(Kernel):
         parameters: Sequence[Parameter],
         required_headers: set[str],
         jit: JitBase,
-        launch_config_factory: Callable[[GpuKernel], GpuLaunchConfiguration],
+        launch_config_factory: Callable[[], GpuLaunchConfiguration],
     ):
         super().__init__(body, target, name, parameters, required_headers, jit)
         self._launch_config_factory = launch_config_factory

     def get_launch_configuration(self) -> GpuLaunchConfiguration:
-        """Object exposing the total size of the launch grid this kernel expects to be executed with."""
-        return self._launch_config_factory(self)
+        """Create a launch configuration object for this kernel."""
+        return self._launch_config_factory()
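Since the factory is now nullary, a `GpuKernel` can hand out independent launch configurations without inspecting itself. An illustrative consequence (assuming `kernel` is a `GpuKernel` produced by the driver above, and that the factory constructs a new object per call, as all three factories in this patch do):

```python
lc_a = kernel.get_launch_configuration()
lc_b = kernel.get_launch_configuration()
assert lc_a is not lc_b  # each call constructs a fresh configuration object
```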
diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py
index d4f1c020437b4ffeacc462f2ded1277933dee75d..42d9a685feef80611a7441d2d1c453a039e2b52d 100644
--- a/src/pystencils/jit/gpu_cupy.py
+++ b/src/pystencils/jit/gpu_cupy.py
@@ -45,7 +45,7 @@ class CupyKernelWrapper(KernelWrapper):
     @property
     def kernel_function(self) -> GpuKernel:
         return self._kfunc
-
+
     @property
     def launch_config(self) -> GpuLaunchConfiguration:
         return self._launch_config
@@ -67,10 +67,9 @@ class CupyKernelWrapper(KernelWrapper):
         return devices.pop()

     def _get_cached_args(self, **kwargs):
-        launch_config_params = self._launch_config.get_valuation
-        key = tuple(
-            (k, v) for k, v in launch_config_params.items()
-        ) + tuple((k, id(v)) for k, v in kwargs.items())
+        key = (self._launch_config.jit_cache_key(),) + tuple(
+            (k, id(v)) for k, v in kwargs.items()
+        )

         if key not in self._args_cache:
             args = self._get_args(**kwargs)
@@ -80,7 +79,7 @@ class CupyKernelWrapper(KernelWrapper):
         return self._args_cache[key]

     def _get_args(self, **kwargs) -> tuple[tuple, LaunchGrid]:
-        args = []
+        kernel_args = []
         valuation: dict[str, Any] = dict()

         def add_arg(name: str, arg: Any, dtype: PsType):
@@ -88,7 +87,7 @@ class CupyKernelWrapper(KernelWrapper):
             assert nptype is not None
             typecast = nptype.type
             arg = typecast(arg)
-            args.append(arg)
+            kernel_args.append(arg)
             valuation[name] = arg

         field_shapes = set()
@@ -168,7 +167,7 @@ class CupyKernelWrapper(KernelWrapper):
                                 f"Expected {field.dtype}, got {arr.dtype}"
                             )
                         check_shape(kparam, arr)
-                        args.append(arr)
+                        kernel_args.append(arr)
                         break

                 case FieldShape(field, coord):
@@ -191,31 +190,21 @@ class CupyKernelWrapper(KernelWrapper):

         # Determine launch grid

-        launch_cfg_valuation = valuation.copy()
-        launch_cfg_valuation.update(
-            {
-                param.name: value
-                for param, value in self._launch_config.get_valuation.items()
-            }
-        )
+        def add_launch_config_arg(name: str, arg: Any, dtype: PsType):
+            nptype = dtype.numpy_dtype
+            assert nptype is not None
+            typecast = nptype.type
+            arg = typecast(arg)
+            valuation[name] = arg

-        block_size = cast(
-            tuple[int, int, int],
-            tuple(
-                int(component(**launch_cfg_valuation))
-                for component in self._launch_config.get_block_size()
-            ),
-        )
+        for cparam in self._launch_config.parameters:
+            if cparam.name not in valuation:
+                val = kwargs[cparam.name]
+                add_launch_config_arg(cparam.name, val, cparam.dtype)

-        grid_size = cast(
-            tuple[int, int, int],
-            tuple(
-                int(component(**launch_cfg_valuation))
-                for component in self._launch_config.get_grid_size()
-            ),
-        )
+        block_size, grid_size = self._launch_config.evaluate(**valuation)

-        return tuple(args), LaunchGrid(grid_size, block_size)
+        return tuple(kernel_args), LaunchGrid(grid_size, block_size)


 class CupyJit(JitBase):
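End to end, the cupy wrapper now obtains block and grid size exclusively through `evaluate`, and keys its argument cache on `jit_cache_key`, so changing the block size invalidates cached arguments. A hedged end-to-end sketch; `asms`, `f_arr` and `g_arr` are placeholders, and `compile()` is assumed to return the `CupyKernelWrapper` shown above:

```python
from pystencils import CreateKernelConfig, Target, create_kernel

cfg = CreateKernelConfig(target=Target.CUDA)
kernel = create_kernel(asms, cfg).compile()  # assumed to yield a CupyKernelWrapper

kernel.launch_config.block_size = (128, 2, 2)
kernel(f=f_arr, g=g_arr)  # arguments cached under key ((128, 2, 2), ...)

kernel.launch_config.block_size = (64, 4, 2)
kernel(f=f_arr, g=g_arr)  # jit_cache_key changed, so arguments are re-assembled
```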