diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index ff41d3a68888347e81e9ff65d1359467666f2e32..4f78d344de861261ef89db6e10b51a93026a3dcd 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -1,11 +1,11 @@
 from __future__ import annotations
-from warnings import warn
-from typing import TYPE_CHECKING
+from abc import ABC, abstractmethod
 
-from ...types import constify
+from ...types import constify, deconstify
 from ..exceptions import MaterializationError
 from .generic_gpu import GenericGpu
 
+from ..memory import PsSymbol
 from ..kernelcreation import (
     Typifier,
     IterationSpace,
@@ -29,8 +29,6 @@ from ...types import PsSignedIntegerType, PsIeeeFloatType
 from ..literals import PsLiteral
 from ..functions import PsMathFunction, MathFunctions, CFunction
 
-if TYPE_CHECKING:
-    from ...codegen import GpuThreadsRange
 
 int32 = PsSignedIntegerType(width=32, const=False)
 
@@ -48,18 +46,88 @@ GRID_DIM = [
 ]
 
 
+class DenseThreadIdxMapping(ABC):
+
+    @abstractmethod
+    def __call__(self, ispace: FullIterationSpace) -> dict[PsSymbol, PsExpression]:
+        """Map the current thread index onto a point in the given iteration space.
+
+        Implementations of this method must return, for each dimension counter of the
+        given iteration space, an expression that maps the GPU block and thread indices
+        to that counter's value.
+        """
+
+
+class Linear3DMapping(DenseThreadIdxMapping):
+    """3D globally linearized mapping, where each thread is assigned a work item according to
+    its location in the global launch grid."""
+
+    def __call__(self, ispace: FullIterationSpace) -> dict[PsSymbol, PsExpression]:
+        if ispace.rank > 3:
+            raise MaterializationError(
+                f"Cannot handle {ispace.rank}-dimensional iteration space "
+                "using the Linear3D GPU thread index mapping."
+            )
+
+        dimensions = ispace.dimensions_in_loop_order()
+        idx_map: dict[PsSymbol, PsExpression] = dict()
+
+        for coord, dim in enumerate(dimensions[::-1]):
+            tid = self._linear_thread_idx(coord)
+            idx_map[dim.counter] = dim.start + dim.step * PsCast(
+                deconstify(dim.counter.get_dtype()), tid
+            )
+
+        return idx_map
+
+    def _linear_thread_idx(self, coord: int):
+        block_size = BLOCK_DIM[coord]
+        block_idx = BLOCK_IDX[coord]
+        thread_idx = THREAD_IDX[coord]
+        return block_idx * block_size + thread_idx
+
+
+class Blockwise4DMapping(DenseThreadIdxMapping):
+    """Blockwise index mapping for up to 4D iteration spaces, where the outer three dimensions
+    are mapped to block indices."""
+
+    _indices_in_loop_order = [  # slowest to fastest
+        BLOCK_IDX[2],
+        BLOCK_IDX[1],
+        BLOCK_IDX[0],
+        THREAD_IDX[0],
+    ]
+
+    def __call__(self, ispace: FullIterationSpace) -> dict[PsSymbol, PsExpression]:
+        if ispace.rank > 4:
+            raise MaterializationError(
+                f"Cannot handle {ispace.rank}-dimensional iteration space "
+                "using the Blockwise4D GPU thread index mapping."
+            )
+
+        dimensions = ispace.dimensions_in_loop_order()
+        idx_map: dict[PsSymbol, PsExpression] = dict()
+
+            #   Pair dimensions and thread indices starting from the fastest coordinate,
+            #   so that the innermost dimension maps to the thread index even when the
+            #   iteration space has rank < 4.
+        for dim, tid in zip(dimensions[::-1], self._indices_in_loop_order[::-1]):
+            idx_map[dim.counter] = dim.start + dim.step * PsCast(
+                deconstify(dim.counter.get_dtype()), tid
+            )
+
+        return idx_map
+
+
 class CudaPlatform(GenericGpu):
     """Platform for CUDA-based GPUs."""
 
     def __init__(
-        self, ctx: KernelCreationContext,
+        self,
+        ctx: KernelCreationContext,
         omit_range_check: bool = False,
-        manual_launch_grid: bool = False,
+        dense_idx_mapping: DenseThreadIdxMapping | None = None,
     ) -> None:
         super().__init__(ctx)
 
         self._omit_range_check = omit_range_check
-        self._manual_launch_grid = manual_launch_grid
+        self._dense_idx_mapping = dense_idx_mapping
 
         self._typify = Typifier(ctx)
 
@@ -69,7 +137,7 @@ class CudaPlatform(GenericGpu):
 
     def materialize_iteration_space(
         self, body: PsBlock, ispace: IterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange | None]:
+    ) -> PsBlock:
         if isinstance(ispace, FullIterationSpace):
             return self._prepend_dense_translation(body, ispace)
         elif isinstance(ispace, SparseIterationSpace):
@@ -141,40 +209,41 @@ class CudaPlatform(GenericGpu):
 
     def _prepend_dense_translation(
         self, body: PsBlock, ispace: FullIterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange | None]:
+    ) -> PsBlock:
         dimensions = ispace.dimensions_in_loop_order()
 
-        if not self._manual_launch_grid:
-            try:
-                threads_range = self.threads_from_ispace(ispace)
-            except MaterializationError as e:
-                warn(
-                    str(e.args[0])
-                    + "\nIf this is intended, set `manual_launch_grid=True` in the code generator configuration.",
-                    UserWarning,
-                )
-                threads_range = None
-        else:
-            threads_range = None
+        #   TODO move to codegen
+        # if not self._manual_launch_grid:
+        #     try:
+        #         threads_range = self.threads_from_ispace(ispace)
+        #     except MaterializationError as e:
+        #         warn(
+        #             str(e.args[0])
+        #             + "\nIf this is intended, set `manual_launch_grid=True` in the code generator configuration.",
+        #             UserWarning,
+        #         )
+        #         threads_range = None
+        # else:
+        #     threads_range = None
+
+        idx_mapper = (
+            self._dense_idx_mapping
+            if self._dense_idx_mapping is not None
+            else Linear3DMapping()
+        )
+        ctr_mapping = idx_mapper(ispace)
 
         indexing_decls = []
         conds = []
-        for i, dim in enumerate(dimensions[::-1]):
+        for dim in dimensions:
             dim.counter.dtype = constify(dim.counter.get_dtype())
 
-            ctr = PsExpression.make(dim.counter)
+            ctr_expr = PsExpression.make(dim.counter)
             indexing_decls.append(
-                self._typify(
-                    PsDeclaration(
-                        ctr,
-                        dim.start
-                        + dim.step
-                        * PsCast(ctr.get_dtype(), self._linear_thread_idx(i)),
-                    )
-                )
+                self._typify(PsDeclaration(ctr_expr, ctr_mapping[dim.counter]))
             )
             if not self._omit_range_check:
-                conds.append(PsLt(ctr, dim.stop))
+                conds.append(PsLt(ctr_expr, dim.stop))
 
         indexing_decls = indexing_decls[::-1]
 
@@ -187,16 +256,16 @@ class CudaPlatform(GenericGpu):
             body.statements = indexing_decls + body.statements
             ast = body
 
-        return ast, threads_range
+        return ast
 
     def _prepend_sparse_translation(
         self, body: PsBlock, ispace: SparseIterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange]:
+    ) -> PsBlock:
         factory = AstFactory(self._ctx)
         ispace.sparse_counter.dtype = constify(ispace.sparse_counter.get_dtype())
 
         sparse_ctr = PsExpression.make(ispace.sparse_counter)
-        thread_idx = self._linear_thread_idx(0)
+        thread_idx = BLOCK_IDX[0] * BLOCK_DIM[0] + THREAD_IDX[0]
         sparse_idx_decl = self._typify(
             PsDeclaration(sparse_ctr, PsCast(sparse_ctr.get_dtype(), thread_idx))
         )
@@ -224,10 +293,4 @@ class CudaPlatform(GenericGpu):
             body.statements = [sparse_idx_decl] + body.statements
             ast = body
 
-        return ast, self.threads_from_ispace(ispace)
-
-    def _linear_thread_idx(self, coord: int):
-        block_size = BLOCK_DIM[coord]
-        block_idx = BLOCK_IDX[coord]
-        thread_idx = THREAD_IDX[coord]
-        return block_idx * block_size + thread_idx
+        return ast
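For reference, a minimal usage sketch of the new thread-index-mapping hook (not part of this patch). It assumes the absolute module path `pystencils.backend.platforms.cuda` implied by the file location above, and that a `KernelCreationContext` `ctx`, a `FullIterationSpace` `ispace`, and a kernel `body: PsBlock` have already been produced by the kernel creation driver; those names are placeholders, so the snippet is illustrative rather than directly runnable:

    from pystencils.backend.platforms.cuda import CudaPlatform, Blockwise4DMapping

    # Default behaviour: with dense_idx_mapping=None the platform falls back to
    # Linear3DMapping, i.e. each work item is addressed by the globally linearized
    # blockIdx * blockDim + threadIdx position.
    platform = CudaPlatform(ctx)

    # Opt into the blockwise mapping instead: the innermost dimension is driven by
    # threadIdx.x, the outer dimensions by block indices.
    platform = CudaPlatform(ctx, dense_idx_mapping=Blockwise4DMapping())

    # materialize_iteration_space now returns only the transformed body; the launch
    # grid is no longer derived here (see the TODO above about moving it to codegen).
    body = platform.materialize_iteration_space(body, ispace)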
diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py
index f22c3c99ba901c12b1e247c85364eb12da26a60b..f66f9aa0ee5ea350ba4304c03ced1ceae346282f 100644
--- a/src/pystencils/backend/platforms/generic_gpu.py
+++ b/src/pystencils/backend/platforms/generic_gpu.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 from typing import TYPE_CHECKING
-from abc import ABC, abstractmethod
+from abc import abstractmethod
 
 from ..ast.expressions import PsExpression
 from ..ast.structural import PsBlock
@@ -16,31 +16,11 @@ if TYPE_CHECKING:
     from ...codegen.kernel import GpuThreadsRange
 
 
-class WorkItemMapping(ABC):
-    """Signature for work-item mappings used to modify the thread index mapping behavior"""
-
-    @abstractmethod
-    def __call__(
-        self,
-        block_idx: tuple[PsExpression, PsExpression, PsExpression],
-        thread_idx: tuple[PsExpression, PsExpression, PsExpression],
-        ispace_rank: int,
-    ) -> tuple[PsExpression, ...]:
-        """Compute a work item index from the current block index, thread index, and iteration space dimensionality.
-        
-        Implementations of this method must return a tuple with `ispace_rank` entries,
-        containing expressions for the compressed index of the work item identified by the
-        given GPU block and thread index triples.
-        (The *compressed index* is the work item's index before application
-        of the iteration space's lower limits and strides.)
-        """
-
-
 class GenericGpu(Platform):
     @abstractmethod
     def materialize_iteration_space(
         self, body: PsBlock, ispace: IterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange | None]:
+    ) -> PsBlock:
         pass
 
     @classmethod
diff --git a/src/pystencils/backend/platforms/sycl.py b/src/pystencils/backend/platforms/sycl.py
index e1da9e2237b504c4b6b681d48812e4d079a5b463..7f2bbf9f6bf54a8c9cdbcd4a9989e54b9db66923 100644
--- a/src/pystencils/backend/platforms/sycl.py
+++ b/src/pystencils/backend/platforms/sycl.py
@@ -1,5 +1,4 @@
 from __future__ import annotations
-from typing import TYPE_CHECKING
 
 from ..functions import CFunction, PsMathFunction, MathFunctions
 from ..kernelcreation.iteration_space import (
@@ -29,9 +28,6 @@ from .generic_gpu import GenericGpu
 from ..exceptions import MaterializationError
 from ...types import PsCustomType, PsIeeeFloatType, constify, PsIntegerType
 
-if TYPE_CHECKING:
-    from ...codegen import GpuThreadsRange
-
 
 class SyclPlatform(GenericGpu):
 
@@ -39,7 +35,7 @@ class SyclPlatform(GenericGpu):
         self,
         ctx: KernelCreationContext,
         omit_range_check: bool = False,
-        automatic_block_size: bool = False
+        automatic_block_size: bool = False,
     ):
         super().__init__(ctx)
 
@@ -52,7 +48,7 @@ class SyclPlatform(GenericGpu):
 
     def materialize_iteration_space(
         self, body: PsBlock, ispace: IterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange]:
+    ) -> PsBlock:
         if isinstance(ispace, FullIterationSpace):
             return self._prepend_dense_translation(body, ispace)
         elif isinstance(ispace, SparseIterationSpace):
@@ -113,15 +109,13 @@ class SyclPlatform(GenericGpu):
 
     def _prepend_dense_translation(
         self, body: PsBlock, ispace: FullIterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange]:
+    ) -> PsBlock:
         rank = ispace.rank
         id_type = self._id_type(rank)
         id_symbol = PsExpression.make(self._ctx.get_symbol("id", id_type))
         id_decl = self._id_declaration(rank, id_symbol)
 
         dimensions = ispace.dimensions_in_loop_order()
-        launch_config = self.threads_from_ispace(ispace)
-
         indexing_decls = [id_decl]
         conds = []
 
@@ -153,11 +147,11 @@ class SyclPlatform(GenericGpu):
             body.statements = indexing_decls + body.statements
             ast = body
 
-        return ast, launch_config
+        return ast
 
     def _prepend_sparse_translation(
         self, body: PsBlock, ispace: SparseIterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange]:
+    ) -> PsBlock:
         factory = AstFactory(self._ctx)
 
         id_type = PsCustomType("sycl::id< 1 >", const=True)
@@ -195,7 +189,7 @@ class SyclPlatform(GenericGpu):
             body.statements = [sparse_idx_decl] + body.statements
             ast = body
 
-        return ast, self.threads_from_ispace(ispace)
+        return ast
 
     def _item_type(self, rank: int):
         if not self._automatic_block_size: