diff --git a/src/pystencils/__init__.py b/src/pystencils/__init__.py
index 051e02e7f59dfc1e17b86e03f2218772b2fa4163..761b11e99af82f1a8ca60673163fa9b511aa5153 100644
--- a/src/pystencils/__init__.py
+++ b/src/pystencils/__init__.py
@@ -18,6 +18,7 @@ from .config import (
 from .kernel_decorator import kernel, kernel_config
 from .kernelcreation import create_kernel
 from .backend.kernelfunction import KernelFunction
+from .backend.jit import no_jit
 from .slicing import make_slice
 from .spatial_coordinates import (
     x_,
@@ -51,6 +52,7 @@ __all__ = [
     "create_kernel",
     "KernelFunction",
     "Target",
+    "no_jit",
     "show_code",
     "to_dot",
     "get_code_obj",
diff --git a/src/pystencils/backend/__init__.py b/src/pystencils/backend/__init__.py
index 646fc3055a68c4ca3ff63035b2a72a80591e51a4..a0b1c8f747984e3fffde5a336f40e2aa46ad631d 100644
--- a/src/pystencils/backend/__init__.py
+++ b/src/pystencils/backend/__init__.py
@@ -5,6 +5,7 @@ from .kernelfunction import (
     FieldStrideParam,
     FieldPointerParam,
     KernelFunction,
+    GpuKernelFunction,
 )
 
 from .constraints import KernelParamsConstraint
@@ -16,5 +17,6 @@ __all__ = [
     "FieldStrideParam",
     "FieldPointerParam",
     "KernelFunction",
+    "GpuKernelFunction",
     "KernelParamsConstraint",
 ]
diff --git a/src/pystencils/backend/emission.py b/src/pystencils/backend/emission.py
index fdc81a47c2cec1c73fab8042f6cfb61ae2081dc5..9756d21f33550571666f0413ac6c4516b90eb0da 100644
--- a/src/pystencils/backend/emission.py
+++ b/src/pystencils/backend/emission.py
@@ -1,6 +1,8 @@
 from __future__ import annotations
 from enum import Enum
 
+from ..enums import Target
+
 from .ast.structural import (
     PsAstNode,
     PsBlock,
@@ -53,7 +55,7 @@ from .extensions.foreign_ast import PsForeignExpression
 from .symbols import PsSymbol
 from ..types import PsScalarType, PsArrayType
 
-from .kernelfunction import KernelFunction
+from .kernelfunction import KernelFunction, GpuKernelFunction
 
 
 __all__ = ["emit_code", "CAstPrinter"]
@@ -167,10 +169,13 @@ class CAstPrinter:
 
     def __call__(self, obj: PsAstNode | KernelFunction) -> str:
         if isinstance(obj, KernelFunction):
+            prefix = self._func_prefix(obj)
+
             params_str = ", ".join(
                 f"{p.dtype.c_string()} {p.name}" for p in obj.parameters
             )
-            decl = f"FUNC_PREFIX void {obj.name} ({params_str})"
+
+            decl = " ".join([prefix, "void", obj.name, f"({params_str})"])
             body_code = self.visit(obj.body, PrinterCtx())
             return f"{decl}\n{body_code}"
         else:
@@ -336,7 +341,7 @@ class CAstPrinter:
                 items_str = ", ".join(self.visit(item, pc) for item in items)
                 pc.pop_op()
                 return "{ " + items_str + " }"
-            
+
             case PsForeignExpression(children):
                 pc.push_op(Ops.Weakest, LR.Middle)
                 foreign_code = node.get_code(self.visit(c, pc) for c in children)
@@ -346,6 +351,12 @@ class CAstPrinter:
             case _:
                 raise NotImplementedError(f"Don't know how to print {node}")
 
+    def _func_prefix(self, func: KernelFunction) -> str:
+        """Return the qualifier emitted in front of a kernel's signature."""
+        #   Only GPU kernels targeting CUDA become `__global__`; others keep the macro
+        is_cuda = isinstance(func, GpuKernelFunction) and func.target == Target.CUDA
+        return "__global__" if is_cuda else "FUNC_PREFIX"
+
     def _symbol_decl(self, symb: PsSymbol):
         dtype = symb.get_dtype()
 
diff --git a/src/pystencils/backend/extensions/foreign_ast.py b/src/pystencils/backend/extensions/foreign_ast.py
index 55042ea835e827ebd8f4991cc13af3e5371f89ea..67362ce37e050558d16e06ebc6dbb6b30ff5c6e1 100644
--- a/src/pystencils/backend/extensions/foreign_ast.py
+++ b/src/pystencils/backend/extensions/foreign_ast.py
@@ -11,7 +11,7 @@ from ...types import PsType
 
 class PsForeignExpression(PsExpression, ABC):
     """Base class for foreign expressions.
-    
+
     Foreign expressions are expressions whose properties are not modelled by the pystencils AST,
     and which pystencils therefore does not understand.
 
@@ -24,9 +24,7 @@ class PsForeignExpression(PsExpression, ABC):
 
     __match_args__ = ("children",)
 
-    def __init__(
-        self, children: Iterable[PsExpression], dtype: PsType | None = None
-    ):
+    def __init__(self, children: Iterable[PsExpression], dtype: PsType | None = None):
         self._children = list(children)
         super().__init__(dtype)
 
diff --git a/src/pystencils/backend/kernelcreation/__init__.py b/src/pystencils/backend/kernelcreation/__init__.py
index 5de83caadb3b4aa50112ef2b65c28c1ca7932aae..abba9d9d8d571fa7540f82807d009e02d522849f 100644
--- a/src/pystencils/backend/kernelcreation/__init__.py
+++ b/src/pystencils/backend/kernelcreation/__init__.py
@@ -5,6 +5,7 @@ from .typification import Typifier
 from .ast_factory import AstFactory
 
 from .iteration_space import (
+    IterationSpace,
     FullIterationSpace,
     SparseIterationSpace,
     create_full_iteration_space,
@@ -19,6 +20,7 @@ __all__ = [
     "FreezeExpressions",
     "Typifier",
     "AstFactory",
+    "IterationSpace",
     "FullIterationSpace",
     "SparseIterationSpace",
     "create_full_iteration_space",
diff --git a/src/pystencils/backend/kernelcreation/iteration_space.py b/src/pystencils/backend/kernelcreation/iteration_space.py
index 38aca4efef935bce161453359c486c5383eeddc7..c505864508997fd3c02f531f1b653762e448a403 100644
--- a/src/pystencils/backend/kernelcreation/iteration_space.py
+++ b/src/pystencils/backend/kernelcreation/iteration_space.py
@@ -208,7 +208,7 @@ class FullIterationSpace(IterationSpace):
     @property
     def archetype_field(self) -> Field | None:
         return self._archetype_field
-    
+
     def dimensions_in_loop_order(self) -> Sequence[FullIterationSpace.Dimension]:
         """Return the dimensions of this iteration space ordered from the fastest to the slowest coordinate.
 
@@ -220,7 +220,9 @@ class FullIterationSpace(IterationSpace):
         else:
             return self._dimensions
 
-    def actual_iterations(self, dimension: int | FullIterationSpace.Dimension | None = None) -> PsExpression:
+    def actual_iterations(
+        self, dimension: int | FullIterationSpace.Dimension | None = None
+    ) -> PsExpression:
         if dimension is None:
             return reduce(
                 mul, (self.actual_iterations(d) for d in range(len(self.dimensions)))
diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index eb1576190fad13af01a8de2d352c79138f6b66ba..a389628368eb4f56dfb9de805c8a5cdb109cba27 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -1,22 +1,22 @@
 from pystencils.backend.functions import CFunction, PsMathFunction
-from pystencils.types.types import PsType
-from .platform import Platform
+from pystencils.backend.kernelcreation.context import KernelCreationContext
+from pystencils.types import PsType, constify
+from ..exceptions import MaterializationError
+from .generic_gpu import GenericGpu, GpuThreadsRange
 
-from ..kernelcreation.iteration_space import (
+from ..kernelcreation import (
+    Typifier,
     IterationSpace,
     FullIterationSpace,
-    # SparseIterationSpace,
+    SparseIterationSpace,
 )
 
-from ..ast.structural import PsBlock, PsConditional
-from ..ast.expressions import (
-    PsExpression,
-    PsLiteralExpr,
-    PsAdd,
-)
+from ..ast.structural import PsBlock, PsConditional, PsDeclaration
+from ..ast.expressions import PsExpression, PsLiteralExpr, PsCast
 from ..ast.expressions import PsLt, PsAnd
 from ...types import PsSignedIntegerType
 from ..literals import PsLiteral
+from ...config import GpuIndexingConfig
 
 int32 = PsSignedIntegerType(width=32, const=False)
 
@@ -34,7 +34,14 @@ GRID_DIM = [
 ]
 
 
-class CudaPlatform(Platform):
+class CudaPlatform(GenericGpu):
+
+    def __init__(
+        self, ctx: KernelCreationContext, indexing_cfg: GpuIndexingConfig | None
+    ) -> None:  #   a `None` config falls back to `GpuIndexingConfig()`
+        super().__init__(ctx)
+        self._typify = Typifier(ctx)
+        self._cfg = GpuIndexingConfig() if indexing_cfg is None else indexing_cfg
 
     @property
     def required_headers(self) -> set[str]:
@@ -42,20 +49,13 @@ class CudaPlatform(Platform):
 
     def materialize_iteration_space(
         self, body: PsBlock, ispace: IterationSpace
-    ) -> PsBlock:
+    ) -> tuple[PsBlock, GpuThreadsRange]:
         if isinstance(ispace, FullIterationSpace):
-            return self._guard_full_iteration_space(body, ispace)
+            return self._prepend_dense_translation(body, ispace)
+        elif isinstance(ispace, SparseIterationSpace):
+            return self._prepend_sparse_translation(body, ispace)
         else:
-            assert False, "unreachable code"
-
-    def cuda_indices(self, dim):
-        block_size = BLOCK_DIM
-        indices = [
-            block_index * bs + thread_idx
-            for block_index, bs, thread_idx in zip(BLOCK_IDX, block_size, THREAD_IDX)
-        ]
-
-        return indices[:dim]
+            raise MaterializationError(f"Unknown type of iteration space: {ispace}")
 
     def select_function(
         self, math_function: PsMathFunction, dtype: PsType
@@ -63,26 +63,57 @@ class CudaPlatform(Platform):
         raise NotImplementedError()
 
     #   Internals
-    def _guard_full_iteration_space(
+
+    def _prepend_dense_translation(
         self, body: PsBlock, ispace: FullIterationSpace
-    ) -> PsBlock:
+    ) -> tuple[PsBlock, GpuThreadsRange]:
+        dimensions = ispace.dimensions_in_loop_order()
+        launch_config = GpuThreadsRange.from_ispace(ispace)
+
+        indexing_decls = []
+        conds = []
+        for i, dim in enumerate(dimensions[::-1]):
+            dim.counter.dtype = constify(dim.counter.get_dtype())
+
+            ctr = PsExpression.make(dim.counter)
+            indexing_decls.append(
+                self._typify(
+                    PsDeclaration(
+                        ctr,
+                        dim.start
+                        + dim.step
+                        * PsCast(ctr.get_dtype(), self._linear_thread_idx(i)),
+                    )
+                )
+            )
+            if not self._cfg.omit_range_check:
+                conds.append(PsLt(ctr, dim.stop))
+
+        if conds:
+            condition: PsExpression = conds[0]
+            for cond in conds[1:]:
+                condition = PsAnd(condition, cond)
+            ast = PsBlock(indexing_decls + [PsConditional(condition, body)])
+        else:
+            body.statements = indexing_decls + body.statements
+            ast = body
 
-        dimensions = ispace.dimensions
+        return ast, launch_config
 
-        #   Determine loop order by permuting dimensions
-        archetype_field = ispace.archetype_field
-        if archetype_field is not None:
-            loop_order = archetype_field.layout
-            dimensions = [dimensions[coordinate] for coordinate in loop_order]
+    def _prepend_sparse_translation(
+        self, body: PsBlock, ispace: SparseIterationSpace
+    ) -> tuple[PsBlock, GpuThreadsRange]:
+        ispace.sparse_counter.dtype = constify(ispace.sparse_counter.get_dtype())
 
-        start = [
-            PsAdd(c, d.start)
-            for c, d in zip(self.cuda_indices(len(dimensions)), dimensions[::-1])
-        ]
-        conditions = [PsLt(c, d.stop) for c, d in zip(start, dimensions[::-1])]
+        ctr = PsExpression.make(ispace.sparse_counter)
+        thread_idx = self._linear_thread_idx(0)
+        idx_decl = self._typify(PsDeclaration(ctr, PsCast(ctr.get_dtype(), thread_idx)))
+        body.statements = [idx_decl] + body.statements
 
-        condition: PsExpression = conditions[0]
-        for c in conditions[1:]:
-            condition = PsAnd(condition, c)
+        return body, GpuThreadsRange.from_ispace(ispace)
 
-        return PsBlock([PsConditional(condition, body)])
+    def _linear_thread_idx(self, coord: int):
+        """Return `BLOCK_IDX[coord] * BLOCK_DIM[coord] + THREAD_IDX[coord]`."""
+        #   Look up all three per-coordinate entries, then combine them
+        bdim, bidx, tidx = BLOCK_DIM[coord], BLOCK_IDX[coord], THREAD_IDX[coord]
+        return bidx * bdim + tidx
diff --git a/src/pystencils/backend/platforms/sycl.py b/src/pystencils/backend/platforms/sycl.py
index 9ee33cc4e6e132be46df3fd9af6f32de240b2752..0eff7db02ce4f551ca3a68003d0021c393dc38a5 100644
--- a/src/pystencils/backend/platforms/sycl.py
+++ b/src/pystencils/backend/platforms/sycl.py
@@ -89,7 +89,7 @@ class SyclPlatform(GenericGpu):
 
         if conds:
             condition: PsExpression = conds[0]
-            for cond in conds:
+            for cond in conds[1:]:
                 condition = PsAnd(condition, cond)
             ast = PsBlock(indexing_decls + [PsConditional(condition, body)])
         else:
diff --git a/src/pystencils/enums.py b/src/pystencils/enums.py
index 276a0c44f4d1dcfcee330d87ca33a33f88c298bc..23c255ef0949e02ac5b0af57551ceec1bf6cfee2 100644
--- a/src/pystencils/enums.py
+++ b/src/pystencils/enums.py
@@ -71,13 +71,13 @@ class Target(Flag):
     found on the current machine and runtime environment.
     """
 
-    GenericCUDA = _GPU | _CUDA
+    CUDA = _GPU | _CUDA
     """Generic CUDA GPU target.
 
     Generate a CUDA kernel for a generic Nvidia GPU.
     """
 
-    GPU = GenericCUDA
+    GPU = CUDA
     """Alias for backward compatibility."""
 
     SYCL = _GPU | _SYCL
diff --git a/src/pystencils/kernelcreation.py b/src/pystencils/kernelcreation.py
index 62f5d2968fa605244aee4a955fb7f9c2977819eb..f7d33b1deda68fb1715aad79e4d6ca3abe2563ad 100644
--- a/src/pystencils/kernelcreation.py
+++ b/src/pystencils/kernelcreation.py
@@ -86,16 +86,30 @@ def create_kernel(
 
             platform = GenericCpu(ctx)
             kernel_ast = platform.materialize_iteration_space(kernel_body, ispace)
-        case Target.SYCL:
-            from .backend.platforms import SyclPlatform
 
-            platform = SyclPlatform(ctx, config.gpu_indexing)
+        case target if target.is_gpu():
+            match target:
+                case Target.SYCL:
+                    from .backend.platforms import SyclPlatform
+
+                    platform = SyclPlatform(ctx, config.gpu_indexing)
+                case Target.CUDA:
+                    from .backend.platforms import CudaPlatform
+
+                    platform = CudaPlatform(ctx, config.gpu_indexing)
+                case _:
+                    raise NotImplementedError(
+                        f"Code generation for target {target} not implemented"
+                    )
+
             kernel_ast, gpu_threads = platform.materialize_iteration_space(
                 kernel_body, ispace
             )
+
         case _:
-            #   TODO: CUDA/HIP platform
-            raise NotImplementedError("Target platform not implemented")
+            raise NotImplementedError(
+                f"Code generation for target {target} not implemented"
+            )
 
     #   Simplifying transformations
     elim_constants = EliminateConstants(ctx, extract_constant_exprs=True)