diff --git a/conftest.py b/conftest.py
index ff0467eff85d2b43854246ec98091a15ad347e63..7ea8f5ba0abebe731c229311e12d6346786f8345 100644
--- a/conftest.py
+++ b/conftest.py
@@ -43,10 +43,16 @@ def add_path_to_ignore(path):
     ]
 
 
-collect_ignore = [
-    os.path.join(SCRIPT_FOLDER, "doc", "conf.py"),
-    os.path.join(SCRIPT_FOLDER, "src", "pystencils", "opencl", "opencl.autoinit"),
-]
+def ignore_file(fp):
+    #   append to the module-level collect_ignore list defined below
+    collect_ignore.append(os.path.join(SCRIPT_FOLDER, fp))
+
+
+collect_ignore = []
+
+ignore_file("noxfile.py")
+ignore_file("docs/source/conf.py")
+add_path_to_ignore("docs/build")
 add_path_to_ignore("tests/benchmark")
 add_path_to_ignore("_local_tmp")
 
diff --git a/docs/source/backend/gpu_codegen.md b/docs/source/backend/gpu_codegen.md
index e5035d7c7b4f2f21699f04b3cc0a7245a7ba2ff4..3fe00840ed13faeb837d91711bca0be1a390aa4e 100644
--- a/docs/source/backend/gpu_codegen.md
+++ b/docs/source/backend/gpu_codegen.md
@@ -2,23 +2,19 @@
 
 The code generation infrastructure for Nvidia and AMD GPUs using CUDA and HIP comprises the following components:
 
- - The {any}`CudaPlatform` at `backend.platforms` which performs materialization of a kernel's iteration
-   space by mapping GPU block and thread indices to iteration space points. To perform this task,
-   it depends on a {any}`ThreadMapping` instance which defines the nature of that mapping.
+ - The platforms {any}`CudaPlatform` and {any}`HipPlatform` at `backend.platforms`,
+   which perform materialization of a kernel's iteration space
+   by mapping GPU block and thread indices to iteration space points.
+   To perform this task, they depend on a {any}`ThreadMapping` instance which defines the nature of that mapping.
    The platform also takes care of lowering mathematical functions to their CUDA runtime library implementation.
  - In the code generation driver, the strings are drawn by the `GpuIndexing` helper class.
    It provides both the {any}`ThreadMapping` for the codegen backend, as well as the launch configuration
    for the runtime system.
 
-:::{attention}
-
-Code generation for HIP through the `CudaPlatform` is experimental and not tested at the moment.
-:::
-
-## The CUDA Platform and Thread Mappings
+## The GPU Platform and Thread Mappings
 
 ```{eval-rst}
-.. module:: pystencils.backend.platforms.cuda
+.. module:: pystencils.backend.platforms.generic_gpu
 
 .. autosummary::
     :toctree: generated
diff --git a/docs/source/backend/platforms.md b/docs/source/backend/platforms.md
index e7ffc6f1523272621d273a17624d6323c25651b1..2d2c33d862627a0e92ca8afb466fee86cb78e3d5 100644
--- a/docs/source/backend/platforms.md
+++ b/docs/source/backend/platforms.md
@@ -26,7 +26,6 @@ targets in the future.
     Platform
     GenericCpu
     GenericVectorCpu
-    GenericGpu
 ```
 
 ## CPU Platforms
@@ -49,6 +48,18 @@ targets in the future.
     :nosignatures:
     :template: autosummary/entire_class.rst
 
+    GenericGpu
     CudaPlatform
-    SyclPlatform
+    HipPlatform
 ```
+
+## Experimental Platforms
+
+```{eval-rst}
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+    :template: autosummary/entire_class.rst
+
+    SyclPlatform
+```
\ No newline at end of file
diff --git a/docs/source/contributing/dev-workflow.md b/docs/source/contributing/dev-workflow.md
index 8daac8cbd179a9922d9d70ebbcee2cd7b5dbbba2..d9291613083439baeb226f4536844665a63d353f 100644
--- a/docs/source/contributing/dev-workflow.md
+++ b/docs/source/contributing/dev-workflow.md
@@ -48,16 +48,22 @@ git pull --set-upstream upstream master
 
 ## Set Up the Python Environment
 
+### Prerequisites
+
 To develop pystencils, you will need at least the following software installed on your machine:
 
 - Python 3.10 or later: Since pystencils minimal supported version is Python 3.10, we recommend that you work with Python 3.10 directly.
 - An up-to-date C++ compiler, used by pystencils to JIT-compile generated code
 - [Nox](https://nox.thea.codes/en/stable/), which we use for test automation.
   Nox will be used extensively in the instructions on testing below.
-- Optionally [CUDA](https://developer.nvidia.com/cuda-toolkit),
-  if you have an Nvidia or AMD GPU and plan to develop on pystencils' GPU capabilities
+- Optionally, for GPU development:
+  - At least CUDA 11 for Nvidia GPUs, or
+  - At least ROCm/HIP 6.1 for AMD GPUs.
+
+### Virtual Environment Setup
 
-Once you have these, set up a [virtual environment](https://docs.python.org/3/library/venv.html) for development.
+Once you have all the prerequisites,
+set up a [virtual environment](https://docs.python.org/3/library/venv.html) for development.
 This ensures that your system's installation of Python is kept clean, and isolates your development environment
 from outside influence.
 Use the following commands to create a virtual environment at `.venv` and perform an editable install of pystencils into it:
@@ -74,7 +80,39 @@ Setting `PIP_REQUIRE_VIRTUALENV` ensures that pip refuses to install packages gl
 Consider setting this variable globally in your shell's configuration file.
 :::
 
-You are now ready to go! Create a new git branch to work on, open up an IDE, and start coding.
+:::{admonition} Feature Groups
+The above installation instructions assume that you will be running all code checking
+and test tasks through `nox`.
+If you need or want to run them manually, you will need to add one or more
+of these feature groups to your installation:
+
+ - `doc`, which contains all dependencies required to build this documentation;
+ - `dev`, which adds `flake8` for code style checking,
+   `mypy` for static type checking,
+   and the `black` formatter;
+ - `testsuite`, which adds `pytest` plus plugins and some more dependencies required
+   for running the test suite.
+
+Depending on your development focus, you might also need to add some of the user feature
+groups listed in [the installation guide](#installation_guide).
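+
+As an example, an editable install that additionally pulls in the `dev` and
+`testsuite` groups might look like this:
+
+```bash
+pip install -e pystencils[dev,testsuite]
+```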
+:::
+
+### CuPy for CUDA and HIP
+
+When developing for Nvidia or AMD GPUs, you will likely need an installation of [CuPy](https://cupy.dev/).
+Since CuPy has to be built specifically against the libraries of a given CUDA or ROCm version,
+it cannot be installed directly via dependency resolution from pystencils.
+For instructions on how to install CuPy, refer to its [installation manual](https://docs.cupy.dev/en/stable/install.html).
+
+### Test Your Setup
+
+To verify that your setup is complete, invoke the pystencils test suite:
+
+```bash
+nox -s "testsuite(cpu)"
+```
+
+If this finishes without errors, you are ready to go! Create a new git branch to work on, open up an IDE, and start coding.
 Make sure your IDE recognizes the virtual environment you created, though.
 
 ## Static Code Analysis
diff --git a/docs/source/installation.md b/docs/source/installation.md
index deb2b0613564f98468f623544acf3cc1ca9d279e..8fdb5684fdfea60a117b090e4d3ab1976ef9d0b7 100644
--- a/docs/source/installation.md
+++ b/docs/source/installation.md
@@ -1,4 +1,4 @@
-(_installation)=
+(installation_guide)=
 # Setup and Installation
 
 ## Install pystencils
@@ -17,7 +17,7 @@ git clone -b v2.0-dev https://i10git.cs.fau.de/pycodegen/pystencils.git
 pip install -e pystencils
 ```
 
-### Feature Groups
+## Feature Groups
 
 In both cases, you can add a set of optional features to your installation by listing them
 in square brackets (e.g. `pip install -e pystencils[feature1, feature2]`).
@@ -33,25 +33,22 @@ The following feature sets are available:
 - `use_cython`: Install [Cython](https://cython.org/), which is used internally by pystencils
   to accelerate the setup of boundary conditions.
 
-:::{dropdown} For Developers
-
-If you are developing pystencils, we recommend you perform an editable install of your
-local clone of the repository, with all optional features:
-```bash
-pip install -e pystencils[alltrafos,interactive,use_cython,doc,testsuite]
-```
-
-This includes the additional feature groups `doc`, which contains all dependencies required
-to build this documentation, and `tests`, which adds `flake8` for code style checking,
-`mypy` for static type checking, and `pytest` plus plugins for running the test suite.
-
-For more information on developing pystencils, see the [](#contribution_guide).
-:::
-
-### For Nvidia GPUs
+## For GPUs
 
 If you have an Nvidia graphics processor and CUDA installed, you can use pystencils to directly compile
 and execute kernels running on your GPU.
-This requires a working installation of [cupy](https://cupy.dev).
+This requires a working installation of [CuPy](https://cupy.dev).
 Please refer to the cupy's [installation manual](https://docs.cupy.dev/en/stable/install.html)
 for details about installing cupy.
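+
+As a quick sketch, on a system with the CUDA 12 toolkit, the prebuilt wheel can
+typically be installed via:
+
+```bash
+pip install cupy-cuda12x
+```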
+
+You can also use CuPy together with AMD ROCm and HIP on AMD graphics cards,
+but the setup is slightly more involved; you might have to build CuPy from source.
+The CuPy documentation covers this in its [installation guide for CuPy on ROCm][cupy-rocm].
+
+:::{note}
+Since CuPy's support for ROCm is still an experimental feature at this time,
+just-in-time compilation of pystencils HIP kernels
+for the ROCm platform must also be considered *experimental*.
+:::
+
+[cupy-rocm]: https://docs.cupy.dev/en/stable/install.html#using-cupy-on-amd-gpu-experimental "Cupy on ROCm"
diff --git a/docs/source/user_manual/gpu_kernels.md b/docs/source/user_manual/gpu_kernels.md
index 48a24e703584e8c87365550c7a7d5d8338feb717..7a3d54f6f0a41fefdb4f841351061ba3d383d74b 100644
--- a/docs/source/user_manual/gpu_kernels.md
+++ b/docs/source/user_manual/gpu_kernels.md
@@ -26,23 +26,46 @@ import matplotlib.pyplot as plt
 (guide_gpukernels)=
 # Pystencils for GPUs
 
-Pystencils offers code generation for Nvidia GPUs using the CUDA programming model,
+Pystencils offers code generation for Nvidia and AMD GPUs
+using the CUDA and HIP programming models,
 as well as just-in-time compilation and execution of CUDA kernels from within Python
 based on the [cupy] library.
 This section's objective is to give a detailed introduction into the creation of
 GPU kernels with pystencils.
 
-## Generate, Compile and Run CUDA Kernels
+:::{note}
+[CuPy][cupy] is a Python library for numerical computations on GPU arrays,
+which operates much in the same way that [NumPy][numpy] works on CPU arrays.
+CuPy and NumPy expose nearly the same APIs for array operations;
+the difference being that CuPy allocates all its arrays on the GPU
+and performs its operations as CUDA or HIP kernels.
+Also, CuPy exposes a just-in-time compiler for GPU kernels.
+In pystencils, we use CuPy both to compile and provide executable kernels on-demand from within Python code,
+and to allocate and manage the data these kernels can be executed on.
+
+For more information on CuPy, refer to [their documentation][cupy-docs].
+:::
+
+## Generate, Compile and Run GPU Kernels
+
+The CUDA and HIP platforms are made available in pystencils via the code generation targets
+{any}`Target.CUDA` and {any}`Target.HIP`.
+For pystencils code to be portable between both, we can use {any}`Target.CurrentGPU` to
+automatically select one or the other, depending on the current runtime environment.
+
+:::{note}
+If `cupy` is not installed, `create_kernel` will raise an exception when using `Target.CurrentGPU`.
+You can still generate kernels for CUDA or HIP directly even without CuPy;
+you just won't be able to just-in-time compile and run them.
+:::
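+
+For instance, this sketch always produces a HIP kernel, regardless of the
+locally available GPU, and works even in an environment without CuPy
+(only `kernel.compile()` would fail there):
+
+```python
+f, g = ps.fields("f, g: float64[3D]")
+update = ps.Assignment(f.center(), 2 * g.center())
+
+cfg = ps.CreateKernelConfig(target=ps.Target.HIP)  # explicit HIP target
+kernel = ps.create_kernel(update, cfg)
+```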
 
-In order to obtain a CUDA implementation of a symbolic kernel, naught more is required
-than setting the {any}`target <CreateKernelConfig.target>` code generator option to
-{any}`Target.CUDA`:
+Here is a snippet creating a kernel for the locally available GPU target:
 
 ```{code-cell} ipython3
 f, g = ps.fields("f, g: float64[3D]")
 update = ps.Assignment(f.center(), 2 * g.center())
 
-cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
+cfg = ps.CreateKernelConfig(target=ps.Target.CurrentGPU)
 kernel = ps.create_kernel(update, cfg)
 
 ps.inspect(kernel)
@@ -68,19 +91,6 @@ kfunc = kernel.compile()
 kfunc(f=f_arr, g=g_arr)
 ```
 
-:::{note}
-[CuPy][cupy] is a Python library for numerical computations on GPU arrays,
-which operates much in the same way that [NumPy][numpy] works on CPU arrays.
-Cupy and NumPy expose nearly the same APIs for array operations;
-the difference being that CuPy allocates all its arrays on the GPU
-and performs its operations as CUDA kernels.
-Also, CuPy exposes a just-in-time-compiler for GPU kernels, which internally calls [nvrtc].
-In pystencils, we use CuPy both to compile and provide executable kernels on-demand from within Python code,
-and to allocate and manage the data these kernels can be executed on.
-
-For more information on CuPy, refer to [their documentation][cupy-docs].
-:::
-
 (indexing_and_launch_config)=
 ## Modify the Indexing Scheme and Launch Configuration
 
@@ -233,7 +243,7 @@ assignments = [
 ```{code-cell} ipython3
 y = ps.DEFAULTS.spatial_counters[0]
 cfg = ps.CreateKernelConfig()
-cfg.target= ps.Target.CUDA
+cfg.target= ps.Target.CurrentGPU
 cfg.iteration_slice = ps.make_slice[:, y:]
 ```
 
@@ -286,5 +296,4 @@ only a part of the triangle is being processed.
 
 [cupy]: https://cupy.dev "CuPy Homepage"
 [numpy]: https://numpy.org "NumPy Homepage"
-[nvrtc]: https://docs.nvidia.com/cuda/nvrtc/index.html "NVIDIA Runtime Compilation Library"
 [cupy-docs]: https://docs.cupy.dev/en/stable/overview.html "CuPy Documentation"
diff --git a/pyproject.toml b/pyproject.toml
index b3c6b1c0238654bedd1ba90b09c8d98541e1c73f..ae539b12c7399694f46af0261851de5d68bcc86e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -29,7 +29,6 @@ classifiers = [
 "Source Code" = "https://i10git.cs.fau.de/pycodegen/pystencils"
 
 [project.optional-dependencies]
-gpu = ['cupy']
 alltrafos = ['islpy', 'py-cpuinfo']
 bench_db = ['blitzdb', 'pymongo', 'pandas']
 interactive = [
@@ -76,7 +75,7 @@ testsuite = [
     'matplotlib',
     'py-cpuinfo',
     'randomgen>=1.18',
-    'scipy'
+    'scipy',
 ]
 
 [build-system]
@@ -89,7 +88,8 @@ build-backend = "setuptools.build_meta"
 
 [tool.setuptools.package-data]
 pystencils = [
-    "include/*.h",
+    "include/**/*.h",
+    "include/**/*.hpp",
     "jit/cpu/*.tmpl.cpp",
     "boundaries/createindexlistcython.pyx"
 ]
diff --git a/src/pystencils/backend/emission/base_printer.py b/src/pystencils/backend/emission/base_printer.py
index cc4b50e217d68a4bba28c2c95705464a91182212..c4ac0640c44a222f3fbecef9086fdbd36e68f8bc 100644
--- a/src/pystencils/backend/emission/base_printer.py
+++ b/src/pystencils/backend/emission/base_printer.py
@@ -57,7 +57,6 @@ from ..extensions.foreign_ast import PsForeignExpression
 from ..memory import PsSymbol
 from ..constants import PsConstant
 from ...types import PsType
-from ...codegen import Target
 
 if TYPE_CHECKING:
     from ...codegen import Kernel
@@ -383,7 +382,7 @@ class BasePrinter(ABC):
         from ...codegen import GpuKernel
         
         sig_parts = [self._func_prefix] if self._func_prefix is not None else []
-        if isinstance(func, GpuKernel) and func.target == Target.CUDA:
+        if isinstance(func, GpuKernel) and func.target.is_gpu():
             sig_parts.append("__global__")
         sig_parts += ["void", func.name, f"({params_str})"]
         signature = " ".join(sig_parts)
diff --git a/src/pystencils/backend/platforms/__init__.py b/src/pystencils/backend/platforms/__init__.py
index 589841db87efb598ffeed20d4d11db7ffcd452cc..3b602964bf2442823c9965f3398517635c8cc217 100644
--- a/src/pystencils/backend/platforms/__init__.py
+++ b/src/pystencils/backend/platforms/__init__.py
@@ -2,6 +2,7 @@ from .platform import Platform
 from .generic_cpu import GenericCpu, GenericVectorCpu
 from .generic_gpu import GenericGpu
 from .cuda import CudaPlatform
+from .hip import HipPlatform
 from .x86 import X86VectorCpu, X86VectorArch
 from .sycl import SyclPlatform
 
@@ -13,5 +14,6 @@ __all__ = [
     "X86VectorArch",
     "GenericGpu",
     "CudaPlatform",
+    "HipPlatform",
     "SyclPlatform",
 ]
diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index c97fad413198c16b87d6c27949e70d6aa59a3aa1..98ff3e3d332a46074931514ba3af1603dc6318b2 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -1,318 +1,11 @@
 from __future__ import annotations
-from abc import ABC, abstractmethod
 
-from ...types import constify, deconstify
-from ..exceptions import MaterializationError
 from .generic_gpu import GenericGpu
 
-from ..memory import PsSymbol
-from ..kernelcreation import (
-    Typifier,
-    IterationSpace,
-    FullIterationSpace,
-    SparseIterationSpace,
-    AstFactory,
-)
-
-from ..kernelcreation.context import KernelCreationContext
-from ..ast.structural import PsBlock, PsConditional, PsDeclaration
-from ..ast.expressions import (
-    PsExpression,
-    PsLiteralExpr,
-    PsCast,
-    PsCall,
-    PsLookup,
-    PsBufferAcc,
-)
-from ..ast.expressions import PsLt, PsAnd
-from ...types import PsSignedIntegerType, PsIeeeFloatType
-from ..literals import PsLiteral
-from ..functions import PsMathFunction, MathFunctions, CFunction
-
-
-int32 = PsSignedIntegerType(width=32, const=False)
-
-BLOCK_IDX = [
-    PsLiteralExpr(PsLiteral(f"blockIdx.{coord}", int32)) for coord in ("x", "y", "z")
-]
-THREAD_IDX = [
-    PsLiteralExpr(PsLiteral(f"threadIdx.{coord}", int32)) for coord in ("x", "y", "z")
-]
-BLOCK_DIM = [
-    PsLiteralExpr(PsLiteral(f"blockDim.{coord}", int32)) for coord in ("x", "y", "z")
-]
-GRID_DIM = [
-    PsLiteralExpr(PsLiteral(f"gridDim.{coord}", int32)) for coord in ("x", "y", "z")
-]
-
-
-class ThreadMapping(ABC):
-
-    @abstractmethod
-    def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]:
-        """Map the current thread index onto a point in the given iteration space.
-
-        Implementations of this method must return a declaration for each dimension counter
-        of the given iteration space.
-        """
-
-
-class Linear3DMapping(ThreadMapping):
-    """3D globally linearized mapping, where each thread is assigned a work item according to
-    its location in the global launch grid."""
-
-    def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]:
-        match ispace:
-            case FullIterationSpace():
-                return self._dense_mapping(ispace)
-            case SparseIterationSpace():
-                return self._sparse_mapping(ispace)
-            case _:
-                assert False, "unexpected iteration space"
-
-    def _dense_mapping(
-        self, ispace: FullIterationSpace
-    ) -> dict[PsSymbol, PsExpression]:
-        if ispace.rank > 3:
-            raise MaterializationError(
-                f"Cannot handle {ispace.rank}-dimensional iteration space "
-                "using the Linear3D GPU thread index mapping."
-            )
-
-        dimensions = ispace.dimensions_in_loop_order()
-        idx_map: dict[PsSymbol, PsExpression] = dict()
-
-        for coord, dim in enumerate(dimensions[::-1]):
-            tid = self._linear_thread_idx(coord)
-            idx_map[dim.counter] = dim.start + dim.step * PsCast(
-                deconstify(dim.counter.get_dtype()), tid
-            )
-
-        return idx_map
-
-    def _sparse_mapping(
-        self, ispace: SparseIterationSpace
-    ) -> dict[PsSymbol, PsExpression]:
-        sparse_ctr = PsExpression.make(ispace.sparse_counter)
-        thread_idx = self._linear_thread_idx(0)
-        idx_map: dict[PsSymbol, PsExpression] = {
-            ispace.sparse_counter: PsCast(
-                deconstify(sparse_ctr.get_dtype()), thread_idx
-            )
-        }
-        return idx_map
-
-    def _linear_thread_idx(self, coord: int):
-        block_size = BLOCK_DIM[coord]
-        block_idx = BLOCK_IDX[coord]
-        thread_idx = THREAD_IDX[coord]
-        return block_idx * block_size + thread_idx
-
-
-class Blockwise4DMapping(ThreadMapping):
-    """Blockwise index mapping for up to 4D iteration spaces, where the outer three dimensions
-    are mapped to block indices."""
-
-    _indices_fastest_first = [  # slowest to fastest
-        THREAD_IDX[0],
-        BLOCK_IDX[0],
-        BLOCK_IDX[1],
-        BLOCK_IDX[2],
-    ]
-
-    def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]:
-        match ispace:
-            case FullIterationSpace():
-                return self._dense_mapping(ispace)
-            case SparseIterationSpace():
-                return self._sparse_mapping(ispace)
-            case _:
-                assert False, "unexpected iteration space"
-
-    def _dense_mapping(
-        self, ispace: FullIterationSpace
-    ) -> dict[PsSymbol, PsExpression]:
-        if ispace.rank > 4:
-            raise MaterializationError(
-                f"Cannot handle {ispace.rank}-dimensional iteration space "
-                "using the Blockwise4D GPU thread index mapping."
-            )
-
-        dimensions = ispace.dimensions_in_loop_order()
-        idx_map: dict[PsSymbol, PsExpression] = dict()
-
-        for dim, tid in zip(dimensions[::-1], self._indices_fastest_first):
-            idx_map[dim.counter] = dim.start + dim.step * PsCast(
-                deconstify(dim.counter.get_dtype()), tid
-            )
-
-        return idx_map
-
-    def _sparse_mapping(
-        self, ispace: SparseIterationSpace
-    ) -> dict[PsSymbol, PsExpression]:
-        sparse_ctr = PsExpression.make(ispace.sparse_counter)
-        thread_idx = self._indices_fastest_first[0]
-        idx_map: dict[PsSymbol, PsExpression] = {
-            ispace.sparse_counter: PsCast(
-                deconstify(sparse_ctr.get_dtype()), thread_idx
-            )
-        }
-        return idx_map
-
 
 class CudaPlatform(GenericGpu):
-    """Platform for CUDA-based GPUs.
-
-    Args:
-        ctx: The kernel creation context
-        thread_mapping: Callback object which defines the mapping of thread indices onto iteration space points
-    """
-
-    def __init__(
-        self,
-        ctx: KernelCreationContext,
-        thread_mapping: ThreadMapping | None = None,
-    ) -> None:
-        super().__init__(ctx)
-
-        self._thread_mapping = (
-            thread_mapping if thread_mapping is not None else Linear3DMapping()
-        )
-
-        self._typify = Typifier(ctx)
+    """Platform for the CUDA GPU taret."""
 
     @property
     def required_headers(self) -> set[str]:
-        return {'"pystencils_runtime/hip.h"'}  # TODO: move to HipPlatform once it is introduced
-
-    def materialize_iteration_space(
-        self, body: PsBlock, ispace: IterationSpace
-    ) -> PsBlock:
-        if isinstance(ispace, FullIterationSpace):
-            return self._prepend_dense_translation(body, ispace)
-        elif isinstance(ispace, SparseIterationSpace):
-            return self._prepend_sparse_translation(body, ispace)
-        else:
-            raise MaterializationError(f"Unknown type of iteration space: {ispace}")
-
-    def select_function(self, call: PsCall) -> PsExpression:
-        assert isinstance(call.function, PsMathFunction)
-
-        func = call.function.func
-        dtype = call.get_dtype()
-        arg_types = (dtype,) * func.num_args
-
-        if isinstance(dtype, PsIeeeFloatType):
-            match func:
-                case (
-                    MathFunctions.Exp
-                    | MathFunctions.Log
-                    | MathFunctions.Sin
-                    | MathFunctions.Cos
-                    | MathFunctions.Sqrt
-                    | MathFunctions.Ceil
-                    | MathFunctions.Floor
-                ) if dtype.width in (16, 32, 64):
-                    prefix = "h" if dtype.width == 16 else ""
-                    suffix = "f" if dtype.width == 32 else ""
-                    name = f"{prefix}{func.function_name}{suffix}"
-                    cfunc = CFunction(name, arg_types, dtype)
-
-                case (
-                    MathFunctions.Pow
-                    | MathFunctions.Tan
-                    | MathFunctions.Sinh
-                    | MathFunctions.Cosh
-                    | MathFunctions.ASin
-                    | MathFunctions.ACos
-                    | MathFunctions.ATan
-                    | MathFunctions.ATan2
-                ) if dtype.width in (32, 64):
-                    #   These are unavailable for fp16
-                    suffix = "f" if dtype.width == 32 else ""
-                    name = f"{func.function_name}{suffix}"
-                    cfunc = CFunction(name, arg_types, dtype)
-
-                case (
-                    MathFunctions.Min | MathFunctions.Max | MathFunctions.Abs
-                ) if dtype.width in (32, 64):
-                    suffix = "f" if dtype.width == 32 else ""
-                    name = f"f{func.function_name}{suffix}"
-                    cfunc = CFunction(name, arg_types, dtype)
-
-                case MathFunctions.Abs if dtype.width == 16:
-                    cfunc = CFunction(" __habs", arg_types, dtype)
-
-                case _:
-                    raise MaterializationError(
-                        f"Cannot materialize call to function {func}"
-                    )
-
-            call.function = cfunc
-            return call
-
-        raise MaterializationError(
-            f"No implementation available for function {func} on data type {dtype}"
-        )
-
-    #   Internals
-
-    def _prepend_dense_translation(
-        self, body: PsBlock, ispace: FullIterationSpace
-    ) -> PsBlock:
-        ctr_mapping = self._thread_mapping(ispace)
-
-        indexing_decls = []
-        conds = []
-
-        dimensions = ispace.dimensions_in_loop_order()
-
-        for dim in dimensions:
-            # counter declarations must be ordered slowest-to-fastest
-            # such that inner dimensions can depend on outer ones
-
-            dim.counter.dtype = constify(dim.counter.get_dtype())
-
-            ctr_expr = PsExpression.make(dim.counter)
-            indexing_decls.append(
-                self._typify(PsDeclaration(ctr_expr, ctr_mapping[dim.counter]))
-            )
-            conds.append(PsLt(ctr_expr, dim.stop))
-
-        condition: PsExpression = conds[0]
-        for cond in conds[1:]:
-            condition = PsAnd(condition, cond)
-        return PsBlock(indexing_decls + [PsConditional(condition, body)])
-
-    def _prepend_sparse_translation(
-        self, body: PsBlock, ispace: SparseIterationSpace
-    ) -> PsBlock:
-        factory = AstFactory(self._ctx)
-        ispace.sparse_counter.dtype = constify(ispace.sparse_counter.get_dtype())
-
-        sparse_ctr_expr = PsExpression.make(ispace.sparse_counter)
-        ctr_mapping = self._thread_mapping(ispace)
-
-        sparse_idx_decl = self._typify(
-            PsDeclaration(sparse_ctr_expr, ctr_mapping[ispace.sparse_counter])
-        )
-
-        mappings = [
-            PsDeclaration(
-                PsExpression.make(ctr),
-                PsLookup(
-                    PsBufferAcc(
-                        ispace.index_list.base_pointer,
-                        (sparse_ctr_expr.clone(), factory.parse_index(0)),
-                    ),
-                    coord.name,
-                ),
-            )
-            for ctr, coord in zip(ispace.spatial_indices, ispace.coordinate_members)
-        ]
-        body.statements = mappings + body.statements
-
-        stop = PsExpression.make(ispace.index_list.shape[0])
-        condition = PsLt(sparse_ctr_expr.clone(), stop)
-        return PsBlock([sparse_idx_decl, PsConditional(condition, body)])
+        return set()
diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py
index b5b35c8b03447f1d5c35ed1289b89542bb1127ca..11425d9238e8270b06a628555908622b623931d6 100644
--- a/src/pystencils/backend/platforms/generic_gpu.py
+++ b/src/pystencils/backend/platforms/generic_gpu.py
@@ -1,7 +1,320 @@
 from __future__ import annotations
+from abc import ABC, abstractmethod
 
+from ...types import constify, deconstify
+from ..exceptions import MaterializationError
 from .platform import Platform
 
+from ..memory import PsSymbol
+from ..kernelcreation import (
+    Typifier,
+    IterationSpace,
+    FullIterationSpace,
+    SparseIterationSpace,
+    AstFactory,
+)
+
+from ..kernelcreation.context import KernelCreationContext
+from ..ast.structural import PsBlock, PsConditional, PsDeclaration
+from ..ast.expressions import (
+    PsExpression,
+    PsLiteralExpr,
+    PsCast,
+    PsCall,
+    PsLookup,
+    PsBufferAcc,
+)
+from ..ast.expressions import PsLt, PsAnd
+from ...types import PsSignedIntegerType, PsIeeeFloatType
+from ..literals import PsLiteral
+from ..functions import PsMathFunction, MathFunctions, CFunction
+
+
+int32 = PsSignedIntegerType(width=32, const=False)
+
+BLOCK_IDX = [
+    PsLiteralExpr(PsLiteral(f"blockIdx.{coord}", int32)) for coord in ("x", "y", "z")
+]
+THREAD_IDX = [
+    PsLiteralExpr(PsLiteral(f"threadIdx.{coord}", int32)) for coord in ("x", "y", "z")
+]
+BLOCK_DIM = [
+    PsLiteralExpr(PsLiteral(f"blockDim.{coord}", int32)) for coord in ("x", "y", "z")
+]
+GRID_DIM = [
+    PsLiteralExpr(PsLiteral(f"gridDim.{coord}", int32)) for coord in ("x", "y", "z")
+]
+
+
+class ThreadMapping(ABC):
+
+    @abstractmethod
+    def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]:
+        """Map the current thread index onto a point in the given iteration space.
+
+        Implementations of this method must return a declaration for each dimension counter
+        of the given iteration space.
+        """
+
+
+class Linear3DMapping(ThreadMapping):
+    """3D globally linearized mapping, where each thread is assigned a work item according to
+    its location in the global launch grid."""
+
+    def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]:
+        match ispace:
+            case FullIterationSpace():
+                return self._dense_mapping(ispace)
+            case SparseIterationSpace():
+                return self._sparse_mapping(ispace)
+            case _:
+                assert False, "unexpected iteration space"
+
+    def _dense_mapping(
+        self, ispace: FullIterationSpace
+    ) -> dict[PsSymbol, PsExpression]:
+        if ispace.rank > 3:
+            raise MaterializationError(
+                f"Cannot handle {ispace.rank}-dimensional iteration space "
+                "using the Linear3D GPU thread index mapping."
+            )
+
+        dimensions = ispace.dimensions_in_loop_order()
+        idx_map: dict[PsSymbol, PsExpression] = dict()
+
+        for coord, dim in enumerate(dimensions[::-1]):
+            tid = self._linear_thread_idx(coord)
+            idx_map[dim.counter] = dim.start + dim.step * PsCast(
+                deconstify(dim.counter.get_dtype()), tid
+            )
+
+        return idx_map
+
+    def _sparse_mapping(
+        self, ispace: SparseIterationSpace
+    ) -> dict[PsSymbol, PsExpression]:
+        sparse_ctr = PsExpression.make(ispace.sparse_counter)
+        thread_idx = self._linear_thread_idx(0)
+        idx_map: dict[PsSymbol, PsExpression] = {
+            ispace.sparse_counter: PsCast(
+                deconstify(sparse_ctr.get_dtype()), thread_idx
+            )
+        }
+        return idx_map
+
+    def _linear_thread_idx(self, coord: int):
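+        #   Global linear thread index along this coordinate:
+        #   blockIdx * blockDim + threadIdx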
+        block_size = BLOCK_DIM[coord]
+        block_idx = BLOCK_IDX[coord]
+        thread_idx = THREAD_IDX[coord]
+        return block_idx * block_size + thread_idx
+
+
+class Blockwise4DMapping(ThreadMapping):
+    """Blockwise index mapping for up to 4D iteration spaces, where the outer three dimensions
+    are mapped to block indices."""
+
+    _indices_fastest_first = [  # slowest to fastest
+        THREAD_IDX[0],
+        BLOCK_IDX[0],
+        BLOCK_IDX[1],
+        BLOCK_IDX[2]
+    ]
+
+    def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]:
+        match ispace:
+            case FullIterationSpace():
+                return self._dense_mapping(ispace)
+            case SparseIterationSpace():
+                return self._sparse_mapping(ispace)
+            case _:
+                assert False, "unexpected iteration space"
+
+    def _dense_mapping(
+        self, ispace: FullIterationSpace
+    ) -> dict[PsSymbol, PsExpression]:
+        if ispace.rank > 4:
+            raise MaterializationError(
+                f"Cannot handle {ispace.rank}-dimensional iteration space "
+                "using the Blockwise4D GPU thread index mapping."
+            )
+
+        dimensions = ispace.dimensions_in_loop_order()
+        idx_map: dict[PsSymbol, PsExpression] = dict()
+
+        for dim, tid in zip(dimensions[::-1], self._indices_fastest_first):
+            idx_map[dim.counter] = dim.start + dim.step * PsCast(
+                deconstify(dim.counter.get_dtype()), tid
+            )
+
+        return idx_map
+
+    def _sparse_mapping(
+        self, ispace: SparseIterationSpace
+    ) -> dict[PsSymbol, PsExpression]:
+        sparse_ctr = PsExpression.make(ispace.sparse_counter)
+        thread_idx = self._indices_fastest_first[0]
+        idx_map: dict[PsSymbol, PsExpression] = {
+            ispace.sparse_counter: PsCast(
+                deconstify(sparse_ctr.get_dtype()), thread_idx
+            )
+        }
+        return idx_map
+
 
 class GenericGpu(Platform):
-    """Base class for GPU platforms."""
+    """Common base platform for CUDA- and HIP-type GPU targets.
+    
+    Args:
+        ctx: The kernel creation context
+        omit_range_check: If `True`, generated index translation code will not check if the point identified
+            by block and thread indices is actually contained in the iteration space
+        thread_mapping: Callback object which defines the mapping of thread indices onto iteration space points
+    """
+
+    def __init__(
+        self,
+        ctx: KernelCreationContext,
+        thread_mapping: ThreadMapping | None = None,
+    ) -> None:
+        super().__init__(ctx)
+
+        self._thread_mapping = (
+            thread_mapping if thread_mapping is not None else Linear3DMapping()
+        )
+
+        self._typify = Typifier(ctx)
+
+    def materialize_iteration_space(
+        self, body: PsBlock, ispace: IterationSpace
+    ) -> PsBlock:
+        if isinstance(ispace, FullIterationSpace):
+            return self._prepend_dense_translation(body, ispace)
+        elif isinstance(ispace, SparseIterationSpace):
+            return self._prepend_sparse_translation(body, ispace)
+        else:
+            raise MaterializationError(f"Unknown type of iteration space: {ispace}")
+
+    def select_function(self, call: PsCall) -> PsExpression:
+        assert isinstance(call.function, PsMathFunction)
+
+        func = call.function.func
+        dtype = call.get_dtype()
+        arg_types = (dtype,) * func.num_args
+
+        if isinstance(dtype, PsIeeeFloatType):
+            match func:
+                case (
+                    MathFunctions.Exp
+                    | MathFunctions.Log
+                    | MathFunctions.Sin
+                    | MathFunctions.Cos
+                    | MathFunctions.Sqrt
+                    | MathFunctions.Ceil
+                    | MathFunctions.Floor
+                ) if dtype.width in (16, 32, 64):
+                    prefix = "h" if dtype.width == 16 else ""
+                    suffix = "f" if dtype.width == 32 else ""
+                    name = f"{prefix}{func.function_name}{suffix}"
+                    cfunc = CFunction(name, arg_types, dtype)
+
+                case (
+                    MathFunctions.Pow
+                    | MathFunctions.Tan
+                    | MathFunctions.Sinh
+                    | MathFunctions.Cosh
+                    | MathFunctions.ASin
+                    | MathFunctions.ACos
+                    | MathFunctions.ATan
+                    | MathFunctions.ATan2
+                ) if dtype.width in (32, 64):
+                    #   These are unavailable for fp16
+                    suffix = "f" if dtype.width == 32 else ""
+                    name = f"{func.function_name}{suffix}"
+                    cfunc = CFunction(name, arg_types, dtype)
+
+                case (
+                    MathFunctions.Min | MathFunctions.Max | MathFunctions.Abs
+                ) if dtype.width in (32, 64):
+                    suffix = "f" if dtype.width == 32 else ""
+                    name = f"f{func.function_name}{suffix}"
+                    cfunc = CFunction(name, arg_types, dtype)
+
+                case MathFunctions.Abs if dtype.width == 16:
+                    cfunc = CFunction(" __habs", arg_types, dtype)
+
+                case _:
+                    raise MaterializationError(
+                        f"Cannot materialize call to function {func}"
+                    )
+
+            call.function = cfunc
+            return call
+
+        raise MaterializationError(
+            f"No implementation available for function {func} on data type {dtype}"
+        )
+
+    #   Internals
+
+    def _prepend_dense_translation(
+        self, body: PsBlock, ispace: FullIterationSpace
+    ) -> PsBlock:
+        ctr_mapping = self._thread_mapping(ispace)
+
+        indexing_decls = []
+        conds = []
+
+        dimensions = ispace.dimensions_in_loop_order()
+
+        for dim in dimensions:
+            # counter declarations must be ordered slowest-to-fastest
+            # such that inner dimensions can depend on outer ones
+
+            dim.counter.dtype = constify(dim.counter.get_dtype())
+
+            ctr_expr = PsExpression.make(dim.counter)
+            indexing_decls.append(
+                self._typify(PsDeclaration(ctr_expr, ctr_mapping[dim.counter]))
+            )
+            conds.append(PsLt(ctr_expr, dim.stop))
+
+        condition: PsExpression = conds[0]
+        for cond in conds[1:]:
+            condition = PsAnd(condition, cond)
+        ast = PsBlock(indexing_decls + [PsConditional(condition, body)])
+
+        return ast
+
+    def _prepend_sparse_translation(
+        self, body: PsBlock, ispace: SparseIterationSpace
+    ) -> PsBlock:
+        factory = AstFactory(self._ctx)
+        ispace.sparse_counter.dtype = constify(ispace.sparse_counter.get_dtype())
+
+        sparse_ctr_expr = PsExpression.make(ispace.sparse_counter)
+        ctr_mapping = self._thread_mapping(ispace)
+
+        sparse_idx_decl = self._typify(
+            PsDeclaration(sparse_ctr_expr, ctr_mapping[ispace.sparse_counter])
+        )
+
+        mappings = [
+            PsDeclaration(
+                PsExpression.make(ctr),
+                PsLookup(
+                    PsBufferAcc(
+                        ispace.index_list.base_pointer,
+                        (sparse_ctr_expr.clone(), factory.parse_index(0)),
+                    ),
+                    coord.name,
+                ),
+            )
+            for ctr, coord in zip(ispace.spatial_indices, ispace.coordinate_members)
+        ]
+        body.statements = mappings + body.statements
+
+        stop = PsExpression.make(ispace.index_list.shape[0])
+        condition = PsLt(sparse_ctr_expr.clone(), stop)
+        ast = PsBlock([sparse_idx_decl, PsConditional(condition, body)])
+
+        return ast
diff --git a/src/pystencils/backend/platforms/hip.py b/src/pystencils/backend/platforms/hip.py
new file mode 100644
index 0000000000000000000000000000000000000000..c758995a0d9f8fbbb2e9e424bf2cfa6ab7eca086
--- /dev/null
+++ b/src/pystencils/backend/platforms/hip.py
@@ -0,0 +1,11 @@
+from __future__ import annotations
+
+from .generic_gpu import GenericGpu
+
+
+class HipPlatform(GenericGpu):
+    """Platform for the HIP GPU taret."""
+
+    @property
+    def required_headers(self) -> set[str]:
+        return {'"pystencils_runtime/hip.h"'}
diff --git a/src/pystencils/boundaries/boundaryhandling.py b/src/pystencils/boundaries/boundaryhandling.py
index 1f6e3d126365de0af08ee98ddd26d1600af15027..58340c3e0fbb16b98af2cf08c3d1894ca34a2309 100644
--- a/src/pystencils/boundaries/boundaryhandling.py
+++ b/src/pystencils/boundaries/boundaryhandling.py
@@ -123,7 +123,7 @@ class BoundaryHandling:
         class_ = self.IndexFieldBlockData
         class_.to_cpu = to_cpu
         class_.to_gpu = to_gpu
-        gpu = self._target in data_handling._GPU_LIKE_TARGETS
+        gpu = self._target.is_gpu()
         data_handling.add_custom_class(self._index_array_name, class_, cpu=True, gpu=gpu)
 
     @property
@@ -240,7 +240,7 @@ class BoundaryHandling:
         if self._dirty:
             self.prepare()
 
-        for b in self._data_handling.iterate(gpu=self._target in self._data_handling._GPU_LIKE_TARGETS):
+        for b in self._data_handling.iterate(gpu=self._target.is_gpu()):
             for b_obj, idx_arr in b[self._index_array_name].boundary_object_to_index_list.items():
                 kwargs[self._field_name] = b[self._field_name]
                 kwargs['indexField'] = idx_arr
@@ -255,7 +255,7 @@ class BoundaryHandling:
         if self._dirty:
             self.prepare()
 
-        for b in self._data_handling.iterate(gpu=self._target in self._data_handling._GPU_LIKE_TARGETS):
+        for b in self._data_handling.iterate(gpu=self._target.is_gpu()):
             for b_obj, idx_arr in b[self._index_array_name].boundary_object_to_index_list.items():
                 arguments = kwargs.copy()
                 arguments[self._field_name] = b[self._field_name]
@@ -341,7 +341,7 @@ class BoundaryHandling:
     def _boundary_data_initialization(self, boundary_obj, boundary_data_setter, **kwargs):
         if boundary_obj.additional_data_init_callback:
             boundary_obj.additional_data_init_callback(boundary_data_setter, **kwargs)
-        if self._target in self._data_handling._GPU_LIKE_TARGETS:
+        if self._target.is_gpu():
             self._data_handling.to_gpu(self._index_array_name)
 
     class BoundaryInfo(object):
diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py
index d6f8e403cdce0c99853ee1127567e11c14b92b32..8e7e54ff1125a8bba2ba35c223277ee2867c28b7 100644
--- a/src/pystencils/codegen/config.py
+++ b/src/pystencils/codegen/config.py
@@ -395,10 +395,12 @@ class GpuOptions(ConfigBase):
     """
 
     @staticmethod
-    def default_warp_size(target: Target):
+    def default_warp_size(target: Target) -> int | None:
         match target:
             case Target.CUDA:
                 return 32
+            case Target.HIP:
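+                #   The wavefront size of AMD GPUs varies across architectures
+                #   (32 or 64), so no default can be assumed at code generation time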
+                return None
             case _:
                 raise NotImplementedError(
                     f"No default warp/wavefront size known for target {target}"
@@ -594,6 +596,8 @@ class CreateKernelConfig(ConfigBase):
         match t:
             case Target.CurrentCPU:
                 return Target.auto_cpu()
+            case Target.CurrentGPU:
+                return Target.auto_gpu()
             case _:
                 return t
 
@@ -601,12 +605,14 @@ class CreateKernelConfig(ConfigBase):
         """Returns either the user-specified JIT compiler, or infers one from the target if none is given."""
         jit: JitBase | None = self.get_option("jit")
 
+        target = self.get_target()
+
         if jit is None:
-            if self.get_target().is_cpu():
+            if target.is_cpu():
                 from ..jit import LegacyCpuJit
 
                 return LegacyCpuJit()
-            elif self.get_target() == Target.CUDA:
+            elif target == Target.CUDA or target == Target.HIP:
                 try:
                     from ..jit.gpu_cupy import CupyJit
 
@@ -617,7 +623,7 @@ class CreateKernelConfig(ConfigBase):
 
                     return no_jit
 
-            elif self.get_target() == Target.SYCL:
+            elif target == Target.SYCL:
                 from ..jit import no_jit
 
                 return no_jit
diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index 65352f5205125b05acbd088bc42e9205cd7eb415..e9fc69b76b3d88024f9cbde880617d6a3a3696ff 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 from typing import cast, Sequence, Callable, TYPE_CHECKING
 from dataclasses import dataclass, replace
+from warnings import warn
 
 from .target import Target
 from .config import (
@@ -399,7 +400,7 @@ class DefaultKernelCreationDriver:
         return kernel_ast
 
     def _get_gpu_indexing(self) -> GpuIndexing | None:
-        if self._target != Target.CUDA:
+        if not self._target.is_gpu():
             return None
 
         idx_scheme: GpuIndexingScheme = self._cfg.gpu.get_option("indexing_scheme")
@@ -410,6 +411,9 @@ class DefaultKernelCreationDriver:
         if warp_size is None:
             warp_size = GpuOptions.default_warp_size(self._target)
 
+        if warp_size is None and assume_warp_aligned_block_size:
+            warn("GPU warp size is unknown - ignoring assumption `assume_warp_aligned_block_size`.")
+
         return GpuIndexing(
             self._ctx,
             self._target,
@@ -447,20 +451,26 @@ class DefaultKernelCreationDriver:
                 )
 
         elif self._target.is_gpu():
+            thread_mapping = (
+                self._gpu_indexing.get_thread_mapping()
+                if self._gpu_indexing is not None
+                else None
+            )
+
+            GpuPlatform: type
             match self._target:
                 case Target.CUDA:
-                    from ..backend.platforms import CudaPlatform
-
-                    thread_mapping = (
-                        self._gpu_indexing.get_thread_mapping()
-                        if self._gpu_indexing is not None
-                        else None
-                    )
-
-                    return CudaPlatform(
-                        self._ctx,
-                        thread_mapping=thread_mapping,
-                    )
+                    from ..backend.platforms import CudaPlatform as GpuPlatform
+                case Target.HIP:
+                    from ..backend.platforms import HipPlatform as GpuPlatform
+                case _:
+                    assert False, f"unexpected GPU target: {self._target}"
+
+            return GpuPlatform(
+                self._ctx,
+                thread_mapping=thread_mapping,
+            )
+
         elif self._target == Target.SYCL:
             from ..backend.platforms import SyclPlatform
 
diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index d473e5b4a7408d2be77f0c4f2bc3b55a5ca9df0a..43b612bd77f535ef45c09c38489784dd12a53ce0 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -17,7 +17,7 @@ from ..backend.kernelcreation import (
     FullIterationSpace,
     SparseIterationSpace,
 )
-from ..backend.platforms.cuda import ThreadMapping
+from ..backend.platforms.generic_gpu import ThreadMapping
 
 from ..backend.ast.expressions import PsExpression, PsIntDiv
 from math import prod
@@ -30,14 +30,11 @@ _Dim3Lambda = tuple[Lambda, Lambda, Lambda]
 
 @dataclass
 class HardwareProperties:
-    warp_size: int
+    warp_size: int | None
     max_threads_per_block: int
     max_block_sizes: dim3
 
-    def block_size_exceeds_hw_limits(
-            self,
-            block_size: tuple[int, ...]
-    ) -> bool:
+    def block_size_exceeds_hw_limits(self, block_size: tuple[int, ...]) -> bool:
         """Checks if provided block size conforms limits given by the hardware."""
 
         return (
@@ -106,8 +103,10 @@ class GpuLaunchConfiguration(ABC):
 
     @staticmethod
     def _excessive_block_size_error_msg(block_size: tuple[int, ...]):
-        return f"Unable to determine GPU block size for this kernel. \
-        Final block size was too large: {block_size}."
+        return (
+            "Unable to determine GPU block size for this kernel. "
+            f"Final block size was too large: {block_size}."
+        )
 
 
 class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
@@ -139,7 +138,9 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
 
     @block_size.setter
     def block_size(self, val: dim3):
-        AttributeError("Setting `block_size` on an automatic launch configuration has no effect.")
+        raise AttributeError(
+            "Setting `block_size` on an automatic launch configuration has no effect."
+        )
 
     @property
     def parameters(self) -> frozenset[Parameter]:
@@ -203,6 +204,7 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration):
 
         if (
             self._assume_warp_aligned_block_size
+            and self._hw_props.warp_size is not None
             and prod(self._block_size) % self._hw_props.warp_size != 0
         ):
             raise CodegenError(
@@ -289,6 +291,10 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
     def parameters(self) -> frozenset[Parameter]:
         """Parameters of this launch configuration"""
         return self._params
+
+    @property
+    def default_block_size(self) -> dim3:
+        return self._default_block_size
 
     @property
     def block_size(self) -> dim3 | None:
@@ -297,7 +303,9 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
 
     @block_size.setter
     def block_size(self, val: dim3):
-        AttributeError("Setting `block_size` on an dynamic launch configuration has no effect.")
+        raise AttributeError(
+            "Setting `block_size` on a dynamic launch configuration has no effect."
+        )
 
     @staticmethod
     def _round_block_sizes_to_warp_size(
@@ -348,7 +356,8 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
 
         if (
             self._assume_warp_aligned_block_size
-            and prod(ret) % self._hw_props.warp_size != 0
+            and hw_props.warp_size is not None
+            and prod(ret) % hw_props.warp_size != 0
         ):
             self._round_block_sizes_to_warp_size(ret, hw_props.warp_size)
 
@@ -384,6 +393,10 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
             return ret
 
         trimmed = trim(list(block_size))
+
+        if hw_props.warp_size is None:
+            return tuple(trimmed)
+
         if (
             prod(trimmed) >= hw_props.warp_size
             and prod(trimmed) % hw_props.warp_size == 0
@@ -490,14 +503,13 @@ class GpuIndexing:
         ctx: KernelCreationContext,
         target: Target,
         scheme: GpuIndexingScheme,
-        warp_size: int,
+        warp_size: int | None,
         manual_launch_grid: bool = False,
         assume_warp_aligned_block_size: bool = False,
     ) -> None:
         self._ctx = ctx
         self._target = target
         self._scheme = scheme
-        self._warp_size = warp_size
         self._manual_launch_grid = manual_launch_grid
         self._assume_warp_aligned_block_size = assume_warp_aligned_block_size
 
@@ -518,6 +530,8 @@ class GpuIndexing:
         match target:
             case Target.CUDA:
                 return (1024, 1024, 64)
+            case Target.HIP:
+                return (1024, 1024, 1024)
             case _:
                 raise CodegenError(
                     f"Cannot determine max GPU block sizes for target {target}"
@@ -526,7 +540,7 @@ class GpuIndexing:
     @staticmethod
     def get_max_threads_per_block(target: Target):
         match target:
-            case Target.CUDA:
+            case Target.CUDA | Target.HIP:
                 return 1024
             case _:
                 raise CodegenError(
@@ -536,7 +550,7 @@ class GpuIndexing:
     def get_thread_mapping(self) -> ThreadMapping:
         """Retrieve a thread mapping object for use by the backend"""
 
-        from ..backend.platforms.cuda import Linear3DMapping, Blockwise4DMapping
+        from ..backend.platforms.generic_gpu import Linear3DMapping, Blockwise4DMapping
 
         match self._scheme:
             case GpuIndexingScheme.Linear3D:
@@ -603,11 +617,20 @@ class GpuIndexing:
         # impossible to use block size determination function since the iteration space is unknown
         # -> round block size in fastest moving dimension up to multiple of warp size
         rounded_block_size: PsExpression
-        if self._assume_warp_aligned_block_size:
+        if (
+            self._assume_warp_aligned_block_size
+            and self._hw_props.warp_size is not None
+        ):
             warp_size = self._ast_factory.parse_index(self._hw_props.warp_size)
             rounded_block_size = self._ast_factory.parse_index(
-                PsIntDiv(work_items[0].clone() + warp_size.clone() - self._ast_factory.parse_index(1),
-                         warp_size.clone()) * warp_size.clone())
+                PsIntDiv(
+                    work_items[0].clone()
+                    + warp_size.clone()
+                    - self._ast_factory.parse_index(1),
+                    warp_size.clone(),
+                )
+                * warp_size.clone()
+            )
         else:
             rounded_block_size = work_items[0]
 
diff --git a/src/pystencils/codegen/target.py b/src/pystencils/codegen/target.py
index 0d724b87730f0ec327772bccbb55a8bfff7c8ddd..5e214430cdd828768ab468cba04d1f9f5e69aeb9 100644
--- a/src/pystencils/codegen/target.py
+++ b/src/pystencils/codegen/target.py
@@ -30,6 +30,7 @@ class Target(Flag):
     _GPU = auto()
 
     _CUDA = auto()
+    _HIP = auto()
 
     _SYCL = auto()
 
@@ -86,8 +87,14 @@ class Target(Flag):
     Generate a CUDA kernel for a generic Nvidia GPU.
     """
 
-    GPU = CUDA
-    """Alias for `Target.CUDA`, for backward compatibility."""
+    HIP = _GPU | _HIP
+    """Generic HIP GPU target.
+    
+    Generate a HIP kernel for generic AMD or Nvidia GPUs.
+    """
+
+    GPU = CurrentGPU
+    """Alias for `Target.CurrentGPU`, for backward compatibility."""
 
     SYCL = _SYCL
     """SYCL kernel target.
@@ -99,15 +106,24 @@ class Target(Flag):
     """
 
     def is_automatic(self) -> bool:
+        """Determine if this target is a proxy target that is automatically resolved
+        according to the runtime environment."""
         return Target._AUTOMATIC in self
 
     def is_cpu(self) -> bool:
+        """Determine if this target is a CPU target."""
         return Target._CPU in self
 
     def is_vector_cpu(self) -> bool:
+        """Determine if this target is a vector CPU target."""
         return self.is_cpu() and Target._VECTOR in self
 
     def is_gpu(self) -> bool:
+        """Determine if this target is a GPU target.
+
+        This refers to targets for the CUDA and HIP families of platforms.
+        `Target.SYCL` is *not* a GPU target.
+        """
         return Target._GPU in self
 
     @staticmethod
@@ -119,12 +135,30 @@ class Target(Flag):
         else:
             return Target.GenericCPU
         
+    @staticmethod
+    def auto_gpu() -> Target:
+        """Return the GPU target available in the current runtime environment.
+        
+        Raises:
+            RuntimeError: If `cupy` is not installed and therefore no GPU runtime is available.
+        """
+        try:
+            import cupy
+
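+            # a CuPy build for ROCm reports `is_hip = True`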
+            if cupy.cuda.runtime.is_hip:
+                return Target.HIP
+            else:
+                return Target.CUDA
+        except ImportError:
+            raise RuntimeError("Cannot infer GPU target since cupy is not installed.")
+        
     @staticmethod
     def available_targets() -> list[Target]:
+        """List available"""
         targets = [Target.GenericCPU]
         try:
             import cupy  # noqa: F401
-            targets.append(Target.CUDA)
+            targets.append(Target.auto_gpu())
         except ImportError:
             pass
 
diff --git a/src/pystencils/datahandling/datahandling_interface.py b/src/pystencils/datahandling/datahandling_interface.py
index 867bbf062d7307187c6a72f465fe54177bbfacc1..a6b1fcb559f183f3e82d7a68db8c75423df8c520 100644
--- a/src/pystencils/datahandling/datahandling_interface.py
+++ b/src/pystencils/datahandling/datahandling_interface.py
@@ -17,8 +17,6 @@ class DataHandling(ABC):
     'gather' function that has collects (parts of the) distributed data on a single process.
     """
 
-    _GPU_LIKE_TARGETS = [Target.GPU]
-
     # ---------------------------- Adding and accessing data -----------------------------------------------------------
     @property
     @abstractmethod
diff --git a/src/pystencils/datahandling/serial_datahandling.py b/src/pystencils/datahandling/serial_datahandling.py
index 73b749ca46171c86f6183789bff2731efb8a1a5d..dc6904c3a59cc7e86746718a226e9738ad9db1a0 100644
--- a/src/pystencils/datahandling/serial_datahandling.py
+++ b/src/pystencils/datahandling/serial_datahandling.py
@@ -110,7 +110,7 @@ class SerialDataHandling(DataHandling):
         if layout is None:
             layout = self.default_layout
         if gpu is None:
-            gpu = self.default_target in self._GPU_LIKE_TARGETS
+            gpu = self.default_target.is_gpu()
 
         kwargs = {
             'shape': tuple(s + 2 * ghost_layers for s in self._domainSize),
@@ -241,7 +241,7 @@ class SerialDataHandling(DataHandling):
 
     def swap(self, name1, name2, gpu=None):
         if gpu is None:
-            gpu = self.default_target in self._GPU_LIKE_TARGETS
+            gpu = self.default_target.is_gpu()
         arr = self.gpu_arrays if gpu else self.cpu_arrays
         arr[name1], arr[name2] = arr[name2], arr[name1]
 
@@ -292,7 +292,7 @@ class SerialDataHandling(DataHandling):
         if target is None:
             target = self.default_target
             
-        if not (target.is_cpu() or target == Target.CUDA):
+        if not (target.is_cpu() or target.is_gpu()):
             raise ValueError(f"Unsupported target: {target}")
 
         if not hasattr(names, '__len__') or type(names) is str:
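
Dropping `_GPU_LIKE_TARGETS` applies the same structural check at the data-handling level. Roughly, for a concrete target the old and new tests differ as follows (illustrative only, not the actual class body):

```python
from pystencils import Target

default_target = Target.HIP

# List membership compares targets by equality, so the new HIP target would
# slip through the old `default_target in _GPU_LIKE_TARGETS` test ...
assert default_target not in [Target.GPU]

# ... whereas the flag-based query covers CUDA, HIP and the CurrentGPU proxy.
assert default_target.is_gpu()
```
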
diff --git a/src/pystencils/enums.py b/src/pystencils/enums.py
index bcea50e84cfba9190a2353245c7c29168443ca13..9d6470ed71cbea42b50564dadeca6fa9943e5506 100644
--- a/src/pystencils/enums.py
+++ b/src/pystencils/enums.py
@@ -5,7 +5,8 @@ from warnings import warn
 warn(
     "Importing anything from `pystencils.enums` is deprecated and the module will be removed in pystencils 2.1. "
     "Import from `pystencils` instead.",
-    FutureWarning
+    FutureWarning,
+    stacklevel=2,
 )
 
 Target = _Target
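
The added `stacklevel=2` attributes the deprecation warning to the user's import of `pystencils.enums` rather than to the module body itself. The mechanism in isolation, using a hypothetical helper and plain stdlib behaviour:

```python
import warnings

def deprecated_module_hook():
    # With the default stacklevel=1 the warning would point at this line;
    # stacklevel=2 makes it point at the caller, i.e. at the offending
    # import site the user actually needs to fix.
    warnings.warn("deprecated", FutureWarning, stacklevel=2)

deprecated_module_hook()  # the warning is reported for this line
```
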
diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py
index 4e36e369e3b65f3f6e8c8f268feae9e16e1b6b56..2217809618ec9a161d5e2d6f8c3c374e399e0471 100644
--- a/src/pystencils/jit/gpu_cupy.py
+++ b/src/pystencils/jit/gpu_cupy.py
@@ -207,18 +207,25 @@ class CupyKernelWrapper(KernelWrapper):
 
 class CupyJit(JitBase):
 
-    def __init__(self):
-        self._runtime_headers = {"<cstdint>"}
-
     def compile(self, kernel: Kernel) -> KernelWrapper:
         if not HAVE_CUPY:
             raise JitError(
                 "`cupy` is not installed: just-in-time-compilation of CUDA kernels is unavailable."
             )
 
-        if not isinstance(kernel, GpuKernel) or kernel.target != Target.CUDA:
-            raise ValueError(
-                "The CupyJit just-in-time compiler only accepts kernels generated for CUDA or HIP"
+        if not isinstance(kernel, GpuKernel):
+            raise JitError(
+                "The CupyJit just-in-time compiler only accepts GPU kernels generated for CUDA or HIP"
+            )
+
+        if kernel.target == Target.CUDA and cp.cuda.runtime.is_hip:
+            raise JitError(
+                "Cannot compile a CUDA kernel on a HIP-based Cupy installation."
+            )
+
+        if kernel.target == Target.HIP and not cp.cuda.runtime.is_hip:
+            raise JitError(
+                "Cannot compile a HIP kernel on a CUDA-based Cupy installation."
             )
 
         options = self._compiler_options()
@@ -237,7 +244,13 @@ class CupyJit(JitBase):
         return tuple(options)
 
     def _prelude(self, kfunc: GpuKernel) -> str:
-        headers = self._runtime_headers
+
+        headers: set[str]
+        if cp.cuda.runtime.is_hip:
+            headers = set()
+        else:
+            headers = {"<cstdint>"}
+
         headers |= kfunc.required_headers
 
         if '"pystencils_runtime/half.h"' in headers:
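
Because a given `cupy` build targets either CUDA or HIP (reported by `cupy.cuda.runtime.is_hip`), `CupyJit.compile` now fails fast on a backend mismatch instead of silently compiling. User code that selects a concrete target by hand can mirror the same dispatch, e.g.:

```python
import cupy as cp
from pystencils import CreateKernelConfig, Target

# Choose the concrete GPU target matching the installed cupy backend;
# Target.auto_gpu() encapsulates exactly this check.
target = Target.HIP if cp.cuda.runtime.is_hip else Target.CUDA
cfg = CreateKernelConfig(target=target)
```
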
diff --git a/tests/fixtures.py b/tests/fixtures.py
index ba2593f76f4fae81d7e785fdb5f7b0c9a4639c28..a4c77f550e228ecfeb7b4e61b95bd2dce9739f9f 100644
--- a/tests/fixtures.py
+++ b/tests/fixtures.py
@@ -18,16 +18,7 @@ from types import ModuleType
 
 import pystencils as ps
 
-AVAILABLE_TARGETS = [ps.Target.GenericCPU]
-
-try:
-    import cupy
-
-    AVAILABLE_TARGETS += [ps.Target.CUDA]
-except ImportError:
-    pass
-
-AVAILABLE_TARGETS += ps.Target.available_vector_cpu_targets()
+AVAILABLE_TARGETS = ps.Target.available_targets()
 TARGET_IDS = [t.name for t in AVAILABLE_TARGETS]
 
 
@@ -72,9 +63,9 @@ def xp(target: ps.Target) -> ModuleType:
     """Primary array module for the current target.
 
     Returns:
-        `cupy` if `target == Target.CUDA`, and `numpy` otherwise
+        `cupy` if `target.is_gpu()`, and `numpy` otherwise
     """
-    if target == ps.Target.CUDA:
+    if target.is_gpu():
         import cupy as xp
 
         return xp
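
Deriving `AVAILABLE_TARGETS` from `Target.available_targets()` means the fixtures automatically pick up whichever GPU backend is present. A hypothetical test built on the `target`/`xp` fixture pair then runs unchanged on every backend:

```python
import pystencils as ps

def test_array_roundtrip(target: ps.Target, xp):
    # `xp` is numpy for CPU targets and cupy for GPU targets, so the
    # identical array code exercises every available platform.
    arr = xp.zeros((4, 4))
    arr += 1.0
    assert float(arr.sum()) == 16.0
```
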
diff --git a/tests/kernelcreation/test_buffer_gpu.py b/tests/kernelcreation/test_buffer_gpu.py
index 0b5019fba49ba439253ef1257b3c2f12a728d064..bd9d2156b451e10f7a91ed21b76a64e60af9bd03 100644
--- a/tests/kernelcreation/test_buffer_gpu.py
+++ b/tests/kernelcreation/test_buffer_gpu.py
@@ -58,7 +58,7 @@ def test_full_scalar_field():
 
         pack_eqs = [Assignment(buffer.center(), src_field.center())]
 
-        config = CreateKernelConfig(target=pystencils.Target.GPU)
+        config = CreateKernelConfig(target=pystencils.Target.CurrentGPU)
         pack_ast = create_kernel(pack_eqs, config=config)
 
         pack_kernel = pack_ast.compile()
@@ -66,7 +66,7 @@ def test_full_scalar_field():
 
         unpack_eqs = [Assignment(dst_field.center(), buffer.center())]
 
-        config = CreateKernelConfig(target=pystencils.Target.GPU)
+        config = CreateKernelConfig(target=pystencils.Target.CurrentGPU)
         unpack_ast = create_kernel(unpack_eqs, config=config)
 
         unpack_kernel = unpack_ast.compile()
@@ -94,7 +94,7 @@ def test_field_slice():
 
             pack_eqs = [Assignment(buffer.center(), src_field.center())]
 
-            config = CreateKernelConfig(target=pystencils.Target.GPU)
+            config = CreateKernelConfig(target=pystencils.Target.CurrentGPU)
             pack_ast = create_kernel(pack_eqs, config=config)
 
             pack_kernel = pack_ast.compile()
@@ -103,7 +103,7 @@ def test_field_slice():
             # Unpack into ghost layer of dst_field in N direction
             unpack_eqs = [Assignment(dst_field.center(), buffer.center())]
 
-            config = CreateKernelConfig(target=pystencils.Target.GPU)
+            config = CreateKernelConfig(target=pystencils.Target.CurrentGPU)
             unpack_ast = create_kernel(unpack_eqs, config=config)
 
             unpack_kernel = unpack_ast.compile()
@@ -131,7 +131,7 @@ def test_all_cell_values():
             eq = Assignment(buffer(idx), src_field(idx))
             pack_eqs.append(eq)
 
-        config = CreateKernelConfig(target=pystencils.Target.GPU)
+        config = CreateKernelConfig(target=pystencils.Target.CurrentGPU)
         pack_code = create_kernel(pack_eqs, config=config)
         pack_kernel = pack_code.compile()
 
@@ -143,7 +143,7 @@ def test_all_cell_values():
             eq = Assignment(dst_field(idx), buffer(idx))
             unpack_eqs.append(eq)
 
-        config = CreateKernelConfig(target=pystencils.Target.GPU)
+        config = CreateKernelConfig(target=pystencils.Target.CurrentGPU)
         unpack_ast = create_kernel(unpack_eqs, config=config)
         unpack_kernel = unpack_ast.compile()
         unpack_kernel(buffer=gpu_buffer_arr, dst_field=gpu_dst_arr)
@@ -173,7 +173,7 @@ def test_subset_cell_values():
             pack_eqs.append(eq)
 
         pack_types = {'src_field': gpu_src_arr.dtype, 'buffer': gpu_buffer_arr.dtype}
-        config = CreateKernelConfig(target=pystencils.Target.GPU)
+        config = CreateKernelConfig(target=pystencils.Target.CurrentGPU)
         pack_ast = create_kernel(pack_eqs, config=config)
         pack_kernel = pack_ast.compile()
         pack_kernel(buffer=gpu_buffer_arr, src_field=gpu_src_arr)
@@ -185,7 +185,7 @@ def test_subset_cell_values():
             unpack_eqs.append(eq)
 
         unpack_types = {'dst_field': gpu_dst_arr.dtype, 'buffer': gpu_buffer_arr.dtype}
-        config = CreateKernelConfig(target=pystencils.Target.GPU)
+        config = CreateKernelConfig(target=pystencils.Target.CurrentGPU)
         unpack_ast = create_kernel(unpack_eqs, config=config)
         unpack_kernel = unpack_ast.compile()
 
@@ -215,7 +215,7 @@ def test_field_layouts():
                 pack_eqs.append(eq)
 
             pack_types = {'src_field': gpu_src_arr.dtype, 'buffer': gpu_buffer_arr.dtype}
-            config = CreateKernelConfig(target=pystencils.Target.GPU)
+            config = CreateKernelConfig(target=pystencils.Target.CurrentGPU)
             pack_ast = create_kernel(pack_eqs, config=config)
             pack_kernel = pack_ast.compile()
 
@@ -228,7 +228,7 @@ def test_field_layouts():
                 unpack_eqs.append(eq)
 
             unpack_types = {'dst_field': gpu_dst_arr.dtype, 'buffer': gpu_buffer_arr.dtype}
-            config = CreateKernelConfig(target=pystencils.Target.GPU)
+            config = CreateKernelConfig(target=pystencils.Target.CurrentGPU)
             unpack_ast = create_kernel(unpack_eqs, config=config)
             unpack_kernel = unpack_ast.compile()
 
@@ -299,7 +299,7 @@ def test_iteration_slices(gpu_indexing):
         gpu_src_arr.set(src_arr)
         gpu_dst_arr.fill(0)
 
-        config = CreateKernelConfig(target=Target.GPU, iteration_slice=pack_slice)
+        config = CreateKernelConfig(target=Target.CurrentGPU, iteration_slice=pack_slice)
 
         pack_code = create_kernel(pack_eqs, config=config)
         pack_kernel = pack_code.compile()
@@ -311,7 +311,7 @@ def test_iteration_slices(gpu_indexing):
             eq = Assignment(dst_field(idx), buffer(idx))
             unpack_eqs.append(eq)
 
-        config = CreateKernelConfig(target=Target.GPU, iteration_slice=pack_slice)
+        config = CreateKernelConfig(target=Target.CurrentGPU, iteration_slice=pack_slice)
 
         unpack_code = create_kernel(unpack_eqs, config=config)
         unpack_kernel = unpack_code.compile()
diff --git a/tests/kernelcreation/test_functions.py b/tests/kernelcreation/test_functions.py
index a4d154d4b0c86ea694bbe94f66372aa2ba3a190c..182a590056d68a9677a657877416574db9f81e25 100644
--- a/tests/kernelcreation/test_functions.py
+++ b/tests/kernelcreation/test_functions.py
@@ -106,14 +106,14 @@ def function_domain(function_name, dtype):
         case "pow":
             return np.concatenate(
                 [
-                    [0., 1., 1.],
-                    rng.uniform(-1., 1., 8),
-                    rng.uniform(0., 5., 8),
+                    [0.0, 1.0, 1.0],
+                    rng.uniform(-1.0, 1.0, 8),
+                    rng.uniform(0.0, 5.0, 8),
                 ]
             ).astype(dtype), np.concatenate(
                 [
-                    [1., 0., 2.],
-                    np.arange(2., 10., 1.),
+                    [1.0, 0.0, 2.0],
+                    np.arange(2.0, 10.0, 1.0),
                     rng.uniform(-2.0, 2.0, 8),
                 ]
             ).astype(
@@ -211,14 +211,14 @@ def test_binary_functions(gen_config, xp, function_name, dtype, function_domain)
 
 dtype_and_target_for_integer_funcs = pytest.mark.parametrize(
     "dtype, target",
-    list(product([np.int32], [t for t in AVAIL_TARGETS if t is not Target.CUDA]))
+    list(product([np.int32], [t for t in AVAIL_TARGETS if not t.is_gpu()]))
     + list(
         product(
             [np.int64],
             [
                 t
                 for t in AVAIL_TARGETS
-                if t not in (Target.X86_SSE, Target.X86_AVX, Target.CUDA)
+                if t not in (Target.X86_SSE, Target.X86_AVX) and not t.is_gpu()
             ],
         )
     ),
diff --git a/tests/kernelcreation/test_gpu.py b/tests/kernelcreation/test_gpu.py
index 70b4edd35852b1d2ff674e6cb0421bf4fea66e1f..a3f8a5482ab24ecb53fb7f297a28930a6bc7deeb 100644
--- a/tests/kernelcreation/test_gpu.py
+++ b/tests/kernelcreation/test_gpu.py
@@ -49,7 +49,7 @@ def test_indexing_options_3d(
         + src[0, 0, 1],
     )
 
-    cfg = CreateKernelConfig(target=Target.CUDA)
+    cfg = CreateKernelConfig(target=Target.CurrentGPU)
     cfg.gpu.indexing_scheme = indexing_scheme
     cfg.gpu.manual_launch_grid = manual_grid
     cfg.gpu.assume_warp_aligned_block_size = assume_warp_aligned_block_size
@@ -90,10 +90,32 @@ def test_indexing_options_3d(
 
     cp.testing.assert_allclose(dst_arr, expected)
 
-@pytest.mark.parametrize("iteration_space",
-                         [(8, 4, 4), (3, 8, 8), (3, 3, 16), (17, 3, 3), (3, 12, 56), (65, 65, 65), (3, 7, 9)])
-@pytest.mark.parametrize("initial_block_size",
-                         [(8, 4, 4), (3, 8, 8), (3, 3, 16), (2, 2, 64), (8, 2, 1), (3, 1, 32), (32, 1, 1), (1, 2, 3)])
+
+@pytest.mark.parametrize(
+    "iteration_space",
+    [
+        (8, 4, 4),
+        (1, 8, 8),
+        (1, 1, 16),
+        (17, 1, 1),
+        (1, 12, 56),
+        (65, 65, 65),
+        (1, 7, 9),
+    ],
+)
+@pytest.mark.parametrize(
+    "initial_block_size",
+    [
+        (8, 4, 4),
+        (1, 8, 8),
+        (1, 1, 16),
+        (2, 2, 64),
+        (8, 2, 1),
+        (3, 1, 32),
+        (32, 1, 1),
+        (1, 2, 3),
+    ],
+)
 @pytest.mark.parametrize("assume_warp_aligned_block_size", [True, False])
 @pytest.mark.parametrize("use_block_fitting", [True, False])
 def test_block_size_adaptations(
@@ -102,7 +124,13 @@ def test_block_size_adaptations(
     assume_warp_aligned_block_size: bool,
     use_block_fitting: bool,
 ):
-    src, dst = fields("src, dst: [3D]")
+    field_shape = tuple(2 + x for x in iteration_space[::-1])
+    src_arr = cp.ones(field_shape)
+    dst_arr = cp.zeros_like(src_arr)
+
+    src = Field.create_from_numpy_array("src", src_arr)
+    dst = Field.create_from_numpy_array("dst", dst_arr)
+
     asm = Assignment(
         dst.center(),
         src[-1, 0, 0]
@@ -113,25 +141,21 @@ def test_block_size_adaptations(
         + src[0, 0, 1],
     )
 
-    target = Target.CUDA
+    target = Target.CurrentGPU
     cfg = CreateKernelConfig(target=target)
     cfg.gpu.indexing_scheme = "linear3d"
     cfg.gpu.assume_warp_aligned_block_size = assume_warp_aligned_block_size
-
-    warp_size = cfg.gpu.default_warp_size(target)
-    max_threads_per_block = GpuIndexing.get_max_threads_per_block(target)
-    max_block_sizes = GpuIndexing.get_max_block_sizes(target)
+
+    warp_size = 32
+    cfg.gpu.warp_size = warp_size
 
     ast = create_kernel(asm, cfg)
     kernel = ast.compile()
 
     if use_block_fitting:
         # test internal block fitting function later used in `kernel.launch_config.fit_block_size`
-        internal_block_size = kernel.launch_config._fit_block_size_to_it_space(
-            iteration_space,
-            initial_block_size,
-            HardwareProperties(warp_size, max_threads_per_block, max_block_sizes),
-        )
+        kernel.launch_config.fit_block_size(initial_block_size)
+        internal_block_size, _ = kernel.launch_config.evaluate()
 
         # checks if criterion for warp size alignment is fulfilled
         def check_suitability(b):
@@ -139,25 +163,20 @@ def test_block_size_adaptations(
 
         # block size fitting should not modify an already ideal configuration
         # -> check if ideal configurations are modified
-        if (
-                check_suitability(initial_block_size)
-                and all(x == y for x, y in zip(initial_block_size, iteration_space))  # trimming may alter results
-        ):
-            assert all(x == y for x, y in zip(initial_block_size, internal_block_size)), \
-                f"Initial block size unnecessarily adapted from {initial_block_size} to {internal_block_size}."
-
-        assert check_suitability(internal_block_size), \
-            "Determined block size shall be divisible by warp size."
-
-        # set block size via fitting algorithm
-        kernel.launch_config.fit_block_size(initial_block_size)
+        if check_suitability(initial_block_size) and all(
+            x == y for x, y in zip(initial_block_size, iteration_space)
+        ):  # trimming may alter results
+            assert all(
+                x == y for x, y in zip(initial_block_size, internal_block_size)
+            ), f"Initial block size unnecessarily adapted from {initial_block_size} to {internal_block_size}."
+
+        assert check_suitability(
+            internal_block_size
+        ), "Determined block size shall be divisible by warp size."
     else:
         # set block size via trimming algorithm
         kernel.launch_config.trim_block_size(initial_block_size)
 
-    src_arr = cp.ones(iteration_space)
-    dst_arr = cp.zeros_like(src_arr)
-
     kernel(src=src_arr, dst=dst_arr)
 
     expected = cp.zeros_like(src_arr)
@@ -173,15 +192,9 @@ def test_indexing_options_2d(
     indexing_scheme: str, manual_grid: bool, assume_warp_aligned_block_size: bool
 ):
     src, dst = fields("src, dst: [2D]")
-    asm = Assignment(
-        dst.center(),
-        src[-1, 0]
-        + src[1, 0]
-        + src[0, -1]
-        + src[0, 1]
-    )
+    asm = Assignment(dst.center(), src[-1, 0] + src[1, 0] + src[0, -1] + src[0, 1])
 
-    cfg = CreateKernelConfig(target=Target.CUDA)
+    cfg = CreateKernelConfig(target=Target.CurrentGPU)
     cfg.gpu.indexing_scheme = indexing_scheme
     cfg.gpu.manual_launch_grid = manual_grid
     cfg.gpu.assume_warp_aligned_block_size = assume_warp_aligned_block_size
@@ -227,7 +240,7 @@ def test_invalid_indexing_schemes():
     src, dst = fields("src, dst: [4D]")
     asm = Assignment(src.center(0), dst.center(0))
 
-    cfg = CreateKernelConfig(target=Target.CUDA)
+    cfg = CreateKernelConfig(target=Target.CurrentGPU)
     cfg.gpu.indexing_scheme = "linear3d"
 
     with pytest.raises(Exception):
@@ -342,7 +355,7 @@ def test_ghost_layer():
     ghost_layers = [(1, 2), (2, 1)]
 
     config = CreateKernelConfig()
-    config.target = Target.CUDA
+    config.target = Target.CurrentGPU
     config.ghost_layers = ghost_layers
     config.gpu.indexing_scheme = "blockwise4d"
 
@@ -371,7 +384,7 @@ def test_setting_value():
     update_rule = [Assignment(f(0), sp.Symbol("value"))]
 
     config = CreateKernelConfig()
-    config.target = Target.CUDA
+    config.target = Target.CurrentGPU
     config.iteration_slice = iteration_slice
     config.gpu.indexing_scheme = "blockwise4d"
 
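
The reworked test now drives block-size selection exclusively through the public launch-configuration API: `fit_block_size` registers the desired block size, and `evaluate()` returns the concrete configuration, whose first element is the fitted block size (the second is presumably the grid size). Sketched usage, assuming `kernel` is a compiled GPU kernel:

```python
# kernel = create_kernel(asm, cfg).compile()

# Fit the requested block size to the kernel's iteration space; warp-size
# alignment is honoured when `assume_warp_aligned_block_size` is set.
kernel.launch_config.fit_block_size((32, 2, 1))
block_size, grid_size = kernel.launch_config.evaluate()

# Alternatively, trim an oversized block size against the iteration space.
kernel.launch_config.trim_block_size((8, 8, 4))
```
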
diff --git a/tests/kernelcreation/test_half_precision.py b/tests/kernelcreation/test_half_precision.py
index a9745459da19dbd206264980ed72987cc7879387..5dbe2180ed48e8d71c216fa015da15005fdee9d1 100644
--- a/tests/kernelcreation/test_half_precision.py
+++ b/tests/kernelcreation/test_half_precision.py
@@ -5,7 +5,7 @@ import numpy as np
 import pystencils as ps
 
 
-@pytest.mark.parametrize('target', (ps.Target.CPU, ps.Target.GPU))
+@pytest.mark.parametrize('target', (ps.Target.CPU, ps.Target.CurrentGPU))
 def test_half_precison(target):
     if target == ps.Target.CPU:
         if not platform.machine() in ['arm64', 'aarch64']:
@@ -14,7 +14,7 @@ def test_half_precison(target):
         if 'clang' not in ps.cpu.cpujit.get_compiler_config()['command']:
             pytest.xfail("skipping half precision because clang compiler is not used")
 
-    if target == ps.Target.GPU:
+    if target.is_gpu():
         pytest.importorskip("cupy")
 
     dh = ps.create_data_handling(domain_size=(10, 10), default_target=target)
diff --git a/tests/kernelcreation/test_index_kernels.py b/tests/kernelcreation/test_index_kernels.py
index 569c0ab6a0e582de895a66c656697fdf8a5909ee..bda0ef2732067ef58d682a4501b855ca936148a8 100644
--- a/tests/kernelcreation/test_index_kernels.py
+++ b/tests/kernelcreation/test_index_kernels.py
@@ -5,14 +5,7 @@ from pystencils import Assignment, Field, FieldType, AssignmentCollection, Targe
 from pystencils import create_kernel, CreateKernelConfig
 
 
-@pytest.mark.parametrize("target", [Target.CPU, Target.GPU])
-def test_indexed_kernel(target):
-    if target == Target.GPU:
-        cp = pytest.importorskip("cupy")
-        xp = cp
-    else:
-        xp = np
-
+def test_indexed_kernel(target, xp):
     arr = xp.zeros((3, 4))
     dtype = np.dtype([('x', int), ('y', int), ('value', arr.dtype)], align=True)
     
@@ -21,8 +14,8 @@ def test_indexed_kernel(target):
     cpu_index_arr[1] = (1, 3, 42.0)
     cpu_index_arr[2] = (2, 1, 5.0)
 
-    if target == Target.GPU:
-        gpu_index_arr = cp.empty(cpu_index_arr.shape, cpu_index_arr.dtype)
+    if target.is_gpu():
+        gpu_index_arr = xp.empty(cpu_index_arr.shape, cpu_index_arr.dtype)
         gpu_index_arr.set(cpu_index_arr)
         index_arr = gpu_index_arr
     else:
@@ -40,8 +33,8 @@ def test_indexed_kernel(target):
 
     kernel(f=arr, index=index_arr)
 
-    if target == Target.GPU:
-        arr = cp.asnumpy(arr)
+    if target.is_gpu():
+        arr = xp.asnumpy(arr)
 
     for i in range(cpu_index_arr.shape[0]):
         np.testing.assert_allclose(arr[cpu_index_arr[i]['x'], cpu_index_arr[i]['y']], cpu_index_arr[i]['value'], atol=1e-13)
diff --git a/tests/kernelcreation/test_iteration_slices.py b/tests/kernelcreation/test_iteration_slices.py
index b1f2da576750b60e4fbb8d9d4d33e393bc00dcf3..2b3a8ebf0e2fbbd5ed94779b2eb764a6127fc030 100644
--- a/tests/kernelcreation/test_iteration_slices.py
+++ b/tests/kernelcreation/test_iteration_slices.py
@@ -144,7 +144,7 @@ def test_triangle_pattern(gen_config: CreateKernelConfig, xp):
     islice = make_slice[:, slow_counter:]
     gen_config = replace(gen_config, iteration_slice=islice)
 
-    if gen_config.target == Target.CUDA:
+    if gen_config.target.is_gpu():
         gen_config.gpu.manual_launch_grid = True
 
     kernel = create_kernel(update, gen_config).compile()
@@ -177,7 +177,7 @@ def test_red_black_pattern(gen_config: CreateKernelConfig, xp):
     islice = make_slice[:, start::2]
     gen_config.iteration_slice = islice
 
-    if gen_config.target == Target.CUDA:
+    if gen_config.target.is_gpu():
         gen_config.gpu.manual_launch_grid = True
 
     try:
diff --git a/tests/runtime/test_boundary.py b/tests/runtime/test_boundary.py
index fb8f827e88106fd3b7a45b25f9c962f4ebb14f14..226510b83d8832a5a189552df5c8760235f0d598 100644
--- a/tests/runtime/test_boundary.py
+++ b/tests/runtime/test_boundary.py
@@ -98,7 +98,7 @@ def test_kernel_vs_copy_boundary():
 
 def test_boundary_gpu():
     pytest.importorskip('cupy')
-    dh = SerialDataHandling(domain_size=(7, 7), default_target=Target.GPU)
+    dh = SerialDataHandling(domain_size=(7, 7), default_target=Target.CurrentGPU)
     src = dh.add_array('src')
     dh.fill("src", 0.0, ghost_layers=True)
     dh.fill("src", 1.0, ghost_layers=False)
@@ -111,7 +111,7 @@ def test_boundary_gpu():
                                              name="boundary_handling_cpu", target=Target.CPU)
 
     boundary_handling = BoundaryHandling(dh, src.name, boundary_stencil,
-                                         name="boundary_handling_gpu", target=Target.GPU)
+                                         name="boundary_handling_gpu", target=Target.CurrentGPU)
 
     neumann = Neumann()
     for d in ('N', 'S', 'W', 'E'):
diff --git a/tests/runtime/test_datahandling.py b/tests/runtime/test_datahandling.py
index 9d7ff924e8d7eba9039f8f0796145bd7de116ef5..9e7c73cac225131cbb2ba160919846de30bc7ca8 100644
--- a/tests/runtime/test_datahandling.py
+++ b/tests/runtime/test_datahandling.py
@@ -118,7 +118,7 @@ def synchronization(dh, test_gpu=False):
 
 def kernel_execution_jacobi(dh, target):
 
-    test_gpu = target == Target.GPU
+    test_gpu = target == Target.CurrentGPU
     dh.add_array('f', gpu=test_gpu)
     dh.add_array('tmp', gpu=test_gpu)
 
@@ -219,15 +219,15 @@ def test_kernel():
         try:
             import cupy
             dh = create_data_handling(domain_size=domain_shape, periodicity=True)
-            kernel_execution_jacobi(dh, Target.GPU)
+            kernel_execution_jacobi(dh, Target.CurrentGPU)
         except ImportError:
             pass
 
 
-@pytest.mark.parametrize('target', (Target.CPU, Target.GPU))
+@pytest.mark.parametrize('target', (Target.CPU, Target.CurrentGPU))
 def test_kernel_param(target):
     for domain_shape in [(4, 5), (3, 4, 5)]:
-        if target == Target.GPU:
+        if target == Target.CurrentGPU:
             pytest.importorskip('cupy')
 
         dh = create_data_handling(domain_size=domain_shape, periodicity=True, default_target=target)
@@ -262,7 +262,7 @@ def test_add_arrays():
 def test_add_arrays_with_layout(shape, layout):
     pytest.importorskip('cupy')
 
-    dh = create_data_handling(domain_size=shape, default_layout=layout, default_target=ps.Target.GPU)
+    dh = create_data_handling(domain_size=shape, default_layout=layout, default_target=ps.Target.CurrentGPU)
     f1 = dh.add_array("f1", values_per_cell=19)
     dh.fill(f1.name, 1.0)
 
@@ -392,8 +392,6 @@ def test_array_handler(device_number):
     empty = array_handler.empty(shape=size, order="F")
     assert empty.strides == (8, 16)
 
-    random_array = array_handler.randn(size)
-
     cpu_array = np.empty((20, 40), dtype=np.float64)
     gpu_array = array_handler.to_gpu(cpu_array)