
Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Source

Select target project
No results found

Target

Select target project
No results found
Show changes
Commits on Source (12)
Showing 202 additions and 98 deletions
@@ -84,21 +84,21 @@ latest-python:
 # Minimal tests in windows environment
-minimal-windows:
-  stage: test
-  except:
-    variables:
-      - $ENABLE_NIGHTLY_BUILDS
-  tags:
-    - win
-  script:
-    - export NUM_CORES=$(nproc --all)
-    - source /cygdrive/c/Users/build/Miniconda3/Scripts/activate
-    - source activate pystencils
-    - pip install joblib
-    - pip list
-    - python -c "import numpy"
-    - py.test -v -m "not (notebook or longrun)"
+#minimal-windows:
+#  stage: test
+#  except:
+#    variables:
+#      - $ENABLE_NIGHTLY_BUILDS
+#  tags:
+#    - win
+#  script:
+#    - export NUM_CORES=$(nproc --all)
+#    - source /cygdrive/c/Users/build/Miniconda3/Scripts/activate
+#    - source activate pystencils
+#    - pip install joblib
+#    - pip list
+#    - python -c "import numpy"
+#    - py.test -v -m "not (notebook or longrun)"
 
 ubuntu:
   stage: test
 ...
@@ -40,12 +40,3 @@ from ._version import get_versions
 __version__ = get_versions()['version']
 del get_versions
-
-# setting the default GPU to the one with maximal memory. GPU_DEVICE is safe to overwrite for different needs
-try:
-    import cupy
-    if cupy.cuda.runtime.getDeviceCount() > 0:
-        GPU_DEVICE = sorted(range(cupy.cuda.runtime.getDeviceCount()),
-                            key=lambda i: cupy.cuda.Device(i).mem_info[1], reverse=True)[0]
-except ImportError:
-    pass
@@ -18,6 +18,7 @@ try:
     import waLBerla as wlb
     if wlb.cpp_available:
         from pystencils.datahandling.parallel_datahandling import ParallelDataHandling
+        import cupy.cuda.runtime
     else:
         ParallelDataHandling = None
 except ImportError:
@@ -100,7 +101,7 @@ class BoundaryHandling:
         self.flag_interface = fi if fi is not None else FlagInterface(data_handling, name + "Flags")
 
         if ParallelDataHandling and isinstance(self.data_handling, ParallelDataHandling):
-            array_handler = GPUArrayHandler()
+            array_handler = GPUArrayHandler(cupy.cuda.runtime.getDevice())
         else:
             array_handler = self.data_handling.array_handler
 ...
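
In the parallel (waLBerla) case the array handler above is now pinned to whatever GPU is current when the boundary handling is constructed. A minimal sketch of what that constructor argument evaluates to (assumes cupy is installed and at least one device is visible):

import cupy

# getDevice() returns the id of the currently active CUDA/HIP device (0 unless
# another device was selected beforehand); this id is what the hunk above
# passes to GPUArrayHandler(...).
current_device_id = cupy.cuda.runtime.getDevice()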
@@ -23,7 +23,8 @@ def create_data_handling(domain_size: Tuple[int, ...],
                          default_layout: str = 'SoA',
                          default_target: Target = Target.CPU,
                          parallel: bool = False,
-                         default_ghost_layers: int = 1) -> DataHandling:
+                         default_ghost_layers: int = 1,
+                         device_number: Union[int, None] = None) -> DataHandling:
     """Creates a data handling instance.
 
     Args:
@@ -34,6 +35,9 @@ def create_data_handling(domain_size: Tuple[int, ...],
         default_target: `Target`
         parallel: if True a parallel domain is created using walberla - each MPI process gets a part of the domain
         default_ghost_layers: default number of ghost layers if not overwritten in 'add_array'
+        device_number: If `default_target` is set to 'GPU' and `parallel` is False, a device number should be
+                       specified. If none is given, the device with the largest amount of memory is used. If multiple
+                       devices have the same amount of memory, the one with the lower number is used
     """
     if isinstance(default_target, str):
         new_target = Target[default_target.upper()]
@@ -69,7 +73,8 @@ def create_data_handling(domain_size: Tuple[int, ...],
                                   periodicity=periodicity,
                                   default_target=default_target,
                                   default_layout=default_layout,
-                                  default_ghost_layers=default_ghost_layers)
+                                  default_ghost_layers=default_ghost_layers,
+                                  device_number=device_number)
 
 __all__ = ['create_data_handling']
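
A brief usage sketch of the new keyword; the domain size, array name, and device id are illustrative and not taken from the diff:

import pystencils as ps

# Serial data handling that allocates all GPU arrays on device 1. Leaving
# device_number=None would fall back to the device with the most memory.
dh = ps.create_data_handling(domain_size=(64, 64), default_target=ps.Target.GPU,
                             parallel=False, device_number=1)
dh.add_array('f')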
@@ -22,7 +22,8 @@ class SerialDataHandling(DataHandling):
                  default_layout: str = 'SoA',
                  periodicity: Union[bool, Sequence[bool]] = False,
                  default_target: Target = Target.CPU,
-                 array_handler=None) -> None:
+                 array_handler=None,
+                 device_number=None) -> None:
         """
         Creates a data handling for single node simulations.
 
@@ -30,9 +31,17 @@ class SerialDataHandling(DataHandling):
             domain_size: size of the spatial domain as tuple
             default_ghost_layers: default number of ghost layers used, if not overridden in add_array() method
             default_layout: default layout used, if not overridden in add_array() method
+            periodicity: List of booleans that indicate which dimensions have periodic boundary conditions.
+                         Alternatively, a single boolean can be given, which is used for all dimensions. Defaults to
+                         False (non-periodic)
             default_target: `Target` either 'CPU' or 'GPU'. If set to 'GPU' for each array also a GPU version is
                             allocated if not overwritten in add_array, and synchronization functions are for the GPU by
                             default
+            array_handler: An object that provides the same interface as `GPUArrayHandler`, which is used for creation
+                           and transferring of GPU arrays. Default is to construct a fresh `GPUArrayHandler`
+            device_number: If `default_target` is set to 'GPU', a device number should be specified. If none is given,
+                           the device with the largest amount of memory is used. If multiple devices have the same
+                           amount of memory, the one with the lower number is used
         """
         super(SerialDataHandling, self).__init__()
         self._domainSize = tuple(domain_size)
@@ -47,8 +56,13 @@ class SerialDataHandling(DataHandling):
         if not array_handler:
             try:
-                self.array_handler = GPUArrayHandler()
-            except Exception:
+                if device_number is None:
+                    import cupy.cuda.runtime
+                    if cupy.cuda.runtime.getDeviceCount() > 0:
+                        device_number = sorted(range(cupy.cuda.runtime.getDeviceCount()),
+                                               key=lambda i: cupy.cuda.Device(i).mem_info[1], reverse=True)[0]
+                self.array_handler = GPUArrayHandler(device_number)
+            except ImportError:
                 self.array_handler = GPUNotAvailableHandler()
         else:
             self.array_handler = array_handler
 ...
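
The default device selection above can be read in isolation. A small sketch of the heuristic (assumes cupy and at least one visible device):

import cupy

# mem_info is (free_bytes, total_bytes); sorting by total memory in descending
# order and taking the first entry picks the device with the most memory.
# Python's sort is stable, so ties resolve to the lower device id, matching the
# docstring above.
device_number = sorted(range(cupy.cuda.runtime.getDeviceCount()),
                       key=lambda i: cupy.cuda.Device(i).mem_info[1], reverse=True)[0]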
@@ -6,30 +6,28 @@ except ImportError:
     cpx = None
 
 import numpy as np
-import pystencils
 
 
 class GPUArrayHandler:
-    @staticmethod
-    def zeros(shape, dtype=np.float64, order='C'):
-        with cp.cuda.Device(pystencils.GPU_DEVICE):
+    def __init__(self, device_number):
+        self._device_number = device_number
+
+    def zeros(self, shape, dtype=np.float64, order='C'):
+        with cp.cuda.Device(self._device_number):
             return cp.zeros(shape=shape, dtype=dtype, order=order)
 
-    @staticmethod
-    def ones(shape, dtype=np.float64, order='C'):
-        with cp.cuda.Device(pystencils.GPU_DEVICE):
+    def ones(self, shape, dtype=np.float64, order='C'):
+        with cp.cuda.Device(self._device_number):
             return cp.ones(shape=shape, dtype=dtype, order=order)
 
-    @staticmethod
-    def empty(shape, dtype=np.float64, order='C'):
-        with cp.cuda.Device(pystencils.GPU_DEVICE):
+    def empty(self, shape, dtype=np.float64, order='C'):
+        with cp.cuda.Device(self._device_number):
             return cp.empty(shape=shape, dtype=dtype, order=order)
 
-    @staticmethod
-    def to_gpu(numpy_array):
+    def to_gpu(self, numpy_array):
         swaps = _get_index_swaps(numpy_array)
         if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
-            with cp.cuda.Device(pystencils.GPU_DEVICE):
+            with cp.cuda.Device(self._device_number):
                 gpu_array = cp.asarray(numpy_array.base)
             for a, b in reversed(swaps):
                 gpu_array = gpu_array.swapaxes(a, b)
@@ -37,27 +35,26 @@ class GPUArrayHandler:
         else:
             return cp.asarray(numpy_array)
 
-    @staticmethod
-    def upload(array, numpy_array):
+    def upload(self, array, numpy_array):
+        assert self._device_number == array.device.id
         if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
-            with cp.cuda.Device(pystencils.GPU_DEVICE):
+            with cp.cuda.Device(self._device_number):
                 array.base.set(numpy_array.base)
         else:
-            with cp.cuda.Device(pystencils.GPU_DEVICE):
+            with cp.cuda.Device(self._device_number):
                 array.set(numpy_array)
 
-    @staticmethod
-    def download(array, numpy_array):
+    def download(self, array, numpy_array):
+        assert self._device_number == array.device.id
         if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
-            with cp.cuda.Device(pystencils.GPU_DEVICE):
+            with cp.cuda.Device(self._device_number):
                 numpy_array.base[:] = array.base.get()
         else:
-            with cp.cuda.Device(pystencils.GPU_DEVICE):
+            with cp.cuda.Device(self._device_number):
                 numpy_array[:] = array.get()
 
-    @staticmethod
-    def randn(shape, dtype=np.float64):
-        with cp.cuda.Device(pystencils.GPU_DEVICE):
+    def randn(self, shape, dtype=np.float64):
+        with cp.cuda.Device(self._device_number):
             return cp.random.randn(*shape, dtype=dtype)
 
     @staticmethod
 ...
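
A short usage sketch of the per-instance handler; the import path is assumed for illustration and may differ in the repository:

import numpy as np
from pystencils.gpu.gpu_array_handler import GPUArrayHandler  # path assumed

handler = GPUArrayHandler(0)        # every allocation and transfer is pinned to device 0
gpu_arr = handler.zeros((16, 16))
cpu_arr = np.ones((16, 16))
handler.upload(gpu_arr, cpu_arr)    # asserts that gpu_arr lives on device 0
handler.download(gpu_arr, cpu_arr)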
 import numpy as np
-import pystencils
 
 from pystencils.backends.cbackend import get_headers
 from pystencils.backends.cuda_backend import generate_cuda
 from pystencils.typing import StructType
@@ -42,6 +41,9 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
     headers = get_headers(kernel_function_node)
     if cp.cuda.runtime.is_hip:
         headers.add('"gpu_defines.h"')
+        for field in kernel_function_node.fields_accessed:
+            if isinstance(field.dtype, BasicType) and field.dtype.is_half():
+                headers.add('<hip/hip_fp16.h>')
     else:
         headers.update({'"gpu_defines.h"', '<cstdint>'})
         for field in kernel_function_node.fields_accessed:
@@ -72,7 +74,9 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
                                    for k, v in kwargs.items()))
         try:
             args, block_and_thread_numbers = cache[key]
-            with cp.cuda.Device(pystencils.GPU_DEVICE):
+            device = set(a.device.id for a in args if type(a) is cp.ndarray)
+            assert len(device) == 1, "All arrays used by a kernel need to be allocated on the same device"
+            with cp.cuda.Device(device.pop()):
                 func(block_and_thread_numbers['grid'], block_and_thread_numbers['block'], args)
         except KeyError:
             full_arguments = argument_dict.copy()
@@ -87,11 +91,12 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
             args = tuple(_build_numpy_argument_list(parameters, full_arguments))
             cache[key] = (args, block_and_thread_numbers)
             cache_values.append(kwargs)  # keep objects alive such that ids remain unique
-            with cp.cuda.Device(pystencils.GPU_DEVICE):
+            device = set(a.device.id for a in args if type(a) is cp.ndarray)
+            assert len(device) == 1, "All arrays used by a kernel need to be allocated on the same device"
+            with cp.cuda.Device(device.pop()):
                 func(block_and_thread_numbers['grid'], block_and_thread_numbers['block'], args)
 
         # useful for debugging:
-        # with cp.cuda.Device(pystencils.GPU_DEVICE):
-        #     cp.cuda.runtime.deviceSynchronize()
+        # cp.cuda.runtime.deviceSynchronize()
         # cuda.Context.synchronize() # useful for debugging, to get errors right after kernel was called
     ast = kernel_function_node
 ...
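
The kernel wrapper now infers the target device from its array arguments instead of reading a global. A minimal standalone sketch of that check (the argument tuple is illustrative):

import cupy as cp

args = (cp.zeros(8), cp.ones(8), 3.14)  # mix of cupy arrays and scalar parameters
device = set(a.device.id for a in args if type(a) is cp.ndarray)
assert len(device) == 1, "All arrays used by a kernel need to be allocated on the same device"
with cp.cuda.Device(device.pop()):
    pass  # the compiled kernel would be launched here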
 import abc
 from functools import partial
+import math
 
 import sympy as sp
 from sympy.core.cache import cacheit
 
-import pystencils
 from pystencils.astnodes import Block, Conditional
 from pystencils.typing import TypedSymbol, create_type
 from pystencils.integer_functions import div_ceil, div_floor
@@ -97,11 +97,14 @@ class BlockIndexing(AbstractIndexing):
         permute_block_size_dependent_on_layout: if True the block_size is permuted such that the fastest coordinate
                                                 gets the largest amount of threads
         compile_time_block_size: compile in concrete block size, otherwise the gpu variable 'blockDim' is used
+        maximum_block_size: maximum block size that is possible for the GPU. Set to 'auto' to let cupy define the
+                            maximum block size from the device properties
+        device_number: device number of the used GPU. By default, the zeroth device is used.
     """
 
     def __init__(self, field, iteration_slice,
                  block_size=(16, 16, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False,
-                 maximum_block_size=(1024, 1024, 64)):
+                 maximum_block_size=(1024, 1024, 64), device_number=None):
         if field.spatial_dimensions > 3:
             raise NotImplementedError("This indexing scheme supports at most 3 spatial dimensions")
@@ -110,17 +113,22 @@ class BlockIndexing(AbstractIndexing):
         self._block_size = block_size
         if maximum_block_size == 'auto':
+            assert device_number is not None, 'If "maximum_block_size" is set to "auto" a device number must be stated'
             # Get device limits
             import cupy as cp
-            device = cp.cuda.Device(pystencils.GPU_DEVICE)
-            da = device.attributes
-            maximum_block_size = tuple(da[f"MaxBlockDim{c}"] for c in ["X", "Y", "Z"])
+            # See https://github.com/cupy/cupy/issues/7676
+            if cp.cuda.runtime.is_hip:
+                maximum_block_size = tuple(cp.cuda.runtime.deviceGetAttribute(i, device_number) for i in range(26, 29))
+            else:
+                da = cp.cuda.Device(device_number).attributes
+                maximum_block_size = tuple(da[f"MaxBlockDim{c}"] for c in ["X", "Y", "Z"])
 
         self._maximum_block_size = maximum_block_size
         self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
         self._dim = field.spatial_dimensions
         self._symbolic_shape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]
         self._compile_time_block_size = compile_time_block_size
+        self._device_number = device_number
 
     @property
     def cuda_indices(self):
@@ -178,32 +186,34 @@ class BlockIndexing(AbstractIndexing):
     def iteration_space(self, arr_shape):
         return _iteration_space(self._iterationSlice, arr_shape)
 
-    @staticmethod
-    def limit_block_size_by_register_restriction(block_size, required_registers_per_thread, device=None):
-        """Shrinks the block_size if there are too many registers used per multiprocessor.
+    def limit_block_size_by_register_restriction(self, block_size, required_registers_per_thread):
+        """Shrinks the block_size if there are too many registers used per block.
         This is not done automatically, since the required_registers_per_thread are not known before compilation.
         They can be obtained by ``func.num_regs`` from a cupy function.
-        :returns smaller block_size if too many registers are used.
+        Args:
+            block_size: used block size that is target for limiting
+            required_registers_per_thread: needed registers per thread
+        returns: smaller block_size if too many registers are used.
         """
         import cupy as cp
-        device = cp.cuda.Device(pystencils.GPU_DEVICE)
-        da = device.attributes
-        available_registers_per_mp = da.get("MaxRegistersPerMultiprocessor")
-
-        block = block_size
+        # See https://github.com/cupy/cupy/issues/7676
+        if cp.cuda.runtime.is_hip:
+            max_registers_per_block = cp.cuda.runtime.deviceGetAttribute(71, self._device_number)
+        else:
+            device = cp.cuda.Device(self._device_number)
+            da = device.attributes
+            max_registers_per_block = da.get("MaxRegistersPerBlock")
 
+        result = list(block_size)
         while True:
-            num_threads = 1
-            for t in block:
-                num_threads *= t
-            required_registers_per_mt = num_threads * required_registers_per_thread
-            if required_registers_per_mt <= available_registers_per_mp:
-                return block
+            required_registers = math.prod(result) * required_registers_per_thread
+            if required_registers <= max_registers_per_block:
+                return result
             else:
-                largest_grid_entry_idx = max(range(len(block)), key=lambda e: block[e])
-                assert block[largest_grid_entry_idx] >= 2
-                block[largest_grid_entry_idx] //= 2
+                largest_list_entry_idx = max(range(len(result)), key=lambda e: result[e])
+                assert result[largest_list_entry_idx] >= 2
+                result[largest_list_entry_idx] //= 2
 
     @staticmethod
     def permute_block_size_according_to_layout(block_size, layout):
 ...
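
To make the shrinking loop above concrete, a small standalone trace with assumed numbers (65536 registers per block and 100 registers per thread; neither value comes from the diff):

import math

block = [1024, 1024, 1]
while math.prod(block) * 100 > 65536:              # same criterion as required_registers above
    largest = max(range(len(block)), key=lambda e: block[e])
    block[largest] //= 2                           # halve the largest entry, as in the loop above
print(block)  # [16, 32, 1]: 512 threads * 100 registers = 51200 <= 65536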
@@ -3,3 +3,10 @@
 #define POS_INFINITY __int_as_float(0x7f800000)
 #define INFINITY POS_INFINITY
 #define NEG_INFINITY __int_as_float(0xff800000)
+
+#ifdef __HIPCC_RTC__
+typedef __hip_uint8_t uint8_t;
+typedef __hip_int8_t int8_t;
+typedef __hip_uint16_t uint16_t;
+typedef __hip_int16_t int16_t;
+#endif
-#ifndef __OPENCL_VERSION__
+#if !defined(__OPENCL_VERSION__) && !defined(__HIPCC_RTC__)
 #if defined(__SSE2__) || (defined(_MSC_VER) && !defined(_M_ARM64))
 #include <emmintrin.h> // SSE2
 #endif
@@ -38,7 +38,7 @@
 #endif
 #endif
 
-#ifdef __CUDA_ARCH__
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
 #define QUALIFIERS static __forceinline__ __device__
 #elif defined(__OPENCL_VERSION__)
 #define QUALIFIERS static inline
@@ -59,7 +59,9 @@
 typedef uint32_t uint32;
 typedef uint64_t uint64;
 #else
+#ifndef __HIPCC_RTC__
 #include <cstdint>
+#endif
 typedef std::uint32_t uint32;
 typedef std::uint64_t uint64;
 #endif
@@ -75,7 +77,7 @@ typedef svfloat64_t svfloat64_st;
 
 QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32* hip)
 {
-#ifndef __CUDA_ARCH__
+#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
     // host code
 #if defined(__powerpc__) && (!defined(__clang__) || defined(__xlC__))
     *hip = __mulhwu(a,b);
@@ -186,7 +188,7 @@ QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3
 #endif
 }
 
-#if !defined(__CUDA_ARCH__) && !defined(__OPENCL_VERSION__)
+#if !defined(__CUDA_ARCH__) && !defined(__OPENCL_VERSION__) && !defined(__HIP_DEVICE_COMPILE__)
 #if defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
 QUALIFIERS void _philox4x32round(__m128i* ctr, __m128i* key)
 {
 ...
@@ -5,6 +5,8 @@ from typing import Sequence
 import numpy as np
 import sympy as sp
 
+from pystencils.utils import binary_numbers
+
 
 def inverse_direction(direction):
     """Returns inverse i.e. negative of given direction tuple
@@ -293,6 +295,38 @@ def direction_string_to_offset(direction: str, dim: int = 3):
     return offset[:dim]
 
 
+def adjacent_directions(direction):
+    """
+    Returns all adjacent directions for a direction as tuple of tuples. This is useful for example to find all
+    directions relevant for neighbour communication.
+
+    Args:
+        direction: tuple representing a direction. For example (0, 1, 0) for the northern side
+
+    Examples:
+        >>> adjacent_directions((0, 0, 0))
+        ((0, 0, 0),)
+        >>> adjacent_directions((0, 1, 0))
+        ((0, 1, 0),)
+        >>> adjacent_directions((0, 1, 1))
+        ((0, 0, 1), (0, 1, 0), (0, 1, 1))
+        >>> adjacent_directions((-1, -1))
+        ((-1, -1), (-1, 0), (0, -1))
+    """
+    result = set()
+
+    if all(e == 0 for e in direction):
+        result.add(direction)
+        return tuple(result)
+
+    binary_numbers_list = binary_numbers(len(direction))
+
+    for adjacent_direction in binary_numbers_list:
+        for i, entry in enumerate(direction):
+            if entry == 0:
+                adjacent_direction[i] = 0
+            if entry == -1 and adjacent_direction[i] == 1:
+                adjacent_direction[i] = -1
+
+        if not all(e == 0 for e in adjacent_direction):
+            result.add(tuple(adjacent_direction))
+
+    return tuple(sorted(result))
+
+
 # -------------------------------------- Visualization -----------------------------------------------------------------
 ...
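
One more illustrative call of the new helper, beyond the doctests above; the direction tuple is arbitrary and chosen to show a mixed corner case:

from pystencils.stencil import adjacent_directions

# For the (-1, 1) corner, communication also has to cover the two faces that
# the corner touches:
print(adjacent_directions((-1, 1)))  # ((-1, 0), (-1, 1), (0, 1))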
@@ -96,6 +96,21 @@ def boolean_array_bounding_box(boolean_array):
     return bounds
 
 
+def binary_numbers(n):
+    """Returns all binary numbers up to 2^n - 1
+
+    Example:
+        >>> binary_numbers(2)
+        [[0, 0], [0, 1], [1, 0], [1, 1]]
+    """
+    result = list()
+    for i in range(1 << n):
+        binary_number = bin(i)[2:]
+        binary_number = '0' * (n - len(binary_number)) + binary_number
+
+        result.append((list(map(int, binary_number))))
+    return result
+
+
 class LinearEquationSystem:
     """Symbolic linear system of equations - consisting of matrix and right hand side.
 ...
@@ -47,5 +47,5 @@ def test_custom_backends_gpu():
 
     ast = pystencils.create_kernel(normal_assignments, target=Target.GPU)
     pystencils.show_code(ast, ScreamingGpuBackend())
-    with pytest.raises(cupy.cuda.compiler.JitifyException):
+    with pytest.raises((cupy.cuda.compiler.JitifyException, cupy.cuda.compiler.CompileException)):
         pystencils.gpu.gpujit.make_python_function(ast, custom_backend=ScreamingGpuBackend())
@@ -15,6 +15,12 @@ except ImportError:
     import unittest.mock
     pytest = unittest.mock.MagicMock()
 
+try:
+    import cupy.cuda.runtime
+    device_numbers = range(cupy.cuda.runtime.getDeviceCount())
+except ImportError:
+    device_numbers = []
+
 SCRIPT_FOLDER = Path(__file__).parent.absolute()
 INPUT_FOLDER = SCRIPT_FOLDER / "test_data"
@@ -365,10 +371,11 @@ def test_load_data():
     assert np.all(dh.cpu_arrays['dst2']) == 0
 
 
-def test_array_handler():
+@pytest.mark.parametrize("device_number", device_numbers)
+def test_array_handler(device_number):
     size = (2, 2)
     pytest.importorskip('cupy')
-    array_handler = GPUArrayHandler()
+    array_handler = GPUArrayHandler(device_number)
 
     zero_array = array_handler.zeros(size)
 
     cpu_array = np.empty(size)
 ...
+import pytest
+
 import numpy as np
 import cupy as cp
 import sympy as sp
@@ -8,6 +10,12 @@ from pystencils.gpu import BlockIndexing
 from pystencils.simp import sympy_cse_on_assignment_list
 from pystencils.slicing import add_ghost_layers, make_slice, remove_ghost_layers
 
+try:
+    import cupy
+    device_numbers = range(cupy.cuda.runtime.getDeviceCount())
+except ImportError:
+    device_numbers = []
+
 
 def test_averaging_kernel():
     size = (40, 55)
@@ -153,7 +161,8 @@ def test_periodicity():
     np.testing.assert_equal(cpu_result, gpu_result)
 
 
-def test_block_indexing():
+@pytest.mark.parametrize("device_number", device_numbers)
+def test_block_indexing(device_number):
     f = fields("f: [3D]")
     bi = BlockIndexing(f, make_slice[:, :, :], block_size=(16, 8, 2), permute_block_size_dependent_on_layout=False)
     assert bi.call_parameters((3, 2, 32))['block'] == (3, 2, 32)
@@ -162,8 +171,19 @@ def test_block_indexing():
     bi = BlockIndexing(f, make_slice[:, :, :], block_size=(32, 1, 1), permute_block_size_dependent_on_layout=False)
     assert bi.call_parameters((1, 16, 16))['block'] == (1, 16, 2)
 
-    bi = BlockIndexing(f, make_slice[:, :, :], block_size=(16, 8, 2), maximum_block_size="auto")
+    bi = BlockIndexing(f, make_slice[:, :, :], block_size=(16, 8, 2),
+                       maximum_block_size="auto", device_number=device_number)
 
     # This function should be used if number of needed registers is known. Can be determined with func.num_regs
-    blocks = bi.limit_block_size_by_register_restriction([1024, 1024, 1], 1000)
+    registers_per_thread = 1000
+    blocks = bi.limit_block_size_by_register_restriction([1024, 1024, 1], registers_per_thread)
+
+    if cp.cuda.runtime.is_hip:
+        max_registers_per_block = cp.cuda.runtime.deviceGetAttribute(71, device_number)
+    else:
+        device = cp.cuda.Device(device_number)
+        da = device.attributes
+        max_registers_per_block = da.get("MaxRegistersPerBlock")
+
+    assert np.prod(blocks) * registers_per_thread < max_registers_per_block
 
     assert sum(blocks) < sum([1024, 1024, 1])
@@ -29,10 +29,6 @@ def offsets_in_plane(normal_plane, offset_int, dimension):
     return result
 
 
-# TODO this fails because the condition of the Conditional is not simplified anymore:
-# TODO: ---> transformation.simplify_conditionals
-# TODO this should be fixed
-@pytest.mark.xfail
 def test_staggered_iteration():
     dim = 2
     f_arr = np.arange(5**dim).reshape([5]*dim).astype(np.float64)
 ...