Merge branch 'indexing' into 'master'

Refactor gpu indexing See merge request !341

Merge branch 'indexing' into 'master'
f097d820 · Michael Kuron · 194d0e65 · 33dc8ae9 · f097d820 · f097d820
Commit f097d820 authored 1 year ago by Michael Kuron
--- a/pystencils/gpu/indexing.py
+++ b/pystencils/gpu/indexing.py
 import abc
 from functools import partial
 import math
+from typing import List, Tuple

 import sympy as sp
 from sympy.core.cache import cacheit

-from pystencils.astnodes import Block, Conditional
+from pystencils.astnodes import Block, Conditional, SympyAssignment
 from pystencils.typing import TypedSymbol, create_type
 from pystencils.integer_functions import div_ceil, div_floor
-from pystencils.slicing import normalize_slice
 from pystencils.sympyextensions import is_integer_sequence, prod


@@ -33,12 +33,37 @@ GRID_DIM = [ThreadIndexingSymbol("gridDim." + coord, create_type("int32")) for c

 class AbstractIndexing(abc.ABC):
    """
-    Abstract base class for all Indexing classes. An Indexing class defines how a multidimensional
-    field is mapped to GPU's block and grid system. It calculates indices based on GPU's thread and block indices
-    and computes the number of blocks and threads a kernel is started with. The Indexing class is created with
-    a pystencils field, a slice to iterate over, and further optional parameters that must have default values.
+    Abstract base class for all Indexing classes. An Indexing class defines how an iteration space is mapped
+    to GPU's block and grid system. It calculates indices based on GPU's thread and block indices
+    and computes the number of blocks and threads a kernel is started with.
+    The Indexing class is created with an iteration space that is given as list of slices to determine start, stop
+    and the step size for each coordinate. Further the data_layout is given as tuple to determine the fast and slow
+    coordinates. This is important to get an optimal mapping of coordinates to GPU threads.
    """

+    def __init__(self, iteration_space: Tuple[slice], data_layout: Tuple):
+        for iter_space in iteration_space:
+            assert isinstance(iter_space, slice), f"iteration_space must be of type Tuple[slice], " \
+                                                  f"not tuple of type {type(iter_space)}"
+        self._iteration_space = iteration_space
+        self._data_layout = data_layout
+        self._dim = len(iteration_space)
+
+    @property
+    def iteration_space(self):
+        """Iteration space to loop over"""
+        return self._iteration_space
+
+    @property
+    def data_layout(self):
+        """Data layout of the kernels arrays"""
+        return self._data_layout
+
+    @property
+    def dim(self):
+        """Number of spatial dimensions"""
+        return self._dim
+
    @property
    @abc.abstractmethod
    def coordinates(self):
@@ -50,6 +75,16 @@ class AbstractIndexing(abc.ABC):
        """Sympy symbols for GPU's block and thread indices, and block and grid dimensions. """
        return BLOCK_IDX + THREAD_IDX + BLOCK_DIM + GRID_DIM

+    @abc.abstractmethod
+    def get_loop_ctr_assignments(self, loop_counter_symbols) -> List[SympyAssignment]:
+        """Adds assignments for the loop counter symbols depending on the gpu threads.
+
+        Args:
+            loop_counter_symbols: typed symbols representing the loop counters
+        Returns:
+            assignments for the loop counters
+        """
+
    @abc.abstractmethod
    def call_parameters(self, arr_shape):
        """Determine grid and block size for kernel call.
@@ -92,8 +127,9 @@ class BlockIndexing(AbstractIndexing):
    """Generic indexing scheme that maps sub-blocks of an array to GPU blocks.

    Args:
-        field: pystencils field (common to all Indexing classes)
-        iteration_slice: slice that defines rectangular subarea which is iterated over
+        iteration_space: list of slices to determine start, stop and the step size for each coordinate
+        data_layout: tuple specifying loop order with innermost loop last.
+                     This is the same format as returned by `Field.layout`.
        permute_block_size_dependent_on_layout: if True the block_size is permuted such that the fastest coordinate
                                                gets the largest amount of threads
        compile_time_block_size: compile in concrete block size, otherwise the gpu variable 'blockDim' is used
@@ -102,14 +138,16 @@ class BlockIndexing(AbstractIndexing):
        device_number: device number of the used GPU. By default, the zeroth device is used.
    """

-    def __init__(self, field, iteration_slice,
-                 block_size=(16, 16, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False,
+    def __init__(self, iteration_space: Tuple[slice], data_layout: Tuple[int],
+                 block_size=(128, 2, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False,
                 maximum_block_size=(1024, 1024, 64), device_number=None):
-        if field.spatial_dimensions > 3:
-            raise NotImplementedError("This indexing scheme supports at most 3 spatial dimensions")
+        super(BlockIndexing, self).__init__(iteration_space, data_layout)

-        if permute_block_size_dependent_on_layout:
-            block_size = self.permute_block_size_according_to_layout(block_size, field.layout)
+        if self._dim > 4:
+            raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions")
+
+        if permute_block_size_dependent_on_layout and self._dim < 4:
+            block_size = self.permute_block_size_according_to_layout(block_size, data_layout)

        self._block_size = block_size
        if maximum_block_size == 'auto':
@@ -124,9 +162,6 @@ class BlockIndexing(AbstractIndexing):
                maximum_block_size = tuple(da[f"MaxBlockDim{c}"] for c in ["X", "Y", "Z"])

        self._maximum_block_size = maximum_block_size
-        self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
-        self._dim = field.spatial_dimensions
-        self._symbolic_shape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]
        self._compile_time_block_size = compile_time_block_size
        self._device_number = device_number

@@ -140,17 +175,28 @@ class BlockIndexing(AbstractIndexing):

    @property
    def coordinates(self):
-        offsets = _get_start_from_slice(self._iterationSlice)
-        coordinates = [c + off for c, off in zip(self.cuda_indices, offsets)]
+        if self._dim < 4:
+            coordinates = [c + iter_slice.start for c, iter_slice in zip(self.cuda_indices, self._iteration_space)]
+            return coordinates[:self._dim]
+        else:
+            coordinates = list()
+            width = self._iteration_space[1].stop - self.iteration_space[1].start
+            coordinates.append(div_floor(self.cuda_indices[0], width))
+            coordinates.append(sp.Mod(self.cuda_indices[0], width))
+            coordinates.append(self.cuda_indices[1] + self.iteration_space[2].start)
+            coordinates.append(self.cuda_indices[2] + self.iteration_space[3].start)
+            return coordinates

-        return coordinates[:self._dim]
+    def get_loop_ctr_assignments(self, loop_counter_symbols):
+        return _loop_ctr_assignments(loop_counter_symbols, self.coordinates, self._iteration_space)

    def call_parameters(self, arr_shape):
-        substitution_dict = {sym: value for sym, value in zip(self._symbolic_shape, arr_shape) if sym is not None}
+        numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space, arr_shape)
+        widths = _get_widths(numeric_iteration_slice)
+
+        if len(widths) > 3:
+            widths = [widths[0] * widths[1], widths[2], widths[3]]

-        widths = [end - start for start, end in zip(_get_start_from_slice(self._iterationSlice),
-                                                    _get_end_from_slice(self._iterationSlice, arr_shape))]
-        widths = sp.Matrix(widths).subs(substitution_dict)
        extend_bs = (1,) * (3 - len(self._block_size))
        block_size = self._block_size + extend_bs
        if not self._compile_time_block_size:
@@ -171,20 +217,22 @@ class BlockIndexing(AbstractIndexing):

    def guard(self, kernel_content, arr_shape):
        arr_shape = arr_shape[:self._dim]
-        end = _get_end_from_slice(self._iterationSlice, arr_shape)
-
-        conditions = [c < e for c, e in zip(self.coordinates, end)]
-        for cuda_index, iter_slice in zip(self.cuda_indices, self._iterationSlice):
-            if isinstance(iter_slice, slice) and iter_slice.step > 1:
-                conditions.append(sp.Eq(sp.Mod(cuda_index, iter_slice.step), 0))
+        numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space, arr_shape)
+        end = [s.stop if s.stop != 0 else 1 for s in numeric_iteration_slice]

+        if self._dim < 4:
+            conditions = [c < e for c, e in zip(self.coordinates, end)]
+        else:
+            end = [end[0] * end[1], end[2], end[3]]
+            coordinates = [c + iter_slice.start for c, iter_slice in zip(self.cuda_indices, self._iteration_space[1:])]
+            conditions = [c < e for c, e in zip(coordinates, end)]
        condition = conditions[0]
        for c in conditions[1:]:
            condition = sp.And(condition, c)
        return Block([Conditional(condition, kernel_content)])

-    def iteration_space(self, arr_shape):
-        return _iteration_space(self._iterationSlice, arr_shape)
+    def numeric_iteration_space(self, arr_shape):
+        return _get_numeric_iteration_slice(self._iteration_space, arr_shape)

    def limit_block_size_by_register_restriction(self, block_size, required_registers_per_thread):
        """Shrinks the block_size if there are too many registers used per block.
@@ -246,38 +294,44 @@ class LineIndexing(AbstractIndexing):
    The fastest coordinate is indexed with thread_idx.x, the remaining coordinates are mapped to block_idx.{x,y,z}
    This indexing scheme supports up to 4 spatial dimensions, where the innermost dimensions is not larger than the
    maximum amount of threads allowed in a GPU block (which depends on device).
+
+    Args:
+        iteration_space: list of slices to determine start, stop and the step size for each coordinate
+        data_layout: tuple to determine the fast and slow coordinates.
    """

-    def __init__(self, field, iteration_slice):
-        available_indices = [THREAD_IDX[0]] + BLOCK_IDX
-        if field.spatial_dimensions > 4:
+    def __init__(self, iteration_space: Tuple[slice], data_layout: Tuple):
+        super(LineIndexing, self).__init__(iteration_space, data_layout)
+
+        if len(iteration_space) > 4:
            raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions")

-        coordinates = available_indices[:field.spatial_dimensions]
+    @property
+    def cuda_indices(self):
+        available_indices = [THREAD_IDX[0]] + BLOCK_IDX
+        coordinates = available_indices[:self.dim]

-        fastest_coordinate = field.layout[-1]
+        fastest_coordinate = self.data_layout[-1]
        coordinates[0], coordinates[fastest_coordinate] = coordinates[fastest_coordinate], coordinates[0]

-        self._coordinates = coordinates
-        self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
-        self._symbolicShape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]
+        return coordinates

    @property
    def coordinates(self):
-        return [i + offset for i, offset in zip(self._coordinates, _get_start_from_slice(self._iterationSlice))]
+        return [i + o.start for i, o in zip(self.cuda_indices, self._iteration_space)]

-    def call_parameters(self, arr_shape):
-        substitution_dict = {sym: value for sym, value in zip(self._symbolicShape, arr_shape) if sym is not None}
+    def get_loop_ctr_assignments(self, loop_counter_symbols):
+        return _loop_ctr_assignments(loop_counter_symbols, self.coordinates, self._iteration_space)

-        widths = [end - start for start, end in zip(_get_start_from_slice(self._iterationSlice),
-                                                    _get_end_from_slice(self._iterationSlice, arr_shape))]
-        widths = sp.Matrix(widths).subs(substitution_dict)
+    def call_parameters(self, arr_shape):
+        numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space, arr_shape)
+        widths = _get_widths(numeric_iteration_slice)

        def get_shape_of_cuda_idx(cuda_idx):
-            if cuda_idx not in self._coordinates:
+            if cuda_idx not in self.cuda_indices:
                return 1
            else:
-                idx = self._coordinates.index(cuda_idx)
+                idx = self.cuda_indices.index(cuda_idx)
                return widths[idx]

        return {'block': tuple([get_shape_of_cuda_idx(idx) for idx in THREAD_IDX]),
@@ -292,50 +346,65 @@ class LineIndexing(AbstractIndexing):
    def symbolic_parameters(self):
        return set()

-    def iteration_space(self, arr_shape):
-        return _iteration_space(self._iterationSlice, arr_shape)
+    def numeric_iteration_space(self, arr_shape):
+        return _get_numeric_iteration_slice(self._iteration_space, arr_shape)


 # -------------------------------------- Helper functions --------------------------------------------------------------

-def _get_start_from_slice(iteration_slice):
+def _get_numeric_iteration_slice(iteration_slice, arr_shape):
    res = []
-    for slice_component in iteration_slice:
-        if type(slice_component) is slice:
-            res.append(slice_component.start if slice_component.start is not None else 0)
-        else:
-            assert isinstance(slice_component, int)
-            res.append(slice_component)
+    for slice_component, shape in zip(iteration_slice, arr_shape):
+        result_slice = slice_component
+        if not isinstance(result_slice.start, int):
+            start = result_slice.start
+            assert len(start.free_symbols) == 1
+            start = start.subs({symbol: shape for symbol in start.free_symbols})
+            result_slice = slice(start, result_slice.stop, result_slice.step)
+        if not isinstance(result_slice.stop, int):
+            stop = result_slice.stop
+            assert len(stop.free_symbols) == 1
+            stop = stop.subs({symbol: shape for symbol in stop.free_symbols})
+            result_slice = slice(result_slice.start, stop, result_slice.step)
+        assert isinstance(result_slice.step, int)
+        res.append(result_slice)
    return res


-def _get_end_from_slice(iteration_slice, arr_shape):
-    iter_slice = normalize_slice(iteration_slice, arr_shape)
-    res = []
-    for slice_component in iter_slice:
-        if type(slice_component) is slice:
-            res.append(slice_component.stop)
+def _get_widths(iteration_slice):
+    widths = []
+    for iter_slice in iteration_slice:
+        step = iter_slice.step
+        assert isinstance(step, int), f"Step can only be of type int not of type {type(step)}"
+        start = iter_slice.start
+        stop = iter_slice.stop
+        if step == 1:
+            if stop - start == 0:
+                widths.append(1)
+            else:
+                widths.append(stop - start)
        else:
-            assert isinstance(slice_component, int)
-            res.append(slice_component + 1)
-    return res
+            width = (stop - start) / step
+            if isinstance(width, int):
+                widths.append(width)
+            elif isinstance(width, float):
+                widths.append(math.ceil(width))
+            else:
+                widths.append(div_ceil(stop - start, step))
+    return widths


-def _get_steps_from_slice(iteration_slice):
-    res = []
-    for slice_component in iteration_slice:
-        if type(slice_component) is slice:
-            res.append(slice_component.step)
+def _loop_ctr_assignments(loop_counter_symbols, coordinates, iteration_space):
+    loop_ctr_assignments = []
+    for loop_counter, coordinate, iter_slice in zip(loop_counter_symbols, coordinates, iteration_space):
+        if isinstance(iter_slice, slice) and iter_slice.step > 1:
+            loop_ctr_assignments.append(SympyAssignment(loop_counter, coordinate * iter_slice.step))
+        elif iter_slice.start == iter_slice.stop:
+            loop_ctr_assignments.append(SympyAssignment(loop_counter, 0))
        else:
-            res.append(1)
-    return res
-
+            loop_ctr_assignments.append(SympyAssignment(loop_counter, coordinate))

-def _iteration_space(iteration_slice, arr_shape):
-    starts = _get_start_from_slice(iteration_slice)
-    ends = _get_end_from_slice(iteration_slice, arr_shape)
-    steps = _get_steps_from_slice(iteration_slice)
-    return [slice(start, end, step) for start, end, step in zip(starts, ends, steps)]
+    return loop_ctr_assignments


 def indexing_creator_from_params(gpu_indexing, gpu_indexing_params):

--- a/pystencils/gpu/kernelcreation.py
+++ b/pystencils/gpu/kernelcreation.py
 from typing import Union

-import numpy as np
-
 from pystencils.astnodes import Block, KernelFunction, LoopOverCoordinate, SympyAssignment
 from pystencils.config import CreateKernelConfig
 from pystencils.typing import StructType, TypedSymbol
@@ -12,6 +10,7 @@ from pystencils.gpu.gpujit import make_python_function
 from pystencils.node_collection import NodeCollection
 from pystencils.gpu.indexing import indexing_creator_from_params
 from pystencils.simp.assignment_collection import AssignmentCollection
+from pystencils.slicing import normalize_slice
 from pystencils.transformations import (
    get_base_buffer_index, get_common_field, parse_base_pointer_info,
    resolve_buffer_accesses, resolve_field_accesses, unify_shape_symbols)
@@ -64,17 +63,18 @@ def create_cuda_kernel(assignments: Union[AssignmentCollection, NodeCollection],
                iteration_slice.append(slice(ghost_layers[i][0],
                                             -ghost_layers[i][1] if ghost_layers[i][1] > 0 else None))

-    indexing = indexing_creator(field=common_field, iteration_slice=iteration_slice)
-    coord_mapping = indexing.coordinates
+        iteration_space = normalize_slice(iteration_slice, common_shape)
+    else:
+        iteration_space = normalize_slice(iteration_slice, common_shape)

-    cell_idx_assignments = [SympyAssignment(LoopOverCoordinate.get_loop_counter_symbol(i), value)
-                            for i, value in enumerate(coord_mapping)]
-    cell_idx_symbols = [LoopOverCoordinate.get_loop_counter_symbol(i) for i, _ in enumerate(coord_mapping)]
-    assignments = cell_idx_assignments + assignments
+    iteration_space = tuple([s if isinstance(s, slice) else slice(s, s, 1) for s in iteration_space])
+    loop_counter_symbols = [LoopOverCoordinate.get_loop_counter_symbol(i) for i in range(len(iteration_space))]

-    block = Block(assignments)
+    indexing = indexing_creator(iteration_space=iteration_space, data_layout=common_field.layout)
+    loop_counter_assignments = indexing.get_loop_ctr_assignments(loop_counter_symbols)
+    assignments = loop_counter_assignments + assignments
+    block = indexing.guard(Block(assignments), common_shape)

-    block = indexing.guard(block, common_shape)
    unify_shape_symbols(block, common_shape=common_shape, fields=fields_without_buffers)

    ast = KernelFunction(block,
@@ -91,11 +91,11 @@ def create_cuda_kernel(assignments: Union[AssignmentCollection, NodeCollection],
                                                         f.spatial_dimensions, f.index_dimensions)
                         for f in all_fields}

-    coord_mapping = {f.name: cell_idx_symbols for f in all_fields}
+    coord_mapping = {f.name: loop_counter_symbols for f in all_fields}

    if any(FieldType.is_buffer(f) for f in all_fields):
-        iteration_space = indexing.iteration_space(common_shape)
-        resolve_buffer_accesses(ast, get_base_buffer_index(ast, cell_idx_symbols, iteration_space), read_only_fields)
+        base_buffer_index = get_base_buffer_index(ast, loop_counter_symbols, iteration_space)
+        resolve_buffer_accesses(ast, base_buffer_index, read_only_fields)

    resolve_field_accesses(ast, read_only_fields, field_to_base_pointer_info=base_pointer_info,
                           field_to_fixed_coordinates=coord_mapping)
@@ -147,7 +147,7 @@ def created_indexed_cuda_kernel(assignments: Union[AssignmentCollection, NodeCol
            data_type = ind_f.dtype
            if data_type.has_element(name):
                rhs = ind_f[0](name)
-                lhs = TypedSymbol(name, np.int64)
+                lhs = TypedSymbol(name, data_type.get_element_type(name))
                return SympyAssignment(lhs, rhs)
        raise ValueError(f"Index {name} not found in any of the passed index fields")

@@ -156,8 +156,12 @@ def created_indexed_cuda_kernel(assignments: Union[AssignmentCollection, NodeCol
    coordinate_typed_symbols = [eq.lhs for eq in coordinate_symbol_assignments]

    idx_field = list(index_fields)[0]
-    indexing = indexing_creator(field=idx_field,
-                                iteration_slice=[slice(None, None, None)] * len(idx_field.spatial_shape))
+
+    iteration_space = normalize_slice(tuple([slice(None, None, None)]) * len(idx_field.spatial_shape),
+                                      idx_field.spatial_shape)
+
+    indexing = indexing_creator(iteration_space=iteration_space,
+                                data_layout=idx_field.layout)

    function_body = Block(coordinate_symbol_assignments + assignments)
    function_body = indexing.guard(function_body, get_common_field(index_fields).spatial_shape)

--- a/pystencils_tests/test_buffer_gpu.py
+++ b/pystencils_tests/test_buffer_gpu.py
@@ -275,7 +275,8 @@ def test_buffer_indexing():
    assert len(spatial_shape_symbols) <= 3


-def test_iteration_slices():
+@pytest.mark.parametrize('gpu_indexing', ("block", "line"))
+def test_iteration_slices(gpu_indexing):
    num_cell_values = 19
    dt = np.uint64
    fields = _generate_fields(dt=dt, stencil_directions=num_cell_values)
@@ -303,7 +304,8 @@ def test_iteration_slices():
        gpu_dst_arr.fill(0)

        config = CreateKernelConfig(target=Target.GPU, iteration_slice=pack_slice,
-                                    data_type={'src_field': gpu_src_arr.dtype, 'buffer': gpu_buffer_arr.dtype})
+                                    data_type={'src_field': gpu_src_arr.dtype, 'buffer': gpu_buffer_arr.dtype},
+                                    gpu_indexing=gpu_indexing)

        pack_code = create_kernel(pack_eqs, config=config)
        pack_kernel = pack_code.compile()
@@ -316,7 +318,8 @@ def test_iteration_slices():
            unpack_eqs.append(eq)

        config = CreateKernelConfig(target=Target.GPU, iteration_slice=pack_slice,
-                                    data_type={'dst_field': gpu_dst_arr.dtype, 'buffer': gpu_buffer_arr.dtype})
+                                    data_type={'dst_field': gpu_dst_arr.dtype, 'buffer': gpu_buffer_arr.dtype},
+                                    gpu_indexing=gpu_indexing)

        unpack_code = create_kernel(unpack_eqs, config=config)
        unpack_kernel = unpack_code.compile()

--- a/pystencils_tests/test_gpu.py
+++ b/pystencils_tests/test_gpu.py
@@ -8,7 +8,7 @@ from scipy.ndimage import convolve
 from pystencils import Assignment, Field, fields, CreateKernelConfig, create_kernel, Target
 from pystencils.gpu import BlockIndexing
 from pystencils.simp import sympy_cse_on_assignment_list
-from pystencils.slicing import add_ghost_layers, make_slice, remove_ghost_layers
+from pystencils.slicing import add_ghost_layers, make_slice, remove_ghost_layers, normalize_slice

 try:
    import cupy
@@ -164,14 +164,17 @@ def test_periodicity():
 @pytest.mark.parametrize("device_number", device_numbers)
 def test_block_indexing(device_number):
    f = fields("f: [3D]")
-    bi = BlockIndexing(f, make_slice[:, :, :], block_size=(16, 8, 2), permute_block_size_dependent_on_layout=False)
+    s = normalize_slice(make_slice[:, :, :], f.spatial_shape)
+    bi = BlockIndexing(s, f.layout, block_size=(16, 8, 2),
+                       permute_block_size_dependent_on_layout=False)
    assert bi.call_parameters((3, 2, 32))['block'] == (3, 2, 32)
    assert bi.call_parameters((32, 2, 32))['block'] == (16, 2, 8)

-    bi = BlockIndexing(f, make_slice[:, :, :], block_size=(32, 1, 1), permute_block_size_dependent_on_layout=False)
+    bi = BlockIndexing(s, f.layout, block_size=(32, 1, 1),
+                       permute_block_size_dependent_on_layout=False)
    assert bi.call_parameters((1, 16, 16))['block'] == (1, 16, 2)

-    bi = BlockIndexing(f, make_slice[:, :, :], block_size=(16, 8, 2),
+    bi = BlockIndexing(s, f.layout, block_size=(16, 8, 2),
                       maximum_block_size="auto", device_number=device_number)

    # This function should be used if number of needed registers is known. Can be determined with func.num_regs
@@ -187,3 +190,23 @@ def test_block_indexing(device_number):

    assert np.prod(blocks) * registers_per_thread < max_registers_per_block

+
+@pytest.mark.parametrize('gpu_indexing', ("block", "line"))
+@pytest.mark.parametrize('layout', ("C", "F"))
+@pytest.mark.parametrize('shape', ((5, 5, 5, 5), (3, 17, 387, 4), (23, 44, 21, 11)))
+def test_four_dimensional_kernel(gpu_indexing, layout, shape):
+    n_elements = np.prod(shape)
+
+    arr_cpu = np.arange(n_elements, dtype=np.float64).reshape(shape, order=layout)
+    arr_gpu = cp.asarray(arr_cpu)
+
+    iteration_slice = make_slice[:, :, :, :]
+    f = Field.create_from_numpy_array("f", arr_cpu)
+    update_rule = [Assignment(f.center, sp.Symbol("value"))]
+
+    config = CreateKernelConfig(target=Target.GPU, gpu_indexing=gpu_indexing, iteration_slice=iteration_slice)
+    ast = create_kernel(update_rule, config=config)
+    kernel = ast.compile()
+
+    kernel(f=arr_gpu, value=np.float64(42.0))
+    np.testing.assert_equal(arr_gpu.get(), np.ones(shape) * 42.0)