diff --git a/src/pystencils/gpu/indexing.py b/src/pystencils/gpu/indexing.py
index 843e77bb87f60ea4c509d4ec8827e41ee204c42f..c1b0c0936d1ae0c68345bd472b46cf65bd161cb2 100644
--- a/src/pystencils/gpu/indexing.py
+++ b/src/pystencils/gpu/indexing.py
@@ -224,6 +224,9 @@ class BlockIndexing(AbstractIndexing):
             assert len(self._iteration_space) == len(arr_shape), "Iteration space must be equal to the array shape"
             numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space, arr_shape)
         end = [s.stop if s.stop != 0 else 1 for s in numeric_iteration_slice]
+        for i, s in enumerate(numeric_iteration_slice):
+            if s.step and s.step != 1:
+                end[i] = div_ceil(s.stop - s.start, s.step) + s.start
 
         if self._dim < 4:
             conditions = [c < e for c, e in zip(self.coordinates, end)]
diff --git a/tests/test_gpu.py b/tests/test_gpu.py
index 04d616d4ead4b84df6ff1ee46dec83d8ce1c5503..7e35a4f9a66b5478cad6ffe49fbe0cd09db573c9 100644
--- a/tests/test_gpu.py
+++ b/tests/test_gpu.py
@@ -1,23 +1,25 @@
 import pytest
 
 import numpy as np
-import cupy as cp
 import sympy as sp
+import math
 from scipy.ndimage import convolve
 
-from pystencils import Assignment, Field, fields, CreateKernelConfig, create_kernel, Target
+from pystencils import Assignment, Field, fields, CreateKernelConfig, create_kernel, Target, get_code_str
 from pystencils.gpu import BlockIndexing
 from pystencils.simp import sympy_cse_on_assignment_list
 from pystencils.slicing import add_ghost_layers, make_slice, remove_ghost_layers, normalize_slice
 
 try:
-    import cupy
-    device_numbers = range(cupy.cuda.runtime.getDeviceCount())
+    import cupy as cp
+    device_numbers = range(cp.cuda.runtime.getDeviceCount())
 except ImportError:
     device_numbers = []
+    cp = None
 
 
 def test_averaging_kernel():
+    pytest.importorskip('cupy')
     size = (40, 55)
     src_arr = np.random.rand(*size)
     src_arr = add_ghost_layers(src_arr)
@@ -44,6 +46,7 @@ def test_averaging_kernel():
 
 
 def test_variable_sized_fields():
+    pytest.importorskip('cupy')
     src_field = Field.create_generic('src', spatial_dimensions=2)
     dst_field = Field.create_generic('dst', spatial_dimensions=2)
 
@@ -71,6 +74,7 @@ def test_variable_sized_fields():
 
 
 def test_multiple_index_dimensions():
     """Sums along the last axis of a numpy array"""
+    pytest.importorskip('cupy')
     src_size = (7, 6, 4)
     dst_size = src_size[:2]
@@ -103,6 +107,7 @@ def test_multiple_index_dimensions():
 
 
 def test_ghost_layer():
+    pytest.importorskip('cupy')
     size = (6, 5)
     src_arr = np.ones(size)
     dst_arr = np.zeros_like(src_arr)
@@ -127,6 +132,7 @@ def test_ghost_layer():
 
 
 def test_setting_value():
+    pytest.importorskip('cupy')
     arr_cpu = np.arange(25, dtype=np.float64).reshape(5, 5)
     arr_gpu = cp.asarray(arr_cpu)
 
@@ -143,6 +149,7 @@ def test_setting_value():
 
 
 def test_periodicity():
+    pytest.importorskip('cupy')
     from pystencils.gpu.periodicity import get_periodic_boundary_functor as periodic_gpu
     from pystencils.slicing import get_periodic_boundary_functor as periodic_cpu
 
@@ -163,6 +170,7 @@ def test_periodicity():
 
 @pytest.mark.parametrize("device_number", device_numbers)
 def test_block_indexing(device_number):
+    pytest.importorskip('cupy')
     f = fields("f: [3D]")
     s = normalize_slice(make_slice[:, :, :], f.spatial_shape)
     bi = BlockIndexing(s, f.layout, block_size=(16, 8, 2),
@@ -195,6 +203,7 @@ def test_block_indexing(device_number):
 @pytest.mark.parametrize('layout', ("C", "F"))
 @pytest.mark.parametrize('shape', ((5, 5, 5, 5), (3, 17, 387, 4), (23, 44, 21, 11)))
 def test_four_dimensional_kernel(gpu_indexing, layout, shape):
+    pytest.importorskip('cupy')
     n_elements = np.prod(shape)
 
     arr_cpu = np.arange(n_elements, dtype=np.float64).reshape(shape, order=layout)
@@ -210,3 +219,39 @@ def test_four_dimensional_kernel(gpu_indexing, layout, shape):
 
     kernel(f=arr_gpu, value=np.float64(42.0))
     np.testing.assert_equal(arr_gpu.get(), np.ones(shape) * 42.0)
+
+
+@pytest.mark.parametrize('start', (1, 5))
+@pytest.mark.parametrize('end', (-1, -2, -3, -4))
+@pytest.mark.parametrize('step', (1, 2, 3, 4))
+@pytest.mark.parametrize('shape', ([55, 60], [77, 101, 80], [44, 64, 66]))
+def test_guards_with_iteration_slices(start, end, step, shape):
+    """Check the bounds guard in generated GPU code for strided iteration slices.
+
+    The same ``slice(start, end, step)`` is used in every dimension. ``end`` is
+    negative, so the absolute stop of a dimension is its extent plus ``end``.
+    The guard is expected to compare against ``start + ceil((stop - start) / step)``,
+    which admits as many indices as ``range(start, stop, step)`` has elements.
+    """
+    iter_slice = tuple([slice(start, end, step)] * len(shape))
+
+    kernel_config_gpu = CreateKernelConfig(target=Target.GPU, iteration_slice=iter_slice)
+    field_1 = fields(f"f(1) : double{list(shape)}")
+    assignment = Assignment(field_1.center, 1)
+    ast = create_kernel(assignment, config=kernel_config_gpu)
+    code_str = get_code_str(ast)
+
+    expected_bounds = []
+    for extent in shape:
+        stop = extent + end
+        # the guard bound is start plus the number of points the strided slice visits
+        upper = start + math.ceil((stop - start) / step)
+        expected_bounds.append((stop, upper))
+
+    # check if the expected if statement is in the GPU code
+    for _, upper in expected_bounds:
+        assert f"{start} < {upper}" in code_str
+
+    # check if these bounds lead to same lengths as the range function would produce
+    for stop, upper in expected_bounds:
+        assert upper - start == len(range(start, stop, step))