diff --git a/pystencils/datahandling/__init__.py b/pystencils/datahandling/__init__.py
index 7f142428cf14b62813a7b9b1b245ffae021c1fb8..18053d2d9d6546bcb5ac2093f5f63c1633965a4e 100644
--- a/pystencils/datahandling/__init__.py
+++ b/pystencils/datahandling/__init__.py
@@ -23,7 +23,8 @@ def create_data_handling(domain_size: Tuple[int, ...],
                          default_layout: str = 'SoA',
                          default_target: Target = Target.CPU,
                          parallel: bool = False,
-                         default_ghost_layers: int = 1) -> DataHandling:
+                         default_ghost_layers: int = 1,
+                         device_number: Union[int, None] = None) -> DataHandling:
     """Creates a data handling instance.
 
     Args:
@@ -34,6 +35,9 @@ def create_data_handling(domain_size: Tuple[int, ...],
         default_target: `Target`
         parallel: if True a parallel domain is created using walberla - each MPI process gets a part of the domain
         default_ghost_layers: default number of ghost layers if not overwritten in 'add_array'
+        device_number: If `default_target` is set to 'GPU' and `parallel` is False, a device number can be
+                       specified. If none is given, the device with the largest amount of memory is used. If
+                       multiple devices have the same amount of memory, the one with the lowest number is used.
     """
     if isinstance(default_target, str):
         new_target = Target[default_target.upper()]
@@ -69,7 +73,8 @@ def create_data_handling(domain_size: Tuple[int, ...],
                                   periodicity=periodicity,
                                   default_target=default_target,
                                   default_layout=default_layout,
-                                  default_ghost_layers=default_ghost_layers)
+                                  default_ghost_layers=default_ghost_layers,
+                                  device_number=device_number)
 
 
 __all__ = ['create_data_handling']
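
A minimal usage sketch of the new parameter, assuming cupy is installed and more than one CUDA device is visible; the domain size, array name, and device id below are illustrative, not taken from the patch:

    import pystencils as ps
    from pystencils import Target

    # Pin all GPU allocations of this data handling to CUDA device 1 (hypothetical id).
    dh = ps.create_data_handling(domain_size=(64, 64),
                                 default_target=Target.GPU,
                                 parallel=False,
                                 device_number=1)
    src = dh.add_array('src', values_per_cell=1)  # allocated on the selected device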
diff --git a/pystencils/datahandling/serial_datahandling.py b/pystencils/datahandling/serial_datahandling.py
index e0b42771d081d83593868adadc436a86dba57f30..0f5ddb431a869f3326f25b46a4f276268d2afd44 100644
--- a/pystencils/datahandling/serial_datahandling.py
+++ b/pystencils/datahandling/serial_datahandling.py
@@ -57,7 +57,7 @@ class SerialDataHandling(DataHandling):
         if not array_handler:
             try:
                 if device_number is None:
-                    import cupy
+                    import cupy.cuda.runtime
                     if cupy.cuda.runtime.getDeviceCount() > 0:
                         device_number = sorted(range(cupy.cuda.runtime.getDeviceCount()),
                                                key=lambda i: cupy.cuda.Device(i).mem_info[1], reverse=True)[0]
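
For reference, the selection above picks the device with the largest total memory, falling back to the lowest id on ties; a standalone sketch of the same logic, assuming cupy is installed (pick_device is a hypothetical helper, not part of pystencils):

    import cupy
    import cupy.cuda.runtime

    def pick_device():
        """Return the id of the CUDA device with the most total memory (ties keep the lowest id)."""
        count = cupy.cuda.runtime.getDeviceCount()
        if count == 0:
            return None
        # Device(i).mem_info is (free_bytes, total_bytes); sorted() is stable, so devices
        # with equal total memory keep their ascending order and [0] is the lowest such id.
        return sorted(range(count), key=lambda i: cupy.cuda.Device(i).mem_info[1], reverse=True)[0]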
diff --git a/pystencils/gpu/gpujit.py b/pystencils/gpu/gpujit.py
index 420c3241db2f5c1c8dcfe349bbaeaadf3b927286..52268924126870508c921a8569f6579707e72fda 100644
--- a/pystencils/gpu/gpujit.py
+++ b/pystencils/gpu/gpujit.py
@@ -75,7 +75,7 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
         try:
             args, block_and_thread_numbers = cache[key]
             device = set(a.device.id for a in args if type(a) is cp.ndarray)
-            assert len(device) == 1
+            assert len(device) == 1, "All arrays used by a kernel need to be allocated on the same device"
             with cp.cuda.Device(device.pop()):
                 func(block_and_thread_numbers['grid'], block_and_thread_numbers['block'], args)
         except KeyError:
@@ -92,7 +92,7 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
             cache[key] = (args, block_and_thread_numbers)
             cache_values.append(kwargs)  # keep objects alive such that ids remain unique
             device = set(a.device.id for a in args if type(a) is cp.ndarray)
-            assert len(device) == 1
+            assert len(device) == 1, "All arrays used by a kernel need to be allocated on the same device"
             with cp.cuda.Device(device.pop()):
                 func(block_and_thread_numbers['grid'], block_and_thread_numbers['block'], args)
                 # useful for debugging:
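
The two assertions above only gain a message; they still require every array passed to a GPU kernel to live on the same device. A hedged sketch of how a caller keeps allocations consistent (device id and shapes are illustrative):

    import cupy as cp

    device_id = 1  # hypothetical; must match the device of every kernel argument
    with cp.cuda.Device(device_id):
        src = cp.zeros((64, 64), dtype=cp.float64)
        dst = cp.zeros_like(src)

    # Both arrays report the same device, so the len(device) == 1 check passes.
    assert src.device.id == dst.device.id == device_id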
diff --git a/pystencils_tests/test_datahandling.py b/pystencils_tests/test_datahandling.py
index d31ae1bed0dacf9397126f9129210d404fb8e730..15e9cd74baf7df4e8922ce5f4a624a1fc0f6fb75 100644
--- a/pystencils_tests/test_datahandling.py
+++ b/pystencils_tests/test_datahandling.py
@@ -16,7 +16,7 @@ except ImportError:
     pytest = unittest.mock.MagicMock()
 
 try:
-    import cupy
+    import cupy.cuda.runtime
     device_numbers = range(cupy.cuda.runtime.getDeviceCount())
 except ImportError:
     device_numbers = []
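
A sketch of how the collected device_numbers might drive a parametrized test; the test name and body are illustrative and rely on the fallback above (an empty list simply yields no test cases when cupy is missing):

    @pytest.mark.parametrize('device_number', device_numbers)
    def test_create_data_handling_on_device(device_number):
        dh = ps.create_data_handling((8, 8), default_target=Target.GPU,
                                     parallel=False, device_number=device_number)
        dh.add_array('f')
        dh.all_to_gpu()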