diff --git a/pystencils/config.py b/pystencils/config.py
index 7c2be7a66f3d9d9ec0f3c4e6f4a406bbea471caa..23570d625dc96257ad3e6c420f5e1a54467479cb 100644
--- a/pystencils/config.py
+++ b/pystencils/config.py
@@ -82,7 +82,7 @@ class CreateKernelConfig:
     """
     Either 'block' or 'line' , or custom indexing class, see `pystencils.gpucuda.AbstractIndexing`
     """
-    gpu_indexing_params: MappingProxyType = field(default=MappingProxyType({}))
+    gpu_indexing_params: MappingProxyType = field(default_factory=lambda: MappingProxyType({}))
     """
     Dict with indexing parameters (constructor parameters of indexing class)
     e.g. for 'block' one can specify '{'block_size': (20, 20, 10) }'.
@@ -121,12 +121,12 @@ class CreateKernelConfig:
     allow_double_writes: bool = False
     """
     If True, don't check if every field is only written at a single location. This is required
-    for example for kernels that are compiled with loop step sizes > 1, that handle multiple 
+    for example for kernels that are compiled with loop step sizes > 1, that handle multiple
     cells at once. Use with care!
     """
     skip_independence_check: bool = False
     """
-    Don't check that loop iterations are independent. This is needed e.g. for 
+    Don't check that loop iterations are independent. This is needed e.g. for
     periodicity kernel, that access the field outside the iteration bounds. Use with care!
     """

diff --git a/pystencils/gpucuda/cudajit.py b/pystencils/gpucuda/cudajit.py
index b6fb901750895b341d44fde26040ff3b91d0e9e9..378f338ade4f0e5ea35dce84e8fdd70b44123059 100644
--- a/pystencils/gpucuda/cudajit.py
+++ b/pystencils/gpucuda/cudajit.py
@@ -2,17 +2,19 @@ import numpy as np

 from pystencils.backends.cbackend import get_headers
 from pystencils.backends.cuda_backend import generate_cuda
-from pystencils.typing import StructType
 from pystencils.field import FieldType
 from pystencils.include import get_pycuda_include_path, get_pystencils_include_path
 from pystencils.kernel_wrapper import KernelWrapper
+from pystencils.typing import StructType
 from pystencils.typing.typed_sympy import FieldPointerSymbol
+from pystencils.utils import DotDict

 USE_FAST_MATH = True
+USE_PYCUDA = True


 def get_cubic_interpolation_include_paths():
-    from os.path import join, dirname
+    from os.path import dirname, join

     return [join(dirname(__file__), "CubicInterpolationCUDA", "code"),
             join(dirname(__file__), "CubicInterpolationCUDA", "code", "internal")]
@@ -32,9 +34,6 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
     Returns:
         compiled kernel as Python function
     """
-    import pycuda.autoinit  # NOQA
-    from pycuda.compiler import SourceModule
-
     if argument_dict is None:
         argument_dict = {}

@@ -42,17 +41,25 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
     includes = "\n".join([f"#include {include_file}" for include_file in header_list])

     code = includes + "\n"
-    code += "#define FUNC_PREFIX __global__\n"
+    code += "#define FUNC_PREFIX extern \"C\" __global__\n"
     code += "#define RESTRICT __restrict__\n\n"
     code += str(generate_cuda(kernel_function_node, custom_backend=custom_backend))

-    nvcc_options = ["-w", "-std=c++11", "-Wno-deprecated-gpu-targets"]
+    nvcc_options = ["-w", "-std=c++11"]
     if USE_FAST_MATH:
         nvcc_options.append("-use_fast_math")

-    mod = SourceModule(code, options=nvcc_options, include_dirs=[
-        get_pystencils_include_path(), get_pycuda_include_path()])
-    func = mod.get_function(kernel_function_node.function_name)
+    if USE_PYCUDA:
+        import pycuda.autoinit  # NOQA
+        from pycuda.compiler import SourceModule
nvcc_options.append("-Wno-deprecated-gpu-targets") + mod = SourceModule(code, options=nvcc_options, include_dirs=[ + get_pystencils_include_path(), get_pycuda_include_path()]) + func = mod.get_function(kernel_function_node.function_name) + else: + import cupy + nvcc_options.append("-I" + get_pystencils_include_path()) + func = cupy.RawKernel(code, kernel_function_node.function_name, options=tuple(nvcc_options), jitify=True) parameters = kernel_function_node.get_parameters() @@ -78,13 +85,19 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen args = _build_numpy_argument_list(parameters, full_arguments) cache[key] = (args, block_and_thread_numbers) cache_values.append(kwargs) # keep objects alive such that ids remain unique - func(*args, **block_and_thread_numbers) + if USE_PYCUDA: + func(*args, **block_and_thread_numbers) + else: + func(block_and_thread_numbers['grid'], + block_and_thread_numbers['block'], [cupy.asarray(a) for a in args]) + # import pycuda.driver as cuda # cuda.Context.synchronize() # useful for debugging, to get errors right after kernel was called ast = kernel_function_node parameters = kernel_function_node.get_parameters() wrapper = KernelWrapper(wrapper, parameters, ast) - wrapper.num_regs = func.num_regs + if USE_PYCUDA: + wrapper.num_regs = func.num_regs return wrapper @@ -95,7 +108,16 @@ def _build_numpy_argument_list(parameters, argument_dict): for param in parameters: if param.is_field_pointer: array = argument_dict[param.field_name] - actual_type = array.dtype + if not all(hasattr(array, attr) for attr in ["shape", "stride", "data", "dtype"]) or not isinstance( + array.dtype, np.dtype): + interface = DotDict() + interface.shape = array.__cuda_array_interface__["shape"] + interface.dtype = np.dtype(array.__cuda_array_interface__["typestr"]) + interface.strides = array.__cuda_array_interface__["strides"] + interface.data = array.__cuda_array_interface__["data"][0] + else: + interface = array + actual_type = interface.dtype expected_type = param.fields[0].dtype.numpy_dtype if expected_type != actual_type: raise ValueError("Data type mismatch for field '%s'. Expected '%s' got '%s'." 
@@ -136,6 +158,16 @@ def _check_arguments(parameter_specification, argument_dict):
             except KeyError:
                 raise KeyError("Missing field parameter for kernel call " + str(symbolic_field))

+            if not all(hasattr(field_arr, attr) for attr in ["shape", "strides", "data", "dtype"]) or not isinstance(
+                    field_arr.dtype, np.dtype):
+                if hasattr(field_arr, "__cuda_array_interface__"):
+                    field_interface = DotDict()
+                    field_interface.shape = field_arr.__cuda_array_interface__["shape"]
+                    field_interface.dtype = np.dtype(field_arr.__cuda_array_interface__["typestr"])
+                    field_interface.strides = field_arr.__cuda_array_interface__["strides"]
+                    field_interface.data = field_arr.__cuda_array_interface__["data"][0]
+                    field_arr = field_interface
+
             if symbolic_field.has_fixed_shape:
                 symbolic_field_shape = tuple(int(i) for i in symbolic_field.shape)
                 if isinstance(symbolic_field.dtype, StructType):
@@ -147,7 +179,7 @@ def _check_arguments(parameter_specification, argument_dict):
             symbolic_field_strides = tuple(int(i) * field_arr.dtype.itemsize for i in symbolic_field.strides)
             if isinstance(symbolic_field.dtype, StructType):
                 symbolic_field_strides = symbolic_field_strides[:-1]
-            if symbolic_field_strides != field_arr.strides:
+            if field_arr.strides and symbolic_field_strides != field_arr.strides:
                 raise ValueError("Passed array '%s' has strides %s which does not match expected strides %s"
                                  % (symbolic_field.name, str(field_arr.strides), str(symbolic_field_strides)))

diff --git a/pystencils_tests/test_cudagpu.py b/pystencils_tests/test_cudagpu.py
index a65a08ba6d24b30002822e9916b2d3d44639d26a..8dac067b29b8573fc8c9a4e03a83cde81b96ce8d 100644
--- a/pystencils_tests/test_cudagpu.py
+++ b/pystencils_tests/test_cudagpu.py
@@ -1,16 +1,28 @@
 import numpy as np
 import pycuda.autoinit
 import pycuda.gpuarray as gpuarray
+import pytest
 import sympy as sp
 from scipy.ndimage import convolve

-from pystencils import Assignment, Field, fields, CreateKernelConfig, create_kernel, Target
+from pystencils import Assignment, CreateKernelConfig, Field, Target, create_kernel, fields
 from pystencils.gpucuda import BlockIndexing
 from pystencils.simp import sympy_cse_on_assignment_list
 from pystencils.slicing import add_ghost_layers, make_slice, remove_ghost_layers


-def test_averaging_kernel():
+@pytest.mark.parametrize("framework", ("cupy", "torch", "numba", "pycuda"))
+def test_averaging_kernel(framework):
+    if framework == "cupy":
+        cupy = pytest.importorskip(framework)
+    elif framework == "torch":
+        torch = pytest.importorskip(framework)
+    elif framework == "numba":
+        numba = pytest.importorskip(framework)
+    elif framework == "pycuda":
+        pycuda = pytest.importorskip(framework)
+        import pycuda.autoinit
+        import pycuda.gpuarray
     size = (40, 55)
     src_arr = np.random.rand(*size)
     src_arr = add_ghost_layers(src_arr)
@@ -25,10 +37,29 @@ def test_averaging_kernel():
     ast = create_kernel(sympy_cse_on_assignment_list([update_rule]), config=config)
     kernel = ast.compile()

-    gpu_src_arr = gpuarray.to_gpu(src_arr)
-    gpu_dst_arr = gpuarray.to_gpu(dst_arr)
+    if framework == "cupy":
+        gpu_src_arr = cupy.array(src_arr)
+        gpu_dst_arr = cupy.array(dst_arr)
+    elif framework == "torch":
+        gpu_src_arr = torch.from_numpy(src_arr).cuda()
+        gpu_dst_arr = torch.from_numpy(dst_arr).cuda()
+    elif framework == "numba":
+        gpu_src_arr = numba.cuda.to_device(src_arr)
+        gpu_dst_arr = numba.cuda.to_device(dst_arr)
+    elif framework == "pycuda":
+        gpu_src_arr = pycuda.gpuarray.to_gpu(src_arr)
+        gpu_dst_arr = pycuda.gpuarray.to_gpu(dst_arr)
+
     kernel(src=gpu_src_arr, dst=gpu_dst_arr)
-    gpu_dst_arr.get(dst_arr)
+
+    if framework == "cupy":
+        dst_arr = gpu_dst_arr.get()
+    elif framework == "torch":
+        dst_arr = gpu_dst_arr.cpu()
+    elif framework == "numba":
+        gpu_dst_arr.copy_to_host(dst_arr)
+    elif framework == "pycuda":
+        gpu_dst_arr.get(dst_arr)

     stencil = np.array([[0, 1, 0], [1, 0, 1], [0, 1, 0]]) / 4.0
     reference = convolve(remove_ghost_layers(src_arr), stencil, mode='constant', cval=0.0)
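
Why the FUNC_PREFIX change: cupy.RawKernel looks the kernel up in the compiled module by its plain symbol name, so the kernel needs C linkage; pycuda's SourceModule wraps the whole source in extern "C" itself by default, which is why the bare __global__ prefix was sufficient before. Below is a minimal standalone sketch of the RawKernel calling convention used by the wrapper above; the kernel and its name "scale" are made up for illustration and are not part of the patch.

import cupy
import numpy as np

# Without extern "C", the C++-mangled name would not match the lookup below.
code = r'''
extern "C" __global__ void scale(double *data, double factor, int n)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n) {
        data[i] *= factor;
    }
}
'''

scale_kernel = cupy.RawKernel(code, "scale")
x = cupy.arange(8, dtype=cupy.float64)
# RawKernel is invoked as func(grid, block, args), the same convention the
# wrapper uses with block_and_thread_numbers['grid'] / ['block'].
scale_kernel((1,), (8,), (x, np.float64(2.0), np.int32(8)))
print(x)  # [ 0.  2.  4.  6.  8. 10. 12. 14.]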
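The DotDict shim added to _build_numpy_argument_list and _check_arguments is what lets a single wrapper accept cupy, torch, numba and pycuda arrays alike: any object that does not already expose numpy-style metadata is re-described through its __cuda_array_interface__. A self-contained sketch of that normalization follows; the DotDict stub and the name normalize_gpu_array are illustrative only (pystencils imports its own DotDict from pystencils.utils).

import numpy as np


class DotDict(dict):
    # Stub: a dict whose entries are also reachable as attributes.
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__


def normalize_gpu_array(array):
    # Return an object carrying numpy-style shape/strides/dtype/data.
    numpy_like = (all(hasattr(array, attr) for attr in ("shape", "strides", "data", "dtype"))
                  and isinstance(array.dtype, np.dtype))
    if numpy_like:
        return array  # e.g. numpy and cupy arrays already look right
    cai = array.__cuda_array_interface__  # torch tensors, numba device arrays, ...
    interface = DotDict()
    interface.shape = cai["shape"]
    interface.dtype = np.dtype(cai["typestr"])
    interface.strides = cai["strides"]  # None for C-contiguous data, hence the
                                        # "field_arr.strides and ..." guard above
    interface.data = cai["data"][0]     # integer device pointer
    return interface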
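End to end, the cupy path can be exercised by flipping the module-level USE_PYCUDA switch before compiling. A hedged sketch, assuming a CUDA device, an installed cupy, and the patched module above; the averaging stencil mirrors the test.

import cupy
import pystencils as ps
from pystencils.gpucuda import cudajit

cudajit.USE_PYCUDA = False  # select the cupy/RawKernel branch added above

src, dst = ps.fields("src, dst: double[2D]")
update = ps.Assignment(dst.center,
                       (src[1, 0] + src[-1, 0] + src[0, 1] + src[0, -1]) / 4)
config = ps.CreateKernelConfig(target=ps.Target.GPU)
kernel = ps.create_kernel(update, config=config).compile()

gpu_src = cupy.random.rand(40, 55)   # device arrays pass straight through
gpu_dst = cupy.zeros_like(gpu_src)
kernel(src=gpu_src, dst=gpu_dst)
print(gpu_dst.get().mean())          # copy the result back as a numpy array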