Commit 902cab8f authored by Stephan Seitz

feat: implement `__cuda_array_interface__`

https://numba.readthedocs.io/en/stable/cuda/cuda_array_interface.html

Also allow executing with cupy (https://docs.cupy.dev/en/stable/index.html)
instead of pycuda.

TODO:
- [ ] check that pointers are in the correct CUDA context and, if not, import them into the current one
- [ ] make execution with pycuda aware of `__cuda_array_interface__`
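For context: `__cuda_array_interface__` is a dict describing a device buffer; the keys this commit unpacks are shape, typestr, strides and data. A hedged illustration of what a producing library exposes, assuming cupy is installed (this snippet is not part of the commit):

import cupy

arr = cupy.zeros((4, 8), dtype=cupy.float64)
iface = arr.__cuda_array_interface__
print(iface["shape"])    # (4, 8)
print(iface["typestr"])  # '<f8' (little-endian float64)
print(iface["strides"])  # None for C-contiguous arrays, else a byte-stride tuple
print(iface["data"][0])  # integer device pointer; iface["data"][1] is the read-only flag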
parent 262446d1
Pipeline #51352 failed
@@ -82,7 +82,7 @@ class CreateKernelConfig:
     """
     Either 'block' or 'line' , or custom indexing class, see `pystencils.gpucuda.AbstractIndexing`
     """
-    gpu_indexing_params: MappingProxyType = field(default=MappingProxyType({}))
+    gpu_indexing_params: MappingProxyType = field(default_factory=lambda: MappingProxyType({}))
     """
     Dict with indexing parameters (constructor parameters of indexing class)
     e.g. for 'block' one can specify '{'block_size': (20, 20, 10) }'.
@@ -121,12 +121,12 @@ class CreateKernelConfig:
     allow_double_writes: bool = False
     """
     If True, don't check if every field is only written at a single location. This is required
     for example for kernels that are compiled with loop step sizes > 1, that handle multiple
     cells at once. Use with care!
     """
     skip_independence_check: bool = False
     """
     Don't check that loop iterations are independent. This is needed e.g. for
     periodicity kernel, that access the field outside the iteration bounds. Use with care!
     """
......
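The first hunk replaces a shared `MappingProxyType` default with a `default_factory`. A minimal sketch of that pattern, assuming the motivation is the stricter dataclass default check in newer Python versions (an unhashable class-level default raises at class definition time); the `Config` class is a stand-in, not pystencils code:

from dataclasses import dataclass, field
from types import MappingProxyType


@dataclass
class Config:
    # stand-in for CreateKernelConfig.gpu_indexing_params: the proxy is built
    # per instance via default_factory instead of being shared as a class default
    gpu_indexing_params: MappingProxyType = field(default_factory=lambda: MappingProxyType({}))


cfg = Config()
assert cfg.gpu_indexing_params == {}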
@@ -2,17 +2,19 @@ import numpy as np
 from pystencils.backends.cbackend import get_headers
 from pystencils.backends.cuda_backend import generate_cuda
-from pystencils.typing import StructType
 from pystencils.field import FieldType
 from pystencils.include import get_pycuda_include_path, get_pystencils_include_path
 from pystencils.kernel_wrapper import KernelWrapper
+from pystencils.typing import StructType
 from pystencils.typing.typed_sympy import FieldPointerSymbol
+from pystencils.utils import DotDict

 USE_FAST_MATH = True
+USE_PYCUDA = True


 def get_cubic_interpolation_include_paths():
-    from os.path import join, dirname
+    from os.path import dirname, join

     return [join(dirname(__file__), "CubicInterpolationCUDA", "code"),
             join(dirname(__file__), "CubicInterpolationCUDA", "code", "internal")]
@@ -32,9 +34,6 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
     Returns:
         compiled kernel as Python function
     """
-    import pycuda.autoinit  # NOQA
-    from pycuda.compiler import SourceModule
-
     if argument_dict is None:
         argument_dict = {}
@@ -42,17 +41,25 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
     includes = "\n".join([f"#include {include_file}" for include_file in header_list])
     code = includes + "\n"
-    code += "#define FUNC_PREFIX __global__\n"
+    code += "#define FUNC_PREFIX extern \"C\" __global__\n"
     code += "#define RESTRICT __restrict__\n\n"
     code += str(generate_cuda(kernel_function_node, custom_backend=custom_backend))
-    nvcc_options = ["-w", "-std=c++11", "-Wno-deprecated-gpu-targets"]
+    nvcc_options = ["-w", "-std=c++11"]
     if USE_FAST_MATH:
         nvcc_options.append("-use_fast_math")
-    mod = SourceModule(code, options=nvcc_options, include_dirs=[
-        get_pystencils_include_path(), get_pycuda_include_path()])
-    func = mod.get_function(kernel_function_node.function_name)
+    if USE_PYCUDA:
+        import pycuda.autoinit  # NOQA
+        from pycuda.compiler import SourceModule
+        nvcc_options.append("-Wno-deprecated-gpu-targets")
+        mod = SourceModule(code, options=nvcc_options, include_dirs=[
+            get_pystencils_include_path(), get_pycuda_include_path()])
+        func = mod.get_function(kernel_function_node.function_name)
+    else:
+        import cupy
+        nvcc_options.append("-I" + get_pystencils_include_path())
+        func = cupy.RawKernel(code, kernel_function_node.function_name, options=tuple(nvcc_options), jitify=True)

     parameters = kernel_function_node.get_parameters()
@@ -78,13 +85,19 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
             args = _build_numpy_argument_list(parameters, full_arguments)
             cache[key] = (args, block_and_thread_numbers)
             cache_values.append(kwargs)  # keep objects alive such that ids remain unique
-            func(*args, **block_and_thread_numbers)
+            if USE_PYCUDA:
+                func(*args, **block_and_thread_numbers)
+            else:
+                func(block_and_thread_numbers['grid'],
+                     block_and_thread_numbers['block'], [cupy.asarray(a) for a in args])
        # import pycuda.driver as cuda
        # cuda.Context.synchronize() # useful for debugging, to get errors right after kernel was called

     ast = kernel_function_node
     parameters = kernel_function_node.get_parameters()
     wrapper = KernelWrapper(wrapper, parameters, ast)
-    wrapper.num_regs = func.num_regs
+    if USE_PYCUDA:
+        wrapper.num_regs = func.num_regs
     return wrapper
@@ -95,7 +108,15 @@ def _build_numpy_argument_list(parameters, argument_dict):
     for param in parameters:
         if param.is_field_pointer:
             array = argument_dict[param.field_name]
-            actual_type = array.dtype
+            if hasattr(array, "__cuda_array_interface__"):
+                interface = DotDict()
+                interface.shape = array.__cuda_array_interface__["shape"]
+                interface.dtype = np.dtype(array.__cuda_array_interface__["typestr"])
+                interface.strides = array.__cuda_array_interface__["strides"]
+                interface.data = array.__cuda_array_interface__["data"][0]
+            else:
+                interface = array
+            actual_type = interface.dtype
             expected_type = param.fields[0].dtype.numpy_dtype
             if expected_type != actual_type:
                 raise ValueError("Data type mismatch for field '%s'. Expected '%s' got '%s'." %
@@ -136,6 +157,14 @@ def _check_arguments(parameter_specification, argument_dict):
         except KeyError:
             raise KeyError("Missing field parameter for kernel call " + str(symbolic_field))
+        if hasattr(field_arr, "__cuda_array_interface__"):
+            field_interface = DotDict()
+            field_interface.shape = field_arr.__cuda_array_interface__["shape"]
+            field_interface.dtype = np.dtype(field_arr.__cuda_array_interface__["typestr"])
+            field_interface.strides = field_arr.__cuda_array_interface__["strides"]
+            field_interface.data = field_arr.__cuda_array_interface__["data"][0]
+            field_arr = field_interface
+
         if symbolic_field.has_fixed_shape:
             symbolic_field_shape = tuple(int(i) for i in symbolic_field.shape)
             if isinstance(symbolic_field.dtype, StructType):
@@ -147,7 +176,7 @@ def _check_arguments(parameter_specification, argument_dict):
             symbolic_field_strides = tuple(int(i) * field_arr.dtype.itemsize for i in symbolic_field.strides)
             if isinstance(symbolic_field.dtype, StructType):
                 symbolic_field_strides = symbolic_field_strides[:-1]
-            if symbolic_field_strides != field_arr.strides:
+            if field_arr.strides and symbolic_field_strides != field_arr.strides:
                 raise ValueError("Passed array '%s' has strides %s which does not match expected strides %s" %
                                  (symbolic_field.name, str(field_arr.strides), str(symbolic_field_strides)))
......
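The cupy path above compiles the generated source with `cupy.RawKernel` and launches it as `func(grid, block, args)`. A standalone sketch of that launch convention, assuming cupy is installed; the kernel `scale` and its source are made up for illustration and are not the pystencils-generated code:

import cupy

code = r'''
extern "C" __global__ void scale(double *data, double factor, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        data[i] *= factor;
}
'''

kernel = cupy.RawKernel(code, "scale")  # the FUNC_PREFIX define above adds the same extern "C" __global__ prefix

data = cupy.arange(16, dtype=cupy.float64)
kernel((1,), (32,), (data, cupy.float64(2.0), cupy.int32(data.size)))  # grid, block, kernel arguments
assert bool((data == 2 * cupy.arange(16)).all())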
import numpy as np
import pytest
import sympy as sp
from scipy.ndimage import convolve

from pystencils import Assignment, CreateKernelConfig, Field, Target, create_kernel
from pystencils.simp import sympy_cse_on_assignment_list
from pystencils.slicing import add_ghost_layers, remove_ghost_layers


@pytest.mark.parametrize("framework", ("cupy", "torch", "numba"))
def test_cuda_array_interface(framework):
    if framework == "cupy":
        cupy = pytest.importorskip(framework)
    elif framework == "torch":
        torch = pytest.importorskip(framework)
    elif framework == "numba":
        numba = pytest.importorskip(framework)
    elif framework == "pycuda":
        pycuda = pytest.importorskip(framework)
        import pycuda.autoinit  # NOQA
        import pycuda.gpuarray

    size = (40, 55)
    src_arr = np.random.rand(*size)
    src_arr = add_ghost_layers(src_arr)
    dst_arr = np.zeros_like(src_arr)
    src_field = Field.create_from_numpy_array('src', src_arr)
    dst_field = Field.create_from_numpy_array('dst', dst_arr)

    update_rule = Assignment(dst_field[0, 0],
                             (src_field[0, 1] + src_field[0, -1] + src_field[1, 0] + src_field[-1, 0]) / 4)

    config = CreateKernelConfig(target=Target.GPU)
    ast = create_kernel(sympy_cse_on_assignment_list([update_rule]), config=config)
    kernel = ast.compile()

    if framework == "cupy":
        gpu_src_arr = cupy.array(src_arr)
        gpu_dst_arr = cupy.array(dst_arr)
    elif framework == "torch":
        gpu_src_arr = torch.from_numpy(src_arr).cuda()
        gpu_dst_arr = torch.from_numpy(dst_arr).cuda()
    elif framework == "numba":
        gpu_src_arr = numba.cuda.to_device(src_arr)
        gpu_dst_arr = numba.cuda.device_array_like(dst_arr)
    elif framework == "pycuda":
        gpu_src_arr = pycuda.gpuarray.to_gpu(src_arr)
        gpu_dst_arr = pycuda.gpuarray.to_gpu(dst_arr)

    kernel(src=gpu_src_arr, dst=gpu_dst_arr)

    if framework == "cupy":
        dst_arr = gpu_dst_arr.get()
    elif framework == "torch":
        dst_arr = gpu_dst_arr.cpu()
    elif framework == "numba":
        gpu_dst_arr.copy_to_host(dst_arr)
    elif framework == "pycuda":
        gpu_dst_arr.get(dst_arr)

    stencil = np.array([[0, 1, 0], [1, 0, 1], [0, 1, 0]]) / 4.0
    reference = convolve(remove_ghost_layers(src_arr), stencil, mode='constant', cval=0.0)
    reference = add_ghost_layers(reference)
    np.testing.assert_almost_equal(reference, dst_arr)
 import numpy as np
 import pycuda.autoinit
 import pycuda.gpuarray as gpuarray
+import pytest
 import sympy as sp
 from scipy.ndimage import convolve

-from pystencils import Assignment, Field, fields, CreateKernelConfig, create_kernel, Target
+from pystencils import Assignment, CreateKernelConfig, Field, Target, create_kernel, fields
 from pystencils.gpucuda import BlockIndexing
 from pystencils.simp import sympy_cse_on_assignment_list
 from pystencils.slicing import add_ghost_layers, make_slice, remove_ghost_layers


-def test_averaging_kernel():
+@pytest.mark.parametrize("framework", ("cupy", "torch", "numba", "pycuda"))
+def test_averaging_kernel(framework):
+    if framework == "cupy":
+        cupy = pytest.importorskip(framework)
+    elif framework == "torch":
+        torch = pytest.importorskip(framework)
+    elif framework == "numba":
+        numba = pytest.importorskip(framework)
+    elif framework == "pycuda":
+        pycuda = pytest.importorskip(framework)
+        import pycuda.autoinit
+        import pycuda.gpuarray
     size = (40, 55)
     src_arr = np.random.rand(*size)
     src_arr = add_ghost_layers(src_arr)
@@ -25,10 +37,29 @@ def test_averaging_kernel():
     ast = create_kernel(sympy_cse_on_assignment_list([update_rule]), config=config)
     kernel = ast.compile()

-    gpu_src_arr = gpuarray.to_gpu(src_arr)
-    gpu_dst_arr = gpuarray.to_gpu(dst_arr)
+    if framework == "cupy":
+        gpu_src_arr = cupy.array(src_arr)
+        gpu_dst_arr = cupy.array(dst_arr)
+    elif framework == "torch":
+        gpu_src_arr = torch.from_numpy(src_arr).cuda()
+        gpu_dst_arr = torch.from_numpy(dst_arr).cuda()
+    elif framework == "numba":
+        gpu_src_arr = numba.cuda.to_device(src_arr)
+        gpu_dst_arr = numba.cuda.device_array_like(dst_arr)
+    elif framework == "pycuda":
+        gpu_src_arr = pycuda.gpuarray.to_gpu(src_arr)
+        gpu_dst_arr = pycuda.gpuarray.to_gpu(dst_arr)
     kernel(src=gpu_src_arr, dst=gpu_dst_arr)
-    gpu_dst_arr.get(dst_arr)
+    if framework == "cupy":
+        dst_arr = gpu_dst_arr.get()
+    elif framework == "torch":
+        dst_arr = gpu_dst_arr.cpu()
+    elif framework == "numba":
+        gpu_dst_arr.copy_to_host(dst_arr)
+    elif framework == "pycuda":
+        gpu_dst_arr.get(dst_arr)

     stencil = np.array([[0, 1, 0], [1, 0, 1], [0, 1, 0]]) / 4.0
     reference = convolve(remove_ghost_layers(src_arr), stencil, mode='constant', cval=0.0)
......