Commit 902cab8f authored by Stephan Seitz

feat: implement `__cuda_array_interface__`

https://numba.readthedocs.io/en/stable/cuda/cuda_array_interface.html

Also allow executing with cupy (https://docs.cupy.dev/en/stable/index.html)
instead of pycuda.

TODO:
- [ ] check that pointers are in the correct CUDA context and, if not, import them into the current one
- [ ] make execution with pycuda aware of `__cuda_array_interface__`
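For context: `__cuda_array_interface__` is a dict describing a device buffer; the keys this commit unpacks are shape, typestr, strides and data. A hedged illustration of what a producing library exposes, assuming cupy is installed (this snippet is not part of the commit):

import cupy

arr = cupy.zeros((4, 8), dtype=cupy.float64)
iface = arr.__cuda_array_interface__
print(iface["shape"])    # (4, 8)
print(iface["typestr"])  # '<f8' (little-endian float64)
print(iface["strides"])  # None for C-contiguous arrays, else a byte-stride tuple
print(iface["data"][0])  # integer device pointer; iface["data"][1] is the read-only flag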
parent 262446d1
Pipeline #51352 failed
@@ -82,7 +82,7 @@ class CreateKernelConfig:
     """
     Either 'block' or 'line' , or custom indexing class, see `pystencils.gpucuda.AbstractIndexing`
     """
-    gpu_indexing_params: MappingProxyType = field(default=MappingProxyType({}))
+    gpu_indexing_params: MappingProxyType = field(default_factory=lambda: MappingProxyType({}))
     """
     Dict with indexing parameters (constructor parameters of indexing class)
     e.g. for 'block' one can specify '{'block_size': (20, 20, 10) }'.
@@ -121,12 +121,12 @@ class CreateKernelConfig:
     allow_double_writes: bool = False
     """
     If True, don't check if every field is only written at a single location. This is required
     for example for kernels that are compiled with loop step sizes > 1, that handle multiple
     cells at once. Use with care!
     """
     skip_independence_check: bool = False
     """
     Don't check that loop iterations are independent. This is needed e.g. for
     periodicity kernel, that access the field outside the iteration bounds. Use with care!
     """
......
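The first hunk replaces a shared `MappingProxyType` default with a `default_factory`. A minimal sketch of that pattern, assuming the motivation is the stricter dataclass default check in newer Python versions (an unhashable class-level default raises at class definition time); the `Config` class is a stand-in, not pystencils code:

from dataclasses import dataclass, field
from types import MappingProxyType


@dataclass
class Config:
    # stand-in for CreateKernelConfig.gpu_indexing_params: the proxy is built
    # per instance via default_factory instead of being shared as a class default
    gpu_indexing_params: MappingProxyType = field(default_factory=lambda: MappingProxyType({}))


cfg = Config()
assert cfg.gpu_indexing_params == {}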
@@ -2,17 +2,19 @@ import numpy as np
 from pystencils.backends.cbackend import get_headers
 from pystencils.backends.cuda_backend import generate_cuda
-from pystencils.typing import StructType
 from pystencils.field import FieldType
 from pystencils.include import get_pycuda_include_path, get_pystencils_include_path
 from pystencils.kernel_wrapper import KernelWrapper
+from pystencils.typing import StructType
 from pystencils.typing.typed_sympy import FieldPointerSymbol
+from pystencils.utils import DotDict

 USE_FAST_MATH = True
+USE_PYCUDA = True


 def get_cubic_interpolation_include_paths():
-    from os.path import join, dirname
+    from os.path import dirname, join

     return [join(dirname(__file__), "CubicInterpolationCUDA", "code"),
             join(dirname(__file__), "CubicInterpolationCUDA", "code", "internal")]
@@ -32,9 +34,6 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
     Returns:
         compiled kernel as Python function
     """
-    import pycuda.autoinit  # NOQA
-    from pycuda.compiler import SourceModule
-
     if argument_dict is None:
         argument_dict = {}
@@ -42,17 +41,25 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
     includes = "\n".join([f"#include {include_file}" for include_file in header_list])
     code = includes + "\n"
-    code += "#define FUNC_PREFIX __global__\n"
+    code += "#define FUNC_PREFIX extern \"C\" __global__\n"
     code += "#define RESTRICT __restrict__\n\n"
     code += str(generate_cuda(kernel_function_node, custom_backend=custom_backend))
-    nvcc_options = ["-w", "-std=c++11", "-Wno-deprecated-gpu-targets"]
+    nvcc_options = ["-w", "-std=c++11"]
     if USE_FAST_MATH:
         nvcc_options.append("-use_fast_math")
-    mod = SourceModule(code, options=nvcc_options, include_dirs=[
-        get_pystencils_include_path(), get_pycuda_include_path()])
-    func = mod.get_function(kernel_function_node.function_name)
+    if USE_PYCUDA:
+        import pycuda.autoinit  # NOQA
+        from pycuda.compiler import SourceModule
+        nvcc_options.append("-Wno-deprecated-gpu-targets")
+        mod = SourceModule(code, options=nvcc_options, include_dirs=[
+            get_pystencils_include_path(), get_pycuda_include_path()])
+        func = mod.get_function(kernel_function_node.function_name)
+    else:
+        import cupy
+        nvcc_options.append("-I" + get_pystencils_include_path())
+        func = cupy.RawKernel(code, kernel_function_node.function_name, options=tuple(nvcc_options), jitify=True)

     parameters = kernel_function_node.get_parameters()
@@ -78,13 +85,19 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
             args = _build_numpy_argument_list(parameters, full_arguments)
             cache[key] = (args, block_and_thread_numbers)
             cache_values.append(kwargs)  # keep objects alive such that ids remain unique
-            func(*args, **block_and_thread_numbers)
+            if USE_PYCUDA:
+                func(*args, **block_and_thread_numbers)
+            else:
+                func(block_and_thread_numbers['grid'],
+                     block_and_thread_numbers['block'], [cupy.asarray(a) for a in args])
        # import pycuda.driver as cuda
        # cuda.Context.synchronize() # useful for debugging, to get errors right after kernel was called

     ast = kernel_function_node
     parameters = kernel_function_node.get_parameters()
     wrapper = KernelWrapper(wrapper, parameters, ast)
-    wrapper.num_regs = func.num_regs
+    if USE_PYCUDA:
+        wrapper.num_regs = func.num_regs
     return wrapper
@@ -95,7 +108,15 @@ def _build_numpy_argument_list(parameters, argument_dict):
     for param in parameters:
         if param.is_field_pointer:
             array = argument_dict[param.field_name]
-            actual_type = array.dtype
+            if hasattr(array, "__cuda_array_interface__"):
+                interface = DotDict()
+                interface.shape = array.__cuda_array_interface__["shape"]
+                interface.dtype = np.dtype(array.__cuda_array_interface__["typestr"])
+                interface.strides = array.__cuda_array_interface__["strides"]
+                interface.data = array.__cuda_array_interface__["data"][0]
+            else:
+                interface = array
+            actual_type = interface.dtype
             expected_type = param.fields[0].dtype.numpy_dtype
             if expected_type != actual_type:
                 raise ValueError("Data type mismatch for field '%s'. Expected '%s' got '%s'." %
@@ -136,6 +157,14 @@ def _check_arguments(parameter_specification, argument_dict):
         except KeyError:
             raise KeyError("Missing field parameter for kernel call " + str(symbolic_field))
+        if hasattr(field_arr, "__cuda_array_interface__"):
+            field_interface = DotDict()
+            field_interface.shape = field_arr.__cuda_array_interface__["shape"]
+            field_interface.dtype = np.dtype(field_arr.__cuda_array_interface__["typestr"])
+            field_interface.strides = field_arr.__cuda_array_interface__["strides"]
+            field_interface.data = field_arr.__cuda_array_interface__["data"][0]
+            field_arr = field_interface
+
         if symbolic_field.has_fixed_shape:
             symbolic_field_shape = tuple(int(i) for i in symbolic_field.shape)
             if isinstance(symbolic_field.dtype, StructType):
@@ -147,7 +176,7 @@ def _check_arguments(parameter_specification, argument_dict):
             symbolic_field_strides = tuple(int(i) * field_arr.dtype.itemsize for i in symbolic_field.strides)
             if isinstance(symbolic_field.dtype, StructType):
                 symbolic_field_strides = symbolic_field_strides[:-1]
-            if symbolic_field_strides != field_arr.strides:
+            if field_arr.strides and symbolic_field_strides != field_arr.strides:
                 raise ValueError("Passed array '%s' has strides %s which does not match expected strides %s" %
                                  (symbolic_field.name, str(field_arr.strides), str(symbolic_field_strides)))
......
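The cupy path above compiles the generated source with `cupy.RawKernel` and launches it as `func(grid, block, args)`. A standalone sketch of that launch convention, assuming cupy is installed; the kernel `scale` and its source are made up for illustration and are not the pystencils-generated code:

import cupy

code = r'''
extern "C" __global__ void scale(double *data, double factor, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        data[i] *= factor;
}
'''

kernel = cupy.RawKernel(code, "scale")  # the FUNC_PREFIX define above adds the same extern "C" __global__ prefix

data = cupy.arange(16, dtype=cupy.float64)
kernel((1,), (32,), (data, cupy.float64(2.0), cupy.int32(data.size)))  # grid, block, kernel arguments
assert bool((data == 2 * cupy.arange(16)).all())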
import numpy as np
import pytest
import sympy as sp
from scipy.ndimage import convolve

from pystencils import Assignment, CreateKernelConfig, Field, Target, create_kernel
from pystencils.simp import sympy_cse_on_assignment_list
from pystencils.slicing import add_ghost_layers, remove_ghost_layers


@pytest.mark.parametrize("framework", ("cupy", "torch", "numba"))
def test_cuda_array_interface(framework):
    if framework == "cupy":
        cupy = pytest.importorskip(framework)
    elif framework == "torch":
        torch = pytest.importorskip(framework)
    elif framework == "numba":
        numba = pytest.importorskip(framework)
    elif framework == "pycuda":
        pycuda = pytest.importorskip(framework)
        import pycuda.autoinit  # NOQA
        import pycuda.gpuarray

    size = (40, 55)
    src_arr = np.random.rand(*size)
    src_arr = add_ghost_layers(src_arr)
    dst_arr = np.zeros_like(src_arr)
    src_field = Field.create_from_numpy_array('src', src_arr)
    dst_field = Field.create_from_numpy_array('dst', dst_arr)

    update_rule = Assignment(dst_field[0, 0],
                             (src_field[0, 1] + src_field[0, -1] + src_field[1, 0] + src_field[-1, 0]) / 4)

    config = CreateKernelConfig(target=Target.GPU)
    ast = create_kernel(sympy_cse_on_assignment_list([update_rule]), config=config)
    kernel = ast.compile()

    if framework == "cupy":
        gpu_src_arr = cupy.array(src_arr)
        gpu_dst_arr = cupy.array(dst_arr)
    elif framework == "torch":
        gpu_src_arr = torch.from_numpy(src_arr).cuda()
        gpu_dst_arr = torch.from_numpy(dst_arr).cuda()
    elif framework == "numba":
        gpu_src_arr = numba.cuda.to_device(src_arr)
        gpu_dst_arr = numba.cuda.device_array_like(dst_arr)
    elif framework == "pycuda":
        gpu_src_arr = pycuda.gpuarray.to_gpu(src_arr)
        gpu_dst_arr = pycuda.gpuarray.to_gpu(dst_arr)

    kernel(src=gpu_src_arr, dst=gpu_dst_arr)

    if framework == "cupy":
        dst_arr = gpu_dst_arr.get()
    elif framework == "torch":
        dst_arr = gpu_dst_arr.cpu()
    elif framework == "numba":
        gpu_dst_arr.copy_to_host(dst_arr)
    elif framework == "pycuda":
        gpu_dst_arr.get(dst_arr)

    stencil = np.array([[0, 1, 0], [1, 0, 1], [0, 1, 0]]) / 4.0
    reference = convolve(remove_ghost_layers(src_arr), stencil, mode='constant', cval=0.0)
    reference = add_ghost_layers(reference)
    np.testing.assert_almost_equal(reference, dst_arr)
 import numpy as np
 import pycuda.autoinit
 import pycuda.gpuarray as gpuarray
+import pytest
 import sympy as sp
 from scipy.ndimage import convolve

-from pystencils import Assignment, Field, fields, CreateKernelConfig, create_kernel, Target
+from pystencils import Assignment, CreateKernelConfig, Field, Target, create_kernel, fields
 from pystencils.gpucuda import BlockIndexing
 from pystencils.simp import sympy_cse_on_assignment_list
 from pystencils.slicing import add_ghost_layers, make_slice, remove_ghost_layers


-def test_averaging_kernel():
+@pytest.mark.parametrize("framework", ("cupy", "torch", "numba", "pycuda"))
+def test_averaging_kernel(framework):
+    if framework == "cupy":
+        cupy = pytest.importorskip(framework)
+    elif framework == "torch":
+        torch = pytest.importorskip(framework)
+    elif framework == "numba":
+        numba = pytest.importorskip(framework)
+    elif framework == "pycuda":
+        pycuda = pytest.importorskip(framework)
+        import pycuda.autoinit
+        import pycuda.gpuarray
     size = (40, 55)
     src_arr = np.random.rand(*size)
     src_arr = add_ghost_layers(src_arr)
@@ -25,10 +37,29 @@ def test_averaging_kernel():
     ast = create_kernel(sympy_cse_on_assignment_list([update_rule]), config=config)
     kernel = ast.compile()

-    gpu_src_arr = gpuarray.to_gpu(src_arr)
-    gpu_dst_arr = gpuarray.to_gpu(dst_arr)
+    if framework == "cupy":
+        gpu_src_arr = cupy.array(src_arr)
+        gpu_dst_arr = cupy.array(dst_arr)
+    elif framework == "torch":
+        gpu_src_arr = torch.from_numpy(src_arr).cuda()
+        gpu_dst_arr = torch.from_numpy(dst_arr).cuda()
+    elif framework == "numba":
+        gpu_src_arr = numba.cuda.to_device(src_arr)
+        gpu_dst_arr = numba.cuda.device_array_like(dst_arr)
+    elif framework == "pycuda":
+        gpu_src_arr = pycuda.gpuarray.to_gpu(src_arr)
+        gpu_dst_arr = pycuda.gpuarray.to_gpu(dst_arr)
     kernel(src=gpu_src_arr, dst=gpu_dst_arr)
-    gpu_dst_arr.get(dst_arr)
+    if framework == "cupy":
+        dst_arr = gpu_dst_arr.get()
+    elif framework == "torch":
+        dst_arr = gpu_dst_arr.cpu()
+    elif framework == "numba":
+        gpu_dst_arr.copy_to_host(dst_arr)
+    elif framework == "pycuda":
+        gpu_dst_arr.get(dst_arr)

     stencil = np.array([[0, 1, 0], [1, 0, 1], [0, 1, 0]]) / 4.0
     reference = convolve(remove_ghost_layers(src_arr), stencil, mode='constant', cval=0.0)
......