diff --git a/pystencils/config.py b/pystencils/config.py
index 7c2be7a66f3d9d9ec0f3c4e6f4a406bbea471caa..23570d625dc96257ad3e6c420f5e1a54467479cb 100644
--- a/pystencils/config.py
+++ b/pystencils/config.py
@@ -82,7 +82,7 @@ class CreateKernelConfig:
     """
     Either 'block' or 'line' , or custom indexing class, see `pystencils.gpucuda.AbstractIndexing`
     """
-    gpu_indexing_params: MappingProxyType = field(default=MappingProxyType({}))
+    gpu_indexing_params: MappingProxyType = field(default_factory=lambda: MappingProxyType({}))
     """
     Dict with indexing parameters (constructor parameters of indexing class)
     e.g. for 'block' one can specify '{'block_size': (20, 20, 10) }'.
@@ -121,12 +121,12 @@ class CreateKernelConfig:
     allow_double_writes: bool = False
     """
     If True, don't check if every field is only written at a single location. This is required
-    for example for kernels that are compiled with loop step sizes > 1, that handle multiple 
+    for example for kernels that are compiled with loop step sizes > 1, that handle multiple
     cells at once. Use with care!
     """
     skip_independence_check: bool = False
     """
-    Don't check that loop iterations are independent. This is needed e.g. for 
+    Don't check that loop iterations are independent. This is needed e.g. for
     periodicity kernel, that access the field outside the iteration bounds. Use with care!
     """

diff --git a/pystencils/gpucuda/cudajit.py b/pystencils/gpucuda/cudajit.py
index b6fb901750895b341d44fde26040ff3b91d0e9e9..378f338ade4f0e5ea35dce84e8fdd70b44123059 100644
--- a/pystencils/gpucuda/cudajit.py
+++ b/pystencils/gpucuda/cudajit.py
@@ -2,17 +2,19 @@ import numpy as np

 from pystencils.backends.cbackend import get_headers
 from pystencils.backends.cuda_backend import generate_cuda
-from pystencils.typing import StructType
 from pystencils.field import FieldType
 from pystencils.include import get_pycuda_include_path, get_pystencils_include_path
 from pystencils.kernel_wrapper import KernelWrapper
+from pystencils.typing import StructType
 from pystencils.typing.typed_sympy import FieldPointerSymbol
+from pystencils.utils import DotDict

 USE_FAST_MATH = True
+USE_PYCUDA = True


 def get_cubic_interpolation_include_paths():
-    from os.path import join, dirname
+    from os.path import dirname, join

     return [join(dirname(__file__), "CubicInterpolationCUDA", "code"),
             join(dirname(__file__), "CubicInterpolationCUDA", "code", "internal")]
@@ -32,9 +34,6 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
     Returns:
         compiled kernel as Python function
     """
-    import pycuda.autoinit  # NOQA
-    from pycuda.compiler import SourceModule
-
     if argument_dict is None:
         argument_dict = {}

@@ -42,17 +41,25 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
     includes = "\n".join([f"#include {include_file}" for include_file in header_list])

     code = includes + "\n"
-    code += "#define FUNC_PREFIX __global__\n"
+    code += "#define FUNC_PREFIX extern \"C\" __global__\n"
     code += "#define RESTRICT __restrict__\n\n"
     code += str(generate_cuda(kernel_function_node, custom_backend=custom_backend))

-    nvcc_options = ["-w", "-std=c++11", "-Wno-deprecated-gpu-targets"]
+    nvcc_options = ["-w", "-std=c++11"]
     if USE_FAST_MATH:
         nvcc_options.append("-use_fast_math")

-    mod = SourceModule(code, options=nvcc_options, include_dirs=[
-        get_pystencils_include_path(), get_pycuda_include_path()])
-    func = mod.get_function(kernel_function_node.function_name)
+    if USE_PYCUDA:
+        import pycuda.autoinit  # NOQA
+        from pycuda.compiler import SourceModule
nvcc_options.append("-Wno-deprecated-gpu-targets") + mod = SourceModule(code, options=nvcc_options, include_dirs=[ + get_pystencils_include_path(), get_pycuda_include_path()]) + func = mod.get_function(kernel_function_node.function_name) + else: + import cupy + nvcc_options.append("-I" + get_pystencils_include_path()) + func = cupy.RawKernel(code, kernel_function_node.function_name, options=tuple(nvcc_options), jitify=True) parameters = kernel_function_node.get_parameters() @@ -78,13 +85,19 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen args = _build_numpy_argument_list(parameters, full_arguments) cache[key] = (args, block_and_thread_numbers) cache_values.append(kwargs) # keep objects alive such that ids remain unique - func(*args, **block_and_thread_numbers) + if USE_PYCUDA: + func(*args, **block_and_thread_numbers) + else: + func(block_and_thread_numbers['grid'], + block_and_thread_numbers['block'], [cupy.asarray(a) for a in args]) + # import pycuda.driver as cuda # cuda.Context.synchronize() # useful for debugging, to get errors right after kernel was called ast = kernel_function_node parameters = kernel_function_node.get_parameters() wrapper = KernelWrapper(wrapper, parameters, ast) - wrapper.num_regs = func.num_regs + if USE_PYCUDA: + wrapper.num_regs = func.num_regs return wrapper @@ -95,7 +108,16 @@ def _build_numpy_argument_list(parameters, argument_dict): for param in parameters: if param.is_field_pointer: array = argument_dict[param.field_name] - actual_type = array.dtype + if not all(hasattr(array, attr) for attr in ["shape", "stride", "data", "dtype"]) or not isinstance( + array.dtype, np.dtype): + interface = DotDict() + interface.shape = array.__cuda_array_interface__["shape"] + interface.dtype = np.dtype(array.__cuda_array_interface__["typestr"]) + interface.strides = array.__cuda_array_interface__["strides"] + interface.data = array.__cuda_array_interface__["data"][0] + else: + interface = array + actual_type = interface.dtype expected_type = param.fields[0].dtype.numpy_dtype if expected_type != actual_type: raise ValueError("Data type mismatch for field '%s'. Expected '%s' got '%s'." 
@@ -136,6 +158,16 @@ def _check_arguments(parameter_specification, argument_dict):
             except KeyError:
                 raise KeyError("Missing field parameter for kernel call " + str(symbolic_field))

+            if not all(hasattr(field_arr, attr) for attr in ["shape", "strides", "data", "dtype"]) or not isinstance(
+                    field_arr.dtype, np.dtype):
+                if hasattr(field_arr, "__cuda_array_interface__"):
+                    field_interface = DotDict()
+                    field_interface.shape = field_arr.__cuda_array_interface__["shape"]
+                    field_interface.dtype = np.dtype(field_arr.__cuda_array_interface__["typestr"])
+                    field_interface.strides = field_arr.__cuda_array_interface__["strides"]
+                    field_interface.data = field_arr.__cuda_array_interface__["data"][0]
+                    field_arr = field_interface
+
             if symbolic_field.has_fixed_shape:
                 symbolic_field_shape = tuple(int(i) for i in symbolic_field.shape)
                 if isinstance(symbolic_field.dtype, StructType):
@@ -147,7 +179,7 @@ def _check_arguments(parameter_specification, argument_dict):
             symbolic_field_strides = tuple(int(i) * field_arr.dtype.itemsize for i in symbolic_field.strides)
             if isinstance(symbolic_field.dtype, StructType):
                 symbolic_field_strides = symbolic_field_strides[:-1]
-            if symbolic_field_strides != field_arr.strides:
+            if field_arr.strides and symbolic_field_strides != field_arr.strides:
                 raise ValueError("Passed array '%s' has strides %s which does not match expected strides %s"
                                  % (symbolic_field.name, str(field_arr.strides), str(symbolic_field_strides)))

diff --git a/pystencils_tests/test_cudagpu.py b/pystencils_tests/test_cudagpu.py
index a65a08ba6d24b30002822e9916b2d3d44639d26a..8dac067b29b8573fc8c9a4e03a83cde81b96ce8d 100644
--- a/pystencils_tests/test_cudagpu.py
+++ b/pystencils_tests/test_cudagpu.py
@@ -1,16 +1,28 @@
 import numpy as np
 import pycuda.autoinit
 import pycuda.gpuarray as gpuarray
+import pytest
 import sympy as sp
 from scipy.ndimage import convolve

-from pystencils import Assignment, Field, fields, CreateKernelConfig, create_kernel, Target
+from pystencils import Assignment, CreateKernelConfig, Field, Target, create_kernel, fields
 from pystencils.gpucuda import BlockIndexing
 from pystencils.simp import sympy_cse_on_assignment_list
 from pystencils.slicing import add_ghost_layers, make_slice, remove_ghost_layers


-def test_averaging_kernel():
+@pytest.mark.parametrize("framework", ("cupy", "torch", "numba", "pycuda"))
+def test_averaging_kernel(framework):
+    if framework == "cupy":
+        cupy = pytest.importorskip(framework)
+    elif framework == "torch":
+        torch = pytest.importorskip(framework)
+    elif framework == "numba":
+        numba = pytest.importorskip(framework)
+    elif framework == "pycuda":
+        pycuda = pytest.importorskip(framework)
+        import pycuda.autoinit
+        import pycuda.gpuarray
     size = (40, 55)
     src_arr = np.random.rand(*size)
     src_arr = add_ghost_layers(src_arr)
@@ -25,10 +37,29 @@ def test_averaging_kernel():
     ast = create_kernel(sympy_cse_on_assignment_list([update_rule]), config=config)
     kernel = ast.compile()

-    gpu_src_arr = gpuarray.to_gpu(src_arr)
-    gpu_dst_arr = gpuarray.to_gpu(dst_arr)
+    if framework == "cupy":
+        gpu_src_arr = cupy.array(src_arr)
+        gpu_dst_arr = cupy.array(dst_arr)
+    elif framework == "torch":
+        gpu_src_arr = torch.from_numpy(src_arr).cuda()
+        gpu_dst_arr = torch.from_numpy(dst_arr).cuda()
+    elif framework == "numba":
+        gpu_src_arr = numba.cuda.to_device(src_arr)
+        gpu_dst_arr = numba.cuda.to_device(dst_arr)
+    elif framework == "pycuda":
+        gpu_src_arr = pycuda.gpuarray.to_gpu(src_arr)
+        gpu_dst_arr = pycuda.gpuarray.to_gpu(dst_arr)
+
     kernel(src=gpu_src_arr, dst=gpu_dst_arr)
-    gpu_dst_arr.get(dst_arr)
+
+    if framework == "cupy":
+        dst_arr = gpu_dst_arr.get()
+    elif framework == "torch":
+        dst_arr = gpu_dst_arr.cpu()
+    elif framework == "numba":
+        gpu_dst_arr.copy_to_host(dst_arr)
+    elif framework == "pycuda":
+        gpu_dst_arr.get(dst_arr)

     stencil = np.array([[0, 1, 0], [1, 0, 1], [0, 1, 0]]) / 4.0
     reference = convolve(remove_ghost_layers(src_arr), stencil, mode='constant', cval=0.0)
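
Why the FUNC_PREFIX change: cupy.RawKernel looks the kernel up in the compiled module by its plain symbol name, so the kernel needs C linkage; pycuda's SourceModule wraps the whole source in extern "C" itself by default, which is why the bare __global__ prefix was sufficient before. Below is a minimal standalone sketch of the RawKernel calling convention used by the wrapper above; the kernel and its name "scale" are made up for illustration and are not part of the patch.

import cupy
import numpy as np

# Without extern "C", the C++-mangled name would not match the lookup below.
code = r'''
extern "C" __global__ void scale(double *data, double factor, int n)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n) {
        data[i] *= factor;
    }
}
'''

scale_kernel = cupy.RawKernel(code, "scale")
x = cupy.arange(8, dtype=cupy.float64)
# RawKernel is invoked as func(grid, block, args), the same convention the
# wrapper uses with block_and_thread_numbers['grid'] / ['block'].
scale_kernel((1,), (8,), (x, np.float64(2.0), np.int32(8)))
print(x)  # [ 0.  2.  4.  6.  8. 10. 12. 14.]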
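The DotDict shim added to _build_numpy_argument_list and _check_arguments is what lets a single wrapper accept cupy, torch, numba and pycuda arrays alike: any object that does not already expose numpy-style metadata is re-described through its __cuda_array_interface__. A self-contained sketch of that normalization follows; the DotDict stub and the name normalize_gpu_array are illustrative only (pystencils imports its own DotDict from pystencils.utils).

import numpy as np


class DotDict(dict):
    # Stub: a dict whose entries are also reachable as attributes.
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__


def normalize_gpu_array(array):
    # Return an object carrying numpy-style shape/strides/dtype/data.
    numpy_like = (all(hasattr(array, attr) for attr in ("shape", "strides", "data", "dtype"))
                  and isinstance(array.dtype, np.dtype))
    if numpy_like:
        return array  # e.g. numpy and cupy arrays already look right
    cai = array.__cuda_array_interface__  # torch tensors, numba device arrays, ...
    interface = DotDict()
    interface.shape = cai["shape"]
    interface.dtype = np.dtype(cai["typestr"])
    interface.strides = cai["strides"]  # None for C-contiguous data, hence the
                                        # "field_arr.strides and ..." guard above
    interface.data = cai["data"][0]     # integer device pointer
    return interface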
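End to end, the cupy path can be exercised by flipping the module-level USE_PYCUDA switch before compiling. A hedged sketch, assuming a CUDA device, an installed cupy, and the patched module above; the averaging stencil mirrors the test.

import cupy
import pystencils as ps
from pystencils.gpucuda import cudajit

cudajit.USE_PYCUDA = False  # select the cupy/RawKernel branch added above

src, dst = ps.fields("src, dst: double[2D]")
update = ps.Assignment(dst.center,
                       (src[1, 0] + src[-1, 0] + src[0, 1] + src[0, -1]) / 4)
config = ps.CreateKernelConfig(target=ps.Target.GPU)
kernel = ps.create_kernel(update, config=config).compile()

gpu_src = cupy.random.rand(40, 55)   # device arrays pass straight through
gpu_dst = cupy.zeros_like(gpu_src)
kernel(src=gpu_src, dst=gpu_dst)
print(gpu_dst.get().mean())          # copy the result back as a numpy array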