diff --git a/mypy.ini b/mypy.ini
index 07228fe24009da6ea4f21cb6cdf15a0516041149..e89adf9f5eaa918753f94ffbf6ba00b1be6e39cd 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -19,3 +19,6 @@ ignore_missing_imports=true
 
 [mypy-islpy.*]
 ignore_missing_imports=true
+
+[mypy-cupy.*]
+ignore_missing_imports=true
diff --git a/src/pystencils/backend/ast/expressions.py b/src/pystencils/backend/ast/expressions.py
index 4063f7b539ab387d1f950a75e735f3c6201b5ef2..18cc8c402083dca727280d79e37aa96c8024d711 100644
--- a/src/pystencils/backend/ast/expressions.py
+++ b/src/pystencils/backend/ast/expressions.py
@@ -697,3 +697,33 @@ class PsArrayInitList(PsExpression):
 
     def __repr__(self) -> str:
         return f"PsArrayInitList({repr(self._items)})"
+
+
+def evaluate_expression(
+    expr: PsExpression, valuation: dict[str, Any]
+) -> Any:
+    """Evaluate a pystencils backend expression tree with values assigned to symbols according to the given valuation.
+
+    Only a subset of expression nodes can be processed by this evaluator.
+    """
+
+    def visit(node):
+        match node:
+            case PsSymbolExpr(symb):
+                return valuation[symb.name]
+
+            case PsConstantExpr(c):
+                return c.value
+
+            case PsUnOp(op1) if node.python_operator is not None:
+                return node.python_operator(visit(op1))
+
+            case PsBinOp(op1, op2) if node.python_operator is not None:
+                return node.python_operator(visit(op1), visit(op2))
+
+            case other:
+                raise NotImplementedError(
+                    f"Unable to evaluate {other}: No implementation available."
+                )
+
+    return visit(expr)
diff --git a/src/pystencils/backend/jit/gpu_cupy.py b/src/pystencils/backend/jit/gpu_cupy.py
new file mode 100644
index 0000000000000000000000000000000000000000..77816175be347d50914440d7737cc572540eb03e
--- /dev/null
+++ b/src/pystencils/backend/jit/gpu_cupy.py
@@ -0,0 +1,160 @@
+from typing import Callable, Any
+from dataclasses import dataclass
+
+import cupy as cp
+
+from ...enums import Target
+
+from ...types import PsType
+from .jit import JitBase, JitError
+from ..kernelfunction import (
+    KernelFunction,
+    GpuKernelFunction,
+    FieldPointerParam,
+    FieldShapeParam,
+    FieldStrideParam,
+    KernelParameter,
+)
+from ..emission import emit_code
+
+from ...include import get_pystencils_include_path
+
+
+@dataclass
+class LaunchGrid:
+    grid: tuple[int, int, int]
+    block: tuple[int, int, int]
+
+
+class CupyKernelWrapper:
+    def __init__(
+        self,
+        kfunc: GpuKernelFunction,
+        raw_kernel: Any,
+        block_size: tuple[int, int, int],
+    ):
+        self._kfunc = kfunc
+        self._kernel = raw_kernel
+        self._block_size = block_size
+
+    def __call__(self, **kwargs: Any) -> Any:
+        kernel_args, launch_grid = self._get_args(**kwargs)
+        device = self._get_device(kernel_args)
+        with cp.cuda.Device(device):
+            self._kernel(launch_grid.grid, launch_grid.block, kernel_args)
+
+    def _get_device(self, kernel_args):
+        devices = set(a.device.id for a in kernel_args if type(a) is cp.ndarray)
+        if len(devices) != 1:
+            raise JitError("Could not determine CUDA device to execute on")
+        return devices.pop()
+
+    def _get_args(self, **kwargs) -> tuple[tuple, LaunchGrid]:
+        args = []
+        valuation: dict[str, Any] = dict()
+
+        def add_arg(name: str, arg: Any, dtype: PsType):
+            nptype = dtype.numpy_dtype
+            assert nptype is not None
+            typecast = nptype.type
+            arg = typecast(arg)
+            args.append(arg)
+            valuation[name] = arg
+
+        # Collect parameter values
+        arr: cp.ndarray
+
+        for kparam in self._kfunc.parameters:
+            match kparam:
+                case FieldPointerParam(_, dtype, field):
+                    arr = kwargs[field.name]
+                    if arr.dtype != field.dtype.numpy_dtype:
+                        raise JitError(
+                            f"Data type mismatch at array argument {field.name}: "
+                            f"Expected {field.dtype}, got {arr.dtype}"
+                        )
+                    args.append(arr)
+
+                case FieldShapeParam(name, dtype, field, coord):
+                    arr = kwargs[field.name]
+                    add_arg(name, arr.shape[coord], dtype)
+
+                case FieldStrideParam(name, dtype, field, coord):
+                    arr = kwargs[field.name]
+                    add_arg(name, arr.strides[coord], dtype)
+
+                case KernelParameter(name, dtype):
+                    val: Any = kwargs[name]
+                    add_arg(name, val, dtype)
+
+        # Determine launch grid
+        from ..ast.expressions import evaluate_expression
+
+        symbolic_threads_range = self._kfunc.threads_range
+
+        threads_range: list[int] = [
+            evaluate_expression(expr, valuation)
+            for expr in symbolic_threads_range.num_work_items
+        ]
+
+        if symbolic_threads_range.dim < 3:
+            threads_range += [1] * (3 - symbolic_threads_range.dim)
+
+        def div_ceil(a, b):
+            return a // b if a % b == 0 else a // b + 1
+
+        # TODO: Refine this?
+        grid_size = tuple(
+            div_ceil(threads, tpb)
+            for threads, tpb in zip(threads_range, self._block_size)
+        )
+        assert len(grid_size) == 3
+
+        launch_grid = LaunchGrid(grid_size, self._block_size)
+
+        return tuple(args), launch_grid
+
+
+class CupyJit(JitBase):
+
+    def __init__(self, default_block_size: tuple[int, int, int] = (128, 2, 1)):
+        # TODO: Fp16 headers
+        self._runtime_headers = {"<cstdint>", '"gpu_defines.h"'}
+        self._default_block_size = default_block_size
+
+    def compile(self, kfunc: KernelFunction) -> Callable[..., None]:
+        import cupy as cp
+
+        if not isinstance(kfunc, GpuKernelFunction) or kfunc.target != Target.CUDA:
+            raise ValueError(
+                "The CupyJit just-in-time compiler only accepts kernels generated for CUDA"
+            )
+
+        options = self._compiler_options()
+        prelude = self._prelude(kfunc)
+        kernel_code = self._kernel_code(kfunc)
+        code = prelude + kernel_code
+
+        raw_kernel = cp.RawKernel(
+            code, kfunc.name, options=options, backend="nvrtc", jitify=True
+        )
+        return CupyKernelWrapper(kfunc, raw_kernel, self._default_block_size)
+
+    def _compiler_options(self) -> tuple[str, ...]:
+        options = ["-w", "-std=c++11"]
+        options.append("-I" + get_pystencils_include_path())
+        return tuple(options)
+
+    def _prelude(self, kfunc: GpuKernelFunction) -> str:
+        headers = set(self._runtime_headers)
+        headers |= kfunc.required_headers
+
+        code = "\n".join(f"#include {header}" for header in headers)
+
+        code += "\n\n#define RESTRICT __restrict__\n\n"
+
+        return code
+
+    def _kernel_code(self, kfunc: GpuKernelFunction) -> str:
+        kernel_code = emit_code(kfunc)
+        return f'extern "C" {{\n{kernel_code}\n}}\n'
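
Note (not part of the patch): the snippet below is a minimal, standalone sketch of the launch-grid computation that CupyKernelWrapper._get_args performs after evaluating the symbolic work-item counts. The block size and thread counts are assumed example values; div_ceil mirrors the helper defined in the patch.

    # Standalone sketch of the grid-size computation from CupyKernelWrapper._get_args.
    # Block size and thread counts below are illustrative assumptions.

    def div_ceil(a: int, b: int) -> int:
        # Integer ceiling division, as used to size the launch grid
        return a // b if a % b == 0 else a // b + 1

    block = (128, 2, 1)                   # CupyJit's default block size
    threads = [1000, 1000]                # e.g. a 2D kernel over a 1000x1000 domain
    threads += [1] * (3 - len(threads))   # pad to three dimensions, as in _get_args

    grid = tuple(div_ceil(t, b) for t, b in zip(threads, block))
    assert grid == (8, 500, 1)            # enough blocks to cover every thread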