Skip to content
Snippets Groups Projects
Commit f340af58 authored by Frederik Hennig's avatar Frederik Hennig
Browse files

started implementing cupy JIT

parent 356a8343
No related branches found
No related tags found
1 merge request!384Fundamental GPU Support
Pipeline #67192 failed
......@@ -19,3 +19,6 @@ ignore_missing_imports=true
[mypy-islpy.*]
ignore_missing_imports=true
[mypy-cupy.*]
ignore_missing_imports=true
......@@ -697,3 +697,33 @@ class PsArrayInitList(PsExpression):
def __repr__(self) -> str:
    """Return a debug representation including the wrapped item list."""
    return "PsArrayInitList({})".format(repr(self._items))
def evaluate_expression(
expr: PsExpression, valuation: dict[str, Any]
) -> Any:
"""Evaluate a pystencils backend expression tree with values assigned to symbols according to the given valuation.
Only a subset of expression nodes can be processed by this evaluator.
"""
def visit(node):
match node:
case PsSymbolExpr(symb):
return valuation[symb.name]
case PsConstantExpr(c):
return c.value
case PsUnOp(op1) if node.python_operator is not None:
return node.python_operator(visit(op1))
case PsBinOp(op1, op2) if node.python_operator is not None:
return node.python_operator(visit(op1), visit(op2))
case other:
raise NotImplementedError(
f"Unable to evaluate {other}: No implementation available."
)
return visit(expr)
from typing import Callable, Any
from dataclasses import dataclass
import cupy as cp
from ...enums import Target
from ...types import PsType
from .jit import JitBase, JitError
from ..kernelfunction import (
KernelFunction,
GpuKernelFunction,
FieldPointerParam,
FieldShapeParam,
FieldStrideParam,
KernelParameter,
)
from ..emission import emit_code
from ...include import get_pystencils_include_path
@dataclass
class LaunchGrid:
    """GPU kernel launch configuration."""

    # Number of blocks in each dimension (x, y, z)
    grid: tuple[int, int, int]
    # Number of threads per block in each dimension (x, y, z)
    block: tuple[int, int, int]
class CupyKernelWrapper:
def __init__(
self,
kfunc: GpuKernelFunction,
raw_kernel: Any,
block_size: tuple[int, int, int],
):
self._kfunc = kfunc
self._kernel = raw_kernel
self._block_size = block_size
def __call__(self, **kwargs: Any) -> Any:
kernel_args, launch_grid = self._get_args(**kwargs)
device = self._get_device(kernel_args)
with cp.cuda.device(device):
self._kernel(launch_grid.grid, launch_grid.block, kernel_args)
def _get_device(self, kernel_args):
devices = set(a.device.id for a in kernel_args if type(a) is cp.ndarray)
if len(devices) != 1:
raise JitError("Could not determine CUDA device to execute on")
return devices.pop()
def _get_args(self, **kwargs) -> tuple[tuple, LaunchGrid]:
args = []
valuation: dict[str, Any] = dict()
def add_arg(name: str, arg: Any, dtype: PsType):
nptype = dtype.numpy_dtype
assert nptype is not None
typecast = nptype.type
arg = typecast(arg)
args.append(arg)
valuation[name] = arg
# Collect parameter values
arr: cp.ndarray
for kparam in self._kfunc.parameters:
match kparam:
case FieldPointerParam(_, dtype, field):
arr = kwargs[field.name]
if arr.dtype != field.dtype.numpy_dtype:
raise JitError(
f"Data type mismatch at array argument {field.name}:"
f"Expected {field.dtype}, got {arr.dtype}"
)
args.append(arr)
case FieldShapeParam(name, dtype, field, coord):
arr = kwargs[field.name]
add_arg(name, arr.shape[coord], dtype)
case FieldStrideParam(name, dtype, field, coord):
arr = kwargs[field.name]
add_arg(name, arr.strides[coord], dtype)
case KernelParameter(name, dtype):
val: Any = kwargs[name]
add_arg(name, val, dtype)
# Determine launch grid
from ..ast.expressions import evaluate_expression
symbolic_threads_range = self._kfunc.threads_range
threads_range: list[int] = [
evaluate_expression(expr, valuation)
for expr in symbolic_threads_range.num_work_items
]
if symbolic_threads_range.dim < 3:
threads_range += [1] * (3 - symbolic_threads_range.dim)
def div_ceil(a, b):
return a // b if a % b == 0 else a // b + 1
# TODO: Refine this?
grid_size = tuple(
div_ceil(threads, tpb)
for threads, tpb in zip(threads_range, self._block_size)
)
assert len(grid_size) == 3
launch_grid = LaunchGrid(grid_size, self._block_size)
return tuple(args), launch_grid
class CupyJit(JitBase):
    """Just-in-time compiler executing GPU kernels through cupy's ``RawKernel`` API."""

    def __init__(self, default_block_size: tuple[int, int, int] = (128, 2, 1)):
        # TODO: Fp16 headers
        self._runtime_headers = {"<cstdint>", '"gpu_defines.h"'}
        self._default_block_size = default_block_size

    def compile(self, kfunc: KernelFunction) -> Callable[..., None]:
        """Compile the kernel with nvrtc and return a callable wrapper.

        Raises:
            ValueError: If *kfunc* was not generated for the CUDA target.
        """
        # Note: `cp` is already imported at module level; the previous
        # redundant local `import cupy as cp` was removed.
        if not isinstance(kfunc, GpuKernelFunction) or kfunc.target != Target.CUDA:
            raise ValueError(
                "The CupyJit just-in-time compiler only accepts kernels generated for CUDA or HIP"
            )

        options = self._compiler_options()
        prelude = self._prelude(kfunc)
        kernel_code = self._kernel_code(kfunc)
        code = prelude + kernel_code

        raw_kernel = cp.RawKernel(
            code, kfunc.name, options=options, backend="nvrtc", jitify=True
        )
        return CupyKernelWrapper(kfunc, raw_kernel, self._default_block_size)

    def _compiler_options(self) -> tuple[str, ...]:
        """Assemble nvrtc compiler flags, including the pystencils include path."""
        options = ["-w", "-std=c++11"]
        options.append("-I" + get_pystencils_include_path())
        return tuple(options)

    def _prelude(self, kfunc: GpuKernelFunction) -> str:
        """Emit the ``#include`` lines and common defines required by the kernel.

        Fix: use a non-mutating union here. The previous ``headers |= ...`` on
        the attribute reference mutated ``self._runtime_headers`` in place, so
        headers from one kernel leaked into every subsequent ``compile`` call.
        """
        headers = self._runtime_headers | kfunc.required_headers
        code = "\n".join(f"#include {header}" for header in headers)
        code += "\n\n#define RESTRICT __restrict__\n\n"
        return code

    def _kernel_code(self, kfunc: GpuKernelFunction) -> str:
        """Wrap the emitted kernel source in ``extern "C"`` to prevent name mangling."""
        kernel_code = emit_code(kfunc)
        return f'extern "C" {{\n{kernel_code}\n}}\n'
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment