From b1b8f513f59d79ea908e6cc2e656bc434c2b1e13 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Sun, 30 Jun 2024 16:37:27 +0200
Subject: [PATCH] first successful tests of cupy JIT; waiting for
 pycodegen/pystencils!393 to continue

---
 src/pystencils/backend/exceptions.py       |  4 ---
 src/pystencils/backend/jit/__init__.py     |  9 ++---
 src/pystencils/backend/jit/gpu_cupy.py     | 36 ++++++++++++++-----
 .../kernelcreation/cpu_optimization.py     |  6 ++--
 src/pystencils/backend/kernelfunction.py   | 10 +++---
 src/pystencils/backend/platforms/cuda.py   |  2 +-
 src/pystencils/config.py                   | 25 +++++++++++--
 .../kernelcreation/test_domain_kernels.py  | 28 +++++++++++----
 8 files changed, 83 insertions(+), 37 deletions(-)

diff --git a/src/pystencils/backend/exceptions.py b/src/pystencils/backend/exceptions.py
index 4c0812249..d42f7c11f 100644
--- a/src/pystencils/backend/exceptions.py
+++ b/src/pystencils/backend/exceptions.py
@@ -5,10 +5,6 @@ class PsInternalCompilerError(Exception):
     """Indicates an internal error during kernel translation, most likely due to a bug inside pystencils."""
 
 
-class PsOptionsError(Exception):
-    """Indicates an option clash in the `CreateKernelConfig`."""
-
-
 class PsInputError(Exception):
     """Indicates unsupported user input to the translation system"""
 
diff --git a/src/pystencils/backend/jit/__init__.py b/src/pystencils/backend/jit/__init__.py
index 7938f7083..2282476d8 100644
--- a/src/pystencils/backend/jit/__init__.py
+++ b/src/pystencils/backend/jit/__init__.py
@@ -27,14 +27,9 @@ Both are available here through `LegacyCpuJit` and `LegacyGpuJit`.
 """
 
 from .jit import JitBase, NoJit, LegacyCpuJit, LegacyGpuJit
+from .gpu_cupy import CupyJit
 
 no_jit = NoJit()
 """Disables just-in-time compilation for a kernel."""
 
-__all__ = [
-    "JitBase",
-    "LegacyCpuJit",
-    "NoJit",
-    "no_jit",
-    "LegacyGpuJit",
-]
+__all__ = ["JitBase", "LegacyCpuJit", "NoJit", "no_jit", "LegacyGpuJit", "CupyJit"]
diff --git a/src/pystencils/backend/jit/gpu_cupy.py b/src/pystencils/backend/jit/gpu_cupy.py
index 77816175b..9a4b9774a 100644
--- a/src/pystencils/backend/jit/gpu_cupy.py
+++ b/src/pystencils/backend/jit/gpu_cupy.py
@@ -1,9 +1,15 @@
 from typing import Callable, Any
 from dataclasses import dataclass
 
-import cupy as cp
+try:
+    import cupy as cp
+
+    HAVE_CUPY = True
+except ImportError:
+    HAVE_CUPY = False
 
 from ...enums import Target
+from ...kernel_wrapper import KernelWrapper
 from ...types import PsType
 from .jit import JitBase, JitError
 
@@ -26,7 +32,7 @@ class LaunchGrid:
     block: tuple[int, int, int]
 
 
-class CupyKernelWrapper:
+class CupyKernelWrapper(KernelWrapper):
     def __init__(
         self,
         kfunc: GpuKernelFunction,
@@ -34,14 +40,22 @@ class CupyKernelWrapper:
         block_size: tuple[int, int, int],
     ):
         self._kfunc = kfunc
-        self._kernel = raw_kernel
+        self._raw_kernel = raw_kernel
         self._block_size = block_size
 
+    @property
+    def kernel_function(self) -> GpuKernelFunction:
+        return self._kfunc
+
+    @property
+    def raw_kernel(self) -> cp.RawKernel:
+        return self._raw_kernel
+
     def __call__(self, **kwargs: Any) -> Any:
         kernel_args, launch_grid = self._get_args(**kwargs)
         device = self._get_device(kernel_args)
-        with cp.cuda.device(device):
-            self._kernel(launch_grid.grid, launch_grid.block, kernel_args)
+        with cp.cuda.Device(device):
+            self._raw_kernel(launch_grid.grid, launch_grid.block, kernel_args)
 
     def _get_device(self, kernel_args):
         devices = set(a.device.id for a in kernel_args if type(a) is cp.ndarray)
@@ -62,6 +76,7 @@
             valuation[name] = arg
 
         # Collect parameter values
+        # TODO: Check array sizes
         arr: cp.ndarray
 
         for kparam in self._kfunc.parameters:
@@ -81,7 +96,7 @@
 
                 case FieldStrideParam(name, dtype, field, coord):
                     arr = kwargs[field.name]
-                    add_arg(name, arr.strides[coord], dtype)
+                    add_arg(name, arr.strides[coord] // arr.dtype.itemsize, dtype)
 
                 case KernelParameter(name, dtype):
                     val: Any = kwargs[name]
@@ -119,11 +134,14 @@ class CupyJit(JitBase):
 
     def __init__(self, default_block_size: tuple[int, int, int] = (128, 2, 1)):
         # TODO: Fp16 headers
-        self._runtime_headers = {"<cstdint>", '"gpu_defines.h"'}
+        self._runtime_headers = {"<cstdint>"}
         self._default_block_size = default_block_size
 
     def compile(self, kfunc: KernelFunction) -> Callable[..., None]:
-        import cupy as cp
+        if not HAVE_CUPY:
+            raise JitError(
+                "`cupy` is not installed: just-in-time-compilation of CUDA kernels is unavailable."
+            )
 
         if not isinstance(kfunc, GpuKernelFunction) or kfunc.target != Target.CUDA:
             raise ValueError(
@@ -157,4 +175,4 @@ class CupyJit(JitBase):
 
     def _kernel_code(self, kfunc: GpuKernelFunction) -> str:
         kernel_code = emit_code(kfunc)
-        return f'extern "C" {{\n{kernel_code}\n}}\n'
+        return f'extern "C" {kernel_code}'
diff --git a/src/pystencils/backend/kernelcreation/cpu_optimization.py b/src/pystencils/backend/kernelcreation/cpu_optimization.py
index 29b133ff1..46fef6603 100644
--- a/src/pystencils/backend/kernelcreation/cpu_optimization.py
+++ b/src/pystencils/backend/kernelcreation/cpu_optimization.py
@@ -1,12 +1,14 @@
 from __future__ import annotations
-from typing import cast
+from typing import cast, TYPE_CHECKING
 
 from .context import KernelCreationContext
-from ..platforms import GenericCpu
 from ..ast.structural import PsBlock
 
 from ...config import CpuOptimConfig, OpenMpConfig
 
+if TYPE_CHECKING:
+    from ..platforms import GenericCpu
+
 
 def optimize_cpu(
     ctx: KernelCreationContext,
diff --git a/src/pystencils/backend/kernelfunction.py b/src/pystencils/backend/kernelfunction.py
index dbc1b6519..86bd505cb 100644
--- a/src/pystencils/backend/kernelfunction.py
+++ b/src/pystencils/backend/kernelfunction.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from abc import ABC
-from typing import Callable, Sequence, Iterable
+from typing import Callable, Sequence, Iterable, TYPE_CHECKING
 
 from .ast.structural import PsBlock
 from .ast.analysis import collect_required_headers, collect_undefined_symbols
@@ -12,11 +12,13 @@
 from .platforms import Platform, GpuThreadsRange
 from .constraints import KernelParamsConstraint
 from ..types import PsType
 
-from .jit import JitBase, no_jit
 from ..enums import Target
 from ..field import Field
 
+if TYPE_CHECKING:
+    from .jit import JitBase
+
 
 class KernelParameter:
     __match_args__ = ("name", "dtype")
@@ -121,7 +123,7 @@
         parameters: Sequence[KernelParameter],
         required_headers: set[str],
        constraints: Sequence[KernelParamsConstraint],
-        jit: JitBase = no_jit,
+        jit: JitBase,
     ):
         self._body: PsBlock = body
         self._target = target
@@ -196,7 +198,7 @@ class GpuKernelFunction(KernelFunction):
         parameters: Sequence[KernelParameter],
         required_headers: set[str],
         constraints: Sequence[KernelParamsConstraint],
-        jit: JitBase = no_jit,
+        jit: JitBase,
     ):
         super().__init__(
             body, target, name, parameters, required_headers, constraints, jit
diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index 338a7d490..c9cb97210 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -45,7 +45,7 @@ class CudaPlatform(GenericGpu):
 
     @property
     def required_headers(self) -> set[str]:
-        return {"gpu_defines.h"}
+        return {'"gpu_defines.h"'}
 
     def materialize_iteration_space(
         self, body: PsBlock, ispace: IterationSpace
diff --git a/src/pystencils/config.py b/src/pystencils/config.py
index d2af213c1..b98079794 100644
--- a/src/pystencils/config.py
+++ b/src/pystencils/config.py
@@ -1,4 +1,5 @@
 from __future__ import annotations
+from typing import TYPE_CHECKING
 
 from collections.abc import Collection
 
@@ -8,12 +9,17 @@ from dataclasses import dataclass
 
 from .enums import Target
 from .field import Field, FieldType
-from .backend.jit import JitBase
-from .backend.exceptions import PsOptionsError
 from .types import PsIntegerType, PsNumericType, PsIeeeFloatType
 
 from .defaults import DEFAULTS
 
+if TYPE_CHECKING:
+    from .backend.jit import JitBase
+
+
+class PsOptionsError(Exception):
+    """Indicates an option clash in the `CreateKernelConfig`."""
+
 
 @dataclass
 class OpenMpConfig:
@@ -252,7 +258,9 @@ class CreateKernelConfig:
 
         if self.gpu_indexing is not None:
             if self.target != Target.SYCL:
-                raise PsOptionsError(f"`sycl_indexing` cannot be set for non-SYCL target {self.target}")
+                raise PsOptionsError(
+                    f"`sycl_indexing` cannot be set for non-SYCL target {self.target}"
+                )
 
         # Infer JIT
         if self.jit is None:
@@ -260,8 +268,19 @@ class CreateKernelConfig:
                 from .backend.jit import LegacyCpuJit
 
                 self.jit = LegacyCpuJit()
+            elif self.target == Target.CUDA:
+                try:
+                    from .backend.jit.gpu_cupy import CupyJit
+
+                    self.jit = CupyJit()
+                except ImportError:
+                    from .backend.jit import no_jit
+
+                    self.jit = no_jit
+
             elif self.target == Target.SYCL:
                 from .backend.jit import no_jit
+
                 self.jit = no_jit
             else:
                 raise NotImplementedError(
diff --git a/tests/nbackend/kernelcreation/test_domain_kernels.py b/tests/nbackend/kernelcreation/test_domain_kernels.py
index 9ce2f661d..79a23fdb3 100644
--- a/tests/nbackend/kernelcreation/test_domain_kernels.py
+++ b/tests/nbackend/kernelcreation/test_domain_kernels.py
@@ -1,13 +1,21 @@
+#%%
+import pytest
 import sympy as sp
 import numpy as np
 
-from pystencils import fields, Field, AssignmentCollection
+from pystencils import fields, Field, AssignmentCollection, Target, CreateKernelConfig
 from pystencils.sympyextensions.astnodes import assignment_from_stencil
 
 from pystencils.kernelcreation import create_kernel
 
 
-def test_filter_kernel():
+@pytest.mark.parametrize("target", (Target.GenericCPU, Target.CUDA))
+def test_filter_kernel(target):
+    if target == Target.CUDA:
+        xp = pytest.importorskip("cupy")
+    else:
+        xp = np
+
     weight = sp.Symbol("weight")
     stencil = [
         [1, 1, 1],
@@ -19,18 +27,19 @@
     asm = assignment_from_stencil(stencil, src, dst, normalization_factor=weight)
     asms = AssignmentCollection([asm])
 
-    ast = create_kernel(asms)
+    gen_config = CreateKernelConfig(target=target)
+    ast = create_kernel(asms, gen_config)
     kernel = ast.compile()
 
-    src_arr = np.ones((42, 42))
-    dst_arr = np.zeros_like(src_arr)
+    src_arr = xp.ones((42, 42))
+    dst_arr = xp.zeros_like(src_arr)
 
     kernel(src=src_arr, dst=dst_arr, weight=2.0)
 
-    expected = np.zeros_like(src_arr)
+    expected = xp.zeros_like(src_arr)
     expected[1:-1, 1:-1].fill(18.0)
 
-    np.testing.assert_allclose(dst_arr, expected)
+    xp.testing.assert_allclose(dst_arr, expected)
 
 
 def test_filter_kernel_fixedsize():
@@ -59,3 +68,8 @@
     expected[1:-1, 1:-1].fill(18.0)
 
     np.testing.assert_allclose(dst_arr, expected)
+
+#%%
+test_filter_kernel(Target.CUDA)
+
+# %%
-- 
GitLab
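
A note on the `FieldStrideParam` change in gpu_cupy.py: NumPy and CuPy report `ndarray.strides` in bytes, while the division by `arr.dtype.itemsize` suggests the generated kernels index arrays in elements rather than bytes. A minimal standalone sketch of the conversion (illustrative only, not part of the patch):

    import numpy as np

    # ndarray.strides is reported in bytes: a C-contiguous (42, 42)
    # float64 array has byte strides (42 * 8, 8) = (336, 8).
    arr = np.zeros((42, 42), dtype=np.float64)
    assert arr.strides == (336, 8)

    # Dividing by the item size yields the element strides that the
    # kernel arguments are built from after this patch.
    elem_strides = tuple(s // arr.dtype.itemsize for s in arr.strides)
    assert elem_strides == (42, 1)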
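The `_kernel_code` change keeps the emitted source wrapped in `extern "C"`, which prevents C++ name mangling; `cupy.RawKernel` resolves the kernel symbol by its plain name, so the unmangled symbol is what makes that lookup succeed. A standalone sketch of the lookup path, using a hypothetical hand-written `scale` kernel rather than generated pystencils code (assumes cupy and a CUDA device are available):

    import cupy as cp
    import numpy as np

    # Without extern "C", the compiler would mangle the symbol name and
    # the lookup by "scale" below would fail.
    code = r'''
    extern "C" __global__ void scale(double *data, double factor, int n) {
        int i = blockDim.x * blockIdx.x + threadIdx.x;
        if (i < n) data[i] *= factor;
    }
    '''

    kernel = cp.RawKernel(code, "scale")  # looked up by unmangled name

    x = cp.ones(256, dtype=cp.float64)
    kernel((1, 1, 1), (256, 1, 1), (x, np.float64(3.0), np.int32(x.size)))
    assert bool((x == 3.0).all())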
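Several hunks (config.py, kernelfunction.py, cpu_optimization.py) move imports behind `typing.TYPE_CHECKING`, combined with `from __future__ import annotations`: the guarded names remain usable in annotations while the import is never executed at runtime, which avoids the circular import between `config` and the `backend.jit` package. The pattern in isolation, with `Fraction` standing in for `JitBase` in this sketch:

    from __future__ import annotations
    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Evaluated by static type checkers only, never at runtime, so
        # it cannot participate in an import cycle.
        from fractions import Fraction

    def halve(x: Fraction) -> Fraction:
        # With postponed evaluation of annotations (PEP 563), the hints
        # above are stored as strings and not resolved at module load.
        return x / 2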