From b1b8f513f59d79ea908e6cc2e656bc434c2b1e13 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Sun, 30 Jun 2024 16:37:27 +0200
Subject: [PATCH] first successful tests of cupy JIT; waiting for
 pycodegen/pystencils!393 to continue

---
 src/pystencils/backend/exceptions.py          |  4 ---
 src/pystencils/backend/jit/__init__.py        |  9 ++---
 src/pystencils/backend/jit/gpu_cupy.py        | 36 ++++++++++++++-----
 .../kernelcreation/cpu_optimization.py        |  6 ++--
 src/pystencils/backend/kernelfunction.py      | 10 +++---
 src/pystencils/backend/platforms/cuda.py      |  2 +-
 src/pystencils/config.py                      | 25 +++++++++++--
 .../kernelcreation/test_domain_kernels.py     | 28 +++++++++++----
 8 files changed, 83 insertions(+), 37 deletions(-)

diff --git a/src/pystencils/backend/exceptions.py b/src/pystencils/backend/exceptions.py
index 4c0812249..d42f7c11f 100644
--- a/src/pystencils/backend/exceptions.py
+++ b/src/pystencils/backend/exceptions.py
@@ -5,10 +5,6 @@ class PsInternalCompilerError(Exception):
     """Indicates an internal error during kernel translation, most likely due to a bug inside pystencils."""
 
 
-class PsOptionsError(Exception):
-    """Indicates an option clash in the `CreateKernelConfig`."""
-
-
 class PsInputError(Exception):
     """Indicates unsupported user input to the translation system"""
 
diff --git a/src/pystencils/backend/jit/__init__.py b/src/pystencils/backend/jit/__init__.py
index 7938f7083..2282476d8 100644
--- a/src/pystencils/backend/jit/__init__.py
+++ b/src/pystencils/backend/jit/__init__.py
@@ -27,14 +27,9 @@ Both are available here through `LegacyCpuJit` and `LegacyGpuJit`.
 """
 
 from .jit import JitBase, NoJit, LegacyCpuJit, LegacyGpuJit
+from .gpu_cupy import CupyJit
 
 no_jit = NoJit()
 """Disables just-in-time compilation for a kernel."""
 
-__all__ = [
-    "JitBase",
-    "LegacyCpuJit",
-    "NoJit",
-    "no_jit",
-    "LegacyGpuJit",
-]
+__all__ = ["JitBase", "LegacyCpuJit", "NoJit", "no_jit", "LegacyGpuJit", "CupyJit"]
diff --git a/src/pystencils/backend/jit/gpu_cupy.py b/src/pystencils/backend/jit/gpu_cupy.py
index 77816175b..9a4b9774a 100644
--- a/src/pystencils/backend/jit/gpu_cupy.py
+++ b/src/pystencils/backend/jit/gpu_cupy.py
@@ -1,9 +1,15 @@
 from typing import Callable, Any
 from dataclasses import dataclass
 
-import cupy as cp
+try:
+    import cupy as cp
+
+    HAVE_CUPY = True
+except ImportError:
+    HAVE_CUPY = False
 
 from ...enums import Target
+from ...kernel_wrapper import KernelWrapper
 
 from ...types import PsType
 from .jit import JitBase, JitError
@@ -26,7 +32,7 @@ class LaunchGrid:
     block: tuple[int, int, int]
 
 
-class CupyKernelWrapper:
+class CupyKernelWrapper(KernelWrapper):
     def __init__(
         self,
         kfunc: GpuKernelFunction,
@@ -34,14 +40,22 @@ class CupyKernelWrapper:
         block_size: tuple[int, int, int],
     ):
         self._kfunc = kfunc
-        self._kernel = raw_kernel
+        self._raw_kernel = raw_kernel
         self._block_size = block_size
 
+    @property
+    def kernel_function(self) -> GpuKernelFunction:
+        return self._kfunc
+
+    @property
+    def raw_kernel(self) -> "cp.RawKernel":
+        return self._raw_kernel
+
     def __call__(self, **kwargs: Any) -> Any:
         kernel_args, launch_grid = self._get_args(**kwargs)
         device = self._get_device(kernel_args)
-        with cp.cuda.device(device):
-            self._kernel(launch_grid.grid, launch_grid.block, kernel_args)
+        with cp.cuda.Device(device):
+            self._raw_kernel(launch_grid.grid, launch_grid.block, kernel_args)
 
     def _get_device(self, kernel_args):
         devices = set(a.device.id for a in kernel_args if type(a) is cp.ndarray)
@@ -62,6 +76,7 @@ class CupyKernelWrapper:
             valuation[name] = arg
 
         #   Collect parameter values
+        #   TODO: Check array sizes
         arr: cp.ndarray
 
         for kparam in self._kfunc.parameters:
@@ -81,7 +96,7 @@ class CupyKernelWrapper:
 
                 case FieldStrideParam(name, dtype, field, coord):
                     arr = kwargs[field.name]
-                    add_arg(name, arr.strides[coord], dtype)
+                    add_arg(name, arr.strides[coord] // arr.dtype.itemsize, dtype)
 
                 case KernelParameter(name, dtype):
                     val: Any = kwargs[name]
@@ -119,11 +134,14 @@ class CupyJit(JitBase):
 
     def __init__(self, default_block_size: tuple[int, int, int] = (128, 2, 1)):
         #   TODO: Fp16 headers
-        self._runtime_headers = {"<cstdint>", '"gpu_defines.h"'}
+        self._runtime_headers = {"<cstdint>"}
         self._default_block_size = default_block_size
 
     def compile(self, kfunc: KernelFunction) -> Callable[..., None]:
-        import cupy as cp
+        if not HAVE_CUPY:
+            raise JitError(
+                "`cupy` is not installed: just-in-time-compilation of CUDA kernels is unavailable."
+            )
 
         if not isinstance(kfunc, GpuKernelFunction) or kfunc.target != Target.CUDA:
             raise ValueError(
@@ -157,4 +175,4 @@ class CupyJit(JitBase):
 
     def _kernel_code(self, kfunc: GpuKernelFunction) -> str:
         kernel_code = emit_code(kfunc)
-        return f'extern "C" {{\n{kernel_code}\n}}\n'
+        return f'extern "C" {kernel_code}'
diff --git a/src/pystencils/backend/kernelcreation/cpu_optimization.py b/src/pystencils/backend/kernelcreation/cpu_optimization.py
index 29b133ff1..46fef6603 100644
--- a/src/pystencils/backend/kernelcreation/cpu_optimization.py
+++ b/src/pystencils/backend/kernelcreation/cpu_optimization.py
@@ -1,12 +1,14 @@
 from __future__ import annotations
-from typing import cast
+from typing import cast, TYPE_CHECKING
 
 from .context import KernelCreationContext
-from ..platforms import GenericCpu
 from ..ast.structural import PsBlock
 
 from ...config import CpuOptimConfig, OpenMpConfig
 
+if TYPE_CHECKING:
+    from ..platforms import GenericCpu
+
 
 def optimize_cpu(
     ctx: KernelCreationContext,
diff --git a/src/pystencils/backend/kernelfunction.py b/src/pystencils/backend/kernelfunction.py
index dbc1b6519..86bd505cb 100644
--- a/src/pystencils/backend/kernelfunction.py
+++ b/src/pystencils/backend/kernelfunction.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from abc import ABC
-from typing import Callable, Sequence, Iterable
+from typing import Callable, Sequence, Iterable, TYPE_CHECKING
 
 from .ast.structural import PsBlock
 from .ast.analysis import collect_required_headers, collect_undefined_symbols
@@ -12,11 +12,13 @@ from .platforms import Platform, GpuThreadsRange
 
 from .constraints import KernelParamsConstraint
 from ..types import PsType
-from .jit import JitBase, no_jit
 
 from ..enums import Target
 from ..field import Field
 
+if TYPE_CHECKING:
+    from .jit import JitBase
+
 
 class KernelParameter:
     __match_args__ = ("name", "dtype")
@@ -121,7 +123,7 @@ class KernelFunction:
         parameters: Sequence[KernelParameter],
         required_headers: set[str],
         constraints: Sequence[KernelParamsConstraint],
-        jit: JitBase = no_jit,
+        jit: JitBase,
     ):
         self._body: PsBlock = body
         self._target = target
@@ -196,7 +198,7 @@ class GpuKernelFunction(KernelFunction):
         parameters: Sequence[KernelParameter],
         required_headers: set[str],
         constraints: Sequence[KernelParamsConstraint],
-        jit: JitBase = no_jit,
+        jit: JitBase,
     ):
         super().__init__(
             body, target, name, parameters, required_headers, constraints, jit
diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index 338a7d490..c9cb97210 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -45,7 +45,7 @@ class CudaPlatform(GenericGpu):
 
     @property
     def required_headers(self) -> set[str]:
-        return {"gpu_defines.h"}
+        return {'"gpu_defines.h"'}
 
     def materialize_iteration_space(
         self, body: PsBlock, ispace: IterationSpace
diff --git a/src/pystencils/config.py b/src/pystencils/config.py
index d2af213c1..b98079794 100644
--- a/src/pystencils/config.py
+++ b/src/pystencils/config.py
@@ -1,4 +1,5 @@
 from __future__ import annotations
+from typing import TYPE_CHECKING
 
 from collections.abc import Collection
 
@@ -8,12 +9,17 @@ from dataclasses import dataclass
 from .enums import Target
 from .field import Field, FieldType
 
-from .backend.jit import JitBase
-from .backend.exceptions import PsOptionsError
 from .types import PsIntegerType, PsNumericType, PsIeeeFloatType
 
 from .defaults import DEFAULTS
 
+if TYPE_CHECKING:
+    from .backend.jit import JitBase
+
+
+class PsOptionsError(Exception):
+    """Indicates an option clash in the `CreateKernelConfig`."""
+
 
 @dataclass
 class OpenMpConfig:
@@ -252,7 +258,9 @@ class CreateKernelConfig:
 
         if self.gpu_indexing is not None:
             if self.target != Target.SYCL:
-                raise PsOptionsError(f"`sycl_indexing` cannot be set for non-SYCL target {self.target}")
+                raise PsOptionsError(
+                    f"`gpu_indexing` cannot be set for non-SYCL target {self.target}"
+                )
 
         #   Infer JIT
         if self.jit is None:
@@ -260,8 +268,19 @@ class CreateKernelConfig:
                 from .backend.jit import LegacyCpuJit
 
                 self.jit = LegacyCpuJit()
+            elif self.target == Target.CUDA:
+                # gpu_cupy guards its cupy import; ImportError never fires
+                from .backend.jit.gpu_cupy import HAVE_CUPY, CupyJit
+
+                if HAVE_CUPY:
+                    self.jit = CupyJit()
+                else:
+                    from .backend.jit import no_jit
+                    self.jit = no_jit
+
             elif self.target == Target.SYCL:
                 from .backend.jit import no_jit
+
                 self.jit = no_jit
             else:
                 raise NotImplementedError(
diff --git a/tests/nbackend/kernelcreation/test_domain_kernels.py b/tests/nbackend/kernelcreation/test_domain_kernels.py
index 9ce2f661d..79a23fdb3 100644
--- a/tests/nbackend/kernelcreation/test_domain_kernels.py
+++ b/tests/nbackend/kernelcreation/test_domain_kernels.py
@@ -1,13 +1,21 @@
+#%%
+import pytest
 import sympy as sp
 import numpy as np
 
-from pystencils import fields, Field, AssignmentCollection
+from pystencils import fields, Field, AssignmentCollection, Target, CreateKernelConfig
 from pystencils.sympyextensions.astnodes import assignment_from_stencil
 
 from pystencils.kernelcreation import create_kernel
 
 
-def test_filter_kernel():
+@pytest.mark.parametrize("target", (Target.GenericCPU, Target.CUDA))
+def test_filter_kernel(target):
+    if target == Target.CUDA:
+        xp = pytest.importorskip("cupy")
+    else:
+        xp = np
+
     weight = sp.Symbol("weight")
     stencil = [
         [1, 1, 1],
@@ -19,18 +27,19 @@ def test_filter_kernel():
     asm = assignment_from_stencil(stencil, src, dst, normalization_factor=weight)
     asms = AssignmentCollection([asm])
 
-    ast = create_kernel(asms)
+    gen_config = CreateKernelConfig(target=target)
+    ast = create_kernel(asms, gen_config)
     kernel = ast.compile()
 
-    src_arr = np.ones((42, 42))
-    dst_arr = np.zeros_like(src_arr)
+    src_arr = xp.ones((42, 42))
+    dst_arr = xp.zeros_like(src_arr)
 
     kernel(src=src_arr, dst=dst_arr, weight=2.0)
 
-    expected = np.zeros_like(src_arr)
+    expected = xp.zeros_like(src_arr)
     expected[1:-1, 1:-1].fill(18.0)
 
-    np.testing.assert_allclose(dst_arr, expected)
+    xp.testing.assert_allclose(dst_arr, expected)
 
 
 def test_filter_kernel_fixedsize():
@@ -59,3 +68,8 @@ def test_filter_kernel_fixedsize():
     expected[1:-1, 1:-1].fill(18.0)
 
     np.testing.assert_allclose(dst_arr, expected)
+
+#%%
+# test_filter_kernel(Target.CUDA)  # NOTE(review): module-level call breaks pytest collection
+
+# %%
-- 
GitLab