diff --git a/conftest.py b/conftest.py
index ff0467eff85d2b43854246ec98091a15ad347e63..7ea8f5ba0abebe731c229311e12d6346786f8345 100644
--- a/conftest.py
+++ b/conftest.py
@@ -43,10 +43,16 @@ def add_path_to_ignore(path):
     ]


-collect_ignore = [
-    os.path.join(SCRIPT_FOLDER, "doc", "conf.py"),
-    os.path.join(SCRIPT_FOLDER, "src", "pystencils", "opencl", "opencl.autoinit"),
-]
+def ignore_file(fp):
+    global collect_ignore
+    collect_ignore += [os.path.join(SCRIPT_FOLDER, fp)]
+
+
+collect_ignore = []
+
+ignore_file("noxfile.py")
+ignore_file("docs/source/conf.py")
+add_path_to_ignore("docs/build")

 add_path_to_ignore("tests/benchmark")
 add_path_to_ignore("_local_tmp")
diff --git a/docs/source/backend/gpu_codegen.md b/docs/source/backend/gpu_codegen.md
index e5035d7c7b4f2f21699f04b3cc0a7245a7ba2ff4..3fe00840ed13faeb837d91711bca0be1a390aa4e 100644
--- a/docs/source/backend/gpu_codegen.md
+++ b/docs/source/backend/gpu_codegen.md
@@ -2,23 +2,19 @@
 The code generation infrastructure for Nvidia and AMD GPUs using CUDA and HIP
 comprises the following components:

- - The {any}`CudaPlatform` at `backend.platforms` which performs materialization of a kernel's iteration
-   space by mapping GPU block and thread indices to iteration space points. To perform this task,
-   it depends on a {any}`ThreadMapping` instance which defines the nature of that mapping.
+ - The platforms {any}`CudaPlatform` and {any}`HipPlatform` at `backend.platforms`,
+   which perform materialization of a kernel's iteration space
+   by mapping GPU block and thread indices to iteration space points.
+   To perform this task, they depend on a {any}`ThreadMapping` instance which defines the nature of that mapping.
   The platform also takes care of lowering mathematical functions to their CUDA runtime library implementation.
 - In the code generation driver, the strings are drawn by the `GpuIndexing` helper class.
   It provides both the {any}`ThreadMapping` for the codegen backend, as well as the launch configuration
   for the runtime system.

-:::{attention}
-
-Code generation for HIP through the `CudaPlatform` is experimental and not tested at the moment.
-:::
-
-## The CUDA Platform and Thread Mappings
+## The GPU Platform and Thread Mappings

 ```{eval-rst}
-.. module:: pystencils.backend.platforms.cuda
+.. module:: pystencils.backend.platforms.generic_gpu

 .. autosummary::
     :toctree: generated
diff --git a/docs/source/backend/platforms.md b/docs/source/backend/platforms.md
index e7ffc6f1523272621d273a17624d6323c25651b1..2d2c33d862627a0e92ca8afb466fee86cb78e3d5 100644
--- a/docs/source/backend/platforms.md
+++ b/docs/source/backend/platforms.md
@@ -26,7 +26,6 @@ targets in the future.
     Platform
     GenericCpu
     GenericVectorCpu
-    GenericGpu
 ```

 ## CPU Platforms
@@ -49,6 +48,18 @@ targets in the future.
     :nosignatures:
     :template: autosummary/entire_class.rst

+    GenericGpu
     CudaPlatform
-    SyclPlatform
+    HipPlatform
 ```
+
+## Experimental Platforms
+
+```{eval-rst}
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+    :template: autosummary/entire_class.rst
+
+    SyclPlatform
+```
\ No newline at end of file
diff --git a/docs/source/contributing/dev-workflow.md b/docs/source/contributing/dev-workflow.md
index 8daac8cbd179a9922d9d70ebbcee2cd7b5dbbba2..d9291613083439baeb226f4536844665a63d353f 100644
--- a/docs/source/contributing/dev-workflow.md
+++ b/docs/source/contributing/dev-workflow.md
@@ -48,16 +48,22 @@ git pull --set-upstream upstream master

 ## Set Up the Python Environment

+### Prerequisites
+
 To develop pystencils, you will need at least the following software installed on your machine:

 - Python 3.10 or later: Since pystencils minimal supported version is Python 3.10,
   we recommend that you work with Python 3.10 directly.
 - An up-to-date C++ compiler, used by pystencils to JIT-compile generated code
 - [Nox](https://nox.thea.codes/en/stable/), which we use for test automation.
   Nox will be used extensively in the instructions on testing below.
-- Optionally [CUDA](https://developer.nvidia.com/cuda-toolkit),
-  if you have an Nvidia or AMD GPU and plan to develop on pystencils' GPU capabilities
+- Optionally, for GPU development:
+  - At least CUDA 11 for Nvidia GPUs, or
+  - At least ROCm/HIP 6.1 for AMD GPUs.
+
+### Virtual Environment Setup

-Once you have these, set up a [virtual environment](https://docs.python.org/3/library/venv.html) for development.
+Once you have all the prerequisites,
+set up a [virtual environment](https://docs.python.org/3/library/venv.html) for development.
 This ensures that your system's installation of Python is kept clean, and isolates your development environment
 from outside influence.
 Use the following commands to create a virtual environment at `.venv` and perform an editable install of pystencils into it:
@@ -74,7 +80,39 @@
 Setting `PIP_REQUIRE_VIRTUALENV` ensures that pip refuses to install packages globally.
 Consider setting this variable globally in your shell's configuration file.
 :::

-You are now ready to go! Create a new git branch to work on, open up an IDE, and start coding.
+:::{admonition} Feature Groups
+The above installation instructions assume that you will be running all code checking
+and test tasks through `nox`.
+If you need or want to run them manually, you will need to add one or more
+of these feature groups to your installation:
+
+ - `doc`, which contains all dependencies required to build this documentation;
+ - `dev`, which adds `flake8` for code style checking,
+   `mypy` for static type checking,
+   and the `black` formatter;
+ - `testsuite`, which adds `pytest` plus plugins and some more dependencies required
+   for running the test suite.
+
+Depending on your development focus, you might also need to add some of the user feature
+groups listed in [the installation guide](#installation_guide).
+:::
+
+### CuPy for CUDA and HIP
+
+When developing for Nvidia or AMD GPUs, you will likely need an installation of [cupy](https://cupy.dev/).
+Since cupy has to be built specifically against the libraries of a given CUDA or ROCm version,
+it cannot be installed directly via dependency resolution from pystencils.
+For instructions on how to install CuPy, refer to their [installation manual](https://docs.cupy.dev/en/stable/install.html).
+
+### Test Your Setup
+
+A good way to check that your setup is complete is to invoke the pystencils test suite:
+
+```bash
+nox -s "testsuite(cpu)"
+```
+
+If this finishes without errors, you are ready to go!
+Create a new git branch to work on, open up an IDE, and start coding.
+Make sure, though, that your IDE recognizes the virtual environment you created.

 ## Static Code Analysis
diff --git a/docs/source/installation.md b/docs/source/installation.md
index deb2b0613564f98468f623544acf3cc1ca9d279e..8fdb5684fdfea60a117b090e4d3ab1976ef9d0b7 100644
--- a/docs/source/installation.md
+++ b/docs/source/installation.md
@@ -1,4 +1,4 @@
-(_installation)=
+(installation_guide)=
 # Setup and Installation

 ## Install pystencils
@@ -17,7 +17,7 @@
 git clone -b v2.0-dev https://i10git.cs.fau.de/pycodegen/pystencils.git
 pip install -e pystencils
 ```

-### Feature Groups
+## Feature Groups

 In both cases, you can add a set of optional features to your installation
 by listing them in square brackets (e.g. `pip install -e pystencils[feature1, feature2]`).
@@ -33,25 +33,22 @@ The following feature sets are available:
 - `use_cython`: Install [Cython](https://cython.org/), which is used internally by pystencils
   to accelerate the setup of boundary conditions.

-:::{dropdown} For Developers
-
-If you are developing pystencils, we recommend you perform an editable install of your
-local clone of the repository, with all optional features:
-```bash
-pip install -e pystencils[alltrafos,interactive,use_cython,doc,testsuite]
-```
-
-This includes the additional feature groups `doc`, which contains all dependencies required
-to build this documentation, and `tests`, which adds `flake8` for code style checking,
-`mypy` for static type checking, and `pytest` plus plugins for running the test suite.
-
-For more information on developing pystencils, see the [](#contribution_guide).
-:::
-
-### For Nvidia GPUs
+## For GPUs

 If you have an Nvidia graphics processor and CUDA installed, you can use pystencils
 to directly compile and execute kernels running on your GPU.
-This requires a working installation of [cupy](https://cupy.dev).
+This requires a working installation of [CuPy](https://cupy.dev).
 Please refer to the cupy's [installation manual](https://docs.cupy.dev/en/stable/install.html)
 for details about installing cupy.
+
+You can also use CuPy together with AMD ROCm and HIP for AMD graphics cards,
+but the setup steps are a bit more complicated - you might have to build CuPy from source.
+The CuPy documentation covers this in their [installation guide for CuPy on ROCm][cupy-rocm].
+
+:::{note}
+Since CuPy's support for ROCm is still experimental at this time,
+just-in-time compilation of pystencils HIP kernels
+for the ROCm platform must also be considered *experimental*.
+:::
+
+[cupy-rocm]: https://docs.cupy.dev/en/stable/install.html#using-cupy-on-amd-gpu-experimental "CuPy on ROCm"
diff --git a/docs/source/user_manual/gpu_kernels.md b/docs/source/user_manual/gpu_kernels.md
index 48a24e703584e8c87365550c7a7d5d8338feb717..7a3d54f6f0a41fefdb4f841351061ba3d383d74b 100644
--- a/docs/source/user_manual/gpu_kernels.md
+++ b/docs/source/user_manual/gpu_kernels.md
@@ -26,23 +26,46 @@
 import matplotlib.pyplot as plt

 (guide_gpukernels)=
 # Pystencils for GPUs

-Pystencils offers code generation for Nvidia GPUs using the CUDA programming model,
+Pystencils offers code generation for Nvidia and AMD GPUs
+using the CUDA and HIP programming models,
 as well as just-in-time compilation and execution of CUDA kernels from within Python
 based on the [cupy] library.
 This section's objective is to give a detailed introduction into the creation of
 GPU kernels with pystencils.
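As a quick sanity check for the CuPy-based setup described above, one can ask CuPy which GPU runtime it was built against. The following is an illustrative sketch (an editorial addition, not part of this changeset) that assumes a working CuPy installation; it uses `cupy.cuda.runtime.is_hip`, the same flag this changeset's `Target.auto_gpu()` relies on:

```python
# Illustrative sketch (assumes cupy is installed): report which GPU runtime
# the local CuPy build targets. `cupy.cuda.runtime.is_hip` is the flag that
# this changeset's Target.CurrentGPU resolution relies on.
import cupy

if cupy.cuda.runtime.is_hip:
    print("CuPy targets ROCm/HIP; Target.CurrentGPU will resolve to Target.HIP")
else:
    print("CuPy targets CUDA; Target.CurrentGPU will resolve to Target.CUDA")
```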
-## Generate, Compile and Run CUDA Kernels
+:::{note}
+[CuPy][cupy] is a Python library for numerical computations on GPU arrays,
+which operates much in the same way that [NumPy][numpy] works on CPU arrays.
+CuPy and NumPy expose nearly the same APIs for array operations;
+the difference being that CuPy allocates all its arrays on the GPU
+and performs its operations as CUDA kernels.
+Also, CuPy exposes a just-in-time compiler for GPU kernels.
+In pystencils, we use CuPy both to compile and provide executable kernels on-demand from within Python code,
+and to allocate and manage the data these kernels can be executed on.
+
+For more information on CuPy, refer to [their documentation][cupy-docs].
+:::
+
+## Generate, Compile and Run GPU Kernels
+
+The CUDA and HIP platforms are made available in pystencils via the code generation targets
+{any}`Target.CUDA` and {any}`Target.HIP`.
+For pystencils code to be portable between both, we can use {any}`Target.CurrentGPU` to
+automatically select one or the other, depending on the current runtime environment.
+
+:::{note}
+If `cupy` is not installed, `create_kernel` will raise an exception when using `Target.CurrentGPU`.
+You can still generate kernels for CUDA or HIP directly even without CuPy;
+you just won't be able to just-in-time compile and run them.
+:::

-In order to obtain a CUDA implementation of a symbolic kernel, naught more is required
-than setting the {any}`target <CreateKernelConfig.target>` code generator option to
-{any}`Target.CUDA`:
+Here is a snippet creating a kernel for the locally available GPU target:

 ```{code-cell} ipython3
 f, g = ps.fields("f, g: float64[3D]")
 update = ps.Assignment(f.center(), 2 * g.center())

-cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
+cfg = ps.CreateKernelConfig(target=ps.Target.CurrentGPU)
 kernel = ps.create_kernel(update, cfg)

 ps.inspect(kernel)
@@ -68,19 +91,6 @@
 kfunc = kernel.compile()
 kfunc(f=f_arr, g=g_arr)
 ```

-:::{note}
-[CuPy][cupy] is a Python library for numerical computations on GPU arrays,
-which operates much in the same way that [NumPy][numpy] works on CPU arrays.
-Cupy and NumPy expose nearly the same APIs for array operations;
-the difference being that CuPy allocates all its arrays on the GPU
-and performs its operations as CUDA kernels.
-Also, CuPy exposes a just-in-time-compiler for GPU kernels, which internally calls [nvrtc].
-In pystencils, we use CuPy both to compile and provide executable kernels on-demand from within Python code,
-and to allocate and manage the data these kernels can be executed on.
-
-For more information on CuPy, refer to [their documentation][cupy-docs].
-:::
-
 (indexing_and_launch_config)=
 ## Modify the Indexing Scheme and Launch Configuration

@@ -233,7 +243,7 @@ assignments = [
 ```{code-cell} ipython3
 y = ps.DEFAULTS.spatial_counters[0]
 cfg = ps.CreateKernelConfig()
-cfg.target= ps.Target.CUDA
+cfg.target = ps.Target.CurrentGPU
 cfg.iteration_slice = ps.make_slice[:, y:]
 ```

@@ -286,5 +296,4 @@
 only a part of the triangle is being processed.
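Putting the pieces together, here is a minimal sketch (an editorial illustration, not part of the diff itself) of portable target selection with a CPU fallback, assuming a pystencils installation that includes this changeset's `Target.available_targets()` and `Target.is_gpu()` helpers:

```python
# Minimal sketch: pick a GPU target when a GPU runtime is present, else fall back to CPU.
# available_targets() only appends the resolved GPU target when cupy is importable.
import pystencils as ps

available = ps.Target.available_targets()
target = ps.Target.CurrentGPU if any(t.is_gpu() for t in available) else ps.Target.GenericCPU

f, g = ps.fields("f, g: float64[3D]")
cfg = ps.CreateKernelConfig(target=target)
kernel = ps.create_kernel(ps.Assignment(f.center(), 2 * g.center()), cfg)
```

On a machine without `cupy`, this falls back to `Target.GenericCPU` instead of letting `create_kernel` raise during `Target.CurrentGPU` resolution.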
[cupy]: https://cupy.dev "CuPy Homepage" [numpy]: https://numpy.org "NumPy Homepage" -[nvrtc]: https://docs.nvidia.com/cuda/nvrtc/index.html "NVIDIA Runtime Compilation Library" [cupy-docs]: https://docs.cupy.dev/en/stable/overview.html "CuPy Documentation" diff --git a/pyproject.toml b/pyproject.toml index b3c6b1c0238654bedd1ba90b09c8d98541e1c73f..ae539b12c7399694f46af0261851de5d68bcc86e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,6 @@ classifiers = [ "Source Code" = "https://i10git.cs.fau.de/pycodegen/pystencils" [project.optional-dependencies] -gpu = ['cupy'] alltrafos = ['islpy', 'py-cpuinfo'] bench_db = ['blitzdb', 'pymongo', 'pandas'] interactive = [ @@ -76,7 +75,7 @@ testsuite = [ 'matplotlib', 'py-cpuinfo', 'randomgen>=1.18', - 'scipy' + 'scipy', ] [build-system] @@ -89,7 +88,8 @@ build-backend = "setuptools.build_meta" [tool.setuptools.package-data] pystencils = [ - "include/*.h", + "include/**/*.h", + "include/**/*.hpp", "jit/cpu/*.tmpl.cpp", "boundaries/createindexlistcython.pyx" ] diff --git a/src/pystencils/backend/emission/base_printer.py b/src/pystencils/backend/emission/base_printer.py index cc4b50e217d68a4bba28c2c95705464a91182212..c4ac0640c44a222f3fbecef9086fdbd36e68f8bc 100644 --- a/src/pystencils/backend/emission/base_printer.py +++ b/src/pystencils/backend/emission/base_printer.py @@ -57,7 +57,6 @@ from ..extensions.foreign_ast import PsForeignExpression from ..memory import PsSymbol from ..constants import PsConstant from ...types import PsType -from ...codegen import Target if TYPE_CHECKING: from ...codegen import Kernel @@ -383,7 +382,7 @@ class BasePrinter(ABC): from ...codegen import GpuKernel sig_parts = [self._func_prefix] if self._func_prefix is not None else [] - if isinstance(func, GpuKernel) and func.target == Target.CUDA: + if isinstance(func, GpuKernel) and func.target.is_gpu(): sig_parts.append("__global__") sig_parts += ["void", func.name, f"({params_str})"] signature = " ".join(sig_parts) diff --git a/src/pystencils/backend/platforms/__init__.py b/src/pystencils/backend/platforms/__init__.py index 589841db87efb598ffeed20d4d11db7ffcd452cc..3b602964bf2442823c9965f3398517635c8cc217 100644 --- a/src/pystencils/backend/platforms/__init__.py +++ b/src/pystencils/backend/platforms/__init__.py @@ -2,6 +2,7 @@ from .platform import Platform from .generic_cpu import GenericCpu, GenericVectorCpu from .generic_gpu import GenericGpu from .cuda import CudaPlatform +from .hip import HipPlatform from .x86 import X86VectorCpu, X86VectorArch from .sycl import SyclPlatform @@ -13,5 +14,6 @@ __all__ = [ "X86VectorArch", "GenericGpu", "CudaPlatform", + "HipPlatform", "SyclPlatform", ] diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index c97fad413198c16b87d6c27949e70d6aa59a3aa1..98ff3e3d332a46074931514ba3af1603dc6318b2 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -1,318 +1,11 @@ from __future__ import annotations -from abc import ABC, abstractmethod -from ...types import constify, deconstify -from ..exceptions import MaterializationError from .generic_gpu import GenericGpu -from ..memory import PsSymbol -from ..kernelcreation import ( - Typifier, - IterationSpace, - FullIterationSpace, - SparseIterationSpace, - AstFactory, -) - -from ..kernelcreation.context import KernelCreationContext -from ..ast.structural import PsBlock, PsConditional, PsDeclaration -from ..ast.expressions import ( - PsExpression, - PsLiteralExpr, - PsCast, - PsCall, - PsLookup, 
- PsBufferAcc, -) -from ..ast.expressions import PsLt, PsAnd -from ...types import PsSignedIntegerType, PsIeeeFloatType -from ..literals import PsLiteral -from ..functions import PsMathFunction, MathFunctions, CFunction - - -int32 = PsSignedIntegerType(width=32, const=False) - -BLOCK_IDX = [ - PsLiteralExpr(PsLiteral(f"blockIdx.{coord}", int32)) for coord in ("x", "y", "z") -] -THREAD_IDX = [ - PsLiteralExpr(PsLiteral(f"threadIdx.{coord}", int32)) for coord in ("x", "y", "z") -] -BLOCK_DIM = [ - PsLiteralExpr(PsLiteral(f"blockDim.{coord}", int32)) for coord in ("x", "y", "z") -] -GRID_DIM = [ - PsLiteralExpr(PsLiteral(f"gridDim.{coord}", int32)) for coord in ("x", "y", "z") -] - - -class ThreadMapping(ABC): - - @abstractmethod - def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]: - """Map the current thread index onto a point in the given iteration space. - - Implementations of this method must return a declaration for each dimension counter - of the given iteration space. - """ - - -class Linear3DMapping(ThreadMapping): - """3D globally linearized mapping, where each thread is assigned a work item according to - its location in the global launch grid.""" - - def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]: - match ispace: - case FullIterationSpace(): - return self._dense_mapping(ispace) - case SparseIterationSpace(): - return self._sparse_mapping(ispace) - case _: - assert False, "unexpected iteration space" - - def _dense_mapping( - self, ispace: FullIterationSpace - ) -> dict[PsSymbol, PsExpression]: - if ispace.rank > 3: - raise MaterializationError( - f"Cannot handle {ispace.rank}-dimensional iteration space " - "using the Linear3D GPU thread index mapping." - ) - - dimensions = ispace.dimensions_in_loop_order() - idx_map: dict[PsSymbol, PsExpression] = dict() - - for coord, dim in enumerate(dimensions[::-1]): - tid = self._linear_thread_idx(coord) - idx_map[dim.counter] = dim.start + dim.step * PsCast( - deconstify(dim.counter.get_dtype()), tid - ) - - return idx_map - - def _sparse_mapping( - self, ispace: SparseIterationSpace - ) -> dict[PsSymbol, PsExpression]: - sparse_ctr = PsExpression.make(ispace.sparse_counter) - thread_idx = self._linear_thread_idx(0) - idx_map: dict[PsSymbol, PsExpression] = { - ispace.sparse_counter: PsCast( - deconstify(sparse_ctr.get_dtype()), thread_idx - ) - } - return idx_map - - def _linear_thread_idx(self, coord: int): - block_size = BLOCK_DIM[coord] - block_idx = BLOCK_IDX[coord] - thread_idx = THREAD_IDX[coord] - return block_idx * block_size + thread_idx - - -class Blockwise4DMapping(ThreadMapping): - """Blockwise index mapping for up to 4D iteration spaces, where the outer three dimensions - are mapped to block indices.""" - - _indices_fastest_first = [ # slowest to fastest - THREAD_IDX[0], - BLOCK_IDX[0], - BLOCK_IDX[1], - BLOCK_IDX[2], - ] - - def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]: - match ispace: - case FullIterationSpace(): - return self._dense_mapping(ispace) - case SparseIterationSpace(): - return self._sparse_mapping(ispace) - case _: - assert False, "unexpected iteration space" - - def _dense_mapping( - self, ispace: FullIterationSpace - ) -> dict[PsSymbol, PsExpression]: - if ispace.rank > 4: - raise MaterializationError( - f"Cannot handle {ispace.rank}-dimensional iteration space " - "using the Blockwise4D GPU thread index mapping." 
-            )
-
-        dimensions = ispace.dimensions_in_loop_order()
-        idx_map: dict[PsSymbol, PsExpression] = dict()
-
-        for dim, tid in zip(dimensions[::-1], self._indices_fastest_first):
-            idx_map[dim.counter] = dim.start + dim.step * PsCast(
-                deconstify(dim.counter.get_dtype()), tid
-            )
-
-        return idx_map
-
-    def _sparse_mapping(
-        self, ispace: SparseIterationSpace
-    ) -> dict[PsSymbol, PsExpression]:
-        sparse_ctr = PsExpression.make(ispace.sparse_counter)
-        thread_idx = self._indices_fastest_first[0]
-        idx_map: dict[PsSymbol, PsExpression] = {
-            ispace.sparse_counter: PsCast(
-                deconstify(sparse_ctr.get_dtype()), thread_idx
-            )
-        }
-        return idx_map
-

 class CudaPlatform(GenericGpu):
-    """Platform for CUDA-based GPUs.
-
-    Args:
-        ctx: The kernel creation context
-        thread_mapping: Callback object which defines the mapping of thread indices onto iteration space points
-    """
-
-    def __init__(
-        self,
-        ctx: KernelCreationContext,
-        thread_mapping: ThreadMapping | None = None,
-    ) -> None:
-        super().__init__(ctx)
-
-        self._thread_mapping = (
-            thread_mapping if thread_mapping is not None else Linear3DMapping()
-        )
-
-        self._typify = Typifier(ctx)
+    """Platform for the CUDA GPU target."""

     @property
     def required_headers(self) -> set[str]:
-        return {'"pystencils_runtime/hip.h"'}  # TODO: move to HipPlatform once it is introduced
-
-    def materialize_iteration_space(
-        self, body: PsBlock, ispace: IterationSpace
-    ) -> PsBlock:
-        if isinstance(ispace, FullIterationSpace):
-            return self._prepend_dense_translation(body, ispace)
-        elif isinstance(ispace, SparseIterationSpace):
-            return self._prepend_sparse_translation(body, ispace)
-        else:
-            raise MaterializationError(f"Unknown type of iteration space: {ispace}")
-
-    def select_function(self, call: PsCall) -> PsExpression:
-        assert isinstance(call.function, PsMathFunction)
-
-        func = call.function.func
-        dtype = call.get_dtype()
-        arg_types = (dtype,) * func.num_args
-
-        if isinstance(dtype, PsIeeeFloatType):
-            match func:
-                case (
-                    MathFunctions.Exp
-                    | MathFunctions.Log
-                    | MathFunctions.Sin
-                    | MathFunctions.Cos
-                    | MathFunctions.Sqrt
-                    | MathFunctions.Ceil
-                    | MathFunctions.Floor
-                ) if dtype.width in (16, 32, 64):
-                    prefix = "h" if dtype.width == 16 else ""
-                    suffix = "f" if dtype.width == 32 else ""
-                    name = f"{prefix}{func.function_name}{suffix}"
-                    cfunc = CFunction(name, arg_types, dtype)
-
-                case (
-                    MathFunctions.Pow
-                    | MathFunctions.Tan
-                    | MathFunctions.Sinh
-                    | MathFunctions.Cosh
-                    | MathFunctions.ASin
-                    | MathFunctions.ACos
-                    | MathFunctions.ATan
-                    | MathFunctions.ATan2
-                ) if dtype.width in (32, 64):
-                    # These are unavailable for fp16
-                    suffix = "f" if dtype.width == 32 else ""
-                    name = f"{func.function_name}{suffix}"
-                    cfunc = CFunction(name, arg_types, dtype)
-
-                case (
-                    MathFunctions.Min | MathFunctions.Max | MathFunctions.Abs
-                ) if dtype.width in (32, 64):
-                    suffix = "f" if dtype.width == 32 else ""
-                    name = f"f{func.function_name}{suffix}"
-                    cfunc = CFunction(name, arg_types, dtype)
-
-                case MathFunctions.Abs if dtype.width == 16:
-                    cfunc = CFunction(" __habs", arg_types, dtype)
-
-                case _:
-                    raise MaterializationError(
-                        f"Cannot materialize call to function {func}"
-                    )
-
-            call.function = cfunc
-            return call
-
-        raise MaterializationError(
-            f"No implementation available for function {func} on data type {dtype}"
-        )
-
-    # Internals
-
-    def _prepend_dense_translation(
-        self, body: PsBlock, ispace: FullIterationSpace
-    ) -> PsBlock:
-        ctr_mapping = self._thread_mapping(ispace)
-
-        indexing_decls = []
-        conds = []
-
-        dimensions = ispace.dimensions_in_loop_order()
-
-        for dim in dimensions:
-            # counter declarations must be ordered slowest-to-fastest
-            # such that inner dimensions can depend on outer ones
-
-            dim.counter.dtype = constify(dim.counter.get_dtype())
-
-            ctr_expr = PsExpression.make(dim.counter)
-            indexing_decls.append(
-                self._typify(PsDeclaration(ctr_expr, ctr_mapping[dim.counter]))
-            )
-            conds.append(PsLt(ctr_expr, dim.stop))
-
-        condition: PsExpression = conds[0]
-        for cond in conds[1:]:
-            condition = PsAnd(condition, cond)
-        return PsBlock(indexing_decls + [PsConditional(condition, body)])
-
-    def _prepend_sparse_translation(
-        self, body: PsBlock, ispace: SparseIterationSpace
-    ) -> PsBlock:
-        factory = AstFactory(self._ctx)
-        ispace.sparse_counter.dtype = constify(ispace.sparse_counter.get_dtype())
-
-        sparse_ctr_expr = PsExpression.make(ispace.sparse_counter)
-        ctr_mapping = self._thread_mapping(ispace)
-
-        sparse_idx_decl = self._typify(
-            PsDeclaration(sparse_ctr_expr, ctr_mapping[ispace.sparse_counter])
-        )
-
-        mappings = [
-            PsDeclaration(
-                PsExpression.make(ctr),
-                PsLookup(
-                    PsBufferAcc(
-                        ispace.index_list.base_pointer,
-                        (sparse_ctr_expr.clone(), factory.parse_index(0)),
-                    ),
-                    coord.name,
-                ),
-            )
-            for ctr, coord in zip(ispace.spatial_indices, ispace.coordinate_members)
-        ]
-        body.statements = mappings + body.statements
-
-        stop = PsExpression.make(ispace.index_list.shape[0])
-        condition = PsLt(sparse_ctr_expr.clone(), stop)
-        return PsBlock([sparse_idx_decl, PsConditional(condition, body)])
+        return set()
diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py
index b5b35c8b03447f1d5c35ed1289b89542bb1127ca..11425d9238e8270b06a628555908622b623931d6 100644
--- a/src/pystencils/backend/platforms/generic_gpu.py
+++ b/src/pystencils/backend/platforms/generic_gpu.py
@@ -1,7 +1,320 @@
 from __future__ import annotations
+from abc import ABC, abstractmethod

+from ...types import constify, deconstify
+from ..exceptions import MaterializationError
 from .platform import Platform
+from ..memory import PsSymbol
+from ..kernelcreation import (
+    Typifier,
+    IterationSpace,
+    FullIterationSpace,
+    SparseIterationSpace,
+    AstFactory,
+)
+
+from ..kernelcreation.context import KernelCreationContext
+from ..ast.structural import PsBlock, PsConditional, PsDeclaration
+from ..ast.expressions import (
+    PsExpression,
+    PsLiteralExpr,
+    PsCast,
+    PsCall,
+    PsLookup,
+    PsBufferAcc,
+)
+from ..ast.expressions import PsLt, PsAnd
+from ...types import PsSignedIntegerType, PsIeeeFloatType
+from ..literals import PsLiteral
+from ..functions import PsMathFunction, MathFunctions, CFunction
+
+
+int32 = PsSignedIntegerType(width=32, const=False)
+
+BLOCK_IDX = [
+    PsLiteralExpr(PsLiteral(f"blockIdx.{coord}", int32)) for coord in ("x", "y", "z")
+]
+THREAD_IDX = [
+    PsLiteralExpr(PsLiteral(f"threadIdx.{coord}", int32)) for coord in ("x", "y", "z")
+]
+BLOCK_DIM = [
+    PsLiteralExpr(PsLiteral(f"blockDim.{coord}", int32)) for coord in ("x", "y", "z")
+]
+GRID_DIM = [
+    PsLiteralExpr(PsLiteral(f"gridDim.{coord}", int32)) for coord in ("x", "y", "z")
+]
+
+
+class ThreadMapping(ABC):
+
+    @abstractmethod
+    def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]:
+        """Map the current thread index onto a point in the given iteration space.
+
+        Implementations of this method must return a declaration for each dimension counter
+        of the given iteration space.
+ """ + + +class Linear3DMapping(ThreadMapping): + """3D globally linearized mapping, where each thread is assigned a work item according to + its location in the global launch grid.""" + + def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]: + match ispace: + case FullIterationSpace(): + return self._dense_mapping(ispace) + case SparseIterationSpace(): + return self._sparse_mapping(ispace) + case _: + assert False, "unexpected iteration space" + + def _dense_mapping( + self, ispace: FullIterationSpace + ) -> dict[PsSymbol, PsExpression]: + if ispace.rank > 3: + raise MaterializationError( + f"Cannot handle {ispace.rank}-dimensional iteration space " + "using the Linear3D GPU thread index mapping." + ) + + dimensions = ispace.dimensions_in_loop_order() + idx_map: dict[PsSymbol, PsExpression] = dict() + + for coord, dim in enumerate(dimensions[::-1]): + tid = self._linear_thread_idx(coord) + idx_map[dim.counter] = dim.start + dim.step * PsCast( + deconstify(dim.counter.get_dtype()), tid + ) + + return idx_map + + def _sparse_mapping( + self, ispace: SparseIterationSpace + ) -> dict[PsSymbol, PsExpression]: + sparse_ctr = PsExpression.make(ispace.sparse_counter) + thread_idx = self._linear_thread_idx(0) + idx_map: dict[PsSymbol, PsExpression] = { + ispace.sparse_counter: PsCast( + deconstify(sparse_ctr.get_dtype()), thread_idx + ) + } + return idx_map + + def _linear_thread_idx(self, coord: int): + block_size = BLOCK_DIM[coord] + block_idx = BLOCK_IDX[coord] + thread_idx = THREAD_IDX[coord] + return block_idx * block_size + thread_idx + + +class Blockwise4DMapping(ThreadMapping): + """Blockwise index mapping for up to 4D iteration spaces, where the outer three dimensions + are mapped to block indices.""" + + _indices_fastest_first = [ # slowest to fastest + THREAD_IDX[0], + BLOCK_IDX[0], + BLOCK_IDX[1], + BLOCK_IDX[2] + ] + + def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]: + match ispace: + case FullIterationSpace(): + return self._dense_mapping(ispace) + case SparseIterationSpace(): + return self._sparse_mapping(ispace) + case _: + assert False, "unexpected iteration space" + + def _dense_mapping( + self, ispace: FullIterationSpace + ) -> dict[PsSymbol, PsExpression]: + if ispace.rank > 4: + raise MaterializationError( + f"Cannot handle {ispace.rank}-dimensional iteration space " + "using the Blockwise4D GPU thread index mapping." + ) + + dimensions = ispace.dimensions_in_loop_order() + idx_map: dict[PsSymbol, PsExpression] = dict() + + for dim, tid in zip(dimensions[::-1], self._indices_fastest_first): + idx_map[dim.counter] = dim.start + dim.step * PsCast( + deconstify(dim.counter.get_dtype()), tid + ) + + return idx_map + + def _sparse_mapping( + self, ispace: SparseIterationSpace + ) -> dict[PsSymbol, PsExpression]: + sparse_ctr = PsExpression.make(ispace.sparse_counter) + thread_idx = self._indices_fastest_first[0] + idx_map: dict[PsSymbol, PsExpression] = { + ispace.sparse_counter: PsCast( + deconstify(sparse_ctr.get_dtype()), thread_idx + ) + } + return idx_map + class GenericGpu(Platform): - """Base class for GPU platforms.""" + """Common base platform for CUDA- and HIP-type GPU targets. 
+
+    Args:
+        ctx: The kernel creation context
+        thread_mapping: Callback object which defines the mapping of thread indices onto iteration space points
+    """
+
+    def __init__(
+        self,
+        ctx: KernelCreationContext,
+        thread_mapping: ThreadMapping | None = None,
+    ) -> None:
+        super().__init__(ctx)
+
+        self._thread_mapping = (
+            thread_mapping if thread_mapping is not None else Linear3DMapping()
+        )
+
+        self._typify = Typifier(ctx)
+
+    def materialize_iteration_space(
+        self, body: PsBlock, ispace: IterationSpace
+    ) -> PsBlock:
+        if isinstance(ispace, FullIterationSpace):
+            return self._prepend_dense_translation(body, ispace)
+        elif isinstance(ispace, SparseIterationSpace):
+            return self._prepend_sparse_translation(body, ispace)
+        else:
+            raise MaterializationError(f"Unknown type of iteration space: {ispace}")
+
+    def select_function(self, call: PsCall) -> PsExpression:
+        assert isinstance(call.function, PsMathFunction)
+
+        func = call.function.func
+        dtype = call.get_dtype()
+        arg_types = (dtype,) * func.num_args
+
+        if isinstance(dtype, PsIeeeFloatType):
+            match func:
+                case (
+                    MathFunctions.Exp
+                    | MathFunctions.Log
+                    | MathFunctions.Sin
+                    | MathFunctions.Cos
+                    | MathFunctions.Sqrt
+                    | MathFunctions.Ceil
+                    | MathFunctions.Floor
+                ) if dtype.width in (16, 32, 64):
+                    prefix = "h" if dtype.width == 16 else ""
+                    suffix = "f" if dtype.width == 32 else ""
+                    name = f"{prefix}{func.function_name}{suffix}"
+                    cfunc = CFunction(name, arg_types, dtype)
+
+                case (
+                    MathFunctions.Pow
+                    | MathFunctions.Tan
+                    | MathFunctions.Sinh
+                    | MathFunctions.Cosh
+                    | MathFunctions.ASin
+                    | MathFunctions.ACos
+                    | MathFunctions.ATan
+                    | MathFunctions.ATan2
+                ) if dtype.width in (32, 64):
+                    # These are unavailable for fp16
+                    suffix = "f" if dtype.width == 32 else ""
+                    name = f"{func.function_name}{suffix}"
+                    cfunc = CFunction(name, arg_types, dtype)
+
+                case (
+                    MathFunctions.Min | MathFunctions.Max | MathFunctions.Abs
+                ) if dtype.width in (32, 64):
+                    suffix = "f" if dtype.width == 32 else ""
+                    name = f"f{func.function_name}{suffix}"
+                    cfunc = CFunction(name, arg_types, dtype)
+
+                case MathFunctions.Abs if dtype.width == 16:
+                    cfunc = CFunction(" __habs", arg_types, dtype)
+
+                case _:
+                    raise MaterializationError(
+                        f"Cannot materialize call to function {func}"
+                    )
+
+            call.function = cfunc
+            return call
+
+        raise MaterializationError(
+            f"No implementation available for function {func} on data type {dtype}"
+        )
+
+    # Internals
+
+    def _prepend_dense_translation(
+        self, body: PsBlock, ispace: FullIterationSpace
+    ) -> PsBlock:
+        ctr_mapping = self._thread_mapping(ispace)
+
+        indexing_decls = []
+        conds = []
+
+        dimensions = ispace.dimensions_in_loop_order()
+
+        for dim in dimensions:
+            # counter declarations must be ordered slowest-to-fastest
+            # such that inner dimensions can depend on outer ones
+
+            dim.counter.dtype = constify(dim.counter.get_dtype())
+
+            ctr_expr = PsExpression.make(dim.counter)
+            indexing_decls.append(
+                self._typify(PsDeclaration(ctr_expr, ctr_mapping[dim.counter]))
+            )
+            conds.append(PsLt(ctr_expr, dim.stop))
+
+        condition: PsExpression = conds[0]
+        for cond in conds[1:]:
+            condition = PsAnd(condition, cond)
+        ast = PsBlock(indexing_decls + [PsConditional(condition, body)])
+
+        return ast
+
+    def _prepend_sparse_translation(
+        self, body: PsBlock, ispace: SparseIterationSpace
+    ) -> PsBlock:
+        factory = AstFactory(self._ctx)
+        ispace.sparse_counter.dtype = constify(ispace.sparse_counter.get_dtype())
+
+        sparse_ctr_expr = PsExpression.make(ispace.sparse_counter)
+        ctr_mapping = self._thread_mapping(ispace)
+
+        sparse_idx_decl = self._typify(
+            PsDeclaration(sparse_ctr_expr, ctr_mapping[ispace.sparse_counter])
+        )
+
+        mappings = [
+            PsDeclaration(
+                PsExpression.make(ctr),
+                PsLookup(
+                    PsBufferAcc(
+                        ispace.index_list.base_pointer,
+                        (sparse_ctr_expr.clone(), factory.parse_index(0)),
+                    ),
+                    coord.name,
+                ),
+            )
+            for ctr, coord in zip(ispace.spatial_indices, ispace.coordinate_members)
+        ]
+        body.statements = mappings + body.statements
+
+        stop = PsExpression.make(ispace.index_list.shape[0])
+        condition = PsLt(sparse_ctr_expr.clone(), stop)
+        ast = PsBlock([sparse_idx_decl, PsConditional(condition, body)])
+
+        return ast
diff --git a/src/pystencils/backend/platforms/hip.py b/src/pystencils/backend/platforms/hip.py
new file mode 100644
index 0000000000000000000000000000000000000000..c758995a0d9f8fbbb2e9e424bf2cfa6ab7eca086
--- /dev/null
+++ b/src/pystencils/backend/platforms/hip.py
@@ -0,0 +1,11 @@
+from __future__ import annotations
+
+from .generic_gpu import GenericGpu
+
+
+class HipPlatform(GenericGpu):
+    """Platform for the HIP GPU target."""
+
+    @property
+    def required_headers(self) -> set[str]:
+        return {'"pystencils_runtime/hip.h"'}
diff --git a/src/pystencils/boundaries/boundaryhandling.py b/src/pystencils/boundaries/boundaryhandling.py
index 1f6e3d126365de0af08ee98ddd26d1600af15027..58340c3e0fbb16b98af2cf08c3d1894ca34a2309 100644
--- a/src/pystencils/boundaries/boundaryhandling.py
+++ b/src/pystencils/boundaries/boundaryhandling.py
@@ -123,7 +123,7 @@ class BoundaryHandling:
         class_ = self.IndexFieldBlockData
         class_.to_cpu = to_cpu
         class_.to_gpu = to_gpu
-        gpu = self._target in data_handling._GPU_LIKE_TARGETS
+        gpu = self._target.is_gpu()
         data_handling.add_custom_class(self._index_array_name, class_, cpu=True, gpu=gpu)

     @property
@@ -240,7 +240,7 @@ class BoundaryHandling:
         if self._dirty:
             self.prepare()

-        for b in self._data_handling.iterate(gpu=self._target in self._data_handling._GPU_LIKE_TARGETS):
+        for b in self._data_handling.iterate(gpu=self._target.is_gpu()):
             for b_obj, idx_arr in b[self._index_array_name].boundary_object_to_index_list.items():
                 kwargs[self._field_name] = b[self._field_name]
                 kwargs['indexField'] = idx_arr
@@ -255,7 +255,7 @@ class BoundaryHandling:
         if self._dirty:
             self.prepare()

-        for b in self._data_handling.iterate(gpu=self._target in self._data_handling._GPU_LIKE_TARGETS):
+        for b in self._data_handling.iterate(gpu=self._target.is_gpu()):
             for b_obj, idx_arr in b[self._index_array_name].boundary_object_to_index_list.items():
                 arguments = kwargs.copy()
                 arguments[self._field_name] = b[self._field_name]
@@ -341,7 +341,7 @@ class BoundaryHandling:
     def _boundary_data_initialization(self, boundary_obj, boundary_data_setter, **kwargs):
         if boundary_obj.additional_data_init_callback:
             boundary_obj.additional_data_init_callback(boundary_data_setter, **kwargs)
-        if self._target in self._data_handling._GPU_LIKE_TARGETS:
+        if self._target.is_gpu():
             self._data_handling.to_gpu(self._index_array_name)


 class BoundaryInfo(object):
diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py
index d6f8e403cdce0c99853ee1127567e11c14b92b32..8e7e54ff1125a8bba2ba35c223277ee2867c28b7 100644
--- a/src/pystencils/codegen/config.py
+++ b/src/pystencils/codegen/config.py
@@ -395,10 +395,12 @@ class GpuOptions(ConfigBase):
     """

     @staticmethod
-    def default_warp_size(target: Target):
+    def default_warp_size(target: Target) -> int | None:
         match target:
             case Target.CUDA:
                 return 32
+            case Target.HIP:
+                return None
             case _:
                 raise NotImplementedError(
                     f"No default warp/wavefront size known for target {target}"
                 )
@@ -594,6 +596,8 @@ class CreateKernelConfig(ConfigBase):
         match t:
             case Target.CurrentCPU:
                 return Target.auto_cpu()
+            case Target.CurrentGPU:
+                return Target.auto_gpu()
             case _:
                 return t

@@ -601,12 +605,14 @@ class CreateKernelConfig(ConfigBase):
         """Returns either the user-specified JIT compiler, or infers one from the target if none is given."""
         jit: JitBase | None = self.get_option("jit")

+        target = self.get_target()
+
         if jit is None:
-            if self.get_target().is_cpu():
+            if target.is_cpu():
                 from ..jit import LegacyCpuJit

                 return LegacyCpuJit()
-            elif self.get_target() == Target.CUDA:
+            elif target == Target.CUDA or target == Target.HIP:
                 try:
                     from ..jit.gpu_cupy import CupyJit

@@ -617,7 +623,7 @@ class CreateKernelConfig(ConfigBase):

                     return no_jit

-            elif self.get_target() == Target.SYCL:
+            elif target == Target.SYCL:
                 from ..jit import no_jit

                 return no_jit
diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index 65352f5205125b05acbd088bc42e9205cd7eb415..e9fc69b76b3d88024f9cbde880617d6a3a3696ff 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 from typing import cast, Sequence, Callable, TYPE_CHECKING
 from dataclasses import dataclass, replace
+from warnings import warn

 from .target import Target
 from .config import (
@@ -399,7 +400,7 @@ class DefaultKernelCreationDriver:
         return kernel_ast

     def _get_gpu_indexing(self) -> GpuIndexing | None:
-        if self._target != Target.CUDA:
+        if not self._target.is_gpu():
             return None

         idx_scheme: GpuIndexingScheme = self._cfg.gpu.get_option("indexing_scheme")
@@ -410,6 +411,9 @@ class DefaultKernelCreationDriver:
         if warp_size is None:
             warp_size = GpuOptions.default_warp_size(self._target)

+        if warp_size is None and assume_warp_aligned_block_size:
+            warn("GPU warp size is unknown - ignoring assumption `assume_warp_aligned_block_size`.")
+
         return GpuIndexing(
             self._ctx,
             self._target,
@@ -447,20 +451,26 @@ class DefaultKernelCreationDriver:
             )

         elif self._target.is_gpu():
+            thread_mapping = (
+                self._gpu_indexing.get_thread_mapping()
+                if self._gpu_indexing is not None
+                else None
+            )
+
+            GpuPlatform: type
             match self._target:
                 case Target.CUDA:
-                    from ..backend.platforms import CudaPlatform
-
-                    thread_mapping = (
-                        self._gpu_indexing.get_thread_mapping()
-                        if self._gpu_indexing is not None
-                        else None
-                    )
-
-                    return CudaPlatform(
-                        self._ctx,
-                        thread_mapping=thread_mapping,
-                    )
+                    from ..backend.platforms import CudaPlatform as GpuPlatform
+                case Target.HIP:
+                    from ..backend.platforms import HipPlatform as GpuPlatform
+                case _:
+                    assert False, f"unexpected GPU target: {self._target}"
+
+            return GpuPlatform(
+                self._ctx,
+                thread_mapping=thread_mapping,
+            )

         elif self._target == Target.SYCL:
             from ..backend.platforms import SyclPlatform
diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index d473e5b4a7408d2be77f0c4f2bc3b55a5ca9df0a..43b612bd77f535ef45c09c38489784dd12a53ce0 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -17,7 +17,7 @@ from ..backend.kernelcreation import (
     FullIterationSpace,
     SparseIterationSpace,
 )
-from ..backend.platforms.cuda import ThreadMapping
+from ..backend.platforms.generic_gpu import ThreadMapping

 from ..backend.ast.expressions import PsExpression, PsIntDiv
 from math import prod
@@ -30,14 +30,11 @@ _Dim3Lambda = tuple[Lambda, Lambda, Lambda]

 @dataclass
 class HardwareProperties:
-    warp_size: int
+    warp_size: int | None
     max_threads_per_block: int
     max_block_sizes: dim3

-    def block_size_exceeds_hw_limits(
-        self,
-        block_size: tuple[int, ...]
-    ) -> bool:
+    def block_size_exceeds_hw_limits(self, block_size: tuple[int, ...]) -> bool:
         """Checks if provided block size conforms limits given by the hardware."""

         return (
@@ -106,8 +103,10 @@ class GpuLaunchConfiguration(ABC):

     @staticmethod
     def _excessive_block_size_error_msg(block_size: tuple[int, ...]):
-        return f"Unable to determine GPU block size for this kernel. \
-            Final block size was too large: {block_size}."
+        return (
+            "Unable to determine GPU block size for this kernel. "
+            f"Final block size was too large: {block_size}."
+        )


 class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
@@ -139,7 +138,9 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration):

     @block_size.setter
     def block_size(self, val: dim3):
-        AttributeError("Setting `block_size` on an automatic launch configuration has no effect.")
+        raise AttributeError(
+            "Setting `block_size` on an automatic launch configuration has no effect."
+        )

     @property
     def parameters(self) -> frozenset[Parameter]:
@@ -203,6 +204,7 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration):

         if (
             self._assume_warp_aligned_block_size
+            and self._hw_props.warp_size is not None
             and prod(self._block_size) % self._hw_props.warp_size != 0
         ):
             raise CodegenError(
@@ -289,6 +291,10 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
     def parameters(self) -> frozenset[Parameter]:
         """Parameters of this launch configuration"""
         return self._params
+
+    @property
+    def default_block_size(self) -> dim3:
+        return self._default_block_size

     @property
     def block_size(self) -> dim3 | None:
@@ -297,7 +303,9 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):

     @block_size.setter
     def block_size(self, val: dim3):
-        AttributeError("Setting `block_size` on an dynamic launch configuration has no effect.")
+        raise AttributeError(
+            "Setting `block_size` on a dynamic launch configuration has no effect."
+ ) @staticmethod def _round_block_sizes_to_warp_size( @@ -348,7 +356,8 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): if ( self._assume_warp_aligned_block_size - and prod(ret) % self._hw_props.warp_size != 0 + and hw_props.warp_size is not None + and prod(ret) % hw_props.warp_size != 0 ): self._round_block_sizes_to_warp_size(ret, hw_props.warp_size) @@ -384,6 +393,10 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): return ret trimmed = trim(list(block_size)) + + if hw_props.warp_size is None: + return tuple(trimmed) + if ( prod(trimmed) >= hw_props.warp_size and prod(trimmed) % hw_props.warp_size == 0 @@ -490,14 +503,13 @@ class GpuIndexing: ctx: KernelCreationContext, target: Target, scheme: GpuIndexingScheme, - warp_size: int, + warp_size: int | None, manual_launch_grid: bool = False, assume_warp_aligned_block_size: bool = False, ) -> None: self._ctx = ctx self._target = target self._scheme = scheme - self._warp_size = warp_size self._manual_launch_grid = manual_launch_grid self._assume_warp_aligned_block_size = assume_warp_aligned_block_size @@ -518,6 +530,8 @@ class GpuIndexing: match target: case Target.CUDA: return (1024, 1024, 64) + case Target.HIP: + return (1024, 1024, 1024) case _: raise CodegenError( f"Cannot determine max GPU block sizes for target {target}" @@ -526,7 +540,7 @@ class GpuIndexing: @staticmethod def get_max_threads_per_block(target: Target): match target: - case Target.CUDA: + case Target.CUDA | Target.HIP: return 1024 case _: raise CodegenError( @@ -536,7 +550,7 @@ class GpuIndexing: def get_thread_mapping(self) -> ThreadMapping: """Retrieve a thread mapping object for use by the backend""" - from ..backend.platforms.cuda import Linear3DMapping, Blockwise4DMapping + from ..backend.platforms.generic_gpu import Linear3DMapping, Blockwise4DMapping match self._scheme: case GpuIndexingScheme.Linear3D: @@ -603,11 +617,20 @@ class GpuIndexing: # impossible to use block size determination function since the iteration space is unknown # -> round block size in fastest moving dimension up to multiple of warp size rounded_block_size: PsExpression - if self._assume_warp_aligned_block_size: + if ( + self._assume_warp_aligned_block_size + and self._hw_props.warp_size is not None + ): warp_size = self._ast_factory.parse_index(self._hw_props.warp_size) rounded_block_size = self._ast_factory.parse_index( - PsIntDiv(work_items[0].clone() + warp_size.clone() - self._ast_factory.parse_index(1), - warp_size.clone()) * warp_size.clone()) + PsIntDiv( + work_items[0].clone() + + warp_size.clone() + - self._ast_factory.parse_index(1), + warp_size.clone(), + ) + * warp_size.clone() + ) else: rounded_block_size = work_items[0] diff --git a/src/pystencils/codegen/target.py b/src/pystencils/codegen/target.py index 0d724b87730f0ec327772bccbb55a8bfff7c8ddd..5e214430cdd828768ab468cba04d1f9f5e69aeb9 100644 --- a/src/pystencils/codegen/target.py +++ b/src/pystencils/codegen/target.py @@ -30,6 +30,7 @@ class Target(Flag): _GPU = auto() _CUDA = auto() + _HIP = auto() _SYCL = auto() @@ -86,8 +87,14 @@ class Target(Flag): Generate a CUDA kernel for a generic Nvidia GPU. """ - GPU = CUDA - """Alias for `Target.CUDA`, for backward compatibility.""" + HIP = _GPU | _HIP + """Generic HIP GPU target. + + Generate a HIP kernel for generic AMD or NVidia GPUs. + """ + + GPU = CurrentGPU + """Alias for `Target.CurrentGPU`, for backward compatibility.""" SYCL = _SYCL """SYCL kernel target. 
@@ -99,15 +106,24 @@ class Target(Flag):
     """

     def is_automatic(self) -> bool:
+        """Determine if this target is a proxy target that is automatically resolved
+        according to the runtime environment."""
         return Target._AUTOMATIC in self

     def is_cpu(self) -> bool:
+        """Determine if this target is a CPU target."""
         return Target._CPU in self

     def is_vector_cpu(self) -> bool:
+        """Determine if this target is a vector CPU target."""
        return self.is_cpu() and Target._VECTOR in self

     def is_gpu(self) -> bool:
+        """Determine if this target is a GPU target.
+
+        This refers to targets for the CUDA and HIP family of platforms.
+        `Target.SYCL` is *not* a GPU target.
+        """
         return Target._GPU in self

     @staticmethod
@@ -119,12 +135,30 @@ class Target(Flag):
         else:
             return Target.GenericCPU

+    @staticmethod
+    def auto_gpu() -> Target:
+        """Return the GPU target available in the current runtime environment.
+
+        Raises:
+            RuntimeError: If `cupy` is not installed and therefore no GPU runtime is available.
+        """
+        try:
+            import cupy
+
+            if cupy.cuda.runtime.is_hip:
+                return Target.HIP
+            else:
+                return Target.CUDA
+        except ImportError:
+            raise RuntimeError("Cannot infer GPU target since cupy is not installed.")
+
     @staticmethod
     def available_targets() -> list[Target]:
+        """List all targets that are available in the current runtime environment."""
         targets = [Target.GenericCPU]
         try:
             import cupy  # noqa: F401

-            targets.append(Target.CUDA)
+            targets.append(Target.auto_gpu())
         except ImportError:
             pass
diff --git a/src/pystencils/datahandling/datahandling_interface.py b/src/pystencils/datahandling/datahandling_interface.py
index 867bbf062d7307187c6a72f465fe54177bbfacc1..a6b1fcb559f183f3e82d7a68db8c75423df8c520 100644
--- a/src/pystencils/datahandling/datahandling_interface.py
+++ b/src/pystencils/datahandling/datahandling_interface.py
@@ -17,8 +17,6 @@ class DataHandling(ABC):
     'gather' function that has collects (parts of the) distributed data on a single process.
""" - _GPU_LIKE_TARGETS = [Target.GPU] - # ---------------------------- Adding and accessing data ----------------------------------------------------------- @property @abstractmethod diff --git a/src/pystencils/datahandling/serial_datahandling.py b/src/pystencils/datahandling/serial_datahandling.py index 73b749ca46171c86f6183789bff2731efb8a1a5d..dc6904c3a59cc7e86746718a226e9738ad9db1a0 100644 --- a/src/pystencils/datahandling/serial_datahandling.py +++ b/src/pystencils/datahandling/serial_datahandling.py @@ -110,7 +110,7 @@ class SerialDataHandling(DataHandling): if layout is None: layout = self.default_layout if gpu is None: - gpu = self.default_target in self._GPU_LIKE_TARGETS + gpu = self.default_target.is_gpu() kwargs = { 'shape': tuple(s + 2 * ghost_layers for s in self._domainSize), @@ -241,7 +241,7 @@ class SerialDataHandling(DataHandling): def swap(self, name1, name2, gpu=None): if gpu is None: - gpu = self.default_target in self._GPU_LIKE_TARGETS + gpu = self.default_target.is_gpu() arr = self.gpu_arrays if gpu else self.cpu_arrays arr[name1], arr[name2] = arr[name2], arr[name1] @@ -292,7 +292,7 @@ class SerialDataHandling(DataHandling): if target is None: target = self.default_target - if not (target.is_cpu() or target == Target.CUDA): + if not (target.is_cpu() or target.is_gpu()): raise ValueError(f"Unsupported target: {target}") if not hasattr(names, '__len__') or type(names) is str: diff --git a/src/pystencils/enums.py b/src/pystencils/enums.py index bcea50e84cfba9190a2353245c7c29168443ca13..9d6470ed71cbea42b50564dadeca6fa9943e5506 100644 --- a/src/pystencils/enums.py +++ b/src/pystencils/enums.py @@ -5,7 +5,8 @@ from warnings import warn warn( "Importing anything from `pystencils.enums` is deprecated and the module will be removed in pystencils 2.1. " "Import from `pystencils` instead.", - FutureWarning + FutureWarning, + stacklevel=2, ) Target = _Target diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py index 4e36e369e3b65f3f6e8c8f268feae9e16e1b6b56..2217809618ec9a161d5e2d6f8c3c374e399e0471 100644 --- a/src/pystencils/jit/gpu_cupy.py +++ b/src/pystencils/jit/gpu_cupy.py @@ -207,18 +207,25 @@ class CupyKernelWrapper(KernelWrapper): class CupyJit(JitBase): - def __init__(self): - self._runtime_headers = {"<cstdint>"} - def compile(self, kernel: Kernel) -> KernelWrapper: if not HAVE_CUPY: raise JitError( "`cupy` is not installed: just-in-time-compilation of CUDA kernels is unavailable." ) - if not isinstance(kernel, GpuKernel) or kernel.target != Target.CUDA: - raise ValueError( - "The CupyJit just-in-time compiler only accepts kernels generated for CUDA or HIP" + if not isinstance(kernel, GpuKernel): + raise JitError( + "The CupyJit just-in-time compiler only accepts GPU kernels generated for CUDA or HIP" + ) + + if kernel.target == Target.CUDA and cp.cuda.runtime.is_hip: + raise JitError( + "Cannot compile a CUDA kernel on a HIP-based Cupy installation." + ) + + if kernel.target == Target.HIP and not cp.cuda.runtime.is_hip: + raise JitError( + "Cannot compile a HIP kernel on a CUDA-based Cupy installation." 
) options = self._compiler_options() @@ -237,7 +244,13 @@ class CupyJit(JitBase): return tuple(options) def _prelude(self, kfunc: GpuKernel) -> str: - headers = self._runtime_headers + + headers: set[str] + if cp.cuda.runtime.is_hip: + headers = set() + else: + headers = {"<cstdint>"} + headers |= kfunc.required_headers if '"pystencils_runtime/half.h"' in headers: diff --git a/tests/fixtures.py b/tests/fixtures.py index ba2593f76f4fae81d7e785fdb5f7b0c9a4639c28..a4c77f550e228ecfeb7b4e61b95bd2dce9739f9f 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -18,16 +18,7 @@ from types import ModuleType import pystencils as ps -AVAILABLE_TARGETS = [ps.Target.GenericCPU] - -try: - import cupy - - AVAILABLE_TARGETS += [ps.Target.CUDA] -except ImportError: - pass - -AVAILABLE_TARGETS += ps.Target.available_vector_cpu_targets() +AVAILABLE_TARGETS = ps.Target.available_targets() TARGET_IDS = [t.name for t in AVAILABLE_TARGETS] @@ -72,9 +63,9 @@ def xp(target: ps.Target) -> ModuleType: """Primary array module for the current target. Returns: - `cupy` if `target == Target.CUDA`, and `numpy` otherwise + `cupy` if `target.is_gpu()`, and `numpy` otherwise """ - if target == ps.Target.CUDA: + if target.is_gpu(): import cupy as xp return xp diff --git a/tests/kernelcreation/test_buffer_gpu.py b/tests/kernelcreation/test_buffer_gpu.py index 0b5019fba49ba439253ef1257b3c2f12a728d064..bd9d2156b451e10f7a91ed21b76a64e60af9bd03 100644 --- a/tests/kernelcreation/test_buffer_gpu.py +++ b/tests/kernelcreation/test_buffer_gpu.py @@ -58,7 +58,7 @@ def test_full_scalar_field(): pack_eqs = [Assignment(buffer.center(), src_field.center())] - config = CreateKernelConfig(target=pystencils.Target.GPU) + config = CreateKernelConfig(target=pystencils.Target.CurrentGPU) pack_ast = create_kernel(pack_eqs, config=config) pack_kernel = pack_ast.compile() @@ -66,7 +66,7 @@ def test_full_scalar_field(): unpack_eqs = [Assignment(dst_field.center(), buffer.center())] - config = CreateKernelConfig(target=pystencils.Target.GPU) + config = CreateKernelConfig(target=pystencils.Target.CurrentGPU) unpack_ast = create_kernel(unpack_eqs, config=config) unpack_kernel = unpack_ast.compile() @@ -94,7 +94,7 @@ def test_field_slice(): pack_eqs = [Assignment(buffer.center(), src_field.center())] - config = CreateKernelConfig(target=pystencils.Target.GPU) + config = CreateKernelConfig(target=pystencils.Target.CurrentGPU) pack_ast = create_kernel(pack_eqs, config=config) pack_kernel = pack_ast.compile() @@ -103,7 +103,7 @@ def test_field_slice(): # Unpack into ghost layer of dst_field in N direction unpack_eqs = [Assignment(dst_field.center(), buffer.center())] - config = CreateKernelConfig(target=pystencils.Target.GPU) + config = CreateKernelConfig(target=pystencils.Target.CurrentGPU) unpack_ast = create_kernel(unpack_eqs, config=config) unpack_kernel = unpack_ast.compile() @@ -131,7 +131,7 @@ def test_all_cell_values(): eq = Assignment(buffer(idx), src_field(idx)) pack_eqs.append(eq) - config = CreateKernelConfig(target=pystencils.Target.GPU) + config = CreateKernelConfig(target=pystencils.Target.CurrentGPU) pack_code = create_kernel(pack_eqs, config=config) pack_kernel = pack_code.compile() @@ -143,7 +143,7 @@ def test_all_cell_values(): eq = Assignment(dst_field(idx), buffer(idx)) unpack_eqs.append(eq) - config = CreateKernelConfig(target=pystencils.Target.GPU) + config = CreateKernelConfig(target=pystencils.Target.CurrentGPU) unpack_ast = create_kernel(unpack_eqs, config=config) unpack_kernel = unpack_ast.compile() 
unpack_kernel(buffer=gpu_buffer_arr, dst_field=gpu_dst_arr) @@ -173,7 +173,7 @@ def test_subset_cell_values(): pack_eqs.append(eq) pack_types = {'src_field': gpu_src_arr.dtype, 'buffer': gpu_buffer_arr.dtype} - config = CreateKernelConfig(target=pystencils.Target.GPU) + config = CreateKernelConfig(target=pystencils.Target.CurrentGPU) pack_ast = create_kernel(pack_eqs, config=config) pack_kernel = pack_ast.compile() pack_kernel(buffer=gpu_buffer_arr, src_field=gpu_src_arr) @@ -185,7 +185,7 @@ def test_subset_cell_values(): unpack_eqs.append(eq) unpack_types = {'dst_field': gpu_dst_arr.dtype, 'buffer': gpu_buffer_arr.dtype} - config = CreateKernelConfig(target=pystencils.Target.GPU) + config = CreateKernelConfig(target=pystencils.Target.CurrentGPU) unpack_ast = create_kernel(unpack_eqs, config=config) unpack_kernel = unpack_ast.compile() @@ -215,7 +215,7 @@ def test_field_layouts(): pack_eqs.append(eq) pack_types = {'src_field': gpu_src_arr.dtype, 'buffer': gpu_buffer_arr.dtype} - config = CreateKernelConfig(target=pystencils.Target.GPU) + config = CreateKernelConfig(target=pystencils.Target.CurrentGPU) pack_ast = create_kernel(pack_eqs, config=config) pack_kernel = pack_ast.compile() @@ -228,7 +228,7 @@ def test_field_layouts(): unpack_eqs.append(eq) unpack_types = {'dst_field': gpu_dst_arr.dtype, 'buffer': gpu_buffer_arr.dtype} - config = CreateKernelConfig(target=pystencils.Target.GPU) + config = CreateKernelConfig(target=pystencils.Target.CurrentGPU) unpack_ast = create_kernel(unpack_eqs, config=config) unpack_kernel = unpack_ast.compile() @@ -299,7 +299,7 @@ def test_iteration_slices(gpu_indexing): gpu_src_arr.set(src_arr) gpu_dst_arr.fill(0) - config = CreateKernelConfig(target=Target.GPU, iteration_slice=pack_slice) + config = CreateKernelConfig(target=Target.CurrentGPU, iteration_slice=pack_slice) pack_code = create_kernel(pack_eqs, config=config) pack_kernel = pack_code.compile() @@ -311,7 +311,7 @@ def test_iteration_slices(gpu_indexing): eq = Assignment(dst_field(idx), buffer(idx)) unpack_eqs.append(eq) - config = CreateKernelConfig(target=Target.GPU, iteration_slice=pack_slice) + config = CreateKernelConfig(target=Target.CurrentGPU, iteration_slice=pack_slice) unpack_code = create_kernel(unpack_eqs, config=config) unpack_kernel = unpack_code.compile() diff --git a/tests/kernelcreation/test_functions.py b/tests/kernelcreation/test_functions.py index a4d154d4b0c86ea694bbe94f66372aa2ba3a190c..182a590056d68a9677a657877416574db9f81e25 100644 --- a/tests/kernelcreation/test_functions.py +++ b/tests/kernelcreation/test_functions.py @@ -106,14 +106,14 @@ def function_domain(function_name, dtype): case "pow": return np.concatenate( [ - [0., 1., 1.], - rng.uniform(-1., 1., 8), - rng.uniform(0., 5., 8), + [0.0, 1.0, 1.0], + rng.uniform(-1.0, 1.0, 8), + rng.uniform(0.0, 5.0, 8), ] ).astype(dtype), np.concatenate( [ - [1., 0., 2.], - np.arange(2., 10., 1.), + [1.0, 0.0, 2.0], + np.arange(2.0, 10.0, 1.0), rng.uniform(-2.0, 2.0, 8), ] ).astype( @@ -211,14 +211,14 @@ def test_binary_functions(gen_config, xp, function_name, dtype, function_domain) dtype_and_target_for_integer_funcs = pytest.mark.parametrize( "dtype, target", - list(product([np.int32], [t for t in AVAIL_TARGETS if t is not Target.CUDA])) + list(product([np.int32], [t for t in AVAIL_TARGETS if not t.is_gpu()])) + list( product( [np.int64], [ t for t in AVAIL_TARGETS - if t not in (Target.X86_SSE, Target.X86_AVX, Target.CUDA) + if t not in (Target.X86_SSE, Target.X86_AVX) and not t.is_gpu() ], ) ), diff --git 
diff --git a/tests/kernelcreation/test_functions.py b/tests/kernelcreation/test_functions.py
index a4d154d4b0c86ea694bbe94f66372aa2ba3a190c..182a590056d68a9677a657877416574db9f81e25 100644
--- a/tests/kernelcreation/test_functions.py
+++ b/tests/kernelcreation/test_functions.py
@@ -106,14 +106,14 @@ def function_domain(function_name, dtype):
         case "pow":
             return np.concatenate(
                 [
-                    [0., 1., 1.],
-                    rng.uniform(-1., 1., 8),
-                    rng.uniform(0., 5., 8),
+                    [0.0, 1.0, 1.0],
+                    rng.uniform(-1.0, 1.0, 8),
+                    rng.uniform(0.0, 5.0, 8),
                 ]
             ).astype(dtype), np.concatenate(
                 [
-                    [1., 0., 2.],
-                    np.arange(2., 10., 1.),
+                    [1.0, 0.0, 2.0],
+                    np.arange(2.0, 10.0, 1.0),
                     rng.uniform(-2.0, 2.0, 8),
                 ]
             ).astype(
@@ -211,14 +211,14 @@ def test_binary_functions(gen_config, xp, function_name, dtype, function_domain)
 
 dtype_and_target_for_integer_funcs = pytest.mark.parametrize(
     "dtype, target",
-    list(product([np.int32], [t for t in AVAIL_TARGETS if t is not Target.CUDA]))
+    list(product([np.int32], [t for t in AVAIL_TARGETS if not t.is_gpu()]))
     + list(
         product(
             [np.int64],
             [
                 t
                 for t in AVAIL_TARGETS
-                if t not in (Target.X86_SSE, Target.X86_AVX, Target.CUDA)
+                if t not in (Target.X86_SSE, Target.X86_AVX) and not t.is_gpu()
             ],
         )
     ),
diff --git a/tests/kernelcreation/test_gpu.py b/tests/kernelcreation/test_gpu.py
index 70b4edd35852b1d2ff674e6cb0421bf4fea66e1f..a3f8a5482ab24ecb53fb7f297a28930a6bc7deeb 100644
--- a/tests/kernelcreation/test_gpu.py
+++ b/tests/kernelcreation/test_gpu.py
@@ -49,7 +49,7 @@ def test_indexing_options_3d(
         + src[0, 0, 1],
     )
 
-    cfg = CreateKernelConfig(target=Target.CUDA)
+    cfg = CreateKernelConfig(target=Target.CurrentGPU)
     cfg.gpu.indexing_scheme = indexing_scheme
     cfg.gpu.manual_launch_grid = manual_grid
     cfg.gpu.assume_warp_aligned_block_size = assume_warp_aligned_block_size
@@ -90,10 +90,32 @@
 
     cp.testing.assert_allclose(dst_arr, expected)
 
-@pytest.mark.parametrize("iteration_space",
-                         [(8, 4, 4), (3, 8, 8), (3, 3, 16), (17, 3, 3), (3, 12, 56), (65, 65, 65), (3, 7, 9)])
-@pytest.mark.parametrize("initial_block_size",
-                         [(8, 4, 4), (3, 8, 8), (3, 3, 16), (2, 2, 64), (8, 2, 1), (3, 1, 32), (32, 1, 1), (1, 2, 3)])
+
+@pytest.mark.parametrize(
+    "iteration_space",
+    [
+        (8, 4, 4),
+        (1, 8, 8),
+        (1, 1, 16),
+        (17, 1, 1),
+        (1, 12, 56),
+        (65, 65, 65),
+        (1, 7, 9),
+    ],
+)
+@pytest.mark.parametrize(
+    "initial_block_size",
+    [
+        (8, 4, 4),
+        (1, 8, 8),
+        (1, 1, 16),
+        (2, 2, 64),
+        (8, 2, 1),
+        (3, 1, 32),
+        (32, 1, 1),
+        (1, 2, 3),
+    ],
+)
 @pytest.mark.parametrize("assume_warp_aligned_block_size", [True, False])
 @pytest.mark.parametrize("use_block_fitting", [True, False])
 def test_block_size_adaptations(
@@ -102,7 +124,13 @@
     assume_warp_aligned_block_size: bool,
     use_block_fitting: bool,
 ):
-    src, dst = fields("src, dst: [3D]")
+    field_shape = tuple(2 + x for x in iteration_space[::-1])
+    src_arr = cp.ones(field_shape)
+    dst_arr = cp.zeros_like(src_arr)
+
+    src = Field.create_from_numpy_array("src", src_arr)
+    dst = Field.create_from_numpy_array("dst", dst_arr)
+
     asm = Assignment(
         dst.center(),
         src[-1, 0, 0]
@@ -113,25 +141,21 @@
         + src[0, 0, 1],
     )
 
-    target = Target.CUDA
+    target = Target.CurrentGPU
     cfg = CreateKernelConfig(target=target)
     cfg.gpu.indexing_scheme = "linear3d"
     cfg.gpu.assume_warp_aligned_block_size = assume_warp_aligned_block_size
-
-    warp_size = cfg.gpu.default_warp_size(target)
-    max_threads_per_block = GpuIndexing.get_max_threads_per_block(target)
-    max_block_sizes = GpuIndexing.get_max_block_sizes(target)
+
+    warp_size = 32
+    cfg.gpu.warp_size = warp_size
 
     ast = create_kernel(asm, cfg)
     kernel = ast.compile()
 
     if use_block_fitting:
         # test internal block fitting function later used in `kernel.launch_config.fit_block_size`
-        internal_block_size = kernel.launch_config._fit_block_size_to_it_space(
-            iteration_space,
-            initial_block_size,
-            HardwareProperties(warp_size, max_threads_per_block, max_block_sizes),
-        )
+        kernel.launch_config.fit_block_size(initial_block_size)
+        internal_block_size, _ = kernel.launch_config.evaluate()
 
         # checks if criterion for warp size alignment is fulfilled
         def check_suitability(b):
@@ -139,25 +163,20 @@
 
         # block size fitting should not modify an already ideal configuration
         # -> check if ideal configurations are modified
-        if (
-            check_suitability(initial_block_size)
-            and all(x == y for x, y in zip(initial_block_size, iteration_space))  # trimming may alter results
-        ):
-            assert all(x == y for x, y in zip(initial_block_size, internal_block_size)), \
-                f"Initial block size unnecessarily adapted from {initial_block_size} to {internal_block_size}."
-
-        assert check_suitability(internal_block_size), \
-            "Determined block size shall be divisible by warp size."
-
-        # set block size via fitting algorithm
-        kernel.launch_config.fit_block_size(initial_block_size)
+        if check_suitability(initial_block_size) and all(
+            x == y for x, y in zip(initial_block_size, iteration_space)
+        ):  # trimming may alter results
+            assert all(
+                x == y for x, y in zip(initial_block_size, internal_block_size)
+            ), f"Initial block size unnecessarily adapted from {initial_block_size} to {internal_block_size}."
+
+        assert check_suitability(
+            internal_block_size
+        ), "Determined block size shall be divisible by warp size."
     else:
         # set block size via trimming algorithm
         kernel.launch_config.trim_block_size(initial_block_size)
 
-    src_arr = cp.ones(iteration_space)
-    dst_arr = cp.zeros_like(src_arr)
-
     kernel(src=src_arr, dst=dst_arr)
 
     expected = cp.zeros_like(src_arr)
@@ -173,15 +192,9 @@ def test_indexing_options_2d(
     indexing_scheme: str, manual_grid: bool, assume_warp_aligned_block_size: bool
 ):
     src, dst = fields("src, dst: [2D]")
-    asm = Assignment(
-        dst.center(),
-        src[-1, 0]
-        + src[1, 0]
-        + src[0, -1]
-        + src[0, 1]
-    )
+    asm = Assignment(dst.center(), src[-1, 0] + src[1, 0] + src[0, -1] + src[0, 1])
 
-    cfg = CreateKernelConfig(target=Target.CUDA)
+    cfg = CreateKernelConfig(target=Target.CurrentGPU)
     cfg.gpu.indexing_scheme = indexing_scheme
     cfg.gpu.manual_launch_grid = manual_grid
     cfg.gpu.assume_warp_aligned_block_size = assume_warp_aligned_block_size
@@ -227,7 +240,7 @@ def test_invalid_indexing_schemes():
     src, dst = fields("src, dst: [4D]")
     asm = Assignment(src.center(0), dst.center(0))
 
-    cfg = CreateKernelConfig(target=Target.CUDA)
+    cfg = CreateKernelConfig(target=Target.CurrentGPU)
     cfg.gpu.indexing_scheme = "linear3d"
 
     with pytest.raises(Exception):
@@ -342,7 +355,7 @@ def test_ghost_layer():
     ghost_layers = [(1, 2), (2, 1)]
 
     config = CreateKernelConfig()
-    config.target = Target.CUDA
+    config.target = Target.CurrentGPU
     config.ghost_layers = ghost_layers
     config.gpu.indexing_scheme = "blockwise4d"
@@ -371,7 +384,7 @@ def test_setting_value():
     update_rule = [Assignment(f(0), sp.Symbol("value"))]
 
     config = CreateKernelConfig()
-    config.target = Target.CUDA
+    config.target = Target.CurrentGPU
     config.iteration_slice = iteration_slice
     config.gpu.indexing_scheme = "blockwise4d"
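The block-size test above now drives the public launch-configuration interface instead of the private `_fit_block_size_to_it_space`. A sketch of that flow under the same assumptions as the test: a compiled GPU kernel, and `evaluate()` returning the resolved block size as its first element (the remaining element is ignored here, just as in the test):

```python
from pystencils import Assignment, CreateKernelConfig, Target, create_kernel, fields

src, dst = fields("src, dst: [3D]")
cfg = CreateKernelConfig(target=Target.CurrentGPU)
cfg.gpu.indexing_scheme = "linear3d"

kernel = create_kernel(Assignment(dst.center(), src.center()), cfg).compile()

# Request a block size; fitting rounds it toward the warp-size and
# iteration-space constraints, and evaluate() reports the result.
kernel.launch_config.fit_block_size((8, 4, 4))
internal_block_size, _ = kernel.launch_config.evaluate()

# kernel.launch_config.trim_block_size((32, 1, 1))  # else-branch alternative
```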
diff --git a/tests/kernelcreation/test_half_precision.py b/tests/kernelcreation/test_half_precision.py
index a9745459da19dbd206264980ed72987cc7879387..5dbe2180ed48e8d71c216fa015da15005fdee9d1 100644
--- a/tests/kernelcreation/test_half_precision.py
+++ b/tests/kernelcreation/test_half_precision.py
@@ -5,7 +5,7 @@ import numpy as np
 import pystencils as ps
 
 
-@pytest.mark.parametrize('target', (ps.Target.CPU, ps.Target.GPU))
+@pytest.mark.parametrize('target', (ps.Target.CPU, ps.Target.CurrentGPU))
 def test_half_precison(target):
     if target == ps.Target.CPU:
         if not platform.machine() in ['arm64', 'aarch64']:
@@ -14,7 +14,7 @@ def test_half_precison(target):
         if 'clang' not in ps.cpu.cpujit.get_compiler_config()['command']:
             pytest.xfail("skipping half precision because clang compiler is not used")
 
-    if target == ps.Target.GPU:
+    if target.is_gpu():
         pytest.importorskip("cupy")
 
     dh = ps.create_data_handling(domain_size=(10, 10), default_target=target)
diff --git a/tests/kernelcreation/test_index_kernels.py b/tests/kernelcreation/test_index_kernels.py
index 569c0ab6a0e582de895a66c656697fdf8a5909ee..bda0ef2732067ef58d682a4501b855ca936148a8 100644
--- a/tests/kernelcreation/test_index_kernels.py
+++ b/tests/kernelcreation/test_index_kernels.py
@@ -5,14 +5,7 @@ from pystencils import Assignment, Field, FieldType, AssignmentCollection, Targe
 from pystencils import create_kernel, CreateKernelConfig
 
 
-@pytest.mark.parametrize("target", [Target.CPU, Target.GPU])
-def test_indexed_kernel(target):
-    if target == Target.GPU:
-        cp = pytest.importorskip("cupy")
-        xp = cp
-    else:
-        xp = np
-
+def test_indexed_kernel(target, xp):
     arr = xp.zeros((3, 4))
 
     dtype = np.dtype([('x', int), ('y', int), ('value', arr.dtype)], align=True)
@@ -21,8 +14,8 @@ def test_indexed_kernel(target):
     cpu_index_arr[1] = (1, 3, 42.0)
     cpu_index_arr[2] = (2, 1, 5.0)
 
-    if target == Target.GPU:
-        gpu_index_arr = cp.empty(cpu_index_arr.shape, cpu_index_arr.dtype)
+    if target.is_gpu():
+        gpu_index_arr = xp.empty(cpu_index_arr.shape, cpu_index_arr.dtype)
         gpu_index_arr.set(cpu_index_arr)
         index_arr = gpu_index_arr
     else:
@@ -40,8 +33,8 @@ def test_indexed_kernel(target):
 
     kernel(f=arr, index=index_arr)
 
-    if target == Target.GPU:
-        arr = cp.asnumpy(arr)
+    if target.is_gpu():
+        arr = xp.asnumpy(arr)
 
     for i in range(cpu_index_arr.shape[0]):
         np.testing.assert_allclose(arr[cpu_index_arr[i]['x'], cpu_index_arr[i]['y']], cpu_index_arr[i]['value'], atol=1e-13)
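For reference, the structured index array driving `test_indexed_kernel` can be built standalone; this sketch mirrors the entries visible in the hunk above (the first record lies outside the hunk, so its value here is illustrative):

```python
import numpy as np

# One record per cell to touch: coordinates plus the value to write there.
dtype = np.dtype([("x", int), ("y", int), ("value", np.float64)], align=True)
cpu_index_arr = np.zeros((3,), dtype=dtype)
cpu_index_arr[0] = (0, 2, 3.0)  # not shown in the hunk; value chosen for illustration
cpu_index_arr[1] = (1, 3, 42.0)
cpu_index_arr[2] = (2, 1, 5.0)
```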
diff --git a/tests/kernelcreation/test_iteration_slices.py b/tests/kernelcreation/test_iteration_slices.py
index b1f2da576750b60e4fbb8d9d4d33e393bc00dcf3..2b3a8ebf0e2fbbd5ed94779b2eb764a6127fc030 100644
--- a/tests/kernelcreation/test_iteration_slices.py
+++ b/tests/kernelcreation/test_iteration_slices.py
@@ -144,7 +144,7 @@ def test_triangle_pattern(gen_config: CreateKernelConfig, xp):
     islice = make_slice[:, slow_counter:]
     gen_config = replace(gen_config, iteration_slice=islice)
 
-    if gen_config.target == Target.CUDA:
+    if gen_config.target.is_gpu():
         gen_config.gpu.manual_launch_grid = True
 
     kernel = create_kernel(update, gen_config).compile()
@@ -177,7 +177,7 @@ def test_red_black_pattern(gen_config: CreateKernelConfig, xp):
     islice = make_slice[:, start::2]
     gen_config.iteration_slice = islice
 
-    if gen_config.target == Target.CUDA:
+    if gen_config.target.is_gpu():
         gen_config.gpu.manual_launch_grid = True
 
     try:
diff --git a/tests/runtime/test_boundary.py b/tests/runtime/test_boundary.py
index fb8f827e88106fd3b7a45b25f9c962f4ebb14f14..226510b83d8832a5a189552df5c8760235f0d598 100644
--- a/tests/runtime/test_boundary.py
+++ b/tests/runtime/test_boundary.py
@@ -98,7 +98,7 @@ def test_kernel_vs_copy_boundary():
 
 def test_boundary_gpu():
     pytest.importorskip('cupy')
-    dh = SerialDataHandling(domain_size=(7, 7), default_target=Target.GPU)
+    dh = SerialDataHandling(domain_size=(7, 7), default_target=Target.CurrentGPU)
     src = dh.add_array('src')
     dh.fill("src", 0.0, ghost_layers=True)
     dh.fill("src", 1.0, ghost_layers=False)
@@ -111,7 +111,7 @@
                                            name="boundary_handling_cpu", target=Target.CPU)
 
     boundary_handling = BoundaryHandling(dh, src.name, boundary_stencil,
-                                         name="boundary_handling_gpu", target=Target.GPU)
+                                         name="boundary_handling_gpu", target=Target.CurrentGPU)
 
     neumann = Neumann()
     for d in ('N', 'S', 'W', 'E'):
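Both slice tests above flip to a manual launch grid whenever the target is a GPU, presumably because the launch grid cannot be derived automatically for such irregular slices. Sketched in isolation (the slice itself is illustrative):

```python
from dataclasses import replace

from pystencils import CreateKernelConfig, Target, make_slice

gen_config = CreateKernelConfig(target=Target.CurrentGPU)
gen_config = replace(gen_config, iteration_slice=make_slice[:, 1::2])

# On GPU targets, strided or non-rectangular iteration slices require the
# launch grid to be supplied manually before compiling, as in the tests above.
if gen_config.target.is_gpu():
    gen_config.gpu.manual_launch_grid = True
```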
diff --git a/tests/runtime/test_datahandling.py b/tests/runtime/test_datahandling.py
index 9d7ff924e8d7eba9039f8f0796145bd7de116ef5..9e7c73cac225131cbb2ba160919846de30bc7ca8 100644
--- a/tests/runtime/test_datahandling.py
+++ b/tests/runtime/test_datahandling.py
@@ -118,7 +118,7 @@ def synchronization(dh, test_gpu=False):
 
 
 def kernel_execution_jacobi(dh, target):
-    test_gpu = target == Target.GPU
+    test_gpu = target == Target.CurrentGPU
     dh.add_array('f', gpu=test_gpu)
     dh.add_array('tmp', gpu=test_gpu)
 
@@ -219,15 +219,15 @@ def test_kernel():
     try:
         import cupy
         dh = create_data_handling(domain_size=domain_shape, periodicity=True)
-        kernel_execution_jacobi(dh, Target.GPU)
+        kernel_execution_jacobi(dh, Target.CurrentGPU)
     except ImportError:
         pass
 
 
-@pytest.mark.parametrize('target', (Target.CPU, Target.GPU))
+@pytest.mark.parametrize('target', (Target.CPU, Target.CurrentGPU))
 def test_kernel_param(target):
     for domain_shape in [(4, 5), (3, 4, 5)]:
-        if target == Target.GPU:
+        if target == Target.CurrentGPU:
             pytest.importorskip('cupy')
 
         dh = create_data_handling(domain_size=domain_shape, periodicity=True, default_target=target)
@@ -262,7 +262,7 @@ def test_add_arrays():
 def test_add_arrays_with_layout(shape, layout):
     pytest.importorskip('cupy')
 
-    dh = create_data_handling(domain_size=shape, default_layout=layout, default_target=ps.Target.GPU)
+    dh = create_data_handling(domain_size=shape, default_layout=layout, default_target=ps.Target.CurrentGPU)
     f1 = dh.add_array("f1", values_per_cell=19)
     dh.fill(f1.name, 1.0)
 
@@ -392,8 +392,6 @@ def test_array_handler(device_number):
     empty = array_handler.empty(shape=size, order="F")
     assert empty.strides == (8, 16)
 
-    random_array = array_handler.randn(size)
-
     cpu_array = np.empty((20, 40), dtype=np.float64)
     gpu_array = array_handler.to_gpu(cpu_array)