diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 474afbb113dbb1a72e6b077e9272a91749c5698d..a2ec00d16cc04af33a2d3e4e46183111e4cfbcda 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -337,6 +337,7 @@ build-documentation:
   artifacts:
     paths:
       - docs/build/html
+    when: always
 
 
 pages:
diff --git a/docs/source/backend/gpu_codegen.md b/docs/source/backend/gpu_codegen.md
new file mode 100644
index 0000000000000000000000000000000000000000..2e3709b24ff81c5383ee26c0eb04e04159b61da3
--- /dev/null
+++ b/docs/source/backend/gpu_codegen.md
@@ -0,0 +1,92 @@
+# GPU Code Generation
+
+The code generation infrastructure for Nvidia and AMD GPUs using CUDA and HIP comprises the following components:
+
+ - The {any}`CudaPlatform` at `backend.platforms`, which performs the materialization of a kernel's iteration
+   space by mapping GPU block and thread indices to iteration space points. To perform this task,
+   it depends on a {any}`ThreadMapping` instance which defines the nature of that mapping.
+   The platform also takes care of lowering mathematical functions to their CUDA runtime library implementation.
+ - In the code generation driver, these components are orchestrated by the `GpuIndexing` helper class.
+   It provides both the {any}`ThreadMapping` for the codegen backend and the launch configuration
+   for the runtime system.
+
+:::{attention}
+
+Code generation for HIP through the `CudaPlatform` is experimental and currently untested.
+:::
+
+## The CUDA Platform and Thread Mappings
+
+```{eval-rst}
+.. module:: pystencils.backend.platforms.cuda
+
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+    :template: autosummary/entire_class.rst
+
+    ThreadMapping
+    Linear3DMapping
+    Blockwise4DMapping
+```
+
+## Thread Indexing In The Driver
+
+With regard to GPU thread indexing, the code generation driver has two tasks:
+it must provide the CUDA platform object with a valid thread mapping,
+and it must provide the runtime system with a [launch configuration](#gpu_launch_config)
+which defines the shape of the GPU block grid.
+Both of these are produced by the {any}`GpuIndexing` class.
+It is instantiated with the GPU indexing scheme and indexing options given by the user.
+
+At this time, the backend and code generation driver support two indexing schemes:
+"Linear3D" (see {any}`Linear3DMapping`) and "Blockwise4D" (see {any}`Blockwise4DMapping`).
+These are mostly reimplemented from the pystencils 1.3.x `"block"` and `"line"` indexing options.
+The GPU indexing system may be extended in the future.
+
+
+```{eval-rst}
+.. module:: pystencils.codegen.gpu_indexing
+
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+    :template: autosummary/entire_class.rst
+
+    GpuIndexing
+```
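+
+As a rough sketch, the driver may wire these pieces together as follows.
+This is illustrative only: the standalone construction of the kernel creation context and the
+import path and member spelling of `GpuIndexingScheme` are assumptions made for the example.
+
+```python
+from pystencils.backend.kernelcreation import KernelCreationContext
+from pystencils.codegen.gpu_indexing import GpuIndexing
+from pystencils.codegen.config import GpuIndexingScheme  # assumed import path
+
+ctx = KernelCreationContext()
+indexing = GpuIndexing(
+    ctx,
+    scheme=GpuIndexingScheme.Linear3D,  # assumed enum member spelling
+    default_block_size=(128, 2, 1),
+)
+
+# The thread mapping parametrizes the CudaPlatform during code generation ...
+thread_mapping = indexing.get_thread_mapping()
+
+# ... while the launch configuration factory is attached to the GpuKernel,
+# to be invoked later by the runtime system.
+launch_config_factory = indexing.get_launch_config_factory()
+```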
+
+(gpu_launch_config)=
+## The Launch Configuration
+
+The launch configuration is attached to the `GpuKernel` and thus returned to the runtime system.
+Since a concrete launch configuration is not specific to the kernel itself, but to the kernel's
+invocation site, the code generator only attaches a *factory function* for launch configurations
+to `GpuKernel`. It is up to the runtime system to locally instantiate and configure a launch configuration.
+To determine the actual launch grid, the launch configuration must be evaluated at the kernel's call site
+by passing the required parameters to `GpuLaunchConfiguration.evaluate`.
+
+The {any}`CupyJit`, for instance, will create the launch configuration object while preparing the JIT-compiled
+kernel wrapper object. The launch config is then exposed to the user, who may modify some of its properties.
+Which properties are modifiable depends on the type of the launch configuration:
+the `AutomaticLaunchConfiguration` permits no modification and computes grid and block size directly from
+kernel parameters;
+the `ManualLaunchConfiguration` requires the user to manually specify both grid and block size;
+and the `DynamicBlockSizeLaunchConfiguration` lets the user set a block size from which the grid size is computed.
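+
+Continuing the sketch above, a runtime system might consume the launch configuration factory
+roughly as follows. This is a sketch only; the property names and the exact signature of
+`evaluate` are assumptions:
+
+```python
+# Instantiate a launch configuration at the kernel's invocation site
+launch_config = launch_config_factory()
+
+# For a ManualLaunchConfiguration, both sizes must be set explicitly
+# before evaluation (assumed property names):
+launch_config.block_size = (128, 1, 1)
+launch_config.grid_size = (16, 16, 1)
+
+# At the call site, evaluate the configuration against the kernel's runtime
+# parameters to obtain the final launch grid (parameter passing elided):
+grid_size, block_size = launch_config.evaluate()
+```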
+
+The `evaluate` method can only be used from within a Python runtime environment.
+When exporting pystencils CUDA kernels for external use in C++ projects,
+equivalent C++ code evaluating the launch config must be generated.
+This is the task of, e.g., [pystencils-sfg](https://pycodegen.pages.i10git.cs.fau.de/pystencils-sfg/).
+
+
+```{eval-rst}
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+    :template: autosummary/entire_class.rst
+
+    GpuLaunchConfiguration
+    AutomaticLaunchConfiguration
+    ManualLaunchConfiguration
+    DynamicBlockSizeLaunchConfiguration
+```
diff --git a/docs/source/backend/index.rst b/docs/source/backend/index.rst
index 5ab8dbd34eb37fbc38230f3db0506c572d4b6964..0d384c55bc1e5933a055365f9d5ffe4143c902b6 100644
--- a/docs/source/backend/index.rst
+++ b/docs/source/backend/index.rst
@@ -17,6 +17,7 @@ who wish to customize or extend the behaviour of the code generator in their app
     translation
     platforms
     transformations
+    gpu_codegen
     errors
     extensions
diff --git a/docs/source/backend/platforms.md b/docs/source/backend/platforms.md
new file mode 100644
index 0000000000000000000000000000000000000000..2a0df48ed63a6730c010dff7964bc88ec76e44b0
--- /dev/null
+++ b/docs/source/backend/platforms.md
@@ -0,0 +1,54 @@
+# Platforms
+
+All target-specific code generation in the pystencils backend is facilitated
+through the *platform classes*.
+This includes:
+
+ - Materialization of the iteration space, meaning the mapping of iteration space points to some indexing structure
+ - Lowering of mathematical functions to their implementation in some runtime environment
+ - Selection of vector intrinsics for SIMD-capable CPU targets
+
+Encapsulation of hardware- and environment-specific details into platform objects allows
+us to implement most of the code generator in a generic and hardware-agnostic way.
+It also makes it easier to extend pystencils with support for additional code generation
+targets in the future.
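+
+As a minimal sketch, a new target could be added by subclassing `Platform`.
+The names below are hypothetical, and further abstract methods of `Platform`
+are omitted for brevity, so this class is not instantiable as-is:
+
+```python
+from pystencils.backend.platforms import Platform
+from pystencils.backend.ast.structural import PsBlock
+from pystencils.backend.kernelcreation.iteration_space import IterationSpace
+
+
+class MyAcceleratorPlatform(Platform):
+    """Platform for a hypothetical accelerator target."""
+
+    @property
+    def required_headers(self) -> set[str]:
+        # Headers to include wherever kernels for this platform are defined
+        return {'"my_accelerator_runtime.h"'}
+
+    def materialize_iteration_space(
+        self, body: PsBlock, ispace: IterationSpace
+    ) -> PsBlock:
+        # Map iteration space points onto the target's indexing structure
+        # and embed the kernel body into it
+        ...
+```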
+
+## Base Classes
+
+```{eval-rst}
+.. module:: pystencils.backend.platforms
+
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+    :template: autosummary/entire_class.rst
+
+    Platform
+    GenericCpu
+    GenericVectorCpu
+    GenericGpu
+```
+
+## CPU Platforms
+
+```{eval-rst}
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+    :template: autosummary/entire_class.rst
+
+    X86VectorCpu
+    X86VectorArch
+```
+
+## GPU Platforms
+
+```{eval-rst}
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+    :template: autosummary/entire_class.rst
+
+    CudaPlatform
+    SyclPlatform
+```
diff --git a/docs/source/backend/platforms.rst b/docs/source/backend/platforms.rst
deleted file mode 100644
index 68b74504cfc94dd20e72a5852a2e45d399065aef..0000000000000000000000000000000000000000
--- a/docs/source/backend/platforms.rst
+++ /dev/null
@@ -1,6 +0,0 @@
-*********
-Platforms
-*********
-
-.. automodule:: pystencils.backend.platforms
-    :members:
\ No newline at end of file
diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index 5f4995a028b21b904c5fdc7f5e50c1f2ea331ca5..e896fc2bba9a484e33480b34b3e390d8c44eb4df 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -46,7 +46,7 @@ GRID_DIM = [
 ]
 
 
-class ThreadToIndexMapping(ABC):
+class ThreadMapping(ABC):
 
     @abstractmethod
     def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]:
@@ -57,7 +57,7 @@ class ThreadToIndexMapping(ABC):
         """
 
 
-class Linear3DMapping(ThreadToIndexMapping):
+class Linear3DMapping(ThreadMapping):
     """3D globally linearized mapping, where each thread is assigned a work item according to
     its location in the global launch grid."""
 
@@ -109,7 +109,7 @@ class Linear3DMapping(ThreadToIndexMapping):
         return block_idx * block_size + thread_idx
 
 
-class Blockwise4DMapping(ThreadToIndexMapping):
+class Blockwise4DMapping(ThreadMapping):
     """Blockwise index mapping for up to 4D iteration spaces,
     where the outer three dimensions are mapped to block indices."""
 
@@ -162,13 +162,20 @@ class Blockwise4DMapping(ThreadToIndexMapping):
 
 
 class CudaPlatform(GenericGpu):
-    """Platform for CUDA-based GPUs."""
+    """Platform for CUDA-based GPUs.
+
+    Args:
+        ctx: The kernel creation context
+        omit_range_check: If `True`, generated index translation code will not check if the point identified
+            by block and thread indices is actually contained in the iteration space
+        thread_mapping: Callback object which defines the mapping of thread indices onto iteration space points
+    """
 
     def __init__(
         self,
         ctx: KernelCreationContext,
         omit_range_check: bool = False,
-        thread_mapping: ThreadToIndexMapping | None = None,
+        thread_mapping: ThreadMapping | None = None,
     ) -> None:
         super().__init__(ctx)
diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py
index 7491ec8e96a57c7d53637f3bb0db990147e9c127..b5b35c8b03447f1d5c35ed1289b89542bb1127ca 100644
--- a/src/pystencils/backend/platforms/generic_gpu.py
+++ b/src/pystencils/backend/platforms/generic_gpu.py
@@ -1,14 +1,7 @@
 from __future__ import annotations
-from abc import abstractmethod
 
-from ..ast.structural import PsBlock
-from ..kernelcreation.iteration_space import IterationSpace
 from .platform import Platform
 
 
 class GenericGpu(Platform):
-    @abstractmethod
-    def materialize_iteration_space(
-        self, body: PsBlock, ispace: IterationSpace
-    ) -> PsBlock:
-        pass
+    """Base class for GPU platforms."""
diff --git a/src/pystencils/backend/platforms/platform.py b/src/pystencils/backend/platforms/platform.py
index 9b7e642b5b6634e9f65a6b18b54785e89f6df362..8ed4729a2d67777bbd132d9e48140e20e3656767 100644
--- a/src/pystencils/backend/platforms/platform.py
+++ b/src/pystencils/backend/platforms/platform.py
@@ -11,9 +11,9 @@ class Platform(ABC):
     """Abstract base class for all supported platforms.
 
     The platform performs all target-dependent tasks during code generation:
-
-    - Translation of the iteration space to an index source (loop nest, GPU indexing, ...)
-    - Platform-specific optimizations (e.g. vectorization, OpenMP)
+
+    - Translation of the iteration space to an index source (loop nest, GPU indexing, ...)
+    - Platform-specific optimizations (e.g. vectorization, OpenMP)
     """
 
     def __init__(self, ctx: KernelCreationContext) -> None:
@@ -22,12 +22,16 @@ class Platform(ABC):
     @property
     @abstractmethod
     def required_headers(self) -> set[str]:
+        """Set of header files that must be included at the point of definition of a kernel
+        running on this platform."""
         pass
 
     @abstractmethod
     def materialize_iteration_space(
         self, body: PsBlock, ispace: IterationSpace
     ) -> PsBlock:
+        """Materialize the given iteration space as an indexing structure and embed the given
+        kernel body into that structure."""
         pass
 
     @abstractmethod
diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 13549f73485b52a1e64290a735679491a95476e6..afd2958c16161b869b9caf69785c3eac9f287949 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -14,6 +14,7 @@ from ..backend.kernelcreation import (
     FullIterationSpace,
     SparseIterationSpace,
 )
+from ..backend.platforms.cuda import ThreadMapping
 
 from ..backend.ast.expressions import PsExpression
 
@@ -198,24 +199,41 @@
         return self._block_size
 
 
-class GpuIndexing(ABC):
+class GpuIndexing:
+    """Factory for GPU indexing objects required during code generation.
+
+    This class acts as a helper class for the code generation driver.
+    It produces both the `ThreadMapping` required by the backend,
+    as well as factories for the launch configuration required later by the runtime system.
+
+    Args:
+        ctx: The kernel creation context
+        scheme: The desired GPU indexing scheme
+        default_block_size: A user-defined default block size, required only if the indexing scheme permits
+            modification of the block size
+        manual_launch_grid: If `True`, always emit a `ManualLaunchConfiguration` to force the runtime system
+            to set the launch configuration explicitly
+    """
+
     def __init__(
         self,
         ctx: KernelCreationContext,
         scheme: GpuIndexingScheme,
-        block_size: dim3 | _AUTO_TYPE,
-        manual_launch_grid: bool,
+        default_block_size: dim3 | _AUTO_TYPE | None = None,
+        manual_launch_grid: bool = False,
     ) -> None:
         self._ctx = ctx
         self._scheme = scheme
-        self._block_size = block_size
+        self._default_block_size = default_block_size
         self._manual_launch_grid = manual_launch_grid
 
         from ..backend.kernelcreation import AstFactory
 
         self._factory = AstFactory(self._ctx)
 
-    def get_thread_mapping(self):
+    def get_thread_mapping(self) -> ThreadMapping:
+        """Retrieve a thread mapping object for use by the backend."""
+
         from ..backend.platforms.cuda import Linear3DMapping, Blockwise4DMapping
 
         match self._scheme:
@@ -225,6 +243,7 @@
                 return Blockwise4DMapping()
 
     def get_launch_config_factory(self) -> Callable[[], GpuLaunchConfiguration]:
+        """Retrieve a factory for the launch configuration, for later consumption by the runtime system."""
         if self._manual_launch_grid:
             return ManualLaunchConfiguration
 
@@ -254,7 +273,10 @@
         return factory
 
     def _get_default_block_size(self, rank: int) -> dim3:
-        if isinstance(self._block_size, _AUTO_TYPE):
+        if self._default_block_size is None:
+            raise CodegenError("The default block size option was not set")
+
+        if isinstance(self._default_block_size, _AUTO_TYPE):
             match rank:
                 case 1:
                     return (256, 1, 1)
@@ -265,7 +287,7 @@
             case _:
                 assert False, "unreachable code"
         else:
-            return self._block_size
+            return self._default_block_size
 
     def _get_blockwise4d_config_factory(
         self,