Commit 21c0ba7f authored by Frederik Hennig

documentation on GPU invocation

parent 3ff729e9
Merge request !24: Extend Support for CUDA and HIP kernel invocations

 import pystencilssfg
 from pystencilssfg.config import SfgConfig
+from os.path import splitext

 class DocsPatchedGenerator(pystencilssfg.SourceFileGenerator):
     """Mockup wrapper around SourceFileGenerator for use in documentation
@@ -30,21 +32,20 @@ class DocsPatchedGenerator(pystencilssfg.SourceFileGenerator):
         self._finish_files()

         header_code = self._emitter.dumps(self._header_file)
-        impl_code = (
-            None
-            if self._impl_file is None
-            else self._emitter.dumps(self._impl_file)
-        )
+        header_ext = splitext(self._header_file.name)[1]

         mdcode = ":::::{tab-set}\n"
-        mdcode += "::::{tab-item} Generated Header (.hpp)\n"
+        mdcode += f"::::{{tab-item}} Generated Header ({header_ext})\n"
         mdcode += ":::{code-block} C++\n\n"
         mdcode += header_code
         mdcode += "\n:::\n::::\n"

-        if impl_code:
-            mdcode += "::::{tab-item} Generated Implementation (.cpp)\n"
+        if self._impl_file is not None:
+            impl_code = self._emitter.dumps(self._impl_file)
+            impl_ext = splitext(self._impl_file.name)[1]
+            mdcode += f"::::{{tab-item}} Generated Implementation ({impl_ext})\n"
             mdcode += ":::{code-block} C++\n\n"
             mdcode += impl_code
             mdcode += "\n:::\n::::\n"
...
@@ -16,6 +16,9 @@ Composer API (``pystencilssfg.composer``)
 .. autoclass:: SfgClassComposer
     :members:

+.. autoclass:: SfgGpuComposer
+    :members:

 Custom Generators
 =================
...
@@ -344,6 +344,44 @@ cause them to be added to its signature.
We don't want to expose this complexity, but instead hide it by using appropriate data structures.
The next section explains how that is achieved in pystencils-sfg.

#### Invoking GPU Kernels

Pystencils also allows us to generate kernels for the CUDA and HIP GPU platforms.
First, we need to select one of the two platforms by calling either
{any}`sfg.use_cuda <SfgGpuComposer.use_cuda>` or {any}`sfg.use_hip <SfgGpuComposer.use_hip>`.
After registering a GPU kernel,
you can render its invocation using {any}`sfg.gpu_invoke <SfgGpuComposer.gpu_invoke>`.
Here is a basic example:

```{code-cell} ipython3
:tags: [remove-cell]

import pystencils as ps

f, g = ps.fields("f, g: double[2D]")
asm = ps.Assignment(f(0), g(0))
```

```{code-cell} ipython3
from pystencilssfg import SourceFileGenerator, SfgConfig

sfg_config = SfgConfig()
sfg_config.extensions.impl = "cu"

with SourceFileGenerator(sfg_config) as sfg:
    # Activate CUDA
    sfg.use_cuda()

    # Register the GPU kernel
    cfg = ps.CreateKernelConfig()
    cfg.target = ps.Target.CUDA
    khandle = sfg.kernels.create(asm, "gpu_kernel", cfg)

    # Invoke it
    sfg.function("kernel_wrapper")(
        sfg.gpu_invoke(khandle)
    )
```
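
Beyond this basic form, `gpu_invoke` also accepts launch configuration arguments when the kernel's indexing scheme permits a user-defined block size (see the `SfgGpuComposer` docstring further down in this commit). The following is only a sketch, reusing `sfg_config`, `cfg`, and `asm` from the example above; the wrapper name is made up, and it assumes that a plain string is accepted as an `ExprLike` launch configuration expression.

```python
with SourceFileGenerator(sfg_config) as sfg:
    sfg.use_cuda()

    # Register the same kernel as above
    khandle = sfg.kernels.create(asm, "gpu_kernel", cfg)

    # Sketch: pass an explicit block size as a C++ expression string;
    # the grid size is then derived by rounding up the work items per dimension.
    sfg.function("kernel_wrapper_custom_block")(
        sfg.gpu_invoke(khandle, block_size="dim3{64, 8, 1}")
    )
```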
#### Mapping Fields to Data Structures

Pystencils kernels operate on n-dimensional contiguous or strided arrays,
...
@@ -9,6 +9,7 @@ from .basic_composer import (
 )
 from .mixin import SfgComposerMixIn
 from .class_composer import SfgClassComposer
+from .gpu_composer import SfgGpuComposer

 __all__ = [
     "SfgIComposer",
@@ -20,4 +21,5 @@ __all__ = [
     "ExprLike",
     "SfgBasicComposer",
     "SfgClassComposer",
+    "SfgGpuComposer",
 ]
@@ -390,8 +390,8 @@ class SfgBasicComposer(SfgIComposer):
         """Use inside a function body to directly call a kernel.

         When using `call`, the given kernel will simply be called as a function.
-        To invoke a GPU kernel on a specified launch grid, use `cuda_invoke`
-        or the interfaces of ``pystencilssfg.extensions.sycl`` instead.
+        To invoke a GPU kernel on a specified launch grid,
+        use `gpu_invoke <SfgGpuComposer.gpu_invoke>` instead.

         Args:
             kernel_handle: Handle to a kernel previously added to some kernel namespace.
...
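
To make the distinction drawn in that docstring concrete, here is a minimal sketch (not part of this commit): `sfg.call` emits a plain function call, while `sfg.gpu_invoke` emits the GPU launch. The handle and function names below are hypothetical.

```python
# Hypothetical handles: cpu_khandle for a CPU kernel, gpu_khandle for a CUDA/HIP kernel,
# both registered earlier via sfg.kernels.create(...).
sfg.function("run_cpu_kernel")(
    sfg.call(cpu_khandle)        # generated C++: plain function call
)
sfg.function("run_gpu_kernel")(
    sfg.gpu_invoke(gpu_khandle)  # generated C++: GPU kernel launch
)
```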
@@ -24,6 +24,50 @@ from ..lang.gpu import ProvidesGpuRuntimeAPI
 class SfgGpuComposer(SfgComposerMixIn):
"""Composer mix-in providing methods to generate GPU kernel invocations.
.. function:: gpu_invoke(kernel_handle: SfgKernelHandle, **kwargs)
Invoke a GPU kernel with launch configuration parameters depending on its code generator configuration.
The overloads of this method are listed below.
They all (partially) mirror the CUDA and HIP ``kernel<<< Gs, Bs, Sm, St >>>()`` syntax;
for details on the launch configuration arguments,
refer to `Launch Configurations in CUDA`_
or `Launch Configurations in HIP`_.
.. function:: gpu_invoke(kernel_handle: SfgKernelHandle, *, grid_size: ExprLike, block_size: ExprLike, shared_memory_bytes: ExprLike = "0", stream: ExprLike | None = None, ) -> SfgCallTreeNode
:noindex:
Invoke a GPU kernel with a manual launch grid.
Requires that the kernel was generated
with `manual_launch_grid <pystencils.codegen.config.GpuOptions.manual_launch_grid>`
set to `True`.
.. function:: gpu_invoke(self, kernel_handle: SfgKernelHandle, *, shared_memory_bytes: ExprLike = "0", stream: ExprLike | None = None, ) -> SfgCallTreeNode
:noindex:
Invoke a GPU kernel with an automatic launch grid.
This signature accepts kernels generated with an indexing scheme that
causes the launch grid to be determined automatically,
such as `Blockwise4D <pystencils.codegen.config.GpuIndexingScheme.Blockwise4D>`.
.. function:: gpu_invoke(self, kernel_handle: SfgKernelHandle, *, block_size: ExprLike | None = None, shared_memory_bytes: ExprLike = "0", stream: ExprLike | None = None, ) -> SfgCallTreeNode
:noindex:
Invoke a GPU kernel with a dynamic launch grid.
This signature accepts kernels generated with an indexing scheme that permits a user-defined
blocks size, such as `Linear3D <pystencils.codegen.config.GpuIndexingScheme.Linear3D>`.
The grid size is calculated automatically by dividing the number of work items in each
dimension by the block size, rounding up.
.. _Launch Configurations in CUDA: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#execution-configuration
.. _Launch Configurations in HIP: https://rocmdocs.amd.com/projects/HIP/en/latest/how-to/hip_cpp_language_extensions.html#calling-global-functions
""" # NOQA: E501
     def __init__(self) -> None:
         self._gpu_api_provider: ProvidesGpuRuntimeAPI | None = None
@@ -63,12 +107,7 @@ class SfgGpuComposer(SfgComposerMixIn):
         block_size: ExprLike,
         shared_memory_bytes: ExprLike = "0",
         stream: ExprLike | None = None,
-    ) -> SfgCallTreeNode:
-        """Invoke a CUDA kernel with a manual launch grid.
-
-        Requires that the kernel was generated with `manual_launch_grid <GpuOptions.manual_launch_grid>`
-        set to `True`.
-        """
+    ) -> SfgCallTreeNode: ...

     @overload
     def gpu_invoke(
@@ -77,12 +116,7 @@ class SfgGpuComposer(SfgComposerMixIn):
         *,
         shared_memory_bytes: ExprLike = "0",
         stream: ExprLike | None = None,
-    ) -> SfgCallTreeNode:
-        """Invoke a CUDA kernel with an automatic launch grid.
-
-        This signature accepts kernels generated with an indexing scheme that permits
-        the automatic inferrence of the launch grid, such as `Blockwise4D <IndexingScheme.Blockwise4D>`
-        """
+    ) -> SfgCallTreeNode: ...

     @overload
     def gpu_invoke(
@@ -92,14 +126,7 @@ class SfgGpuComposer(SfgComposerMixIn):
         block_size: ExprLike | None = None,
         shared_memory_bytes: ExprLike = "0",
         stream: ExprLike | None = None,
-    ) -> SfgCallTreeNode:
-        """Invoke a CUDA kernel with a dynamic launch grid.
-
-        This signature accepts kernels generated with an indexing scheme that permits a user-defined
-        blocks size, such as `Linear3D <IndexingScheme.Linear3D>`.
-        The grid size is calculated automatically by dividing the number of work items in each
-        dimension by the block size, rounding up.
-        """
+    ) -> SfgCallTreeNode: ...

     def gpu_invoke(self, kernel_handle: SfgKernelHandle, **kwargs) -> SfgCallTreeNode:
         assert isinstance(
...
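
For orientation, the three overload shapes documented above could be exercised roughly as follows. This is a sketch only, not part of the commit: the handle names are hypothetical, and it assumes plain strings are accepted for the `ExprLike` launch configuration arguments.

```python
# Manual launch grid: kernel generated with the manual_launch_grid GPU option enabled
sfg.gpu_invoke(
    khandle_manual,
    grid_size="dim3{16, 16, 1}",
    block_size="dim3{32, 8, 1}",
)

# Automatic launch grid (e.g. Blockwise4D indexing): no launch grid arguments needed
sfg.gpu_invoke(khandle_auto)

# Dynamic launch grid (e.g. Linear3D indexing): optional block size and stream;
# the grid size is derived by dividing the work items by the block size, rounding up
sfg.gpu_invoke(khandle_linear3d, block_size="dim3{128, 1, 1}", stream="stream")
```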