Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
No results found
Show changes
Commits on Source (2)
Showing
with 712 additions and 163 deletions
......@@ -24,7 +24,6 @@ typechecker:
- nox --session typecheck
.testsuite-base:
extends: .nox-base
stage: "Tests"
needs: []
coverage: '/TOTAL.*\s+(\d+%)$/'
......@@ -38,13 +37,18 @@ typechecker:
coverage_format: cobertura
path: coverage.xml
"testsuite-py3.10":
"testsuite-py3.10+cuda":
extends: .testsuite-base
image: i10git.cs.fau.de:5005/pycodegen/pycodegen/nox:ubuntu24.04-cuda12.6
script:
- nox --session testsuite-3.10
tags:
- docker
- cuda11
"testsuite-py3.13":
extends: .testsuite-base
image: i10git.cs.fau.de:5005/pycodegen/pycodegen/nox:alpine
script:
- nox --session testsuite-3.13
......
import pystencilssfg
from pystencilssfg.config import SfgConfig
from os.path import splitext
class DocsPatchedGenerator(pystencilssfg.SourceFileGenerator):
"""Mockup wrapper around SourceFileGenerator for use in documentation
......@@ -28,23 +30,23 @@ class DocsPatchedGenerator(pystencilssfg.SourceFileGenerator):
def __exit__(self, exc_type, exc_value, traceback):
if exc_type is None:
self._finish_files()
emitter = self._get_emitter()
header_code = self._emitter.dumps(self._header_file)
impl_code = (
None
if self._impl_file is None
else self._emitter.dumps(self._impl_file)
)
header_code = emitter.dumps(self._header_file)
header_ext = splitext(self._header_file.name)[1]
mdcode = ":::::{tab-set}\n"
mdcode += "::::{tab-item} Generated Header (.hpp)\n"
mdcode += f"::::{{tab-item}} Generated Header ({header_ext})\n"
mdcode += ":::{code-block} C++\n\n"
mdcode += header_code
mdcode += "\n:::\n::::\n"
if impl_code:
mdcode += "::::{tab-item} Generated Implementation (.cpp)\n"
if self._impl_file is not None:
impl_code = emitter.dumps(self._impl_file)
impl_ext = splitext(self._impl_file.name)[1]
mdcode += f"::::{{tab-item}} Generated Implementation ({impl_ext})\n"
mdcode += ":::{code-block} C++\n\n"
mdcode += impl_code
mdcode += "\n:::\n::::\n"
......
......@@ -16,6 +16,9 @@ Composer API (``pystencilssfg.composer``)
.. autoclass:: SfgClassComposer
:members:
.. autoclass:: SfgGpuComposer
:members:
Custom Generators
=================
......@@ -37,6 +40,7 @@ Helper Methods and Builders
.. autoclass:: SfgFunctionSequencer
:members:
:inherited-members:
.. autoclass:: SfgNodeBuilder
:members:
......@@ -47,6 +51,12 @@ Helper Methods and Builders
.. autoclass:: SfgSwitchBuilder
:members:
.. module:: pystencilssfg.composer.class_composer
.. autoclass:: SfgMethodSequencer
:members:
:inherited-members:
Context and Cursor
==================
......
......@@ -41,3 +41,9 @@ Implementation
.. automodule:: pystencilssfg.lang.cpp
:members:
GPU Runtime APIs
----------------
.. automodule:: pystencilssfg.lang.gpu
:members:
......@@ -12,7 +12,7 @@ different configuration sources:
the generator script to set some of its configuration options; see [Command-Line Options](#cmdline_options)
- **Project Configuration:** When embedded into a larger project, using a build system such as CMake, generator scripts
may be configured globally within that project by the use of a *configuration module*.
Settings specified inside that configuration module are always overridden by the former to configuration sources.
Settings specified inside that configuration module are always overridden by the two other configuration sources listed above.
For details on configuration modules, refer to the guide on [Project and Build System Integration](#guide_project_integration).
(inline_config)=
......@@ -60,14 +60,26 @@ set {any}`cfg.outer_namespace <SfgConfig.outer_namespace>`.
### Code Style and Formatting
- Modify the values in the {any}`cfg.code_style <CodeStyle>` category to affect
certain formatting aspects of the generated code.
- To change, enforce, or disable auto-formatting of generated code through `clang-format`,
take a look at the {any}`cfg.clang_format <ClangFormatOptions>` category.
- Clang-format will, by default, sort `#include` statements alphabetically and separate
local and system header includes.
To override this, you can set a custom sorting key for `#include` sorting via
{any}`cfg.code_style.includes_sorting_key <CodeStyle.includes_sorting_key>`.
Pystencils-sfg gives you some options to affect its output code style.
These are controlled by the options in the {any}`cfg.code_style <CodeStyle>` category.
Furthermore, pystencils-sfg uses `clang-format` to beautify generated code.
The behaviour of the clang-format integration is managed by the
the {any}`cfg.clang_format <ClangFormatOptions>` category,
where you can set options to skip or enforce formatting,
or change the formatter binary.
To set the code style used by `clang-format` either create a `.clang-format` file
in any of the parent folders of your generator script,
or modify the {any}`cfg.clang_format.code_style <ClangFormatOptions.code_style>` option.
:::{seealso}
[Clang-Format Style Options](https://clang.llvm.org/docs/ClangFormatStyleOptions.html)
:::
Clang-format will, by default, sort `#include` statements alphabetically and separate
local and system header includes.
To override this, you can set a custom sorting key for `#include` sorting via
{any}`cfg.code_style.includes_sorting_key <CodeStyle.includes_sorting_key>`.
(cmdline_options)=
## Command-Line Options
......
......@@ -283,7 +283,7 @@ The composer gives us access to the default kernel namespace (`<current_namespac
via `sfg.kernels`.
To add a kernel,
- either pass its assignments and the pystencils code generator configuration directly to {any}`kernels.reate() <KernelsAdder.create>`,
- either pass its assignments and the pystencils code generator configuration directly to {any}`kernels.create() <KernelsAdder.create>`,
- or create the kernel separately through {any}`pystencils.create_kernel <pystencils.codegen.create_kernel>` and register it using
{any}`kernels.add() <KernelsAdder.add>`.
......@@ -392,13 +392,176 @@ with SourceFileGenerator() as sfg:
)
```
(exposed_inline_kernels)=
### Exposed and Inline Kernels
## GPU Kernels
Pystencils also allows us to generate kernels for the CUDA and HIP GPU programming models.
This section describes how to generate GPU kernels through pystencils-sfg;
how to invoke them with various launch configurations,
and how GPU execution streams are reflected.
### Generate and Invoke CUDA and HIP Kernels
To generate a kernel targetting either of these, set the
{any}`target <pystencils.codegen.config.CreateKernelConfig.target>`
code generator option to either `Target.CUDA` or `Target.HIP`.
After registering a GPU kernel,
its invocation can be rendered using {any}`sfg.gpu_invoke <SfgGpuComposer.gpu_invoke>`.
Here is an example using CUDA:
```{code-cell} ipython3
from pystencilssfg import SfgConfig
sfg_config = SfgConfig()
sfg_config.extensions.impl = "cu"
with SourceFileGenerator(sfg_config) as sfg:
# Configure the code generator to use CUDA
cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
# Create fields, assemble assignments
f, g = ps.fields("f, g: double[128, 128]")
asm = ps.Assignment(f(0), g(0))
# Register kernel
khandle = sfg.kernels.create(asm, "gpu_kernel", cfg)
# Invoke it
sfg.function("kernel_wrapper")(
sfg.gpu_invoke(khandle)
)
```
In this snippet, we used the [generator configuration](#how_to_generator_scripts_config)
to change the suffix of the generated implementation file to `.cu`.
When investigating the generated `.cu` file, you can see that the GPU launch configuration parameters
*grid size* and *block size* are being computed automatically from the array sizes.
This behavior can be changed by modifying options in the {any}`gpu <pystencils.codegen.config.GpuOptions>`
category of the `CreateKernelConfig`.
### Adapting the Launch Configuration
GPU kernel invocations usually require the user to provide a launch grid, defined
by the GPU thread block size and the number of blocks on the grid.
In the simplest case (seen above), pystencils-sfg will emit code that automatically
computes these parameters from the size of the arrays passed to the kernel,
using a default block size defined by pystencils.
The code generator also permits customization of the launch configuration.
You may provide a custom block size to override the default, in which case the
grid size will still be computed by dividing the array sizes by your block size.
Otherwise, you can also fully take over control of both block and grid size.
For both cases, instructions are given in the following.
#### User-Defined Block Size for Auto-Computed Grid Size
To merely modify the block size argument while still automatically inferring the grid size,
pass a variable or expression of type `dim3` to the `block_size` parameter of `gpu_invoke`.
Pystencils-sfg exposes two versions of `dim3`, which differ primarily in their associated
runtime headers:
- {any}`pystencilssfg.lang.gpu.cuda.dim3 <CudaAPI.dim3>` for CUDA, and
- {any}`pystencilssfg.lang.gpu.hip.dim3 <HipAPI.dim3>` for HIP.
The following snippet selects the correct `dim3` type according to the kernel target;
it then creates a variable of that type and turns that into an argument to the kernel invocation:
```{code-cell} ipython3
:tags: [remove-cell]
target = ps.Target.HIP
cfg = ps.CreateKernelConfig(target=target)
f, g = ps.fields("f, g: double[128, 128]")
asm = ps.Assignment(f(0), g(0))
```
```{code-cell} ipython3
from pystencilssfg.lang.gpu import hip
with SourceFileGenerator(sfg_config) as sfg:
# ... define kernel ...
khandle = sfg.kernels.create(asm, "gpu_kernel", cfg)
# Select dim3 reflection
match target:
case ps.Target.CUDA:
from pystencilssfg.lang.gpu import cuda as gpu_api
case ps.Target.HIP:
from pystencilssfg.lang.gpu import hip as gpu_api
# Create dim3 variable and pass it to kernel invocation
block_size = gpu_api.dim3(const=True).var("block_size")
sfg.function("kernel_wrapper")(
sfg.gpu_invoke(khandle, block_size=block_size)
)
```
#### Manual Launch Configurations
To take full control of the launch configuration, we must disable its automatic inferrence
by setting the {any}`gpu.manual_launch_grid <pystencils.codegen.config.GpuOptions.manual_launch_grid>`
code generator option to `True`.
Then, we must pass `dim3` arguments for both `block_size` and `grid_size` to the kernel invocation:
```{code-cell} ipython3
from pystencilssfg.lang.gpu import hip
with SourceFileGenerator(sfg_config) as sfg:
# ... define kernel ...
# Configure for manual launch config
cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
cfg.gpu.manual_launch_grid = True
# Register kernel
khandle = sfg.kernels.create(asm, "gpu_kernel", cfg)
# Create dim3 variables
from pystencilssfg.lang.gpu import cuda
block_size = cuda.dim3(const=True).var("block_size")
grid_size = cuda.dim3(const=True).var("grid_size")
sfg.function("kernel_wrapper")(
sfg.gpu_invoke(khandle, block_size=block_size, grid_size=grid_size)
)
```
### Using Streams
CUDA and HIP kernels can be enqueued into streams for concurrent execution.
This is mirrored in pystencils-sfg;
all overloads of `gpu_invoke` take an optional `stream` argument.
The `stream_t` data types of both CUDA and HIP are made available
through the respective API reflections:
- {any}`lang.gpu.cuda.stream_t <CudaAPI.stream_t>` reflects `cudaStream_t`, and
- {any}`lang.gpu.hip.stream_t <HipAPI.stream_t>` reflects `hipStream_t`.
Here is an example that creates a variable of the HIP stream type
and passes it to `gpu_invoke`:
```{code-cell} ipython3
:tags: [remove-cell]
cfg = ps.CreateKernelConfig(target=ps.Target.HIP)
f, g = ps.fields("f, g: double[128, 128]")
asm = ps.Assignment(f(0), g(0))
```
```{code-cell} ipython3
from pystencilssfg.lang.gpu import hip
with SourceFileGenerator(sfg_config) as sfg:
# ... define kernel ...
khandle = sfg.kernels.create(asm, "gpu_kernel", cfg)
stream = hip.stream_t(const=True).var("stream")
sfg.function("kernel_wrapper")(
sfg.gpu_invoke(khandle, stream=stream)
)
```
:::{admonition} To Do
- Creating and calling kernels
- Invoking GPU kernels and the CUDA API Mirror
- Defining classes, their fields constructors, and methods
:::
......
......@@ -9,6 +9,7 @@ from .basic_composer import (
)
from .mixin import SfgComposerMixIn
from .class_composer import SfgClassComposer
from .gpu_composer import SfgGpuComposer
__all__ = [
"SfgIComposer",
......@@ -20,4 +21,5 @@ __all__ = [
"ExprLike",
"SfgBasicComposer",
"SfgClassComposer",
"SfgGpuComposer",
]
......@@ -13,7 +13,7 @@ from pystencils import (
Assignment,
AssignmentCollection,
)
from pystencils.codegen import Kernel
from pystencils.codegen import Kernel, Lambda
from pystencils.types import create_type, UserTypeSpec, PsType
from ..context import SfgContext, SfgCursor
......@@ -21,7 +21,6 @@ from .custom import CustomGenerator
from ..ir import (
SfgCallTreeNode,
SfgKernelCallNode,
SfgCudaKernelInvocation,
SfgStatements,
SfgFunctionParams,
SfgRequireIncludes,
......@@ -53,6 +52,7 @@ from ..lang import (
HeaderFile,
includes,
SfgVar,
SfgKernelParamVar,
AugExpr,
SupportsFieldExtraction,
SupportsVectorExtraction,
......@@ -390,34 +390,14 @@ class SfgBasicComposer(SfgIComposer):
"""Use inside a function body to directly call a kernel.
When using `call`, the given kernel will simply be called as a function.
To invoke a GPU kernel on a specified launch grid, use `cuda_invoke`
or the interfaces of ``pystencilssfg.extensions.sycl`` instead.
To invoke a GPU kernel on a specified launch grid,
use `gpu_invoke <SfgGpuComposer.gpu_invoke>` instead.
Args:
kernel_handle: Handle to a kernel previously added to some kernel namespace.
"""
return SfgKernelCallNode(kernel_handle)
def cuda_invoke(
self,
kernel_handle: SfgKernelHandle,
num_blocks: ExprLike,
threads_per_block: ExprLike,
stream: ExprLike | None,
):
"""Dispatch a CUDA kernel to the device."""
num_blocks_str = str(num_blocks)
tpb_str = str(threads_per_block)
stream_str = str(stream) if stream is not None else None
deps = depends(num_blocks) | depends(threads_per_block)
if stream is not None:
deps |= depends(stream)
return SfgCudaKernelInvocation(
kernel_handle, num_blocks_str, tpb_str, stream_str, deps
)
def seq(self, *args: tuple | str | SfgCallTreeNode | SfgNodeBuilder) -> SfgSequence:
"""Syntax sequencing. For details, see `make_sequence`"""
return make_sequence(*args)
......@@ -511,6 +491,11 @@ class SfgBasicComposer(SfgIComposer):
"""
return AugExpr.format(fmt, *deps, **kwdeps)
def expr_from_lambda(self, lamb: Lambda) -> AugExpr:
depends = set(SfgKernelParamVar(p) for p in lamb.parameters)
code = lamb.c_code()
return AugExpr.make(code, depends, dtype=lamb.return_type)
@property
def branch(self) -> SfgBranchBuilder:
"""Use inside a function body to create an if/else conditonal branch.
......@@ -564,7 +549,11 @@ class SfgBasicComposer(SfgIComposer):
var: SfgVar | sp.Symbol = asvar(param) if isinstance(param, _VarLike) else param
return SfgDeferredParamSetter(var, expr)
def map_vector(self, lhs_components: Sequence[VarLike | sp.Symbol], rhs: SupportsVectorExtraction):
def map_vector(
self,
lhs_components: Sequence[VarLike | sp.Symbol],
rhs: SupportsVectorExtraction,
):
"""Extracts scalar numerical values from a vector data type.
Args:
......
......@@ -3,12 +3,13 @@ from typing import TYPE_CHECKING
from .basic_composer import SfgBasicComposer
from .class_composer import SfgClassComposer
from .gpu_composer import SfgGpuComposer
if TYPE_CHECKING:
from ..context import SfgContext
class SfgComposer(SfgBasicComposer, SfgClassComposer):
class SfgComposer(SfgBasicComposer, SfgClassComposer, SfgGpuComposer):
"""Primary interface for constructing source files in pystencils-sfg.
The SfgComposer combines the `SfgBasicComposer`
......@@ -19,3 +20,4 @@ class SfgComposer(SfgBasicComposer, SfgClassComposer):
def __init__(self, sfg: SfgContext | SfgBasicComposer):
SfgBasicComposer.__init__(self, sfg)
SfgClassComposer.__init__(self)
SfgGpuComposer.__init__(self)
from __future__ import annotations
from typing import overload
from pystencils.codegen import GpuKernel, Target
from pystencils.codegen.gpu_indexing import (
ManualLaunchConfiguration,
AutomaticLaunchConfiguration,
DynamicBlockSizeLaunchConfiguration,
)
from .mixin import SfgComposerMixIn
from .basic_composer import make_statements, make_sequence
from ..context import SfgContext
from ..ir import (
SfgKernelHandle,
SfgCallTreeNode,
SfgGpuKernelInvocation,
SfgBlock,
SfgSequence,
)
from ..lang import ExprLike, AugExpr
from ..lang.gpu import CudaAPI, HipAPI, ProvidesGpuRuntimeAPI
class SfgGpuComposer(SfgComposerMixIn):
"""Composer mix-in providing methods to generate GPU kernel invocations.
.. function:: gpu_invoke(kernel_handle: SfgKernelHandle, **kwargs)
Invoke a GPU kernel with launch configuration parameters depending on its code generator configuration.
The overloads of this method are listed below.
They all (partially) mirror the CUDA and HIP ``kernel<<< Gs, Bs, Sm, St >>>()`` syntax;
for details on the launch configuration arguments,
refer to `Launch Configurations in CUDA`_
or `Launch Configurations in HIP`_.
.. function:: gpu_invoke(kernel_handle: SfgKernelHandle, *, grid_size: ExprLike, block_size: ExprLike, shared_memory_bytes: ExprLike = "0", stream: ExprLike | None = None, ) -> SfgCallTreeNode
:noindex:
Invoke a GPU kernel with a manual launch grid.
Requires that the kernel was generated
with `manual_launch_grid <pystencils.codegen.config.GpuOptions.manual_launch_grid>`
set to `True`.
.. function:: gpu_invoke(self, kernel_handle: SfgKernelHandle, *, shared_memory_bytes: ExprLike = "0", stream: ExprLike | None = None, ) -> SfgCallTreeNode
:noindex:
Invoke a GPU kernel with an automatic launch grid.
This signature accepts kernels generated with an indexing scheme that
causes the launch grid to be determined automatically,
such as `Blockwise4D <pystencils.codegen.config.GpuIndexingScheme.Blockwise4D>`.
.. function:: gpu_invoke(self, kernel_handle: SfgKernelHandle, *, block_size: ExprLike | None = None, shared_memory_bytes: ExprLike = "0", stream: ExprLike | None = None, ) -> SfgCallTreeNode
:noindex:
Invoke a GPU kernel with a dynamic launch grid.
This signature accepts kernels generated with an indexing scheme that permits a user-defined
blocks size, such as `Linear3D <pystencils.codegen.config.GpuIndexingScheme.Linear3D>`.
The grid size is calculated automatically by dividing the number of work items in each
dimension by the block size, rounding up.
.. _Launch Configurations in CUDA: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#execution-configuration
.. _Launch Configurations in HIP: https://rocmdocs.amd.com/projects/HIP/en/latest/how-to/hip_cpp_language_extensions.html#calling-global-functions
""" # NOQA: E501
@overload
def gpu_invoke(
self,
kernel_handle: SfgKernelHandle,
*,
grid_size: ExprLike,
block_size: ExprLike,
shared_memory_bytes: ExprLike = "0",
stream: ExprLike | None = None,
) -> SfgCallTreeNode: ...
@overload
def gpu_invoke(
self,
kernel_handle: SfgKernelHandle,
*,
shared_memory_bytes: ExprLike = "0",
stream: ExprLike | None = None,
) -> SfgCallTreeNode: ...
@overload
def gpu_invoke(
self,
kernel_handle: SfgKernelHandle,
*,
block_size: ExprLike | None = None,
shared_memory_bytes: ExprLike = "0",
stream: ExprLike | None = None,
) -> SfgCallTreeNode: ...
def gpu_invoke(
self,
kernel_handle: SfgKernelHandle,
shared_memory_bytes: ExprLike = "0",
stream: ExprLike | None = None,
**kwargs,
) -> SfgCallTreeNode:
builder = GpuInvocationBuilder(self._ctx, kernel_handle)
builder.shared_memory_bytes = shared_memory_bytes
builder.stream = stream
return builder(**kwargs)
def cuda_invoke(
self,
kernel_handle: SfgKernelHandle,
num_blocks: ExprLike,
threads_per_block: ExprLike,
stream: ExprLike | None,
):
from warnings import warn
warn(
"cuda_invoke is deprecated and will be removed before version 0.1. "
"Use `gpu_invoke` instead.",
FutureWarning,
)
return self.gpu_invoke(
kernel_handle,
grid_size=num_blocks,
block_size=threads_per_block,
stream=stream,
)
class GpuInvocationBuilder:
def __init__(
self,
ctx: SfgContext,
kernel_handle: SfgKernelHandle,
):
self._ctx = ctx
self._kernel_handle = kernel_handle
ker = kernel_handle.kernel
if not isinstance(ker, GpuKernel):
raise ValueError(f"Non-GPU kernel was passed to `gpu_invoke`: {ker}")
launch_config = ker.get_launch_configuration()
self._launch_config = launch_config
gpu_api: type[ProvidesGpuRuntimeAPI]
match ker.target:
case Target.CUDA:
gpu_api = CudaAPI
case Target.HIP:
gpu_api = HipAPI
case _:
assert False, "unexpected GPU target"
self._gpu_api = gpu_api
self._dim3 = gpu_api.dim3
self._shared_memory_bytes: ExprLike = "0"
self._stream: ExprLike | None = None
@property
def shared_memory_bytes(self) -> ExprLike:
return self._shared_memory_bytes
@shared_memory_bytes.setter
def shared_memory_bytes(self, bs: ExprLike):
self._shared_memory_bytes = bs
@property
def stream(self) -> ExprLike | None:
return self._stream
@stream.setter
def stream(self, s: ExprLike | None):
self._stream = s
def _render_invocation(
self, grid_size: ExprLike, block_size: ExprLike
) -> SfgSequence:
stmt_grid_size = make_statements(grid_size)
stmt_block_size = make_statements(block_size)
stmt_smem = make_statements(self._shared_memory_bytes)
stmt_stream = (
make_statements(self._stream) if self._stream is not None else None
)
return make_sequence(
"// clang-format off: "
"[pystencils-sfg] Formatting may add illegal spaces between angular brackets in `<<< >>>`.",
SfgGpuKernelInvocation(
self._kernel_handle,
stmt_grid_size,
stmt_block_size,
shared_memory_bytes=stmt_smem,
stream=stmt_stream,
),
"// clang-format on",
)
def __call__(self, **kwargs: ExprLike) -> SfgCallTreeNode:
match self._launch_config:
case ManualLaunchConfiguration():
return self._invoke_manual(**kwargs)
case AutomaticLaunchConfiguration():
return self._invoke_automatic(**kwargs)
case DynamicBlockSizeLaunchConfiguration():
return self._invoke_dynamic(**kwargs)
case _:
raise ValueError(
f"Unexpected launch configuration: {self._launch_config}"
)
def _invoke_manual(self, grid_size: ExprLike, block_size: ExprLike):
assert isinstance(self._launch_config, ManualLaunchConfiguration)
return self._render_invocation(grid_size, block_size)
def _invoke_automatic(self):
assert isinstance(self._launch_config, AutomaticLaunchConfiguration)
from .composer import SfgComposer
sfg = SfgComposer(self._ctx)
grid_size_entries = [
self._to_uint32_t(sfg.expr_from_lambda(gs))
for gs in self._launch_config._grid_size
]
grid_size_var = self._dim3(const=True).var("__grid_size")
block_size_entries = [
self._to_uint32_t(sfg.expr_from_lambda(bs))
for bs in self._launch_config._block_size
]
block_size_var = self._dim3(const=True).var("__block_size")
nodes = [
sfg.init(grid_size_var)(*grid_size_entries),
sfg.init(block_size_var)(*block_size_entries),
self._render_invocation(grid_size_var, block_size_var),
]
return SfgBlock(SfgSequence(nodes))
def _invoke_dynamic(self, block_size: ExprLike | None = None):
assert isinstance(self._launch_config, DynamicBlockSizeLaunchConfiguration)
from .composer import SfgComposer
sfg = SfgComposer(self._ctx)
block_size_init_args: tuple[ExprLike, ...]
if block_size is None:
block_size_init_args = tuple(
str(bs) for bs in self._launch_config.default_block_size
)
else:
block_size_init_args = (block_size,)
block_size_var = self._dim3(const=True).var("__block_size")
from ..lang.cpp import std
work_items_entries = [
sfg.expr_from_lambda(wit) for wit in self._launch_config.num_work_items
]
work_items_var = std.tuple("uint32_t", "uint32_t", "uint32_t", const=True).var(
"__work_items"
)
def _div_ceil(a: ExprLike, b: ExprLike):
return AugExpr.format("({a} + {b} - 1) / {b}", a=a, b=b)
grid_size_entries = [
_div_ceil(work_items_var.get(i), bs)
for i, bs in enumerate(
[
block_size_var.x,
block_size_var.y,
block_size_var.z,
]
)
]
grid_size_var = self._dim3(const=True).var("__grid_size")
nodes = [
sfg.init(block_size_var)(*block_size_init_args),
sfg.init(work_items_var)(*work_items_entries),
sfg.init(grid_size_var)(*grid_size_entries),
self._render_invocation(grid_size_var, block_size_var),
]
return SfgBlock(SfgSequence(nodes))
@staticmethod
def _to_uint32_t(expr: AugExpr) -> AugExpr:
return AugExpr("uint32_t").format("uint32_t({})", expr)
......@@ -2,7 +2,7 @@ from __future__ import annotations
from typing import Sequence, Any, Generator
from contextlib import contextmanager
from .config import CodeStyle
from .config import CodeStyle, ClangFormatOptions
from .ir import (
SfgSourceFile,
SfgNamespace,
......@@ -23,6 +23,7 @@ class SfgContext:
impl_file: SfgSourceFile | None,
namespace: str | None = None,
codestyle: CodeStyle | None = None,
clang_format_opts: ClangFormatOptions | None = None,
argv: Sequence[str] | None = None,
project_info: Any = None,
):
......@@ -33,6 +34,9 @@ class SfgContext:
self._inner_namespace: str | None = None
self._codestyle = codestyle if codestyle is not None else CodeStyle()
self._clang_format: ClangFormatOptions = (
clang_format_opts if clang_format_opts is not None else ClangFormatOptions()
)
self._header_file = header_file
self._impl_file = impl_file
......@@ -73,6 +77,10 @@ class SfgContext:
"""The code style object for this generation context."""
return self._codestyle
@property
def clang_format(self) -> ClangFormatOptions:
return self._clang_format
@property
def header_file(self) -> SfgSourceFile:
return self._header_file
......@@ -150,6 +158,9 @@ class SfgCursor:
self._loc[f].append(block)
self._loc[f] = block.elements
outer_namespace = self._cur_namespace
self._cur_namespace = namespace
@contextmanager
def ctxmgr():
try:
......@@ -157,5 +168,6 @@ class SfgCursor:
finally:
# Have the cursor step back out of the nested namespace blocks
self._loc = outer_locs
self._cur_namespace = outer_namespace
return ctxmgr()
from pystencilssfg import lang
def dim3class(gpu_runtime_header: str, *, cls_name: str = "dim3"):
"""
>>> dim3 = dim3class("<hip/hip_runtime.h>")
>>> dim3().ctor(64, 1, 1)
dim3{64, 1, 1}
Args:
gpu_runtime_header: String with the name of the gpu runtime header
cls_name: String with the acutal name (default "dim3")
Returns:
Dim3Class: A `lang.CppClass` that mimics cuda's/hip's `dim3`
"""
@lang.cppclass(cls_name, gpu_runtime_header)
class Dim3Class:
def ctor(self, dim0=1, dim1=1, dim2=1):
return self.ctor_bind(dim0, dim1, dim2)
@property
def x(self):
return lang.AugExpr.format("{}.x", self)
@property
def y(self):
return lang.AugExpr.format("{}.y", self)
@property
def z(self):
return lang.AugExpr.format("{}.z", self)
@property
def dims(self):
"""The dims property."""
return [self.x, self.y, self.z]
return Dim3Class
......@@ -95,9 +95,7 @@ class SourceFileGenerator:
self._impl_file = SfgSourceFile(
output_files[1].name, SfgSourceFileType.TRANSLATION_UNIT
)
self._impl_file.includes.append(
HeaderFile.parse(self._header_file.name)
)
self._impl_file.includes.append(HeaderFile.parse(self._header_file.name))
# TODO: Find a way to not hard-code the restrict qualifier in pystencils
self._header_file.elements.append("#define RESTRICT __restrict__")
......@@ -115,14 +113,11 @@ class SourceFileGenerator:
self._impl_file,
namespace,
config.codestyle,
config.clang_format,
argv=script_args,
project_info=cli_params.get_project_info(),
)
self._emitter = SfgCodeEmitter(
self._output_dir, config.codestyle, config.clang_format
)
sort_key = config.codestyle.get_option("includes_sorting_key")
if sort_key is None:
......@@ -161,6 +156,13 @@ class SourceFileGenerator:
)
self._impl_file.includes.sort(key=self._include_sort_key)
def _get_emitter(self):
return SfgCodeEmitter(
self._output_dir,
self._context.codestyle,
self._context.clang_format,
)
def __enter__(self) -> SfgComposer:
self.clean_files()
return SfgComposer(self._context)
......@@ -169,6 +171,7 @@ class SourceFileGenerator:
if exc_type is None:
self._finish_files()
self._emitter.emit(self._header_file)
emitter = self._get_emitter()
emitter.emit(self._header_file)
if self._impl_file is not None:
self._emitter.emit(self._impl_file)
emitter.emit(self._impl_file)
......@@ -3,7 +3,7 @@ from .call_tree import (
SfgCallTreeLeaf,
SfgEmptyNode,
SfgKernelCallNode,
SfgCudaKernelInvocation,
SfgGpuKernelInvocation,
SfgBlock,
SfgSequence,
SfgStatements,
......@@ -47,7 +47,7 @@ __all__ = [
"SfgCallTreeLeaf",
"SfgEmptyNode",
"SfgKernelCallNode",
"SfgCudaKernelInvocation",
"SfgGpuKernelInvocation",
"SfgSequence",
"SfgBlock",
"SfgStatements",
......
......@@ -19,6 +19,7 @@ class SfgCallTreeNode(ABC):
Therefore, every instantiable call tree node must implement the method `get_code`.
By convention, the string returned by `get_code` should not contain a trailing newline.
"""
def __init__(self) -> None:
self._includes: set[HeaderFile] = set()
......@@ -34,6 +35,11 @@ class SfgCallTreeNode(ABC):
By convention, the code block emitted by this function should not contain a trailing newline.
"""
@property
def depends(self) -> set[SfgVar]:
"""Set of objects this leaf depends on"""
return set()
@property
def required_includes(self) -> set[HeaderFile]:
"""Return a set of header includes required by this node"""
......@@ -53,11 +59,6 @@ class SfgCallTreeLeaf(SfgCallTreeNode, ABC):
def children(self) -> Sequence[SfgCallTreeNode]:
return ()
@property
@abstractmethod
def depends(self) -> set[SfgVar]:
"""Set of objects this leaf depends on"""
class SfgEmptyNode(SfgCallTreeLeaf):
"""A leaf node that does not emit any code.
......@@ -202,52 +203,76 @@ class SfgKernelCallNode(SfgCallTreeLeaf):
return set(self._kernel_handle.parameters)
def get_code(self, cstyle: CodeStyle) -> str:
ast_params = self._kernel_handle.parameters
kparams = self._kernel_handle.parameters
fnc_name = self._kernel_handle.fqname
call_parameters = ", ".join([p.name for p in ast_params])
call_parameters = ", ".join([p.name for p in kparams])
return f"{fnc_name}({call_parameters});"
class SfgCudaKernelInvocation(SfgCallTreeLeaf):
class SfgGpuKernelInvocation(SfgCallTreeNode):
"""A CUDA or HIP kernel invocation.
See https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#execution-configuration
or https://rocmdocs.amd.com/projects/HIP/en/latest/how-to/hip_cpp_language_extensions.html#calling-global-functions
for the syntax.
"""
def __init__(
self,
kernel_handle: SfgKernelHandle,
num_blocks_code: str,
threads_per_block_code: str,
stream_code: str | None,
depends: set[SfgVar],
grid_size: SfgStatements,
block_size: SfgStatements,
shared_memory_bytes: SfgStatements | None,
stream: SfgStatements | None,
):
from pystencils import Target
from pystencils.codegen import GpuKernel
kernel = kernel_handle.kernel
if not (isinstance(kernel, GpuKernel) and kernel.target == Target.CUDA):
if not isinstance(kernel, GpuKernel):
raise ValueError(
"An `SfgCudaKernelInvocation` node can only call a CUDA kernel."
"An `SfgGpuKernelInvocation` node can only call GPU kernels."
)
super().__init__()
self._kernel_handle = kernel_handle
self._num_blocks = num_blocks_code
self._threads_per_block = threads_per_block_code
self._stream = stream_code
self._depends = depends
self._grid_size = grid_size
self._block_size = block_size
self._shared_memory_bytes = shared_memory_bytes
self._stream = stream
@property
def children(self) -> Sequence[SfgCallTreeNode]:
return (
(
self._grid_size,
self._block_size,
)
+ (
(self._shared_memory_bytes,)
if self._shared_memory_bytes is not None
else ()
)
+ ((self._stream,) if self._stream is not None else ())
)
@property
def depends(self) -> set[SfgVar]:
return set(self._kernel_handle.parameters) | self._depends
return set(self._kernel_handle.parameters)
def get_code(self, cstyle: CodeStyle) -> str:
ast_params = self._kernel_handle.parameters
kparams = self._kernel_handle.parameters
fnc_name = self._kernel_handle.fqname
call_parameters = ", ".join([p.name for p in ast_params])
call_parameters = ", ".join([p.name for p in kparams])
grid_args = [self._grid_size, self._block_size]
if self._shared_memory_bytes is not None:
grid_args += [self._shared_memory_bytes]
grid_args = [self._num_blocks, self._threads_per_block]
if self._stream is not None:
grid_args += [self._stream]
grid = "<<< " + ", ".join(grid_args) + " >>>"
grid = "<<< " + ", ".join(arg.get_code(cstyle) for arg in grid_args) + " >>>"
return f"{fnc_name}{grid}({call_parameters});"
......
from __future__ import annotations
from typing import Sequence, Iterable
import warnings
from functools import reduce
from dataclasses import dataclass
from abc import ABC, abstractmethod
......@@ -15,7 +14,7 @@ from pystencils.codegen.properties import FieldBasePtr, FieldShape, FieldStride
from ..exceptions import SfgException
from ..config import CodeStyle
from .call_tree import SfgCallTreeNode, SfgCallTreeLeaf, SfgSequence, SfgStatements
from .call_tree import SfgCallTreeNode, SfgSequence, SfgStatements
from ..lang.expressions import SfgKernelParamVar
from ..lang import (
SfgVar,
......@@ -163,17 +162,12 @@ class CallTreePostProcessing:
self.handle_sequence(node, ppc)
return ppc.live_variables
case SfgCallTreeLeaf():
return node.depends
case SfgDeferredNode():
raise SfgException("Deferred nodes can only occur inside a sequence.")
case _:
return reduce(
lambda x, y: x | y,
(self.get_live_variables(c) for c in node.children),
set(),
return node.depends.union(
*(self.get_live_variables(c) for c in node.children)
)
......
......@@ -19,10 +19,13 @@ class StdTuple(AugExpr, SupportsVectorExtraction):
dtype = self._template(ts=", ".join(elt_type_strings), const=const, ref=ref)
super().__init__(dtype)
def get(self, idx: int | str) -> AugExpr:
return AugExpr.format("std::get< {} >({})", idx, self)
def _extract_component(self, coordinate: int) -> AugExpr:
if coordinate < 0 or coordinate >= self._length:
raise ValueError(
f"Index {coordinate} out-of-bounds for std::tuple with {self._length} entries."
)
return AugExpr.format("std::get< {} >({})", coordinate, self)
return self.get(coordinate)
......@@ -218,8 +218,12 @@ class AugExpr:
return self._bind(expr)
@staticmethod
def make(code: str, depends: Iterable[SfgVar | AugExpr]):
return AugExpr()._bind(DependentExpression(code, depends))
def make(
code: str,
depends: Iterable[SfgVar | AugExpr],
dtype: UserTypeSpec | None = None,
):
return AugExpr(dtype)._bind(DependentExpression(code, depends))
@staticmethod
def format(fmt: str, *deps, **kwdeps) -> AugExpr:
......
from __future__ import annotations
from typing import Protocol
from .expressions import CppClass, cpptype, AugExpr
class Dim3Interface(CppClass):
"""Interface definition for the ``dim3`` struct of Cuda and HIP."""
def ctor(self, dim0=1, dim1=1, dim2=1):
"""Constructor invocation of ``dim3``"""
return self.ctor_bind(dim0, dim1, dim2)
@property
def x(self) -> AugExpr:
"""The `x` coordinate member."""
return AugExpr.format("{}.x", self)
@property
def y(self) -> AugExpr:
"""The `y` coordinate member."""
return AugExpr.format("{}.y", self)
@property
def z(self) -> AugExpr:
"""The `z` coordinate member."""
return AugExpr.format("{}.z", self)
@property
def dims(self) -> tuple[AugExpr, AugExpr, AugExpr]:
"""`x`, `y`, and `z` as a tuple."""
return (self.x, self.y, self.z)
class ProvidesGpuRuntimeAPI(Protocol):
"""Protocol definition for a GPU runtime API provider."""
dim3: type[Dim3Interface]
"""The ``dim3`` struct type for this GPU runtime"""
stream_t: type[AugExpr]
"""The ``stream_t`` type for this GPU runtime"""
class CudaAPI(ProvidesGpuRuntimeAPI):
"""Reflection of the CUDA runtime API"""
class dim3(Dim3Interface):
"""Implements `Dim3Interface` for CUDA"""
template = cpptype("dim3", "<cuda_runtime.h>")
class stream_t(CppClass):
template = cpptype("cudaStream_t", "<cuda_runtime.h>")
cuda = CudaAPI
"""Alias for `CudaAPI`"""
class HipAPI(ProvidesGpuRuntimeAPI):
"""Reflection of the HIP runtime API"""
class dim3(Dim3Interface):
"""Implements `Dim3Interface` for HIP"""
template = cpptype("dim3", "<hip/hip_runtime.h>")
class stream_t(CppClass):
template = cpptype("hipStream_t", "<hip/hip_runtime.h>")
hip = HipAPI
"""Alias for `HipAPI`"""
from pystencilssfg.extensions.gpu import dim3class
from pystencilssfg.lang import HeaderFile, AugExpr
def test_dim3():
cuda_runtime = "<cuda_runtime.h>"
dim3 = dim3class(cuda_runtime, cls_name="dim3")
assert HeaderFile.parse(cuda_runtime) in dim3.template.includes
assert str(dim3().ctor(128, 1, 1)) == "dim3{128, 1, 1}"
assert str(dim3().ctor()) == "dim3{1, 1, 1}"
assert str(dim3().ctor(1, 1, 128)) == "dim3{1, 1, 128}"
block = dim3(ref=True, const=True).var("block")
dims = [
AugExpr.format(
"uint32_t(({} + {} - 1)/ {})",
1024,
block.dims[i],
block.dims[i],
)
for i in range(3)
]
grid = dim3().ctor(*dims)
assert str(grid) == f"dim3{{{', '.join((str(d) for d in dims))}}}"