diff --git a/src/pystencilssfg/composer/basic_composer.py b/src/pystencilssfg/composer/basic_composer.py index 31337a6282932420f8d5b6d9093deec5c2caea1a..8a076745eac0d2cb98b88490469a58975de1e896 100644 --- a/src/pystencilssfg/composer/basic_composer.py +++ b/src/pystencilssfg/composer/basic_composer.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Sequence, TypeAlias +from typing import Sequence, TypeAlias, overload from abc import ABC, abstractmethod import sympy as sp from functools import reduce @@ -13,7 +13,12 @@ from pystencils import ( Assignment, AssignmentCollection, ) -from pystencils.codegen import Kernel +from pystencils.codegen import Kernel, GpuKernel, Lambda +from pystencils.codegen.gpu_indexing import ( + ManualLaunchConfiguration, + AutomaticLaunchConfiguration, + DynamicBlockSizeLaunchConfiguration, +) from pystencils.types import create_type, UserTypeSpec, PsType from ..context import SfgContext, SfgCursor @@ -53,6 +58,7 @@ from ..lang import ( HeaderFile, includes, SfgVar, + SfgKernelParamVar, AugExpr, SupportsFieldExtraction, SupportsVectorExtraction, @@ -398,25 +404,155 @@ class SfgBasicComposer(SfgIComposer): """ return SfgKernelCallNode(kernel_handle) + @overload def cuda_invoke( self, kernel_handle: SfgKernelHandle, - num_blocks: ExprLike, - threads_per_block: ExprLike, - stream: ExprLike | None, - ): - """Dispatch a CUDA kernel to the device.""" - num_blocks_str = str(num_blocks) - tpb_str = str(threads_per_block) - stream_str = str(stream) if stream is not None else None + *, + grid_size: ExprLike, + block_size: ExprLike, + stream: ExprLike | None = None, + ) -> SfgCallTreeNode: + """Invoke a CUDA kernel with a manual launch grid. + + Requires that the kernel was generated with `manual_launch_grid <GpuOptions.manual_launch_grid>` + set to `True`. + """ - deps = depends(num_blocks) | depends(threads_per_block) - if stream is not None: - deps |= depends(stream) + @overload + def cuda_invoke( + self, + kernel_handle: SfgKernelHandle, + *, + stream: ExprLike | None = None, + ) -> SfgCallTreeNode: + """Invoke a CUDA kernel with an automatic launch grid. - return SfgCudaKernelInvocation( - kernel_handle, num_blocks_str, tpb_str, stream_str, deps - ) + This signature accepts kernels generated with an indexing scheme that permits + the automatic inferrence of the launch grid, such as `Blockwise4D <IndexingScheme.Blockwise4D>` + """ + + @overload + def cuda_invoke( + self, + kernel_handle: SfgKernelHandle, + *, + block_size: ExprLike | None = None, + stream: ExprLike | None = None, + ) -> SfgCallTreeNode: + """Invoke a CUDA kernel with a dynamic launch grid. + + This signature accepts kernels generated with an indexing scheme that permits a user-defined + blocks size, such as `Linear3D <IndexingScheme.Linear3D>`. + The grid size is calculated automatically. + """ + + def cuda_invoke(self, kernel_handle: SfgKernelHandle, **kwargs) -> SfgCallTreeNode: + ker = kernel_handle.kernel + + if not isinstance(ker, GpuKernel): + raise ValueError(f"Non-GPU kernel was passed to `cuda_invoke`: {ker}") + + launch_config = ker.get_launch_configuration() + + from ..lang.cuda import dim3 + + def _render_invocation( + grid_size: ExprLike, block_size: ExprLike, stream: ExprLike | None + ): + stmt_grid_size = make_statements(grid_size) + stmt_block_size = make_statements(block_size) + stmt_stream = make_statements(stream) if stream is not None else None + + return SfgCudaKernelInvocation( + kernel_handle, stmt_grid_size, stmt_block_size, stmt_stream + ) + + grid_size: ExprLike + block_size: ExprLike + stream: ExprLike | None + + match launch_config: + case ManualLaunchConfiguration(): + grid_size = kwargs["grid_size"] + block_size = kwargs["block_size"] + stream = kwargs["stream"] + + return _render_invocation(grid_size, block_size, stream) + + case AutomaticLaunchConfiguration(): + stream = kwargs["stream"] + + grid_size_entries = [ + self.expr_from_lambda(gs) for gs in launch_config._grid_size + ] + grid_size_var = dim3(const=True).var("__grid_size") + + block_size_entries = [ + self.expr_from_lambda(bs) for bs in launch_config._block_size + ] + block_size_var = dim3(const=True).var("__block_size") + + nodes: list[SfgCallTreeNode] = [ + self.init(grid_size_var)(*grid_size_entries), + self.init(block_size_var)(*block_size_entries), + _render_invocation(grid_size_var, block_size_var, stream), + ] + + return SfgBlock(SfgSequence(nodes)) + + case DynamicBlockSizeLaunchConfiguration(): + block_size = kwargs["block_size"] + stream = kwargs["stream"] + + from ..lang.cpp import std + + witem_types = [lmb.return_type for lmb in launch_config.num_work_items] + work_items_entries = [ + self.expr_from_lambda(wit) for wit in launch_config.num_work_items + ] + work_items_var = std.tuple(*witem_types, const=True).var("__work_items") + + def _min(a: ExprLike, b: ExprLike): + return AugExpr.format("{a} < {b} ? {a} : {b}", a=a, b=b) + + def _div_ceil(a: ExprLike, b: ExprLike): + return AugExpr.format("({a} + {b} - 1) / {b}", a=a, b=b) + + block_size_var = dim3(const=True).var("__block_size") + + reduced_block_size_entries = [ + _min(work_items_var.get(i), bs) + for i, bs in enumerate( + [block_size_var.x(), block_size_var.y(), block_size_var.z()] + ) + ] + reduced_block_size_var = dim3(const=True).var("__reduced_block_size") + + grid_size_entries = [ + _div_ceil(work_items_var.get(i), bs) + for i, bs in enumerate( + [ + reduced_block_size_var.x(), + reduced_block_size_var.y(), + reduced_block_size_var.z(), + ] + ) + ] + grid_size_var = dim3(const=True).var("__grid_size") + + nodes = [ + self.init(block_size_var)(block_size), + self.init(work_items_var)(*work_items_entries), + self.init(reduced_block_size_var)(*reduced_block_size_entries), + self.init(grid_size_var)(*grid_size_entries), + _render_invocation(grid_size_var, reduced_block_size_var, stream), + ] + + return SfgBlock(SfgSequence(nodes)) + + case _: + raise ValueError(f"Unexpected launch configuration: {launch_config}") def seq(self, *args: tuple | str | SfgCallTreeNode | SfgNodeBuilder) -> SfgSequence: """Syntax sequencing. For details, see `make_sequence`""" @@ -511,6 +647,11 @@ class SfgBasicComposer(SfgIComposer): """ return AugExpr.format(fmt, *deps, **kwdeps) + def expr_from_lambda(self, lamb: Lambda) -> AugExpr: + depends = set(SfgKernelParamVar(p) for p in lamb.parameters) + code = lamb.c_code() + return AugExpr.make(code, depends, dtype=lamb.return_type) + @property def branch(self) -> SfgBranchBuilder: """Use inside a function body to create an if/else conditonal branch. @@ -564,7 +705,11 @@ class SfgBasicComposer(SfgIComposer): var: SfgVar | sp.Symbol = asvar(param) if isinstance(param, _VarLike) else param return SfgDeferredParamSetter(var, expr) - def map_vector(self, lhs_components: Sequence[VarLike | sp.Symbol], rhs: SupportsVectorExtraction): + def map_vector( + self, + lhs_components: Sequence[VarLike | sp.Symbol], + rhs: SupportsVectorExtraction, + ): """Extracts scalar numerical values from a vector data type. Args: diff --git a/src/pystencilssfg/ir/call_tree.py b/src/pystencilssfg/ir/call_tree.py index 24a315d5a0ae0319cbc1b906f98deacc72828176..78ba84117cadf9f83f8d0cfdd4afc2ff2e169e81 100644 --- a/src/pystencilssfg/ir/call_tree.py +++ b/src/pystencilssfg/ir/call_tree.py @@ -19,6 +19,7 @@ class SfgCallTreeNode(ABC): Therefore, every instantiable call tree node must implement the method `get_code`. By convention, the string returned by `get_code` should not contain a trailing newline. """ + def __init__(self) -> None: self._includes: set[HeaderFile] = set() @@ -34,6 +35,11 @@ class SfgCallTreeNode(ABC): By convention, the code block emitted by this function should not contain a trailing newline. """ + @property + def depends(self) -> set[SfgVar]: + """Set of objects this leaf depends on""" + return set() + @property def required_includes(self) -> set[HeaderFile]: """Return a set of header includes required by this node""" @@ -53,11 +59,6 @@ class SfgCallTreeLeaf(SfgCallTreeNode, ABC): def children(self) -> Sequence[SfgCallTreeNode]: return () - @property - @abstractmethod - def depends(self) -> set[SfgVar]: - """Set of objects this leaf depends on""" - class SfgEmptyNode(SfgCallTreeLeaf): """A leaf node that does not emit any code. @@ -202,21 +203,20 @@ class SfgKernelCallNode(SfgCallTreeLeaf): return set(self._kernel_handle.parameters) def get_code(self, cstyle: CodeStyle) -> str: - ast_params = self._kernel_handle.parameters + kparams = self._kernel_handle.parameters fnc_name = self._kernel_handle.fqname - call_parameters = ", ".join([p.name for p in ast_params]) + call_parameters = ", ".join([p.name for p in kparams]) return f"{fnc_name}({call_parameters});" -class SfgCudaKernelInvocation(SfgCallTreeLeaf): +class SfgCudaKernelInvocation(SfgCallTreeNode): def __init__( self, kernel_handle: SfgKernelHandle, - num_blocks_code: str, - threads_per_block_code: str, - stream_code: str | None, - depends: set[SfgVar], + grid_size: SfgStatements, + block_size: SfgStatements, + stream: SfgStatements | None, ): from pystencils import Target from pystencils.codegen import GpuKernel @@ -229,25 +229,31 @@ class SfgCudaKernelInvocation(SfgCallTreeLeaf): super().__init__() self._kernel_handle = kernel_handle - self._num_blocks = num_blocks_code - self._threads_per_block = threads_per_block_code - self._stream = stream_code - self._depends = depends + self._grid_size = grid_size + self._block_size = block_size + self._stream = stream + + @property + def children(self) -> Sequence[SfgCallTreeNode]: + return ( + self._grid_size, + self._block_size, + ) + ((self._stream,) if self._stream is not None else ()) @property def depends(self) -> set[SfgVar]: - return set(self._kernel_handle.parameters) | self._depends + return set(self._kernel_handle.parameters) def get_code(self, cstyle: CodeStyle) -> str: - ast_params = self._kernel_handle.parameters + kparams = self._kernel_handle.parameters fnc_name = self._kernel_handle.fqname - call_parameters = ", ".join([p.name for p in ast_params]) + call_parameters = ", ".join([p.name for p in kparams]) - grid_args = [self._num_blocks, self._threads_per_block] + grid_args = [self._grid_size, self._block_size] if self._stream is not None: grid_args += [self._stream] - grid = "<<< " + ", ".join(grid_args) + " >>>" + grid = "<<< " + ", ".join(arg.get_code(cstyle) for arg in grid_args) + " >>>" return f"{fnc_name}{grid}({call_parameters});" diff --git a/src/pystencilssfg/ir/postprocessing.py b/src/pystencilssfg/ir/postprocessing.py index 1e692b0aa9f37368da1688ec2d3bac6892c5ac60..896693317c02dee67302221f810d64c01b5eb233 100644 --- a/src/pystencilssfg/ir/postprocessing.py +++ b/src/pystencilssfg/ir/postprocessing.py @@ -1,7 +1,6 @@ from __future__ import annotations from typing import Sequence, Iterable import warnings -from functools import reduce from dataclasses import dataclass from abc import ABC, abstractmethod @@ -15,7 +14,7 @@ from pystencils.codegen.properties import FieldBasePtr, FieldShape, FieldStride from ..exceptions import SfgException from ..config import CodeStyle -from .call_tree import SfgCallTreeNode, SfgCallTreeLeaf, SfgSequence, SfgStatements +from .call_tree import SfgCallTreeNode, SfgSequence, SfgStatements from ..lang.expressions import SfgKernelParamVar from ..lang import ( SfgVar, @@ -163,17 +162,12 @@ class CallTreePostProcessing: self.handle_sequence(node, ppc) return ppc.live_variables - case SfgCallTreeLeaf(): - return node.depends - case SfgDeferredNode(): raise SfgException("Deferred nodes can only occur inside a sequence.") case _: - return reduce( - lambda x, y: x | y, - (self.get_live_variables(c) for c in node.children), - set(), + return node.depends.union( + *(self.get_live_variables(c) for c in node.children) ) diff --git a/src/pystencilssfg/lang/cpp/std_tuple.py b/src/pystencilssfg/lang/cpp/std_tuple.py index 645b6b56fbeb515d6324feafdb8588f7ca22e992..6d1e1c0da0987be055006ce696779a1e2ebaa56b 100644 --- a/src/pystencilssfg/lang/cpp/std_tuple.py +++ b/src/pystencilssfg/lang/cpp/std_tuple.py @@ -19,10 +19,13 @@ class StdTuple(AugExpr, SupportsVectorExtraction): dtype = self._template(ts=", ".join(elt_type_strings), const=const, ref=ref) super().__init__(dtype) + def get(self, idx: int | str) -> AugExpr: + return AugExpr.format("std::get< {} >({})", idx, self) + def _extract_component(self, coordinate: int) -> AugExpr: if coordinate < 0 or coordinate >= self._length: raise ValueError( f"Index {coordinate} out-of-bounds for std::tuple with {self._length} entries." ) - return AugExpr.format("std::get< {} >({})", coordinate, self) + return self.get(coordinate) diff --git a/src/pystencilssfg/lang/cuda.py b/src/pystencilssfg/lang/cuda.py new file mode 100644 index 0000000000000000000000000000000000000000..28794dab927def14cd083900a1fedf895e693c3a --- /dev/null +++ b/src/pystencilssfg/lang/cuda.py @@ -0,0 +1,13 @@ +from .expressions import CppClass, cpptype + + +from ..extensions.gpu import dim3class + +dim3 = dim3class( + "<cuda_runtime.h>" +) +"""Reflection of CUDA's `dim3`.""" + + +class cudaStream_t(CppClass): + template = cpptype("cudaStream_t", "<cuda_runtime.h>") diff --git a/src/pystencilssfg/lang/expressions.py b/src/pystencilssfg/lang/expressions.py index 135a54eed92e4ba214244c8f46323ea81f6610db..8be59b0b37d72113814fa1489a44e7816f0e6f75 100644 --- a/src/pystencilssfg/lang/expressions.py +++ b/src/pystencilssfg/lang/expressions.py @@ -218,8 +218,12 @@ class AugExpr: return self._bind(expr) @staticmethod - def make(code: str, depends: Iterable[SfgVar | AugExpr]): - return AugExpr()._bind(DependentExpression(code, depends)) + def make( + code: str, + depends: Iterable[SfgVar | AugExpr], + dtype: UserTypeSpec | None = None, + ): + return AugExpr(dtype)._bind(DependentExpression(code, depends)) @staticmethod def format(fmt: str, *deps, **kwdeps) -> AugExpr: