Coverage for src/pystencilssfg/composer/gpu_composer.py: 91%
100 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-04-04 07:16 +0000
« prev ^ index » next coverage.py v7.8.0, created at 2025-04-04 07:16 +0000
1from __future__ import annotations
3from typing import overload
5from pystencils.codegen import GpuKernel, Target
6from pystencils.codegen.gpu_indexing import (
7 ManualLaunchConfiguration,
8 AutomaticLaunchConfiguration,
9 DynamicBlockSizeLaunchConfiguration,
10)
12from .mixin import SfgComposerMixIn
13from .basic_composer import make_statements, make_sequence
15from ..context import SfgContext
16from ..ir import (
17 SfgKernelHandle,
18 SfgCallTreeNode,
19 SfgGpuKernelInvocation,
20 SfgBlock,
21 SfgSequence,
22)
23from ..lang import ExprLike, AugExpr
24from ..lang.gpu import CudaAPI, HipAPI, ProvidesGpuRuntimeAPI
27class SfgGpuComposer(SfgComposerMixIn):
28 """Composer mix-in providing methods to generate GPU kernel invocations.
30 .. function:: gpu_invoke(kernel_handle: SfgKernelHandle, **kwargs)
32 Invoke a GPU kernel with launch configuration parameters depending on its code generator configuration.
34 The overloads of this method are listed below.
35 They all (partially) mirror the CUDA and HIP ``kernel<<< Gs, Bs, Sm, St >>>()`` syntax;
36 for details on the launch configuration arguments,
37 refer to `Launch Configurations in CUDA`_
38 or `Launch Configurations in HIP`_.
40 .. function:: gpu_invoke(kernel_handle: SfgKernelHandle, *, grid_size: ExprLike, block_size: ExprLike, shared_memory_bytes: ExprLike = "0", stream: ExprLike | None = None, ) -> SfgCallTreeNode
41 :noindex:
43 Invoke a GPU kernel with a manual launch grid.
45 Requires that the kernel was generated
46 with `manual_launch_grid <pystencils.codegen.config.GpuOptions.manual_launch_grid>`
47 set to `True`.
49 .. function:: gpu_invoke(self, kernel_handle: SfgKernelHandle, *, shared_memory_bytes: ExprLike = "0", stream: ExprLike | None = None, ) -> SfgCallTreeNode
50 :noindex:
52 Invoke a GPU kernel with an automatic launch grid.
54 This signature accepts kernels generated with an indexing scheme that
55 causes the launch grid to be determined automatically,
56 such as `Blockwise4D <pystencils.codegen.config.GpuIndexingScheme.Blockwise4D>`.
58 .. function:: gpu_invoke(self, kernel_handle: SfgKernelHandle, *, block_size: ExprLike | None = None, shared_memory_bytes: ExprLike = "0", stream: ExprLike | None = None, ) -> SfgCallTreeNode
59 :noindex:
61 Invoke a GPU kernel with a dynamic launch grid.
63 This signature accepts kernels generated with an indexing scheme that permits a user-defined
64 blocks size, such as `Linear3D <pystencils.codegen.config.GpuIndexingScheme.Linear3D>`.
65 The grid size is calculated automatically by dividing the number of work items in each
66 dimension by the block size, rounding up.
68 .. _Launch Configurations in CUDA: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#execution-configuration
70 .. _Launch Configurations in HIP: https://rocmdocs.amd.com/projects/HIP/en/latest/how-to/hip_cpp_language_extensions.html#calling-global-functions
71 """ # NOQA: E501
73 @overload
74 def gpu_invoke(
75 self,
76 kernel_handle: SfgKernelHandle,
77 *,
78 grid_size: ExprLike,
79 block_size: ExprLike,
80 shared_memory_bytes: ExprLike = "0",
81 stream: ExprLike | None = None,
82 ) -> SfgCallTreeNode: ...
84 @overload
85 def gpu_invoke(
86 self,
87 kernel_handle: SfgKernelHandle,
88 *,
89 shared_memory_bytes: ExprLike = "0",
90 stream: ExprLike | None = None,
91 ) -> SfgCallTreeNode: ...
93 @overload
94 def gpu_invoke(
95 self,
96 kernel_handle: SfgKernelHandle,
97 *,
98 block_size: ExprLike | None = None,
99 shared_memory_bytes: ExprLike = "0",
100 stream: ExprLike | None = None,
101 ) -> SfgCallTreeNode: ...
103 def gpu_invoke(
104 self,
105 kernel_handle: SfgKernelHandle,
106 shared_memory_bytes: ExprLike = "0",
107 stream: ExprLike | None = None,
108 **kwargs,
109 ) -> SfgCallTreeNode:
110 builder = GpuInvocationBuilder(self._ctx, kernel_handle)
111 builder.shared_memory_bytes = shared_memory_bytes
112 builder.stream = stream
114 return builder(**kwargs)
116 def cuda_invoke(
117 self,
118 kernel_handle: SfgKernelHandle,
119 num_blocks: ExprLike,
120 threads_per_block: ExprLike,
121 stream: ExprLike | None,
122 ):
123 from warnings import warn
125 warn(
126 "cuda_invoke is deprecated and will be removed before version 0.1. "
127 "Use `gpu_invoke` instead.",
128 FutureWarning,
129 )
131 return self.gpu_invoke(
132 kernel_handle,
133 grid_size=num_blocks,
134 block_size=threads_per_block,
135 stream=stream,
136 )
139class GpuInvocationBuilder:
140 def __init__(
141 self,
142 ctx: SfgContext,
143 kernel_handle: SfgKernelHandle,
144 ):
145 self._ctx = ctx
146 self._kernel_handle = kernel_handle
148 ker = kernel_handle.kernel
150 if not isinstance(ker, GpuKernel):
151 raise ValueError(f"Non-GPU kernel was passed to `gpu_invoke`: {ker}")
153 launch_config = ker.get_launch_configuration()
155 self._launch_config = launch_config
157 gpu_api: type[ProvidesGpuRuntimeAPI]
158 match ker.target:
159 case Target.CUDA:
160 gpu_api = CudaAPI
161 case Target.HIP:
162 gpu_api = HipAPI
163 case _:
164 assert False, "unexpected GPU target"
166 self._gpu_api = gpu_api
167 self._dim3 = gpu_api.dim3
169 self._shared_memory_bytes: ExprLike = "0"
170 self._stream: ExprLike | None = None
172 @property
173 def shared_memory_bytes(self) -> ExprLike:
174 return self._shared_memory_bytes
176 @shared_memory_bytes.setter
177 def shared_memory_bytes(self, bs: ExprLike):
178 self._shared_memory_bytes = bs
180 @property
181 def stream(self) -> ExprLike | None:
182 return self._stream
184 @stream.setter
185 def stream(self, s: ExprLike | None):
186 self._stream = s
188 def _render_invocation(
189 self, grid_size: ExprLike, block_size: ExprLike
190 ) -> SfgSequence:
191 stmt_grid_size = make_statements(grid_size)
192 stmt_block_size = make_statements(block_size)
193 stmt_smem = make_statements(self._shared_memory_bytes)
194 stmt_stream = (
195 make_statements(self._stream) if self._stream is not None else None
196 )
198 return make_sequence(
199 "/* clang-format off */",
200 "/* [pystencils-sfg] Formatting may add illegal spaces between angular brackets in `<<< >>>` */",
201 SfgGpuKernelInvocation(
202 self._kernel_handle,
203 stmt_grid_size,
204 stmt_block_size,
205 shared_memory_bytes=stmt_smem,
206 stream=stmt_stream,
207 ),
208 "/* clang-format on */",
209 )
211 def __call__(self, **kwargs: ExprLike) -> SfgCallTreeNode:
212 match self._launch_config:
213 case ManualLaunchConfiguration():
214 return self._invoke_manual(**kwargs)
215 case AutomaticLaunchConfiguration():
216 return self._invoke_automatic(**kwargs)
217 case DynamicBlockSizeLaunchConfiguration():
218 return self._invoke_dynamic(**kwargs)
219 case _:
220 raise ValueError(
221 f"Unexpected launch configuration: {self._launch_config}"
222 )
224 def _invoke_manual(self, grid_size: ExprLike, block_size: ExprLike):
225 assert isinstance(self._launch_config, ManualLaunchConfiguration)
226 return self._render_invocation(grid_size, block_size)
228 def _invoke_automatic(self):
229 assert isinstance(self._launch_config, AutomaticLaunchConfiguration)
231 from .composer import SfgComposer
233 sfg = SfgComposer(self._ctx)
235 grid_size_entries = [
236 self._to_uint32_t(sfg.expr_from_lambda(gs))
237 for gs in self._launch_config._grid_size
238 ]
239 grid_size_var = self._dim3(const=True).var("__grid_size")
241 block_size_entries = [
242 self._to_uint32_t(sfg.expr_from_lambda(bs))
243 for bs in self._launch_config._block_size
244 ]
245 block_size_var = self._dim3(const=True).var("__block_size")
247 nodes = [
248 sfg.init(grid_size_var)(*grid_size_entries),
249 sfg.init(block_size_var)(*block_size_entries),
250 self._render_invocation(grid_size_var, block_size_var),
251 ]
253 return SfgBlock(SfgSequence(nodes))
255 def _invoke_dynamic(self, block_size: ExprLike | None = None):
256 assert isinstance(self._launch_config, DynamicBlockSizeLaunchConfiguration)
258 from .composer import SfgComposer
260 sfg = SfgComposer(self._ctx)
262 block_size_init_args: tuple[ExprLike, ...]
263 if block_size is None:
264 block_size_init_args = tuple(
265 str(bs) for bs in self._launch_config.default_block_size
266 )
267 else:
268 block_size_init_args = (block_size,)
270 block_size_var = self._dim3(const=True).var("__block_size")
272 from ..lang.cpp import std
274 work_items_entries = [
275 sfg.expr_from_lambda(wit) for wit in self._launch_config.num_work_items
276 ]
277 work_items_var = std.tuple("uint32_t", "uint32_t", "uint32_t", const=True).var(
278 "__work_items"
279 )
281 def _div_ceil(a: ExprLike, b: ExprLike):
282 return AugExpr.format("({a} + {b} - 1) / {b}", a=a, b=b)
284 grid_size_entries = [
285 _div_ceil(work_items_var.get(i), bs)
286 for i, bs in enumerate(
287 [
288 block_size_var.x,
289 block_size_var.y,
290 block_size_var.z,
291 ]
292 )
293 ]
294 grid_size_var = self._dim3(const=True).var("__grid_size")
296 nodes = [
297 sfg.init(block_size_var)(*block_size_init_args),
298 sfg.init(work_items_var)(*work_items_entries),
299 sfg.init(grid_size_var)(*grid_size_entries),
300 self._render_invocation(grid_size_var, block_size_var),
301 ]
303 return SfgBlock(SfgSequence(nodes))
305 @staticmethod
306 def _to_uint32_t(expr: AugExpr) -> AugExpr:
307 return AugExpr("uint32_t").format("uint32_t({})", expr)