Coverage for src/pystencilssfg/composer/gpu_composer.py: 91%
100 statements
coverage.py v7.8.0, created at 2025-04-04 07:16 +0000

from __future__ import annotations

from typing import overload

from pystencils.codegen import GpuKernel, Target
from pystencils.codegen.gpu_indexing import (
    ManualLaunchConfiguration,
    AutomaticLaunchConfiguration,
    DynamicBlockSizeLaunchConfiguration,
)

from .mixin import SfgComposerMixIn
from .basic_composer import make_statements, make_sequence

from ..context import SfgContext
from ..ir import (
    SfgKernelHandle,
    SfgCallTreeNode,
    SfgGpuKernelInvocation,
    SfgBlock,
    SfgSequence,
)
from ..lang import ExprLike, AugExpr
from ..lang.gpu import CudaAPI, HipAPI, ProvidesGpuRuntimeAPI


class SfgGpuComposer(SfgComposerMixIn):
    """Composer mix-in providing methods to generate GPU kernel invocations.

    .. function:: gpu_invoke(kernel_handle: SfgKernelHandle, **kwargs)

        Invoke a GPU kernel with launch configuration parameters depending on its code generator configuration.

        The overloads of this method are listed below.
        They all (partially) mirror the CUDA and HIP ``kernel<<< Gs, Bs, Sm, St >>>()`` syntax;
        for details on the launch configuration arguments,
        refer to `Launch Configurations in CUDA`_
        or `Launch Configurations in HIP`_.

    .. function:: gpu_invoke(kernel_handle: SfgKernelHandle, *, grid_size: ExprLike, block_size: ExprLike, shared_memory_bytes: ExprLike = "0", stream: ExprLike | None = None, ) -> SfgCallTreeNode
        :noindex:

        Invoke a GPU kernel with a manual launch grid.

        Requires that the kernel was generated
        with `manual_launch_grid <pystencils.codegen.config.GpuOptions.manual_launch_grid>`
        set to `True`.

    .. function:: gpu_invoke(self, kernel_handle: SfgKernelHandle, *, shared_memory_bytes: ExprLike = "0", stream: ExprLike | None = None, ) -> SfgCallTreeNode
        :noindex:

        Invoke a GPU kernel with an automatic launch grid.

        This signature accepts kernels generated with an indexing scheme that
        causes the launch grid to be determined automatically,
        such as `Blockwise4D <pystencils.codegen.config.GpuIndexingScheme.Blockwise4D>`.

    .. function:: gpu_invoke(self, kernel_handle: SfgKernelHandle, *, block_size: ExprLike | None = None, shared_memory_bytes: ExprLike = "0", stream: ExprLike | None = None, ) -> SfgCallTreeNode
        :noindex:

        Invoke a GPU kernel with a dynamic launch grid.

        This signature accepts kernels generated with an indexing scheme that permits a user-defined
        block size, such as `Linear3D <pystencils.codegen.config.GpuIndexingScheme.Linear3D>`.
        The grid size is calculated automatically by dividing the number of work items in each
        dimension by the block size, rounding up.

    .. _Launch Configurations in CUDA: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#execution-configuration

    .. _Launch Configurations in HIP: https://rocmdocs.amd.com/projects/HIP/en/latest/how-to/hip_cpp_language_extensions.html#calling-global-functions
    """  # NOQA: E501

    @overload
    def gpu_invoke(
        self,
        kernel_handle: SfgKernelHandle,
        *,
        grid_size: ExprLike,
        block_size: ExprLike,
        shared_memory_bytes: ExprLike = "0",
        stream: ExprLike | None = None,
    ) -> SfgCallTreeNode: ...

    @overload
    def gpu_invoke(
        self,
        kernel_handle: SfgKernelHandle,
        *,
        shared_memory_bytes: ExprLike = "0",
        stream: ExprLike | None = None,
    ) -> SfgCallTreeNode: ...

    @overload
    def gpu_invoke(
        self,
        kernel_handle: SfgKernelHandle,
        *,
        block_size: ExprLike | None = None,
        shared_memory_bytes: ExprLike = "0",
        stream: ExprLike | None = None,
    ) -> SfgCallTreeNode: ...

    def gpu_invoke(
        self,
        kernel_handle: SfgKernelHandle,
        shared_memory_bytes: ExprLike = "0",
        stream: ExprLike | None = None,
        **kwargs,
    ) -> SfgCallTreeNode:
        builder = GpuInvocationBuilder(self._ctx, kernel_handle)
        builder.shared_memory_bytes = shared_memory_bytes
        builder.stream = stream

        return builder(**kwargs)

    def cuda_invoke(
        self,
        kernel_handle: SfgKernelHandle,
        num_blocks: ExprLike,
        threads_per_block: ExprLike,
        stream: ExprLike | None,
    ):
        from warnings import warn

        warn(
            "cuda_invoke is deprecated and will be removed before version 0.1. "
            "Use `gpu_invoke` instead.",
            FutureWarning,
        )

        return self.gpu_invoke(
            kernel_handle,
            grid_size=num_blocks,
            block_size=threads_per_block,
            stream=stream,
        )


class GpuInvocationBuilder:
    """Builds the call tree for a GPU kernel invocation, dispatching on the kernel's launch configuration."""

    def __init__(
        self,
        ctx: SfgContext,
        kernel_handle: SfgKernelHandle,
    ):
        self._ctx = ctx
        self._kernel_handle = kernel_handle

        ker = kernel_handle.kernel

        if not isinstance(ker, GpuKernel):
            raise ValueError(f"Non-GPU kernel was passed to `gpu_invoke`: {ker}")

        launch_config = ker.get_launch_configuration()

        self._launch_config = launch_config

        # Select the GPU runtime API (CUDA or HIP) matching the kernel's target
        gpu_api: type[ProvidesGpuRuntimeAPI]
        match ker.target:
            case Target.CUDA:
                gpu_api = CudaAPI
            case Target.HIP:
                gpu_api = HipAPI
            case _:
                assert False, "unexpected GPU target"

        self._gpu_api = gpu_api
        self._dim3 = gpu_api.dim3

        self._shared_memory_bytes: ExprLike = "0"
        self._stream: ExprLike | None = None

    @property
    def shared_memory_bytes(self) -> ExprLike:
        return self._shared_memory_bytes

    @shared_memory_bytes.setter
    def shared_memory_bytes(self, bs: ExprLike):
        self._shared_memory_bytes = bs

    @property
    def stream(self) -> ExprLike | None:
        return self._stream

    @stream.setter
    def stream(self, s: ExprLike | None):
        self._stream = s

    def _render_invocation(
        self, grid_size: ExprLike, block_size: ExprLike
    ) -> SfgSequence:
        stmt_grid_size = make_statements(grid_size)
        stmt_block_size = make_statements(block_size)
        stmt_smem = make_statements(self._shared_memory_bytes)
        stmt_stream = (
            make_statements(self._stream) if self._stream is not None else None
        )

        return make_sequence(
            "/* clang-format off */",
            "/* [pystencils-sfg] Formatting may add illegal spaces between angular brackets in `<<< >>>` */",
            SfgGpuKernelInvocation(
                self._kernel_handle,
                stmt_grid_size,
                stmt_block_size,
                shared_memory_bytes=stmt_smem,
                stream=stmt_stream,
            ),
            "/* clang-format on */",
        )
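    # The rendered sequence prints roughly as follows (illustrative; the kernel name
    # and argument list are taken from the kernel handle, and the stream argument is
    # omitted if no stream was set):
    #
    #     /* clang-format off */
    #     /* [pystencils-sfg] Formatting may add illegal spaces between angular brackets in `<<< >>>` */
    #     <kernel><<< <grid_size>, <block_size>, <shared_memory_bytes>, <stream> >>>( <args> );
    #     /* clang-format on */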

    def __call__(self, **kwargs: ExprLike) -> SfgCallTreeNode:
        match self._launch_config:
            case ManualLaunchConfiguration():
                return self._invoke_manual(**kwargs)
            case AutomaticLaunchConfiguration():
                return self._invoke_automatic(**kwargs)
            case DynamicBlockSizeLaunchConfiguration():
                return self._invoke_dynamic(**kwargs)
            case _:
                raise ValueError(
                    f"Unexpected launch configuration: {self._launch_config}"
                )
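        # The keyword arguments accepted here mirror the overloads of `gpu_invoke`:
        # a ManualLaunchConfiguration requires `grid_size` and `block_size`,
        # an AutomaticLaunchConfiguration accepts neither, and a
        # DynamicBlockSizeLaunchConfiguration takes an optional `block_size`.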

    def _invoke_manual(self, grid_size: ExprLike, block_size: ExprLike):
        assert isinstance(self._launch_config, ManualLaunchConfiguration)
        return self._render_invocation(grid_size, block_size)

    def _invoke_automatic(self):
        assert isinstance(self._launch_config, AutomaticLaunchConfiguration)

        from .composer import SfgComposer

        sfg = SfgComposer(self._ctx)

        grid_size_entries = [
            self._to_uint32_t(sfg.expr_from_lambda(gs))
            for gs in self._launch_config._grid_size
        ]
        grid_size_var = self._dim3(const=True).var("__grid_size")

        block_size_entries = [
            self._to_uint32_t(sfg.expr_from_lambda(bs))
            for bs in self._launch_config._block_size
        ]
        block_size_var = self._dim3(const=True).var("__block_size")

        nodes = [
            sfg.init(grid_size_var)(*grid_size_entries),
            sfg.init(block_size_var)(*block_size_entries),
            self._render_invocation(grid_size_var, block_size_var),
        ]

        return SfgBlock(SfgSequence(nodes))

    def _invoke_dynamic(self, block_size: ExprLike | None = None):
        assert isinstance(self._launch_config, DynamicBlockSizeLaunchConfiguration)

        from .composer import SfgComposer

        sfg = SfgComposer(self._ctx)

        block_size_init_args: tuple[ExprLike, ...]
        if block_size is None:
            block_size_init_args = tuple(
                str(bs) for bs in self._launch_config.default_block_size
            )
        else:
            block_size_init_args = (block_size,)

        block_size_var = self._dim3(const=True).var("__block_size")

        from ..lang.cpp import std

        work_items_entries = [
            sfg.expr_from_lambda(wit) for wit in self._launch_config.num_work_items
        ]
        work_items_var = std.tuple("uint32_t", "uint32_t", "uint32_t", const=True).var(
            "__work_items"
        )

        def _div_ceil(a: ExprLike, b: ExprLike):
            return AugExpr.format("({a} + {b} - 1) / {b}", a=a, b=b)
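        # e.g. 100 work items at a block size of 64 yield (100 + 64 - 1) / 64 = 2 blocks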

        grid_size_entries = [
            _div_ceil(work_items_var.get(i), bs)
            for i, bs in enumerate(
                [
                    block_size_var.x,
                    block_size_var.y,
                    block_size_var.z,
                ]
            )
        ]
        grid_size_var = self._dim3(const=True).var("__grid_size")

        nodes = [
            sfg.init(block_size_var)(*block_size_init_args),
            sfg.init(work_items_var)(*work_items_entries),
            sfg.init(grid_size_var)(*grid_size_entries),
            self._render_invocation(grid_size_var, block_size_var),
        ]

        return SfgBlock(SfgSequence(nodes))
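        # The returned block emits roughly the following C++ (a sketch; the exact
        # spellings of `dim3` and the tuple type depend on the selected runtime API):
        #
        #     {
        #         const dim3 __block_size { ... };
        #         const std::tuple< uint32_t, uint32_t, uint32_t > __work_items { ... };
        #         const dim3 __grid_size { ... };
        #         /* kernel invocation rendered by _render_invocation */
        #     }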

    @staticmethod
    def _to_uint32_t(expr: AugExpr) -> AugExpr:
        return AugExpr("uint32_t").format("uint32_t({})", expr)