diff --git a/src/pystencils/backend/jit/gpu_cupy.py b/src/pystencils/backend/jit/gpu_cupy.py index 563a9c06a7261343e6435737c47854a55d54a05e..1dd18767160a626ff7972ebb78f83bb3e64a1efc 100644 --- a/src/pystencils/backend/jit/gpu_cupy.py +++ b/src/pystencils/backend/jit/gpu_cupy.py @@ -41,6 +41,7 @@ class CupyKernelWrapper(KernelWrapper): self._kfunc: GpuKernelFunction = kfunc self._raw_kernel = raw_kernel self._block_size = block_size + self._num_blocks: tuple[int, int, int] | None = None self._args_cache: dict[Any, tuple] = dict() @property @@ -59,6 +60,14 @@ class CupyKernelWrapper(KernelWrapper): def block_size(self, bs: tuple[int, int, int]): self._block_size = bs + @property + def num_blocks(self) -> tuple[int, int, int] | None: + return self._num_blocks + + @num_blocks.setter + def num_blocks(self, nb: tuple[int, int, int] | None): + self._num_blocks = nb + def __call__(self, **kwargs: Any): kernel_args, launch_grid = self._get_cached_args(**kwargs) device = self._get_device(kernel_args) @@ -72,7 +81,7 @@ class CupyKernelWrapper(KernelWrapper): return devices.pop() def _get_cached_args(self, **kwargs): - key = (self._block_size,) + tuple((k, id(v)) for k, v in kwargs.items()) + key = (self._block_size, self._num_blocks) + tuple((k, id(v)) for k, v in kwargs.items()) if key not in self._args_cache: args = self._get_args(**kwargs) @@ -185,25 +194,36 @@ class CupyKernelWrapper(KernelWrapper): symbolic_threads_range = self._kfunc.threads_range - threads_range: list[int] = [ - evaluate_expression(expr, valuation) - for expr in symbolic_threads_range.num_work_items - ] + if self._num_blocks is not None: + launch_grid = LaunchGrid(self._num_blocks, self._block_size) - if symbolic_threads_range.dim < 3: - threads_range += [1] * (3 - symbolic_threads_range.dim) + elif symbolic_threads_range is not None: + threads_range: list[int] = [ + evaluate_expression(expr, valuation) + for expr in symbolic_threads_range.num_work_items + ] - def div_ceil(a, b): - return a // b if a % b == 0 else a // b + 1 + if symbolic_threads_range.dim < 3: + threads_range += [1] * (3 - symbolic_threads_range.dim) - # TODO: Refine this? - grid_size = tuple( - div_ceil(threads, tpb) - for threads, tpb in zip(threads_range, self._block_size) - ) - assert len(grid_size) == 3 + def div_ceil(a, b): + return a // b if a % b == 0 else a // b + 1 + + # TODO: Refine this? + num_blocks = tuple( + div_ceil(threads, tpb) + for threads, tpb in zip(threads_range, self._block_size) + ) + assert len(num_blocks) == 3 + + launch_grid = LaunchGrid(num_blocks, self._block_size) - launch_grid = LaunchGrid(grid_size, self._block_size) + else: + raise JitError( + "Unable to determine launch grid for GPU kernel invocation: " + "No manual grid size was specified, and the number of threads could not " + "be determined automatically." + ) return tuple(args), launch_grid diff --git a/src/pystencils/backend/kernelcreation/iteration_space.py b/src/pystencils/backend/kernelcreation/iteration_space.py index c3c9eaa7a8825ed036cda92b4926d9abbb29a0a9..9df9883ce67d6d856335a7a7a9537f829b7df11e 100644 --- a/src/pystencils/backend/kernelcreation/iteration_space.py +++ b/src/pystencils/backend/kernelcreation/iteration_space.py @@ -196,21 +196,25 @@ class FullIterationSpace(IterationSpace): def dimensions(self): """The dimensions of this iteration space""" return self._dimensions + + @property + def counters(self) -> tuple[PsSymbol, ...]: + return tuple(dim.counter for dim in self._dimensions) @property - def lower(self): + def lower(self) -> tuple[PsExpression, ...]: """Lower limits of each dimension""" - return (dim.start for dim in self._dimensions) + return tuple(dim.start for dim in self._dimensions) @property - def upper(self): + def upper(self) -> tuple[PsExpression, ...]: """Upper limits of each dimension""" - return (dim.stop for dim in self._dimensions) + return tuple(dim.stop for dim in self._dimensions) @property - def steps(self): + def steps(self) -> tuple[PsExpression, ...]: """Iteration steps of each dimension""" - return (dim.step for dim in self._dimensions) + return tuple(dim.step for dim in self._dimensions) @property def archetype_field(self) -> Field | None: diff --git a/src/pystencils/backend/kernelfunction.py b/src/pystencils/backend/kernelfunction.py index 0118c4f40a2b0702c51f39e9bc75fe4dc29cd67e..8f599d57e27ac45c713ab8878602140107e6e628 100644 --- a/src/pystencils/backend/kernelfunction.py +++ b/src/pystencils/backend/kernelfunction.py @@ -261,7 +261,7 @@ class GpuKernelFunction(KernelFunction): def __init__( self, body: PsBlock, - threads_range: GpuThreadsRange, + threads_range: GpuThreadsRange | None, target: Target, name: str, parameters: Sequence[KernelParameter], @@ -275,7 +275,7 @@ class GpuKernelFunction(KernelFunction): self._threads_range = threads_range @property - def threads_range(self) -> GpuThreadsRange: + def threads_range(self) -> GpuThreadsRange | None: return self._threads_range @@ -283,14 +283,16 @@ def create_gpu_kernel_function( ctx: KernelCreationContext, platform: Platform, body: PsBlock, - threads_range: GpuThreadsRange, + threads_range: GpuThreadsRange | None, function_name: str, target_spec: Target, jit: JitBase, ): undef_symbols = collect_undefined_symbols(body) - for threads in threads_range.num_work_items: - undef_symbols |= collect_undefined_symbols(threads) + + if threads_range is not None: + for threads in threads_range.num_work_items: + undef_symbols |= collect_undefined_symbols(threads) params = _get_function_params(ctx, undef_symbols) req_headers = _get_headers(ctx, platform, body) diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index 323dcc5a9b306990a11f85037f305794c8729625..5c8b5b5045b5f889123cdf86686ecd696c6d8bf7 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -1,3 +1,5 @@ +from warnings import warn + from ...types import constify from ..exceptions import MaterializationError from .generic_gpu import GenericGpu, GpuThreadsRange @@ -7,7 +9,7 @@ from ..kernelcreation import ( IterationSpace, FullIterationSpace, SparseIterationSpace, - AstFactory + AstFactory, ) from ..kernelcreation.context import KernelCreationContext @@ -43,6 +45,7 @@ GRID_DIM = [ class CudaPlatform(GenericGpu): + """Platform for CUDA-based GPUs.""" def __init__( self, ctx: KernelCreationContext, indexing_cfg: GpuIndexingConfig | None = None @@ -57,7 +60,7 @@ class CudaPlatform(GenericGpu): def materialize_iteration_space( self, body: PsBlock, ispace: IterationSpace - ) -> tuple[PsBlock, GpuThreadsRange]: + ) -> tuple[PsBlock, GpuThreadsRange | None]: if isinstance(ispace, FullIterationSpace): return self._prepend_dense_translation(body, ispace) elif isinstance(ispace, SparseIterationSpace): @@ -123,13 +126,25 @@ class CudaPlatform(GenericGpu): def _prepend_dense_translation( self, body: PsBlock, ispace: FullIterationSpace - ) -> tuple[PsBlock, GpuThreadsRange]: + ) -> tuple[PsBlock, GpuThreadsRange | None]: dimensions = ispace.dimensions_in_loop_order() - launch_config = GpuThreadsRange.from_ispace(ispace) + + if not self._cfg.manual_launch_grid: + try: + threads_range = GpuThreadsRange.from_ispace(ispace) + except MaterializationError as e: + warn( + str(e.args[0]) + + "\nIf this is intended, set `manual_launch_grid=True` in the code generator configuration.", + UserWarning, + ) + threads_range = None + else: + threads_range = None indexing_decls = [] conds = [] - for i, dim in enumerate(dimensions[::-1]): + for i, dim in enumerate(dimensions): dim.counter.dtype = constify(dim.counter.get_dtype()) ctr = PsExpression.make(dim.counter) @@ -155,7 +170,7 @@ class CudaPlatform(GenericGpu): body.statements = indexing_decls + body.statements ast = body - return ast, launch_config + return ast, threads_range def _prepend_sparse_translation( self, body: PsBlock, ispace: SparseIterationSpace @@ -199,3 +214,72 @@ class CudaPlatform(GenericGpu): block_idx = BLOCK_IDX[coord] thread_idx = THREAD_IDX[coord] return block_idx * block_size + thread_idx + + +# class LinearIndexing: +# """Linear GPU thread indexing. + +# This indexing scheme maps GPU threads to iteration space points in the following way: +# - Starting from the slowest coordinate, each coordinate is mapped to a dimension +# of the GPU grid until just one dimension is left +# - All remaining dimensions of the iteration space are linearly mapped +# onto the fastest launch grid dimension +# """ + +# def __init__( +# self, +# ctx: KernelCreationContext, +# launch_grid_dimensions: int, +# ispace: FullIterationSpace, +# ) -> None: +# if not (0 < launch_grid_dimensions <= 3): +# raise ValueError( +# f"Invalid number of launch grid dimensions: {launch_grid_dimensions}" +# ) + +# self._ctx = ctx + +# self._grid_dims = launch_grid_dimensions +# self._ispace = ispace +# self._ispace_dims = len(ispace.dimensions) + +# self._typify = Typifier(ctx) + +# def get_counter_declarations(self) -> Sequence[PsDeclaration]: +# num_slower_dimensions = min(self._grid_dims, self._ispace_dims) - 1 +# num_fast_dimensions = self._ispace_dims - num_slower_dimensions + +# decls = [] + +# # Slower n dimensions +# for i in range(num_slower_dimensions, 0, -1): +# thread_idx = BLOCK_IDX[i] * BLOCK_DIM[i] + THREAD_IDX[i] +# decls.append(self._make_ctr_decl(self._ispace.dimensions[num_fast_dimensions + i], thread_idx)) + +# # Fastest dimensions +# thread_idx = BLOCK_IDX[0] * BLOCK_DIM[0] + THREAD_IDX[0] + +# if num_fast_dimensions == 1: +# decls.append(self._make_ctr_decl(self._ispace.dimensions[0], thread_idx)) +# else: +# for i in range(num_fast_dimensions, 0, -1): +# decls.append( +# self._make_ctr_decl( +# self._ispace.dimensions[i], +# # ergh... need actual iterations here... +# ) +# ) + + +# def _make_ctr_decl( +# self, dim: FullIterationSpace.Dimension, thread_idx: PsExpression +# ): +# dim.counter.dtype = constify(dim.counter.get_dtype()) + +# ctr = PsExpression.make(dim.counter) +# return self._typify( +# PsDeclaration( +# ctr, +# dim.start + dim.step * PsCast(ctr.get_dtype(), thread_idx), +# ) +# ) diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py index 774b9405cd04b8dc6489cd6b6ae36e4aa563f157..f6b888a4941ee657b41c3dd8837c9b1f8a5059e7 100644 --- a/src/pystencils/backend/platforms/generic_gpu.py +++ b/src/pystencils/backend/platforms/generic_gpu.py @@ -10,6 +10,7 @@ from ..kernelcreation.iteration_space import ( SparseIterationSpace, ) from .platform import Platform +from ..exceptions import MaterializationError class GpuThreadsRange: @@ -56,6 +57,19 @@ class GpuThreadsRange: raise NotImplementedError( f"Cannot create a GPU threads range for an {len(dimensions)}-dimensional iteration space" ) + + from ..ast.analysis import collect_undefined_symbols as collect + + for dim in dimensions: + symbs = collect(dim.start) | collect(dim.stop) | collect(dim.step) + for ctr in ispace.counters: + if ctr in symbs: + raise MaterializationError( + "Unable to construct GPU threads range for iteration space: " + f"Limits of dimension counter {dim.counter.name} " + f"depend on another dimension's counter {ctr.name}" + ) + work_items = [ispace.actual_iterations(dim) for dim in dimensions] return GpuThreadsRange(work_items) @@ -64,5 +78,5 @@ class GenericGpu(Platform): @abstractmethod def materialize_iteration_space( self, block: PsBlock, ispace: IterationSpace - ) -> tuple[PsBlock, GpuThreadsRange]: + ) -> tuple[PsBlock, GpuThreadsRange | None]: pass diff --git a/src/pystencils/config.py b/src/pystencils/config.py index c688530aecd497ddced30cc47e83bf4e314f9ac8..c08ddc16198627adba97169e02724371eca62158 100644 --- a/src/pystencils/config.py +++ b/src/pystencils/config.py @@ -190,6 +190,14 @@ class GpuIndexingConfig: block_size: tuple[int, int, int] | None = None """Desired block size for the execution of GPU kernels. May be overridden later by the runtime system.""" + manual_launch_grid: bool = False + """Always require a manually specified launch grid when running this kernel. + + If set to `True`, the code generator will not attempt to infer the size of + the launch grid from the kernel. + The launch grid will then have to be specified manually at runtime. + """ + sycl_automatic_block_size: bool = True """If set to `True` while generating for `Target.SYCL`, let the SYCL runtime decide on the block size.