diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index ff41d3a68888347e81e9ff65d1359467666f2e32..4f78d344de861261ef89db6e10b51a93026a3dcd 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -1,11 +1,11 @@
 from __future__ import annotations
-from warnings import warn
-from typing import TYPE_CHECKING
+from abc import ABC, abstractmethod
 
-from ...types import constify
+from ...types import constify, deconstify
 from ..exceptions import MaterializationError
 from .generic_gpu import GenericGpu
+from ..memory import PsSymbol
 
 from ..kernelcreation import (
     Typifier,
     IterationSpace,
@@ -29,8 +29,6 @@ from ...types import PsSignedIntegerType, PsIeeeFloatType
 from ..literals import PsLiteral
 from ..functions import PsMathFunction, MathFunctions, CFunction
 
-if TYPE_CHECKING:
-    from ...codegen import GpuThreadsRange
 
 
 int32 = PsSignedIntegerType(width=32, const=False)
@@ -48,18 +46,88 @@ GRID_DIM = [
 ]
 
 
+class DenseThreadIdxMapping(ABC):
+
+    @abstractmethod
+    def __call__(self, ispace: FullIterationSpace) -> dict[PsSymbol, PsExpression]:
+        """Map the current thread index onto a point in the given iteration space.
+
+        Implementations of this method must return an index expression for each dimension counter
+        of the given iteration space.
+        """
+
+
+class Linear3DMapping(DenseThreadIdxMapping):
+    """3D globally linearized mapping, where each thread is assigned a work item according to
+    its location in the global launch grid."""
+
+    def __call__(self, ispace: FullIterationSpace) -> dict[PsSymbol, PsExpression]:
+        if ispace.rank > 3:
+            raise MaterializationError(
+                f"Cannot handle {ispace.rank}-dimensional iteration space "
+                "using the Linear3D GPU thread index mapping."
+            )
+
+        dimensions = ispace.dimensions_in_loop_order()
+        idx_map: dict[PsSymbol, PsExpression] = dict()
+
+        for coord, dim in enumerate(dimensions[::-1]):
+            tid = self._linear_thread_idx(coord)
+            idx_map[dim.counter] = dim.start + dim.step * PsCast(
+                deconstify(dim.counter.get_dtype()), tid
+            )
+
+        return idx_map
+
+    def _linear_thread_idx(self, coord: int):
+        block_size = BLOCK_DIM[coord]
+        block_idx = BLOCK_IDX[coord]
+        thread_idx = THREAD_IDX[coord]
+        return block_idx * block_size + thread_idx
+
+
+class Blockwise4DMapping(DenseThreadIdxMapping):
+    """Blockwise index mapping for up to 4D iteration spaces, where the outer three dimensions
+    are mapped to block indices."""
+
+    _indices_in_loop_order = [  # slowest to fastest
+        BLOCK_IDX[2],
+        BLOCK_IDX[1],
+        BLOCK_IDX[0],
+        THREAD_IDX[0],
+    ]
+
+    def __call__(self, ispace: FullIterationSpace) -> dict[PsSymbol, PsExpression]:
+        if ispace.rank > 4:
+            raise MaterializationError(
+                f"Cannot handle {ispace.rank}-dimensional iteration space "
+                "using the Blockwise4D GPU thread index mapping."
+            )
+
+        dimensions = ispace.dimensions_in_loop_order()
+        idx_map: dict[PsSymbol, PsExpression] = dict()
+
+        for dim, tid in zip(dimensions, self._indices_in_loop_order):
+            idx_map[dim.counter] = dim.start + dim.step * PsCast(
+                deconstify(dim.counter.get_dtype()), tid
+            )
+
+        return idx_map
+
+
 class CudaPlatform(GenericGpu):
     """Platform for CUDA-based GPUs."""
 
     def __init__(
-        self, ctx: KernelCreationContext,
+        self,
+        ctx: KernelCreationContext,
         omit_range_check: bool = False,
-        manual_launch_grid: bool = False,
+        dense_idx_mapping: DenseThreadIdxMapping | None = None,
     ) -> None:
         super().__init__(ctx)
         self._omit_range_check = omit_range_check
-        self._manual_launch_grid = manual_launch_grid
+        self._dense_idx_mapping = dense_idx_mapping
 
         self._typify = Typifier(ctx)
 
 
@@ -69,7 +137,7 @@ class CudaPlatform(GenericGpu):
 
     def materialize_iteration_space(
         self, body: PsBlock, ispace: IterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange | None]:
+    ) -> PsBlock:
         if isinstance(ispace, FullIterationSpace):
             return self._prepend_dense_translation(body, ispace)
         elif isinstance(ispace, SparseIterationSpace):
@@ -141,40 +209,41 @@ class CudaPlatform(GenericGpu):
 
     def _prepend_dense_translation(
         self, body: PsBlock, ispace: FullIterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange | None]:
+    ) -> PsBlock:
         dimensions = ispace.dimensions_in_loop_order()
 
-        if not self._manual_launch_grid:
-            try:
-                threads_range = self.threads_from_ispace(ispace)
-            except MaterializationError as e:
-                warn(
-                    str(e.args[0])
-                    + "\nIf this is intended, set `manual_launch_grid=True` in the code generator configuration.",
-                    UserWarning,
-                )
-                threads_range = None
-        else:
-            threads_range = None
+        # TODO move to codegen
+        # if not self._manual_launch_grid:
+        #     try:
+        #         threads_range = self.threads_from_ispace(ispace)
+        #     except MaterializationError as e:
+        #         warn(
+        #             str(e.args[0])
+        #             + "\nIf this is intended, set `manual_launch_grid=True` in the code generator configuration.",
+        #             UserWarning,
+        #         )
+        #         threads_range = None
+        # else:
+        #     threads_range = None
+
+        idx_mapper = (
+            self._dense_idx_mapping
+            if self._dense_idx_mapping is not None
+            else Linear3DMapping()
+        )
+        ctr_mapping = idx_mapper(ispace)
 
         indexing_decls = []
         conds = []
 
-        for i, dim in enumerate(dimensions[::-1]):
+        for dim in dimensions:
             dim.counter.dtype = constify(dim.counter.get_dtype())
 
-            ctr = PsExpression.make(dim.counter)
+            ctr_expr = PsExpression.make(dim.counter)
             indexing_decls.append(
-                self._typify(
-                    PsDeclaration(
-                        ctr,
-                        dim.start
-                        + dim.step
-                        * PsCast(ctr.get_dtype(), self._linear_thread_idx(i)),
-                    )
-                )
+                self._typify(PsDeclaration(ctr_expr, ctr_mapping[dim.counter]))
             )
             if not self._omit_range_check:
-                conds.append(PsLt(ctr, dim.stop))
+                conds.append(PsLt(ctr_expr, dim.stop))
 
         indexing_decls = indexing_decls[::-1]
@@ -187,16 +256,16 @@ class CudaPlatform(GenericGpu):
         body.statements = indexing_decls + body.statements
         ast = body
 
-        return ast, threads_range
+        return ast
 
     def _prepend_sparse_translation(
         self, body: PsBlock, ispace: SparseIterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange]:
+    ) -> PsBlock:
         factory = AstFactory(self._ctx)
         ispace.sparse_counter.dtype = constify(ispace.sparse_counter.get_dtype())
 
         sparse_ctr = PsExpression.make(ispace.sparse_counter)
-        thread_idx = self._linear_thread_idx(0)
+        thread_idx = BLOCK_IDX[0] * BLOCK_DIM[0] + THREAD_IDX[0]
         sparse_idx_decl = self._typify(
             PsDeclaration(sparse_ctr, PsCast(sparse_ctr.get_dtype(), thread_idx))
         )
@@ -224,10 +293,4 @@ class CudaPlatform(GenericGpu):
         body.statements = [sparse_idx_decl] + body.statements
         ast = body
 
-        return ast, self.threads_from_ispace(ispace)
-
-    def _linear_thread_idx(self, coord: int):
-        block_size = BLOCK_DIM[coord]
-        block_idx = BLOCK_IDX[coord]
-        thread_idx = THREAD_IDX[coord]
-        return block_idx * block_size + thread_idx
+        return ast
diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py
index f22c3c99ba901c12b1e247c85364eb12da26a60b..f66f9aa0ee5ea350ba4304c03ced1ceae346282f 100644
--- a/src/pystencils/backend/platforms/generic_gpu.py
+++ b/src/pystencils/backend/platforms/generic_gpu.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 from typing import TYPE_CHECKING
-from abc import ABC, abstractmethod
+from abc import abstractmethod
 
 from ..ast.expressions import PsExpression
 from ..ast.structural import PsBlock
@@ -16,31 +16,11 @@ if TYPE_CHECKING:
     from ...codegen.kernel import GpuThreadsRange
 
 
-class WorkItemMapping(ABC):
-    """Signature for work-item mappings used to modify the thread index mapping behavior"""
-
-    @abstractmethod
-    def __call__(
-        self,
-        block_idx: tuple[PsExpression, PsExpression, PsExpression],
-        thread_idx: tuple[PsExpression, PsExpression, PsExpression],
-        ispace_rank: int,
-    ) -> tuple[PsExpression, ...]:
-        """Compute a work item index from the current block index, thread index, and iteration space dimensionality.
-
-        Implementations of this method must return a tuple with `ispace_rank` entries,
-        containing expressions for the compressed index of the work item identified by the
-        given GPU block and thread index triples.
-        (The *compressed index* is the work item's index before application
-        of the iteration space's lower limits and strides.)
-        """
-
-
 class GenericGpu(Platform):
     @abstractmethod
     def materialize_iteration_space(
         self, body: PsBlock, ispace: IterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange | None]:
+    ) -> PsBlock:
         pass
 
     @classmethod
diff --git a/src/pystencils/backend/platforms/sycl.py b/src/pystencils/backend/platforms/sycl.py
index e1da9e2237b504c4b6b681d48812e4d079a5b463..7f2bbf9f6bf54a8c9cdbcd4a9989e54b9db66923 100644
--- a/src/pystencils/backend/platforms/sycl.py
+++ b/src/pystencils/backend/platforms/sycl.py
@@ -1,5 +1,4 @@
 from __future__ import annotations
-from typing import TYPE_CHECKING
 
 from ..functions import CFunction, PsMathFunction, MathFunctions
 from ..kernelcreation.iteration_space import (
@@ -29,9 +28,6 @@ from .generic_gpu import GenericGpu
 from ..exceptions import MaterializationError
 from ...types import PsCustomType, PsIeeeFloatType, constify, PsIntegerType
 
-if TYPE_CHECKING:
-    from ...codegen import GpuThreadsRange
-
 
 
 class SyclPlatform(GenericGpu):
@@ -39,7 +35,7 @@ class SyclPlatform(GenericGpu):
         self,
         ctx: KernelCreationContext,
         omit_range_check: bool = False,
-        automatic_block_size: bool = False
+        automatic_block_size: bool = False,
     ):
         super().__init__(ctx)
 
@@ -52,7 +48,7 @@ class SyclPlatform(GenericGpu):
 
     def materialize_iteration_space(
         self, body: PsBlock, ispace: IterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange]:
+    ) -> PsBlock:
         if isinstance(ispace, FullIterationSpace):
             return self._prepend_dense_translation(body, ispace)
         elif isinstance(ispace, SparseIterationSpace):
@@ -113,15 +109,13 @@ class SyclPlatform(GenericGpu):
 
     def _prepend_dense_translation(
         self, body: PsBlock, ispace: FullIterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange]:
+    ) -> PsBlock:
         rank = ispace.rank
         id_type = self._id_type(rank)
         id_symbol = PsExpression.make(self._ctx.get_symbol("id", id_type))
         id_decl = self._id_declaration(rank, id_symbol)
 
         dimensions = ispace.dimensions_in_loop_order()
-        launch_config = self.threads_from_ispace(ispace)
-
         indexing_decls = [id_decl]
         conds = []
 
@@ -153,11 +147,11 @@ class SyclPlatform(GenericGpu):
         body.statements = indexing_decls + body.statements
         ast = body
 
-        return ast, launch_config
+        return ast
 
     def _prepend_sparse_translation(
         self, body: PsBlock, ispace: SparseIterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange]:
+    ) -> PsBlock:
         factory = AstFactory(self._ctx)
 
         id_type = PsCustomType("sycl::id< 1 >", const=True)
@@ -195,7 +189,7 @@ class SyclPlatform(GenericGpu):
         body.statements = [sparse_idx_decl] + body.statements
         ast = body
 
-        return ast, self.threads_from_ispace(ispace)
+        return ast
 
     def _item_type(self, rank: int):
         if not self._automatic_block_size:
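
Usage sketch (not part of the patch): the new dense_idx_mapping hook lets callers choose how GPU thread
indices are mapped onto a dense iteration space; when it is left as None, the CUDA platform falls back to
Linear3DMapping. The snippet below is a minimal sketch of how the hook might be wired up. It assumes that
KernelCreationContext is importable from pystencils.backend.kernelcreation and default-constructible, and
that body (a PsBlock) and ispace (a FullIterationSpace) have already been produced by the kernel analysis
stage; none of these assumptions are shown in this diff.

    from pystencils.backend.kernelcreation import KernelCreationContext
    from pystencils.backend.platforms.cuda import CudaPlatform, Blockwise4DMapping

    ctx = KernelCreationContext()  # assumed: default-constructible context

    # Select the blockwise 4D mapping instead of the default Linear3DMapping.
    platform = CudaPlatform(
        ctx,
        omit_range_check=False,
        dense_idx_mapping=Blockwise4DMapping(),
    )

    # body (PsBlock) and ispace (FullIterationSpace) would come from earlier pipeline
    # stages and are hypothetical here. With this patch, materialize_iteration_space
    # returns only the transformed AST; the launch grid is no longer computed by the
    # platform (see the TODO left in the patch).
    # kernel_body = platform.materialize_iteration_space(body, ispace)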