Commit 99894895 authored by Frederik Hennig

extract thread idx mapping from cuda platform. remove threads range computation from cuda and sycl platforms.
parent 9a15d1f7
1 merge request: !449 GPU Indexing Schemes and Launch Configurations
Pipeline #73859 failed
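
The diffs below make the dense thread-index mapping of the CUDA platform an exchangeable strategy object and drop the launch-grid (GpuThreadsRange) computation from the CUDA and SYCL platforms; per the TODO left in the code, that computation is intended to move into the code generation driver. As a minimal, hypothetical usage sketch of the new CudaPlatform signature (how ctx, body and ispace are obtained is assumed here and is not part of this commit):

# Hypothetical usage sketch based on the new CudaPlatform signature in this commit.
# ctx, body and ispace are assumed to be provided by the surrounding code generation driver.
ctx: KernelCreationContext = ...
body: PsBlock = ...
ispace: FullIterationSpace = ...

# Default behaviour: Linear3DMapping is used when no mapping is passed.
platform = CudaPlatform(ctx)

# Selecting the blockwise scheme explicitly instead:
platform = CudaPlatform(ctx, dense_idx_mapping=Blockwise4DMapping())

# materialize_iteration_space now returns only the transformed body;
# the launch configuration (formerly GpuThreadsRange) is no longer returned here.
body = platform.materialize_iteration_space(body, ispace)

Passing no mapping keeps the previous indexing behaviour, since _prepend_dense_translation falls back to Linear3DMapping.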
[Changed file 1: CUDA platform (CudaPlatform)]

 from __future__ import annotations
-from warnings import warn
+from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING
-from ...types import constify
+from ...types import constify, deconstify
 from ..exceptions import MaterializationError
 from .generic_gpu import GenericGpu
+from ..memory import PsSymbol
 from ..kernelcreation import (
     Typifier,
     IterationSpace,
@@ -29,8 +29,6 @@ from ...types import PsSignedIntegerType, PsIeeeFloatType
 from ..literals import PsLiteral
 from ..functions import PsMathFunction, MathFunctions, CFunction

-if TYPE_CHECKING:
-    from ...codegen import GpuThreadsRange

 int32 = PsSignedIntegerType(width=32, const=False)
@@ -48,18 +46,88 @@ GRID_DIM = [
 ]

+class DenseThreadIdxMapping(ABC):
+
+    @abstractmethod
+    def __call__(self, ispace: FullIterationSpace) -> dict[PsSymbol, PsExpression]:
+        """Map the current thread index onto a point in the given iteration space.
+
+        Implementations of this method must return a declaration for each dimension counter
+        of the given iteration space.
+        """
+
+
+class Linear3DMapping(DenseThreadIdxMapping):
+    """3D globally linearized mapping, where each thread is assigned a work item according to
+    its location in the global launch grid."""
+
+    def __call__(self, ispace: FullIterationSpace) -> dict[PsSymbol, PsExpression]:
+        if ispace.rank > 3:
+            raise MaterializationError(
+                f"Cannot handle {ispace.rank}-dimensional iteration space "
+                "using the Linear3D GPU thread index mapping."
+            )
+
+        dimensions = ispace.dimensions_in_loop_order()
+        idx_map: dict[PsSymbol, PsExpression] = dict()
+
+        for coord, dim in enumerate(dimensions[::-1]):
+            tid = self._linear_thread_idx(coord)
+            idx_map[dim.counter] = dim.start + dim.step * PsCast(
+                deconstify(dim.counter.get_dtype()), tid
+            )
+
+        return idx_map
+
+    def _linear_thread_idx(self, coord: int):
+        block_size = BLOCK_DIM[coord]
+        block_idx = BLOCK_IDX[coord]
+        thread_idx = THREAD_IDX[coord]
+        return block_idx * block_size + thread_idx
+
+
+class Blockwise4DMapping(DenseThreadIdxMapping):
+    """Blockwise index mapping for up to 4D iteration spaces, where the outer three dimensions
+    are mapped to block indices."""
+
+    _indices_in_loop_order = [  # slowest to fastest
+        BLOCK_IDX[2],
+        BLOCK_IDX[1],
+        BLOCK_IDX[0],
+        THREAD_IDX[0],
+    ]
+
+    def __call__(self, ispace: FullIterationSpace) -> dict[PsSymbol, PsExpression]:
+        if ispace.rank > 4:
+            raise MaterializationError(
+                f"Cannot handle {ispace.rank}-dimensional iteration space "
+                "using the Blockwise4D GPU thread index mapping."
+            )
+
+        dimensions = ispace.dimensions_in_loop_order()
+        idx_map: dict[PsSymbol, PsExpression] = dict()
+
+        for dim, tid in zip(dimensions, self._indices_in_loop_order):
+            idx_map[dim.counter] = dim.start + dim.step * PsCast(
+                deconstify(dim.counter.get_dtype()), tid
+            )
+
+        return idx_map
+
+
 class CudaPlatform(GenericGpu):
     """Platform for CUDA-based GPUs."""

     def __init__(
-        self, ctx: KernelCreationContext,
+        self,
+        ctx: KernelCreationContext,
         omit_range_check: bool = False,
-        manual_launch_grid: bool = False,
+        dense_idx_mapping: DenseThreadIdxMapping | None = None,
     ) -> None:
         super().__init__(ctx)
         self._omit_range_check = omit_range_check
-        self._manual_launch_grid = manual_launch_grid
+        self._dense_idx_mapping = dense_idx_mapping
         self._typify = Typifier(ctx)
@@ -69,7 +137,7 @@ class CudaPlatform(GenericGpu):
     def materialize_iteration_space(
         self, body: PsBlock, ispace: IterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange | None]:
+    ) -> PsBlock:
         if isinstance(ispace, FullIterationSpace):
             return self._prepend_dense_translation(body, ispace)
         elif isinstance(ispace, SparseIterationSpace):
@@ -141,40 +209,41 @@ class CudaPlatform(GenericGpu):
     def _prepend_dense_translation(
         self, body: PsBlock, ispace: FullIterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange | None]:
+    ) -> PsBlock:
         dimensions = ispace.dimensions_in_loop_order()

-        if not self._manual_launch_grid:
-            try:
-                threads_range = self.threads_from_ispace(ispace)
-            except MaterializationError as e:
-                warn(
-                    str(e.args[0])
-                    + "\nIf this is intended, set `manual_launch_grid=True` in the code generator configuration.",
-                    UserWarning,
-                )
-                threads_range = None
-        else:
-            threads_range = None
+        # TODO move to codegen
+        # if not self._manual_launch_grid:
+        #     try:
+        #         threads_range = self.threads_from_ispace(ispace)
+        #     except MaterializationError as e:
+        #         warn(
+        #             str(e.args[0])
+        #             + "\nIf this is intended, set `manual_launch_grid=True` in the code generator configuration.",
+        #             UserWarning,
+        #         )
+        #         threads_range = None
+        #     else:
+        #         threads_range = None
+
+        idx_mapper = (
+            self._dense_idx_mapping
+            if self._dense_idx_mapping is not None
+            else Linear3DMapping()
+        )
+        ctr_mapping = idx_mapper(ispace)

         indexing_decls = []
         conds = []
-        for i, dim in enumerate(dimensions[::-1]):
+        for dim in dimensions:
             dim.counter.dtype = constify(dim.counter.get_dtype())
-            ctr = PsExpression.make(dim.counter)
+            ctr_expr = PsExpression.make(dim.counter)
             indexing_decls.append(
-                self._typify(
-                    PsDeclaration(
-                        ctr,
-                        dim.start
-                        + dim.step
-                        * PsCast(ctr.get_dtype(), self._linear_thread_idx(i)),
-                    )
-                )
+                self._typify(PsDeclaration(ctr_expr, ctr_mapping[dim.counter]))
             )
             if not self._omit_range_check:
-                conds.append(PsLt(ctr, dim.stop))
+                conds.append(PsLt(ctr_expr, dim.stop))

         indexing_decls = indexing_decls[::-1]
@@ -187,16 +256,16 @@ class CudaPlatform(GenericGpu):
         body.statements = indexing_decls + body.statements
         ast = body
-        return ast, threads_range
+        return ast

     def _prepend_sparse_translation(
         self, body: PsBlock, ispace: SparseIterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange]:
+    ) -> PsBlock:
         factory = AstFactory(self._ctx)
         ispace.sparse_counter.dtype = constify(ispace.sparse_counter.get_dtype())
         sparse_ctr = PsExpression.make(ispace.sparse_counter)
-        thread_idx = self._linear_thread_idx(0)
+        thread_idx = BLOCK_IDX[0] * BLOCK_DIM[0] + THREAD_IDX[0]
         sparse_idx_decl = self._typify(
             PsDeclaration(sparse_ctr, PsCast(sparse_ctr.get_dtype(), thread_idx))
         )
@@ -224,10 +293,4 @@ class CudaPlatform(GenericGpu):
         body.statements = [sparse_idx_decl] + body.statements
         ast = body
-        return ast, self.threads_from_ispace(ispace)
+        return ast

-    def _linear_thread_idx(self, coord: int):
-        block_size = BLOCK_DIM[coord]
-        block_idx = BLOCK_IDX[coord]
-        thread_idx = THREAD_IDX[coord]
-        return block_idx * block_size + thread_idx
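
Since the dense index mapping is now a strategy object with a single __call__(ispace) hook, further indexing schemes can be plugged in without touching CudaPlatform itself. The class below is a hedged sketch of a hypothetical SingleBlockMapping, not part of this commit: it mirrors Linear3DMapping but omits the block offset, as one might use for kernels that are always launched with a single block. It assumes the same module-level names (FullIterationSpace, PsSymbol, PsExpression, PsCast, deconstify, THREAD_IDX, MaterializationError) that the mappings above rely on.

class SingleBlockMapping(DenseThreadIdxMapping):
    """Hypothetical mapping sketch: derive each dimension counter from the
    thread index alone, with no block offset (single-block launches)."""

    def __call__(self, ispace: FullIterationSpace) -> dict[PsSymbol, PsExpression]:
        if ispace.rank > 3:
            raise MaterializationError(
                f"Cannot handle {ispace.rank}-dimensional iteration space "
                "using the SingleBlock GPU thread index mapping."
            )

        dimensions = ispace.dimensions_in_loop_order()
        idx_map: dict[PsSymbol, PsExpression] = dict()

        # Same structure as Linear3DMapping above, but the work item index is just threadIdx.
        for coord, dim in enumerate(dimensions[::-1]):
            tid = THREAD_IDX[coord]
            idx_map[dim.counter] = dim.start + dim.step * PsCast(
                deconstify(dim.counter.get_dtype()), tid
            )

        return idx_map

An instance would then be passed to the platform as CudaPlatform(ctx, dense_idx_mapping=SingleBlockMapping()), exactly like the built-in mappings.
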
[Changed file 2: generic GPU platform base (GenericGpu)]

 from __future__ import annotations
 from typing import TYPE_CHECKING

-from abc import ABC, abstractmethod
+from abc import abstractmethod

 from ..ast.expressions import PsExpression
 from ..ast.structural import PsBlock
@@ -16,31 +16,11 @@ if TYPE_CHECKING:
     from ...codegen.kernel import GpuThreadsRange

-class WorkItemMapping(ABC):
-    """Signature for work-item mappings used to modify the thread index mapping behavior"""
-
-    @abstractmethod
-    def __call__(
-        self,
-        block_idx: tuple[PsExpression, PsExpression, PsExpression],
-        thread_idx: tuple[PsExpression, PsExpression, PsExpression],
-        ispace_rank: int,
-    ) -> tuple[PsExpression, ...]:
-        """Compute a work item index from the current block index, thread index, and iteration space dimensionality.
-
-        Implementations of this method must return a tuple with `ispace_rank` entries,
-        containing expressions for the compressed index of the work item identified by the
-        given GPU block and thread index triples.
-        (The *compressed index* is the work item's index before application
-        of the iteration space's lower limits and strides.)
-        """
-
-
 class GenericGpu(Platform):
     @abstractmethod
     def materialize_iteration_space(
         self, body: PsBlock, ispace: IterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange | None]:
+    ) -> PsBlock:
         pass

     @classmethod
 ...
[Changed file 3: SYCL platform (SyclPlatform)]

 from __future__ import annotations
-from typing import TYPE_CHECKING

 from ..functions import CFunction, PsMathFunction, MathFunctions
 from ..kernelcreation.iteration_space import (
@@ -29,9 +28,6 @@ from .generic_gpu import GenericGpu
 from ..exceptions import MaterializationError
 from ...types import PsCustomType, PsIeeeFloatType, constify, PsIntegerType

-if TYPE_CHECKING:
-    from ...codegen import GpuThreadsRange

 class SyclPlatform(GenericGpu):
@@ -39,7 +35,7 @@ class SyclPlatform(GenericGpu):
         self,
         ctx: KernelCreationContext,
         omit_range_check: bool = False,
-        automatic_block_size: bool = False
+        automatic_block_size: bool = False,
     ):
         super().__init__(ctx)
@@ -52,7 +48,7 @@ class SyclPlatform(GenericGpu):
     def materialize_iteration_space(
         self, body: PsBlock, ispace: IterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange]:
+    ) -> PsBlock:
         if isinstance(ispace, FullIterationSpace):
             return self._prepend_dense_translation(body, ispace)
         elif isinstance(ispace, SparseIterationSpace):
@@ -113,15 +109,13 @@ class SyclPlatform(GenericGpu):
     def _prepend_dense_translation(
         self, body: PsBlock, ispace: FullIterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange]:
+    ) -> PsBlock:
         rank = ispace.rank
         id_type = self._id_type(rank)
         id_symbol = PsExpression.make(self._ctx.get_symbol("id", id_type))
         id_decl = self._id_declaration(rank, id_symbol)
         dimensions = ispace.dimensions_in_loop_order()
-        launch_config = self.threads_from_ispace(ispace)

         indexing_decls = [id_decl]
         conds = []
@@ -153,11 +147,11 @@ class SyclPlatform(GenericGpu):
         body.statements = indexing_decls + body.statements
         ast = body
-        return ast, launch_config
+        return ast

     def _prepend_sparse_translation(
         self, body: PsBlock, ispace: SparseIterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange]:
+    ) -> PsBlock:
         factory = AstFactory(self._ctx)

         id_type = PsCustomType("sycl::id< 1 >", const=True)
@@ -195,7 +189,7 @@ class SyclPlatform(GenericGpu):
         body.statements = [sparse_idx_decl] + body.statements
         ast = body
-        return ast, self.threads_from_ispace(ispace)
+        return ast

     def _item_type(self, rank: int):
         if not self._automatic_block_size:
 ...