Commit 99894895 authored by Frederik Hennig

extract thread idx mapping from cuda platform. remove threads range computation from cuda and sycl platforms.
parent 9a15d1f7
1 merge request: !449 GPU Indexing Schemes and Launch Configurations
Pipeline #73859 failed
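
The diffs below make the dense thread-index mapping of the CUDA platform an exchangeable strategy object and drop the launch-grid (GpuThreadsRange) computation from the CUDA and SYCL platforms; per the TODO left in the code, that computation is intended to move into the code generation driver. As a minimal, hypothetical usage sketch of the new CudaPlatform signature (how ctx, body and ispace are obtained is assumed here and is not part of this commit):

# Hypothetical usage sketch based on the new CudaPlatform signature in this commit.
# ctx, body and ispace are assumed to be provided by the surrounding code generation driver.
ctx: KernelCreationContext = ...
body: PsBlock = ...
ispace: FullIterationSpace = ...

# Default behaviour: Linear3DMapping is used when no mapping is passed.
platform = CudaPlatform(ctx)

# Selecting the blockwise scheme explicitly instead:
platform = CudaPlatform(ctx, dense_idx_mapping=Blockwise4DMapping())

# materialize_iteration_space now returns only the transformed body;
# the launch configuration (formerly GpuThreadsRange) is no longer returned here.
body = platform.materialize_iteration_space(body, ispace)

Passing no mapping keeps the previous indexing behaviour, since _prepend_dense_translation falls back to Linear3DMapping.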
[Changed file 1: CUDA platform (CudaPlatform)]

 from __future__ import annotations
-from warnings import warn
+from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING
-from ...types import constify
+from ...types import constify, deconstify
 from ..exceptions import MaterializationError
 from .generic_gpu import GenericGpu
+from ..memory import PsSymbol
 from ..kernelcreation import (
     Typifier,
     IterationSpace,
@@ -29,8 +29,6 @@ from ...types import PsSignedIntegerType, PsIeeeFloatType
 from ..literals import PsLiteral
 from ..functions import PsMathFunction, MathFunctions, CFunction

-if TYPE_CHECKING:
-    from ...codegen import GpuThreadsRange

 int32 = PsSignedIntegerType(width=32, const=False)
@@ -48,18 +46,88 @@ GRID_DIM = [
 ]

+class DenseThreadIdxMapping(ABC):
+
+    @abstractmethod
+    def __call__(self, ispace: FullIterationSpace) -> dict[PsSymbol, PsExpression]:
+        """Map the current thread index onto a point in the given iteration space.
+
+        Implementations of this method must return a declaration for each dimension counter
+        of the given iteration space.
+        """
+
+
+class Linear3DMapping(DenseThreadIdxMapping):
+    """3D globally linearized mapping, where each thread is assigned a work item according to
+    its location in the global launch grid."""
+
+    def __call__(self, ispace: FullIterationSpace) -> dict[PsSymbol, PsExpression]:
+        if ispace.rank > 3:
+            raise MaterializationError(
+                f"Cannot handle {ispace.rank}-dimensional iteration space "
+                "using the Linear3D GPU thread index mapping."
+            )
+
+        dimensions = ispace.dimensions_in_loop_order()
+        idx_map: dict[PsSymbol, PsExpression] = dict()
+
+        for coord, dim in enumerate(dimensions[::-1]):
+            tid = self._linear_thread_idx(coord)
+            idx_map[dim.counter] = dim.start + dim.step * PsCast(
+                deconstify(dim.counter.get_dtype()), tid
+            )
+
+        return idx_map
+
+    def _linear_thread_idx(self, coord: int):
+        block_size = BLOCK_DIM[coord]
+        block_idx = BLOCK_IDX[coord]
+        thread_idx = THREAD_IDX[coord]
+        return block_idx * block_size + thread_idx
+
+
+class Blockwise4DMapping(DenseThreadIdxMapping):
+    """Blockwise index mapping for up to 4D iteration spaces, where the outer three dimensions
+    are mapped to block indices."""
+
+    _indices_in_loop_order = [  # slowest to fastest
+        BLOCK_IDX[2],
+        BLOCK_IDX[1],
+        BLOCK_IDX[0],
+        THREAD_IDX[0],
+    ]
+
+    def __call__(self, ispace: FullIterationSpace) -> dict[PsSymbol, PsExpression]:
+        if ispace.rank > 4:
+            raise MaterializationError(
+                f"Cannot handle {ispace.rank}-dimensional iteration space "
+                "using the Blockwise4D GPU thread index mapping."
+            )
+
+        dimensions = ispace.dimensions_in_loop_order()
+        idx_map: dict[PsSymbol, PsExpression] = dict()
+
+        for dim, tid in zip(dimensions, self._indices_in_loop_order):
+            idx_map[dim.counter] = dim.start + dim.step * PsCast(
+                deconstify(dim.counter.get_dtype()), tid
+            )
+
+        return idx_map
+
+
 class CudaPlatform(GenericGpu):
     """Platform for CUDA-based GPUs."""

     def __init__(
-        self, ctx: KernelCreationContext,
+        self,
+        ctx: KernelCreationContext,
         omit_range_check: bool = False,
-        manual_launch_grid: bool = False,
+        dense_idx_mapping: DenseThreadIdxMapping | None = None,
     ) -> None:
         super().__init__(ctx)
         self._omit_range_check = omit_range_check
-        self._manual_launch_grid = manual_launch_grid
+        self._dense_idx_mapping = dense_idx_mapping
         self._typify = Typifier(ctx)
@@ -69,7 +137,7 @@ class CudaPlatform(GenericGpu):
     def materialize_iteration_space(
         self, body: PsBlock, ispace: IterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange | None]:
+    ) -> PsBlock:
         if isinstance(ispace, FullIterationSpace):
             return self._prepend_dense_translation(body, ispace)
         elif isinstance(ispace, SparseIterationSpace):
@@ -141,40 +209,41 @@ class CudaPlatform(GenericGpu):
     def _prepend_dense_translation(
         self, body: PsBlock, ispace: FullIterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange | None]:
+    ) -> PsBlock:
         dimensions = ispace.dimensions_in_loop_order()

-        if not self._manual_launch_grid:
-            try:
-                threads_range = self.threads_from_ispace(ispace)
-            except MaterializationError as e:
-                warn(
-                    str(e.args[0])
-                    + "\nIf this is intended, set `manual_launch_grid=True` in the code generator configuration.",
-                    UserWarning,
-                )
-                threads_range = None
-        else:
-            threads_range = None
+        # TODO move to codegen
+        # if not self._manual_launch_grid:
+        #     try:
+        #         threads_range = self.threads_from_ispace(ispace)
+        #     except MaterializationError as e:
+        #         warn(
+        #             str(e.args[0])
+        #             + "\nIf this is intended, set `manual_launch_grid=True` in the code generator configuration.",
+        #             UserWarning,
+        #         )
+        #         threads_range = None
+        #     else:
+        #         threads_range = None
+
+        idx_mapper = (
+            self._dense_idx_mapping
+            if self._dense_idx_mapping is not None
+            else Linear3DMapping()
+        )
+        ctr_mapping = idx_mapper(ispace)

         indexing_decls = []
         conds = []
-        for i, dim in enumerate(dimensions[::-1]):
+        for dim in dimensions:
             dim.counter.dtype = constify(dim.counter.get_dtype())
-            ctr = PsExpression.make(dim.counter)
+            ctr_expr = PsExpression.make(dim.counter)
             indexing_decls.append(
-                self._typify(
-                    PsDeclaration(
-                        ctr,
-                        dim.start
-                        + dim.step
-                        * PsCast(ctr.get_dtype(), self._linear_thread_idx(i)),
-                    )
-                )
+                self._typify(PsDeclaration(ctr_expr, ctr_mapping[dim.counter]))
             )
             if not self._omit_range_check:
-                conds.append(PsLt(ctr, dim.stop))
+                conds.append(PsLt(ctr_expr, dim.stop))

         indexing_decls = indexing_decls[::-1]
@@ -187,16 +256,16 @@ class CudaPlatform(GenericGpu):
         body.statements = indexing_decls + body.statements
         ast = body
-        return ast, threads_range
+        return ast

     def _prepend_sparse_translation(
         self, body: PsBlock, ispace: SparseIterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange]:
+    ) -> PsBlock:
         factory = AstFactory(self._ctx)
         ispace.sparse_counter.dtype = constify(ispace.sparse_counter.get_dtype())
         sparse_ctr = PsExpression.make(ispace.sparse_counter)
-        thread_idx = self._linear_thread_idx(0)
+        thread_idx = BLOCK_IDX[0] * BLOCK_DIM[0] + THREAD_IDX[0]
         sparse_idx_decl = self._typify(
             PsDeclaration(sparse_ctr, PsCast(sparse_ctr.get_dtype(), thread_idx))
         )
@@ -224,10 +293,4 @@ class CudaPlatform(GenericGpu):
         body.statements = [sparse_idx_decl] + body.statements
         ast = body
-        return ast, self.threads_from_ispace(ispace)
+        return ast

-    def _linear_thread_idx(self, coord: int):
-        block_size = BLOCK_DIM[coord]
-        block_idx = BLOCK_IDX[coord]
-        thread_idx = THREAD_IDX[coord]
-        return block_idx * block_size + thread_idx
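
Since the dense index mapping is now a strategy object with a single __call__(ispace) hook, further indexing schemes can be plugged in without touching CudaPlatform itself. The class below is a hedged sketch of a hypothetical SingleBlockMapping, not part of this commit: it mirrors Linear3DMapping but omits the block offset, as one might use for kernels that are always launched with a single block. It assumes the same module-level names (FullIterationSpace, PsSymbol, PsExpression, PsCast, deconstify, THREAD_IDX, MaterializationError) that the mappings above rely on.

class SingleBlockMapping(DenseThreadIdxMapping):
    """Hypothetical mapping sketch: derive each dimension counter from the
    thread index alone, with no block offset (single-block launches)."""

    def __call__(self, ispace: FullIterationSpace) -> dict[PsSymbol, PsExpression]:
        if ispace.rank > 3:
            raise MaterializationError(
                f"Cannot handle {ispace.rank}-dimensional iteration space "
                "using the SingleBlock GPU thread index mapping."
            )

        dimensions = ispace.dimensions_in_loop_order()
        idx_map: dict[PsSymbol, PsExpression] = dict()

        # Same structure as Linear3DMapping above, but the work item index is just threadIdx.
        for coord, dim in enumerate(dimensions[::-1]):
            tid = THREAD_IDX[coord]
            idx_map[dim.counter] = dim.start + dim.step * PsCast(
                deconstify(dim.counter.get_dtype()), tid
            )

        return idx_map

An instance would then be passed to the platform as CudaPlatform(ctx, dense_idx_mapping=SingleBlockMapping()), exactly like the built-in mappings.
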
[Changed file 2: generic GPU platform base (GenericGpu)]

 from __future__ import annotations
 from typing import TYPE_CHECKING

-from abc import ABC, abstractmethod
+from abc import abstractmethod

 from ..ast.expressions import PsExpression
 from ..ast.structural import PsBlock
@@ -16,31 +16,11 @@ if TYPE_CHECKING:
     from ...codegen.kernel import GpuThreadsRange

-class WorkItemMapping(ABC):
-    """Signature for work-item mappings used to modify the thread index mapping behavior"""
-
-    @abstractmethod
-    def __call__(
-        self,
-        block_idx: tuple[PsExpression, PsExpression, PsExpression],
-        thread_idx: tuple[PsExpression, PsExpression, PsExpression],
-        ispace_rank: int,
-    ) -> tuple[PsExpression, ...]:
-        """Compute a work item index from the current block index, thread index, and iteration space dimensionality.
-
-        Implementations of this method must return a tuple with `ispace_rank` entries,
-        containing expressions for the compressed index of the work item identified by the
-        given GPU block and thread index triples.
-        (The *compressed index* is the work item's index before application
-        of the iteration space's lower limits and strides.)
-        """
-
-
 class GenericGpu(Platform):
     @abstractmethod
     def materialize_iteration_space(
         self, body: PsBlock, ispace: IterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange | None]:
+    ) -> PsBlock:
         pass

     @classmethod
 ...
[Changed file 3: SYCL platform (SyclPlatform)]

 from __future__ import annotations
-from typing import TYPE_CHECKING

 from ..functions import CFunction, PsMathFunction, MathFunctions
 from ..kernelcreation.iteration_space import (
@@ -29,9 +28,6 @@ from .generic_gpu import GenericGpu
 from ..exceptions import MaterializationError
 from ...types import PsCustomType, PsIeeeFloatType, constify, PsIntegerType

-if TYPE_CHECKING:
-    from ...codegen import GpuThreadsRange

 class SyclPlatform(GenericGpu):
@@ -39,7 +35,7 @@ class SyclPlatform(GenericGpu):
         self,
         ctx: KernelCreationContext,
         omit_range_check: bool = False,
-        automatic_block_size: bool = False
+        automatic_block_size: bool = False,
     ):
         super().__init__(ctx)
@@ -52,7 +48,7 @@ class SyclPlatform(GenericGpu):
     def materialize_iteration_space(
         self, body: PsBlock, ispace: IterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange]:
+    ) -> PsBlock:
         if isinstance(ispace, FullIterationSpace):
             return self._prepend_dense_translation(body, ispace)
         elif isinstance(ispace, SparseIterationSpace):
@@ -113,15 +109,13 @@ class SyclPlatform(GenericGpu):
     def _prepend_dense_translation(
         self, body: PsBlock, ispace: FullIterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange]:
+    ) -> PsBlock:
         rank = ispace.rank
         id_type = self._id_type(rank)
         id_symbol = PsExpression.make(self._ctx.get_symbol("id", id_type))
         id_decl = self._id_declaration(rank, id_symbol)
         dimensions = ispace.dimensions_in_loop_order()
-        launch_config = self.threads_from_ispace(ispace)

         indexing_decls = [id_decl]
         conds = []
@@ -153,11 +147,11 @@ class SyclPlatform(GenericGpu):
         body.statements = indexing_decls + body.statements
         ast = body
-        return ast, launch_config
+        return ast

     def _prepend_sparse_translation(
         self, body: PsBlock, ispace: SparseIterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange]:
+    ) -> PsBlock:
         factory = AstFactory(self._ctx)

         id_type = PsCustomType("sycl::id< 1 >", const=True)
@@ -195,7 +189,7 @@ class SyclPlatform(GenericGpu):
         body.statements = [sparse_idx_decl] + body.statements
         ast = body
-        return ast, self.threads_from_ispace(ispace)
+        return ast

     def _item_type(self, rank: int):
         if not self._automatic_block_size:
 ...