fix dimension order in SYCL target

5d03477a · Frederik Hennig · d447eb1b · 5d03477a · 5d03477a · 5d03477a
Commit 5d03477a authored 1 year ago by Frederik Hennig
--- a/src/pystencils/backend/platforms/sycl.py
+++ b/src/pystencils/backend/platforms/sycl.py
@@ -10,17 +10,23 @@ from ..ast.expressions import (
    PsExpression,
    PsSubscript,
 )
+from ..kernelcreation.context import KernelCreationContext
 from ..constants import PsConstant
 from .platform import Platform
 from ..exceptions import MaterializationError
 from ...types import PsType, PsCustomType, PsIeeeFloatType, constify
-from ...config import SyclIndexingConfig
+from ...config import GpuIndexingConfig
 class SyclPlatform(Platform):
-    def __init__(self, indexing_cfg: SyclIndexingConfig):
+    def __init__(self, ctx: KernelCreationContext, indexing_cfg: GpuIndexingConfig | None):
-        self._cfg = indexing_cfg
+        super().__init__(ctx)
+        self._cfg = indexing_cfg if indexing_cfg is not None else GpuIndexingConfig()
+        if not self._cfg.sycl_automatic_block_size:
+            raise ValueError("The SYCL code generator supports only automatic block sizes at the moment.")
    @property
    def required_headers(self) -> set[str]:
@@ -65,8 +71,17 @@ class SyclPlatform(Platform):
        id_type = PsCustomType(f"sycl::id< {rank} >", const=True)
        id_symbol = PsExpression.make(self._ctx.get_symbol("id", id_type))
+        #   Determine loop order by permuting dimensions
+        archetype_field = ispace.archetype_field
+        if archetype_field is not None:
+            loop_order = archetype_field.layout
+            dimensions = [ispace.dimensions[coordinate] for coordinate in loop_order]
+        else:
+            dimensions = ispace.dimensions
        unpackings = []
-        for i, dim in enumerate(ispace.dimensions):
+        for i, dim in enumerate(dimensions[::-1]):
            coord = PsExpression.make(PsConstant(i, self._ctx.index_dtype))
            work_item_idx = PsSubscript(id_symbol, coord)
@@ -98,7 +113,7 @@ class SyclPlatform(Platform):
        return body
    def _id_type(self, rank: int):
-        if self._cfg.use_ndrange:
+        if not self._cfg.sycl_automatic_block_size:
            return PsCustomType(f"sycl::nd_item< {rank} >", const=True)
        else:
            return PsCustomType(f"sycl::item< {rank} >", const=True)

--- a/src/pystencils/config.py
+++ b/src/pystencils/config.py
@@ -100,7 +100,7 @@ class VectorizationConfig:
 @dataclass
-class SyclIndexingConfig:
+class GpuIndexingConfig:
    """Configure index translation behaviour inside kernels generated for `Target.SYCL`."""
    omit_range_check: bool = False
@@ -111,11 +111,14 @@ class SyclIndexingConfig:
    This check can be discarded through this option, at your own peril.
    """
-    use_ndrange: bool = False
+    sycl_automatic_block_size: bool = True
-    """If set to `True` while generating for `Target.SYCL`, generate the kernel for execution with a ``sycl::ndrange``.
+    """If set to `True` while generating for `Target.SYCL`, let the SYCL runtime decide on the block size.
-    If `use_ndrange` is set, the kernel will receive an `nd_item` instead of an `item` from which the iteration counters
+    If set to `True`, the kernel is generated for execution via ``parallel_for``-dispatch using
-    are derived.
+    a flat `sycl::range`. In this case, the GPU block size will be inferred by the SYCL runtime.
+    If set to `False`, the kernel will receive an `nd_item` and has to be executed using ``parallel_for``
+    with an `nd_range`. This allows manual specification of the block size.
    """
@@ -181,10 +184,10 @@ class CreateKernelConfig:
    If this parameter is set while `target` is a non-CPU target, an error will be raised.
    """
-    sycl_indexing: None | SyclIndexingConfig = None
+    gpu_indexing: None | GpuIndexingConfig = None
-    """Configure index translation for SYCL kernels.
+    """Configure index translation for GPU kernels.
-    It this parameter is set while `target` is not `Target.SYCL`, an error will be raised.
+    It this parameter is set while `target` is not a GPU target, an error will be raised.
    """
    def __post_init__(self):
@@ -217,7 +220,7 @@ class CreateKernelConfig:
            if self.cpu_optim.vectorize is not False and not self.target.is_vector_cpu():
                raise PsOptionsError(f"Cannot enable auto-vectorization for non-vector CPU target {self.target}")
-        if self.sycl_indexing is not None:
+        if self.gpu_indexing is not None:
            if self.target != Target.SYCL:
                raise PsOptionsError(f"`sycl_indexing` cannot be set for non-SYCL target {self.target}")

--- a/src/pystencils/enums.py
+++ b/src/pystencils/enums.py
@@ -80,7 +80,7 @@ class Target(Flag):
    GPU = GenericCUDA
    """Alias for backward compatibility."""
-    SYCL = _SYCL
+    SYCL = _GPU | _SYCL
    """SYCL kernel target.
    Generate a function to be called within a SYCL parallel command.

--- a/src/pystencils/kernelcreation.py
+++ b/src/pystencils/kernelcreation.py
@@ -93,7 +93,7 @@ def create_kernel(
            platform = GenericCpu(ctx)
        case Target.SYCL:
            from .backend.platforms import SyclPlatform
-            platform = SyclPlatform(ctx)
+            platform = SyclPlatform(ctx, config.gpu_indexing)
        case _:
            #   TODO: CUDA/HIP platform
            raise NotImplementedError("Target platform not implemented")