From 5d03477a57b945e3cf66b545ecf887fa38ce2ca4 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Thu, 20 Jun 2024 12:21:59 +0200
Subject: [PATCH] fix dimension order in SYCL target

---
 src/pystencils/backend/platforms/sycl.py | 25 +++++++++++++++++++-----
 src/pystencils/config.py                 | 21 +++++++++++---------
 src/pystencils/enums.py                  |  2 +-
 src/pystencils/kernelcreation.py         |  2 +-
 4 files changed, 34 insertions(+), 16 deletions(-)

diff --git a/src/pystencils/backend/platforms/sycl.py b/src/pystencils/backend/platforms/sycl.py
index f646768d5..79de61ffa 100644
--- a/src/pystencils/backend/platforms/sycl.py
+++ b/src/pystencils/backend/platforms/sycl.py
@@ -10,17 +10,23 @@ from ..ast.expressions import (
     PsExpression,
     PsSubscript,
 )
+
+from ..kernelcreation.context import KernelCreationContext
 from ..constants import PsConstant
 from .platform import Platform
 from ..exceptions import MaterializationError
 from ...types import PsType, PsCustomType, PsIeeeFloatType, constify
-from ...config import SyclIndexingConfig
+from ...config import GpuIndexingConfig
 
 
 class SyclPlatform(Platform):
 
-    def __init__(self, indexing_cfg: SyclIndexingConfig):
-        self._cfg = indexing_cfg
+    def __init__(self, ctx: KernelCreationContext, indexing_cfg: GpuIndexingConfig | None):
+        super().__init__(ctx)
+        self._cfg = indexing_cfg if indexing_cfg is not None else GpuIndexingConfig()
+
+        if not self._cfg.sycl_automatic_block_size:
+            raise ValueError("The SYCL code generator supports only automatic block sizes at the moment.")
 
     @property
     def required_headers(self) -> set[str]:
@@ -65,8 +71,17 @@ class SyclPlatform(Platform):
         id_type = PsCustomType(f"sycl::id< {rank} >", const=True)
         id_symbol = PsExpression.make(self._ctx.get_symbol("id", id_type))
 
+        #   Determine loop order by permuting dimensions
+        archetype_field = ispace.archetype_field
+        
+        if archetype_field is not None:
+            loop_order = archetype_field.layout
+            dimensions = [ispace.dimensions[coordinate] for coordinate in loop_order]
+        else:
+            dimensions = ispace.dimensions
+
         unpackings = []
-        for i, dim in enumerate(ispace.dimensions):
+        for i, dim in enumerate(dimensions[::-1]):
             coord = PsExpression.make(PsConstant(i, self._ctx.index_dtype))
             work_item_idx = PsSubscript(id_symbol, coord)
 
@@ -98,7 +113,7 @@ class SyclPlatform(Platform):
         return body
 
     def _id_type(self, rank: int):
-        if self._cfg.use_ndrange:
+        if not self._cfg.sycl_automatic_block_size:
             return PsCustomType(f"sycl::nd_item< {rank} >", const=True)
         else:
             return PsCustomType(f"sycl::item< {rank} >", const=True)
diff --git a/src/pystencils/config.py b/src/pystencils/config.py
index 13df58682..7b0ec590d 100644
--- a/src/pystencils/config.py
+++ b/src/pystencils/config.py
@@ -100,7 +100,7 @@ class VectorizationConfig:
 
 
 @dataclass
-class SyclIndexingConfig:
+class GpuIndexingConfig:
     """Configure index translation behaviour inside kernels generated for `Target.SYCL`."""
 
     omit_range_check: bool = False
@@ -111,11 +111,14 @@ class SyclIndexingConfig:
     This check can be discarded through this option, at your own peril.
     """
 
-    use_ndrange: bool = False
-    """If set to `True` while generating for `Target.SYCL`, generate the kernel for execution with a ``sycl::ndrange``.
+    sycl_automatic_block_size: bool = True
+    """If set to `True` while generating for `Target.SYCL`, let the SYCL runtime decide on the block size.
 
-    If `use_ndrange` is set, the kernel will receive an `nd_item` instead of an `item` from which the iteration counters
-    are derived.
+    If set to `True`, the kernel is generated for execution via ``parallel_for``-dispatch using
+    a flat `sycl::range`. In this case, the GPU block size will be inferred by the SYCL runtime.
+
+    If set to `False`, the kernel will receive an `nd_item` and has to be executed using ``parallel_for``
+    with an `nd_range`. This allows manual specification of the block size.
     """
 
 
@@ -181,10 +184,10 @@ class CreateKernelConfig:
     If this parameter is set while `target` is a non-CPU target, an error will be raised.
     """
 
-    sycl_indexing: None | SyclIndexingConfig = None
-    """Configure index translation for SYCL kernels.
+    gpu_indexing: None | GpuIndexingConfig = None
+    """Configure index translation for GPU kernels.
     
-    It this parameter is set while `target` is not `Target.SYCL`, an error will be raised.
+    It this parameter is set while `target` is not a GPU target, an error will be raised.
     """
 
     def __post_init__(self):
@@ -217,7 +220,7 @@ class CreateKernelConfig:
             if self.cpu_optim.vectorize is not False and not self.target.is_vector_cpu():
                 raise PsOptionsError(f"Cannot enable auto-vectorization for non-vector CPU target {self.target}")
 
-        if self.sycl_indexing is not None:
+        if self.gpu_indexing is not None:
             if self.target != Target.SYCL:
                 raise PsOptionsError(f"`sycl_indexing` cannot be set for non-SYCL target {self.target}")
 
diff --git a/src/pystencils/enums.py b/src/pystencils/enums.py
index 7b3bd9372..276a0c44f 100644
--- a/src/pystencils/enums.py
+++ b/src/pystencils/enums.py
@@ -80,7 +80,7 @@ class Target(Flag):
     GPU = GenericCUDA
     """Alias for backward compatibility."""
 
-    SYCL = _SYCL
+    SYCL = _GPU | _SYCL
     """SYCL kernel target.
     
     Generate a function to be called within a SYCL parallel command.
diff --git a/src/pystencils/kernelcreation.py b/src/pystencils/kernelcreation.py
index 4c49f7fde..ff0f09512 100644
--- a/src/pystencils/kernelcreation.py
+++ b/src/pystencils/kernelcreation.py
@@ -93,7 +93,7 @@ def create_kernel(
             platform = GenericCpu(ctx)
         case Target.SYCL:
             from .backend.platforms import SyclPlatform
-            platform = SyclPlatform(ctx)
+            platform = SyclPlatform(ctx, config.gpu_indexing)
         case _:
             #   TODO: CUDA/HIP platform
             raise NotImplementedError("Target platform not implemented")
-- 
GitLab