From 5d03477a57b945e3cf66b545ecf887fa38ce2ca4 Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Thu, 20 Jun 2024 12:21:59 +0200 Subject: [PATCH] fix dimension order in SYCL target --- src/pystencils/backend/platforms/sycl.py | 25 +++++++++++++++++++----- src/pystencils/config.py | 21 +++++++++++--------- src/pystencils/enums.py | 2 +- src/pystencils/kernelcreation.py | 2 +- 4 files changed, 34 insertions(+), 16 deletions(-) diff --git a/src/pystencils/backend/platforms/sycl.py b/src/pystencils/backend/platforms/sycl.py index f646768d5..79de61ffa 100644 --- a/src/pystencils/backend/platforms/sycl.py +++ b/src/pystencils/backend/platforms/sycl.py @@ -10,17 +10,23 @@ from ..ast.expressions import ( PsExpression, PsSubscript, ) + +from ..kernelcreation.context import KernelCreationContext from ..constants import PsConstant from .platform import Platform from ..exceptions import MaterializationError from ...types import PsType, PsCustomType, PsIeeeFloatType, constify -from ...config import SyclIndexingConfig +from ...config import GpuIndexingConfig class SyclPlatform(Platform): - def __init__(self, indexing_cfg: SyclIndexingConfig): - self._cfg = indexing_cfg + def __init__(self, ctx: KernelCreationContext, indexing_cfg: GpuIndexingConfig | None): + super().__init__(ctx) + self._cfg = indexing_cfg if indexing_cfg is not None else GpuIndexingConfig() + + if not self._cfg.sycl_automatic_block_size: + raise ValueError("The SYCL code generator supports only automatic block sizes at the moment.") @property def required_headers(self) -> set[str]: @@ -65,8 +71,17 @@ class SyclPlatform(Platform): id_type = PsCustomType(f"sycl::id< {rank} >", const=True) id_symbol = PsExpression.make(self._ctx.get_symbol("id", id_type)) + # Determine loop order by permuting dimensions + archetype_field = ispace.archetype_field + + if archetype_field is not None: + loop_order = archetype_field.layout + dimensions = [ispace.dimensions[coordinate] for coordinate in loop_order] + else: + dimensions = ispace.dimensions + unpackings = [] - for i, dim in enumerate(ispace.dimensions): + for i, dim in enumerate(dimensions[::-1]): coord = PsExpression.make(PsConstant(i, self._ctx.index_dtype)) work_item_idx = PsSubscript(id_symbol, coord) @@ -98,7 +113,7 @@ class SyclPlatform(Platform): return body def _id_type(self, rank: int): - if self._cfg.use_ndrange: + if not self._cfg.sycl_automatic_block_size: return PsCustomType(f"sycl::nd_item< {rank} >", const=True) else: return PsCustomType(f"sycl::item< {rank} >", const=True) diff --git a/src/pystencils/config.py b/src/pystencils/config.py index 13df58682..7b0ec590d 100644 --- a/src/pystencils/config.py +++ b/src/pystencils/config.py @@ -100,7 +100,7 @@ class VectorizationConfig: @dataclass -class SyclIndexingConfig: +class GpuIndexingConfig: """Configure index translation behaviour inside kernels generated for `Target.SYCL`.""" omit_range_check: bool = False @@ -111,11 +111,14 @@ class SyclIndexingConfig: This check can be discarded through this option, at your own peril. """ - use_ndrange: bool = False - """If set to `True` while generating for `Target.SYCL`, generate the kernel for execution with a ``sycl::ndrange``. + sycl_automatic_block_size: bool = True + """If set to `True` while generating for `Target.SYCL`, let the SYCL runtime decide on the block size. - If `use_ndrange` is set, the kernel will receive an `nd_item` instead of an `item` from which the iteration counters - are derived. + If set to `True`, the kernel is generated for execution via ``parallel_for``-dispatch using + a flat `sycl::range`. In this case, the GPU block size will be inferred by the SYCL runtime. + + If set to `False`, the kernel will receive an `nd_item` and has to be executed using ``parallel_for`` + with an `nd_range`. This allows manual specification of the block size. """ @@ -181,10 +184,10 @@ class CreateKernelConfig: If this parameter is set while `target` is a non-CPU target, an error will be raised. """ - sycl_indexing: None | SyclIndexingConfig = None - """Configure index translation for SYCL kernels. + gpu_indexing: None | GpuIndexingConfig = None + """Configure index translation for GPU kernels. - It this parameter is set while `target` is not `Target.SYCL`, an error will be raised. + It this parameter is set while `target` is not a GPU target, an error will be raised. """ def __post_init__(self): @@ -217,7 +220,7 @@ class CreateKernelConfig: if self.cpu_optim.vectorize is not False and not self.target.is_vector_cpu(): raise PsOptionsError(f"Cannot enable auto-vectorization for non-vector CPU target {self.target}") - if self.sycl_indexing is not None: + if self.gpu_indexing is not None: if self.target != Target.SYCL: raise PsOptionsError(f"`sycl_indexing` cannot be set for non-SYCL target {self.target}") diff --git a/src/pystencils/enums.py b/src/pystencils/enums.py index 7b3bd9372..276a0c44f 100644 --- a/src/pystencils/enums.py +++ b/src/pystencils/enums.py @@ -80,7 +80,7 @@ class Target(Flag): GPU = GenericCUDA """Alias for backward compatibility.""" - SYCL = _SYCL + SYCL = _GPU | _SYCL """SYCL kernel target. Generate a function to be called within a SYCL parallel command. diff --git a/src/pystencils/kernelcreation.py b/src/pystencils/kernelcreation.py index 4c49f7fde..ff0f09512 100644 --- a/src/pystencils/kernelcreation.py +++ b/src/pystencils/kernelcreation.py @@ -93,7 +93,7 @@ def create_kernel( platform = GenericCpu(ctx) case Target.SYCL: from .backend.platforms import SyclPlatform - platform = SyclPlatform(ctx) + platform = SyclPlatform(ctx, config.gpu_indexing) case _: # TODO: CUDA/HIP platform raise NotImplementedError("Target platform not implemented") -- GitLab