diff --git a/src/pystencils/backend/platforms/sycl.py b/src/pystencils/backend/platforms/sycl.py index 7f2bbf9f6bf54a8c9cdbcd4a9989e54b9db66923..f3c4bb3d5559fa5b418b3f3ca51b3cf43aa99b2f 100644 --- a/src/pystencils/backend/platforms/sycl.py +++ b/src/pystencils/backend/platforms/sycl.py @@ -24,12 +24,13 @@ from ..extensions.cpp import CppMethodCall from ..kernelcreation import KernelCreationContext, AstFactory from ..constants import PsConstant -from .generic_gpu import GenericGpu from ..exceptions import MaterializationError from ...types import PsCustomType, PsIeeeFloatType, constify, PsIntegerType +from .platform import Platform -class SyclPlatform(GenericGpu): + +class SyclPlatform(Platform): def __init__( self, diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index 14a95c84d899638ea796d13cfddf7dd4e7ccd04f..b8f9c71015765e638ddca22278dfa15c0e5bcaa1 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -1,5 +1,5 @@ from __future__ import annotations -from typing import cast, Sequence, Iterable, Callable, TYPE_CHECKING +from typing import cast, Sequence, Callable, TYPE_CHECKING from dataclasses import dataclass, replace from .target import Target @@ -15,6 +15,7 @@ from .config import ( from .kernel import Kernel, GpuKernel from .properties import PsSymbolProperty, FieldBasePtr from .parameters import Parameter +from .functions import Lambda from .gpu_indexing import GpuIndexing, GpuLaunchConfiguration from ..field import Field @@ -23,6 +24,7 @@ from ..types import PsIntegerType, PsScalarType from ..backend.memory import PsSymbol from ..backend.ast import PsAstNode from ..backend.ast.structural import PsBlock, PsLoop +from ..backend.ast.expressions import PsExpression from ..backend.ast.analysis import collect_undefined_symbols, collect_required_headers from ..backend.kernelcreation import ( KernelCreationContext, @@ -201,20 +203,20 @@ class DefaultKernelCreationDriver: canonicalize = CanonicalizeSymbols(self._ctx, True) kernel_ast = cast(PsBlock, canonicalize(kernel_ast)) - if self._target.is_cpu(): - return create_cpu_kernel_function( - self._ctx, + kernel_factory = KernelFactory(self._ctx) + + if self._target.is_cpu() or self._target == Target.SYCL: + return kernel_factory.create_generic_kernel( self._platform, kernel_ast, self._cfg.get_option("function_name"), self._target, self._cfg.get_jit(), ) - else: + elif self._target.is_gpu(): assert self._gpu_indexing is not None - return create_gpu_kernel_function( - self._ctx, + return kernel_factory.create_gpu_kernel( self._platform, kernel_ast, self._cfg.get_option("function_name"), @@ -222,6 +224,8 @@ class DefaultKernelCreationDriver: self._cfg.get_jit(), self._gpu_indexing.get_launch_config_factory(), ) + else: + assert False, "unexpected target" def parse_kernel_body( self, @@ -432,23 +436,11 @@ class DefaultKernelCreationDriver: f"No platform is currently available for CPU target {self._target}" ) - elif Target._GPU in self._target: + elif self._target.is_gpu(): gpu_opts = self._cfg.gpu omit_range_check: bool = gpu_opts.get_option("omit_range_check") match self._target: - case Target.SYCL: - from ..backend.platforms import SyclPlatform - - auto_block_size: bool = self._cfg.sycl.get_option( - "automatic_block_size" - ) - - return SyclPlatform( - self._ctx, - omit_range_check=omit_range_check, - automatic_block_size=auto_block_size, - ) case Target.CUDA: from ..backend.platforms import CudaPlatform @@ -463,89 +455,102 @@ class DefaultKernelCreationDriver: omit_range_check=omit_range_check, thread_mapping=thread_mapping, ) + elif self._target == Target.SYCL: + from ..backend.platforms import SyclPlatform + + auto_block_size: bool = self._cfg.sycl.get_option("automatic_block_size") + omit_range_check = self._cfg.gpu.get_option("omit_range_check") + + return SyclPlatform( + self._ctx, + omit_range_check=omit_range_check, + automatic_block_size=auto_block_size, + ) raise NotImplementedError( f"Code generation for target {self._target} not implemented" ) -def create_cpu_kernel_function( - ctx: KernelCreationContext, - platform: Platform, - body: PsBlock, - function_name: str, - target_spec: Target, - jit: JitBase, -) -> Kernel: - undef_symbols = collect_undefined_symbols(body) - - params = _get_function_params(ctx, undef_symbols) - req_headers = _get_headers(ctx, platform, body) - - kfunc = Kernel(body, target_spec, function_name, params, req_headers, jit) - kfunc.metadata.update(ctx.metadata) - return kfunc - - -def create_gpu_kernel_function( - ctx: KernelCreationContext, - platform: Platform, - body: PsBlock, - function_name: str, - target_spec: Target, - jit: JitBase, - launch_config_factory: Callable[[], GpuLaunchConfiguration], -) -> GpuKernel: - undef_symbols = collect_undefined_symbols(body) - - params = _get_function_params(ctx, undef_symbols) - req_headers = _get_headers(ctx, platform, body) - - kfunc = GpuKernel( - body, - target_spec, - function_name, - params, - req_headers, - jit, - launch_config_factory, - ) - kfunc.metadata.update(ctx.metadata) - return kfunc - - -def _symbol_to_param(ctx: KernelCreationContext, symbol: PsSymbol): - from pystencils.backend.memory import BufferBasePtr, BackendPrivateProperty - - props: set[PsSymbolProperty] = set() - for prop in symbol.properties: - match prop: - case BufferBasePtr(buf): - field = ctx.find_field(buf.name) - props.add(FieldBasePtr(field)) - case BackendPrivateProperty(): - pass - case _: - props.add(prop) - - return Parameter(symbol.name, symbol.get_dtype(), props) - - -def _get_function_params( - ctx: KernelCreationContext, symbols: Iterable[PsSymbol] -) -> list[Parameter]: - params: list[Parameter] = [_symbol_to_param(ctx, s) for s in symbols] - params.sort(key=lambda p: p.name) - return params - - -def _get_headers( - ctx: KernelCreationContext, platform: Platform, body: PsBlock -) -> set[str]: - req_headers = collect_required_headers(body) - req_headers |= platform.required_headers - req_headers |= ctx.required_headers - return req_headers +class KernelFactory: + """Factory for wrapping up backend and IR objects into exportable kernels and function objects.""" + + def __init__(self, ctx: KernelCreationContext): + self._ctx = ctx + + def create_lambda(self, expr: PsExpression) -> Lambda: + """Create a Lambda from an expression.""" + params = self._get_function_params(expr) + return Lambda(expr, params) + + def create_generic_kernel( + self, + platform: Platform, + body: PsBlock, + function_name: str, + target_spec: Target, + jit: JitBase, + ) -> Kernel: + """Create a kernel for a generic target""" + params = self._get_function_params(body) + req_headers = self._get_headers(platform, body) + + kfunc = Kernel(body, target_spec, function_name, params, req_headers, jit) + kfunc.metadata.update(self._ctx.metadata) + return kfunc + + def create_gpu_kernel( + self, + platform: Platform, + body: PsBlock, + function_name: str, + target_spec: Target, + jit: JitBase, + launch_config_factory: Callable[[], GpuLaunchConfiguration], + ) -> GpuKernel: + """Create a kernel for a GPU target""" + params = self._get_function_params(body) + req_headers = self._get_headers(platform, body) + + kfunc = GpuKernel( + body, + target_spec, + function_name, + params, + req_headers, + jit, + launch_config_factory, + ) + kfunc.metadata.update(self._ctx.metadata) + return kfunc + + def _symbol_to_param(self, symbol: PsSymbol): + from pystencils.backend.memory import BufferBasePtr, BackendPrivateProperty + + props: set[PsSymbolProperty] = set() + for prop in symbol.properties: + match prop: + case BufferBasePtr(buf): + field = self._ctx.find_field(buf.name) + props.add(FieldBasePtr(field)) + case BackendPrivateProperty(): + pass + case _: + props.add(prop) + + return Parameter(symbol.name, symbol.get_dtype(), props) + + def _get_function_params(self, ast: PsAstNode) -> list[Parameter]: + symbols = collect_undefined_symbols(ast) + params: list[Parameter] = [self._symbol_to_param(s) for s in symbols] + params.sort(key=lambda p: p.name) + return params + + def _get_headers(self, platform: Platform, body: PsBlock) -> set[str]: + req_headers = collect_required_headers(body) + req_headers |= platform.required_headers + req_headers |= self._ctx.required_headers + return req_headers @dataclass diff --git a/src/pystencils/codegen/functions.py b/src/pystencils/codegen/functions.py index f6be3b1f3446c6b9a25a0013f0e06d099edf5bed..c24dbaffb9947d68c854f83532e87386914c6677 100644 --- a/src/pystencils/codegen/functions.py +++ b/src/pystencils/codegen/functions.py @@ -4,21 +4,12 @@ from typing import Sequence, Any from .parameters import Parameter from ..types import PsType -from ..backend.kernelcreation import KernelCreationContext from ..backend.ast.expressions import PsExpression class Lambda: """A one-line function emitted by the code generator as an auxiliary object.""" - @staticmethod - def from_expression(ctx: KernelCreationContext, expr: PsExpression): - from ..backend.ast.analysis import collect_undefined_symbols - from .driver import _get_function_params - - params = _get_function_params(ctx, collect_undefined_symbols(expr)) - return Lambda(expr, params) - def __init__(self, expr: PsExpression, params: Sequence[Parameter]): self._expr = expr self._params = tuple(params) diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index 2d22ec624856d9cf8a0b825845fee04caaa4ee74..c93f0f95908c0438a233abdfbd585d164e4e7f96 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -228,8 +228,10 @@ class GpuIndexing: self._manual_launch_grid = manual_launch_grid from ..backend.kernelcreation import AstFactory + from .driver import KernelFactory - self._factory = AstFactory(self._ctx) + self._ast_factory = AstFactory(self._ctx) + self._kernel_factory = KernelFactory(self._ctx) def get_thread_mapping(self) -> ThreadMapping: """Retrieve a thread mapping object for use by the backend""" @@ -267,7 +269,7 @@ class GpuIndexing: num_work_items = cast( _Dim3Lambda, - tuple(Lambda.from_expression(self._ctx, wit) for wit in work_items_expr), + tuple(self._kernel_factory.create_lambda(wit) for wit in work_items_expr), ) def factory(): @@ -305,15 +307,15 @@ class GpuIndexing: raise ValueError(f"Iteration space rank is too large: {rank}") block_size = ( - Lambda.from_expression(self._ctx, work_items[0]), - Lambda.from_expression(self._ctx, self._factory.parse_index(1)), - Lambda.from_expression(self._ctx, self._factory.parse_index(1)), + self._kernel_factory.create_lambda(work_items[0]), + self._kernel_factory.create_lambda(self._ast_factory.parse_index(1)), + self._kernel_factory.create_lambda(self._ast_factory.parse_index(1)), ) grid_size = tuple( - Lambda.from_expression(self._ctx, wit) for wit in work_items[1:] + self._kernel_factory.create_lambda(wit) for wit in work_items[1:] ) + tuple( - Lambda.from_expression(self._ctx, self._factory.parse_index(1)) + self._kernel_factory.create_lambda(self._ast_factory.parse_index(1)) for _ in range(4 - rank) ) @@ -350,7 +352,7 @@ class GpuIndexing: return tuple(ispace.actual_iterations(dim) for dim in dimensions) case SparseIterationSpace(): - return (self._factory.parse_index(ispace.index_list.shape[0]),) + return (self._ast_factory.parse_index(ispace.index_list.shape[0]),) case _: assert False, "unexpected iteration space" diff --git a/src/pystencils/codegen/target.py b/src/pystencils/codegen/target.py index b847a8139a8725c9c926b7c12c9556aba3ec6e87..0d724b87730f0ec327772bccbb55a8bfff7c8ddd 100644 --- a/src/pystencils/codegen/target.py +++ b/src/pystencils/codegen/target.py @@ -89,10 +89,13 @@ class Target(Flag): GPU = CUDA """Alias for `Target.CUDA`, for backward compatibility.""" - SYCL = _GPU | _SYCL + SYCL = _SYCL """SYCL kernel target. Generate a function to be called within a SYCL parallel command. + + .. note:: + The SYCL target is experimental and not thoroughly tested yet. """ def is_automatic(self) -> bool: diff --git a/tests/kernelcreation/test_sycl_codegen.py b/tests/kernelcreation/test_sycl_codegen.py new file mode 100644 index 0000000000000000000000000000000000000000..b6907c9965b4b80a5b97865063170dfdc3654615 --- /dev/null +++ b/tests/kernelcreation/test_sycl_codegen.py @@ -0,0 +1,45 @@ +""" +Since we don't have a JIT compiler for SYCL, these tests can only +perform dry-dock testing. +If the SYCL target should ever become non-experimental, we need to +find a way to properly test SYCL kernels in execution. + +These tests primarily check that the code generation driver runs +successfully for the SYCL target. +""" + +import sympy as sp +from pystencils import ( + create_kernel, + Target, + fields, + Assignment, + CreateKernelConfig, +) + + +def test_sycl_kernel_static(): + src, dst = fields("src, dst: [2D]") + asm = Assignment(dst.center(), sp.sin(src.center()) + sp.cos(src.center())) + + cfg = CreateKernelConfig(target=Target.SYCL) + kernel = create_kernel(asm, cfg) + + code_string = kernel.get_c_code() + + assert "sycl::id< 2 >" in code_string + assert "sycl::sin(" in code_string + assert "sycl::cos(" in code_string + + +def test_sycl_kernel_manual_block_size(): + src, dst = fields("src, dst: [2D]") + asm = Assignment(dst.center(), sp.sin(src.center()) + sp.cos(src.center())) + + cfg = CreateKernelConfig(target=Target.SYCL) + cfg.sycl.automatic_block_size = False + kernel = create_kernel(asm, cfg) + + code_string = kernel.get_c_code() + + assert "sycl::nd_item< 2 >" in code_string diff --git a/tests/nbackend/test_vectorization.py b/tests/nbackend/test_vectorization.py index b60dc24774566d67eaa271c6ab775374746d89cf..fecade65d97afcaae4382bcc2ced119b2a957bed 100644 --- a/tests/nbackend/test_vectorization.py +++ b/tests/nbackend/test_vectorization.py @@ -20,7 +20,7 @@ from pystencils.backend.transformations import ( LowerToC, ) from pystencils.backend.constants import PsConstant -from pystencils.codegen.driver import create_cpu_kernel_function +from pystencils.codegen.driver import KernelFactory from pystencils.jit import LegacyCpuJit from pystencils import Target, fields, Assignment, Field from pystencils.field import create_numpy_array_with_layout @@ -135,8 +135,8 @@ def create_vector_kernel( lower = LowerToC(ctx) loop_nest = lower(loop_nest) - func = create_cpu_kernel_function( - ctx, + kfactory = KernelFactory(ctx) + func = kfactory.create_generic_kernel( platform, PsBlock([loop_nest]), "vector_kernel",