From 1dff46104f88cc318d05199d223adf91b0aff503 Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Wed, 19 Feb 2025 10:15:50 +0100 Subject: [PATCH 1/2] Move free functions relating to kernel creation in `driver.py` to `KernelFactory`. --- src/pystencils/backend/platforms/sycl.py | 5 +- src/pystencils/codegen/driver.py | 174 ++++++++++++----------- src/pystencils/codegen/functions.py | 9 -- src/pystencils/codegen/gpu_indexing.py | 18 +-- src/pystencils/codegen/target.py | 5 +- tests/nbackend/test_vectorization.py | 6 +- 6 files changed, 110 insertions(+), 107 deletions(-) diff --git a/src/pystencils/backend/platforms/sycl.py b/src/pystencils/backend/platforms/sycl.py index 7f2bbf9f6..f3c4bb3d5 100644 --- a/src/pystencils/backend/platforms/sycl.py +++ b/src/pystencils/backend/platforms/sycl.py @@ -24,12 +24,13 @@ from ..extensions.cpp import CppMethodCall from ..kernelcreation import KernelCreationContext, AstFactory from ..constants import PsConstant -from .generic_gpu import GenericGpu from ..exceptions import MaterializationError from ...types import PsCustomType, PsIeeeFloatType, constify, PsIntegerType +from .platform import Platform -class SyclPlatform(GenericGpu): + +class SyclPlatform(Platform): def __init__( self, diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index 14a95c84d..eaac393ef 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -1,5 +1,5 @@ from __future__ import annotations -from typing import cast, Sequence, Iterable, Callable, TYPE_CHECKING +from typing import cast, Sequence, Callable, TYPE_CHECKING from dataclasses import dataclass, replace from .target import Target @@ -15,6 +15,7 @@ from .config import ( from .kernel import Kernel, GpuKernel from .properties import PsSymbolProperty, FieldBasePtr from .parameters import Parameter +from .functions import Lambda from .gpu_indexing import GpuIndexing, GpuLaunchConfiguration from ..field import Field @@ -23,6 +24,7 @@ from ..types import PsIntegerType, PsScalarType from ..backend.memory import PsSymbol from ..backend.ast import PsAstNode from ..backend.ast.structural import PsBlock, PsLoop +from ..backend.ast.expressions import PsExpression from ..backend.ast.analysis import collect_undefined_symbols, collect_required_headers from ..backend.kernelcreation import ( KernelCreationContext, @@ -201,20 +203,20 @@ class DefaultKernelCreationDriver: canonicalize = CanonicalizeSymbols(self._ctx, True) kernel_ast = cast(PsBlock, canonicalize(kernel_ast)) - if self._target.is_cpu(): - return create_cpu_kernel_function( - self._ctx, + kernel_factory = KernelFactory(self._ctx) + + if self._target.is_cpu() or self._target == Target.SYCL: + return kernel_factory.create_generic_kernel( self._platform, kernel_ast, self._cfg.get_option("function_name"), self._target, self._cfg.get_jit(), ) - else: + elif self._target.is_gpu(): assert self._gpu_indexing is not None - return create_gpu_kernel_function( - self._ctx, + return kernel_factory.create_gpu_kernel( self._platform, kernel_ast, self._cfg.get_option("function_name"), @@ -222,6 +224,8 @@ class DefaultKernelCreationDriver: self._cfg.get_jit(), self._gpu_indexing.get_launch_config_factory(), ) + else: + assert False, "unexpected target" def parse_kernel_body( self, @@ -469,83 +473,85 @@ class DefaultKernelCreationDriver: ) -def create_cpu_kernel_function( - ctx: KernelCreationContext, - platform: Platform, - body: PsBlock, - function_name: str, - target_spec: Target, - jit: JitBase, -) -> Kernel: - undef_symbols = collect_undefined_symbols(body) - - params = _get_function_params(ctx, undef_symbols) - req_headers = _get_headers(ctx, platform, body) - - kfunc = Kernel(body, target_spec, function_name, params, req_headers, jit) - kfunc.metadata.update(ctx.metadata) - return kfunc - - -def create_gpu_kernel_function( - ctx: KernelCreationContext, - platform: Platform, - body: PsBlock, - function_name: str, - target_spec: Target, - jit: JitBase, - launch_config_factory: Callable[[], GpuLaunchConfiguration], -) -> GpuKernel: - undef_symbols = collect_undefined_symbols(body) - - params = _get_function_params(ctx, undef_symbols) - req_headers = _get_headers(ctx, platform, body) - - kfunc = GpuKernel( - body, - target_spec, - function_name, - params, - req_headers, - jit, - launch_config_factory, - ) - kfunc.metadata.update(ctx.metadata) - return kfunc - - -def _symbol_to_param(ctx: KernelCreationContext, symbol: PsSymbol): - from pystencils.backend.memory import BufferBasePtr, BackendPrivateProperty - - props: set[PsSymbolProperty] = set() - for prop in symbol.properties: - match prop: - case BufferBasePtr(buf): - field = ctx.find_field(buf.name) - props.add(FieldBasePtr(field)) - case BackendPrivateProperty(): - pass - case _: - props.add(prop) - - return Parameter(symbol.name, symbol.get_dtype(), props) - - -def _get_function_params( - ctx: KernelCreationContext, symbols: Iterable[PsSymbol] -) -> list[Parameter]: - params: list[Parameter] = [_symbol_to_param(ctx, s) for s in symbols] - params.sort(key=lambda p: p.name) - return params - - -def _get_headers( - ctx: KernelCreationContext, platform: Platform, body: PsBlock -) -> set[str]: - req_headers = collect_required_headers(body) - req_headers |= platform.required_headers - req_headers |= ctx.required_headers - return req_headers +class KernelFactory: + """Factory for wrapping up backend and IR objects into exportable kernels and function objects.""" + + def __init__(self, ctx: KernelCreationContext): + self._ctx = ctx + + def create_lambda(self, expr: PsExpression) -> Lambda: + """Create a Lambda from an expression.""" + params = self._get_function_params(expr) + return Lambda(expr, params) + + def create_generic_kernel( + self, + platform: Platform, + body: PsBlock, + function_name: str, + target_spec: Target, + jit: JitBase, + ) -> Kernel: + """Create a kernel for a generic target""" + params = self._get_function_params(body) + req_headers = self._get_headers(platform, body) + + kfunc = Kernel(body, target_spec, function_name, params, req_headers, jit) + kfunc.metadata.update(self._ctx.metadata) + return kfunc + + def create_gpu_kernel( + self, + platform: Platform, + body: PsBlock, + function_name: str, + target_spec: Target, + jit: JitBase, + launch_config_factory: Callable[[], GpuLaunchConfiguration], + ) -> GpuKernel: + """Create a kernel for a GPU target""" + params = self._get_function_params(body) + req_headers = self._get_headers(platform, body) + + kfunc = GpuKernel( + body, + target_spec, + function_name, + params, + req_headers, + jit, + launch_config_factory, + ) + kfunc.metadata.update(self._ctx.metadata) + return kfunc + + def _symbol_to_param(self, symbol: PsSymbol): + from pystencils.backend.memory import BufferBasePtr, BackendPrivateProperty + + props: set[PsSymbolProperty] = set() + for prop in symbol.properties: + match prop: + case BufferBasePtr(buf): + field = self._ctx.find_field(buf.name) + props.add(FieldBasePtr(field)) + case BackendPrivateProperty(): + pass + case _: + props.add(prop) + + return Parameter(symbol.name, symbol.get_dtype(), props) + + def _get_function_params(self, ast: PsAstNode) -> list[Parameter]: + symbols = collect_undefined_symbols(ast) + params: list[Parameter] = [self._symbol_to_param(s) for s in symbols] + params.sort(key=lambda p: p.name) + return params + + def _get_headers(self, platform: Platform, body: PsBlock) -> set[str]: + req_headers = collect_required_headers(body) + req_headers |= platform.required_headers + req_headers |= self._ctx.required_headers + return req_headers @dataclass diff --git a/src/pystencils/codegen/functions.py b/src/pystencils/codegen/functions.py index f6be3b1f3..c24dbaffb 100644 --- a/src/pystencils/codegen/functions.py +++ b/src/pystencils/codegen/functions.py @@ -4,21 +4,12 @@ from typing import Sequence, Any from .parameters import Parameter from ..types import PsType -from ..backend.kernelcreation import KernelCreationContext from ..backend.ast.expressions import PsExpression class Lambda: """A one-line function emitted by the code generator as an auxiliary object.""" - @staticmethod - def from_expression(ctx: KernelCreationContext, expr: PsExpression): - from ..backend.ast.analysis import collect_undefined_symbols - from .driver import _get_function_params - - params = _get_function_params(ctx, collect_undefined_symbols(expr)) - return Lambda(expr, params) - def __init__(self, expr: PsExpression, params: Sequence[Parameter]): self._expr = expr self._params = tuple(params) diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index 2d22ec624..c93f0f959 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -228,8 +228,10 @@ class GpuIndexing: self._manual_launch_grid = manual_launch_grid from ..backend.kernelcreation import AstFactory + from .driver import KernelFactory - self._factory = AstFactory(self._ctx) + self._ast_factory = AstFactory(self._ctx) + self._kernel_factory = KernelFactory(self._ctx) def get_thread_mapping(self) -> ThreadMapping: """Retrieve a thread mapping object for use by the backend""" @@ -267,7 +269,7 @@ class GpuIndexing: num_work_items = cast( _Dim3Lambda, - tuple(Lambda.from_expression(self._ctx, wit) for wit in work_items_expr), + tuple(self._kernel_factory.create_lambda(wit) for wit in work_items_expr), ) def factory(): @@ -305,15 +307,15 @@ class GpuIndexing: raise ValueError(f"Iteration space rank is too large: {rank}") block_size = ( - Lambda.from_expression(self._ctx, work_items[0]), - Lambda.from_expression(self._ctx, self._factory.parse_index(1)), - Lambda.from_expression(self._ctx, self._factory.parse_index(1)), + self._kernel_factory.create_lambda(work_items[0]), + self._kernel_factory.create_lambda(self._ast_factory.parse_index(1)), + self._kernel_factory.create_lambda(self._ast_factory.parse_index(1)), ) grid_size = tuple( - Lambda.from_expression(self._ctx, wit) for wit in work_items[1:] + self._kernel_factory.create_lambda(wit) for wit in work_items[1:] ) + tuple( - Lambda.from_expression(self._ctx, self._factory.parse_index(1)) + self._kernel_factory.create_lambda(self._ast_factory.parse_index(1)) for _ in range(4 - rank) ) @@ -350,7 +352,7 @@ class GpuIndexing: return tuple(ispace.actual_iterations(dim) for dim in dimensions) case SparseIterationSpace(): - return (self._factory.parse_index(ispace.index_list.shape[0]),) + return (self._ast_factory.parse_index(ispace.index_list.shape[0]),) case _: assert False, "unexpected iteration space" diff --git a/src/pystencils/codegen/target.py b/src/pystencils/codegen/target.py index b847a8139..0d724b877 100644 --- a/src/pystencils/codegen/target.py +++ b/src/pystencils/codegen/target.py @@ -89,10 +89,13 @@ class Target(Flag): GPU = CUDA """Alias for `Target.CUDA`, for backward compatibility.""" - SYCL = _GPU | _SYCL + SYCL = _SYCL """SYCL kernel target. Generate a function to be called within a SYCL parallel command. + + .. note:: + The SYCL target is experimental and not thoroughly tested yet. """ def is_automatic(self) -> bool: diff --git a/tests/nbackend/test_vectorization.py b/tests/nbackend/test_vectorization.py index b60dc2477..fecade65d 100644 --- a/tests/nbackend/test_vectorization.py +++ b/tests/nbackend/test_vectorization.py @@ -20,7 +20,7 @@ from pystencils.backend.transformations import ( LowerToC, ) from pystencils.backend.constants import PsConstant -from pystencils.codegen.driver import create_cpu_kernel_function +from pystencils.codegen.driver import KernelFactory from pystencils.jit import LegacyCpuJit from pystencils import Target, fields, Assignment, Field from pystencils.field import create_numpy_array_with_layout @@ -135,8 +135,8 @@ def create_vector_kernel( lower = LowerToC(ctx) loop_nest = lower(loop_nest) - func = create_cpu_kernel_function( - ctx, + kfactory = KernelFactory(ctx) + func = kfactory.create_generic_kernel( platform, PsBlock([loop_nest]), "vector_kernel", -- GitLab From 422a2494c1ae20f1d28f8cd8e2a52faf67e75c1f Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Wed, 19 Feb 2025 10:27:40 +0100 Subject: [PATCH 2/2] add dry-dock tests for SYCL codegen --- src/pystencils/codegen/driver.py | 25 ++++++------- tests/kernelcreation/test_sycl_codegen.py | 45 +++++++++++++++++++++++ 2 files changed, 57 insertions(+), 13 deletions(-) create mode 100644 tests/kernelcreation/test_sycl_codegen.py diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index eaac393ef..b8f9c7101 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -436,23 +436,11 @@ class DefaultKernelCreationDriver: f"No platform is currently available for CPU target {self._target}" ) - elif Target._GPU in self._target: + elif self._target.is_gpu(): gpu_opts = self._cfg.gpu omit_range_check: bool = gpu_opts.get_option("omit_range_check") match self._target: - case Target.SYCL: - from ..backend.platforms import SyclPlatform - - auto_block_size: bool = self._cfg.sycl.get_option( - "automatic_block_size" - ) - - return SyclPlatform( - self._ctx, - omit_range_check=omit_range_check, - automatic_block_size=auto_block_size, - ) case Target.CUDA: from ..backend.platforms import CudaPlatform @@ -467,6 +455,17 @@ class DefaultKernelCreationDriver: omit_range_check=omit_range_check, thread_mapping=thread_mapping, ) + elif self._target == Target.SYCL: + from ..backend.platforms import SyclPlatform + + auto_block_size: bool = self._cfg.sycl.get_option("automatic_block_size") + omit_range_check = self._cfg.gpu.get_option("omit_range_check") + + return SyclPlatform( + self._ctx, + omit_range_check=omit_range_check, + automatic_block_size=auto_block_size, + ) raise NotImplementedError( f"Code generation for target {self._target} not implemented" diff --git a/tests/kernelcreation/test_sycl_codegen.py b/tests/kernelcreation/test_sycl_codegen.py new file mode 100644 index 000000000..b6907c996 --- /dev/null +++ b/tests/kernelcreation/test_sycl_codegen.py @@ -0,0 +1,45 @@ +""" +Since we don't have a JIT compiler for SYCL, these tests can only +perform dry-dock testing. +If the SYCL target should ever become non-experimental, we need to +find a way to properly test SYCL kernels in execution. + +These tests primarily check that the code generation driver runs +successfully for the SYCL target. +""" + +import sympy as sp +from pystencils import ( + create_kernel, + Target, + fields, + Assignment, + CreateKernelConfig, +) + + +def test_sycl_kernel_static(): + src, dst = fields("src, dst: [2D]") + asm = Assignment(dst.center(), sp.sin(src.center()) + sp.cos(src.center())) + + cfg = CreateKernelConfig(target=Target.SYCL) + kernel = create_kernel(asm, cfg) + + code_string = kernel.get_c_code() + + assert "sycl::id< 2 >" in code_string + assert "sycl::sin(" in code_string + assert "sycl::cos(" in code_string + + +def test_sycl_kernel_manual_block_size(): + src, dst = fields("src, dst: [2D]") + asm = Assignment(dst.center(), sp.sin(src.center()) + sp.cos(src.center())) + + cfg = CreateKernelConfig(target=Target.SYCL) + cfg.sycl.automatic_block_size = False + kernel = create_kernel(asm, cfg) + + code_string = kernel.get_c_code() + + assert "sycl::nd_item< 2 >" in code_string -- GitLab