From 5ee715d0df2651e745ff5de0524abfe24d48c968 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 20 Mar 2025 17:04:47 +0100 Subject: [PATCH] Reformat adapted files [skip ci] --- src/pystencils/__init__.py | 9 +- src/pystencils/backend/ast/vector.py | 46 +++++---- src/pystencils/backend/emission/ir_printer.py | 4 +- .../backend/kernelcreation/context.py | 6 +- .../backend/kernelcreation/freeze.py | 18 ++-- .../backend/kernelcreation/typification.py | 7 +- .../backend/platforms/generic_cpu.py | 46 +++++++-- .../backend/platforms/generic_gpu.py | 98 ++++++++++++++----- src/pystencils/backend/platforms/platform.py | 2 +- src/pystencils/backend/platforms/sycl.py | 4 +- src/pystencils/backend/platforms/x86.py | 10 +- .../backend/transformations/add_pragmas.py | 8 +- .../transformations/loop_vectorizer.py | 36 ++++--- .../transformations/select_functions.py | 8 +- .../transformations/select_intrinsics.py | 4 +- src/pystencils/codegen/driver.py | 38 +++++-- src/pystencils/compound_op_mapping.py | 15 ++- src/pystencils/jit/cpu_extension_module.py | 5 +- src/pystencils/sympyextensions/__init__.py | 4 +- src/pystencils/sympyextensions/reduction.py | 11 ++- tests/kernelcreation/test_reduction.py | 23 +++-- 21 files changed, 283 insertions(+), 119 deletions(-) diff --git a/src/pystencils/__init__.py b/src/pystencils/__init__.py index a7bf33aa6..329f61d32 100644 --- a/src/pystencils/__init__.py +++ b/src/pystencils/__init__.py @@ -1,10 +1,6 @@ """Module to generate stencil kernels in C or CUDA using sympy expressions and call them as Python functions""" -from .codegen import ( - Target, - CreateKernelConfig, - AUTO -) +from .codegen import Target, CreateKernelConfig, AUTO from .defaults import DEFAULTS from . import fd from . import stencil as stencil @@ -93,4 +89,5 @@ __all__ = [ ] from . import _version -__version__ = _version.get_versions()['version'] + +__version__ = _version.get_versions()["version"] diff --git a/src/pystencils/backend/ast/vector.py b/src/pystencils/backend/ast/vector.py index 4f5224133..4141b0296 100644 --- a/src/pystencils/backend/ast/vector.py +++ b/src/pystencils/backend/ast/vector.py @@ -18,7 +18,7 @@ class PsVecBroadcast(PsUnOp, PsVectorOp): """Broadcast a scalar value to N vector lanes.""" __match_args__ = ("lanes", "operand") - + def __init__(self, lanes: int, operand: PsExpression): super().__init__(operand) self._lanes = lanes @@ -26,21 +26,18 @@ class PsVecBroadcast(PsUnOp, PsVectorOp): @property def lanes(self) -> int: return self._lanes - + @lanes.setter def lanes(self, n: int): self._lanes = n def _clone_expr(self) -> PsVecBroadcast: return PsVecBroadcast(self._lanes, self._operand.clone()) - + def structurally_equal(self, other: PsAstNode) -> bool: if not isinstance(other, PsVecBroadcast): return False - return ( - super().structurally_equal(other) - and self._lanes == other._lanes - ) + return super().structurally_equal(other) and self._lanes == other._lanes class PsVecHorizontal(PsBinOp, PsVectorOp): @@ -48,8 +45,13 @@ class PsVecHorizontal(PsBinOp, PsVectorOp): __match_args__ = ("lanes", "scalar_operand", "vector_operand", "reduction_op") - def __init__(self, lanes: int, scalar_operand: PsExpression, vector_operand: PsExpression, - reduction_op: ReductionOp): + def __init__( + self, + lanes: int, + scalar_operand: PsExpression, + vector_operand: PsExpression, + reduction_op: ReductionOp, + ): super().__init__(scalar_operand, vector_operand) self._lanes = lanes self._reduction_op = reduction_op @@ -87,19 +89,23 @@ class PsVecHorizontal(PsBinOp, PsVectorOp): self._reduction_op = op def _clone_expr(self) -> PsVecHorizontal: - return PsVecHorizontal(self._lanes, self._op1.clone(), self._op2.clone(), self._reduction_op) + return PsVecHorizontal( + self._lanes, self._op1.clone(), self._op2.clone(), self._reduction_op + ) def structurally_equal(self, other: PsAstNode) -> bool: if not isinstance(other, PsVecHorizontal): return False - return (super().structurally_equal(other) - and self._lanes == other._lanes - and self._reduction_op == other._reduction_op) + return ( + super().structurally_equal(other) + and self._lanes == other._lanes + and self._reduction_op == other._reduction_op + ) class PsVecMemAcc(PsExpression, PsLvalue, PsVectorOp): """Pointer-based vectorized memory access. - + Args: base_ptr: Pointer identifying the accessed memory region offset: Offset inside the memory region @@ -150,7 +156,7 @@ class PsVecMemAcc(PsExpression, PsLvalue, PsVectorOp): @property def stride(self) -> PsExpression | None: return self._stride - + @stride.setter def stride(self, expr: PsExpression | None): self._stride = expr @@ -161,10 +167,12 @@ class PsVecMemAcc(PsExpression, PsLvalue, PsVectorOp): def get_vector_type(self) -> PsVectorType: return cast(PsVectorType, self._dtype) - + def get_children(self) -> tuple[PsAstNode, ...]: - return (self._ptr, self._offset) + (() if self._stride is None else (self._stride,)) - + return (self._ptr, self._offset) + ( + () if self._stride is None else (self._stride,) + ) + def set_child(self, idx: int, c: PsAstNode): idx = [0, 1, 2][idx] match idx: @@ -193,7 +201,7 @@ class PsVecMemAcc(PsExpression, PsLvalue, PsVectorOp): and self._vector_entries == other._vector_entries and self._aligned == other._aligned ) - + def __repr__(self) -> str: return ( f"PsVecMemAcc({repr(self._ptr)}, {repr(self._offset)}, {repr(self._vector_entries)}, " diff --git a/src/pystencils/backend/emission/ir_printer.py b/src/pystencils/backend/emission/ir_printer.py index 1508e6d94..22ae2f91a 100644 --- a/src/pystencils/backend/emission/ir_printer.py +++ b/src/pystencils/backend/emission/ir_printer.py @@ -24,7 +24,7 @@ def emit_ir(ir: PsAstNode | Kernel): class IRAstPrinter(BasePrinter): """Print the IR AST as pseudo-code. - + This printer produces a complete pseudocode representation of a pystencils AST. Other than the `CAstPrinter`, the `IRAstPrinter` is capable of emitting code for each node defined in `ast <pystencils.backend.ast>`. @@ -85,7 +85,7 @@ class IRAstPrinter(BasePrinter): return pc.parenthesize( f"vec_horizontal_{reduction_op.name.lower()}<{lanes}>({scalar_operand_code, vector_operand_code})", - Ops.Weakest + Ops.Weakest, ) case _: diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py index 827be45a5..536c73c7f 100644 --- a/src/pystencils/backend/kernelcreation/context.py +++ b/src/pystencils/backend/kernelcreation/context.py @@ -106,7 +106,7 @@ class KernelCreationContext: def index_dtype(self) -> PsIntegerType: """Data type used by default for index expressions""" return self._index_dtype - + def resolve_dynamic_type(self, dtype: DynamicType | PsType) -> PsType: """Selects the appropriate data type for `DynamicType` instances, and returns all other types as they are.""" match dtype: @@ -191,7 +191,9 @@ class KernelCreationContext: self._symbols[old.name] = new - def add_symbol_reduction_info(self, local_symb: PsSymbol, reduction_info: ReductionInfo): + def add_symbol_reduction_info( + self, local_symb: PsSymbol, reduction_info: ReductionInfo + ): """Adds entry for a symbol and its reduction info to its corresponding lookup table. The symbol ``symbol`` shall not exist in the symbol table already. diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index df6bfbd1f..63e9ea5b1 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -57,7 +57,7 @@ from ..ast.expressions import ( PsAnd, PsOr, PsNot, - PsMemAcc + PsMemAcc, ) from ..ast.vector import PsVecMemAcc @@ -110,7 +110,9 @@ class FreezeExpressions: def __call__(self, obj: AssignmentCollection | sp.Basic) -> PsAstNode: if isinstance(obj, AssignmentCollection): - return PsBlock([cast(PsStructuralNode, self.visit(asm)) for asm in obj.all_assignments]) + return PsBlock( + [cast(PsStructuralNode, self.visit(asm)) for asm in obj.all_assignments] + ) elif isinstance(obj, AssignmentBase): return cast(PsAssignment, self.visit(obj)) elif isinstance(obj, _ExprLike): @@ -179,7 +181,9 @@ class FreezeExpressions: "/=": ReductionOp.Div, } - return PsAssignment(lhs, compound_op_to_expr(_str_to_compound_op[expr.op], lhs.clone(), rhs)) + return PsAssignment( + lhs, compound_op_to_expr(_str_to_compound_op[expr.op], lhs.clone(), rhs) + ) def map_ReductionAssignment(self, expr: ReductionAssignment): assert isinstance(expr.lhs, TypedSymbol) @@ -327,22 +331,22 @@ class FreezeExpressions: raise FreezeError("Cannot translate an empty tuple.") items = [self.visit_expr(item) for item in expr] - + if any(isinstance(i, PsArrayInitList) for i in items): # base case: have nested arrays if not all(isinstance(i, PsArrayInitList) for i in items): raise FreezeError( f"Cannot translate nested arrays of non-uniform shape: {expr}" ) - + subarrays = cast(list[PsArrayInitList], items) shape_tail = subarrays[0].shape - + if not all(s.shape == shape_tail for s in subarrays[1:]): raise FreezeError( f"Cannot translate nested arrays of non-uniform shape: {expr}" ) - + return PsArrayInitList([s.items_grid for s in subarrays]) # type: ignore else: # base case: no nested arrays diff --git a/src/pystencils/backend/kernelcreation/typification.py b/src/pystencils/backend/kernelcreation/typification.py index 3ca0a16e2..b457f39a0 100644 --- a/src/pystencils/backend/kernelcreation/typification.py +++ b/src/pystencils/backend/kernelcreation/typification.py @@ -194,9 +194,10 @@ class TypeContext: f" Target type: {self._target_type}" ) - case PsNumericOpTrait() if not isinstance( - self._target_type, PsNumericType - ) or self._target_type.is_bool(): + case PsNumericOpTrait() if ( + not isinstance(self._target_type, PsNumericType) + or self._target_type.is_bool() + ): # FIXME: PsBoolType derives from PsNumericType, but is not numeric raise TypificationError( f"Numerical operation encountered in non-numerical type context:\n" diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py index 43b048184..ccef61817 100644 --- a/src/pystencils/backend/platforms/generic_cpu.py +++ b/src/pystencils/backend/platforms/generic_cpu.py @@ -4,8 +4,14 @@ from typing import Sequence from ..ast.expressions import PsCall, PsMemAcc, PsConstantExpr from ..ast import PsAstNode -from ..functions import CFunction, MathFunctions, NumericLimitsFunctions, ReductionFunctions, PsMathFunction, \ - PsReductionFunction +from ..functions import ( + CFunction, + MathFunctions, + NumericLimitsFunctions, + ReductionFunctions, + PsMathFunction, + PsReductionFunction, +) from ..literals import PsLiteral from ...compound_op_mapping import compound_op_to_expr from ...sympyextensions import ReductionOp @@ -30,7 +36,8 @@ from ..ast.expressions import ( PsLookup, PsGe, PsLe, - PsTernary, PsLiteralExpr, + PsTernary, + PsLiteralExpr, ) from ..ast.vector import PsVecMemAcc from ...types import PsVectorType, PsCustomType @@ -60,20 +67,31 @@ class GenericCpu(Platform): else: raise MaterializationError(f"Unknown type of iteration space: {ispace}") - def select_function(self, call: PsCall) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]: + def select_function( + self, call: PsCall + ) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]: call_func = call.function assert isinstance(call_func, PsReductionFunction | PsMathFunction) func = call_func.func - if isinstance(call_func, PsReductionFunction) and func is ReductionFunctions.WriteBackToPtr: + if ( + isinstance(call_func, PsReductionFunction) + and func is ReductionFunctions.WriteBackToPtr + ): ptr_expr, symbol_expr = call.args op = call_func.reduction_op - assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptr_expr.dtype, PsPointerType) - assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(symbol_expr.dtype, PsScalarType) + assert isinstance(ptr_expr, PsSymbolExpr) and isinstance( + ptr_expr.dtype, PsPointerType + ) + assert isinstance(symbol_expr, PsSymbolExpr) and isinstance( + symbol_expr.dtype, PsScalarType + ) - ptr_access = PsMemAcc(ptr_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype))) + ptr_access = PsMemAcc( + ptr_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype)) + ) # inspired by OpenMP: local reduction variable (negative sign) is added at the end actual_op = ReductionOp.Add if op is ReductionOp.Sub else op @@ -89,8 +107,16 @@ class GenericCpu(Platform): dtype = call.get_dtype() arg_types = (dtype,) * func.num_args - if isinstance(dtype, PsScalarType) and func in (NumericLimitsFunctions.Min, NumericLimitsFunctions.Max): - return PsLiteralExpr(PsLiteral(f"std::numeric_limits<{dtype.c_string()}>::{func.function_name}()", dtype)) + if isinstance(dtype, PsScalarType) and func in ( + NumericLimitsFunctions.Min, + NumericLimitsFunctions.Max, + ): + return PsLiteralExpr( + PsLiteral( + f"std::numeric_limits<{dtype.c_string()}>::{func.function_name}()", + dtype, + ) + ) if isinstance(dtype, PsIeeeFloatType) and dtype.width in (32, 64): cfunc: CFunction diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py index 9b21457be..2a12d6b7b 100644 --- a/src/pystencils/backend/platforms/generic_gpu.py +++ b/src/pystencils/backend/platforms/generic_gpu.py @@ -24,7 +24,14 @@ from ..kernelcreation import ( ) from ..kernelcreation.context import KernelCreationContext -from ..ast.structural import PsBlock, PsConditional, PsDeclaration, PsStatement, PsAssignment, PsStructuralNode +from ..ast.structural import ( + PsBlock, + PsConditional, + PsDeclaration, + PsStatement, + PsAssignment, + PsStructuralNode, +) from ..ast.expressions import ( PsExpression, PsLiteralExpr, @@ -36,13 +43,19 @@ from ..ast.expressions import ( PsConstantExpr, PsAdd, PsRem, - PsEq + PsEq, ) from ..ast.expressions import PsLt, PsAnd from ...types import PsSignedIntegerType, PsIeeeFloatType from ..literals import PsLiteral -from ..functions import MathFunctions, CFunction, ReductionFunctions, NumericLimitsFunctions, PsReductionFunction, \ - PsMathFunction +from ..functions import ( + MathFunctions, + CFunction, + ReductionFunctions, + NumericLimitsFunctions, + PsReductionFunction, + PsMathFunction, +) int32 = PsSignedIntegerType(width=32, const=False) @@ -131,7 +144,7 @@ class Blockwise4DMapping(ThreadMapping): THREAD_IDX[0], BLOCK_IDX[0], BLOCK_IDX[1], - BLOCK_IDX[2] + BLOCK_IDX[2], ] def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]: @@ -177,7 +190,7 @@ class Blockwise4DMapping(ThreadMapping): class GenericGpu(Platform): """Common base platform for CUDA- and HIP-type GPU targets. - + Args: ctx: The kernel creation context omit_range_check: If `True`, generated index translation code will not check if the point identified @@ -238,23 +251,34 @@ class GenericGpu(Platform): else: raise MaterializationError(f"Unknown type of iteration space: {ispace}") - def select_function(self, call: PsCall) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]: + def select_function( + self, call: PsCall + ) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]: call_func = call.function assert isinstance(call_func, PsReductionFunction | PsMathFunction) func = call_func.func - if isinstance(call_func, PsReductionFunction) and func is ReductionFunctions.WriteBackToPtr: + if ( + isinstance(call_func, PsReductionFunction) + and func is ReductionFunctions.WriteBackToPtr + ): ptr_expr, symbol_expr = call.args op = call_func.reduction_op stype = symbol_expr.dtype ptrtype = ptr_expr.dtype - assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptrtype, PsPointerType) - assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(stype, PsScalarType) + assert isinstance(ptr_expr, PsSymbolExpr) and isinstance( + ptrtype, PsPointerType + ) + assert isinstance(symbol_expr, PsSymbolExpr) and isinstance( + stype, PsScalarType + ) if not isinstance(stype, PsIeeeFloatType) or stype.width not in (32, 64): - NotImplementedError("atomic operations are only available for float32/64 datatypes") + NotImplementedError( + "atomic operations are only available for float32/64 datatypes" + ) # workaround for subtractions -> use additions for reducing intermediate results # similar to OpenMP reductions: local copies (negative sign) are added at the end @@ -274,36 +298,60 @@ class GenericGpu(Platform): # perform local warp reductions def gen_shuffle_instr(offset: int): full_mask = PsLiteralExpr(PsLiteral("0xffffffff", UInt(32))) - return PsCall(CFunction("__shfl_xor_sync", [UInt(32), stype, SInt(32)], stype), - [full_mask, symbol_expr, PsConstantExpr(PsConstant(offset, SInt(32)))]) + return PsCall( + CFunction( + "__shfl_xor_sync", [UInt(32), stype, SInt(32)], stype + ), + [ + full_mask, + symbol_expr, + PsConstantExpr(PsConstant(offset, SInt(32))), + ], + ) # set up shuffle instructions for warp-level reduction num_shuffles = math.frexp(self._warp_size)[1] - shuffles = tuple(PsAssignment(symbol_expr, - compound_op_to_expr(actual_op, - symbol_expr, gen_shuffle_instr(pow(2, i - 1)))) - for i in reversed(range(1, num_shuffles))) + shuffles = tuple( + PsAssignment( + symbol_expr, + compound_op_to_expr( + actual_op, symbol_expr, gen_shuffle_instr(pow(2, i - 1)) + ), + ) + for i in reversed(range(1, num_shuffles)) + ) # find first thread in warp thread_indices_per_dim = [ - idx * PsConstantExpr(PsConstant(reduce(operator.mul, BLOCK_DIM[:i], 1), SInt(32))) - for i, idx in enumerate(THREAD_IDX[:ispace.rank]) + idx + * PsConstantExpr( + PsConstant(reduce(operator.mul, BLOCK_DIM[:i], 1), SInt(32)) + ) + for i, idx in enumerate(THREAD_IDX[: ispace.rank]) ] tid: PsExpression = thread_indices_per_dim[0] for t in thread_indices_per_dim[1:]: tid = PsAdd(tid, t) - first_thread_in_warp = PsEq(PsRem(tid, PsConstantExpr(PsConstant(self._warp_size, SInt(32)))), - PsConstantExpr(PsConstant(0, SInt(32)))) + first_thread_in_warp = PsEq( + PsRem(tid, PsConstantExpr(PsConstant(self._warp_size, SInt(32)))), + PsConstantExpr(PsConstant(0, SInt(32))), + ) # set condition to only execute atomic operation on first valid thread in warp - cond = PsAnd(is_valid_thread, first_thread_in_warp) if is_valid_thread else first_thread_in_warp + cond = ( + PsAnd(is_valid_thread, first_thread_in_warp) + if is_valid_thread + else first_thread_in_warp + ) else: # no optimization: only execute atomic add on valid thread shuffles = () cond = is_valid_thread # use atomic operation - call.function = CFunction(f"atomic{actual_op.name}", [ptrtype, stype], PsCustomType("void")) + call.function = CFunction( + f"atomic{actual_op.name}", [ptrtype, stype], PsCustomType("void") + ) call.args = (ptr_expr, symbol_expr) # assemble warp reduction @@ -321,7 +369,9 @@ class GenericGpu(Platform): case NumericLimitsFunctions.Max: define = "POS_INFINITY" case _: - raise MaterializationError(f"Cannot materialize call to function {func}") + raise MaterializationError( + f"Cannot materialize call to function {func}" + ) return PsLiteralExpr(PsLiteral(define, dtype)) diff --git a/src/pystencils/backend/platforms/platform.py b/src/pystencils/backend/platforms/platform.py index 4f738dd5d..7b81865ae 100644 --- a/src/pystencils/backend/platforms/platform.py +++ b/src/pystencils/backend/platforms/platform.py @@ -12,7 +12,7 @@ class Platform(ABC): """Abstract base class for all supported platforms. The platform performs all target-dependent tasks during code generation: - + - Translation of the iteration space to an index source (loop nest, GPU indexing, ...) - Platform-specific optimizations (e.g. vectorization, OpenMP) """ diff --git a/src/pystencils/backend/platforms/sycl.py b/src/pystencils/backend/platforms/sycl.py index 78af01b2f..22d60f9b0 100644 --- a/src/pystencils/backend/platforms/sycl.py +++ b/src/pystencils/backend/platforms/sycl.py @@ -56,7 +56,9 @@ class SyclPlatform(Platform): else: raise MaterializationError(f"Unknown type of iteration space: {ispace}") - def select_function(self, call: PsCall) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]: + def select_function( + self, call: PsCall + ) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]: assert isinstance(call.function, PsMathFunction) func = call.function.func diff --git a/src/pystencils/backend/platforms/x86.py b/src/pystencils/backend/platforms/x86.py index df0945006..add38cfe4 100644 --- a/src/pystencils/backend/platforms/x86.py +++ b/src/pystencils/backend/platforms/x86.py @@ -359,7 +359,11 @@ def _x86_op_intrin( atype = vtype.scalar_type case PsVecHorizontal(): # horizontal add instead of sub avoids double inversion of sign - actual_op = ReductionOp.Add if op.reduction_op == ReductionOp.Sub else op.reduction_op + actual_op = ( + ReductionOp.Add + if op.reduction_op == ReductionOp.Sub + else op.reduction_op + ) opstr = f"horizontal_{actual_op.name.lower()}" rtype = vtype.scalar_type atypes = (vtype.scalar_type, vtype) @@ -409,7 +413,9 @@ def _x86_op_intrin( case (SInt(64), Fp()) | ( Fp(), SInt(64), - ) if varch < X86VectorArch.AVX512: + ) if ( + varch < X86VectorArch.AVX512 + ): panic() # AVX512 only: cvtepiA_epiT if A > T case (SInt(a), SInt(t)) if a > t and varch < X86VectorArch.AVX512: diff --git a/src/pystencils/backend/transformations/add_pragmas.py b/src/pystencils/backend/transformations/add_pragmas.py index c9e8b3994..fa466e495 100644 --- a/src/pystencils/backend/transformations/add_pragmas.py +++ b/src/pystencils/backend/transformations/add_pragmas.py @@ -126,9 +126,13 @@ class AddOpenMP: if bool(ctx.symbols_reduction_info): for symbol, reduction_info in ctx.symbols_reduction_info.items(): if isinstance(symbol.dtype, PsScalarType): - pragma_text += f" reduction({reduction_info.op.value}: {symbol.name})" + pragma_text += ( + f" reduction({reduction_info.op.value}: {symbol.name})" + ) else: - NotImplementedError("OMP: Reductions for non-scalar data types are not supported yet.") + NotImplementedError( + "OMP: Reductions for non-scalar data types are not supported yet." + ) if collapse is not None: if collapse <= 0: diff --git a/src/pystencils/backend/transformations/loop_vectorizer.py b/src/pystencils/backend/transformations/loop_vectorizer.py index a96c6af4b..04d7d20f0 100644 --- a/src/pystencils/backend/transformations/loop_vectorizer.py +++ b/src/pystencils/backend/transformations/loop_vectorizer.py @@ -7,7 +7,13 @@ from ...types import PsVectorType, PsScalarType from ..kernelcreation import KernelCreationContext from ..constants import PsConstant from ..ast import PsAstNode -from ..ast.structural import PsLoop, PsBlock, PsDeclaration, PsAssignment, PsStructuralNode +from ..ast.structural import ( + PsLoop, + PsBlock, + PsDeclaration, + PsAssignment, + PsStructuralNode, +) from ..ast.expressions import PsExpression, PsTernary, PsGt, PsSymbolExpr from ..ast.vector import PsVecBroadcast, PsVecHorizontal from ..ast.analysis import collect_undefined_symbols @@ -142,13 +148,25 @@ class LoopVectorizer: vector_symb = vc.vectorize_symbol(symb) # Declare and init vector - simd_init_local_reduction_vars += [PsDeclaration( - PsSymbolExpr(vector_symb), PsVecBroadcast(self._lanes, PsSymbolExpr(symb)))] + simd_init_local_reduction_vars += [ + PsDeclaration( + PsSymbolExpr(vector_symb), + PsVecBroadcast(self._lanes, PsSymbolExpr(symb)), + ) + ] # Write back vectorization result - simd_writeback_local_reduction_vars += [PsAssignment( - PsSymbolExpr(symb), PsVecHorizontal(self._lanes, PsSymbolExpr(symb), PsSymbolExpr(vector_symb), - reduction_info.op))] + simd_writeback_local_reduction_vars += [ + PsAssignment( + PsSymbolExpr(symb), + PsVecHorizontal( + self._lanes, + PsSymbolExpr(symb), + PsSymbolExpr(vector_symb), + reduction_info.op, + ), + ) + ] # Generate vectorized loop body simd_body = self._vectorize_ast(loop.body, vc) @@ -241,11 +259,7 @@ class LoopVectorizer: return PsBlock( simd_init_local_reduction_vars - + [ - simd_stop_decl, - simd_step_decl, - simd_loop - ] + + [simd_stop_decl, simd_step_decl, simd_loop] + simd_writeback_local_reduction_vars + [ trailing_start_decl, diff --git a/src/pystencils/backend/transformations/select_functions.py b/src/pystencils/backend/transformations/select_functions.py index 576cebad1..9ce404693 100644 --- a/src/pystencils/backend/transformations/select_functions.py +++ b/src/pystencils/backend/transformations/select_functions.py @@ -21,7 +21,9 @@ class SelectFunctions: if isinstance(node, PsAssignment): rhs = node.rhs - if isinstance(rhs, PsCall) and isinstance(rhs.function, PsReductionFunction): + if isinstance(rhs, PsCall) and isinstance( + rhs.function, PsReductionFunction + ): resolved_func = self._platform.select_function(rhs) match resolved_func: @@ -30,7 +32,9 @@ class SelectFunctions: match new_rhs: case PsExpression(): - return PsBlock(prepend + (PsAssignment(node.lhs, new_rhs),)) + return PsBlock( + prepend + (PsAssignment(node.lhs, new_rhs),) + ) case PsStructuralNode(): # special case: produces structural with atomic operation writing value back to ptr return PsBlock(prepend + (new_rhs,)) diff --git a/src/pystencils/backend/transformations/select_intrinsics.py b/src/pystencils/backend/transformations/select_intrinsics.py index 49fb9bb08..b20614393 100644 --- a/src/pystencils/backend/transformations/select_intrinsics.py +++ b/src/pystencils/backend/transformations/select_intrinsics.py @@ -101,7 +101,9 @@ class SelectIntrinsics: if isinstance(expr, PsVecHorizontal): scalar_op = expr.scalar_operand vector_op_to_scalar = self.visit_expr(expr.vector_operand, sc) - return self._platform.op_intrinsic(expr, [scalar_op, vector_op_to_scalar]) + return self._platform.op_intrinsic( + expr, [scalar_op, vector_op_to_scalar] + ) else: return expr diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index 3962c316b..c285dd7bf 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -26,7 +26,13 @@ from ..types import PsIntegerType, PsScalarType from ..backend.memory import PsSymbol from ..backend.ast import PsAstNode from ..backend.functions import PsReductionFunction, ReductionFunctions -from ..backend.ast.expressions import PsExpression, PsSymbolExpr, PsCall, PsMemAcc, PsConstantExpr +from ..backend.ast.expressions import ( + PsExpression, + PsSymbolExpr, + PsCall, + PsMemAcc, + PsConstantExpr, +) from ..backend.ast.structural import PsBlock, PsLoop, PsDeclaration, PsAssignment from ..backend.ast.analysis import collect_undefined_symbols, collect_required_headers from ..backend.kernelcreation import ( @@ -191,12 +197,20 @@ class DefaultKernelCreationDriver: ptr_symbol_expr = typify(PsSymbolExpr(reduction_info.ptr_symbol)) init_val = typify(reduction_info.init_val) - ptr_access = PsMemAcc(ptr_symbol_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype))) - write_back_ptr = PsCall(PsReductionFunction(ReductionFunctions.WriteBackToPtr, reduction_info.op), - [ptr_symbol_expr, symbol_expr]) + ptr_access = PsMemAcc( + ptr_symbol_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype)) + ) + write_back_ptr = PsCall( + PsReductionFunction( + ReductionFunctions.WriteBackToPtr, reduction_info.op + ), + [ptr_symbol_expr, symbol_expr], + ) - prepend_ast = [PsDeclaration(symbol_expr, init_val)] # declare and init local copy with neutral element - append_ast = [PsAssignment(ptr_access, write_back_ptr)] # write back result to reduction target variable + # declare and init local copy with neutral element + prepend_ast = [PsDeclaration(symbol_expr, init_val)] + # write back result to reduction target variable + append_ast = [PsAssignment(ptr_access, write_back_ptr)] kernel_ast.statements = prepend_ast + kernel_ast.statements kernel_ast.statements += append_ast @@ -423,14 +437,18 @@ class DefaultKernelCreationDriver: idx_scheme: GpuIndexingScheme = self._cfg.gpu.get_option("indexing_scheme") manual_launch_grid: bool = self._cfg.gpu.get_option("manual_launch_grid") - assume_warp_aligned_block_size: bool = self._cfg.gpu.get_option("assume_warp_aligned_block_size") + assume_warp_aligned_block_size: bool = self._cfg.gpu.get_option( + "assume_warp_aligned_block_size" + ) warp_size: int | None = self._cfg.gpu.get_option("warp_size") if warp_size is None: warp_size = GpuOptions.default_warp_size(self._target) if warp_size is None and assume_warp_aligned_block_size: - warn("GPU warp size is unknown - ignoring assumption `assume_warp_aligned_block_size`.") + warn( + "GPU warp size is unknown - ignoring assumption `assume_warp_aligned_block_size`." + ) return GpuIndexing( self._ctx, @@ -475,7 +493,9 @@ class DefaultKernelCreationDriver: else None ) - assume_warp_aligned_block_size: bool = self._cfg.gpu.get_option("assume_warp_aligned_block_size") + assume_warp_aligned_block_size: bool = self._cfg.gpu.get_option( + "assume_warp_aligned_block_size" + ) warp_size: int | None = self._cfg.gpu.get_option("warp_size") GpuPlatform: type diff --git a/src/pystencils/compound_op_mapping.py b/src/pystencils/compound_op_mapping.py index f256369f9..193b308d0 100644 --- a/src/pystencils/compound_op_mapping.py +++ b/src/pystencils/compound_op_mapping.py @@ -3,7 +3,12 @@ from .backend.exceptions import FreezeError from .backend.functions import PsMathFunction, MathFunctions from .sympyextensions.reduction import ReductionOp -_available_operator_interface: set[ReductionOp] = {ReductionOp.Add, ReductionOp.Sub, ReductionOp.Mul, ReductionOp.Div} +_available_operator_interface: set[ReductionOp] = { + ReductionOp.Add, + ReductionOp.Sub, + ReductionOp.Mul, + ReductionOp.Div, +} def compound_op_to_expr(op: ReductionOp, op1, op2) -> PsExpression: @@ -18,7 +23,9 @@ def compound_op_to_expr(op: ReductionOp, op1, op2) -> PsExpression: case ReductionOp.Div: operator = PsDiv case _: - raise FreezeError(f"Found unsupported operation type for compound assignments: {op}.") + raise FreezeError( + f"Found unsupported operation type for compound assignments: {op}." + ) return operator(op1, op2) else: match op: @@ -27,4 +34,6 @@ def compound_op_to_expr(op: ReductionOp, op1, op2) -> PsExpression: case ReductionOp.Max: return PsCall(PsMathFunction(MathFunctions.Max), [op1, op2]) case _: - raise FreezeError(f"Found unsupported operation type for compound assignments: {op}.") + raise FreezeError( + f"Found unsupported operation type for compound assignments: {op}." + ) diff --git a/src/pystencils/jit/cpu_extension_module.py b/src/pystencils/jit/cpu_extension_module.py index 03260f649..4d76ea9ca 100644 --- a/src/pystencils/jit/cpu_extension_module.py +++ b/src/pystencils/jit/cpu_extension_module.py @@ -92,6 +92,7 @@ class PsKernelExtensioNModule: # Kernels and call wrappers from ..backend.emission import CAstPrinter + printer = CAstPrinter(func_prefix="FUNC_PREFIX") for name, kernel in self._kernels.items(): @@ -293,7 +294,9 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{ self._buffer_types[ptr] = ptr_dtype.base_type self.extract_buffer(ptr, param.name) buffer = self.get_buffer(param.name) - code = f"{param.dtype.c_string()} {param.name} = ({param.dtype}) {buffer}.buf;" + code = ( + f"{param.dtype.c_string()} {param.name} = ({param.dtype}) {buffer}.buf;" + ) assert code is not None diff --git a/src/pystencils/sympyextensions/__init__.py b/src/pystencils/sympyextensions/__init__.py index 71f9a049a..bd0fa1fe9 100644 --- a/src/pystencils/sympyextensions/__init__.py +++ b/src/pystencils/sympyextensions/__init__.py @@ -28,7 +28,7 @@ from .math import ( count_operations_in_ast, common_denominator, get_symmetric_part, - SymbolCreator + SymbolCreator, ) @@ -67,5 +67,5 @@ __all__ = [ "common_denominator", "get_symmetric_part", "SymbolCreator", - "DynamicType" + "DynamicType", ] diff --git a/src/pystencils/sympyextensions/reduction.py b/src/pystencils/sympyextensions/reduction.py index cebfcb2f7..e95e37c24 100644 --- a/src/pystencils/sympyextensions/reduction.py +++ b/src/pystencils/sympyextensions/reduction.py @@ -22,6 +22,7 @@ class ReductionAssignment(AssignmentBase): reduction_op : ReductionOp Enum for binary operation being applied in the assignment, such as "Add" for "+", "Sub" for "-", etc. """ + _reduction_op = None # type: ReductionOp @property @@ -55,9 +56,13 @@ class MaxReductionAssignment(ReductionAssignment): # Mapping from ReductionOp enum to ReductionAssigment classes _reduction_assignment_classes = { - cls.reduction_op: cls for cls in [ - AddReductionAssignment, SubReductionAssignment, MulReductionAssignment, - MinReductionAssignment, MaxReductionAssignment + cls.reduction_op: cls + for cls in [ + AddReductionAssignment, + SubReductionAssignment, + MulReductionAssignment, + MinReductionAssignment, + MaxReductionAssignment, ] } diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py index c3775964b..6e2b2f3fe 100644 --- a/tests/kernelcreation/test_reduction.py +++ b/tests/kernelcreation/test_reduction.py @@ -10,7 +10,7 @@ SIZE = 15 SOLUTION = { "+": INIT_W + INIT_ARR * SIZE, "-": INIT_W - INIT_ARR * SIZE, - "*": INIT_W * INIT_ARR ** SIZE, + "*": INIT_W * INIT_ARR**SIZE, "min": min(INIT_W, INIT_ARR), "max": max(INIT_W, INIT_ARR), } @@ -18,7 +18,7 @@ SOLUTION = { # get AST for kernel with reduction assignment def get_reduction_assign_ast(dtype, op, config): - x = ps.fields(f'x: {dtype}[1d]') + x = ps.fields(f"x: {dtype}[1d]") w = ps.TypedSymbol("w", dtype) red_assign = reduction_assignment_from_str(w, op, x.center()) @@ -26,13 +26,18 @@ def get_reduction_assign_ast(dtype, op, config): return ps.create_kernel([red_assign], config, default_dtype=dtype) -@pytest.mark.parametrize('instruction_set', ['sse', 'avx']) -@pytest.mark.parametrize('dtype', ["float64", "float32"]) +@pytest.mark.parametrize("instruction_set", ["sse", "avx"]) +@pytest.mark.parametrize("dtype", ["float64", "float32"]) @pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"]) def test_reduction_cpu(instruction_set, dtype, op): - vectorize_info = {'instruction_set': instruction_set, 'assume_inner_stride_one': True} + vectorize_info = { + "instruction_set": instruction_set, + "assume_inner_stride_one": True, + } - config = ps.CreateKernelConfig(target=ps.Target.CPU, cpu_openmp=True, cpu_vectorize_info=vectorize_info) + config = ps.CreateKernelConfig( + target=ps.Target.CPU, cpu_openmp=True, cpu_vectorize_info=vectorize_info + ) ast_reduction = get_reduction_assign_ast(dtype, op, config) ps.show_code(ast_reduction) @@ -45,7 +50,7 @@ def test_reduction_cpu(instruction_set, dtype, op): assert np.allclose(reduction_array, SOLUTION[op]) -@pytest.mark.parametrize('dtype', ["float64", "float32"]) +@pytest.mark.parametrize("dtype", ["float64", "float32"]) @pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"]) def test_reduction_gpu(dtype, op): try: @@ -57,7 +62,9 @@ def test_reduction_gpu(dtype, op): except ImportError: pytest.skip(reason="CuPy is not available", allow_module_level=True) except CUDARuntimeError: - pytest.skip(reason="No CUDA capable device is detected", allow_module_level=True) + pytest.skip( + reason="No CUDA capable device is detected", allow_module_level=True + ) config = ps.CreateKernelConfig(target=ps.Target.GPU) -- GitLab