diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index 90efebe61758d152506f05ece4722b23a7541098..9877cea44d3a01a79ddd9eecdd5b5e79cf71527f 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -168,10 +168,11 @@ class CudaPlatform(GenericGpu): match op: case ReductionOp.Sub: - # workaround for unsupported atomicSub: use atomic add and invert sign + # workaround for unsupported atomicSub: use atomic add + # similar to OpenMP reductions: local copies (negative sign) are added at the end call.function = CFunction(f"atomicAdd", [ptr_expr.dtype, symbol_expr.dtype], PsCustomType("void")) - call.args = (ptr_expr, -symbol_expr) + call.args = (ptr_expr, symbol_expr) case _: call.function = CFunction(f"atomic{op.name}", [ptr_expr.dtype, symbol_expr.dtype], PsCustomType("void")) diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py index 7655572de3765990a21dd14579879e2dd15f8fb2..1e7468e33768f45c989cb89cd617475762570bce 100644 --- a/src/pystencils/backend/platforms/generic_cpu.py +++ b/src/pystencils/backend/platforms/generic_cpu.py @@ -8,6 +8,7 @@ from ..functions import CFunction, PsMathFunction, MathFunctions, NumericLimitsF PsReductionFunction from ..literals import PsLiteral from ...compound_op_mapping import compound_op_to_expr +from ...sympyextensions import ReductionOp from ...types import PsIntegerType, PsIeeeFloatType, PsScalarType, PsPointerType from .platform import Platform @@ -81,8 +82,11 @@ class GenericCpu(Platform): ptr_access = PsMemAcc(ptr_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype))) + # inspired by OpenMP: local reduction variable (negative sign) is added at the end + actual_op = ReductionOp.Add if op is ReductionOp.Sub else op + # TODO: can this be avoided somehow? - potential_call = compound_op_to_expr(op, ptr_access, symbol_expr) + potential_call = compound_op_to_expr(actual_op, ptr_access, symbol_expr) if isinstance(potential_call, PsCall): potential_call.dtype = symbol_expr.dtype potential_call = self.select_function(potential_call) diff --git a/src/pystencils/backend/platforms/x86.py b/src/pystencils/backend/platforms/x86.py index ee14d16898bb2eef0efc67c5b9b001c6df128f94..02b5ea6db120f2b296833f6f74bc51ca2819ebe2 100644 --- a/src/pystencils/backend/platforms/x86.py +++ b/src/pystencils/backend/platforms/x86.py @@ -18,6 +18,7 @@ from ..ast.expressions import ( PsCall, ) from ..ast.vector import PsVecMemAcc, PsVecBroadcast, PsVecHorizontal +from ...sympyextensions import ReductionOp from ...types import PsCustomType, PsVectorType, PsPointerType, PsType from ..constants import PsConstant @@ -357,7 +358,9 @@ def _x86_op_intrin( suffix += "x" atype = vtype.scalar_type case PsVecHorizontal(): - opstr = f"horizontal_{op.reduction_op.name.lower()}" + # horizontal add instead of sub avoids double inversion of sign + actual_op = ReductionOp.Add if op.reduction_op == ReductionOp.Sub else op.reduction_op + opstr = f"horizontal_{actual_op.name.lower()}" rtype = vtype.scalar_type atypes = (vtype.scalar_type, vtype) case PsAdd(): diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py index 12dc4ba1cdb9f0a84562fc7e89d4a1dff68d5939..537eb4b67ade9f38fdaa05626c6dc29511ad2cad 100644 --- a/tests/kernelcreation/test_reduction.py +++ b/tests/kernelcreation/test_reduction.py @@ -10,7 +10,7 @@ INIT_ARR = 2 SIZE = 15 SOLUTION = { "+": INIT_W + INIT_ARR * SIZE, - "-": INIT_W - INIT_ARR * -SIZE, + "-": INIT_W - INIT_ARR * SIZE, "*": INIT_W * INIT_ARR ** SIZE, "min": min(INIT_W, INIT_ARR), "max": max(INIT_W, INIT_ARR),