diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index 90efebe61758d152506f05ece4722b23a7541098..9877cea44d3a01a79ddd9eecdd5b5e79cf71527f 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -168,10 +168,11 @@ class CudaPlatform(GenericGpu):
 
                 match op:
                     case ReductionOp.Sub:
-                        # workaround for unsupported atomicSub: use atomic add and invert sign
+                        # workaround for unsupported atomicSub: use atomic add
+                        # similar to OpenMP reductions: local copies (negative sign) are added at the end
                         call.function = CFunction(f"atomicAdd", [ptr_expr.dtype, symbol_expr.dtype],
                                           PsCustomType("void"))
-                        call.args = (ptr_expr, -symbol_expr)
+                        call.args = (ptr_expr, symbol_expr)
                     case _:
                         call.function = CFunction(f"atomic{op.name}", [ptr_expr.dtype, symbol_expr.dtype],
                                                   PsCustomType("void"))
diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py
index 7655572de3765990a21dd14579879e2dd15f8fb2..1e7468e33768f45c989cb89cd617475762570bce 100644
--- a/src/pystencils/backend/platforms/generic_cpu.py
+++ b/src/pystencils/backend/platforms/generic_cpu.py
@@ -8,6 +8,7 @@ from ..functions import CFunction, PsMathFunction, MathFunctions, NumericLimitsF
     PsReductionFunction
 from ..literals import PsLiteral
 from ...compound_op_mapping import compound_op_to_expr
+from ...sympyextensions import ReductionOp
 from ...types import PsIntegerType, PsIeeeFloatType, PsScalarType, PsPointerType
 
 from .platform import Platform
@@ -81,8 +82,11 @@ class GenericCpu(Platform):
 
                 ptr_access = PsMemAcc(ptr_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype)))
 
+                # inspired by OpenMP: local reduction variable (negative sign) is added at the end
+                actual_op = ReductionOp.Add if op is ReductionOp.Sub else op
+
                 # TODO: can this be avoided somehow?
-                potential_call = compound_op_to_expr(op, ptr_access, symbol_expr)
+                potential_call = compound_op_to_expr(actual_op, ptr_access, symbol_expr)
                 if isinstance(potential_call, PsCall):
                     potential_call.dtype = symbol_expr.dtype
                     potential_call = self.select_function(potential_call)
diff --git a/src/pystencils/backend/platforms/x86.py b/src/pystencils/backend/platforms/x86.py
index ee14d16898bb2eef0efc67c5b9b001c6df128f94..02b5ea6db120f2b296833f6f74bc51ca2819ebe2 100644
--- a/src/pystencils/backend/platforms/x86.py
+++ b/src/pystencils/backend/platforms/x86.py
@@ -18,6 +18,7 @@ from ..ast.expressions import (
     PsCall,
 )
 from ..ast.vector import PsVecMemAcc, PsVecBroadcast, PsVecHorizontal
+from ...sympyextensions import ReductionOp
 from ...types import PsCustomType, PsVectorType, PsPointerType, PsType
 from ..constants import PsConstant
 
@@ -357,7 +358,9 @@ def _x86_op_intrin(
                 suffix += "x"
             atype = vtype.scalar_type
         case PsVecHorizontal():
-            opstr = f"horizontal_{op.reduction_op.name.lower()}"
+            # horizontal add instead of sub avoids double inversion of sign
+            actual_op = ReductionOp.Add if op.reduction_op == ReductionOp.Sub else op.reduction_op
+            opstr = f"horizontal_{actual_op.name.lower()}"
             rtype = vtype.scalar_type
             atypes = (vtype.scalar_type, vtype)
         case PsAdd():
diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py
index 12dc4ba1cdb9f0a84562fc7e89d4a1dff68d5939..537eb4b67ade9f38fdaa05626c6dc29511ad2cad 100644
--- a/tests/kernelcreation/test_reduction.py
+++ b/tests/kernelcreation/test_reduction.py
@@ -10,7 +10,7 @@ INIT_ARR = 2
 SIZE = 15
 SOLUTION = {
     "+": INIT_W + INIT_ARR * SIZE,
-    "-": INIT_W - INIT_ARR * -SIZE,
+    "-": INIT_W - INIT_ARR * SIZE,
     "*": INIT_W * INIT_ARR ** SIZE,
     "min": min(INIT_W, INIT_ARR),
     "max": max(INIT_W, INIT_ARR),