diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index 7a5074677433830133fef4c2f0f31761b9014284..8c3cd45faf2a210bf8de16e3f25d9ba7897d4e0f 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -1,11 +1,32 @@
 from __future__ import annotations
 
+import math
+
 from .generic_gpu import GenericGpu
-from ..ast.expressions import PsExpression, PsLiteralExpr
+from ..ast import PsAstNode
+from ..ast.expressions import (
+    PsExpression,
+    PsLiteralExpr,
+    PsCall,
+    PsAnd,
+    PsConstantExpr,
+    PsSymbolExpr,
+)
+from ..ast.structural import (
+    PsConditional,
+    PsStatement,
+    PsAssignment,
+    PsBlock,
+    PsStructuralNode,
+)
+from ..constants import PsConstant
 from ..exceptions import MaterializationError
-from ..functions import NumericLimitsFunctions
+from ..functions import NumericLimitsFunctions, CFunction
 from ..literals import PsLiteral
-from ...types import PsType, PsIeeeFloatType
+from ...reduction_op_mapping import reduction_op_to_expr
+from ...sympyextensions import ReductionOp
+from ...types import PsType, PsIeeeFloatType, PsCustomType, PsPointerType, PsScalarType
+from ...types.quick import SInt, UInt
 
 
 class CudaPlatform(GenericGpu):
@@ -17,7 +38,92 @@ class CudaPlatform(GenericGpu):
         '"npp.h"',
     }
 
-    def resolve_numeric_limits(self, func: NumericLimitsFunctions, dtype: PsType) -> PsExpression:
+    def resolve_reduction(
+        self,
+        ptr_expr: PsExpression,
+        symbol_expr: PsExpression,
+        reduction_op: ReductionOp,
+    ) -> tuple[tuple[PsStructuralNode, ...], PsAstNode]:
+        stype = symbol_expr.dtype
+        ptrtype = ptr_expr.dtype
+
+        assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptrtype, PsPointerType)
+        assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(stype, PsScalarType)
+
+        if not isinstance(stype, PsIeeeFloatType) or stype.width not in (32, 64):
+            raise NotImplementedError(
+                "atomic operations are only available for float32/64 datatypes"
+            )
+
+        # workaround for subtractions -> use additions for reducing intermediate results
+        # similar to OpenMP reductions: local copies (negative sign) are added at the end
+        match reduction_op:
+            case ReductionOp.Sub:
+                actual_reduction_op = ReductionOp.Add
+            case _:
+                actual_reduction_op = reduction_op
+
+        # check if thread is valid for performing reduction
+        ispace = self._ctx.get_iteration_space()
+        is_valid_thread = self._get_condition_for_translation(ispace)
+
+        cond: PsExpression
+        shuffles: tuple[PsAssignment, ...]
+        if self._warp_size and self._assume_warp_aligned_block_size:
+            # perform local warp reductions
+            def gen_shuffle_instr(offset: int):
+                full_mask = PsLiteralExpr(PsLiteral("0xffffffff", UInt(32)))
+                return PsCall(
+                    CFunction("__shfl_xor_sync", [UInt(32), stype, SInt(32)], stype),
+                    [
+                        full_mask,
+                        symbol_expr,
+                        PsConstantExpr(PsConstant(offset, SInt(32))),
+                    ],
+                )
+
+            # set up shuffle instructions for warp-level reduction
+            num_shuffles = math.frexp(self._warp_size)[1]
+            shuffles = tuple(
+                PsAssignment(
+                    symbol_expr,
+                    reduction_op_to_expr(
+                        actual_reduction_op,
+                        symbol_expr,
+                        gen_shuffle_instr(pow(2, i - 1)),
+                    ),
+                )
+                for i in reversed(range(1, num_shuffles))
+            )
+
+            # find first thread in warp
+            first_thread_in_warp = self._first_thread_in_warp(ispace)
+
+            # set condition to only execute atomic operation on first valid thread in warp
+            cond = (
+                PsAnd(is_valid_thread, first_thread_in_warp)
+                if is_valid_thread
+                else first_thread_in_warp
+            )
+        else:
+            # no optimization: only execute atomic operation on valid thread
+            shuffles = ()
+            cond = is_valid_thread
+
+        # use atomic operation
+        func = CFunction(
+            f"atomic{actual_reduction_op.name}", [ptrtype, stype], PsCustomType("void")
+        )
+        func_args = (ptr_expr, symbol_expr)
+
+        # assemble warp reduction
+        return shuffles, PsConditional(
+            cond, PsBlock([PsStatement(PsCall(func, func_args))])
+        )
+
+    def resolve_numeric_limits(
+        self, func: NumericLimitsFunctions, dtype: PsType
+    ) -> PsExpression:
         assert isinstance(dtype, PsIeeeFloatType)
 
         match func:
diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py
index 349e79d4b6a727e84d9f2ec6e941c231ad3fcb81..8a4dd11a29c3243bc1cbad2a857646a1a7be038e 100644
--- a/src/pystencils/backend/platforms/generic_gpu.py
+++ b/src/pystencils/backend/platforms/generic_gpu.py
@@ -1,16 +1,19 @@
 from __future__ import annotations
 
-import math
 import operator
 from abc import ABC, abstractmethod
 from functools import reduce
 
 from ..ast import PsAstNode
 from ..constants import PsConstant
-from ...reduction_op_mapping import reduction_op_to_expr
 from ...sympyextensions.reduction import ReductionOp
-from ...types import constify, deconstify, PsPointerType, PsScalarType, PsCustomType, PsType
-from ...types.quick import UInt, SInt
+from ...types import (
+    constify,
+    deconstify,
+    PsScalarType,
+    PsType,
+)
+from ...types.quick import SInt
 from ..exceptions import MaterializationError
 from .platform import Platform
 
@@ -28,8 +31,6 @@ from ..ast.structural import (
     PsBlock,
     PsConditional,
     PsDeclaration,
-    PsStatement,
-    PsAssignment,
     PsStructuralNode,
 )
 from ..ast.expressions import (
@@ -39,7 +40,6 @@ from ..ast.expressions import (
     PsCall,
     PsLookup,
     PsBufferAcc,
-    PsSymbolExpr,
     PsConstantExpr,
     PsAdd,
     PsRem,
@@ -206,7 +206,18 @@ class GenericGpu(Platform):
     }
 
     @abstractmethod
-    def resolve_numeric_limits(self, func: NumericLimitsFunctions, dtype: PsType) -> PsExpression:
+    def resolve_numeric_limits(
+        self, func: NumericLimitsFunctions, dtype: PsType
+    ) -> PsExpression:
+        pass
+
+    @abstractmethod
+    def resolve_reduction(
+        self,
+        ptr_expr: PsExpression,
+        symbol_expr: PsExpression,
+        reduction_op: ReductionOp,
+    ) -> tuple[tuple[PsStructuralNode, ...], PsAstNode]:
         pass
 
     def __init__(
@@ -262,6 +273,31 @@ class GenericGpu(Platform):
         else:
             raise MaterializationError(f"Unknown type of iteration space: {ispace}")
 
+    @staticmethod
+    def _thread_index_per_dim(ispace: IterationSpace) -> tuple[PsExpression, ...]:
+        """Returns thread indices multiplied by block dimension strides, per dimension."""
+
+        return tuple(
+            idx
+            * PsConstantExpr(
+                PsConstant(reduce(operator.mul, BLOCK_DIM[:i], 1), SInt(32))
+            )
+            for i, idx in enumerate(THREAD_IDX[: ispace.rank])
+        )
+
+    def _first_thread_in_warp(self, ispace: IterationSpace) -> PsExpression:
+        """Returns an expression that determines whether a thread is the first within its warp."""
+
+        tids_per_dim = GenericGpu._thread_index_per_dim(ispace)
+        tid: PsExpression = tids_per_dim[0]
+        for t in tids_per_dim[1:]:
+            tid = PsAdd(tid, t)
+
+        return PsEq(
+            PsRem(tid, PsConstantExpr(PsConstant(self._warp_size, SInt(32)))),
+            PsConstantExpr(PsConstant(0, SInt(32))),
+        )
+
     def select_function(
         self, call: PsCall
     ) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]:
@@ -276,97 +312,8 @@ class GenericGpu(Platform):
         ):
             ptr_expr, symbol_expr = call.args
             op = call_func.reduction_op
-            stype = symbol_expr.dtype
-            ptrtype = ptr_expr.dtype
-
-            assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(
-                ptrtype, PsPointerType
-            )
-            assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(
-                stype, PsScalarType
-            )
-
-            if not isinstance(stype, PsIeeeFloatType) or stype.width not in (32, 64):
-                NotImplementedError(
-                    "atomic operations are only available for float32/64 datatypes"
-                )
-
-            # workaround for subtractions -> use additions for reducing intermediate results
-            # similar to OpenMP reductions: local copies (negative sign) are added at the end
-            match op:
-                case ReductionOp.Sub:
-                    actual_op = ReductionOp.Add
-                case _:
-                    actual_op = op
-
-            # check if thread is valid for performing reduction
-            ispace = self._ctx.get_iteration_space()
-            is_valid_thread = self._get_condition_for_translation(ispace)
-
-            cond: PsExpression
-            shuffles: tuple[PsAssignment, ...]
-            if self._warp_size and self._assume_warp_aligned_block_size:
-                # perform local warp reductions
-                def gen_shuffle_instr(offset: int):
-                    full_mask = PsLiteralExpr(PsLiteral("0xffffffff", UInt(32)))
-                    return PsCall(
-                        CFunction(
-                            "__shfl_xor_sync", [UInt(32), stype, SInt(32)], stype
-                        ),
-                        [
-                            full_mask,
-                            symbol_expr,
-                            PsConstantExpr(PsConstant(offset, SInt(32))),
-                        ],
-                    )
-
-                # set up shuffle instructions for warp-level reduction
-                num_shuffles = math.frexp(self._warp_size)[1]
-                shuffles = tuple(
-                    PsAssignment(
-                        symbol_expr,
-                        reduction_op_to_expr(
-                            actual_op, symbol_expr, gen_shuffle_instr(pow(2, i - 1))
-                        ),
-                    )
-                    for i in reversed(range(1, num_shuffles))
-                )
-
-                # find first thread in warp
-                thread_indices_per_dim = [
-                    idx
-                    * PsConstantExpr(
-                        PsConstant(reduce(operator.mul, BLOCK_DIM[:i], 1), SInt(32))
-                    )
-                    for i, idx in enumerate(THREAD_IDX[: ispace.rank])
-                ]
-                tid: PsExpression = thread_indices_per_dim[0]
-                for t in thread_indices_per_dim[1:]:
-                    tid = PsAdd(tid, t)
-                first_thread_in_warp = PsEq(
-                    PsRem(tid, PsConstantExpr(PsConstant(self._warp_size, SInt(32)))),
-                    PsConstantExpr(PsConstant(0, SInt(32))),
-                )
-
-                # set condition to only execute atomic operation on first valid thread in warp
-                cond = (
-                    PsAnd(is_valid_thread, first_thread_in_warp)
-                    if is_valid_thread
-                    else first_thread_in_warp
-                )
-            else:
-                # no optimization: only execute atomic add on valid thread
-                shuffles = ()
-                cond = is_valid_thread
-
-            # use atomic operation
-            call.function = CFunction(
-                f"atomic{actual_op.name}", [ptrtype, stype], PsCustomType("void")
-            )
-            call.args = (ptr_expr, symbol_expr)
-
-            # assemble warp reduction
-            return shuffles, PsConditional(cond, PsBlock([PsStatement(call)]))
+
+            return self.resolve_reduction(ptr_expr, symbol_expr, op)
 
         dtype = call.get_dtype()
         arg_types = (dtype,) * func.num_args
diff --git a/src/pystencils/backend/platforms/hip.py b/src/pystencils/backend/platforms/hip.py
index 45d60452bf9b560d1c55230f7c11906026db164b..404d9bb27bf7eded15b9e2f7dc60fb70ceedb1fe 100644
--- a/src/pystencils/backend/platforms/hip.py
+++ b/src/pystencils/backend/platforms/hip.py
@@ -1,9 +1,13 @@
 from __future__ import annotations
 
 from .generic_gpu import GenericGpu
+from ..ast import PsAstNode
 from ..ast.expressions import PsExpression, PsLiteralExpr
+from ..ast.structural import PsStructuralNode
+from ..exceptions import MaterializationError
 from ..functions import NumericLimitsFunctions
 from ..literals import PsLiteral
+from ...sympyextensions import ReductionOp
 from ...types import PsType, PsIeeeFloatType
 
 
@@ -12,12 +16,11 @@ class HipPlatform(GenericGpu):
 
     @property
     def required_headers(self) -> set[str]:
-        return super().required_headers | {
-            '"pystencils_runtime/hip.h"',
-            "<limits>"
-        }
+        return super().required_headers | {'"pystencils_runtime/hip.h"', "<limits>"}
 
-    def resolve_numeric_limits(self, func: NumericLimitsFunctions, dtype: PsType) -> PsExpression:
+    def resolve_numeric_limits(
+        self, func: NumericLimitsFunctions, dtype: PsType
+    ) -> PsExpression:
         assert isinstance(dtype, PsIeeeFloatType)
 
         return PsLiteralExpr(
@@ -26,3 +29,12 @@
                 dtype,
             )
         )
+
+    def resolve_reduction(
+        self,
+        ptr_expr: PsExpression,
+        symbol_expr: PsExpression,
+        reduction_op: ReductionOp,
+    ) -> tuple[tuple[PsStructuralNode, ...], PsAstNode]:
+
+        raise MaterializationError("Reductions are not yet supported in the HIP backend.")
diff --git a/src/pystencils/include/simd_horizontal_helpers.h b/src/pystencils/include/simd_horizontal_helpers.h
index cd4bd5730657930f7a2375f74df0f5691ebaaf15..bd1889153a56876032781b77dcb66a413b3fe07d 100644
--- a/src/pystencils/include/simd_horizontal_helpers.h
+++ b/src/pystencils/include/simd_horizontal_helpers.h
@@ -120,7 +120,7 @@ inline float _mm256_horizontal_max_ps(float dst, __m256 src) {
 
 #endif
 
-#if defined(__AVX512VL__)
+#if defined(__AVX512F__)
 #include <immintrin.h>
 
 inline double _mm512_horizontal_add_pd(double dst, __m512d src) {
diff --git a/util/generate_simd_horizontal_op.py b/util/generate_simd_horizontal_op.py
index aebbf35bb65dd9fd66374a98a038ed3c4a3151db..1d652c6e1f53f5a7b060f2a725162ef07f1fdfae 100644
--- a/util/generate_simd_horizontal_op.py
+++ b/util/generate_simd_horizontal_op.py
@@ -277,7 +277,7 @@ vtypes_for_instruction_set = {
 guards_for_instruction_sets = {
     InstructionSets.SSE3: "__SSE3__",
     InstructionSets.AVX: "__AVX__",
-    InstructionSets.AVX512: '__AVX512VL__',
+    InstructionSets.AVX512: '__AVX512F__',
     InstructionSets.NEON: '_M_ARM64',
 }
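For orientation, below is a hand-written CUDA sketch of the pattern that resolve_reduction is intended to emit: a warp-level XOR-shuffle reduction followed by a single atomic per warp. It is not part of the patch and not generated output; it assumes a warp size of 32, a warp-aligned block size (the _assume_warp_aligned_block_size fast path) and a float sum reduction, and the identifiers reduce_sum, in, result and n are illustrative only.

// Reviewer sketch only -- approximates the construct the backend is meant to generate.
#include <cuda_runtime.h>

__global__ void reduce_sum(const float *in, float *result, int n) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    // "is_valid_thread": out-of-range threads contribute the identity element
    float local = (tid < n) ? in[tid] : 0.0f;

    // Butterfly/XOR shuffle with offsets 16, 8, 4, 2, 1 (cf. gen_shuffle_instr);
    // afterwards every lane of the warp holds the full warp sum.
    for (int offset = 16; offset > 0; offset >>= 1)
        local += __shfl_xor_sync(0xffffffffu, local, offset);

    // Only the first lane of each warp performs the atomic write-back
    // (cf. _first_thread_in_warp), so one atomic is issued per warp.
    if ((threadIdx.x % 32) == 0)
        atomicAdd(result, local);
}

Reducing inside the warp first means each warp issues one atomic instead of 32, which is the motivation for guarding the atomic with the first-thread-in-warp condition rather than with the validity condition alone.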