diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index bb42e1f9bb35a33acc13b95d026d76f8d941cc4c..95480de93e72698dbe2b28c997bd096be6b383e5 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -3,7 +3,7 @@ from warnings import warn
 from typing import TYPE_CHECKING
 
 from ..ast import PsAstNode
-from ...types import constify
+from ...types import constify, PsPointerType, PsScalarType, PsCustomType
 from ..exceptions import MaterializationError
 from .generic_gpu import GenericGpu
 
@@ -23,12 +23,13 @@ from ..ast.expressions import (
     PsCast,
     PsCall,
     PsLookup,
-    PsBufferAcc,
+    PsBufferAcc,
+    PsSymbolExpr,
 )
 from ..ast.expressions import PsLt, PsAnd
 from ...types import PsSignedIntegerType, PsIeeeFloatType
 from ..literals import PsLiteral
-from ..functions import PsMathFunction, MathFunctions, CFunction
+from ..functions import PsMathFunction, MathFunctions, CFunction, PsReductionFunction, ReductionFunctions
 
 if TYPE_CHECKING:
     from ...codegen import GpuIndexingConfig, GpuThreadsRange
@@ -138,7 +138,40 @@ class CudaPlatform(GenericGpu):
     def unfold_function(
         self, call: PsCall
     ) -> PsAstNode:
-        pass
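+        """Unfold calls to reduction functions into their CUDA implementations."""
+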
+        assert isinstance(call.function, PsReductionFunction)
+
+        func = call.function.func
+
+        match func:
+            case ReductionFunctions.InitLocalCopy:
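+                # Declare the thread-local copy and initialize it with the reduction's initial value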
+                symbol_expr, init_val = call.args
+                assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(init_val, PsExpression)
+
+                return PsDeclaration(symbol_expr, init_val)
+            case ReductionFunctions.WriteBackToPtr:
+                ptr_expr, symbol_expr = call.args
+                op = call.function.op
+
+                assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptr_expr.dtype, PsPointerType)
+                assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(symbol_expr.dtype, PsScalarType)
+
+                if not isinstance(symbol_expr.dtype, PsIeeeFloatType) or symbol_expr.dtype.width not in (32, 64):
+                    raise NotImplementedError(
+                        f"atomic{op.name} is only implemented for float32/float64 data types"
+                    )
+
+                # Replace the call with the corresponding CUDA atomic, e.g. atomicAdd or atomicMul
+                call.function = CFunction(
+                    f"atomic{op.name}", [ptr_expr.dtype, symbol_expr.dtype], PsCustomType("void")
+                )
+                call.args = [ptr_expr, symbol_expr]
+
+                return call
+            case _:
+                raise MaterializationError(f"Unknown reduction function: {func}")
 
     #   Internals
 
diff --git a/src/pystencils/include/gpu_defines.h b/src/pystencils/include/gpu_defines.h
index 67e7722e9a01b217584dce14f0dcec16d2025c80..04eeace47c0f1b2659883d4b44e464d587adb2a7 100644
--- a/src/pystencils/include/gpu_defines.h
+++ b/src/pystencils/include/gpu_defines.h
@@ -10,3 +10,34 @@ typedef __hip_int8_t int8_t;
 typedef __hip_uint16_t uint16_t;
 typedef __hip_int16_t int16_t;
 #endif
+
+#ifdef __CUDA_ARCH__
+// Implementation of atomic multiplication
+// See https://stackoverflow.com/questions/43354798/atomic-multiplication-and-division
+__device__ double atomicMul(double* address, double val) {
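+    // atomicCAS only operates on integer types, so work on the double's 64-bit pattern.
+    // Note: the integer comparison also avoids an endless loop for NaN (since NaN != NaN).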
+    unsigned long long int* address_as_ull = (unsigned long long int*)address;
+    unsigned long long int oldValue = *address_as_ull, assumed;
+    do {
+      assumed = oldValue;
+      oldValue = atomicCAS(address_as_ull, assumed, __double_as_longlong(val *
+                           __longlong_as_double(assumed)));
+    } while (assumed != oldValue);
+
+    return __longlong_as_double(oldValue);
+}
+
+__device__ float atomicMul(float* address, float val) {
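+    // Same compare-and-swap loop as above, operating on the float's 32-bit pattern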
+    int* address_as_int = (int*)address;
+    int oldValue = *address_as_int, assumed;
+    do {
+      assumed = oldValue;
+      oldValue = atomicCAS(address_as_int, assumed,
+                           __float_as_int(val * __int_as_float(assumed)));
+    } while (assumed != oldValue);
+
+    return __int_as_float(oldValue);
+}
+#endif