diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index 1af8917ccb7a4d07b700e43ae22958bf45817aa3..f9fbdfa56785aaf6cb897d8c2064c53d69b997fb 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -3,6 +3,7 @@ from warnings import warn
 from typing import TYPE_CHECKING
 
 from ..ast import PsAstNode
+from ...sympyextensions.reduction import ReductionOp
 from ...types import constify, PsPointerType, PsScalarType, PsCustomType
 from ..exceptions import MaterializationError
 from .generic_gpu import GenericGpu
@@ -165,9 +166,16 @@ class CudaPlatform(GenericGpu):
                 assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptr_expr.dtype, PsPointerType)
                 assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(symbol_expr.dtype, PsScalarType)
 
-                call.function = CFunction(f"atomic{op.name}", [ptr_expr.dtype, symbol_expr.dtype],
-                                          PsCustomType("void"))
-                call.args = (ptr_expr, symbol_expr)
+                match op:
+                    case ReductionOp.Sub:
+                        # CUDA has no atomicSub overload for floating-point types:
+                        # emulate it by atomically adding the negated operand
+                        call.function = CFunction("atomicAdd", [ptr_expr.dtype, symbol_expr.dtype],
+                                                  PsCustomType("void"))
+                        call.args = (ptr_expr, -symbol_expr)
+                    case _:
+                        call.function = CFunction(f"atomic{op.name}", [ptr_expr.dtype, symbol_expr.dtype],
+                                                  PsCustomType("void"))
+                        call.args = (ptr_expr, symbol_expr)
 
-                if not isinstance(symbol_expr.dtype, PsIeeeFloatType) or symbol_expr.dtype.width not in (32, 64):
-                    NotImplementedError("atomicMul is only available for float32/64 datatypes")
+                if not isinstance(symbol_expr.dtype, PsIeeeFloatType) or symbol_expr.dtype.width not in (32, 64):
+                    raise NotImplementedError("Atomic operations are only available for float32/float64 datatypes")
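
Note on the atomicSub branch above: CUDA's built-in atomicSub only has integer overloads, so for
floating-point reductions the backend emits atomicAdd with a negated operand instead. A minimal
device-side sketch of the equivalent pattern (illustrative only, not generated code; the helper
name atomicSubFloat is hypothetical):

    __device__ void atomicSubFloat(float *address, float val) {
        // There is no atomicSub(float*, float): subtracting val atomically is adding -val atomically.
        atomicAdd(address, -val);
    }
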
diff --git a/src/pystencils/include/gpu_defines.h b/src/pystencils/include/gpu_defines.h
index 8f961e25b6ea1a4b616af4dbba437c5dbb66d524..5525bbc69188fa143ba9146d286a93d392f34bf2 100644
--- a/src/pystencils/include/gpu_defines.h
+++ b/src/pystencils/include/gpu_defines.h
@@ -14,8 +14,11 @@ typedef __hip_int16_t int16_t;
 #endif
 
 #ifdef __CUDA_ARCH__
-// Implementation of atomic multiplication
-// See https://stackoverflow.com/questions/43354798/atomic-multiplication-and-division
+// CUDA provides no native atomic multiplication, minimum or maximum for floating-point types,
+// so we add them here with custom implementations based on a compare-and-swap (CAS) loop.
+
+// - atomicMul (double/float)
+//   see https://stackoverflow.com/questions/43354798/atomic-multiplication-and-division
 __device__ double atomicMul(double* address, double val) {
     unsigned long long int* address_as_ull = (unsigned long long int*)address;
     unsigned long long int oldValue = *address_as_ull, assumed;
@@ -39,4 +42,56 @@ __device__ float atomicMul(float* address, float val) {
 
     return __int_as_float(old);
 }
+
+// - atomicMin (double/float)
+//   see https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda
+__device__ __forceinline__ double atomicMin(double *address, double val)
+{
+    unsigned long long ret = __double_as_longlong(*address);
+    while(val < __longlong_as_double(ret))
+    {
+        unsigned long long old = ret;
+        if((ret = atomicCAS((unsigned long long *)address, old, __double_as_longlong(val))) == old)
+            break;
+    }
+    return __longlong_as_double(ret);
+}
+
+__device__ __forceinline__ float atomicMin(float *address, float val)
+{
+    int ret = __float_as_int(*address);
+    while(val < __int_as_float(ret))
+    {
+        int old = ret;
+        if((ret = atomicCAS((int *)address, old, __float_as_int(val))) == old)
+            break;
+    }
+    return __int_as_float(ret);
+}
+
+// - atomicMax (double/float)
+//   see https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda
+__device__ __forceinline__ double atomicMax(double *address, double val)
+{
+    unsigned long long ret = __double_as_longlong(*address);
+    while(val > __longlong_as_double(ret))
+    {
+        unsigned long long old = ret;
+        if((ret = atomicCAS((unsigned long long *)address, old, __double_as_longlong(val))) == old)
+            break;
+    }
+    return __longlong_as_double(ret);
+}
+
+__device__ __forceinline__ float atomicMax(float *address, float val)
+{
+    int ret = __float_as_int(*address);
+    while(val > __int_as_float(ret))
+    {
+        int old = ret;
+        if((ret = atomicCAS((int *)address, old, __float_as_int(val))) == old)
+            break;
+    }
+    return __int_as_float(ret);
+}
 #endif
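
To see the CAS mechanism above in isolation, here is a hedged standalone sketch (the helper name
atomicMaxFloat and the tiny test harness are illustrative, not part of this patch): each thread
re-reads the stored value, stops as soon as it is already at least its own candidate, and otherwise
retries the compare-and-swap until no other thread has intervened.

    #include <cstdio>

    // Same CAS pattern as the atomicMax(float*, float) overload added above,
    // under a distinct (hypothetical) name so the sketch compiles on its own.
    __device__ float atomicMaxFloat(float *address, float val) {
        int ret = __float_as_int(*address);
        // Keep trying only while the stored value is still smaller than val.
        while (val > __int_as_float(ret)) {
            int old = ret;
            // atomicCAS returns the value it found at *address; if that still equals
            // `old`, our value was written and we are done, otherwise retry.
            if ((ret = atomicCAS((int *)address, old, __float_as_int(val))) == old)
                break;
        }
        return __int_as_float(ret);
    }

    __global__ void maxKernel(float *out) {
        atomicMaxFloat(out, static_cast<float>(threadIdx.x));
    }

    int main() {
        float h = -1.0f;
        float *d;
        cudaMalloc(&d, sizeof(float));
        cudaMemcpy(d, &h, sizeof(float), cudaMemcpyHostToDevice);
        maxKernel<<<1, 256>>>(d);
        cudaMemcpy(&h, d, sizeof(float), cudaMemcpyDeviceToHost);
        printf("max = %f (expected 255)\n", h);  // 256 threads contributed 0..255
        cudaFree(d);
        return 0;
    }
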
diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py
index be2589912f458306a06c7f96b8d94e6ce176c8f0..07fb94a7eb81d3833ff91c846fbdb5c0e63d0869 100644
--- a/tests/kernelcreation/test_reduction.py
+++ b/tests/kernelcreation/test_reduction.py
@@ -18,7 +18,7 @@ SOLUTION = {
 
 
 @pytest.mark.parametrize('dtype', ["float64"])
-@pytest.mark.parametrize("op", ["+"]) #, "-", "*", "min", "max"
+@pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"])
 def test_reduction(target, dtype, op):
     gpu_avail = target is ps.Target.GPU