From 60a348f1f3f39d184da872007756c3aeed13ccee Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Mon, 10 Feb 2025 18:30:29 +0100
Subject: [PATCH] Support atomic sub, min and max for floating-point reductions
 via negated atomicAdd and custom CAS-based implementations

---
 src/pystencils/backend/platforms/cuda.py | 12 ++++-
 src/pystencils/include/gpu_defines.h     | 59 +++++++++++++++++++++++-
 tests/kernelcreation/test_reduction.py   |  2 +-
 3 files changed, 68 insertions(+), 5 deletions(-)
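
Note on the Sub case: CUDA's atomicSub only covers integer types, so the code
generator below lowers a Sub reduction to an atomicAdd of the negated operand.
A rough sketch of the pattern the generated kernel ends up with (the helper
name is illustrative only and is not part of this patch):

    // Illustrative only: atomic float subtraction expressed through atomicAdd.
    // Negating the operand gives the same result as a true subtraction for
    // IEEE floats, so this is an exact replacement, not an approximation.
    __device__ void atomic_sub_f32(float *acc, float val) {
        atomicAdd(acc, -val);  // atomically performs *acc -= val
    }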

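For the Min/Max (and the existing Mul) cases the generated kernel simply calls
atomicMin/atomicMax/atomicMul on the accumulator pointer; the overloads added to
gpu_defines.h supply the missing floating-point variants via an atomicCAS loop.
A minimal hand-written usage sketch, not the code pystencils emits verbatim;
it assumes gpu_defines.h is on the include path and that *result was initialized
to the reduction's identity beforehand:

    #include "gpu_defines.h"  // CAS-based float/double atomicMin/Max (and atomicMul)

    // Every thread folds one element into a global accumulator through the
    // double overload of atomicMax defined in gpu_defines.h. Those overloads
    // are guarded by __CUDA_ARCH__, hence the same guard around the call so
    // the host pass of nvcc never tries to resolve it.
    __global__ void reduce_max(const double *in, double *result, int n) {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
    #ifdef __CUDA_ARCH__
        if (i < n)
            atomicMax(result, in[i]);
    #endif
    }
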
diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index 1af8917cc..f9fbdfa56 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -3,6 +3,7 @@ from warnings import warn
 from typing import TYPE_CHECKING
 
 from ..ast import PsAstNode
+from ...sympyextensions.reduction import ReductionOp
 from ...types import constify, PsPointerType, PsScalarType, PsCustomType
 from ..exceptions import MaterializationError
 from .generic_gpu import GenericGpu
@@ -165,9 +166,16 @@ class CudaPlatform(GenericGpu):
                 assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptr_expr.dtype, PsPointerType)
                 assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(symbol_expr.dtype, PsScalarType)
 
-                call.function = CFunction(f"atomic{op.name}", [ptr_expr.dtype, symbol_expr.dtype],
+                match op:
+                    case ReductionOp.Sub:
+                        # CUDA provides no atomicSub for floating-point types -> lower to atomicAdd with a negated operand
+                        call.function = CFunction("atomicAdd", [ptr_expr.dtype, symbol_expr.dtype],
                                           PsCustomType("void"))
-                call.args = (ptr_expr, symbol_expr)
+                        call.args = (ptr_expr, -symbol_expr)
+                    case _:
+                        call.function = CFunction(f"atomic{op.name}", [ptr_expr.dtype, symbol_expr.dtype],
+                                                  PsCustomType("void"))
+                        call.args = (ptr_expr, symbol_expr)
 
                 if not isinstance(symbol_expr.dtype, PsIeeeFloatType) or symbol_expr.dtype.width not in (32, 64):
                     NotImplementedError("atomicMul is only available for float32/64 datatypes")
diff --git a/src/pystencils/include/gpu_defines.h b/src/pystencils/include/gpu_defines.h
index 8f961e25b..5525bbc69 100644
--- a/src/pystencils/include/gpu_defines.h
+++ b/src/pystencils/include/gpu_defines.h
@@ -14,8 +14,11 @@ typedef __hip_int16_t int16_t;
 #endif
 
 #ifdef __CUDA_ARCH__
-// Implementation of atomic multiplication
-// See https://stackoverflow.com/questions/43354798/atomic-multiplication-and-division
+// CUDA offers no atomicMul at all and no atomicMin/atomicMax overloads for floating-point types
+// -> provide them here as custom implementations built on an atomicCAS loop
+
+// - atomicMul (double/float)
+//   see https://stackoverflow.com/questions/43354798/atomic-multiplication-and-division
 __device__ double atomicMul(double* address, double val) {
     unsigned long long int* address_as_ull = (unsigned long long int*)address;
     unsigned long long int oldValue = *address_as_ull, assumed;
@@ -39,4 +42,56 @@ __device__ float atomicMul(float* address, float val) {
 
     return __int_as_float(old);
 }
+
+// - atomicMin (double/float)
+//   see https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda
+__device__ __forceinline__ double atomicMin(double *address, double val)
+{
+    unsigned long long ret = __double_as_longlong(*address);
+    while(val < __longlong_as_double(ret))
+    {
+        unsigned long long old = ret;
+        if((ret = atomicCAS((unsigned long long *)address, old, __double_as_longlong(val))) == old)
+            break;
+    }
+    return __longlong_as_double(ret);
+}
+
+__device__ __forceinline__ float atomicMin(float *address, float val)
+{
+    int ret = __float_as_int(*address);
+    while(val < __int_as_float(ret))
+    {
+        int old = ret;
+        if((ret = atomicCAS((int *)address, old, __float_as_int(val))) == old)
+            break;
+    }
+    return __int_as_float(ret);
+}
+
+// - atomicMax (double/float)
+//   see https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda
+__device__ __forceinline__ double atomicMax(double *address, double val)
+{
+    unsigned long long ret = __double_as_longlong(*address);
+    while(val > __longlong_as_double(ret))
+    {
+        unsigned long long old = ret;
+        if((ret = atomicCAS((unsigned long long *)address, old, __double_as_longlong(val))) == old)
+            break;
+    }
+    return __longlong_as_double(ret);
+}
+
+__device__ __forceinline__ float atomicMax(float *address, float val)
+{
+    int ret = __float_as_int(*address);
+    while(val > __int_as_float(ret))
+    {
+        int old = ret;
+        if((ret = atomicCAS((int *)address, old, __float_as_int(val))) == old)
+            break;
+    }
+    return __int_as_float(ret);
+}
 #endif
diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py
index be2589912..07fb94a7e 100644
--- a/tests/kernelcreation/test_reduction.py
+++ b/tests/kernelcreation/test_reduction.py
@@ -18,7 +18,7 @@ SOLUTION = {
 
 
 @pytest.mark.parametrize('dtype', ["float64"])
-@pytest.mark.parametrize("op", ["+"]) #, "-", "*", "min", "max"
+@pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"])
 def test_reduction(target, dtype, op):
     gpu_avail = target is ps.Target.GPU
 
-- 
GitLab