Commit 60a348f1 authored by Richard Angersbach

Support atomic sub, min, max for fp reductions using custom implementations with CAS mechanism

parent a71e0d31
Merge request !438: Reduction Support
Pipeline #73845 failed
@@ -3,6 +3,7 @@ from warnings import warn
from typing import TYPE_CHECKING
from ..ast import PsAstNode
from ...sympyextensions.reduction import ReductionOp
from ...types import constify, PsPointerType, PsScalarType, PsCustomType, PsIeeeFloatType
from ..exceptions import MaterializationError
from .generic_gpu import GenericGpu
@@ -165,9 +166,16 @@ class CudaPlatform(GenericGpu):
assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptr_expr.dtype, PsPointerType)
assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(symbol_expr.dtype, PsScalarType)
call.function = CFunction(f"atomic{op.name}", [ptr_expr.dtype, symbol_expr.dtype],
match op:
case ReductionOp.Sub:
# workaround for unsupported atomicSub: use atomic add and invert sign
call.function = CFunction(f"atomicAdd", [ptr_expr.dtype, symbol_expr.dtype],
PsCustomType("void"))
call.args = (ptr_expr, symbol_expr)
call.args = (ptr_expr, -symbol_expr)
case _:
call.function = CFunction(f"atomic{op.name}", [ptr_expr.dtype, symbol_expr.dtype],
PsCustomType("void"))
call.args = (ptr_expr, symbol_expr)
if not isinstance(symbol_expr.dtype, PsIeeeFloatType) or symbol_expr.dtype.width not in (32, 64):
    raise NotImplementedError("atomic operations are only available for float32/64 datatypes")
......
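For a "-" reduction the generated device code therefore issues an ordinary atomicAdd with the operand's sign flipped. A minimal sketch of what the emitted call boils down to (kernel and variable names are hypothetical; note that atomicAdd on double requires compute capability 6.0 or newer):

// Hypothetical sketch of the device code emitted for a "-" reduction:
// CUDA provides no floating-point atomicSub, so the local contribution
// is negated and folded into the result with atomicAdd instead.
__global__ void reduce_sub_sketch(double *result, const double *data, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        atomicAdd(result, -data[i]);  // acts as an atomic result -= data[i]
}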
@@ -14,8 +14,11 @@ typedef __hip_int16_t int16_t;
#endif
#ifdef __CUDA_ARCH__
-// Implementation of atomic multiplication
-// See https://stackoverflow.com/questions/43354798/atomic-multiplication-and-division
+// No direct implementations of atomic multiplication, minimum and maximum are available
+// -> add support via custom implementations using a CAS mechanism
+
+// - atomicMul (double/float)
+//   see https://stackoverflow.com/questions/43354798/atomic-multiplication-and-division
__device__ double atomicMul(double* address, double val) {
unsigned long long int* address_as_ull = (unsigned long long int*)address;
unsigned long long int oldValue = *address_as_ull, assumed;
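The hunk is collapsed at this point. For reference, the CAS loop from the linked StackOverflow answer completes the function roughly as follows (a sketch continuing the declarations above, not the verbatim file contents):

do {
    assumed = oldValue;
    // try to install assumed * val; atomicCAS returns the value it actually found
    oldValue = atomicCAS(address_as_ull, assumed,
                         __double_as_longlong(val * __longlong_as_double(assumed)));
} while (assumed != oldValue);  // retry if another thread changed *address meanwhile
return __longlong_as_double(oldValue);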
@@ -39,4 +42,56 @@ __device__ float atomicMul(float* address, float val) {
return __int_as_float(old);
}
// - atomicMin (double/float)
// see https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda
__device__ __forceinline__ double atomicMin(double *address, double val)
{
unsigned long long ret = __double_as_longlong(*address);
while(val < __longlong_as_double(ret))
{
unsigned long long old = ret;
if((ret = atomicCAS((unsigned long long *)address, old, __double_as_longlong(val))) == old)
break;
}
return __longlong_as_double(ret);
}
__device__ __forceinline__ float atomicMin(float *address, float val)
{
int ret = __float_as_int(*address);
while(val < __int_as_float(ret))
{
int old = ret;
if((ret = atomicCAS((int *)address, old, __float_as_int(val))) == old)
break;
}
return __int_as_float(ret);
}
// - atomicMax (double/float)
// see https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda
__device__ __forceinline__ double atomicMax(double *address, double val)
{
unsigned long long ret = __double_as_longlong(*address);
while(val > __longlong_as_double(ret))
{
unsigned long long old = ret;
if((ret = atomicCAS((unsigned long long *)address, old, __double_as_longlong(val))) == old)
break;
}
return __longlong_as_double(ret);
}
__device__ __forceinline__ float atomicMax(float *address, float val)
{
int ret = __float_as_int(*address);
while(val > __int_as_float(ret))
{
int old = ret;
if((ret = atomicCAS((int *)address, old, __float_as_int(val))) == old)
break;
}
return __int_as_float(ret);
}
#endif
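All four overloads follow the same pattern: reinterpret the float's bit pattern as an integer type accepted by atomicCAS, and attempt the swap only while the new value would actually improve on the stored one, so a thread whose value is already dominated returns without a single CAS. Since the guard compares with < or >, a NaN operand never installs itself. A minimal usage sketch (hypothetical kernel, not part of the commit):

// Hypothetical kernel folding every element into a global minimum and
// maximum; the calls resolve to the float overloads defined above.
__global__ void minmax_sketch(const float *in, float *mn, float *mx, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        atomicMin(mn, in[i]);
        atomicMax(mx, in[i]);
    }
}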
@@ -18,7 +18,7 @@ SOLUTION = {
@pytest.mark.parametrize('dtype', ["float64"])
@pytest.mark.parametrize("op", ["+"]) #, "-", "*", "min", "max"
@pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"])
def test_reduction(target, dtype, op):
gpu_avail = target is ps.Target.GPU
......