diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index 1af8917ccb7a4d07b700e43ae22958bf45817aa3..f9fbdfa56785aaf6cb897d8c2064c53d69b997fb 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -3,6 +3,7 @@ from warnings import warn
 from typing import TYPE_CHECKING
 
 from ..ast import PsAstNode
+from ...sympyextensions.reduction import ReductionOp
 from ...types import constify, PsPointerType, PsScalarType, PsCustomType
 from ..exceptions import MaterializationError
 from .generic_gpu import GenericGpu
@@ -165,9 +166,16 @@ class CudaPlatform(GenericGpu):
         assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptr_expr.dtype, PsPointerType)
         assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(symbol_expr.dtype, PsScalarType)
 
-        call.function = CFunction(f"atomic{op.name}", [ptr_expr.dtype, symbol_expr.dtype],
+        match op:
+            case ReductionOp.Sub:
+                # workaround for unsupported atomicSub: use atomic add and invert sign
+                call.function = CFunction(f"atomicAdd", [ptr_expr.dtype, symbol_expr.dtype],
                                   PsCustomType("void"))
-        call.args = (ptr_expr, symbol_expr)
+                call.args = (ptr_expr, -symbol_expr)
+            case _:
+                call.function = CFunction(f"atomic{op.name}", [ptr_expr.dtype, symbol_expr.dtype],
+                                          PsCustomType("void"))
+                call.args = (ptr_expr, symbol_expr)
 
         if not isinstance(symbol_expr.dtype, PsIeeeFloatType) or symbol_expr.dtype.width not in (32, 64):
             NotImplementedError("atomicMul is only available for float32/64 datatypes")
diff --git a/src/pystencils/include/gpu_defines.h b/src/pystencils/include/gpu_defines.h
index 8f961e25b6ea1a4b616af4dbba437c5dbb66d524..5525bbc69188fa143ba9146d286a93d392f34bf2 100644
--- a/src/pystencils/include/gpu_defines.h
+++ b/src/pystencils/include/gpu_defines.h
@@ -14,8 +14,11 @@ typedef __hip_int16_t int16_t;
 #endif
 
 #ifdef __CUDA_ARCH__
-// Implementation of atomic multiplication
-// See https://stackoverflow.com/questions/43354798/atomic-multiplication-and-division
+// No direct implementation of atomic multiplication, minimum and maximum available
+// -> add support by custom implementations using a CAS mechanism
+
+// - atomicMul (double/float)
+// see https://stackoverflow.com/questions/43354798/atomic-multiplication-and-division
 __device__ double atomicMul(double* address, double val) {
   unsigned long long int* address_as_ull = (unsigned long long int*)address;
   unsigned long long int oldValue = *address_as_ull, assumed;
@@ -39,4 +42,56 @@ __device__ float atomicMul(float* address, float val) {
 
   return __int_as_float(old);
 }
+
+// - atomicMin (double/float)
+// see https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda
+__device__ __forceinline__ double atomicMin(double *address, double val)
+{
+    unsigned long long ret = __double_as_longlong(*address);
+    while(val < __longlong_as_double(ret))
+    {
+        unsigned long long old = ret;
+        if((ret = atomicCAS((unsigned long long *)address, old, __double_as_longlong(val))) == old)
+            break;
+    }
+    return __longlong_as_double(ret);
+}
+
+__device__ __forceinline__ float atomicMin(float *address, float val)
+{
+    int ret = __float_as_int(*address);
+    while(val < __int_as_float(ret))
+    {
+        int old = ret;
+        if((ret = atomicCAS((int *)address, old, __float_as_int(val))) == old)
+            break;
+    }
+    return __int_as_float(ret);
+}
+
+// - atomicMax (double/float)
+// see https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda
+__device__ __forceinline__ double atomicMax(double *address, double val)
+{
+    unsigned long long ret = __double_as_longlong(*address);
+    while(val > __longlong_as_double(ret))
+    {
+        unsigned long long old = ret;
+        if((ret = atomicCAS((unsigned long long *)address, old, __double_as_longlong(val))) == old)
+            break;
+    }
+    return __longlong_as_double(ret);
+}
+
+__device__ __forceinline__ float atomicMax(float *address, float val)
+{
+    int ret = __float_as_int(*address);
+    while(val > __int_as_float(ret))
+    {
+        int old = ret;
+        if((ret = atomicCAS((int *)address, old, __float_as_int(val))) == old)
+            break;
+    }
+    return __int_as_float(ret);
+}
 #endif
diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py
index be2589912f458306a06c7f96b8d94e6ce176c8f0..07fb94a7eb81d3833ff91c846fbdb5c0e63d0869 100644
--- a/tests/kernelcreation/test_reduction.py
+++ b/tests/kernelcreation/test_reduction.py
@@ -18,7 +18,7 @@ SOLUTION = {
 
 
 @pytest.mark.parametrize('dtype', ["float64"])
-@pytest.mark.parametrize("op", ["+"]) #, "-", "*", "min", "max"
+@pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"])
 def test_reduction(target, dtype, op):
     gpu_avail = target is ps.Target.GPU
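Background for the cuda.py change above: CUDA offers no floating-point atomicSub overload, so the generated reduction writes back through atomicAdd with a negated operand. The standalone sketch below only illustrates that equivalence and is not part of this patch; the names (subReduceKernel, in, result) are made up, error checking is omitted, and double-precision atomicAdd requires compute capability 6.0 or newer (e.g. nvcc -arch=sm_60).

// Illustration only: atomic "subtraction" expressed as an atomic add of the negated value.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void subReduceKernel(const double *in, double *result, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        atomicAdd(result, -in[i]);  // stands in for the unavailable atomicSub(result, in[i])
}

int main()
{
    const int n = 1 << 10;
    double *in = nullptr, *result = nullptr;
    cudaMallocManaged(&in, n * sizeof(double));
    cudaMallocManaged(&result, sizeof(double));

    double expected = 100.0;   // initial value of the reduction target
    *result = expected;
    for (int i = 0; i < n; ++i) {
        in[i] = 0.25 * i;
        expected -= in[i];     // sequential reference result
    }

    subReduceKernel<<<(n + 255) / 256, 256>>>(in, result, n);
    cudaDeviceSynchronize();

    printf("atomic: %f, reference: %f\n", *result, expected);
    cudaFree(in);
    cudaFree(result);
    return 0;
}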
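The atomicMin/atomicMax overloads added to gpu_defines.h use the standard compare-and-swap emulation for floating-point atomics: reinterpret the stored value's bits as an integer and retry atomicCAS only while the candidate would still improve the result. The sketch below reproduces the double-precision atomicMax loop in a self-contained test; it is an illustration under assumed names (atomicMaxDouble, maxReduceKernel), not pystencils code, and omits error checking.

#include <cstdio>
#include <cfloat>
#include <cuda_runtime.h>

// CAS loop: keep trying to install `val` until it no longer exceeds the stored value.
__device__ double atomicMaxDouble(double *address, double val)
{
    unsigned long long ret = __double_as_longlong(*address);
    while (val > __longlong_as_double(ret)) {
        unsigned long long old = ret;
        // atomicCAS returns the previous value; if it equals `old`, our write succeeded.
        if ((ret = atomicCAS((unsigned long long *)address, old,
                             __double_as_longlong(val))) == old)
            break;
    }
    return __longlong_as_double(ret);
}

__global__ void maxReduceKernel(const double *in, double *result, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        atomicMaxDouble(result, in[i]);
}

int main()
{
    const int n = 1 << 12;
    double *in = nullptr, *result = nullptr;
    cudaMallocManaged(&in, n * sizeof(double));
    cudaMallocManaged(&result, sizeof(double));

    *result = -DBL_MAX;        // neutral element of the max reduction
    double expected = -DBL_MAX;
    for (int i = 0; i < n; ++i) {
        in[i] = ((i * 2654435761u) % 1000) * 0.001;  // arbitrary values in [0, 1)
        if (in[i] > expected)
            expected = in[i];
    }

    maxReduceKernel<<<(n + 255) / 256, 256>>>(in, result, n);
    cudaDeviceSynchronize();

    printf("atomic: %f, reference: %f\n", *result, expected);
    cudaFree(in);
    cudaFree(result);
    return 0;
}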