diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index bb42e1f9bb35a33acc13b95d026d76f8d941cc4c..95480de93e72698dbe2b28c997bd096be6b383e5 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -3,7 +3,7 @@ from warnings import warn
 from typing import TYPE_CHECKING
 
 from ..ast import PsAstNode
-from ...types import constify
+from ...types import constify, PsPointerType, PsScalarType, PsCustomType
 from ..exceptions import MaterializationError
 from .generic_gpu import GenericGpu
 
@@ -23,12 +23,12 @@ from ..ast.expressions import (
     PsCast,
     PsCall,
     PsLookup,
-    PsBufferAcc,
+    PsBufferAcc, PsSymbolExpr
 )
 from ..ast.expressions import PsLt, PsAnd
 from ...types import PsSignedIntegerType, PsIeeeFloatType
 from ..literals import PsLiteral
-from ..functions import PsMathFunction, MathFunctions, CFunction
+from ..functions import PsMathFunction, MathFunctions, CFunction, PsReductionFunction, ReductionFunctions
 
 if TYPE_CHECKING:
     from ...codegen import GpuIndexingConfig, GpuThreadsRange
@@ -138,7 +138,30 @@ class CudaPlatform(GenericGpu):
     def unfold_function(
         self, call: PsCall
     ) -> PsAstNode:
-        pass
+        assert isinstance(call.function, PsReductionFunction)
+
+        func = call.function.func
+
+        match func:
+            case ReductionFunctions.InitLocalCopy:
+                symbol_expr, init_val = call.args
+                assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(init_val, PsExpression)
+
+                return PsDeclaration(symbol_expr, init_val)
+            case ReductionFunctions.WriteBackToPtr:
+                ptr_expr, symbol_expr = call.args
+                op = call.function.op
+
+                assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptr_expr.dtype, PsPointerType)
+                assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(symbol_expr.dtype, PsScalarType)
+
+                if not isinstance(symbol_expr.dtype, PsIeeeFloatType) or symbol_expr.dtype.width not in (32, 64):
+                    raise NotImplementedError("Atomic write-back is only implemented for float32/float64 data types")
+
+                call.function = CFunction(f"atomic{op.name}", [ptr_expr.dtype, symbol_expr.dtype], PsCustomType("void"))
+                call.args = [ptr_expr, symbol_expr]
+
+                return call
 
     # Internals
 
diff --git a/src/pystencils/include/gpu_defines.h b/src/pystencils/include/gpu_defines.h
index 67e7722e9a01b217584dce14f0dcec16d2025c80..04eeace47c0f1b2659883d4b44e464d587adb2a7 100644
--- a/src/pystencils/include/gpu_defines.h
+++ b/src/pystencils/include/gpu_defines.h
@@ -10,3 +10,31 @@ typedef __hip_int8_t int8_t;
 typedef __hip_uint16_t uint16_t;
 typedef __hip_int16_t int16_t;
 #endif
+
+#ifdef __CUDA_ARCH__
+// Implementation of atomic multiplication via a compare-and-swap loop
+// See https://stackoverflow.com/questions/43354798/atomic-multiplication-and-division
+__device__ double atomicMul(double* address, double val) {
+    unsigned long long int* address_as_ull = (unsigned long long int*)address;
+    unsigned long long int oldValue = *address_as_ull, assumed;
+    do {
+        assumed = oldValue;
+        oldValue = atomicCAS(address_as_ull, assumed,
+                             __double_as_longlong(val * __longlong_as_double(assumed)));
+    } while (assumed != oldValue);
+
+    return __longlong_as_double(oldValue);
+}
+
+__device__ float atomicMul(float* address, float val) {
+    int* address_as_int = (int*)address;
+    int old = *address_as_int;
+    int assumed;
+    do {
+        assumed = old;
+        old = atomicCAS(address_as_int, assumed, __float_as_int(val * __int_as_float(assumed)));
+    } while (assumed != old);
+
+    return __int_as_float(old);
+}
+#endif
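
Note (not part of the patch): the atomicMul overloads above emulate an atomic floating-point multiplication with a compare-and-swap loop, since CUDA provides hardware atomics only for operations such as add, min and max. Below is a minimal standalone sketch of how such an emulated atomic could be exercised from a reduction kernel; the kernel and variable names (reduce_product, d_in, d_res) are illustrative only and not part of pystencils.

// Standalone sketch: fold an array into one product via a CAS-based atomicMul.
// The atomicMul body mirrors the double overload added to gpu_defines.h above.
#include <cstdio>
#include <cuda_runtime.h>

__device__ double atomicMul(double* address, double val) {
    unsigned long long int* address_as_ull = (unsigned long long int*)address;
    unsigned long long int oldValue = *address_as_ull, assumed;
    do {
        assumed = oldValue;
        // Retry until no other thread has updated the accumulator in between.
        oldValue = atomicCAS(address_as_ull, assumed,
                             __double_as_longlong(val * __longlong_as_double(assumed)));
    } while (assumed != oldValue);
    return __longlong_as_double(oldValue);
}

// Every thread folds one input element into a single global product.
__global__ void reduce_product(const double* in, double* result, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        atomicMul(result, in[i]);
}

int main() {
    const int n = 8;
    double h_in[n] = {1, 2, 3, 4, 5, 6, 7, 8};   // expected product: 40320
    double h_res = 1.0;                          // neutral element of multiplication
    double *d_in, *d_res;
    cudaMalloc(&d_in, n * sizeof(double));
    cudaMalloc(&d_res, sizeof(double));
    cudaMemcpy(d_in, h_in, n * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(d_res, &h_res, sizeof(double), cudaMemcpyHostToDevice);
    reduce_product<<<1, 64>>>(d_in, d_res, n);
    cudaMemcpy(&h_res, d_res, sizeof(double), cudaMemcpyDeviceToHost);
    printf("product = %f\n", h_res);             // prints 40320.000000
    cudaFree(d_in);
    cudaFree(d_res);
    return 0;
}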