Commit e15d3cf7 authored by Richard Angersbach

Add first CUDA reduction impl using atomic operations

parent 6e08683b
Merge request !438: Reduction Support
Pipeline #73149 failed
@@ -3,7 +3,7 @@ from warnings import warn
 from typing import TYPE_CHECKING
 from ..ast import PsAstNode
-from ...types import constify
+from ...types import constify, PsPointerType, PsScalarType, PsCustomType
 from ..exceptions import MaterializationError
 from .generic_gpu import GenericGpu
@@ -23,12 +23,12 @@ from ..ast.expressions import (
     PsCast,
     PsCall,
     PsLookup,
-    PsBufferAcc,
+    PsBufferAcc, PsSymbolExpr
 )
 from ..ast.expressions import PsLt, PsAnd
 from ...types import PsSignedIntegerType, PsIeeeFloatType
 from ..literals import PsLiteral
-from ..functions import PsMathFunction, MathFunctions, CFunction
+from ..functions import PsMathFunction, MathFunctions, CFunction, PsReductionFunction, ReductionFunctions
 if TYPE_CHECKING:
     from ...codegen import GpuIndexingConfig, GpuThreadsRange
@@ -138,7 +138,30 @@ class CudaPlatform(GenericGpu):
     def unfold_function(
         self, call: PsCall
     ) -> PsAstNode:
-        pass
+        assert isinstance(call.function, PsReductionFunction)
+
+        func = call.function.func
+        match func:
+            case ReductionFunctions.InitLocalCopy:
+                symbol_expr, init_val = call.args
+                assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(init_val, PsExpression)
+
+                # Declare and initialize the thread-local copy of the reduction target
+                return PsDeclaration(symbol_expr, init_val)
+            case ReductionFunctions.WriteBackToPtr:
+                ptr_expr, symbol_expr = call.args
+                op = call.function.op
+
+                assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptr_expr.dtype, PsPointerType)
+                assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(symbol_expr.dtype, PsScalarType)
+
+                # The atomicMul overloads added in the runtime header only cover float32/64,
+                # so reject unsupported data types before emitting the call
+                if not isinstance(symbol_expr.dtype, PsIeeeFloatType) or symbol_expr.dtype.width not in (32, 64):
+                    raise NotImplementedError("atomicMul is only available for float32/64 datatypes")
+
+                # Lower the write-back to the corresponding CUDA atomic, e.g. atomicAdd or atomicMul
+                call.function = CFunction(f"atomic{op.name}", [ptr_expr.dtype, symbol_expr.dtype], PsCustomType("void"))
+                call.args = [ptr_expr, symbol_expr]
+
+                return call

     # Internals
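For a product reduction, the WriteBackToPtr case above therefore emits a plain atomic call in the generated kernel. A hypothetical generated statement (identifier names invented for illustration) would read:

    atomicMul(result_ptr, local_copy);

atomicMul is not a CUDA built-in, which is why the runtime header below supplies CAS-based overloads for it.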
@@ -10,3 +10,31 @@ typedef __hip_int8_t int8_t;
 typedef __hip_uint16_t uint16_t;
 typedef __hip_int16_t int16_t;
 #endif
+#ifdef __CUDA_ARCH__
+
+// Implementation of atomic multiplication via a compare-and-swap (CAS) loop,
+// since CUDA does not provide a built-in atomicMul.
+// See https://stackoverflow.com/questions/43354798/atomic-multiplication-and-division
+__device__ double atomicMul(double* address, double val) {
+    // Reinterpret the double as a 64-bit integer so that atomicCAS can operate on it
+    unsigned long long int* address_as_ull = (unsigned long long int*)address;
+    unsigned long long int oldValue = *address_as_ull, assumed;
+    do {
+        assumed = oldValue;
+        // Retry until no other thread has modified the value between read and swap
+        oldValue = atomicCAS(address_as_ull, assumed,
+                             __double_as_longlong(val * __longlong_as_double(assumed)));
+    } while (assumed != oldValue);
+
+    return __longlong_as_double(oldValue);
+}
+
+__device__ float atomicMul(float* address, float val) {
+    // Same CAS scheme as above, using the 32-bit integer representation of the float
+    int* address_as_int = (int*)address;
+    int old = *address_as_int;
+    int assumed;
+    do {
+        assumed = old;
+        old = atomicCAS(address_as_int, assumed, __float_as_int(val * __int_as_float(assumed)));
+    } while (assumed != old);
+
+    return __int_as_float(old);
+}
+
+#endif
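As a usage sketch (not part of this commit; all identifiers are invented for illustration), a kernel computing the product of an array could funnel each element through the overloads above. Note that *result must be initialized to 1.0f, the neutral element of multiplication, before launch:

__global__ void reduce_product(const float* data, float* result, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        // The CAS loop inside atomicMul serializes conflicting updates
        atomicMul(result, data[i]);
    }
}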