From c7f1518efc14326f9b5732e7e8f6ac7f07acc71a Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 12 Mar 2025 16:28:42 +0100 Subject: [PATCH] Move manual atomic op implementations to new header --- src/pystencils/backend/platforms/cuda.py | 5 +- src/pystencils/include/gpu_atomics.h | 90 +++++++++++++++++++ .../include/pystencils_runtime/hip.h | 89 ------------------ 3 files changed, 94 insertions(+), 90 deletions(-) create mode 100644 src/pystencils/include/gpu_atomics.h diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index 7aac0d412..32744661a 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -197,7 +197,10 @@ class CudaPlatform(GenericGpu): @property def required_headers(self) -> set[str]: - return {'"pystencils_runtime/hip.h"'} # TODO: move to HipPlatform once it is introduced + return { + '"pystencils_runtime/hip.h"', # TODO: move to HipPlatform once it is introduced + '"gpu_atomics.h' + } def materialize_iteration_space( self, body: PsBlock, ispace: IterationSpace diff --git a/src/pystencils/include/gpu_atomics.h b/src/pystencils/include/gpu_atomics.h new file mode 100644 index 000000000..6de5c3321 --- /dev/null +++ b/src/pystencils/include/gpu_atomics.h @@ -0,0 +1,90 @@ +#pragma once + +// No direct implementation for all atomic operations available +// -> add support by custom implementations using a CAS mechanism + +#if defined(__CUDA_ARCH__) || defined(__HIPCC_RTC__) + +// - atomicMul (double/float) +// see https://stackoverflow.com/questions/43354798/atomic-multiplication-and-division +__device__ double atomicMul(double* address, double val) { + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int oldValue = *address_as_ull, assumed; + do { + assumed = oldValue; + oldValue = atomicCAS(address_as_ull, assumed, __double_as_longlong(val * + __longlong_as_double(assumed))); + } while (assumed != oldValue); + + return __longlong_as_double(oldValue); +} + +__device__ float atomicMul(float* address, float val) { + int* address_as_int = (int*)address; + int old = *address_as_int; + int assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, __float_as_int(val * __int_as_float(assumed))); + } while (assumed != old); + + return __int_as_float(old); +} + +#endif + +#ifdef __CUDA_ARCH__ + +// - atomicMin (double/float) +// see https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda +__device__ __forceinline__ double atomicMin(double *address, double val) +{ + unsigned long long ret = __double_as_longlong(*address); + while(val < __longlong_as_double(ret)) + { + unsigned long long old = ret; + if((ret = atomicCAS((unsigned long long *)address, old, __double_as_longlong(val))) == old) + break; + } + return __longlong_as_double(ret); +} + +__device__ __forceinline__ float atomicMin(float *address, float val) +{ + int ret = __float_as_int(*address); + while(val < __int_as_float(ret)) + { + int old = ret; + if((ret = atomicCAS((int *)address, old, __float_as_int(val))) == old) + break; + } + return __int_as_float(ret); +} + +// - atomicMax (double/float) +// see https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda +__device__ __forceinline__ double atomicMax(double *address, double val) +{ + unsigned long long ret = __double_as_longlong(*address); + while(val > __longlong_as_double(ret)) + { + unsigned long long old = ret; + if((ret = atomicCAS((unsigned long long *)address, old, __double_as_longlong(val))) == old) + break; + } + return __longlong_as_double(ret); +} + +__device__ __forceinline__ float atomicMax(float *address, float val) +{ + int ret = __float_as_int(*address); + while(val > __int_as_float(ret)) + { + int old = ret; + if((ret = atomicCAS((int *)address, old, __float_as_int(val))) == old) + break; + } + return __int_as_float(ret); +} + +#endif \ No newline at end of file diff --git a/src/pystencils/include/pystencils_runtime/hip.h b/src/pystencils/include/pystencils_runtime/hip.h index b0b4d9679..4bf4917f8 100644 --- a/src/pystencils/include/pystencils_runtime/hip.h +++ b/src/pystencils/include/pystencils_runtime/hip.h @@ -6,92 +6,3 @@ typedef __hip_int8_t int8_t; typedef __hip_uint16_t uint16_t; typedef __hip_int16_t int16_t; #endif - -// No direct implementation for all atomic operations available -// -> add support by custom implementations using a CAS mechanism - -#if defined(__CUDA_ARCH__) || defined(__HIPCC_RTC__) - -// - atomicMul (double/float) -// see https://stackoverflow.com/questions/43354798/atomic-multiplication-and-division -__device__ double atomicMul(double* address, double val) { - unsigned long long int* address_as_ull = (unsigned long long int*)address; - unsigned long long int oldValue = *address_as_ull, assumed; - do { - assumed = oldValue; - oldValue = atomicCAS(address_as_ull, assumed, __double_as_longlong(val * - __longlong_as_double(assumed))); - } while (assumed != oldValue); - - return __longlong_as_double(oldValue); -} - -__device__ float atomicMul(float* address, float val) { - int* address_as_int = (int*)address; - int old = *address_as_int; - int assumed; - do { - assumed = old; - old = atomicCAS(address_as_int, assumed, __float_as_int(val * __int_as_float(assumed))); - } while (assumed != old); - - return __int_as_float(old); -} - -#endif - -#ifdef __CUDA_ARCH__ - -// - atomicMin (double/float) -// see https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda -__device__ __forceinline__ double atomicMin(double *address, double val) -{ - unsigned long long ret = __double_as_longlong(*address); - while(val < __longlong_as_double(ret)) - { - unsigned long long old = ret; - if((ret = atomicCAS((unsigned long long *)address, old, __double_as_longlong(val))) == old) - break; - } - return __longlong_as_double(ret); -} - -__device__ __forceinline__ float atomicMin(float *address, float val) -{ - int ret = __float_as_int(*address); - while(val < __int_as_float(ret)) - { - int old = ret; - if((ret = atomicCAS((int *)address, old, __float_as_int(val))) == old) - break; - } - return __int_as_float(ret); -} - -// - atomicMax (double/float) -// see https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda -__device__ __forceinline__ double atomicMax(double *address, double val) -{ - unsigned long long ret = __double_as_longlong(*address); - while(val > __longlong_as_double(ret)) - { - unsigned long long old = ret; - if((ret = atomicCAS((unsigned long long *)address, old, __double_as_longlong(val))) == old) - break; - } - return __longlong_as_double(ret); -} - -__device__ __forceinline__ float atomicMax(float *address, float val) -{ - int ret = __float_as_int(*address); - while(val > __int_as_float(ret)) - { - int old = ret; - if((ret = atomicCAS((int *)address, old, __float_as_int(val))) == old) - break; - } - return __int_as_float(ret); -} - -#endif -- GitLab