Fix missing declarations of custom atomics when using pystencils-sfg
When the following snippet is compiled with `nvcc --generate-code=arch=compute_61,code=[compute_61,sm_61] test.cu`, nvcc reports that the declaration of the custom atomic max implementation is missing during host-side compilation.
The same error occurs when the generated code is embedded via pystencils-sfg.
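The cause is the `__CUDA_ARCH__` guard around the custom atomic: that macro is only defined during the device compilation pass, so the host pass never sees the declaration.
This can be fixed simply by removing the `__CUDA_ARCH__` guard, which this MR also does in `gpu_atomics.h`.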
#ifdef __CUDA_ARCH__
/**
 * Atomic maximum for doubles, built on a 64-bit compare-and-swap.
 *
 * Raises the value stored at `address` to `val` if `val` is greater than what
 * is currently stored; otherwise leaves memory untouched.
 *
 * @param address  location holding the running maximum
 * @param val      candidate value
 * @return the value observed at `address` by the last read or CAS: the value
 *         that was replaced when the swap succeeded, or the value found to be
 *         not smaller than `val` when no swap was needed
 */
__device__ __forceinline__ double atomicMax(double *address, double val)
{
    // atomicCAS operates on the raw 64-bit pattern, so view the slot as an integer.
    unsigned long long *const slot = (unsigned long long *)address;
    unsigned long long observed = __double_as_longlong(*address);

    // Retry only while the candidate still beats the most recently observed value.
    while (val > __longlong_as_double(observed))
    {
        const unsigned long long expected = observed;
        observed = atomicCAS(slot, expected, __double_as_longlong(val));
        if (observed == expected)
        {
            // Nobody modified the slot in between: our value is now stored.
            break;
        }
        // Otherwise `observed` holds the concurrent writer's value; re-test against it.
    }

    return __longlong_as_double(observed);
}
#endif
/**
 * Max-reduction kernel over a strided 3D field of doubles.
 *
 * Launch layout: one thread per element; the global thread index along
 * x/y/z is used as the element coordinate along dimensions 0/1/2. Threads
 * whose coordinate lies outside the field extents do nothing.
 *
 * @param _data_x      base pointer of the field
 * @param _size_x_0    extent along dimension 0 (mapped to the x thread index)
 * @param _size_x_1    extent along dimension 1 (mapped to the y thread index)
 * @param _size_x_2    extent along dimension 2 (mapped to the z thread index)
 * @param _stride_x_0  element stride along dimension 0
 * @param _stride_x_1  element stride along dimension 1
 * @param _stride_x_2  element stride along dimension 2
 * @param r            reduction target, combined via atomicMax(double*, double).
 *                     NOTE(review): the kernel never initializes *r, so the
 *                     caller presumably seeds it before launch -- confirm.
 */
__global__ void PyCodegen_VMax_gen(
    const double *const _data_x, const int64_t _size_x_0,
    const int64_t _size_x_1, const int64_t _size_x_2, const int64_t _stride_x_0,
    const int64_t _stride_x_1, const int64_t _stride_x_2, double *const r) {
  // Neutral element of max: -infinity. 0xfff0000000000000 is its IEEE-754 bit
  // pattern and must be reinterpreted, not converted: a plain
  // `double x = 0xfff0000000000000;` yields roughly +1.84e19 instead.
  double r_local =
      __longlong_as_double(static_cast<long long>(0xfff0000000000000ULL));

  // Compute the global coordinates directly in 64 bits so that
  // blockIdx * blockDim cannot wrap in 32-bit arithmetic on large launches.
  const int64_t ctr_2 =
      static_cast<int64_t>(blockIdx.z) * blockDim.z + threadIdx.z;
  const int64_t ctr_1 =
      static_cast<int64_t>(blockIdx.y) * blockDim.y + threadIdx.y;
  const int64_t ctr_0 =
      static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;

  // Only in-bounds threads read the field and contribute to the reduction.
  if (ctr_2 < _size_x_2 && ctr_1 < _size_x_1 && ctr_0 < _size_x_0) {
    r_local = fmax(r_local, _data_x[ctr_0 * _stride_x_0 + ctr_1 * _stride_x_1 +
                                    ctr_2 * _stride_x_2]);
    atomicMax(r, r_local);
  }
}