Fix missing declarations of custom atomics when using pystencils-sfg (!468) · Merge requests · pycodegen / pystencils

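When the custom atomic max implementation below is wrapped in an `#ifdef __CUDA_ARCH__` guard and the file is compiled via `nvcc --generate-code=arch=compute_61,code=[compute_61,sm_61] test.cu`, nvcc cannot find the declaration of `atomicMax(double *, double)` during the host-side compilation pass, because `__CUDA_ARCH__` is only defined while compiling device code. The same error occurs when the generated code is embedded via pystencils-sfg.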

The fix is simply to remove the `__CUDA_ARCH__` guard; this MR applies that change to `gpu_atomics.h`.

/**
 * Atomic maximum for `double`, implemented as a 64-bit compare-and-swap loop.
 *
 * This definition is intentionally NOT wrapped in `#ifdef __CUDA_ARCH__`.
 * That macro is only defined during nvcc's device compilation passes, so a
 * guarded definition is invisible to the host pass, which still has to parse
 * the bodies of kernels that call this function and then fails with a
 * missing declaration. The `__device__` qualifier alone is enough to keep
 * the function out of host code generation.
 *
 * @param address Location whose stored value is raised to at least `val`.
 * @param val     Candidate value. If the comparison `val > current` is false
 *                (including when `val` is NaN) nothing is written.
 * @return The last value observed at `address`: the value that was replaced
 *         if this call stored `val`, otherwise the value that ended the loop
 *         because `val` was not greater than it.
 */
__device__ __forceinline__ double atomicMax(double *address, double val)
{
    // atomicCAS has no double overload, so operate on the raw 64-bit pattern.
    unsigned long long *const bits = (unsigned long long *)address;

    unsigned long long observed = __double_as_longlong(*address);
    while (val > __longlong_as_double(observed))
    {
        const unsigned long long expected = observed;
        observed = atomicCAS(bits, expected, __double_as_longlong(val));
        // The swap took effect only if the location still held `expected`;
        // otherwise `observed` is the newer value and the comparison is redone.
        if (observed == expected)
            break;
    }
    return __longlong_as_double(observed);
}

/**
 * Max-reduction kernel: folds the elements of a strided 3D array into `*r`.
 *
 * Each thread derives one index triple from its global thread position
 * (x -> axis 0, y -> axis 1, z -> axis 2). A thread whose triple lies inside
 * the extents reads its element and merges it into `*r` with atomicMax;
 * threads outside the extents do nothing, so the launch grid only needs to
 * cover the extents, not match them exactly.
 *
 * @param _data_x      Input array, indexed as
 *                     `ctr_0 * _stride_x_0 + ctr_1 * _stride_x_1 + ctr_2 * _stride_x_2`.
 * @param _size_x_0    Extent along axis 0 (x thread dimension).
 * @param _size_x_1    Extent along axis 1 (y thread dimension).
 * @param _size_x_2    Extent along axis 2 (z thread dimension).
 * @param _stride_x_0  Element stride along axis 0.
 * @param _stride_x_1  Element stride along axis 1.
 * @param _stride_x_2  Element stride along axis 2.
 * @param r            Accumulator. The kernel only ever raises `*r`, so the
 *                     result is the maximum of its value at launch and the
 *                     array elements; the caller is responsible for its
 *                     initial value.
 */
__global__ void PyCodegen_VMax_gen(
        const double *const _data_x, const int64_t _size_x_0,
        const int64_t _size_x_1, const int64_t _size_x_2, const int64_t _stride_x_0,
        const int64_t _stride_x_1, const int64_t _stride_x_2, double *const r) {
    // Widen before multiplying: the built-in index variables are 32-bit
    // unsigned, and the product must not wrap or turn negative for large
    // launches (a negative counter would pass the bounds check below).
    const int64_t ctr_2 = (int64_t)blockIdx.z * blockDim.z + threadIdx.z;
    const int64_t ctr_1 = (int64_t)blockIdx.y * blockDim.y + threadIdx.y;
    const int64_t ctr_0 = (int64_t)blockIdx.x * blockDim.x + threadIdx.x;

    if (ctr_2 < _size_x_2 && ctr_1 < _size_x_1 && ctr_0 < _size_x_0) {
        // Identity element of max. Writing the IEEE-754 bit pattern
        // 0xfff0000000000000 as a plain integer literal would convert it by
        // value (about 1.8e19) instead of producing negative infinity.
        double r_local = -INFINITY;
        r_local = fmax(r_local, _data_x[ctr_0 * _stride_x_0 + ctr_1 * _stride_x_1 +
                                        ctr_2 * _stride_x_2]);
        // One atomic per in-range thread publishes the thread-local result.
        atomicMax(r, r_local);
    }
}

Fix missing declarations of custom atomics when using pystencils-sfg

Merge request reports