diff --git a/pystencils/gpu/gpujit.py b/pystencils/gpu/gpujit.py index e29f85d4369b7dfc38bba9259a39bfc666dc0977..efa5af826df117b22d26a8f766a0c8b1730ee3f4 100644 --- a/pystencils/gpu/gpujit.py +++ b/pystencils/gpu/gpujit.py @@ -42,6 +42,9 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen headers = get_headers(kernel_function_node) if cp.cuda.runtime.is_hip: headers.add('"gpu_defines.h"') + for field in kernel_function_node.fields_accessed: + if isinstance(field.dtype, BasicType) and field.dtype.is_half(): + headers.add('<hip/hip_fp16.h>') else: headers.update({'"gpu_defines.h"', '<cstdint>'}) for field in kernel_function_node.fields_accessed: diff --git a/pystencils/include/gpu_defines.h b/pystencils/include/gpu_defines.h index 2d9875db2e0ea0360cccb17ac249f7bc79a2cb66..67e7722e9a01b217584dce14f0dcec16d2025c80 100644 --- a/pystencils/include/gpu_defines.h +++ b/pystencils/include/gpu_defines.h @@ -3,3 +3,10 @@ #define POS_INFINITY __int_as_float(0x7f800000) #define INFINITY POS_INFINITY #define NEG_INFINITY __int_as_float(0xff800000) + +#ifdef __HIPCC_RTC__ +typedef __hip_uint8_t uint8_t; +typedef __hip_int8_t int8_t; +typedef __hip_uint16_t uint16_t; +typedef __hip_int16_t int16_t; +#endif diff --git a/pystencils/include/philox_rand.h b/pystencils/include/philox_rand.h index fab94146889a854f09537b0395cbee5607355c1e..cb91b53b96c487b575d9d318a5f58d1460ed59ed 100644 --- a/pystencils/include/philox_rand.h +++ b/pystencils/include/philox_rand.h @@ -1,4 +1,4 @@ -#ifndef __OPENCL_VERSION__ +#if !defined(__OPENCL_VERSION__) && !defined(__HIPCC_RTC__) #if defined(__SSE2__) || (defined(_MSC_VER) && !defined(_M_ARM64)) #include <emmintrin.h> // SSE2 #endif @@ -38,7 +38,7 @@ #endif #endif -#ifdef __CUDA_ARCH__ +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) #define QUALIFIERS static __forceinline__ __device__ #elif defined(__OPENCL_VERSION__) #define QUALIFIERS static inline @@ -59,7 +59,9 @@ typedef uint32_t uint32; typedef uint64_t uint64; #else +#ifndef __HIPCC_RTC__ #include <cstdint> +#endif typedef std::uint32_t uint32; typedef std::uint64_t uint64; #endif @@ -75,7 +77,7 @@ typedef svfloat64_t svfloat64_st; QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32* hip) { -#ifndef __CUDA_ARCH__ +#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) // host code #if defined(__powerpc__) && (!defined(__clang__) || defined(__xlC__)) *hip = __mulhwu(a,b); @@ -186,7 +188,7 @@ QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3 #endif } -#if !defined(__CUDA_ARCH__) && !defined(__OPENCL_VERSION__) +#if !defined(__CUDA_ARCH__) && !defined(__OPENCL_VERSION__) && !defined(__HIP_DEVICE_COMPILE__) #if defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64)) QUALIFIERS void _philox4x32round(__m128i* ctr, __m128i* key) { diff --git a/pystencils_tests/test_custom_backends.py b/pystencils_tests/test_custom_backends.py index 9b625f8f9020e42d6528e4fdf64da7e89d777bae..c7bf7fe243dc6a9df08ba4152cf56ef6ea588bee 100644 --- a/pystencils_tests/test_custom_backends.py +++ b/pystencils_tests/test_custom_backends.py @@ -47,5 +47,5 @@ def test_custom_backends_gpu(): ast = pystencils.create_kernel(normal_assignments, target=Target.GPU) pystencils.show_code(ast, ScreamingGpuBackend()) - with pytest.raises(cupy.cuda.compiler.JitifyException): + with pytest.raises((cupy.cuda.compiler.JitifyException, cupy.cuda.compiler.CompileException)): pystencils.gpu.gpujit.make_python_function(ast, custom_backend=ScreamingGpuBackend())