Skip to content
Snippets Groups Projects
Commit 144c6e14 authored by Michael Kuron's avatar Michael Kuron :mortar_board:
Browse files

OpenCL RNG

Unfortunately most OpenCL implementations don't support C++
parent fce1a417
No related branches found
No related tags found
1 merge request!247OpenCL RNG
Pipeline #32239 passed
#include <cstdint>
#ifndef __OPENCL_VERSION__
#if defined(__SSE2__) || defined(_MSC_VER)
#include <emmintrin.h> // SSE2
#endif
......@@ -33,12 +32,15 @@
#ifdef __riscv_v
#include <riscv_vector.h>
#endif
#endif
#ifndef __CUDA_ARCH__
#ifdef __CUDA_ARCH__
#define QUALIFIERS static __forceinline__ __device__
#elif defined(__OPENCL_VERSION__)
#define QUALIFIERS static inline
#else
#define QUALIFIERS inline
#include "myintrin.h"
#else
#define QUALIFIERS static __forceinline__ __device__
#endif
#define PHILOX_W32_0 (0x9E3779B9)
......@@ -48,8 +50,15 @@
#define TWOPOW53_INV_DOUBLE (1.1102230246251565e-16)
#define TWOPOW32_INV_FLOAT (2.3283064e-10f)
#ifdef __OPENCL_VERSION__
#include "opencl_stdint.h"
typedef uint32_t uint32;
typedef uint64_t uint64;
#else
#include <cstdint>
typedef std::uint32_t uint32;
typedef std::uint64_t uint64;
#endif
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS > 0
typedef svfloat32_t svfloat32_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
......@@ -67,6 +76,9 @@ QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32* hip)
#if defined(__powerpc__) && (!defined(__clang__) || defined(__xlC__))
*hip = __mulhwu(a,b);
return a*b;
#elif defined(__OPENCL_VERSION__)
*hip = mul_hi(a,b);
return a*b;
#else
uint64 product = ((uint64)a) * ((uint64)b);
*hip = product >> 32;
......@@ -106,7 +118,12 @@ QUALIFIERS double _uniform_double_hq(uint32 x, uint32 y)
QUALIFIERS void philox_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, double & rnd1, double & rnd2)
uint32 key0, uint32 key1,
#ifdef __OPENCL_VERSION__
double * rnd1, double * rnd2)
#else
double & rnd1, double & rnd2)
#endif
{
uint32 key[2] = {key0, key1};
uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
......@@ -121,14 +138,23 @@ QUALIFIERS void philox_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
#ifdef __OPENCL_VERSION__
*rnd1 = _uniform_double_hq(ctr[0], ctr[1]);
*rnd2 = _uniform_double_hq(ctr[2], ctr[3]);
#else
rnd1 = _uniform_double_hq(ctr[0], ctr[1]);
rnd2 = _uniform_double_hq(ctr[2], ctr[3]);
#endif
}
QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
#ifdef __OPENCL_VERSION__
float * rnd1, float * rnd2, float * rnd3, float * rnd4)
#else
float & rnd1, float & rnd2, float & rnd3, float & rnd4)
#endif
{
uint32 key[2] = {key0, key1};
uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
......@@ -143,13 +169,20 @@ QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
#ifdef __OPENCL_VERSION__
*rnd1 = ctr[0] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
*rnd2 = ctr[1] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
*rnd3 = ctr[2] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
*rnd4 = ctr[3] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
#else
rnd1 = ctr[0] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
rnd2 = ctr[1] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
rnd3 = ctr[2] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
rnd4 = ctr[3] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
#endif
}
#ifndef __CUDA_ARCH__
#if !defined(__CUDA_ARCH__) && !defined(__OPENCL_VERSION__)
#if defined(__SSE4_1__) || defined(_MSC_VER)
QUALIFIERS void _philox4x32round(__m128i* ctr, __m128i* key)
{
......
......@@ -53,8 +53,9 @@ class RNGBase(CustomCodeNode):
else:
code += f"{vector_instruction_set[r.dtype.base_name] if vector_instruction_set else r.dtype} " + \
f"{r.name};\n"
code += (self._name + "(" + ", ".join([print_arg(a) for a in self.args]
+ [r.name for r in self.result_symbols]) + ");\n")
args = [print_arg(a) for a in self.args] + \
[('&' if dialect == 'opencl' else '') + r.name for r in self.result_symbols]
code += (self._name + "(" + ", ".join(args) + ");\n")
return code
def __repr__(self):
......
......@@ -21,12 +21,16 @@ if get_compiler_config()['os'] == 'windows':
instruction_sets.remove('avx512')
@pytest.mark.parametrize('target,rng', (('cpu', 'philox'), ('cpu', 'aesni'), ('gpu', 'philox')))
@pytest.mark.parametrize('target,rng', (('cpu', 'philox'), ('cpu', 'aesni'), ('gpu', 'philox'), ('opencl', 'philox')))
@pytest.mark.parametrize('precision', ('float', 'double'))
@pytest.mark.parametrize('dtype', ('float', 'double'))
def test_rng(target, rng, precision, dtype, t=124, offsets=(0, 0), keys=(0, 0), offset_values=None):
if target == 'gpu':
pytest.importorskip('pycuda')
if target == 'opencl':
pytest.importorskip('pyopencl')
from pystencils.opencl.opencljit import init_globally
init_globally()
if instruction_sets and set(['neon', 'sve', 'vsx', 'rvv']).intersection(instruction_sets) and rng == 'aesni':
pytest.xfail('AES not yet implemented for this architecture')
if rng == 'aesni' and len(keys) == 2:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment