VSX Philox

3f9b4068 · Michael Kuron · 4751ce0d · 3f9b4068 · 3f9b4068 · 3f9b4068
Commit 3f9b4068 authored 4 years ago by Michael Kuron
--- a/pystencils/backends/ppc_instruction_sets.py
+++ b/pystencils/backends/ppc_instruction_sets.py
@@ -63,7 +63,7 @@ def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'):
    result['intwidth'] = intwidth
    result[data_type] = f'__vector {data_type}'
    result['int'] = '__vector int'
-    result['bool'] = f'__vector bool {"long long" if data_type == "double" else "int"}'
+    result['bool'] = f'__vector __bool {"long long" if data_type == "double" else "int"}'
    result['headers'] = ['<altivec.h>', '"ppc_altivec_helpers.h"']
    result['makeVecConst'] = '((' + result[data_type] + '){{' + ", ".join(['{0}' for _ in range(width)]) + '}})'

--- a/pystencils/cpu/cpujit.py
+++ b/pystencils/cpu/cpujit.py
@@ -155,6 +155,9 @@ def read_config():
            ('flags', '-Ofast -DNDEBUG -fPIC -march=native -fopenmp -std=c++11'),
            ('restrict_qualifier', '__restrict__')
        ])
+        if platform.machine().startswith('ppc64'):
+            default_compiler_config['flags'] = default_compiler_config['flags'].replace('-march=native',
+                                                                                        '-mcpu=native')
    elif platform.system().lower() == 'windows':
        default_compiler_config = OrderedDict([
            ('os', 'windows'),

--- a/pystencils/include/philox_rand.h
+++ b/pystencils/include/philox_rand.h
@@ -16,8 +16,12 @@
 #include <arm_neon.h>
 #endif
+#ifdef __powerpc__
+#include <ppu_intrinsics.h>
+#endif
 #ifdef __ALTIVEC__
 #include <altivec.h>
+#undef bool
 #endif
 #ifndef __CUDA_ARCH__
@@ -288,6 +292,169 @@ QUALIFIERS void philox_double2(uint32 ctr0, __m128i ctr1, uint32 ctr2, uint32 ct
 }
 #endif
+#ifdef __ALTIVEC__
+QUALIFIERS void _philox4x32round(__vector uint32* ctr, __vector uint32* key)
+{
+    __vector uint32 lohi0a = (__vector uint32) vec_mule(ctr[0], vec_splats(PHILOX_M4x32_0));
+    __vector uint32 lohi0b = (__vector uint32) vec_mulo(ctr[0], vec_splats(PHILOX_M4x32_0));
+    __vector uint32 lohi1a = (__vector uint32) vec_mule(ctr[2], vec_splats(PHILOX_M4x32_1));
+    __vector uint32 lohi1b = (__vector uint32) vec_mulo(ctr[2], vec_splats(PHILOX_M4x32_1));
+    __vector uint32 lo0 = vec_mergee(lohi0a, lohi0b);
+    __vector uint32 lo1 = vec_mergee(lohi1a, lohi1b);
+    __vector uint32 hi0 = vec_mergeo(lohi0a, lohi0b);
+    __vector uint32 hi1 = vec_mergeo(lohi1a, lohi1b);
+    ctr[0] = vec_xor(vec_xor(hi1, ctr[1]), key[0]);
+    ctr[1] = lo1;
+    ctr[2] = vec_xor(vec_xor(hi0, ctr[3]), key[1]);
+    ctr[3] = lo0;
+}
+QUALIFIERS void _philox4x32bumpkey(__vector uint32* key)
+{
+    key[0] = vec_add(key[0], vec_splats(PHILOX_W32_0));
+    key[1] = vec_add(key[1], vec_splats(PHILOX_W32_1));
+}
+#ifdef __VSX__
+template<bool high>
+QUALIFIERS __vector double _uniform_double_hq(__vector uint32 x, __vector uint32 y)
+{
+    // convert 32 to 64 bit
+    if (high)
+    {
+        x = vec_mergel(x, vec_splats(0U));
+        y = vec_mergel(y, vec_splats(0U));
+    }
+    else
+    {
+        x = vec_mergeh(x, vec_splats(0U));
+        y = vec_mergeh(y, vec_splats(0U));
+    }
+    // calculate z = x ^ y << (53 - 32))
+    __vector uint64 z = vec_sl((__vector uint64) y, vec_splats(53ULL - 32ULL));
+    z = vec_xor((__vector uint64) x, z);
+    // convert uint64 to double
+    __vector double rs = __builtin_convertvector(z, __vector double); // vec_ctd(z, 0) is documented but not available
+    // calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
+    rs = vec_madd(rs, vec_splats(TWOPOW53_INV_DOUBLE), vec_splats(TWOPOW53_INV_DOUBLE/2.0));
+    return rs;
+}
+#endif
+QUALIFIERS void philox_float4(__vector uint32 ctr0, __vector uint32 ctr1, __vector uint32 ctr2, __vector uint32 ctr3,
+                              uint32 key0, uint32 key1,
+                              __vector float & rnd1, __vector float & rnd2, __vector float & rnd3, __vector float & rnd4)
+{
+    __vector uint32 key[2] = {vec_splats(key0), vec_splats(key1)};
+    __vector uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
+    _philox4x32round(ctr, key);                           // 1
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 2
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 3
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 4
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 5
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 6
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 7
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 8
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 9
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 10
+    // convert uint32 to float
+    rnd1 = vec_ctf(ctr[0], 0);
+    rnd2 = vec_ctf(ctr[1], 0);
+    rnd3 = vec_ctf(ctr[2], 0);
+    rnd4 = vec_ctf(ctr[3], 0);
+    // calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
+    rnd1 = vec_madd(rnd1, vec_splats(TWOPOW32_INV_FLOAT), vec_splats(TWOPOW32_INV_FLOAT/2.0f));
+    rnd2 = vec_madd(rnd2, vec_splats(TWOPOW32_INV_FLOAT), vec_splats(TWOPOW32_INV_FLOAT/2.0f));
+    rnd3 = vec_madd(rnd3, vec_splats(TWOPOW32_INV_FLOAT), vec_splats(TWOPOW32_INV_FLOAT/2.0f));
+    rnd4 = vec_madd(rnd4, vec_splats(TWOPOW32_INV_FLOAT), vec_splats(TWOPOW32_INV_FLOAT/2.0f));
+}
+#ifdef __VSX__
+QUALIFIERS void philox_double2(__vector uint32 ctr0, __vector uint32 ctr1, __vector uint32 ctr2, __vector uint32 ctr3,
+                               uint32 key0, uint32 key1,
+                               __vector double & rnd1lo, __vector double & rnd1hi, __vector double & rnd2lo, __vector double & rnd2hi)
+{
+    __vector uint32 key[2] = {vec_splats(key0), vec_splats(key1)};
+    __vector uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
+    _philox4x32round(ctr, key);                           // 1
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 2
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 3
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 4
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 5
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 6
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 7
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 8
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 9
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 10
+    rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
+    rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
+    rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
+    rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
+}
+#endif
+QUALIFIERS void philox_float4(uint32 ctr0, __vector uint32 ctr1, uint32 ctr2, uint32 ctr3,
+                              uint32 key0, uint32 key1,
+                              __vector float & rnd1, __vector float & rnd2, __vector float & rnd3, __vector float & rnd4)
+{
+    __vector uint32 ctr0v = vec_splats(ctr0);
+    __vector uint32 ctr2v = vec_splats(ctr2);
+    __vector uint32 ctr3v = vec_splats(ctr3);
+    philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
+}
+QUALIFIERS void philox_float4(uint32 ctr0, __vector int ctr1, uint32 ctr2, uint32 ctr3,
+                              uint32 key0, uint32 key1,
+                              __vector float & rnd1, __vector float & rnd2, __vector float & rnd3, __vector float & rnd4)
+{
+    philox_float4(ctr0, (__vector uint32) ctr1, ctr2, ctr3, key0, key1, rnd1, rnd2, rnd3, rnd4);
+}
+#ifdef __VSX__
+QUALIFIERS void philox_double2(uint32 ctr0, __vector uint32 ctr1, uint32 ctr2, uint32 ctr3,
+                               uint32 key0, uint32 key1,
+                               __vector double & rnd1lo, __vector double & rnd1hi, __vector double & rnd2lo, __vector double & rnd2hi)
+{
+    __vector uint32 ctr0v = vec_splats(ctr0);
+    __vector uint32 ctr2v = vec_splats(ctr2);
+    __vector uint32 ctr3v = vec_splats(ctr3);
+    philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
+}
+QUALIFIERS void philox_double2(uint32 ctr0, __vector uint32 ctr1, uint32 ctr2, uint32 ctr3,
+                               uint32 key0, uint32 key1,
+                               __vector double & rnd1, __vector double & rnd2)
+{
+    __vector uint32 ctr0v = vec_splats(ctr0);
+    __vector uint32 ctr2v = vec_splats(ctr2);
+    __vector uint32 ctr3v = vec_splats(ctr3);
+    __vector double ignore;
+    philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore);
+}
+QUALIFIERS void philox_double2(uint32 ctr0, __vector int ctr1, uint32 ctr2, uint32 ctr3,
+                               uint32 key0, uint32 key1,
+                               __vector double & rnd1, __vector double & rnd2)
+{
+    philox_double2(ctr0, (__vector uint32) ctr1, ctr2, ctr3, key0, key1, rnd1, rnd2);
+}
+#endif
+#endif
 #if defined(__ARM_NEON)
 QUALIFIERS void _philox4x32round(uint32x4_t* ctr, uint32x4_t* key)
 {

--- a/pystencils_tests/test_random.py
+++ b/pystencils_tests/test_random.py
@@ -27,8 +27,8 @@ if get_compiler_config()['os'] == 'windows':
 def test_rng(target, rng, precision, dtype, t=124, offsets=(0, 0), keys=(0, 0), offset_values=None):
    if target == 'gpu':
        pytest.importorskip('pycuda')
-    if instruction_sets and 'neon' in instruction_sets and rng == 'aesni':
+    if instruction_sets and set(['neon', 'vsx']).intersection(instruction_sets) and rng == 'aesni':
-        pytest.xfail('AES not yet implemented for ARM Neon')
+        pytest.xfail('AES not yet implemented for this architecture')
    if rng == 'aesni' and len(keys) == 2:
        keys *= 2
    if offset_values is None:
@@ -118,8 +118,8 @@ def test_rng_offsets(kind, vectorized):
 @pytest.mark.parametrize('rng', ('philox', 'aesni'))
 @pytest.mark.parametrize('precision,dtype', (('float', 'float'), ('double', 'double')))
 def test_rng_vectorized(target, rng, precision, dtype, t=130, offsets=(1, 3), keys=(0, 0), offset_values=None):
-    if target == 'neon' and rng == 'aesni':
+    if target in ['neon', 'vsx'] and rng == 'aesni':
-        pytest.xfail('AES not yet implemented for ARM Neon')
+        pytest.xfail('AES not yet implemented for this architecture')
    cpu_vectorize_info = {'assume_inner_stride_one': True, 'assume_aligned': True, 'instruction_set': target}
    dh = ps.create_data_handling((17, 17), default_ghost_layers=0, default_target='cpu')