Skip to content
Snippets Groups Projects
Commit 3f9b4068 authored by Michael Kuron's avatar Michael Kuron :mortar_board:
Browse files

VSX Philox

parent 4751ce0d
No related branches found
No related tags found
1 merge request!228Vectorization improvements
...@@ -63,7 +63,7 @@ def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'): ...@@ -63,7 +63,7 @@ def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'):
result['intwidth'] = intwidth result['intwidth'] = intwidth
result[data_type] = f'__vector {data_type}' result[data_type] = f'__vector {data_type}'
result['int'] = '__vector int' result['int'] = '__vector int'
result['bool'] = f'__vector bool {"long long" if data_type == "double" else "int"}' result['bool'] = f'__vector __bool {"long long" if data_type == "double" else "int"}'
result['headers'] = ['<altivec.h>', '"ppc_altivec_helpers.h"'] result['headers'] = ['<altivec.h>', '"ppc_altivec_helpers.h"']
result['makeVecConst'] = '((' + result[data_type] + '){{' + ", ".join(['{0}' for _ in range(width)]) + '}})' result['makeVecConst'] = '((' + result[data_type] + '){{' + ", ".join(['{0}' for _ in range(width)]) + '}})'
......
...@@ -155,6 +155,9 @@ def read_config(): ...@@ -155,6 +155,9 @@ def read_config():
('flags', '-Ofast -DNDEBUG -fPIC -march=native -fopenmp -std=c++11'), ('flags', '-Ofast -DNDEBUG -fPIC -march=native -fopenmp -std=c++11'),
('restrict_qualifier', '__restrict__') ('restrict_qualifier', '__restrict__')
]) ])
if platform.machine().startswith('ppc64'):
default_compiler_config['flags'] = default_compiler_config['flags'].replace('-march=native',
'-mcpu=native')
elif platform.system().lower() == 'windows': elif platform.system().lower() == 'windows':
default_compiler_config = OrderedDict([ default_compiler_config = OrderedDict([
('os', 'windows'), ('os', 'windows'),
......
...@@ -16,8 +16,12 @@ ...@@ -16,8 +16,12 @@
#include <arm_neon.h> #include <arm_neon.h>
#endif #endif
#ifdef __powerpc__
#include <ppu_intrinsics.h>
#endif
#ifdef __ALTIVEC__ #ifdef __ALTIVEC__
#include <altivec.h> #include <altivec.h>
#undef bool
#endif #endif
#ifndef __CUDA_ARCH__ #ifndef __CUDA_ARCH__
...@@ -288,6 +292,169 @@ QUALIFIERS void philox_double2(uint32 ctr0, __m128i ctr1, uint32 ctr2, uint32 ct ...@@ -288,6 +292,169 @@ QUALIFIERS void philox_double2(uint32 ctr0, __m128i ctr1, uint32 ctr2, uint32 ct
} }
#endif #endif
#ifdef __ALTIVEC__
QUALIFIERS void _philox4x32round(__vector uint32* ctr, __vector uint32* key)
{
__vector uint32 lohi0a = (__vector uint32) vec_mule(ctr[0], vec_splats(PHILOX_M4x32_0));
__vector uint32 lohi0b = (__vector uint32) vec_mulo(ctr[0], vec_splats(PHILOX_M4x32_0));
__vector uint32 lohi1a = (__vector uint32) vec_mule(ctr[2], vec_splats(PHILOX_M4x32_1));
__vector uint32 lohi1b = (__vector uint32) vec_mulo(ctr[2], vec_splats(PHILOX_M4x32_1));
__vector uint32 lo0 = vec_mergee(lohi0a, lohi0b);
__vector uint32 lo1 = vec_mergee(lohi1a, lohi1b);
__vector uint32 hi0 = vec_mergeo(lohi0a, lohi0b);
__vector uint32 hi1 = vec_mergeo(lohi1a, lohi1b);
ctr[0] = vec_xor(vec_xor(hi1, ctr[1]), key[0]);
ctr[1] = lo1;
ctr[2] = vec_xor(vec_xor(hi0, ctr[3]), key[1]);
ctr[3] = lo0;
}
QUALIFIERS void _philox4x32bumpkey(__vector uint32* key)
{
key[0] = vec_add(key[0], vec_splats(PHILOX_W32_0));
key[1] = vec_add(key[1], vec_splats(PHILOX_W32_1));
}
#ifdef __VSX__
template<bool high>
QUALIFIERS __vector double _uniform_double_hq(__vector uint32 x, __vector uint32 y)
{
// convert 32 to 64 bit
if (high)
{
x = vec_mergel(x, vec_splats(0U));
y = vec_mergel(y, vec_splats(0U));
}
else
{
x = vec_mergeh(x, vec_splats(0U));
y = vec_mergeh(y, vec_splats(0U));
}
// calculate z = x ^ y << (53 - 32))
__vector uint64 z = vec_sl((__vector uint64) y, vec_splats(53ULL - 32ULL));
z = vec_xor((__vector uint64) x, z);
// convert uint64 to double
__vector double rs = __builtin_convertvector(z, __vector double); // vec_ctd(z, 0) is documented but not available
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
rs = vec_madd(rs, vec_splats(TWOPOW53_INV_DOUBLE), vec_splats(TWOPOW53_INV_DOUBLE/2.0));
return rs;
}
#endif
QUALIFIERS void philox_float4(__vector uint32 ctr0, __vector uint32 ctr1, __vector uint32 ctr2, __vector uint32 ctr3,
uint32 key0, uint32 key1,
__vector float & rnd1, __vector float & rnd2, __vector float & rnd3, __vector float & rnd4)
{
__vector uint32 key[2] = {vec_splats(key0), vec_splats(key1)};
__vector uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
// convert uint32 to float
rnd1 = vec_ctf(ctr[0], 0);
rnd2 = vec_ctf(ctr[1], 0);
rnd3 = vec_ctf(ctr[2], 0);
rnd4 = vec_ctf(ctr[3], 0);
// calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
rnd1 = vec_madd(rnd1, vec_splats(TWOPOW32_INV_FLOAT), vec_splats(TWOPOW32_INV_FLOAT/2.0f));
rnd2 = vec_madd(rnd2, vec_splats(TWOPOW32_INV_FLOAT), vec_splats(TWOPOW32_INV_FLOAT/2.0f));
rnd3 = vec_madd(rnd3, vec_splats(TWOPOW32_INV_FLOAT), vec_splats(TWOPOW32_INV_FLOAT/2.0f));
rnd4 = vec_madd(rnd4, vec_splats(TWOPOW32_INV_FLOAT), vec_splats(TWOPOW32_INV_FLOAT/2.0f));
}
#ifdef __VSX__
QUALIFIERS void philox_double2(__vector uint32 ctr0, __vector uint32 ctr1, __vector uint32 ctr2, __vector uint32 ctr3,
uint32 key0, uint32 key1,
__vector double & rnd1lo, __vector double & rnd1hi, __vector double & rnd2lo, __vector double & rnd2hi)
{
__vector uint32 key[2] = {vec_splats(key0), vec_splats(key1)};
__vector uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
}
#endif
QUALIFIERS void philox_float4(uint32 ctr0, __vector uint32 ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
__vector float & rnd1, __vector float & rnd2, __vector float & rnd3, __vector float & rnd4)
{
__vector uint32 ctr0v = vec_splats(ctr0);
__vector uint32 ctr2v = vec_splats(ctr2);
__vector uint32 ctr3v = vec_splats(ctr3);
philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
}
QUALIFIERS void philox_float4(uint32 ctr0, __vector int ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
__vector float & rnd1, __vector float & rnd2, __vector float & rnd3, __vector float & rnd4)
{
philox_float4(ctr0, (__vector uint32) ctr1, ctr2, ctr3, key0, key1, rnd1, rnd2, rnd3, rnd4);
}
#ifdef __VSX__
QUALIFIERS void philox_double2(uint32 ctr0, __vector uint32 ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
__vector double & rnd1lo, __vector double & rnd1hi, __vector double & rnd2lo, __vector double & rnd2hi)
{
__vector uint32 ctr0v = vec_splats(ctr0);
__vector uint32 ctr2v = vec_splats(ctr2);
__vector uint32 ctr3v = vec_splats(ctr3);
philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
}
QUALIFIERS void philox_double2(uint32 ctr0, __vector uint32 ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
__vector double & rnd1, __vector double & rnd2)
{
__vector uint32 ctr0v = vec_splats(ctr0);
__vector uint32 ctr2v = vec_splats(ctr2);
__vector uint32 ctr3v = vec_splats(ctr3);
__vector double ignore;
philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore);
}
QUALIFIERS void philox_double2(uint32 ctr0, __vector int ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
__vector double & rnd1, __vector double & rnd2)
{
philox_double2(ctr0, (__vector uint32) ctr1, ctr2, ctr3, key0, key1, rnd1, rnd2);
}
#endif
#endif
#if defined(__ARM_NEON) #if defined(__ARM_NEON)
QUALIFIERS void _philox4x32round(uint32x4_t* ctr, uint32x4_t* key) QUALIFIERS void _philox4x32round(uint32x4_t* ctr, uint32x4_t* key)
{ {
......
...@@ -27,8 +27,8 @@ if get_compiler_config()['os'] == 'windows': ...@@ -27,8 +27,8 @@ if get_compiler_config()['os'] == 'windows':
def test_rng(target, rng, precision, dtype, t=124, offsets=(0, 0), keys=(0, 0), offset_values=None): def test_rng(target, rng, precision, dtype, t=124, offsets=(0, 0), keys=(0, 0), offset_values=None):
if target == 'gpu': if target == 'gpu':
pytest.importorskip('pycuda') pytest.importorskip('pycuda')
if instruction_sets and 'neon' in instruction_sets and rng == 'aesni': if instruction_sets and set(['neon', 'vsx']).intersection(instruction_sets) and rng == 'aesni':
pytest.xfail('AES not yet implemented for ARM Neon') pytest.xfail('AES not yet implemented for this architecture')
if rng == 'aesni' and len(keys) == 2: if rng == 'aesni' and len(keys) == 2:
keys *= 2 keys *= 2
if offset_values is None: if offset_values is None:
...@@ -118,8 +118,8 @@ def test_rng_offsets(kind, vectorized): ...@@ -118,8 +118,8 @@ def test_rng_offsets(kind, vectorized):
@pytest.mark.parametrize('rng', ('philox', 'aesni')) @pytest.mark.parametrize('rng', ('philox', 'aesni'))
@pytest.mark.parametrize('precision,dtype', (('float', 'float'), ('double', 'double'))) @pytest.mark.parametrize('precision,dtype', (('float', 'float'), ('double', 'double')))
def test_rng_vectorized(target, rng, precision, dtype, t=130, offsets=(1, 3), keys=(0, 0), offset_values=None): def test_rng_vectorized(target, rng, precision, dtype, t=130, offsets=(1, 3), keys=(0, 0), offset_values=None):
if target == 'neon' and rng == 'aesni': if target in ['neon', 'vsx'] and rng == 'aesni':
pytest.xfail('AES not yet implemented for ARM Neon') pytest.xfail('AES not yet implemented for this architecture')
cpu_vectorize_info = {'assume_inner_stride_one': True, 'assume_aligned': True, 'instruction_set': target} cpu_vectorize_info = {'assume_inner_stride_one': True, 'assume_aligned': True, 'instruction_set': target}
dh = ps.create_data_handling((17, 17), default_ghost_layers=0, default_target='cpu') dh = ps.create_data_handling((17, 17), default_ghost_layers=0, default_target='cpu')
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment