Skip to content
Snippets Groups Projects
Commit fdb946ec authored by Michael Kuron
Browse files

make VSX portable across compilers

parent 9f9d301c
No related branches found
No related tags found
1 merge request: !228 "Vectorization improvements"
Pipeline #31059 passed
......@@ -23,7 +23,7 @@ def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'):
'*': 'mul[0, 1]',
'/': 'div[0, 1]',
'sqrt': 'sqrt[0]',
'rsqrt': 'rsqrt[0]',
'rsqrt': 'rsqrte[0]', # rsqrt is available too, but not on Clang
'loadU': 'xl[0x0, 0]',
'loadA': 'ld[0x0, 0]',
......@@ -73,6 +73,12 @@ def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'):
result[intrinsic_id] = 'vec_' + name + arg_string
if data_type == 'double':
# Clang and XL C++ are missing these for doubles
result['loadA'] = '(__vector double)' + result['loadA'].format('(float*) {0}')
result['storeA'] = result['storeA'].format('(float*) {0}', '(__vector float) {1}')
result['stream'] = result['stream'].format('(float*) {0}', '(__vector float) {1}')
result['+int'] = "vec_add({0}, {1})"
result['width'] = width
......@@ -82,10 +88,12 @@ def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'):
result['bool'] = f'__vector __bool {"long long" if data_type == "double" else "int"}'
result['headers'] = ['<altivec.h>', '"ppc_altivec_helpers.h"']
result['makeVecConst'] = '((' + result[data_type] + '){{' + ", ".join(['{0}' for _ in range(width)]) + '}})'
result['makeVec'] = '((' + result[data_type] + '){{' + ", ".join(['{' + str(i) + '}' for i in range(width)]) + '}})'
result['makeVecConstInt'] = '((' + result['int'] + '){{' + ", ".join(['{0}' for _ in range(intwidth)]) + '}})'
result['makeVecInt'] = '((' + result['int'] + '){{{0}, {1}, {2}, {3}}})'
result['makeVecConst'] = '((' + result[data_type] + '){{' + \
", ".join(['(' + data_type + ') {0}' for _ in range(width)]) + '}})'
result['makeVec'] = '((' + result[data_type] + '){{' + \
", ".join(['{' + data_type + '} {' + str(i) + '}' for i in range(width)]) + '}})'
result['makeVecConstInt'] = '((' + result['int'] + '){{' + ", ".join(['(int) {0}' for _ in range(intwidth)]) + '}})'
result['makeVecInt'] = '((' + result['int'] + '){{(int) {0}, (int) {1}, (int) {2}, (int) {3}}})'
result['any'] = 'vec_any_ne({0}, ((' + result['bool'] + ') {{' + ", ".join(['0'] * width) + '}}))'
result['all'] = 'vec_all_ne({0}, ((' + result['bool'] + ') {{' + ", ".join(['0'] * width) + '}}))'
......
......@@ -14,19 +14,28 @@ def get_vector_instruction_set(data_type='double', instruction_set='avx'):
return get_vector_instruction_set_x86(data_type, instruction_set)
_cache = None
def get_supported_instruction_sets():
"""List of supported instruction sets on current hardware, or None if query failed."""
global _cache
if _cache is not None:
return _cache.copy()
if platform.system() == 'Darwin' and platform.machine() == 'arm64': # not supported by cpuinfo
return ['neon']
elif platform.machine().startswith('ppc64'): # no flags reported by cpuinfo
import subprocess
import tempfile
from pystencils.cpu.cpujit import get_compiler_config
command = [get_compiler_config()['command'], '-mcpu=native', '-dM', '-E', '-']
f = tempfile.NamedTemporaryFile(suffix='.cpp')
command = [get_compiler_config()['command'], '-mcpu=native', '-dM', '-E', f.name]
macros = subprocess.check_output(command, input='', text=True)
if '#define __VSX__' in macros and '#define __ALTIVEC__' in macros:
return ['vsx']
_cache = ['vsx']
else:
return []
_cache = []
return _cache.copy()
try:
from cpuinfo import get_cpu_info
except ImportError:
......
......@@ -296,29 +296,29 @@ QUALIFIERS void philox_double2(uint32 ctr0, __m128i ctr1, uint32 ctr2, uint32 ct
#ifdef __ALTIVEC__
QUALIFIERS void _philox4x32round(__vector uint32* ctr, __vector uint32* key)
QUALIFIERS void _philox4x32round(__vector unsigned int* ctr, __vector unsigned int* key)
{
#ifdef __POWER10_VECTOR__
__vector uint32 lo0 = vec_mul(ctr[0], vec_splats(PHILOX_M4x32_0));
__vector uint32 lo1 = vec_mul(ctr[2], vec_splats(PHILOX_M4x32_1));
__vector uint32 hi0 = vec_mulh(ctr[0], vec_splats(PHILOX_M4x32_0));
__vector uint32 hi1 = vec_mulh(ctr[2], vec_splats(PHILOX_M4x32_1));
__vector unsigned int lo0 = vec_mul(ctr[0], vec_splats(PHILOX_M4x32_0));
__vector unsigned int lo1 = vec_mul(ctr[2], vec_splats(PHILOX_M4x32_1));
__vector unsigned int hi0 = vec_mulh(ctr[0], vec_splats(PHILOX_M4x32_0));
__vector unsigned int hi1 = vec_mulh(ctr[2], vec_splats(PHILOX_M4x32_1));
#else
__vector uint32 lohi0a = (__vector uint32) vec_mule(ctr[0], vec_splats(PHILOX_M4x32_0));
__vector uint32 lohi0b = (__vector uint32) vec_mulo(ctr[0], vec_splats(PHILOX_M4x32_0));
__vector uint32 lohi1a = (__vector uint32) vec_mule(ctr[2], vec_splats(PHILOX_M4x32_1));
__vector uint32 lohi1b = (__vector uint32) vec_mulo(ctr[2], vec_splats(PHILOX_M4x32_1));
__vector unsigned int lohi0a = (__vector unsigned int) vec_mule(ctr[0], vec_splats(PHILOX_M4x32_0));
__vector unsigned int lohi0b = (__vector unsigned int) vec_mulo(ctr[0], vec_splats(PHILOX_M4x32_0));
__vector unsigned int lohi1a = (__vector unsigned int) vec_mule(ctr[2], vec_splats(PHILOX_M4x32_1));
__vector unsigned int lohi1b = (__vector unsigned int) vec_mulo(ctr[2], vec_splats(PHILOX_M4x32_1));
#ifdef __LITTLE_ENDIAN__
__vector uint32 lo0 = vec_mergee(lohi0a, lohi0b);
__vector uint32 lo1 = vec_mergee(lohi1a, lohi1b);
__vector uint32 hi0 = vec_mergeo(lohi0a, lohi0b);
__vector uint32 hi1 = vec_mergeo(lohi1a, lohi1b);
__vector unsigned int lo0 = vec_mergee(lohi0a, lohi0b);
__vector unsigned int lo1 = vec_mergee(lohi1a, lohi1b);
__vector unsigned int hi0 = vec_mergeo(lohi0a, lohi0b);
__vector unsigned int hi1 = vec_mergeo(lohi1a, lohi1b);
#else
__vector uint32 lo0 = vec_mergeo(lohi0a, lohi0b);
__vector uint32 lo1 = vec_mergeo(lohi1a, lohi1b);
__vector uint32 hi0 = vec_mergee(lohi0a, lohi0b);
__vector uint32 hi1 = vec_mergee(lohi1a, lohi1b);
__vector unsigned int lo0 = vec_mergeo(lohi0a, lohi0b);
__vector unsigned int lo1 = vec_mergeo(lohi1a, lohi1b);
__vector unsigned int hi0 = vec_mergee(lohi0a, lohi0b);
__vector unsigned int hi1 = vec_mergee(lohi1a, lohi1b);
#endif
#endif
......@@ -328,7 +328,7 @@ QUALIFIERS void _philox4x32round(__vector uint32* ctr, __vector uint32* key)
ctr[3] = lo0;
}
QUALIFIERS void _philox4x32bumpkey(__vector uint32* key)
QUALIFIERS void _philox4x32bumpkey(__vector unsigned int* key)
{
key[0] = vec_add(key[0], vec_splats(PHILOX_W32_0));
key[1] = vec_add(key[1], vec_splats(PHILOX_W32_1));
......@@ -336,7 +336,7 @@ QUALIFIERS void _philox4x32bumpkey(__vector uint32* key)
#ifdef __VSX__
template<bool high>
QUALIFIERS __vector double _uniform_double_hq(__vector uint32 x, __vector uint32 y)
QUALIFIERS __vector double _uniform_double_hq(__vector unsigned int x, __vector unsigned int y)
{
// convert 32 to 64 bit
#ifdef __LITTLE_ENDIAN__
......@@ -364,16 +364,14 @@ QUALIFIERS __vector double _uniform_double_hq(__vector uint32 x, __vector uint32
#endif
// calculate z = x ^ (y << (53 - 32))
__vector uint64 z = vec_sl((__vector uint64) y, vec_splats(53ULL - 32ULL));
z = vec_xor((__vector uint64) x, z);
__vector unsigned long long z = vec_sl((__vector unsigned long long) y, vec_splats(53ULL - 32ULL));
z = vec_xor((__vector unsigned long long) x, z);
// convert uint64 to double
#if defined(__has_builtin) && __has_builtin(__builtin_convertvector)
__vector double rs = __builtin_convertvector(z, __vector double);
#elif defined(__GNUC__) && __GNUC__ >= 8
__vector double rs = vec_ctf(z, 0);
#else
#ifdef __ibmxl__
__vector double rs = vec_ctd(z, 0);
#else
__vector double rs = vec_ctf(z, 0);
#endif
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
rs = vec_madd(rs, vec_splats(TWOPOW53_INV_DOUBLE), vec_splats(TWOPOW53_INV_DOUBLE/2.0));
......@@ -383,12 +381,12 @@ QUALIFIERS __vector double _uniform_double_hq(__vector uint32 x, __vector uint32
#endif
QUALIFIERS void philox_float4(__vector uint32 ctr0, __vector uint32 ctr1, __vector uint32 ctr2, __vector uint32 ctr3,
QUALIFIERS void philox_float4(__vector unsigned int ctr0, __vector unsigned int ctr1, __vector unsigned int ctr2, __vector unsigned int ctr3,
uint32 key0, uint32 key1,
__vector float & rnd1, __vector float & rnd2, __vector float & rnd3, __vector float & rnd4)
{
__vector uint32 key[2] = {vec_splats(key0), vec_splats(key1)};
__vector uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
__vector unsigned int key[2] = {vec_splats(key0), vec_splats(key1)};
__vector unsigned int ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
......@@ -414,12 +412,12 @@ QUALIFIERS void philox_float4(__vector uint32 ctr0, __vector uint32 ctr1, __vect
#ifdef __VSX__
QUALIFIERS void philox_double2(__vector uint32 ctr0, __vector uint32 ctr1, __vector uint32 ctr2, __vector uint32 ctr3,
QUALIFIERS void philox_double2(__vector unsigned int ctr0, __vector unsigned int ctr1, __vector unsigned int ctr2, __vector unsigned int ctr3,
uint32 key0, uint32 key1,
__vector double & rnd1lo, __vector double & rnd1hi, __vector double & rnd2lo, __vector double & rnd2hi)
{
__vector uint32 key[2] = {vec_splats(key0), vec_splats(key1)};
__vector uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
__vector unsigned int key[2] = {vec_splats(key0), vec_splats(key1)};
__vector unsigned int ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
......@@ -438,13 +436,13 @@ QUALIFIERS void philox_double2(__vector uint32 ctr0, __vector uint32 ctr1, __vec
}
#endif
QUALIFIERS void philox_float4(uint32 ctr0, __vector uint32 ctr1, uint32 ctr2, uint32 ctr3,
QUALIFIERS void philox_float4(uint32 ctr0, __vector unsigned int ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
__vector float & rnd1, __vector float & rnd2, __vector float & rnd3, __vector float & rnd4)
{
__vector uint32 ctr0v = vec_splats(ctr0);
__vector uint32 ctr2v = vec_splats(ctr2);
__vector uint32 ctr3v = vec_splats(ctr3);
__vector unsigned int ctr0v = vec_splats(ctr0);
__vector unsigned int ctr2v = vec_splats(ctr2);
__vector unsigned int ctr3v = vec_splats(ctr3);
philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
}
......@@ -453,28 +451,28 @@ QUALIFIERS void philox_float4(uint32 ctr0, __vector int ctr1, uint32 ctr2, uint3
uint32 key0, uint32 key1,
__vector float & rnd1, __vector float & rnd2, __vector float & rnd3, __vector float & rnd4)
{
philox_float4(ctr0, (__vector uint32) ctr1, ctr2, ctr3, key0, key1, rnd1, rnd2, rnd3, rnd4);
philox_float4(ctr0, (__vector unsigned int) ctr1, ctr2, ctr3, key0, key1, rnd1, rnd2, rnd3, rnd4);
}
#ifdef __VSX__
QUALIFIERS void philox_double2(uint32 ctr0, __vector uint32 ctr1, uint32 ctr2, uint32 ctr3,
QUALIFIERS void philox_double2(uint32 ctr0, __vector unsigned int ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
__vector double & rnd1lo, __vector double & rnd1hi, __vector double & rnd2lo, __vector double & rnd2hi)
{
__vector uint32 ctr0v = vec_splats(ctr0);
__vector uint32 ctr2v = vec_splats(ctr2);
__vector uint32 ctr3v = vec_splats(ctr3);
__vector unsigned int ctr0v = vec_splats(ctr0);
__vector unsigned int ctr2v = vec_splats(ctr2);
__vector unsigned int ctr3v = vec_splats(ctr3);
philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
}
QUALIFIERS void philox_double2(uint32 ctr0, __vector uint32 ctr1, uint32 ctr2, uint32 ctr3,
QUALIFIERS void philox_double2(uint32 ctr0, __vector unsigned int ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
__vector double & rnd1, __vector double & rnd2)
{
__vector uint32 ctr0v = vec_splats(ctr0);
__vector uint32 ctr2v = vec_splats(ctr2);
__vector uint32 ctr3v = vec_splats(ctr3);
__vector unsigned int ctr0v = vec_splats(ctr0);
__vector unsigned int ctr2v = vec_splats(ctr2);
__vector unsigned int ctr3v = vec_splats(ctr3);
__vector double ignore;
philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore);
......@@ -484,7 +482,7 @@ QUALIFIERS void philox_double2(uint32 ctr0, __vector int ctr1, uint32 ctr2, uint
uint32 key0, uint32 key1,
__vector double & rnd1, __vector double & rnd2)
{
philox_double2(ctr0, (__vector uint32) ctr1, ctr2, ctr3, key0, key1, rnd1, rnd2);
philox_double2(ctr0, (__vector unsigned int) ctr1, ctr2, ctr3, key0, key1, rnd1, rnd2);
}
#endif
#endif
......
......@@ -41,7 +41,7 @@ exclude_lines =
if __name__ == .__main__.:
skip_covered = True
fail_under = 89
fail_under = 88
[html]
directory = coverage_report
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment