Skip to content
Snippets Groups Projects
Commit 059e2eef authored by Michael Kuron
Browse files

Endianness fix

parent 3f9b4068
No related branches found
No related tags found
1 merge request: !228 Vectorization improvements
Pipeline #30943 passed
...@@ -14,6 +14,9 @@ def get_argument_string(function_shortcut): ...@@ -14,6 +14,9 @@ def get_argument_string(function_shortcut):
def get_vector_instruction_set_arm(data_type='double', instruction_set='neon', q_registers=True): def get_vector_instruction_set_arm(data_type='double', instruction_set='neon', q_registers=True):
if instruction_set != 'neon':
raise NotImplementedError(instruction_set)
base_names = { base_names = {
'+': 'add[0, 1]', '+': 'add[0, 1]',
'-': 'sub[0, 1]', '-': 'sub[0, 1]',
......
...@@ -14,6 +14,9 @@ def get_argument_string(function_shortcut): ...@@ -14,6 +14,9 @@ def get_argument_string(function_shortcut):
def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'): def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'):
if instruction_set != 'vsx':
raise NotImplementedError(instruction_set)
base_names = { base_names = {
'+': 'add[0, 1]', '+': 'add[0, 1]',
'-': 'sub[0, 1]', '-': 'sub[0, 1]',
......
...@@ -21,7 +21,7 @@ def get_supported_instruction_sets(): ...@@ -21,7 +21,7 @@ def get_supported_instruction_sets():
elif platform.machine().startswith('ppc64'): # no flags reported by cpuinfo elif platform.machine().startswith('ppc64'): # no flags reported by cpuinfo
import subprocess import subprocess
from pystencils.cpu.cpujit import get_compiler_config from pystencils.cpu.cpujit import get_compiler_config
command = [get_compiler_config()['command'], '-dM', '-E', '-'] command = [get_compiler_config()['command'], '-mcpu=native', '-dM', '-E', '-']
macros = subprocess.check_output(command, input='', text=True) macros = subprocess.check_output(command, input='', text=True)
if '#define __VSX__' in macros and '#define __ALTIVEC__' in macros: if '#define __VSX__' in macros and '#define __ALTIVEC__' in macros:
return ['vsx'] return ['vsx']
......
...@@ -44,14 +44,16 @@ typedef std::uint64_t uint64; ...@@ -44,14 +44,16 @@ typedef std::uint64_t uint64;
QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32* hip) QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32* hip)
{ {
#ifndef __CUDA_ARCH__
// host code
#ifdef __powerpc__ #ifdef __powerpc__
*hip = __mulhwu(a,b); *hip = __mulhwu(a,b);
return a*b; return a*b;
#elif !defined(__CUDA_ARCH__) #else
// host code
uint64 product = ((uint64)a) * ((uint64)b); uint64 product = ((uint64)a) * ((uint64)b);
*hip = product >> 32; *hip = product >> 32;
return (uint32)product; return (uint32)product;
#endif
#else #else
// device code // device code
*hip = __umulhi(a,b); *hip = __umulhi(a,b);
...@@ -296,15 +298,29 @@ QUALIFIERS void philox_double2(uint32 ctr0, __m128i ctr1, uint32 ctr2, uint32 ct ...@@ -296,15 +298,29 @@ QUALIFIERS void philox_double2(uint32 ctr0, __m128i ctr1, uint32 ctr2, uint32 ct
#ifdef __ALTIVEC__ #ifdef __ALTIVEC__
QUALIFIERS void _philox4x32round(__vector uint32* ctr, __vector uint32* key) QUALIFIERS void _philox4x32round(__vector uint32* ctr, __vector uint32* key)
{ {
#ifdef __POWER10_VECTOR__
__vector uint32 lo0 = vec_mul(ctr[0], vec_splats(PHILOX_M4x32_0));
__vector uint32 lo1 = vec_mul(ctr[2], vec_splats(PHILOX_M4x32_1));
__vector uint32 hi0 = vec_mulh(ctr[0], vec_splats(PHILOX_M4x32_0));
__vector uint32 hi1 = vec_mulh(ctr[2], vec_splats(PHILOX_M4x32_1));
#else
__vector uint32 lohi0a = (__vector uint32) vec_mule(ctr[0], vec_splats(PHILOX_M4x32_0)); __vector uint32 lohi0a = (__vector uint32) vec_mule(ctr[0], vec_splats(PHILOX_M4x32_0));
__vector uint32 lohi0b = (__vector uint32) vec_mulo(ctr[0], vec_splats(PHILOX_M4x32_0)); __vector uint32 lohi0b = (__vector uint32) vec_mulo(ctr[0], vec_splats(PHILOX_M4x32_0));
__vector uint32 lohi1a = (__vector uint32) vec_mule(ctr[2], vec_splats(PHILOX_M4x32_1)); __vector uint32 lohi1a = (__vector uint32) vec_mule(ctr[2], vec_splats(PHILOX_M4x32_1));
__vector uint32 lohi1b = (__vector uint32) vec_mulo(ctr[2], vec_splats(PHILOX_M4x32_1)); __vector uint32 lohi1b = (__vector uint32) vec_mulo(ctr[2], vec_splats(PHILOX_M4x32_1));
#ifdef __LITTLE_ENDIAN__
__vector uint32 lo0 = vec_mergee(lohi0a, lohi0b); __vector uint32 lo0 = vec_mergee(lohi0a, lohi0b);
__vector uint32 lo1 = vec_mergee(lohi1a, lohi1b); __vector uint32 lo1 = vec_mergee(lohi1a, lohi1b);
__vector uint32 hi0 = vec_mergeo(lohi0a, lohi0b); __vector uint32 hi0 = vec_mergeo(lohi0a, lohi0b);
__vector uint32 hi1 = vec_mergeo(lohi1a, lohi1b); __vector uint32 hi1 = vec_mergeo(lohi1a, lohi1b);
#else
__vector uint32 lo0 = vec_mergeo(lohi0a, lohi0b);
__vector uint32 lo1 = vec_mergeo(lohi1a, lohi1b);
__vector uint32 hi0 = vec_mergee(lohi0a, lohi0b);
__vector uint32 hi1 = vec_mergee(lohi1a, lohi1b);
#endif
#endif
ctr[0] = vec_xor(vec_xor(hi1, ctr[1]), key[0]); ctr[0] = vec_xor(vec_xor(hi1, ctr[1]), key[0]);
ctr[1] = lo1; ctr[1] = lo1;
...@@ -323,6 +339,7 @@ template<bool high> ...@@ -323,6 +339,7 @@ template<bool high>
QUALIFIERS __vector double _uniform_double_hq(__vector uint32 x, __vector uint32 y) QUALIFIERS __vector double _uniform_double_hq(__vector uint32 x, __vector uint32 y)
{ {
// convert 32 to 64 bit // convert 32 to 64 bit
#ifdef __LITTLE_ENDIAN__
if (high) if (high)
{ {
x = vec_mergel(x, vec_splats(0U)); x = vec_mergel(x, vec_splats(0U));
...@@ -333,13 +350,31 @@ QUALIFIERS __vector double _uniform_double_hq(__vector uint32 x, __vector uint32 ...@@ -333,13 +350,31 @@ QUALIFIERS __vector double _uniform_double_hq(__vector uint32 x, __vector uint32
x = vec_mergeh(x, vec_splats(0U)); x = vec_mergeh(x, vec_splats(0U));
y = vec_mergeh(y, vec_splats(0U)); y = vec_mergeh(y, vec_splats(0U));
} }
#else
if (high)
{
x = vec_mergel(vec_splats(0U), x);
y = vec_mergel(vec_splats(0U), y);
}
else
{
x = vec_mergeh(vec_splats(0U), x);
y = vec_mergeh(vec_splats(0U), y);
}
#endif
// calculate z = x ^ (y << (53 - 32)) // calculate z = x ^ (y << (53 - 32))
__vector uint64 z = vec_sl((__vector uint64) y, vec_splats(53ULL - 32ULL)); __vector uint64 z = vec_sl((__vector uint64) y, vec_splats(53ULL - 32ULL));
z = vec_xor((__vector uint64) x, z); z = vec_xor((__vector uint64) x, z);
// convert uint64 to double // convert uint64 to double
__vector double rs = __builtin_convertvector(z, __vector double); // vec_ctd(z, 0) is documented but not available #if defined(__has_builtin) && __has_builtin(__builtin_convertvector)
__vector double rs = __builtin_convertvector(z, __vector double);
#elif defined(__GNUC__) && __GNUC__ >= 8
__vector double rs = vec_ctf(z, 0);
#else
__vector double rs = vec_ctd(z, 0);
#endif
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0) // calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
rs = vec_madd(rs, vec_splats(TWOPOW53_INV_DOUBLE), vec_splats(TWOPOW53_INV_DOUBLE/2.0)); rs = vec_madd(rs, vec_splats(TWOPOW53_INV_DOUBLE), vec_splats(TWOPOW53_INV_DOUBLE/2.0));
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment