Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
No results found
Show changes
Commits on Source (11)
This diff is collapsed.
#pragma once
#ifdef __SSE2__
// Convert four packed unsigned 32-bit integers to single-precision floats.
//
// With AVX-512VL the native unsigned conversion is used. Otherwise only the
// signed conversion is available, so every lane is split into its upper
// 31 bits (value >> 1, logical shift) and its lowest bit. Both parts are
// non-negative as signed integers, are converted separately and recombined
// as 2 * float(value >> 1) + float(value & 1).
QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v)
{
#ifdef __AVX512VL__
    return _mm_cvtepu32_ps(v);
#else
    const __m128i lsb_mask = _mm_set1_epi32(1);
    const __m128 low_bit = _mm_cvtepi32_ps(_mm_and_si128(v, lsb_mask));
    const __m128 upper = _mm_cvtepi32_ps(_mm_srli_epi32(v, 1));
    // Doubling first and adding the low bit last keeps the original
    // evaluation order of the floating-point additions.
    const __m128 upper_doubled = _mm_add_ps(upper, upper);
    return _mm_add_ps(upper_doubled, low_bit);
#endif
}
// In-place transpose of a 4x4 matrix of 32-bit integers whose rows are held
// in R0..R3: on return, Rk contains what used to be column k.
QUALIFIERS void _MY_TRANSPOSE4_EPI32(__m128i & R0, __m128i & R1, __m128i & R2, __m128i & R3)
{
    // Step 1: interleave the 32-bit elements of row pairs (0,1) and (2,3).
    // All four temporaries are formed before any row is overwritten.
    const __m128i lo01 = _mm_unpacklo_epi32(R0, R1);
    const __m128i hi01 = _mm_unpackhi_epi32(R0, R1);
    const __m128i lo23 = _mm_unpacklo_epi32(R2, R3);
    const __m128i hi23 = _mm_unpackhi_epi32(R2, R3);

    // Step 2: stitch the matching 64-bit halves together to form the columns.
    R0 = _mm_unpacklo_epi64(lo01, lo23);
    R1 = _mm_unpackhi_epi64(lo01, lo23);
    R2 = _mm_unpacklo_epi64(hi01, hi23);
    R3 = _mm_unpackhi_epi64(hi01, hi23);
}
#endif
#ifdef __SSE4_1__
// Convert two packed unsigned 64-bit integers to double precision.
//
// The native instruction behind _mm_cvtepu64_pd needs AVX512DQ *and*
// AVX512VL, so the fast path is only taken when both feature macros are
// defined; checking __AVX512VL__ alone breaks builds that enable VL
// without DQ.
//
// The fallback assembles the result from two exactly representable doubles
// and relies on the subtraction and the addition being evaluated in the
// written order. For that reason GCC >= 5 is told not to reassociate
// floating-point math in this function.
#if !(defined(__AVX512VL__) && defined(__AVX512DQ__)) && defined(__GNUC__) && __GNUC__ >= 5
__attribute__((optimize("no-associative-math")))
#endif
QUALIFIERS __m128d _my_cvtepu64_pd(const __m128i x)
{
#if defined(__AVX512VL__) && defined(__AVX512DQ__)
    return _mm_cvtepu64_pd(x);
#else
    // High 32 bits of each lane, moved into the low mantissa bits of a
    // double whose exponent encodes 2^84: value = 2^84 + hi * 2^32.
    __m128i xH = _mm_srli_epi64(x, 32);
    xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.))); // 2^84
    // Low 32 bits of each lane kept, upper 32 bits replaced by those of the
    // double 2^52 (mask 0xcc picks 16-bit words 2,3,6,7 from the constant):
    // value = 2^52 + lo.
    __m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0xcc); // 2^52
    // Remove both biases from the high part first, then add the low part.
    __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
    return _mm_add_pd(f, _mm_castsi128_pd(xL));
#endif
}
#endif
#ifdef __AVX__
// Build a 256-bit integer vector from two 128-bit halves: `lo` becomes the
// lower 128 bits and `hi` is inserted as the upper 128 bits.
QUALIFIERS __m256i _my256_set_m128i(__m128i hi, __m128i lo)
{
    // The cast only fixes the lower half; the upper half is filled by the
    // insert that follows.
    const __m256i widened = _mm256_castsi128_si256(lo);
    return _mm256_insertf128_si256(widened, hi, 1);
}
#endif
#ifdef __AVX2__
// Convert eight packed unsigned 32-bit integers to single-precision floats.
//
// With AVX-512VL the native unsigned conversion is used. Otherwise every
// lane is split into its upper 31 bits (value >> 1, logical shift) and its
// lowest bit, both of which are non-negative as signed integers. They are
// converted with the signed conversion and recombined as
// 2 * float(value >> 1) + float(value & 1).
QUALIFIERS __m256 _my256_cvtepu32_ps(const __m256i v)
{
#ifdef __AVX512VL__
    return _mm256_cvtepu32_ps(v);
#else
    const __m256i lsb_mask = _mm256_set1_epi32(1);
    const __m256 low_bit = _mm256_cvtepi32_ps(_mm256_and_si256(v, lsb_mask));
    const __m256 upper = _mm256_cvtepi32_ps(_mm256_srli_epi32(v, 1));
    // Doubling first and adding the low bit last keeps the original
    // evaluation order of the floating-point additions.
    const __m256 upper_doubled = _mm256_add_ps(upper, upper);
    return _mm256_add_ps(upper_doubled, low_bit);
#endif
}
// Convert four packed unsigned 64-bit integers to double precision.
//
// The native instruction behind _mm256_cvtepu64_pd needs AVX512DQ *and*
// AVX512VL, so the fast path is only taken when both feature macros are
// defined; checking __AVX512VL__ alone breaks builds that enable VL
// without DQ.
//
// The fallback assembles the result from two exactly representable doubles
// and relies on the subtraction and the addition being evaluated in the
// written order. For that reason GCC >= 5 is told not to reassociate
// floating-point math in this function.
#if !(defined(__AVX512VL__) && defined(__AVX512DQ__)) && defined(__GNUC__) && __GNUC__ >= 5
__attribute__((optimize("no-associative-math")))
#endif
QUALIFIERS __m256d _my256_cvtepu64_pd(const __m256i x)
{
#if defined(__AVX512VL__) && defined(__AVX512DQ__)
    return _mm256_cvtepu64_pd(x);
#else
    // High 32 bits of each lane, moved into the low mantissa bits of a
    // double whose exponent encodes 2^84: value = 2^84 + hi * 2^32.
    __m256i xH = _mm256_srli_epi64(x, 32);
    xH = _mm256_or_si256(xH, _mm256_castpd_si256(_mm256_set1_pd(19342813113834066795298816.))); // 2^84
    // Low 32 bits of each lane kept, upper 32 bits replaced by those of the
    // double 2^52. The 8-bit blend mask is applied to each 128-bit half, so
    // 0xcc picks the upper two 16-bit words of every 64-bit lane from the
    // constant: value = 2^52 + lo.
    __m256i xL = _mm256_blend_epi16(x, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)), 0xcc); // 2^52
    // Remove both biases from the high part first, then add the low part.
    __m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
    return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
#endif
}
#endif
#ifdef __AVX512F__
// Build a 512-bit integer vector from four 128-bit parts. `a` occupies the
// lowest 128 bits, followed by `b`, `c` and `d` in ascending positions
// (argument order is most-significant first).
QUALIFIERS __m512i _my512_set_m128i(__m128i d, __m128i c, __m128i b, __m128i a)
{
    // Start from `a` in position 0, then fill positions 1..3 one at a time.
    __m512i packed = _mm512_castsi128_si512(a);
    packed = _mm512_inserti32x4(packed, b, 1);
    packed = _mm512_inserti32x4(packed, c, 2);
    packed = _mm512_inserti32x4(packed, d, 3);
    return packed;
}
#endif
This diff is collapsed.
...@@ -6,16 +6,23 @@ from pystencils.astnodes import LoopOverCoordinate ...@@ -6,16 +6,23 @@ from pystencils.astnodes import LoopOverCoordinate
from pystencils.backends.cbackend import CustomCodeNode from pystencils.backends.cbackend import CustomCodeNode
def _get_rng_template(name, data_type, num_vars): def _data_type_to_str(data_type):
if data_type is np.float32: if data_type is np.float32:
c_type = "float" return "float"
elif data_type is np.float64: elif data_type is np.float64:
c_type = "double" return "double"
elif type(data_type) is str:
return data_type
raise ValueError("%s is not a supported data type" % (data_type, ))
def _get_rng_template(name, data_type, num_vars):
c_type = _data_type_to_str(data_type)
template = "\n" template = "\n"
for i in range(num_vars): for i in range(num_vars):
template += "{{result_symbols[{}].dtype}} {{result_symbols[{}].name}};\n".format(i, i) template += "{} {{result_symbols[{}].name}};\n".format(c_type, i, i)
template += ("{}_{}{}({{parameters}}, " + ", ".join(["{{result_symbols[{}].name}}"] * num_vars) + ");\n") \ template += ("{}({{parameters}}, " + ", ".join(["{{result_symbols[{}].name}}"] * num_vars) + ");\n") \
.format(name, c_type, num_vars, *tuple(range(num_vars))) .format(name, *tuple(range(num_vars)))
return template return template
...@@ -23,7 +30,7 @@ def _get_rng_code(template, dialect, vector_instruction_set, time_step, offsets, ...@@ -23,7 +30,7 @@ def _get_rng_code(template, dialect, vector_instruction_set, time_step, offsets,
parameters = [time_step] + [LoopOverCoordinate.get_loop_counter_symbol(i) + offsets[i] parameters = [time_step] + [LoopOverCoordinate.get_loop_counter_symbol(i) + offsets[i]
for i in range(dim)] + [0] * (3 - dim) + list(keys) for i in range(dim)] + [0] * (3 - dim) + list(keys)
if dialect == 'cuda' or (dialect == 'c' and vector_instruction_set is None): if dialect == 'cuda' or dialect == 'c':
return template.format(parameters=', '.join(str(p) for p in parameters), return template.format(parameters=', '.join(str(p) for p in parameters),
result_symbols=result_symbols) result_symbols=result_symbols)
else: else:
...@@ -44,7 +51,7 @@ class RNGBase(CustomCodeNode): ...@@ -44,7 +51,7 @@ class RNGBase(CustomCodeNode):
super().__init__("", symbols_read=symbols_read, symbols_defined=self.result_symbols) super().__init__("", symbols_read=symbols_read, symbols_defined=self.result_symbols)
self._time_step = time_step self._time_step = time_step
self._offsets = offsets self._offsets = offsets
self.headers = ['"{}_rand.h"'.format(self._name)] self.headers = ['"{}_rand.h"'.format(self._name.split('_')[0])]
self.keys = tuple(keys) self.keys = tuple(keys)
self._args = sp.sympify((dim, time_step, keys)) self._args = sp.sympify((dim, time_step, keys))
self._dim = dim self._dim = dim
...@@ -65,7 +72,11 @@ class RNGBase(CustomCodeNode): ...@@ -65,7 +72,11 @@ class RNGBase(CustomCodeNode):
return self # nothing to replace inside this node - would destroy intermediate "dummy" by re-creating them return self # nothing to replace inside this node - would destroy intermediate "dummy" by re-creating them
def get_code(self, dialect, vector_instruction_set): def get_code(self, dialect, vector_instruction_set):
template = _get_rng_template(self._name, self._data_type, self._num_vars) if vector_instruction_set:
template = _get_rng_template(self._name, vector_instruction_set[_data_type_to_str(self._data_type)],
self._num_vars)
else:
template = _get_rng_template(self._name, self._data_type, self._num_vars)
return _get_rng_code(template, dialect, vector_instruction_set, return _get_rng_code(template, dialect, vector_instruction_set,
self._time_step, self._offsets, self.keys, self._dim, self.result_symbols) self._time_step, self._offsets, self.keys, self._dim, self.result_symbols)
...@@ -74,28 +85,28 @@ class RNGBase(CustomCodeNode): ...@@ -74,28 +85,28 @@ class RNGBase(CustomCodeNode):
class PhiloxTwoDoubles(RNGBase): class PhiloxTwoDoubles(RNGBase):
_name = "philox" _name = "philox_double2"
_data_type = np.float64 _data_type = np.float64
_num_vars = 2 _num_vars = 2
_num_keys = 2 _num_keys = 2
class PhiloxFourFloats(RNGBase): class PhiloxFourFloats(RNGBase):
_name = "philox" _name = "philox_float4"
_data_type = np.float32 _data_type = np.float32
_num_vars = 4 _num_vars = 4
_num_keys = 2 _num_keys = 2
class AESNITwoDoubles(RNGBase): class AESNITwoDoubles(RNGBase):
_name = "aesni" _name = "aesni_double2"
_data_type = np.float64 _data_type = np.float64
_num_vars = 2 _num_vars = 2
_num_keys = 4 _num_keys = 4
class AESNIFourFloats(RNGBase): class AESNIFourFloats(RNGBase):
_name = "aesni" _name = "aesni_float4"
_data_type = np.float32 _data_type = np.float32
_num_vars = 4 _num_vars = 4
_num_keys = 4 _num_keys = 4
......