diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index b76595e724da36e68aab389a2f15365c676cb854..37bc43f979110c4393eabc636793215d713bc1f7 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -88,6 +88,19 @@ minimal-conda: - docker +minimal-sympy-master: + stage: test + except: + variables: + - $ENABLE_NIGHTLY_BUILDS + image: i10git.cs.fau.de:5005/pycodegen/pycodegen/minimal_conda + script: + - python -m pip install --upgrade git+https://github.com/sympy/sympy.git + - python setup.py quicktest + allow_failure: true + tags: + - docker + # -------------------- Linter & Documentation -------------------------------------------------------------------------- diff --git a/pystencils/cpu/cpujit.py b/pystencils/cpu/cpujit.py index b69de887ba3324f0f6ae3fc1e7c3764b4ba4968b..00fe25c787035ca00ce4fb9a890046786ff4463e 100644 --- a/pystencils/cpu/cpujit.py +++ b/pystencils/cpu/cpujit.py @@ -173,7 +173,8 @@ def read_config(): config = recursive_dict_update(config, loaded_config) else: create_folder(config_path, True) - json.dump(config, open(config_path, 'w'), indent=4) + with open(config_path, 'w') as f: + json.dump(config, f, indent=4) if config['cache']['object_cache'] is not False: config['cache']['object_cache'] = os.path.expanduser(config['cache']['object_cache']).format(pid=os.getpid()) diff --git a/pystencils/include/aesni_rand.h b/pystencils/include/aesni_rand.h index 09327f27b8bc0b3cdd0b16cb8a64e3237b555797..1e2b05bd28a01d64dcbd709d4018dcb7adfd69cd 100644 --- a/pystencils/include/aesni_rand.h +++ b/pystencils/include/aesni_rand.h @@ -1,11 +1,16 @@ -#if !defined(__AES__) || !defined(__SSE2__) -#error AES-NI and SSE2 need to be enabled +#if !defined(__AES__) || !defined(__SSE4_1__) +#error AES-NI and SSE4.1 need to be enabled #endif #include <emmintrin.h> // SSE2 #include <wmmintrin.h> // AES #ifdef __AVX512VL__ #include <immintrin.h> // AVX* +#else +#include <smmintrin.h> // SSE4 +#ifdef __FMA__ +#include <immintrin.h> // FMA +#endif #endif #include <cstdint> @@ -44,14 +49,19 @@ QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v) #endif } +#if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5 +__attribute__((optimize("no-associative-math"))) +#endif QUALIFIERS __m128d _my_cvtepu64_pd(const __m128i x) { #ifdef __AVX512VL__ return _mm_cvtepu64_pd(x); #else - uint64 r[2]; - _mm_storeu_si128((__m128i*)r, x); - return _mm_set_pd((double)r[1], (double)r[0]); + __m128i xH = _mm_srli_epi64(x, 32); + xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.))); // 2^84 + __m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0xcc); // 2^52 + __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52 + return _mm_add_pd(f, _mm_castsi128_pd(xL)); #endif } @@ -71,18 +81,22 @@ QUALIFIERS void aesni_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3 y = _mm_srli_si128(y, 4); // calculate z = x ^ y << (53 - 32)) - __m128i z = _mm_sll_epi64(y, _mm_set_epi64x(53 - 32, 53 - 32)); + __m128i z = _mm_sll_epi64(y, _mm_set1_epi64x(53 - 32)); z = _mm_xor_si128(x, z); // convert uint64 to double __m128d rs = _my_cvtepu64_pd(z); // calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0) - rs = _mm_mul_pd(rs, _mm_set_pd1(TWOPOW53_INV_DOUBLE)); - rs = _mm_add_pd(rs, _mm_set_pd1(TWOPOW53_INV_DOUBLE/2.0)); +#ifdef __FMA__ + rs = _mm_fmadd_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE), _mm_set1_pd(TWOPOW53_INV_DOUBLE/2.0)); +#else + rs = _mm_mul_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE)); + rs = _mm_add_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE/2.0)); +#endif // store result - double rr[2]; - _mm_storeu_pd(rr, rs); + alignas(16) double rr[2]; + _mm_store_pd(rr, rs); rnd1 = rr[0]; rnd2 = rr[1]; } @@ -100,14 +114,19 @@ QUALIFIERS void aesni_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3, // convert uint32 to float __m128 rs = _my_cvtepu32_ps(c128); // calculate rs * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f) - rs = _mm_mul_ps(rs, _mm_set_ps1(TWOPOW32_INV_FLOAT)); - rs = _mm_add_ps(rs, _mm_set_ps1(TWOPOW32_INV_FLOAT/2.0f)); +#ifdef __FMA__ + rs = _mm_fmadd_ps(rs, _mm_set1_ps(TWOPOW32_INV_FLOAT), _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f)); +#else + rs = _mm_mul_ps(rs, _mm_set1_ps(TWOPOW32_INV_FLOAT)); + rs = _mm_add_ps(rs, _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f)); +#endif // store result - float r[4]; - _mm_storeu_ps(r, rs); + alignas(16) float r[4]; + _mm_store_ps(r, rs); rnd1 = r[0]; rnd2 = r[1]; rnd3 = r[2]; rnd4 = r[3]; -} \ No newline at end of file +} + diff --git a/pystencils/rng.py b/pystencils/rng.py index 4341a0b3906cb9505b1ffbb4c6070e9ea09aee60..5bc91b5678de9ec9e10bc43953604c4978eeaae7 100644 --- a/pystencils/rng.py +++ b/pystencils/rng.py @@ -111,3 +111,4 @@ def random_symbol(assignment_list, seed=TypedSymbol("seed", np.uint32), rng_node assignment_list.insert(0, node) inserted = True yield symbol + counter += 1