Merge branch 'master' of i10git.cs.fau.de:pycodegen/pystencils into philox-simd

0c927339 · Michael Kuron · 7c2af9ad · 71b8767b · 0c927339 · 0c927339
Commit 0c927339 authored 5 years ago by Michael Kuron
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -88,6 +88,19 @@ minimal-conda:
    - docker
+minimal-sympy-master:
+  stage: test
+  except:
+    variables:
+      - $ENABLE_NIGHTLY_BUILDS
+  image: i10git.cs.fau.de:5005/pycodegen/pycodegen/minimal_conda
+  script:
+    - python -m pip install --upgrade git+https://github.com/sympy/sympy.git
+    - python setup.py quicktest
+  allow_failure: true
+  tags:
+    - docker
 # -------------------- Linter & Documentation --------------------------------------------------------------------------

--- a/pystencils/cpu/cpujit.py
+++ b/pystencils/cpu/cpujit.py
@@ -173,7 +173,8 @@ def read_config():
        config = recursive_dict_update(config, loaded_config)
    else:
        create_folder(config_path, True)
-        json.dump(config, open(config_path, 'w'), indent=4)
+        with open(config_path, 'w') as f:
+            json.dump(config, f, indent=4)
    if config['cache']['object_cache'] is not False:
        config['cache']['object_cache'] = os.path.expanduser(config['cache']['object_cache']).format(pid=os.getpid())

--- a/pystencils/include/aesni_rand.h
+++ b/pystencils/include/aesni_rand.h
-#if !defined(__AES__) || !defined(__SSE2__)
+#if !defined(__AES__) || !defined(__SSE4_1__)
-#error AES-NI and SSE2 need to be enabled
+#error AES-NI and SSE4.1 need to be enabled
 #endif
 #include <emmintrin.h> // SSE2
 #include <wmmintrin.h> // AES
 #ifdef __AVX512VL__
 #include <immintrin.h> // AVX*
+#else
+#include <smmintrin.h>  // SSE4
+#ifdef __FMA__
+#include <immintrin.h> // FMA
+#endif
 #endif
 #include <cstdint>
@@ -44,14 +49,19 @@ QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v)
 #endif
 }
+#if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5
+__attribute__((optimize("no-associative-math")))
+#endif
 QUALIFIERS __m128d _my_cvtepu64_pd(const __m128i x)
 {
 #ifdef __AVX512VL__
    return _mm_cvtepu64_pd(x);
 #else
-    uint64 r[2];
+    __m128i xH = _mm_srli_epi64(x, 32);
-    _mm_storeu_si128((__m128i*)r, x);
+    xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.)));          //  2^84
-    return _mm_set_pd((double)r[1], (double)r[0]);
+    __m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0xcc);   //  2^52
+    __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.));     //  2^84 + 2^52
+    return _mm_add_pd(f, _mm_castsi128_pd(xL));
 #endif
 }
@@ -71,18 +81,22 @@ QUALIFIERS void aesni_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3
    y = _mm_srli_si128(y, 4);
    // calculate z = x ^ y << (53 - 32))
-    __m128i z = _mm_sll_epi64(y, _mm_set_epi64x(53 - 32, 53 - 32));
+    __m128i z = _mm_sll_epi64(y, _mm_set1_epi64x(53 - 32));
    z = _mm_xor_si128(x, z);
    // convert uint64 to double
    __m128d rs = _my_cvtepu64_pd(z);
    // calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
-    rs = _mm_mul_pd(rs, _mm_set_pd1(TWOPOW53_INV_DOUBLE));
+#ifdef __FMA__
-    rs = _mm_add_pd(rs, _mm_set_pd1(TWOPOW53_INV_DOUBLE/2.0));
+    rs = _mm_fmadd_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE), _mm_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
+#else
+    rs = _mm_mul_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE));
+    rs = _mm_add_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
+#endif
    // store result
-    double rr[2];
+    alignas(16) double rr[2];
-    _mm_storeu_pd(rr, rs);
+    _mm_store_pd(rr, rs);
    rnd1 = rr[0];
    rnd2 = rr[1];
 }
@@ -100,14 +114,19 @@ QUALIFIERS void aesni_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
    // convert uint32 to float
    __m128 rs = _my_cvtepu32_ps(c128);
    // calculate rs * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
-    rs = _mm_mul_ps(rs, _mm_set_ps1(TWOPOW32_INV_FLOAT));
+#ifdef __FMA__
-    rs = _mm_add_ps(rs, _mm_set_ps1(TWOPOW32_INV_FLOAT/2.0f));
+    rs = _mm_fmadd_ps(rs, _mm_set1_ps(TWOPOW32_INV_FLOAT), _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
+#else
+    rs = _mm_mul_ps(rs, _mm_set1_ps(TWOPOW32_INV_FLOAT));
+    rs = _mm_add_ps(rs, _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
+#endif
    // store result
-    float r[4];
+    alignas(16) float r[4];
-    _mm_storeu_ps(r, rs);
+    _mm_store_ps(r, rs);
    rnd1 = r[0];
    rnd2 = r[1];
    rnd3 = r[2];
    rnd4 = r[3];
 }
\ No newline at end of file
--- a/pystencils/rng.py
+++ b/pystencils/rng.py
@@ -111,3 +111,4 @@ def random_symbol(assignment_list, seed=TypedSymbol("seed", np.uint32), rng_node
                assignment_list.insert(0, node)
                inserted = True
            yield symbol
+        counter += 1