Merge remote-tracking branch 'origin/master' into arm64

267ce6a4 · Michael Kuron · cede566a · 30b55d00 · 267ce6a4 · 267ce6a4
Commit 267ce6a4 authored Jun 4, 2023 by Michael Kuron
--- a/pystencils/alignedarray.py
+++ b/pystencils/alignedarray.py
@@ -25,7 +25,8 @@ def aligned_empty(shape, byte_alignment=True, dtype=np.float64, byte_offset=0, o
            byte_alignment = 64
        elif byte_alignment == 'cacheline':
            cacheline_sizes = [get_cacheline_size(is_name) for is_name in instruction_sets]
-            if all([s is None for s in cacheline_sizes]):
+            if all([s is None for s in cacheline_sizes]) or \
+                    max([s for s in cacheline_sizes if s is not None]) > 0x100000:
                widths = [get_vector_instruction_set(dtype, is_name)['width'] * np.dtype(dtype).itemsize
                          for is_name in instruction_sets
                          if type(get_vector_instruction_set(dtype, is_name)['width']) is int]

--- a/pystencils/backends/simd_instruction_sets.py
+++ b/pystencils/backends/simd_instruction_sets.py
@@ -39,6 +39,8 @@ def get_supported_instruction_sets():
        return os.environ['PYSTENCILS_SIMD'].split(',')
    if platform.system() == 'Darwin' and platform.machine() == 'arm64':
        return ['neon']
+    elif platform.system() == 'Windows' and platform.machine() == 'ARM64':
+        return ['neon']
    elif platform.system() == 'Linux' and platform.machine() == 'aarch64':
        result = ['neon']  # Neon is mandatory on 64-bit ARM
        libc = CDLL('libc.so.6')

--- a/pystencils/cpu/cpujit.py
+++ b/pystencils/cpu/cpujit.py
@@ -157,6 +157,9 @@ def read_config():
            ('flags', '/Ox /fp:fast /OpenMP /arch:avx'),
            ('restrict_qualifier', '__restrict')
        ])
+        if platform.machine() == 'ARM64':
+            default_compiler_config['arch'] = 'ARM64'
+            default_compiler_config['flags'] = default_compiler_config['flags'].replace(' /arch:avx', '')
    elif platform.system().lower() == 'darwin':
        default_compiler_config = OrderedDict([
            ('os', 'darwin'),
@@ -391,7 +394,8 @@ def create_function_boilerplate_code(parameter_info, name, ast_node, insert_chec
                            has_nontemporal = has_nontemporal or any([a.args[0].field == field and a.args[3] for a in
                                                                      loop.atoms(VectorMemoryAccess)])
                        if has_openmp and has_nontemporal:
-                            byte_width = ast_node.instruction_set['cachelineSize']
+                            cl_size = ast_node.instruction_set['cachelineSize']
+                            byte_width = f"({cl_size}) < SIZE_MAX ? ({cl_size}) : ({byte_width})"
                    offset = max(max(ast_node.ghost_layers)) * item_size
                    offset_cond = f"(((uintptr_t) buffer_{field.name}.buf) + {offset}) % ({byte_width}) == 0"


--- a/pystencils/include/arm_neon_helpers.h
+++ b/pystencils/include/arm_neon_helpers.h
+#if defined(_MSC_VER)
+#define __ARM_NEON
+#endif
+
 #ifdef __ARM_NEON
 #include <arm_neon.h>
 #endif
@@ -32,10 +36,13 @@ inline int32x4_t makeVec_s32(int a, int b, int c, int d)
 #endif

 inline void cachelineZero(void * p) {
+#if !defined(_MSC_VER) || defined(__clang__)
 	__asm__ volatile("dc zva, %0"::"r"(p):"memory");
+#endif
 }

 inline size_t _cachelineSize() {
+#if !defined(_MSC_VER) || defined(__clang__)
 	// check that dc zva is permitted
 	uint64_t dczid;
 	__asm__ volatile ("mrs %0, dczid_el0" : "=r"(dczid));
@@ -72,6 +79,7 @@ inline size_t _cachelineSize() {
 			return size;
 		}
 	}
+#endif
 	
 	// too much was zeroed
 	return SIZE_MAX;

--- a/pystencils/include/myintrin.h
+++ b/pystencils/include/myintrin.h
 #pragma once

-#if defined(__SSE2__) || defined(_MSC_VER)
+#if defined(__SSE2__) || (defined(_MSC_VER) && !defined(_M_ARM64))
 QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v)
 {
 #ifdef __AVX512VL__
@@ -28,7 +28,7 @@ QUALIFIERS void _MY_TRANSPOSE4_EPI32(__m128i & R0, __m128i & R1, __m128i & R2, _
 }
 #endif

-#if defined(__SSE4_1__) || defined(_MSC_VER)
+#if defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
 #if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5 && !defined(__clang__)
 __attribute__((optimize("no-associative-math")))
 #endif

--- a/pystencils/include/philox_rand.h
+++ b/pystencils/include/philox_rand.h
 #ifndef __OPENCL_VERSION__
-#if defined(__SSE2__) || defined(_MSC_VER)
+#if defined(__SSE2__) || (defined(_MSC_VER) && !defined(_M_ARM64))
 #include <emmintrin.h> // SSE2
 #endif
 #ifdef __AVX2__
 #include <immintrin.h> // AVX*
-#elif defined(__SSE4_1__) || defined(_MSC_VER)
+#elif defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
 #include <smmintrin.h>  // SSE4
 #ifdef __FMA__
 #include <immintrin.h> // FMA
 #endif
 #endif

+#if defined(_MSC_VER) && defined(_M_ARM64)
+#define __ARM_NEON
+#endif
+
 #ifdef __ARM_NEON
 #include <arm_neon.h>
 #endif
@@ -183,7 +187,7 @@ QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3
 }

 #if !defined(__CUDA_ARCH__) && !defined(__OPENCL_VERSION__)
-#if defined(__SSE4_1__) || defined(_MSC_VER)
+#if defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
 QUALIFIERS void _philox4x32round(__m128i* ctr, __m128i* key)
 {
    __m128i lohi0a = _mm_mul_epu32(ctr[0], _mm_set1_epi32(PHILOX_M4x32_0));
@@ -665,12 +669,14 @@ QUALIFIERS void philox_float4(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32
    philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
 }

+#ifndef _MSC_VER
 QUALIFIERS void philox_float4(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 ctr3,
                              uint32 key0, uint32 key1,
                              float32x4_t & rnd1, float32x4_t & rnd2, float32x4_t & rnd3, float32x4_t & rnd4)
 {
    philox_float4(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2, rnd3, rnd4);
 }
+#endif

 QUALIFIERS void philox_double2(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32 ctr3,
                               uint32 key0, uint32 key1,
@@ -695,6 +701,7 @@ QUALIFIERS void philox_double2(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32
    philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore);
 }

+#ifndef _MSC_VER
 QUALIFIERS void philox_double2(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 ctr3,
                               uint32 key0, uint32 key1,
                               float64x2_t & rnd1, float64x2_t & rnd2)
@@ -702,6 +709,7 @@ QUALIFIERS void philox_double2(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32
    philox_double2(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2);
 }
 #endif
+#endif


 #if defined(__ARM_FEATURE_SVE)