Skip to content
Snippets Groups Projects
Commit 267ce6a4 authored by Michael Kuron's avatar Michael Kuron :mortar_board:
Browse files

Merge remote-tracking branch 'origin/master' into arm64

parents cede566a 30b55d00
Branches
Tags
No related merge requests found
...@@ -25,7 +25,8 @@ def aligned_empty(shape, byte_alignment=True, dtype=np.float64, byte_offset=0, o ...@@ -25,7 +25,8 @@ def aligned_empty(shape, byte_alignment=True, dtype=np.float64, byte_offset=0, o
byte_alignment = 64 byte_alignment = 64
elif byte_alignment == 'cacheline': elif byte_alignment == 'cacheline':
cacheline_sizes = [get_cacheline_size(is_name) for is_name in instruction_sets] cacheline_sizes = [get_cacheline_size(is_name) for is_name in instruction_sets]
if all([s is None for s in cacheline_sizes]): if all([s is None for s in cacheline_sizes]) or \
max([s for s in cacheline_sizes if s is not None]) > 0x100000:
widths = [get_vector_instruction_set(dtype, is_name)['width'] * np.dtype(dtype).itemsize widths = [get_vector_instruction_set(dtype, is_name)['width'] * np.dtype(dtype).itemsize
for is_name in instruction_sets for is_name in instruction_sets
if type(get_vector_instruction_set(dtype, is_name)['width']) is int] if type(get_vector_instruction_set(dtype, is_name)['width']) is int]
......
...@@ -39,6 +39,8 @@ def get_supported_instruction_sets(): ...@@ -39,6 +39,8 @@ def get_supported_instruction_sets():
return os.environ['PYSTENCILS_SIMD'].split(',') return os.environ['PYSTENCILS_SIMD'].split(',')
if platform.system() == 'Darwin' and platform.machine() == 'arm64': if platform.system() == 'Darwin' and platform.machine() == 'arm64':
return ['neon'] return ['neon']
elif platform.system() == 'Windows' and platform.machine() == 'ARM64':
return ['neon']
elif platform.system() == 'Linux' and platform.machine() == 'aarch64': elif platform.system() == 'Linux' and platform.machine() == 'aarch64':
result = ['neon'] # Neon is mandatory on 64-bit ARM result = ['neon'] # Neon is mandatory on 64-bit ARM
libc = CDLL('libc.so.6') libc = CDLL('libc.so.6')
......
...@@ -157,6 +157,9 @@ def read_config(): ...@@ -157,6 +157,9 @@ def read_config():
('flags', '/Ox /fp:fast /OpenMP /arch:avx'), ('flags', '/Ox /fp:fast /OpenMP /arch:avx'),
('restrict_qualifier', '__restrict') ('restrict_qualifier', '__restrict')
]) ])
if platform.machine() == 'ARM64':
default_compiler_config['arch'] = 'ARM64'
default_compiler_config['flags'] = default_compiler_config['flags'].replace(' /arch:avx', '')
elif platform.system().lower() == 'darwin': elif platform.system().lower() == 'darwin':
default_compiler_config = OrderedDict([ default_compiler_config = OrderedDict([
('os', 'darwin'), ('os', 'darwin'),
...@@ -391,7 +394,8 @@ def create_function_boilerplate_code(parameter_info, name, ast_node, insert_chec ...@@ -391,7 +394,8 @@ def create_function_boilerplate_code(parameter_info, name, ast_node, insert_chec
has_nontemporal = has_nontemporal or any([a.args[0].field == field and a.args[3] for a in has_nontemporal = has_nontemporal or any([a.args[0].field == field and a.args[3] for a in
loop.atoms(VectorMemoryAccess)]) loop.atoms(VectorMemoryAccess)])
if has_openmp and has_nontemporal: if has_openmp and has_nontemporal:
byte_width = ast_node.instruction_set['cachelineSize'] cl_size = ast_node.instruction_set['cachelineSize']
byte_width = f"({cl_size}) < SIZE_MAX ? ({cl_size}) : ({byte_width})"
offset = max(max(ast_node.ghost_layers)) * item_size offset = max(max(ast_node.ghost_layers)) * item_size
offset_cond = f"(((uintptr_t) buffer_{field.name}.buf) + {offset}) % ({byte_width}) == 0" offset_cond = f"(((uintptr_t) buffer_{field.name}.buf) + {offset}) % ({byte_width}) == 0"
......
#if defined(_MSC_VER)
#define __ARM_NEON
#endif
#ifdef __ARM_NEON #ifdef __ARM_NEON
#include <arm_neon.h> #include <arm_neon.h>
#endif #endif
...@@ -32,10 +36,13 @@ inline int32x4_t makeVec_s32(int a, int b, int c, int d) ...@@ -32,10 +36,13 @@ inline int32x4_t makeVec_s32(int a, int b, int c, int d)
#endif #endif
inline void cachelineZero(void * p) { inline void cachelineZero(void * p) {
#if !defined(_MSC_VER) || defined(__clang__)
__asm__ volatile("dc zva, %0"::"r"(p):"memory"); __asm__ volatile("dc zva, %0"::"r"(p):"memory");
#endif
} }
inline size_t _cachelineSize() { inline size_t _cachelineSize() {
#if !defined(_MSC_VER) || defined(__clang__)
// check that dc zva is permitted // check that dc zva is permitted
uint64_t dczid; uint64_t dczid;
__asm__ volatile ("mrs %0, dczid_el0" : "=r"(dczid)); __asm__ volatile ("mrs %0, dczid_el0" : "=r"(dczid));
...@@ -72,6 +79,7 @@ inline size_t _cachelineSize() { ...@@ -72,6 +79,7 @@ inline size_t _cachelineSize() {
return size; return size;
} }
} }
#endif
// too much was zeroed // too much was zeroed
return SIZE_MAX; return SIZE_MAX;
......
#pragma once #pragma once
#if defined(__SSE2__) || defined(_MSC_VER) #if defined(__SSE2__) || (defined(_MSC_VER) && !defined(_M_ARM64))
QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v) QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v)
{ {
#ifdef __AVX512VL__ #ifdef __AVX512VL__
...@@ -28,7 +28,7 @@ QUALIFIERS void _MY_TRANSPOSE4_EPI32(__m128i & R0, __m128i & R1, __m128i & R2, _ ...@@ -28,7 +28,7 @@ QUALIFIERS void _MY_TRANSPOSE4_EPI32(__m128i & R0, __m128i & R1, __m128i & R2, _
} }
#endif #endif
#if defined(__SSE4_1__) || defined(_MSC_VER) #if defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
#if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5 && !defined(__clang__) #if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5 && !defined(__clang__)
__attribute__((optimize("no-associative-math"))) __attribute__((optimize("no-associative-math")))
#endif #endif
......
#ifndef __OPENCL_VERSION__ #ifndef __OPENCL_VERSION__
#if defined(__SSE2__) || defined(_MSC_VER) #if defined(__SSE2__) || (defined(_MSC_VER) && !defined(_M_ARM64))
#include <emmintrin.h> // SSE2 #include <emmintrin.h> // SSE2
#endif #endif
#ifdef __AVX2__ #ifdef __AVX2__
#include <immintrin.h> // AVX* #include <immintrin.h> // AVX*
#elif defined(__SSE4_1__) || defined(_MSC_VER) #elif defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
#include <smmintrin.h> // SSE4 #include <smmintrin.h> // SSE4
#ifdef __FMA__ #ifdef __FMA__
#include <immintrin.h> // FMA #include <immintrin.h> // FMA
#endif #endif
#endif #endif
#if defined(_MSC_VER) && defined(_M_ARM64)
#define __ARM_NEON
#endif
#ifdef __ARM_NEON #ifdef __ARM_NEON
#include <arm_neon.h> #include <arm_neon.h>
#endif #endif
...@@ -183,7 +187,7 @@ QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3 ...@@ -183,7 +187,7 @@ QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3
} }
#if !defined(__CUDA_ARCH__) && !defined(__OPENCL_VERSION__) #if !defined(__CUDA_ARCH__) && !defined(__OPENCL_VERSION__)
#if defined(__SSE4_1__) || defined(_MSC_VER) #if defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
QUALIFIERS void _philox4x32round(__m128i* ctr, __m128i* key) QUALIFIERS void _philox4x32round(__m128i* ctr, __m128i* key)
{ {
__m128i lohi0a = _mm_mul_epu32(ctr[0], _mm_set1_epi32(PHILOX_M4x32_0)); __m128i lohi0a = _mm_mul_epu32(ctr[0], _mm_set1_epi32(PHILOX_M4x32_0));
...@@ -665,12 +669,14 @@ QUALIFIERS void philox_float4(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32 ...@@ -665,12 +669,14 @@ QUALIFIERS void philox_float4(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32
philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4); philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
} }
#ifndef _MSC_VER
QUALIFIERS void philox_float4(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 ctr3, QUALIFIERS void philox_float4(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key0, uint32 key1,
float32x4_t & rnd1, float32x4_t & rnd2, float32x4_t & rnd3, float32x4_t & rnd4) float32x4_t & rnd1, float32x4_t & rnd2, float32x4_t & rnd3, float32x4_t & rnd4)
{ {
philox_float4(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2, rnd3, rnd4); philox_float4(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2, rnd3, rnd4);
} }
#endif
QUALIFIERS void philox_double2(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32 ctr3, QUALIFIERS void philox_double2(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key0, uint32 key1,
...@@ -695,6 +701,7 @@ QUALIFIERS void philox_double2(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32 ...@@ -695,6 +701,7 @@ QUALIFIERS void philox_double2(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32
philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore); philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore);
} }
#ifndef _MSC_VER
QUALIFIERS void philox_double2(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 ctr3, QUALIFIERS void philox_double2(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key0, uint32 key1,
float64x2_t & rnd1, float64x2_t & rnd2) float64x2_t & rnd1, float64x2_t & rnd2)
...@@ -702,6 +709,7 @@ QUALIFIERS void philox_double2(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 ...@@ -702,6 +709,7 @@ QUALIFIERS void philox_double2(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32
philox_double2(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2); philox_double2(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2);
} }
#endif #endif
#endif
#if defined(__ARM_FEATURE_SVE) #if defined(__ARM_FEATURE_SVE)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment