Skip to content
Snippets Groups Projects
Commit 587b23f6 authored by Michael Kuron's avatar Michael Kuron :mortar_board:
Browse files

Support Windows on ARM64

parent e356d3d7
No related branches found
No related tags found
No related merge requests found
Pipeline #53210 failed
......@@ -25,7 +25,8 @@ def aligned_empty(shape, byte_alignment=True, dtype=np.float64, byte_offset=0, o
byte_alignment = 64
elif byte_alignment == 'cacheline':
cacheline_sizes = [get_cacheline_size(is_name) for is_name in instruction_sets]
if all([s is None for s in cacheline_sizes]):
if all([s is None for s in cacheline_sizes]) or \
max([s for s in cacheline_sizes if s is not None]) > 0x100000:
widths = [get_vector_instruction_set(dtype, is_name)['width'] * np.dtype(dtype).itemsize
for is_name in instruction_sets
if type(get_vector_instruction_set(dtype, is_name)['width']) is int]
......
......@@ -45,6 +45,9 @@ def get_supported_instruction_sets():
if (platform.system() == 'Darwin' or platform.system() == 'Linux') and platform.machine() == 'arm64':
# not supported by cpuinfo
return ['neon']
elif platform.system() == 'Windows' and platform.machine() == 'ARM64':
# not supported by cpuinfo
return ['neon']
elif platform.system() == 'Linux' and platform.machine().startswith('riscv'): # not supported by cpuinfo
libc = CDLL('libc.so.6')
hwcap = libc.getauxval(16) # AT_HWCAP
......
......@@ -159,6 +159,9 @@ def read_config():
('flags', '/Ox /fp:fast /OpenMP /arch:avx'),
('restrict_qualifier', '__restrict')
])
if platform.machine() == 'ARM64':
default_compiler_config['arch'] = 'ARM64'
default_compiler_config['flags'] = default_compiler_config['flags'].replace(' /arch:avx', '')
elif platform.system().lower() == 'darwin':
default_compiler_config = OrderedDict([
('os', 'darwin'),
......@@ -393,7 +396,8 @@ def create_function_boilerplate_code(parameter_info, name, ast_node, insert_chec
has_nontemporal = has_nontemporal or any([a.args[0].field == field and a.args[3] for a in
loop.atoms(VectorMemoryAccess)])
if has_openmp and has_nontemporal:
byte_width = ast_node.instruction_set['cachelineSize']
cl_size = ast_node.instruction_set['cachelineSize']
byte_width = f"({cl_size}) < SIZE_MAX ? ({cl_size}) : ({byte_width})"
offset = max(max(ast_node.ghost_layers)) * item_size
offset_cond = f"(((uintptr_t) buffer_{field.name}.buf) + {offset}) % ({byte_width}) == 0"
......
#if defined(_MSC_VER)
#define __ARM_NEON
#endif
#ifdef __ARM_NEON
#include <arm_neon.h>
#endif
......@@ -32,10 +36,13 @@ inline int32x4_t makeVec_s32(int a, int b, int c, int d)
#endif
inline void cachelineZero(void * p) {
#ifndef _MSC_VER
__asm__ volatile("dc zva, %0"::"r"(p):"memory");
#endif
}
inline size_t _cachelineSize() {
#ifndef _MSC_VER
// check that dc zva is permitted
uint64_t dczid;
__asm__ volatile ("mrs %0, dczid_el0" : "=r"(dczid));
......@@ -72,6 +79,7 @@ inline size_t _cachelineSize() {
return size;
}
}
#endif
// too much was zeroed
return SIZE_MAX;
......
#pragma once
#if defined(__SSE2__) || defined(_MSC_VER)
#if defined(__SSE2__) || (defined(_MSC_VER) && !defined(_M_ARM64))
QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v)
{
#ifdef __AVX512VL__
......@@ -28,7 +28,7 @@ QUALIFIERS void _MY_TRANSPOSE4_EPI32(__m128i & R0, __m128i & R1, __m128i & R2, _
}
#endif
#if defined(__SSE4_1__) || defined(_MSC_VER)
#if defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
#if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5 && !defined(__clang__)
__attribute__((optimize("no-associative-math")))
#endif
......
#ifndef __OPENCL_VERSION__
#if defined(__SSE2__) || defined(_MSC_VER)
#if defined(__SSE2__) || (defined(_MSC_VER) && !defined(_M_ARM64))
#include <emmintrin.h> // SSE2
#endif
#ifdef __AVX2__
#include <immintrin.h> // AVX*
#elif defined(__SSE4_1__) || defined(_MSC_VER)
#elif defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
#include <smmintrin.h> // SSE4
#ifdef __FMA__
#include <immintrin.h> // FMA
#endif
#endif
#if defined(_MSC_VER) && defined(_M_ARM64)
#define __ARM_NEON
#endif
#ifdef __ARM_NEON
#include <arm_neon.h>
#endif
......@@ -183,7 +187,7 @@ QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3
}
#if !defined(__CUDA_ARCH__) && !defined(__OPENCL_VERSION__)
#if defined(__SSE4_1__) || defined(_MSC_VER)
#if defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
QUALIFIERS void _philox4x32round(__m128i* ctr, __m128i* key)
{
__m128i lohi0a = _mm_mul_epu32(ctr[0], _mm_set1_epi32(PHILOX_M4x32_0));
......@@ -665,12 +669,14 @@ QUALIFIERS void philox_float4(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32
philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
}
#ifndef _MSC_VER
QUALIFIERS void philox_float4(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
float32x4_t & rnd1, float32x4_t & rnd2, float32x4_t & rnd3, float32x4_t & rnd4)
{
philox_float4(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2, rnd3, rnd4);
}
#endif
QUALIFIERS void philox_double2(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
......@@ -695,6 +701,7 @@ QUALIFIERS void philox_double2(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32
philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore);
}
#ifndef _MSC_VER
QUALIFIERS void philox_double2(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
float64x2_t & rnd1, float64x2_t & rnd2)
......@@ -702,6 +709,7 @@ QUALIFIERS void philox_double2(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32
philox_double2(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2);
}
#endif
#endif
#if defined(__ARM_FEATURE_SVE)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment