diff --git a/pystencils/data_types.py b/pystencils/data_types.py
index 20953c7c810801c58407cdb02cc6d48fc8f73b5d..7300f01ea47f6693a2915b2cb063f907e46d1c32 100644
--- a/pystencils/data_types.py
+++ b/pystencils/data_types.py
@@ -697,7 +697,7 @@ class VectorType(Type):
         if self.instruction_set is None:
             return "%s[%d]" % (self.base_type, self.width)
         else:
-            if self.base_type == create_type("int64"):
+            if self.base_type == create_type("int64") or self.base_type == create_type("int32"):
                 return self.instruction_set['int']
             elif self.base_type == create_type("float64"):
                 return self.instruction_set['double']
diff --git a/pystencils/include/aesni_rand.h b/pystencils/include/aesni_rand.h
index 8af30e1aba3794e078554dff2d6cb4ac9e4081b6..4206e37f634e01586223e1412ce7836e2499f65f 100644
--- a/pystencils/include/aesni_rand.h
+++ b/pystencils/include/aesni_rand.h
@@ -21,6 +21,7 @@
 typedef std::uint32_t uint32;
 typedef std::uint64_t uint64;
 
+#if defined(__AES__) || defined(_MSC_VER)
 QUALIFIERS __m128i aesni_keygen_assist(__m128i temp1, __m128i temp2) {
     __m128i temp3;
     temp2 = _mm_shuffle_epi32(temp2 ,0xff);
@@ -302,6 +303,7 @@ QUALIFIERS void aesni_double2(uint32 ctr0, __m128i ctr1, uint32 ctr2, uint32 ctr
     __m128d ignore;
     aesni_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, key2, key3, rnd1, ignore, rnd2, ignore);
 }
+#endif
 
 
 #ifdef __AVX2__
diff --git a/pystencils/include/myintrin.h b/pystencils/include/myintrin.h
index 24b8c0967f6562365ac67dee3e896feb82c1fe81..a94c316c44de7aa420e4a6be807a510ee35687dd 100644
--- a/pystencils/include/myintrin.h
+++ b/pystencils/include/myintrin.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#ifdef __SSE2__
+#if defined(__SSE2__) || defined(_MSC_VER)
 QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v)
 {
 #ifdef __AVX512VL__
@@ -28,7 +28,7 @@ QUALIFIERS void _MY_TRANSPOSE4_EPI32(__m128i & R0, __m128i & R1, __m128i & R2, _
 }
 #endif
 
-#ifdef __SSE4_1__
+#if defined(__SSE4_1__) || defined(_MSC_VER)
 #if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5 && !defined(__clang__)
 __attribute__((optimize("no-associative-math")))
 #endif
diff --git a/pystencils/include/philox_rand.h b/pystencils/include/philox_rand.h
index 36ba7004f061ddd65378034023fddd425f2b7b69..b4c83669d0fda05aee1a7e018904c927a59c7ad1 100644
--- a/pystencils/include/philox_rand.h
+++ b/pystencils/include/philox_rand.h
@@ -1,6 +1,6 @@
 #include <cstdint>
 
-#ifdef __SSE4_1__
+#if defined(__SSE4_1__) || defined(_MSC_VER)
 #include <emmintrin.h> // SSE2
 #endif
 #ifdef __AVX2__
@@ -115,7 +115,7 @@ QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3
 }
 
 #ifndef __CUDA_ARCH__
-#ifdef __SSE4_1__
+#if defined(__SSE4_1__) || defined(_MSC_VER)
 QUALIFIERS void _philox4x32round(__m128i* ctr, __m128i* key)
 {
     __m128i lohi0a = _mm_mul_epu32(ctr[0], _mm_set1_epi32(PHILOX_M4x32_0));
diff --git a/pystencils_tests/test_random.py b/pystencils_tests/test_random.py
index a73d430d003ab772e719e7a8658b9d139a29901d..85b8d7ee66adb639bee818f49e1f7082ac227d80 100644
--- a/pystencils_tests/test_random.py
+++ b/pystencils_tests/test_random.py
@@ -5,11 +5,21 @@ import pytest
 import pystencils as ps
 from pystencils.rng import PhiloxFourFloats, PhiloxTwoDoubles, AESNIFourFloats, AESNITwoDoubles, random_symbol
 from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets
+from pystencils.cpu.cpujit import get_compiler_config
 from pystencils.data_types import TypedSymbol
 
 RNGs = {('philox', 'float'): PhiloxFourFloats, ('philox', 'double'): PhiloxTwoDoubles,
         ('aesni', 'float'): AESNIFourFloats, ('aesni', 'double'): AESNITwoDoubles}
 
+instruction_sets = get_supported_instruction_sets()
+if get_compiler_config()['os'] == 'windows':
+    # skip instruction sets supported by CPU but not the compiler
+    if '/arch:avx2' not in get_compiler_config()['flags'].lower() and \
+       '/arch:avx512' not in get_compiler_config()['flags'].lower():
+        instruction_sets.remove('avx')
+    if '/arch:avx512' not in get_compiler_config()['flags'].lower():
+        instruction_sets.remove('avx512')
+
 
 @pytest.mark.parametrize('target,rng', (('cpu', 'philox'), ('cpu', 'aesni'), ('gpu', 'philox')))
 @pytest.mark.parametrize('precision', ('float', 'double'))
@@ -89,20 +99,20 @@ def test_rng(target, rng, precision, dtype, t=124, offsets=(0, 0), keys=(0, 0),
 def test_rng_offsets(kind, vectorized):
     if vectorized:
         test = test_rng_vectorized
-        if not get_supported_instruction_sets():
+        if not instruction_sets:
             pytest.skip("cannot detect CPU instruction set")
     else:
         test = test_rng
     if kind == 'value':
-        test(get_supported_instruction_sets()[0] if vectorized else 'cpu', 'philox', 'float', 'float', t=8,
+        test(instruction_sets[0] if vectorized else 'cpu', 'philox', 'float', 'float', t=8,
             offsets=(6, 7), keys=(5, 309))
     elif kind == 'symbol':
         offsets = (TypedSymbol("x0", np.uint32), TypedSymbol("y0", np.uint32))
-        test(get_supported_instruction_sets()[0] if vectorized else 'cpu', 'philox', 'float', 'float', t=8,
+        test(instruction_sets[0] if vectorized else 'cpu', 'philox', 'float', 'float', t=8,
             offsets=offsets, offset_values=(6, 7), keys=(5, 309))
 
 
-@pytest.mark.parametrize('target', get_supported_instruction_sets())
+@pytest.mark.parametrize('target', instruction_sets)
 @pytest.mark.parametrize('rng', ('philox', 'aesni'))
 @pytest.mark.parametrize('precision,dtype', (('float', 'float'), ('double', 'double')))
 def test_rng_vectorized(target, rng, precision, dtype, t=130, offsets=(1, 3), keys=(0, 0), offset_values=None):
@@ -139,11 +149,11 @@ def test_rng_vectorized(target, rng, precision, dtype, t=130, offsets=(1, 3), ke
 def test_rng_symbol(vectorized):
     """Make sure that the RNG symbol generator generates symbols and that the resulting code compiles"""
     if vectorized:
-        if not get_supported_instruction_sets():
+        if not instruction_sets:
             pytest.skip("cannot detect CPU instruction set")
         else:
             cpu_vectorize_info = {'assume_inner_stride_one': True, 'assume_aligned': True,
-                                  'instruction_set': get_supported_instruction_sets()[0]}
+                                  'instruction_set': instruction_sets[0]}
     else:
         cpu_vectorize_info = None
 
@@ -155,7 +165,7 @@ def test_rng_symbol(vectorized):
         ac.main_assignments[i] = ps.Assignment(ac.main_assignments[i].lhs, next(rng_symbol_gen))
     symbols = [a.rhs for a in ac.main_assignments]
     assert len(symbols) == f.shape[-1] and len(set(symbols)) == f.shape[-1]
-    kernel = ps.create_kernel(ac, target=dh.default_target, cpu_vectorize_info=cpu_vectorize_info).compile()
+    ps.create_kernel(ac, target=dh.default_target, cpu_vectorize_info=cpu_vectorize_info).compile()
 
 
 @pytest.mark.parametrize('vectorized', (False, True))
@@ -171,11 +181,11 @@ def test_staggered(vectorized):
 
     if not vectorized:
         return
-    if not get_supported_instruction_sets():
+    if not instruction_sets:
         pytest.skip("cannot detect CPU instruction set")
     pytest.importorskip('islpy')
     cpu_vectorize_info = {'assume_inner_stride_one': True, 'assume_aligned': False,
-                          'instruction_set': get_supported_instruction_sets()[0]}
+                          'instruction_set': instruction_sets[0]}
 
     dh.fill(j.name, 867)
     dh.run_kernel(kernel, seed=5, time_step=309)