Michael Kuron · cede566a · f0e9cd00 · 6cb620bb · 8d99d156 · d01fc61c
pystencils/backends/simd_instruction_sets.py: + 41 − 56
--- a/pystencils/backends/simd_instruction_sets.py
+++ b/pystencils/backends/simd_instruction_sets.py
 @@ -9,6 +9,7 @@ from pystencils.backends.x86_instruction_sets import get_vector_instruction_set_
 from pystencils.backends.arm_instruction_sets import get_vector_instruction_set_arm
 from pystencils.backends.ppc_instruction_sets import get_vector_instruction_set_ppc
 from pystencils.backends.riscv_instruction_sets import get_vector_instruction_set_riscv
+from pystencils.cache import memorycache
 from pystencils.typing import numpy_name_to_c


 @@ -31,83 +32,68 @@ def get_vector_instruction_set(data_type='double', instruction_set='avx'):
        return get_vector_instruction_set_x86(type_name, instruction_set)


-_cache = None
-_cachelinesize = None
-
-
+@memorycache  # NOTE(review): replaces the removed _cache, which returned copies; check callers don't mutate the result
 def get_supported_instruction_sets():
    """List of supported instruction sets on current hardware, or None if query failed."""
-    global _cache
-    if _cache is not None:
-        return _cache.copy()
    if 'PYSTENCILS_SIMD' in os.environ:
        return os.environ['PYSTENCILS_SIMD'].split(',')
-    if (platform.system() == 'Darwin' or platform.system() == 'Linux') and platform.machine() == 'arm64':
-        # not supported by cpuinfo
+    if platform.system() == 'Darwin' and platform.machine() == 'arm64':  # Linux/arm64 is no longer matched here
        return ['neon']
    elif platform.system() == 'Windows' and platform.machine() == 'ARM64':
-        # not supported by cpuinfo
        return ['neon']
-    elif platform.system() == 'Linux' and platform.machine().startswith('riscv'):  # not supported by cpuinfo
+    elif platform.system() == 'Linux' and platform.machine() == 'aarch64':  # uses getauxval/prctl, not cpuinfo flags
+        result = ['neon']  # Neon is mandatory on 64-bit ARM
        libc = CDLL('libc.so.6')
        hwcap = libc.getauxval(16)  # AT_HWCAP
-        hwcap_isa_v = 1 << (ord('V') - ord('A'))  # COMPAT_HWCAP_ISA_V
-        return ['rvv'] if hwcap & hwcap_isa_v else []
-    elif platform.machine().startswith('ppc64'):  # no flags reported by cpuinfo
-        import subprocess
-        import tempfile
-        from pystencils.cpu.cpujit import get_compiler_config
-        f = tempfile.NamedTemporaryFile(suffix='.cpp')
-        command = [get_compiler_config()['command'], '-mcpu=native', '-dM', '-E', f.name]
-        macros = subprocess.check_output(command, input='', text=True)
-        if '#define __VSX__' in macros and '#define __ALTIVEC__' in macros:
-            _cache = ['vsx']
-        else:
-            _cache = []
-        return _cache.copy()
-    try:
-        from cpuinfo import get_cpu_info
-    except ImportError:
-        return None
-
-    result = []
-    required_sse_flags = {'sse', 'sse2', 'ssse3', 'sse4_1', 'sse4_2'}
-    required_avx_flags = {'avx', 'avx2'}
-    required_avx512_flags = {'avx512f'}
-    required_neon_flags = {'neon'}
-    required_sve_flags = {'sve'}
-    flags = set(get_cpu_info()['flags'])
-    if flags.issuperset(required_sse_flags):
-        result.append("sse")
-    if flags.issuperset(required_avx_flags):
-        result.append("avx")
-    if flags.issuperset(required_avx512_flags):
-        result.append("avx512")
-    if flags.issuperset(required_neon_flags):
-        result.append("neon")
-    if flags.issuperset(required_sve_flags):
-        if platform.system() == 'Linux':
-            libc = CDLL('libc.so.6')
+        if hwcap & (1 << 22):  # HWCAP_SVE; NOTE(review): VL below is not masked with PR_SVE_VL_LEN_MASK
            length = 8 * libc.prctl(51, 0, 0, 0, 0)  # PR_SVE_GET_VL
            if length < 0:
                raise OSError("SVE length query failed")
-            while length > 128:
+            while length >= 128:  # one sve<length> entry per halving, as long as length is at least 128
                result.append(f"sve{length}")
                length //= 2
-        result.append("sve")
-    return result
+            result.append("sve")  # unsuffixed name, appended only inside the HWCAP_SVE branch
+        return result
+    elif platform.system() == 'Linux' and platform.machine().startswith('riscv'):
+        libc = CDLL('libc.so.6')
+        hwcap = libc.getauxval(16)  # AT_HWCAP
+        hwcap_isa_v = 1 << (ord('V') - ord('A'))  # COMPAT_HWCAP_ISA_V
+        return ['rvv'] if hwcap & hwcap_isa_v else []
+    elif platform.system() == 'Linux' and platform.machine().startswith('ppc64'):
+        libc = CDLL('libc.so.6')  # hwcap query replaces the removed compiler-macro probe
+        hwcap = libc.getauxval(16)  # AT_HWCAP
+        return ['vsx'] if hwcap & 0x00000080 else []  # PPC_FEATURE_HAS_VSX
+    elif platform.machine() in ['x86_64', 'x86', 'AMD64', 'i386']:  # cpuinfo is consulted for x86 only
+        try:
+            from cpuinfo import get_cpu_info
+        except ImportError:
+            return None  # cpuinfo missing: the "query failed" case from the docstring
+
+        result = []
+        required_sse_flags = {'sse', 'sse2', 'ssse3', 'sse4_1', 'sse4_2'}
+        required_avx_flags = {'avx', 'avx2'}
+        required_avx512_flags = {'avx512f'}
+        flags = set(get_cpu_info()['flags'])
+        if flags.issuperset(required_sse_flags):
+            result.append("sse")
+        if flags.issuperset(required_avx_flags):
+            result.append("avx")
+        if flags.issuperset(required_avx512_flags):
+            result.append("avx512")
+        return result
+    else:  # any other platform is now rejected instead of falling through to cpuinfo
+        raise NotImplementedError('Instruction set detection for %s on %s is not implemented' %
+                                  (platform.system(), platform.machine()))


+@memorycache
 def get_cacheline_size(instruction_set):
    """Get the size (in bytes) of a cache block that can be zeroed without memory access.
       Usually, this is identical to the cache line size."""
-    global _cachelinesize
    
    instruction_sets = get_vector_instruction_set('double', instruction_set)
    if 'cachelineSize' not in instruction_sets:
        return None
-    if _cachelinesize is not None:
-        return _cachelinesize
    
    import pystencils as ps
    from pystencils.astnodes import SympyAssignment
 @@ -120,5 +106,4 @@ def get_cacheline_size(instruction_set):
    ast = ps.create_kernel(ass, cpu_vectorize_info={'instruction_set': instruction_set})
    kernel = ast.compile()
    kernel(**{f.name: arr, CachelineSize.symbol.name: 0})
-    _cachelinesize = int(arr[0, 0])
-    return _cachelinesize
+    return int(arr[0, 0])