Merge branch 'arm64' into 'master'

Properly detect and enable vectorization on ARM. See merge request pycodegen/pystencils!321.

Merge branch 'arm64' into 'master'
178b4df7 · Helen Schottenhamml · 30b55d00 · 267ce6a4 · 178b4df7 · 178b4df7
Commit 178b4df7 authored 2 years ago by Helen Schottenhamml
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -156,7 +156,7 @@ arm64v8:
  extends: .multiarch_template
  image: i10git.cs.fau.de:5005/pycodegen/pycodegen/arm64
  variables:
-    PYSTENCILS_SIMD: "neon"
+    QEMU_CPU: "cortex-a76"
  before_script:
    - *multiarch_before_script
    - sed -i s/march=native/march=armv8-a/g ~/.config/pystencils/config.json
@@ -164,8 +164,6 @@ arm64v8:
 ppc64le:
  extends: .multiarch_template
  image: i10git.cs.fau.de:5005/pycodegen/pycodegen/ppc64le
-  variables:
-    PYSTENCILS_SIMD: "vsx"
  before_script:
    - *multiarch_before_script
    - sed -i s/mcpu=native/mcpu=power8/g ~/.config/pystencils/config.json
@@ -174,8 +172,6 @@ arm64v9:
  # SVE support is still unreliable in GCC 11 (incorrect code for fixed-width vectors, internal compiler errors).
  extends: .multiarch_template
  image: i10git.cs.fau.de:5005/pycodegen/pycodegen/arm64
-  variables:
-    PYSTENCILS_SIMD: "sve128,sve256,sve512,sve"
  before_script:
    - *multiarch_before_script
    - sed -i s/march=native/march=armv8-a+sve/g ~/.config/pystencils/config.json
@@ -187,6 +183,7 @@ riscv64:
  extends: .multiarch_template
  image: i10git.cs.fau.de:5005/pycodegen/pycodegen/riscv64
  variables:
+    # explicitly set SIMD as detection does not appear to work on QEMU
    PYSTENCILS_SIMD: "rvv"
    QEMU_CPU: "rv64,v=true"
  before_script:

--- a/pystencils/backends/simd_instruction_sets.py
+++ b/pystencils/backends/simd_instruction_sets.py
@@ -9,6 +9,7 @@ from pystencils.backends.x86_instruction_sets import get_vector_instruction_set_
 from pystencils.backends.arm_instruction_sets import get_vector_instruction_set_arm
 from pystencils.backends.ppc_instruction_sets import get_vector_instruction_set_ppc
 from pystencils.backends.riscv_instruction_sets import get_vector_instruction_set_riscv
+from pystencils.cache import memorycache
 from pystencils.typing import numpy_name_to_c
@@ -31,83 +32,68 @@ def get_vector_instruction_set(data_type='double', instruction_set='avx'):
        return get_vector_instruction_set_x86(type_name, instruction_set)
-_cache = None
+@memorycache
-_cachelinesize = None
 def get_supported_instruction_sets():
    """List of supported instruction sets on current hardware, or None if query failed."""
-    global _cache
-    if _cache is not None:
-        return _cache.copy()
    if 'PYSTENCILS_SIMD' in os.environ:
        return os.environ['PYSTENCILS_SIMD'].split(',')
-    if (platform.system() == 'Darwin' or platform.system() == 'Linux') and platform.machine() == 'arm64':
+    if platform.system() == 'Darwin' and platform.machine() == 'arm64':
-        # not supported by cpuinfo
        return ['neon']
    elif platform.system() == 'Windows' and platform.machine() == 'ARM64':
-        # not supported by cpuinfo
        return ['neon']
-    elif platform.system() == 'Linux' and platform.machine().startswith('riscv'):  # not supported by cpuinfo
+    elif platform.system() == 'Linux' and platform.machine() == 'aarch64':
+        result = ['neon']  # Neon is mandatory on 64-bit ARM
        libc = CDLL('libc.so.6')
        hwcap = libc.getauxval(16)  # AT_HWCAP
-        hwcap_isa_v = 1 << (ord('V') - ord('A'))  # COMPAT_HWCAP_ISA_V
+        if hwcap & (1 << 22):  # HWCAP_SVE
-        return ['rvv'] if hwcap & hwcap_isa_v else []
-    elif platform.machine().startswith('ppc64'):  # no flags reported by cpuinfo
-        import subprocess
-        import tempfile
-        from pystencils.cpu.cpujit import get_compiler_config
-        f = tempfile.NamedTemporaryFile(suffix='.cpp')
-        command = [get_compiler_config()['command'], '-mcpu=native', '-dM', '-E', f.name]
-        macros = subprocess.check_output(command, input='', text=True)
-        if '#define __VSX__' in macros and '#define __ALTIVEC__' in macros:
-            _cache = ['vsx']
-        else:
-            _cache = []
-        return _cache.copy()
-    try:
-        from cpuinfo import get_cpu_info
-    except ImportError:
-        return None
-    result = []
-    required_sse_flags = {'sse', 'sse2', 'ssse3', 'sse4_1', 'sse4_2'}
-    required_avx_flags = {'avx', 'avx2'}
-    required_avx512_flags = {'avx512f'}
-    required_neon_flags = {'neon'}
-    required_sve_flags = {'sve'}
-    flags = set(get_cpu_info()['flags'])
-    if flags.issuperset(required_sse_flags):
-        result.append("sse")
-    if flags.issuperset(required_avx_flags):
-        result.append("avx")
-    if flags.issuperset(required_avx512_flags):
-        result.append("avx512")
-    if flags.issuperset(required_neon_flags):
-        result.append("neon")
-    if flags.issuperset(required_sve_flags):
-        if platform.system() == 'Linux':
-            libc = CDLL('libc.so.6')
            length = 8 * libc.prctl(51, 0, 0, 0, 0)  # PR_SVE_GET_VL
            if length < 0:
                raise OSError("SVE length query failed")
-            while length > 128:
+            while length >= 128:
                result.append(f"sve{length}")
                length //= 2
-        result.append("sve")
+            result.append("sve")
-    return result
+        return result
+    elif platform.system() == 'Linux' and platform.machine().startswith('riscv'):
+        libc = CDLL('libc.so.6')
+        hwcap = libc.getauxval(16)  # AT_HWCAP
+        hwcap_isa_v = 1 << (ord('V') - ord('A'))  # COMPAT_HWCAP_ISA_V
+        return ['rvv'] if hwcap & hwcap_isa_v else []
+    elif platform.system() == 'Linux' and platform.machine().startswith('ppc64'):
+        libc = CDLL('libc.so.6')
+        hwcap = libc.getauxval(16)  # AT_HWCAP
+        return ['vsx'] if hwcap & 0x00000080 else []  # PPC_FEATURE_HAS_VSX
+    elif platform.machine() in ['x86_64', 'x86', 'AMD64', 'i386']:
+        try:
+            from cpuinfo import get_cpu_info
+        except ImportError:
+            return None
+        result = []
+        required_sse_flags = {'sse', 'sse2', 'ssse3', 'sse4_1', 'sse4_2'}
+        required_avx_flags = {'avx', 'avx2'}
+        required_avx512_flags = {'avx512f'}
+        flags = set(get_cpu_info()['flags'])
+        if flags.issuperset(required_sse_flags):
+            result.append("sse")
+        if flags.issuperset(required_avx_flags):
+            result.append("avx")
+        if flags.issuperset(required_avx512_flags):
+            result.append("avx512")
+        return result
+    else:
+        raise NotImplementedError('Instruction set detection for %s on %s is not implemented' %
+                                  (platform.system(), platform.machine()))
+@memorycache
 def get_cacheline_size(instruction_set):
    """Get the size (in bytes) of a cache block that can be zeroed without memory access.
       Usually, this is identical to the cache line size."""
-    global _cachelinesize
    instruction_sets = get_vector_instruction_set('double', instruction_set)
    if 'cachelineSize' not in instruction_sets:
        return None
-    if _cachelinesize is not None:
-        return _cachelinesize
    import pystencils as ps
    from pystencils.astnodes import SympyAssignment
@@ -120,5 +106,4 @@ def get_cacheline_size(instruction_set):
    ast = ps.create_kernel(ass, cpu_vectorize_info={'instruction_set': instruction_set})
    kernel = ast.compile()
    kernel(**{f.name: arr, CachelineSize.symbol.name: 0})
-    _cachelinesize = int(arr[0, 0])
+    return int(arr[0, 0])
-    return _cachelinesize
--- a/pystencils/cpu/cpujit.py
+++ b/pystencils/cpu/cpujit.py
@@ -146,9 +146,7 @@ def read_config():
            ('flags', '-Ofast -DNDEBUG -fPIC -march=native -fopenmp -std=c++11'),
            ('restrict_qualifier', '__restrict__')
        ])
-        if platform.machine() == 'arm64':
+        if platform.machine().startswith('ppc64') or platform.machine() == 'arm64':
-            default_compiler_config['flags'] = default_compiler_config['flags'].replace('-march=native', '')
-        elif platform.machine().startswith('ppc64'):
            default_compiler_config['flags'] = default_compiler_config['flags'].replace('-march=native',
                                                                                        '-mcpu=native')
    elif platform.system().lower() == 'windows':
@@ -177,8 +175,8 @@ def read_config():
                default_compiler_config['flags'] += ' ' + libomp
                break
    else:
-        raise ValueError("The detection of the platform with platform.system() did not work. "
+        raise NotImplementedError('Generation of default compiler flags for %s is not implemented' %
-                         "Pystencils is only supported for linux, windows, and darwin platforms.")
+                                  (platform.system(),))
    default_cache_config = OrderedDict([
        ('object_cache', os.path.join(user_cache_dir('pystencils'), 'objectcache')),