Skip to content
Snippets Groups Projects
Commit 178b4df7 authored by Helen Schottenhamml
Browse files

Merge branch 'arm64' into 'master'

Properly detect and enable vectorization on ARM

See merge request pycodegen/pystencils!321
parents 30b55d00 267ce6a4
No related branches found
No related tags found
No related merge requests found
......@@ -156,7 +156,7 @@ arm64v8:
extends: .multiarch_template
image: i10git.cs.fau.de:5005/pycodegen/pycodegen/arm64
variables:
PYSTENCILS_SIMD: "neon"
QEMU_CPU: "cortex-a76"
before_script:
- *multiarch_before_script
- sed -i s/march=native/march=armv8-a/g ~/.config/pystencils/config.json
......@@ -164,8 +164,6 @@ arm64v8:
ppc64le:
extends: .multiarch_template
image: i10git.cs.fau.de:5005/pycodegen/pycodegen/ppc64le
variables:
PYSTENCILS_SIMD: "vsx"
before_script:
- *multiarch_before_script
- sed -i s/mcpu=native/mcpu=power8/g ~/.config/pystencils/config.json
......@@ -174,8 +172,6 @@ arm64v9:
# SVE support is still unreliable in GCC 11 (incorrect code for fixed-width vectors, internal compiler errors).
extends: .multiarch_template
image: i10git.cs.fau.de:5005/pycodegen/pycodegen/arm64
variables:
PYSTENCILS_SIMD: "sve128,sve256,sve512,sve"
before_script:
- *multiarch_before_script
- sed -i s/march=native/march=armv8-a+sve/g ~/.config/pystencils/config.json
......@@ -187,6 +183,7 @@ riscv64:
extends: .multiarch_template
image: i10git.cs.fau.de:5005/pycodegen/pycodegen/riscv64
variables:
# explicitly set SIMD as detection does not appear to work on QEMU
PYSTENCILS_SIMD: "rvv"
QEMU_CPU: "rv64,v=true"
before_script:
......
......@@ -9,6 +9,7 @@ from pystencils.backends.x86_instruction_sets import get_vector_instruction_set_
from pystencils.backends.arm_instruction_sets import get_vector_instruction_set_arm
from pystencils.backends.ppc_instruction_sets import get_vector_instruction_set_ppc
from pystencils.backends.riscv_instruction_sets import get_vector_instruction_set_riscv
from pystencils.cache import memorycache
from pystencils.typing import numpy_name_to_c
......@@ -31,83 +32,68 @@ def get_vector_instruction_set(data_type='double', instruction_set='avx'):
return get_vector_instruction_set_x86(type_name, instruction_set)
_cache = None
_cachelinesize = None
@memorycache
def get_supported_instruction_sets():
"""List of supported instruction sets on current hardware, or None if query failed."""
global _cache
if _cache is not None:
return _cache.copy()
if 'PYSTENCILS_SIMD' in os.environ:
return os.environ['PYSTENCILS_SIMD'].split(',')
if (platform.system() == 'Darwin' or platform.system() == 'Linux') and platform.machine() == 'arm64':
# not supported by cpuinfo
if platform.system() == 'Darwin' and platform.machine() == 'arm64':
return ['neon']
elif platform.system() == 'Windows' and platform.machine() == 'ARM64':
# not supported by cpuinfo
return ['neon']
elif platform.system() == 'Linux' and platform.machine().startswith('riscv'): # not supported by cpuinfo
elif platform.system() == 'Linux' and platform.machine() == 'aarch64':
result = ['neon'] # Neon is mandatory on 64-bit ARM
libc = CDLL('libc.so.6')
hwcap = libc.getauxval(16) # AT_HWCAP
hwcap_isa_v = 1 << (ord('V') - ord('A')) # COMPAT_HWCAP_ISA_V
return ['rvv'] if hwcap & hwcap_isa_v else []
elif platform.machine().startswith('ppc64'): # no flags reported by cpuinfo
import subprocess
import tempfile
from pystencils.cpu.cpujit import get_compiler_config
f = tempfile.NamedTemporaryFile(suffix='.cpp')
command = [get_compiler_config()['command'], '-mcpu=native', '-dM', '-E', f.name]
macros = subprocess.check_output(command, input='', text=True)
if '#define __VSX__' in macros and '#define __ALTIVEC__' in macros:
_cache = ['vsx']
else:
_cache = []
return _cache.copy()
try:
from cpuinfo import get_cpu_info
except ImportError:
return None
result = []
required_sse_flags = {'sse', 'sse2', 'ssse3', 'sse4_1', 'sse4_2'}
required_avx_flags = {'avx', 'avx2'}
required_avx512_flags = {'avx512f'}
required_neon_flags = {'neon'}
required_sve_flags = {'sve'}
flags = set(get_cpu_info()['flags'])
if flags.issuperset(required_sse_flags):
result.append("sse")
if flags.issuperset(required_avx_flags):
result.append("avx")
if flags.issuperset(required_avx512_flags):
result.append("avx512")
if flags.issuperset(required_neon_flags):
result.append("neon")
if flags.issuperset(required_sve_flags):
if platform.system() == 'Linux':
libc = CDLL('libc.so.6')
if hwcap & (1 << 22): # HWCAP_SVE
length = 8 * libc.prctl(51, 0, 0, 0, 0) # PR_SVE_GET_VL
if length < 0:
raise OSError("SVE length query failed")
while length > 128:
while length >= 128:
result.append(f"sve{length}")
length //= 2
result.append("sve")
return result
result.append("sve")
return result
elif platform.system() == 'Linux' and platform.machine().startswith('riscv'):
libc = CDLL('libc.so.6')
hwcap = libc.getauxval(16) # AT_HWCAP
hwcap_isa_v = 1 << (ord('V') - ord('A')) # COMPAT_HWCAP_ISA_V
return ['rvv'] if hwcap & hwcap_isa_v else []
elif platform.system() == 'Linux' and platform.machine().startswith('ppc64'):
libc = CDLL('libc.so.6')
hwcap = libc.getauxval(16) # AT_HWCAP
return ['vsx'] if hwcap & 0x00000080 else [] # PPC_FEATURE_HAS_VSX
elif platform.machine() in ['x86_64', 'x86', 'AMD64', 'i386']:
try:
from cpuinfo import get_cpu_info
except ImportError:
return None
result = []
required_sse_flags = {'sse', 'sse2', 'ssse3', 'sse4_1', 'sse4_2'}
required_avx_flags = {'avx', 'avx2'}
required_avx512_flags = {'avx512f'}
flags = set(get_cpu_info()['flags'])
if flags.issuperset(required_sse_flags):
result.append("sse")
if flags.issuperset(required_avx_flags):
result.append("avx")
if flags.issuperset(required_avx512_flags):
result.append("avx512")
return result
else:
raise NotImplementedError('Instruction set detection for %s on %s is not implemented' %
(platform.system(), platform.machine()))
@memorycache
def get_cacheline_size(instruction_set):
"""Get the size (in bytes) of a cache block that can be zeroed without memory access.
Usually, this is identical to the cache line size."""
global _cachelinesize
instruction_sets = get_vector_instruction_set('double', instruction_set)
if 'cachelineSize' not in instruction_sets:
return None
if _cachelinesize is not None:
return _cachelinesize
import pystencils as ps
from pystencils.astnodes import SympyAssignment
......@@ -120,5 +106,4 @@ def get_cacheline_size(instruction_set):
ast = ps.create_kernel(ass, cpu_vectorize_info={'instruction_set': instruction_set})
kernel = ast.compile()
kernel(**{f.name: arr, CachelineSize.symbol.name: 0})
_cachelinesize = int(arr[0, 0])
return _cachelinesize
return int(arr[0, 0])
......@@ -146,9 +146,7 @@ def read_config():
('flags', '-Ofast -DNDEBUG -fPIC -march=native -fopenmp -std=c++11'),
('restrict_qualifier', '__restrict__')
])
if platform.machine() == 'arm64':
default_compiler_config['flags'] = default_compiler_config['flags'].replace('-march=native', '')
elif platform.machine().startswith('ppc64'):
if platform.machine().startswith('ppc64') or platform.machine() == 'arm64':
default_compiler_config['flags'] = default_compiler_config['flags'].replace('-march=native',
'-mcpu=native')
elif platform.system().lower() == 'windows':
......@@ -177,8 +175,8 @@ def read_config():
default_compiler_config['flags'] += ' ' + libomp
break
else:
raise ValueError("The detection of the platform with platform.system() did not work. "
"Pystencils is only supported for linux, windows, and darwin platforms.")
raise NotImplementedError('Generation of default compiler flags for %s is not implemented' %
(platform.system(),))
default_cache_config = OrderedDict([
('object_cache', os.path.join(user_cache_dir('pystencils'), 'objectcache')),
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment