Skip to content
Snippets Groups Projects
Commit f0e9cd00 authored by Michael Kuron
Browse files

Remove cpuinfo dependency for SIMD detection on non-x86

parent 70afe477
No related branches found
No related tags found
1 merge request: !321 "Properly detect and enable vectorization on ARM"
Pipeline #53214 failed
...@@ -156,7 +156,7 @@ arm64v8: ...@@ -156,7 +156,7 @@ arm64v8:
extends: .multiarch_template extends: .multiarch_template
image: i10git.cs.fau.de:5005/pycodegen/pycodegen/arm64 image: i10git.cs.fau.de:5005/pycodegen/pycodegen/arm64
variables: variables:
PYSTENCILS_SIMD: "neon" QEMU_CPU: "cortex-a76"
before_script: before_script:
- *multiarch_before_script - *multiarch_before_script
- sed -i s/march=native/march=armv8-a/g ~/.config/pystencils/config.json - sed -i s/march=native/march=armv8-a/g ~/.config/pystencils/config.json
...@@ -164,8 +164,6 @@ arm64v8: ...@@ -164,8 +164,6 @@ arm64v8:
ppc64le: ppc64le:
extends: .multiarch_template extends: .multiarch_template
image: i10git.cs.fau.de:5005/pycodegen/pycodegen/ppc64le image: i10git.cs.fau.de:5005/pycodegen/pycodegen/ppc64le
variables:
PYSTENCILS_SIMD: "vsx"
before_script: before_script:
- *multiarch_before_script - *multiarch_before_script
- sed -i s/mcpu=native/mcpu=power8/g ~/.config/pystencils/config.json - sed -i s/mcpu=native/mcpu=power8/g ~/.config/pystencils/config.json
...@@ -174,8 +172,6 @@ arm64v9: ...@@ -174,8 +172,6 @@ arm64v9:
# SVE support is still unreliable in GCC 11 (incorrect code for fixed-width vectors, internal compiler errors). # SVE support is still unreliable in GCC 11 (incorrect code for fixed-width vectors, internal compiler errors).
extends: .multiarch_template extends: .multiarch_template
image: i10git.cs.fau.de:5005/pycodegen/pycodegen/arm64 image: i10git.cs.fau.de:5005/pycodegen/pycodegen/arm64
variables:
PYSTENCILS_SIMD: "sve128,sve256,sve512,sve"
before_script: before_script:
- *multiarch_before_script - *multiarch_before_script
- sed -i s/march=native/march=armv8-a+sve/g ~/.config/pystencils/config.json - sed -i s/march=native/march=armv8-a+sve/g ~/.config/pystencils/config.json
...@@ -187,6 +183,7 @@ riscv64: ...@@ -187,6 +183,7 @@ riscv64:
extends: .multiarch_template extends: .multiarch_template
image: i10git.cs.fau.de:5005/pycodegen/pycodegen/riscv64 image: i10git.cs.fau.de:5005/pycodegen/pycodegen/riscv64
variables: variables:
# explicitly set SIMD as detection does not appear to work on QEMU
PYSTENCILS_SIMD: "rvv" PYSTENCILS_SIMD: "rvv"
QEMU_CPU: "rv64,v=true" QEMU_CPU: "rv64,v=true"
before_script: before_script:
......
...@@ -9,6 +9,7 @@ from pystencils.backends.x86_instruction_sets import get_vector_instruction_set_ ...@@ -9,6 +9,7 @@ from pystencils.backends.x86_instruction_sets import get_vector_instruction_set_
from pystencils.backends.arm_instruction_sets import get_vector_instruction_set_arm from pystencils.backends.arm_instruction_sets import get_vector_instruction_set_arm
from pystencils.backends.ppc_instruction_sets import get_vector_instruction_set_ppc from pystencils.backends.ppc_instruction_sets import get_vector_instruction_set_ppc
from pystencils.backends.riscv_instruction_sets import get_vector_instruction_set_riscv from pystencils.backends.riscv_instruction_sets import get_vector_instruction_set_riscv
from pystencils.cache import memorycache
from pystencils.typing import numpy_name_to_c from pystencils.typing import numpy_name_to_c
...@@ -31,79 +32,66 @@ def get_vector_instruction_set(data_type='double', instruction_set='avx'): ...@@ -31,79 +32,66 @@ def get_vector_instruction_set(data_type='double', instruction_set='avx'):
return get_vector_instruction_set_x86(type_name, instruction_set) return get_vector_instruction_set_x86(type_name, instruction_set)
_cache = None @memorycache
_cachelinesize = None
def get_supported_instruction_sets(): def get_supported_instruction_sets():
"""List of supported instruction sets on current hardware, or None if query failed.""" """List of supported instruction sets on current hardware, or None if query failed."""
global _cache
if _cache is not None:
return _cache.copy()
if 'PYSTENCILS_SIMD' in os.environ: if 'PYSTENCILS_SIMD' in os.environ:
return os.environ['PYSTENCILS_SIMD'].split(',') return os.environ['PYSTENCILS_SIMD'].split(',')
if platform.system() == 'Darwin' and platform.machine() == 'arm64': # not supported by cpuinfo if platform.system() == 'Darwin' and platform.machine() == 'arm64':
return ['neon'] return ['neon']
elif platform.system() == 'Linux' and platform.machine().startswith('riscv'): # not supported by cpuinfo elif platform.system() == 'Linux' and platform.machine() == 'aarch64':
result = ['neon'] # Neon is mandatory on 64-bit ARM
libc = CDLL('libc.so.6') libc = CDLL('libc.so.6')
hwcap = libc.getauxval(16) # AT_HWCAP hwcap = libc.getauxval(16) # AT_HWCAP
hwcap_isa_v = 1 << (ord('V') - ord('A')) # COMPAT_HWCAP_ISA_V if hwcap & (1 << 22): # HWCAP_SVE
return ['rvv'] if hwcap & hwcap_isa_v else []
elif platform.machine().startswith('ppc64'): # no flags reported by cpuinfo
import subprocess
import tempfile
from pystencils.cpu.cpujit import get_compiler_config
f = tempfile.NamedTemporaryFile(suffix='.cpp')
command = [get_compiler_config()['command'], '-mcpu=native', '-dM', '-E', f.name]
macros = subprocess.check_output(command, input='', text=True)
if '#define __VSX__' in macros and '#define __ALTIVEC__' in macros:
_cache = ['vsx']
else:
_cache = []
return _cache.copy()
try:
from cpuinfo import get_cpu_info
except ImportError:
return None
result = []
required_sse_flags = {'sse', 'sse2', 'ssse3', 'sse4_1', 'sse4_2'}
required_avx_flags = {'avx', 'avx2'}
required_avx512_flags = {'avx512f'}
required_neon_flags = {'asimd'}
required_sve_flags = {'sve'}
flags = set(get_cpu_info()['flags'])
if flags.issuperset(required_sse_flags):
result.append("sse")
if flags.issuperset(required_avx_flags):
result.append("avx")
if flags.issuperset(required_avx512_flags):
result.append("avx512")
if flags.issuperset(required_neon_flags):
result.append("neon")
if flags.issuperset(required_sve_flags):
if platform.system() == 'Linux':
libc = CDLL('libc.so.6')
length = 8 * libc.prctl(51, 0, 0, 0, 0) # PR_SVE_GET_VL length = 8 * libc.prctl(51, 0, 0, 0, 0) # PR_SVE_GET_VL
if length < 0: if length < 0:
raise OSError("SVE length query failed") raise OSError("SVE length query failed")
while length > 128: while length >= 128:
result.append(f"sve{length}") result.append(f"sve{length}")
length //= 2 length //= 2
result.append("sve") result.append("sve")
return result return result
elif platform.system() == 'Linux' and platform.machine().startswith('riscv'):
libc = CDLL('libc.so.6')
hwcap = libc.getauxval(16) # AT_HWCAP
hwcap_isa_v = 1 << (ord('V') - ord('A')) # COMPAT_HWCAP_ISA_V
return ['rvv'] if hwcap & hwcap_isa_v else []
elif platform.system() == 'Linux' and platform.machine().startswith('ppc64'):
libc = CDLL('libc.so.6')
hwcap = libc.getauxval(16) # AT_HWCAP
return ['vsx'] if hwcap & 0x00000080 else [] # PPC_FEATURE_HAS_VSX
elif platform.machine() in ['x86_64', 'x86', 'AMD64', 'i386']:
try:
from cpuinfo import get_cpu_info
except ImportError:
return None
result = []
required_sse_flags = {'sse', 'sse2', 'ssse3', 'sse4_1', 'sse4_2'}
required_avx_flags = {'avx', 'avx2'}
required_avx512_flags = {'avx512f'}
flags = set(get_cpu_info()['flags'])
if flags.issuperset(required_sse_flags):
result.append("sse")
if flags.issuperset(required_avx_flags):
result.append("avx")
if flags.issuperset(required_avx512_flags):
result.append("avx512")
return result
else:
raise NotImplementedError('Instruction set detection for %s on %s is not implemented' %
(platform.system(), platform.machine()))
@memorycache
def get_cacheline_size(instruction_set): def get_cacheline_size(instruction_set):
"""Get the size (in bytes) of a cache block that can be zeroed without memory access. """Get the size (in bytes) of a cache block that can be zeroed without memory access.
Usually, this is identical to the cache line size.""" Usually, this is identical to the cache line size."""
global _cachelinesize
instruction_sets = get_vector_instruction_set('double', instruction_set) instruction_sets = get_vector_instruction_set('double', instruction_set)
if 'cachelineSize' not in instruction_sets: if 'cachelineSize' not in instruction_sets:
return None return None
if _cachelinesize is not None:
return _cachelinesize
import pystencils as ps import pystencils as ps
from pystencils.astnodes import SympyAssignment from pystencils.astnodes import SympyAssignment
...@@ -116,5 +104,4 @@ def get_cacheline_size(instruction_set): ...@@ -116,5 +104,4 @@ def get_cacheline_size(instruction_set):
ast = ps.create_kernel(ass, cpu_vectorize_info={'instruction_set': instruction_set}) ast = ps.create_kernel(ass, cpu_vectorize_info={'instruction_set': instruction_set})
kernel = ast.compile() kernel = ast.compile()
kernel(**{f.name: arr, CachelineSize.symbol.name: 0}) kernel(**{f.name: arr, CachelineSize.symbol.name: 0})
_cachelinesize = int(arr[0, 0]) return int(arr[0, 0])
return _cachelinesize
...@@ -172,8 +172,8 @@ def read_config(): ...@@ -172,8 +172,8 @@ def read_config():
default_compiler_config['flags'] += ' ' + libomp default_compiler_config['flags'] += ' ' + libomp
break break
else: else:
raise ValueError("The detection of the platform with platform.system() did not work. " raise NotImplementedError('Generation of default compiler flags for %s is not implemented' %
"Pystencils is only supported for linux, windows, and darwin platforms.") (platform.system(),))
default_cache_config = OrderedDict([ default_cache_config = OrderedDict([
('object_cache', os.path.join(user_cache_dir('pystencils'), 'objectcache')), ('object_cache', os.path.join(user_cache_dir('pystencils'), 'objectcache')),
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment