Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
pystencils
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
pycodegen
pystencils
Commits
f0e9cd00
Commit
f0e9cd00
authored
2 years ago
by
Michael Kuron
Browse files
Options
Downloads
Patches
Plain Diff
Remove cpuinfo dependency for SIMD detection on non-x86
parent
70afe477
No related branches found
No related tags found
1 merge request
!321
Properly detect and enable vectorization on ARM
Pipeline
#53214
failed
2 years ago
Stage: pretest
Stage: test
Changes
3
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
.gitlab-ci.yml
+2
-5
2 additions, 5 deletions
.gitlab-ci.yml
pystencils/backends/simd_instruction_sets.py
+41
-54
41 additions, 54 deletions
pystencils/backends/simd_instruction_sets.py
pystencils/cpu/cpujit.py
+2
-2
2 additions, 2 deletions
pystencils/cpu/cpujit.py
with
45 additions
and
61 deletions
.gitlab-ci.yml
+
2
−
5
View file @
f0e9cd00
...
@@ -156,7 +156,7 @@ arm64v8:
...
@@ -156,7 +156,7 @@ arm64v8:
extends
:
.multiarch_template
extends
:
.multiarch_template
image
:
i10git.cs.fau.de:5005/pycodegen/pycodegen/arm64
image
:
i10git.cs.fau.de:5005/pycodegen/pycodegen/arm64
variables
:
variables
:
PYSTENCILS_SIMD
:
"
neon
"
QEMU_CPU
:
"
cortex-a76
"
before_script
:
before_script
:
-
*multiarch_before_script
-
*multiarch_before_script
-
sed -i s/march=native/march=armv8-a/g ~/.config/pystencils/config.json
-
sed -i s/march=native/march=armv8-a/g ~/.config/pystencils/config.json
...
@@ -164,8 +164,6 @@ arm64v8:
...
@@ -164,8 +164,6 @@ arm64v8:
ppc64le
:
ppc64le
:
extends
:
.multiarch_template
extends
:
.multiarch_template
image
:
i10git.cs.fau.de:5005/pycodegen/pycodegen/ppc64le
image
:
i10git.cs.fau.de:5005/pycodegen/pycodegen/ppc64le
variables
:
PYSTENCILS_SIMD
:
"
vsx"
before_script
:
before_script
:
-
*multiarch_before_script
-
*multiarch_before_script
-
sed -i s/mcpu=native/mcpu=power8/g ~/.config/pystencils/config.json
-
sed -i s/mcpu=native/mcpu=power8/g ~/.config/pystencils/config.json
...
@@ -174,8 +172,6 @@ arm64v9:
...
@@ -174,8 +172,6 @@ arm64v9:
# SVE support is still unreliable in GCC 11 (incorrect code for fixed-width vectors, internal compiler errors).
# SVE support is still unreliable in GCC 11 (incorrect code for fixed-width vectors, internal compiler errors).
extends
:
.multiarch_template
extends
:
.multiarch_template
image
:
i10git.cs.fau.de:5005/pycodegen/pycodegen/arm64
image
:
i10git.cs.fau.de:5005/pycodegen/pycodegen/arm64
variables
:
PYSTENCILS_SIMD
:
"
sve128,sve256,sve512,sve"
before_script
:
before_script
:
-
*multiarch_before_script
-
*multiarch_before_script
-
sed -i s/march=native/march=armv8-a+sve/g ~/.config/pystencils/config.json
-
sed -i s/march=native/march=armv8-a+sve/g ~/.config/pystencils/config.json
...
@@ -187,6 +183,7 @@ riscv64:
...
@@ -187,6 +183,7 @@ riscv64:
extends
:
.multiarch_template
extends
:
.multiarch_template
image
:
i10git.cs.fau.de:5005/pycodegen/pycodegen/riscv64
image
:
i10git.cs.fau.de:5005/pycodegen/pycodegen/riscv64
variables
:
variables
:
# explicitly set SIMD as detection does not appear to work on QEMU
PYSTENCILS_SIMD
:
"
rvv"
PYSTENCILS_SIMD
:
"
rvv"
QEMU_CPU
:
"
rv64,v=true"
QEMU_CPU
:
"
rv64,v=true"
before_script
:
before_script
:
...
...
This diff is collapsed.
Click to expand it.
pystencils/backends/simd_instruction_sets.py
+
41
−
54
View file @
f0e9cd00
...
@@ -9,6 +9,7 @@ from pystencils.backends.x86_instruction_sets import get_vector_instruction_set_
...
@@ -9,6 +9,7 @@ from pystencils.backends.x86_instruction_sets import get_vector_instruction_set_
from
pystencils.backends.arm_instruction_sets
import
get_vector_instruction_set_arm
from
pystencils.backends.arm_instruction_sets
import
get_vector_instruction_set_arm
from
pystencils.backends.ppc_instruction_sets
import
get_vector_instruction_set_ppc
from
pystencils.backends.ppc_instruction_sets
import
get_vector_instruction_set_ppc
from
pystencils.backends.riscv_instruction_sets
import
get_vector_instruction_set_riscv
from
pystencils.backends.riscv_instruction_sets
import
get_vector_instruction_set_riscv
from
pystencils.cache
import
memorycache
from
pystencils.typing
import
numpy_name_to_c
from
pystencils.typing
import
numpy_name_to_c
...
@@ -31,79 +32,66 @@ def get_vector_instruction_set(data_type='double', instruction_set='avx'):
...
@@ -31,79 +32,66 @@ def get_vector_instruction_set(data_type='double', instruction_set='avx'):
return
get_vector_instruction_set_x86
(
type_name
,
instruction_set
)
return
get_vector_instruction_set_x86
(
type_name
,
instruction_set
)
_cache
=
None
@memorycache
_cachelinesize
=
None
def
get_supported_instruction_sets
():
def
get_supported_instruction_sets
():
"""
List of supported instruction sets on current hardware, or None if query failed.
"""
"""
List of supported instruction sets on current hardware, or None if query failed.
"""
global
_cache
if
_cache
is
not
None
:
return
_cache
.
copy
()
if
'
PYSTENCILS_SIMD
'
in
os
.
environ
:
if
'
PYSTENCILS_SIMD
'
in
os
.
environ
:
return
os
.
environ
[
'
PYSTENCILS_SIMD
'
].
split
(
'
,
'
)
return
os
.
environ
[
'
PYSTENCILS_SIMD
'
].
split
(
'
,
'
)
if
platform
.
system
()
==
'
Darwin
'
and
platform
.
machine
()
==
'
arm64
'
:
# not supported by cpuinfo
if
platform
.
system
()
==
'
Darwin
'
and
platform
.
machine
()
==
'
arm64
'
:
return
[
'
neon
'
]
return
[
'
neon
'
]
elif
platform
.
system
()
==
'
Linux
'
and
platform
.
machine
().
startswith
(
'
riscv
'
):
# not supported by cpuinfo
elif
platform
.
system
()
==
'
Linux
'
and
platform
.
machine
()
==
'
aarch64
'
:
result
=
[
'
neon
'
]
# Neon is mandatory on 64-bit ARM
libc
=
CDLL
(
'
libc.so.6
'
)
libc
=
CDLL
(
'
libc.so.6
'
)
hwcap
=
libc
.
getauxval
(
16
)
# AT_HWCAP
hwcap
=
libc
.
getauxval
(
16
)
# AT_HWCAP
hwcap_isa_v
=
1
<<
(
ord
(
'
V
'
)
-
ord
(
'
A
'
))
# COMPAT_HWCAP_ISA_V
if
hwcap
&
(
1
<<
22
):
# HWCAP_SVE
return
[
'
rvv
'
]
if
hwcap
&
hwcap_isa_v
else
[]
elif
platform
.
machine
().
startswith
(
'
ppc64
'
):
# no flags reported by cpuinfo
import
subprocess
import
tempfile
from
pystencils.cpu.cpujit
import
get_compiler_config
f
=
tempfile
.
NamedTemporaryFile
(
suffix
=
'
.cpp
'
)
command
=
[
get_compiler_config
()[
'
command
'
],
'
-mcpu=native
'
,
'
-dM
'
,
'
-E
'
,
f
.
name
]
macros
=
subprocess
.
check_output
(
command
,
input
=
''
,
text
=
True
)
if
'
#define __VSX__
'
in
macros
and
'
#define __ALTIVEC__
'
in
macros
:
_cache
=
[
'
vsx
'
]
else
:
_cache
=
[]
return
_cache
.
copy
()
try
:
from
cpuinfo
import
get_cpu_info
except
ImportError
:
return
None
result
=
[]
required_sse_flags
=
{
'
sse
'
,
'
sse2
'
,
'
ssse3
'
,
'
sse4_1
'
,
'
sse4_2
'
}
required_avx_flags
=
{
'
avx
'
,
'
avx2
'
}
required_avx512_flags
=
{
'
avx512f
'
}
required_neon_flags
=
{
'
asimd
'
}
required_sve_flags
=
{
'
sve
'
}
flags
=
set
(
get_cpu_info
()[
'
flags
'
])
if
flags
.
issuperset
(
required_sse_flags
):
result
.
append
(
"
sse
"
)
if
flags
.
issuperset
(
required_avx_flags
):
result
.
append
(
"
avx
"
)
if
flags
.
issuperset
(
required_avx512_flags
):
result
.
append
(
"
avx512
"
)
if
flags
.
issuperset
(
required_neon_flags
):
result
.
append
(
"
neon
"
)
if
flags
.
issuperset
(
required_sve_flags
):
if
platform
.
system
()
==
'
Linux
'
:
libc
=
CDLL
(
'
libc.so.6
'
)
length
=
8
*
libc
.
prctl
(
51
,
0
,
0
,
0
,
0
)
# PR_SVE_GET_VL
length
=
8
*
libc
.
prctl
(
51
,
0
,
0
,
0
,
0
)
# PR_SVE_GET_VL
if
length
<
0
:
if
length
<
0
:
raise
OSError
(
"
SVE length query failed
"
)
raise
OSError
(
"
SVE length query failed
"
)
while
length
>
128
:
while
length
>
=
128
:
result
.
append
(
f
"
sve
{
length
}
"
)
result
.
append
(
f
"
sve
{
length
}
"
)
length
//=
2
length
//=
2
result
.
append
(
"
sve
"
)
result
.
append
(
"
sve
"
)
return
result
return
result
elif
platform
.
system
()
==
'
Linux
'
and
platform
.
machine
().
startswith
(
'
riscv
'
):
libc
=
CDLL
(
'
libc.so.6
'
)
hwcap
=
libc
.
getauxval
(
16
)
# AT_HWCAP
hwcap_isa_v
=
1
<<
(
ord
(
'
V
'
)
-
ord
(
'
A
'
))
# COMPAT_HWCAP_ISA_V
return
[
'
rvv
'
]
if
hwcap
&
hwcap_isa_v
else
[]
elif
platform
.
system
()
==
'
Linux
'
and
platform
.
machine
().
startswith
(
'
ppc64
'
):
libc
=
CDLL
(
'
libc.so.6
'
)
hwcap
=
libc
.
getauxval
(
16
)
# AT_HWCAP
return
[
'
vsx
'
]
if
hwcap
&
0x00000080
else
[]
# PPC_FEATURE_HAS_VSX
elif
platform
.
machine
()
in
[
'
x86_64
'
,
'
x86
'
,
'
AMD64
'
,
'
i386
'
]:
try
:
from
cpuinfo
import
get_cpu_info
except
ImportError
:
return
None
result
=
[]
required_sse_flags
=
{
'
sse
'
,
'
sse2
'
,
'
ssse3
'
,
'
sse4_1
'
,
'
sse4_2
'
}
required_avx_flags
=
{
'
avx
'
,
'
avx2
'
}
required_avx512_flags
=
{
'
avx512f
'
}
flags
=
set
(
get_cpu_info
()[
'
flags
'
])
if
flags
.
issuperset
(
required_sse_flags
):
result
.
append
(
"
sse
"
)
if
flags
.
issuperset
(
required_avx_flags
):
result
.
append
(
"
avx
"
)
if
flags
.
issuperset
(
required_avx512_flags
):
result
.
append
(
"
avx512
"
)
return
result
else
:
raise
NotImplementedError
(
'
Instruction set detection for %s on %s is not implemented
'
%
(
platform
.
system
(),
platform
.
machine
()))
@memorycache
def
get_cacheline_size
(
instruction_set
):
def
get_cacheline_size
(
instruction_set
):
"""
Get the size (in bytes) of a cache block that can be zeroed without memory access.
"""
Get the size (in bytes) of a cache block that can be zeroed without memory access.
Usually, this is identical to the cache line size.
"""
Usually, this is identical to the cache line size.
"""
global
_cachelinesize
instruction_sets
=
get_vector_instruction_set
(
'
double
'
,
instruction_set
)
instruction_sets
=
get_vector_instruction_set
(
'
double
'
,
instruction_set
)
if
'
cachelineSize
'
not
in
instruction_sets
:
if
'
cachelineSize
'
not
in
instruction_sets
:
return
None
return
None
if
_cachelinesize
is
not
None
:
return
_cachelinesize
import
pystencils
as
ps
import
pystencils
as
ps
from
pystencils.astnodes
import
SympyAssignment
from
pystencils.astnodes
import
SympyAssignment
...
@@ -116,5 +104,4 @@ def get_cacheline_size(instruction_set):
...
@@ -116,5 +104,4 @@ def get_cacheline_size(instruction_set):
ast
=
ps
.
create_kernel
(
ass
,
cpu_vectorize_info
=
{
'
instruction_set
'
:
instruction_set
})
ast
=
ps
.
create_kernel
(
ass
,
cpu_vectorize_info
=
{
'
instruction_set
'
:
instruction_set
})
kernel
=
ast
.
compile
()
kernel
=
ast
.
compile
()
kernel
(
**
{
f
.
name
:
arr
,
CachelineSize
.
symbol
.
name
:
0
})
kernel
(
**
{
f
.
name
:
arr
,
CachelineSize
.
symbol
.
name
:
0
})
_cachelinesize
=
int
(
arr
[
0
,
0
])
return
int
(
arr
[
0
,
0
])
return
_cachelinesize
This diff is collapsed.
Click to expand it.
pystencils/cpu/cpujit.py
+
2
−
2
View file @
f0e9cd00
...
@@ -172,8 +172,8 @@ def read_config():
...
@@ -172,8 +172,8 @@ def read_config():
default_compiler_config
[
'
flags
'
]
+=
'
'
+
libomp
default_compiler_config
[
'
flags
'
]
+=
'
'
+
libomp
break
break
else
:
else
:
raise
ValueError
(
"
The detection of the platform with platform.system() did not work.
"
raise
NotImplementedError
(
'
Generation of default compiler flags for %s is not implemented
'
%
"
Pystencils is only supported for linux, windows, and darwin platforms.
"
)
(
platform
.
system
(),)
)
default_cache_config
=
OrderedDict
([
default_cache_config
=
OrderedDict
([
(
'
object_cache
'
,
os
.
path
.
join
(
user_cache_dir
(
'
pystencils
'
),
'
objectcache
'
)),
(
'
object_cache
'
,
os
.
path
.
join
(
user_cache_dir
(
'
pystencils
'
),
'
objectcache
'
)),
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment