Commit e9eb29b9, authored 6 years ago by Martin Bauer
PACXX benchmark generation

Parent: 649e82b4
Showing 2 changed files with 273 additions and 0 deletions:

    pacxx/benchmark.py              170 additions, 0 deletions
    pacxx/benchmark_template.cpp    103 additions, 0 deletions
pacxx/benchmark.py (new file, mode 100644, +170 −0)
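The module implements a three-stage pipeline: generate_benchmark_code renders the Jinja2 template (second file below) with the kernel body produced by the C backend, pacxx_compile builds the result with pacxx++ (with PATH and LD_LIBRARY_PATH pointing into PAXX_ROOT), and run_paxx_benchmark executes the binary and returns the average time per iteration. lbm_performance_compare then pits those numbers against the native pystencils CPU and GPU backends.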
import os
import subprocess
from time import perf_counter
from tempfile import TemporaryDirectory

from jinja2 import Environment, FileSystemLoader

from pystencils import create_data_handling
from pystencils.backends.cbackend import CBackend, generate_c

script_path = os.path.dirname(os.path.realpath(__file__))
PAXX_ROOT = '/local/bauer/code/pacxx/install'
DEFAULT_PAXX_COMPILE_OPTIONS = ('-Ofast', '-march=native')


def generate_benchmark_code(target_file, kernel_ast, target):
    assert target in ('cpu', 'gpu')
    assert hasattr(kernel_ast, 'indexing'), "AST has to be a CUDA kernel in order to create a PACXX kernel from it"
    backend = CBackend()
    function_body = kernel_ast.body

    f_sizes = {f.shape[-1] for f in kernel_ast.fields_accessed}
    assert len(f_sizes) == 1

    env = Environment(loader=FileSystemLoader(script_path))
    result = env.get_template("benchmark_template.cpp").render(f_size=f_sizes.pop(),
                                                               code=backend(function_body),
                                                               target=target)
    with open(target_file, 'w') as f:
        f.write(result)


def pacxx_compile(source, executable, options=DEFAULT_PAXX_COMPILE_OPTIONS):
    command = ['pacxx++', *options, source, '-o', executable]
    env = os.environ.copy()
    env['PATH'] = "{}:{}".format(env.get('PATH', ''), os.path.join(PAXX_ROOT, 'bin'))
    env['LD_LIBRARY_PATH'] = "{}:{}".format(env.get('LD_LIBRARY_PATH', ''), os.path.join(PAXX_ROOT, 'lib'))
    try:
        subprocess.check_output(command, env=env, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        print(" ".join(command))
        print(e.output.decode('utf8'))
        raise e


def run_paxx_benchmark(executable, domain_size, iterations):
    assert len(domain_size) == 3
    arguments = [executable, *domain_size, iterations]
    arguments = [str(e) for e in arguments]
    output = subprocess.check_output(arguments)
    return float(output) / iterations


def paxx_benchmark(ast, domain_size, iterations, target='cpu', compile_options=DEFAULT_PAXX_COMPILE_OPTIONS):
    """Generates, compiles and runs the kernel with PACXX.

    Args:
        ast: pystencils AST object (has to be generated for CUDA, even when run on CPU with pacxx)
        domain_size: x, y, z extent of spatial domain
        iterations: number of outer iterations
        target: either 'cpu' or 'gpu' to specify where pacxx should run the kernel
        compile_options: compile options for pacxx

    Returns:
        seconds for one outer iteration
    """
    with TemporaryDirectory() as base_dir:
        code = os.path.join(base_dir, 'code.cpp')
        executable = os.path.join(base_dir, 'bench')
        generate_benchmark_code(code, ast, target)
        pacxx_compile(code, executable, compile_options)
        time_per_iteration = run_paxx_benchmark(executable, domain_size, iterations)
    return time_per_iteration


def lbm_performance_compare(domain_size, iterations, **lb_params):
    """Runs benchmark with pacxx and with the normal pystencils backends.

    Args:
        domain_size: 3-tuple with size of spatial domain
        iterations: number of outer iterations
        **lb_params: parameters passed to lbmpy to choose lattice Boltzmann algorithm & optimization options

    Returns:
        dictionary with measurements of time per iteration for different backends
    """
    import pycuda.driver as drv
    from lbmpy.creationfunctions import create_lb_ast

    if 'optimization' not in lb_params:
        lb_params['optimization'] = {}
    lb_params['optimization']['target'] = 'cpu'
    cpu_ast = create_lb_ast(**lb_params)
    lb_params['optimization']['target'] = 'gpu'
    gpu_ast = create_lb_ast(**lb_params)

    # print kernel code of CPU and GPU version - just for comparison, files are not used
    with open("pystencils_cpu_code.c", 'w') as f:
        print(generate_c(cpu_ast), file=f)
    with open("pystencils_gpu_code.cu", 'w') as f:
        print(generate_c(gpu_ast), file=f)

    cpu_kernel = cpu_ast.compile()
    gpu_kernel = gpu_ast.compile()

    f_sizes = {f.shape[-1] for f in cpu_ast.fields_accessed}
    assert len(f_sizes) == 1
    f_size = f_sizes.pop()

    dh = create_data_handling(domain_size, default_target='gpu', default_layout='fzyx')
    dh.add_array('src', values_per_cell=f_size)
    dh.add_array('dst', values_per_cell=f_size)
    dh.fill('src', 0)
    dh.fill('dst', 0)

    # to keep it simple we run the outer loop directly from Python
    # make the domain size large enough, otherwise we measure the Python call overhead
    def run_benchmark(kernel):
        dh.all_to_gpu()
        for i in range(10):  # warmup
            dh.run_kernel(kernel)
        drv.Context.synchronize()

        start = perf_counter()
        for i in range(iterations):
            dh.run_kernel(kernel)
        drv.Context.synchronize()
        return (perf_counter() - start) / iterations

    return {
        'pystencils_cpu': run_benchmark(cpu_kernel),
        'pystencils_gpu': run_benchmark(gpu_kernel),
        'pacxx_cpu': paxx_benchmark(gpu_ast, domain_size, iterations, target='cpu'),
        'pacxx_gpu': paxx_benchmark(gpu_ast, domain_size, iterations, target='gpu'),
    }


if __name__ == '__main__':
    no_opt = {
        'openmp': 8,  # number of threads - pacxx also uses HT cores
        'split': False,
        'vectorization': False,
        'gpu_indexing_params': {'block_size': (64, 8, 1)},
    }
    only_vectorization = {
        'openmp': 4,
        'split': False,
        'gpu_indexing_params': {'block_size': (64, 8, 1)},
        'vectorization': {'instruction_set': 'avx',
                          'assume_inner_stride_one': True,
                          'nontemporal': False},
    }
    best = {
        'openmp': 4,
        'split': True,
        'gpu_indexing_params': {'block_size': (64, 8, 1)},
        'vectorization': {'instruction_set': 'avx',
                          'assume_inner_stride_one': True,
                          'nontemporal': True},
    }
    res = lbm_performance_compare(stencil='D3Q19', relaxation_rate=1.8, compressible=False,
                                  domain_size=(512, 128, 32), iterations=500,
                                  optimization=only_vectorization)
    cpu_speedup = ((res['pacxx_cpu'] / res['pystencils_cpu']) - 1) * 100
    gpu_speedup = ((res['pacxx_gpu'] / res['pystencils_gpu']) - 1) * 100
    print("Time for one kernel call [s]")
    for config_name, time in res.items():
        print("{0: <16}: {1}".format(config_name, time))
    print("CPU {:.02f}% GPU {:.02f}%".format(cpu_speedup, gpu_speedup))
pacxx/benchmark_template.cpp (new file, mode 100644, +103 −0)
#include <PACXX.h>

#include <vector>
#include <sstream>
#include <iostream>
#include <chrono>

using namespace pacxx::v2;

size_t division_round_up(size_t a, size_t b)
{
    if (a % b == 0)
        return a / b;
    else
        return (a / b) + 1;
}

int main(int argc, char **argv)
{
{% if target == 'cpu' %}
    Executor::Create<NativeRuntime>(0);
{% elif target == 'gpu' %}
    Executor::Create<CUDARuntime>(0);
{% endif %}

    if (argc != 5) {
        std::cout << "Usage: ./benchmark xSize ySize zSize iterations" << std::endl;
        return 1;
    }

    Dimension3 domainSize;
    int64_t iterations;

    auto &exec = Executor::get(0);

    std::stringstream(argv[1]) >> domainSize.x;
    std::stringstream(argv[2]) >> domainSize.y;
    std::stringstream(argv[3]) >> domainSize.z;
    std::stringstream(argv[4]) >> iterations;

    // add ghost layers to be comparable to pystencils native backend
    domainSize.x += 2;
    domainSize.y += 2;
    domainSize.z += 2;

    int64_t totalSize = domainSize.x * domainSize.y * domainSize.z * {{f_size}};
    std::vector<double> src(totalSize, 0.0);
    std::vector<double> dst(totalSize, 0.0);

    auto &dsrc = exec.allocate<double>(src.size());
    auto &ddst = exec.allocate<double>(dst.size());

    dsrc.upload(src.data(), src.size());
    ddst.upload(dst.data(), dst.size());

    double *_data_src = dsrc.get();
    double *_data_dst = ddst.get();

    const int64_t _size_src_0 = domainSize.x;
    const int64_t _size_src_1 = domainSize.y;
    const int64_t _size_src_2 = domainSize.z;

    // fzyx layout
    const int64_t _stride_src_0 = 1;
    const int64_t _stride_src_1 = domainSize.x;
    const int64_t _stride_src_2 = domainSize.x * domainSize.y;
    const int64_t _stride_src_3 = domainSize.x * domainSize.y * domainSize.z;

    auto pacxxKernel = [=](range &config) {
        struct Vec3D { int x; int y; int z; };
        const Vec3D blockDim = {config.get_block_size(0), config.get_block_size(1), config.get_block_size(2)};
        const Vec3D blockIdx = {config.get_block(0), config.get_block(1), config.get_block(2)};
        const Vec3D threadIdx = {config.get_local(0), config.get_local(1), config.get_local(2)};
        {{code | indent(8)}}
    };

    size_t blockSize[] = {64, 8, 1};
    KernelConfiguration config({division_round_up(domainSize.x - 2, blockSize[0]),
                                division_round_up(domainSize.y - 2, blockSize[1]),
                                division_round_up(domainSize.z - 2, blockSize[2])},
                               {blockSize[0], blockSize[1], blockSize[2]});

    // warm up
    for (int64_t i = 0; i < 10; ++i) {
        exec.launch(pacxxKernel, config);
    }
    exec.synchronize();

    auto start = std::chrono::high_resolution_clock::now();
    for (int64_t i = 0; i < iterations; ++i) {
        exec.launch(pacxxKernel, config);
    }
    exec.synchronize();

    auto duration = std::chrono::high_resolution_clock::now() - start;
    auto ns = std::chrono::duration_cast<std::chrono::nanoseconds>(duration);
    std::cout << ns.count() * 1e-9 << std::endl;
}
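The placeholders target, f_size and code are exactly the three variables that generate_benchmark_code passes to render. A standalone rendering sketch follows (an illustration, not part of the commit; the stub string stands in for the C backend output that normally fills code):

from jinja2 import Environment, FileSystemLoader

env = Environment(loader=FileSystemLoader('pacxx'))
print(env.get_template('benchmark_template.cpp').render(
    f_size=19,                         # values per cell, e.g. a D3Q19 PDF field
    code='// kernel body goes here',   # stub for backend(function_body)
    target='cpu',                      # selects Executor::Create<NativeRuntime>(0)
))

The generated executable is invoked as ./benchmark xSize ySize zSize iterations and prints the total runtime of the timed loop in seconds, which run_paxx_benchmark then divides by the iteration count.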