pycodegen / pystencils

Commit a57a164a
authored 5 years ago by Stephan Seitz
llvm: Mark CUDA kernels and load/call resulting ptx with pycuda

parent 2e6f3efe
1 merge request: !53 Compile CUDA using the LLVM backend
Showing 3 changed files with 57 additions and 46 deletions:

pystencils/llvm/llvm.py: 21 additions, 0 deletions
pystencils/llvm/llvmjit.py: 27 additions, 43 deletions
pystencils_tests/test_jacobi_llvm.py: 9 additions, 3 deletions
pystencils/llvm/llvm.py (+21 −0)
@@ -13,6 +13,24 @@ from pystencils.data_types import (
 from pystencils.llvm.control_flow import Loop


+# From Numba
+def set_cuda_kernel(lfunc):
+    from llvmlite.llvmpy.core import MetaData, MetaDataString, Constant, Type
+    m = lfunc.module
+
+    ops = lfunc, MetaDataString.get(m, "kernel"), Constant.int(Type.int(), 1)
+    md = MetaData.get(m, ops)
+
+    nmd = m.get_or_insert_named_metadata('nvvm.annotations')
+    nmd.add(md)
+
+    # set nvvm ir version
+    i32 = ir.IntType(32)
+    md_ver = m.add_metadata([i32(1), i32(2), i32(2), i32(0)])
+    m.add_named_metadata('nvvmir.version', md_ver)
+
+
 # From Numba
 def _call_sreg(builder, name):
     module = builder.module
@@ -191,6 +209,9 @@ class LLVMPrinter(Printer):
         self._print(func.body)
         self.builder.ret_void()
         self.fn = fn
+        if self.target == 'gpu':
+            set_cuda_kernel(fn)
+
         return fn

     def _print_Block(self, block):
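For context on the new set_cuda_kernel() helper: NVVM only treats a function as a CUDA kernel entry point if it is listed in the module's !nvvm.annotations named metadata, which is exactly what the metadata triple {function, "kernel", i32 1} expresses. Below is a minimal, hedged sketch of the same marking written against the plain llvmlite.ir API rather than the llvmlite.llvmpy.core shim used in the diff; the module name, kernel name, and trivial signature are made up for illustration.

# Sketch only (not part of the commit): mark a function as a CUDA kernel via
# !nvvm.annotations, analogous to set_cuda_kernel() above but using llvmlite.ir.
import llvmlite.ir as ir

module = ir.Module(name="example")           # hypothetical module
module.triple = "nvptx64-nvidia-cuda"

fnty = ir.FunctionType(ir.VoidType(), [ir.PointerType(ir.FloatType())])
kernel = ir.Function(module, fnty, name="kernel")
ir.IRBuilder(kernel.append_basic_block("entry")).ret_void()

i32 = ir.IntType(32)
# the {function, "kernel", i32 1} entry in the nvvm.annotations named metadata
md = module.add_metadata([kernel, "kernel", i32(1)])
module.add_named_metadata("nvvm.annotations").add(md)
# nvvm ir version, mirroring the values written by the new code above
module.add_named_metadata("nvvmir.version").add(
    module.add_metadata([i32(1), i32(2), i32(2), i32(0)]))

print(module)  # the printed IR now contains the !nvvm.annotations entry

Without this annotation, llc still compiles the function, but the PTX exposes it as a device function rather than a launchable kernel.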
pystencils/llvm/llvmjit.py (+27 −43)
 import ctypes as ct
 import subprocess
-from functools import partial
+from itertools import chain
 from os.path import exists, join

 import llvmlite.binding as llvm
@@ -103,9 +105,9 @@ def generate_and_jit(ast):
     target = 'gpu' if ast._backend == 'llvm_gpu' else 'cpu'
     gen = generate_llvm(ast, target=target)
     if isinstance(gen, ir.Module):
-        return compile_llvm(gen, target)
+        return compile_llvm(gen, target, ast)
     else:
-        return compile_llvm(gen.module, target)
+        return compile_llvm(gen.module, target, ast)


 def make_python_function(ast, argument_dict={}, func=None):
@@ -120,8 +122,8 @@ def make_python_function(ast, argument_dict={}, func=None):
     return lambda: func(*args)


-def compile_llvm(module, target='cpu'):
-    jit = CudaJit() if target == "gpu" else Jit()
+def compile_llvm(module, target='cpu', ast=None):
+    jit = CudaJit(ast) if target == "gpu" else Jit()
     jit.parse(module)
     jit.optimize()
     jit.compile()
@@ -243,12 +245,13 @@ class CudaJit(Jit):

     default_data_layout = data_layout[MACHINE_BITS]

-    def __init__(self):
+    def __init__(self, ast):
         # super().__init__()
         # self.target = llvm.Target.from_triple(self.CUDA_TRIPLE[self.MACHINE_BITS])
         self._data_layout = self.default_data_layout[self.MACHINE_BITS]
         # self._target_data = llvm.create_target_data(self._data_layout)
+        self.indexing = ast.indexing

     def optimize(self):
         pmb = llvm.create_pass_manager_builder()
@@ -278,7 +281,6 @@ class CudaJit(Jit):
         llvmmod.verify()
         llvmmod.name = 'module'
-        self.module = str(llvmmod)
         self._llvmmod = llvm.parse_assembly(str(llvmmod))

     def compile(self):
@@ -287,48 +289,30 @@ class CudaJit(Jit):
         compiler_cache = get_cache_config()['object_cache']
         ir_file = join(compiler_cache, hashlib.md5(str(self._llvmmod).encode()).hexdigest() + '.ll')
         ptx_file = ir_file.replace('.ll', '.ptx')
+        try:
+            from pycuda.driver import Context
+            arch = "sm_%d%d" % Context.get_device().compute_capability()
+        except Exception:
+            arch = "sm_35"
         if not exists(ptx_file):
             self.write_ll(ir_file)
-            try:
-                from pycuda.driver import Context
-                arch = "sm_%d%d" % Context.get_device().compute_capability()
-            except Exception:
-                arch = "sm_35"
             subprocess.check_call(['llc-10', '-mcpu=' + arch, ir_file, '-o', ptx_file])
-        # TODO: make loading of ptx work
-        # cubin_file = ir_file.replace('.ll', '.cubin')
-        # import pycuda.autoinit
-        # if not exists(cubin_file):
-        #     subprocess.check_call(['ptxas', '--gpu-name', arch, ptx_file, '-o', cubin_file])
-        # def handler(compile_success_bool, info_str, error_str):
-        #     if not compile_success_bool:
-        #         print(info_str)
-        #         print(error_str)
-        # # with open(ptx_file, 'rb') as f:
-        # # ptx_code = f.read()
-        # # from pycuda.driver import jit_input_type
-        # # self.linker.add_data(ptx_code, jit_input_type.PTX, 'foo')
-        # # cuda_module = pycuda.driver.module_from_buffer(ptx_code, message_handler=handler)
-        # # print(dir(cuda_module))
-        # self.fptr = dict()
+        import pycuda.driver
+        cuda_module = pycuda.driver.module_from_file(ptx_file)  # also works: cubin_file
+        self.cuda_module = cuda_module
+        # from pycuda.compiler import DynamicModule
+        # from pycuda.driver import jit_input_type
+        # module = DynamicModule().add_file(ptx_file, jit_input_type.PTX)
+        # module.link()
+        # module.get_function('kernel')

     def __call__(self, func, *args, **kwargs):
-        fptr = {}
-        for func in self.module.functions:
-            if not func.is_declaration:
-                return_type = None
-                if func.ftype.return_type != ir.VoidType():
-                    return_type = to_ctypes(create_composite_type_from_string(str(func.ftype.return_type)))
-                args = [ctypes_from_llvm(arg) for arg in func.ftype.args]
-                function_address = self.ee.get_function_address(func.name)
-                fptr[func.name] = ct.CFUNCTYPE(return_type, *args)(function_address)
-        self.fptr = fptr
+        shape = [a.shape for a in chain(args, kwargs.values()) if hasattr(a, 'shape')][0]
+        block_and_thread_numbers = self.indexing.call_parameters(shape)
+        block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
+        block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])
+        self.cuda_module.get_function(func)(*args, **kwargs, **block_and_thread_numbers)

-    def get_function_ptr(self, name):
-        return partial(self._call__, name)
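The CudaJit changes above boil down to three steps: compile the LLVM IR to PTX with llc, load the PTX through pycuda's driver module, and launch the kernel with the block/grid sizes supplied by the AST's indexing object. The following condensed sketch shows that flow outside of pystencils; it is an assumption-laden illustration, not the project's API. The file names 'example.ll'/'example.ptx', the plain 'llc' binary name (the commit hard-codes 'llc-10'), and the kernel's single float-pointer signature are all hypothetical.

# Sketch only: compile NVVM IR to PTX and launch it with pycuda, mirroring the
# new CudaJit.compile()/__call__() flow. Assumes example.ll has an nvptx triple
# and defines a marked 'kernel' taking one float pointer.
import subprocess

import numpy as np
import pycuda.autoinit  # noqa: creates a CUDA context
import pycuda.driver as drv
from pycuda.gpuarray import to_gpu

# pick the target architecture from the current device, as the diff does
arch = "sm_%d%d" % drv.Context.get_device().compute_capability()
subprocess.check_call(['llc', '-mcpu=' + arch, 'example.ll', '-o', 'example.ptx'])

cuda_module = drv.module_from_file('example.ptx')    # load the generated PTX
kernel = cuda_module.get_function('kernel')

data = to_gpu(np.zeros(32, dtype=np.float32))
kernel(data, block=(32, 1, 1), grid=(1, 1))          # explicit launch config
print(data.get())

In the committed code the .ll and .ptx paths are derived from a hash of the module and kept in the object_cache directory, so llc only runs on a cache miss, and the launch configuration comes from indexing.call_parameters(shape) instead of being hard-coded.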
pystencils_tests/test_jacobi_llvm.py (+9 −3)
@@ -33,13 +33,19 @@ def test_jacobi_fixed_field_size():

 def test_jacobi_fixed_field_size_gpu():
     size = (30, 20)

+    import pycuda.autoinit  # noqa
+    from pycuda.gpuarray import to_gpu
+
     src_field_llvm = np.random.rand(*size)
     src_field_py = np.copy(src_field_llvm)
     dst_field_llvm = np.zeros(size)
     dst_field_py = np.zeros(size)

-    f = Field.create_from_numpy_array("f", src_field_llvm)
-    d = Field.create_from_numpy_array("d", dst_field_llvm)
+    f = Field.create_from_numpy_array("f", src_field_py)
+    d = Field.create_from_numpy_array("d", dst_field_py)
+
+    src_field_llvm = to_gpu(src_field_llvm)
+    dst_field_llvm = to_gpu(dst_field_llvm)

     jacobi = Assignment(d[0, 0], (f[1, 0] + f[-1, 0] + f[0, 1] + f[0, -1]) / 4)
     ast = create_kernel([jacobi], target='gpu')
@@ -52,7 +58,7 @@ def test_jacobi_fixed_field_size_gpu():
     jit = generate_and_jit(ast)
     jit('kernel', dst_field_llvm, src_field_llvm)

-    error = np.sum(np.abs(dst_field_py - dst_field_llvm))
+    error = np.sum(np.abs(dst_field_py - dst_field_llvm.get()))
     np.testing.assert_almost_equal(error, 0.0)
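The test change makes the host/device transfers explicit: the fields are uploaded with to_gpu() before the launch, and dst_field_llvm.get() copies the result back to the host before it is compared against the NumPy reference. A minimal, hedged illustration of that round trip (the array names here are illustrative, not taken from the test):

# Sketch only: the GPUArray round trip the updated test relies on.
import numpy as np
import pycuda.autoinit  # noqa
from pycuda.gpuarray import to_gpu

host = np.random.rand(30, 20)
device = to_gpu(host)       # host -> device copy
roundtrip = device.get()    # device -> host copy; needed before NumPy comparisons
np.testing.assert_almost_equal(np.sum(np.abs(host - roundtrip)), 0.0)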