Commit a57a164a authored by Stephan Seitz

llvm: Mark CUDA kernels and load/call resulting ptx with pycuda

parent 2e6f3efe
Merge request !53: Compile CUDA using the LLVM backend
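For orientation: after this change, the LLVM module is compiled to PTX with llc, the PTX is loaded through pycuda, and the kernel is launched with block/grid sizes taken from the AST's indexing. A minimal standalone sketch of the pycuda side of that flow (the file name, kernel name and launch configuration below are hypothetical, not taken from the commit):

import numpy as np
import pycuda.autoinit  # noqa -- creates a CUDA context on the default device
import pycuda.driver as cuda
from pycuda.gpuarray import to_gpu

# Load a PTX file, e.g. one produced by `llc -mcpu=sm_XX kernel.ll -o kernel.ptx`
mod = cuda.module_from_file('kernel.ptx')   # hypothetical file name
kernel = mod.get_function('kernel')         # hypothetical kernel name

dst = to_gpu(np.zeros((30, 20)))
src = to_gpu(np.random.rand(30, 20))
# GPUArray arguments are passed as raw device pointers; block/grid are the
# launch configuration (chosen by hand here instead of ast.indexing).
kernel(dst, src, block=(16, 16, 1), grid=(2, 2, 1))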
@@ -13,6 +13,24 @@ from pystencils.data_types import (
 from pystencils.llvm.control_flow import Loop


+# From Numba
+def set_cuda_kernel(lfunc):
+    from llvmlite.llvmpy.core import MetaData, MetaDataString, Constant, Type
+
+    m = lfunc.module
+    ops = lfunc, MetaDataString.get(m, "kernel"), Constant.int(Type.int(), 1)
+    md = MetaData.get(m, ops)
+
+    nmd = m.get_or_insert_named_metadata('nvvm.annotations')
+    nmd.add(md)
+
+    # set nvvm ir version
+    i32 = ir.IntType(32)
+    md_ver = m.add_metadata([i32(1), i32(2), i32(2), i32(0)])
+    m.add_named_metadata('nvvmir.version', md_ver)
+
+
 # From Numba
 def _call_sreg(builder, name):
     module = builder.module
@@ -191,6 +209,9 @@ class LLVMPrinter(Printer):
         self._print(func.body)
         self.builder.ret_void()
         self.fn = fn
+        if self.target == 'gpu':
+            set_cuda_kernel(fn)
+
         return fn

     def _print_Block(self, block):
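Note: set_cuda_kernel above uses the legacy llvmlite.llvmpy.core wrappers. For reference, a hedged sketch of the same nvvm.annotations marking written against the plain llvmlite.ir API (the helper name is made up; the commit itself does not contain this):

import llvmlite.ir as ir

def mark_as_cuda_kernel(lfunc):
    # Attach  !nvvm.annotations = !{ !{<function>, !"kernel", i32 1} }
    # so that the NVPTX backend treats lfunc as a kernel entry point.
    module = lfunc.module
    md = module.add_metadata([lfunc, "kernel", ir.IntType(32)(1)])
    module.get_or_insert_named_metadata('nvvm.annotations').add(md)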
......
 import ctypes as ct
 import subprocess
+from functools import partial
+from itertools import chain
 from os.path import exists, join

 import llvmlite.binding as llvm
@@ -103,9 +105,9 @@ def generate_and_jit(ast):
     target = 'gpu' if ast._backend == 'llvm_gpu' else 'cpu'
     gen = generate_llvm(ast, target=target)
     if isinstance(gen, ir.Module):
-        return compile_llvm(gen, target)
+        return compile_llvm(gen, target, ast)
     else:
-        return compile_llvm(gen.module, target)
+        return compile_llvm(gen.module, target, ast)


 def make_python_function(ast, argument_dict={}, func=None):
@@ -120,8 +122,8 @@ def make_python_function(ast, argument_dict={}, func=None):
     return lambda: func(*args)


-def compile_llvm(module, target='cpu'):
-    jit = CudaJit() if target == "gpu" else Jit()
+def compile_llvm(module, target='cpu', ast=None):
+    jit = CudaJit(ast) if target == "gpu" else Jit()
     jit.parse(module)
     jit.optimize()
     jit.compile()
@@ -243,12 +245,13 @@ class CudaJit(Jit):
     default_data_layout = data_layout[MACHINE_BITS]

-    def __init__(self):
+    def __init__(self, ast):
         # super().__init__()
         # self.target = llvm.Target.from_triple(self.CUDA_TRIPLE[self.MACHINE_BITS])
         self._data_layout = self.default_data_layout[self.MACHINE_BITS]
         # self._target_data = llvm.create_target_data(self._data_layout)
+        self.indexing = ast.indexing

     def optimize(self):
         pmb = llvm.create_pass_manager_builder()
@@ -278,7 +281,6 @@ class CudaJit(Jit):
         llvmmod.verify()
         llvmmod.name = 'module'

-        self.module = str(llvmmod)
         self._llvmmod = llvm.parse_assembly(str(llvmmod))

     def compile(self):
@@ -287,48 +289,30 @@ class CudaJit(Jit):
         compiler_cache = get_cache_config()['object_cache']
         ir_file = join(compiler_cache, hashlib.md5(str(self._llvmmod).encode()).hexdigest() + '.ll')
         ptx_file = ir_file.replace('.ll', '.ptx')
+        try:
+            from pycuda.driver import Context
+            arch = "sm_%d%d" % Context.get_device().compute_capability()
+        except Exception:
+            arch = "sm_35"
+
         if not exists(ptx_file):
             self.write_ll(ir_file)
-            try:
-                from pycuda.driver import Context
-                arch = "sm_%d%d" % Context.get_device().compute_capability()
-            except Exception:
-                arch = "sm_35"
             subprocess.check_call(['llc-10', '-mcpu=' + arch, ir_file, '-o', ptx_file])
-        # TODO: make loading of ptx work
-        # import pycuda.autoinit
-
-        # def handler(compile_success_bool, info_str, error_str):
-        #     if not compile_success_bool:
-        #         print(info_str)
-        #         print(error_str)
-        # # with open(ptx_file, 'rb') as f:
-        # # ptx_code = f.read()
-        # # from pycuda.driver import jit_input_type
-        # # self.linker.add_data(ptx_code, jit_input_type.PTX, 'foo')
-        # from pycuda.compiler import DynamicModule
-        # from pycuda.driver import jit_input_type
-        # module = DynamicModule().add_file(ptx_file, jit_input_type.PTX)
-        # module.link()
-        # # cuda_module = pycuda.driver.module_from_buffer(ptx_code, message_handler=handler)
-        # # print(dir(cuda_module))
-        # self.fptr = dict()
-        # module.get_function('kernel')
+
+        # cubin_file = ir_file.replace('.ll', '.cubin')
+        # if not exists(cubin_file):
+            # subprocess.check_call(['ptxas', '--gpu-name', arch, ptx_file, '-o', cubin_file])
+        import pycuda.driver
+
+        cuda_module = pycuda.driver.module_from_file(ptx_file)  # also works: cubin_file
+        self.cuda_module = cuda_module

     def __call__(self, func, *args, **kwargs):
-        fptr = {}
-        for func in self.module.functions:
-            if not func.is_declaration:
-                return_type = None
-                if func.ftype.return_type != ir.VoidType():
-                    return_type = to_ctypes(create_composite_type_from_string(str(func.ftype.return_type)))
-                args = [ctypes_from_llvm(arg) for arg in func.ftype.args]
-                function_address = self.ee.get_function_address(func.name)
-                fptr[func.name] = ct.CFUNCTYPE(return_type, *args)(function_address)
-        self.fptr = fptr
+        shape = [a.shape for a in chain(args, kwargs.values()) if hasattr(a, 'shape')][0]
+        block_and_thread_numbers = self.indexing.call_parameters(shape)
+        block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
+        block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])
+
+        self.cuda_module.get_function(func)(*args, **kwargs, **block_and_thread_numbers)
+
+    def get_function_ptr(self, name):
+        return partial(self._call__, name)
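The new __call__ relies on ast.indexing.call_parameters(shape) returning a dict with 'block' and 'grid' entries, which are then coerced to tuples of plain Python ints because pycuda expects native ints in the launch configuration. A rough sketch of that assumed contract (illustrative ceil-division, not the actual pystencils implementation):

def call_parameters(shape, block=(16, 16, 1)):
    # grid = ceil(shape / block) per dimension, padded to 3D as CUDA expects
    grid = tuple(-(-int(s) // b) for s, b in zip(shape, block))
    grid += (1,) * (3 - len(grid))
    return {'block': block, 'grid': grid}

# call_parameters((30, 20)) -> {'block': (16, 16, 1), 'grid': (2, 2, 1)}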
@@ -33,13 +33,19 @@ def test_jacobi_fixed_field_size():

 def test_jacobi_fixed_field_size_gpu():
     size = (30, 20)
+    import pycuda.autoinit  # noqa
+    from pycuda.gpuarray import to_gpu
+
     src_field_llvm = np.random.rand(*size)
     src_field_py = np.copy(src_field_llvm)
     dst_field_llvm = np.zeros(size)
     dst_field_py = np.zeros(size)

-    f = Field.create_from_numpy_array("f", src_field_llvm)
-    d = Field.create_from_numpy_array("d", dst_field_llvm)
+    f = Field.create_from_numpy_array("f", src_field_py)
+    d = Field.create_from_numpy_array("d", dst_field_py)
+
+    src_field_llvm = to_gpu(src_field_llvm)
+    dst_field_llvm = to_gpu(dst_field_llvm)

     jacobi = Assignment(d[0, 0], (f[1, 0] + f[-1, 0] + f[0, 1] + f[0, -1]) / 4)
     ast = create_kernel([jacobi], target='gpu')
@@ -52,7 +58,7 @@ def test_jacobi_fixed_field_size_gpu():
     jit = generate_and_jit(ast)
     jit('kernel', dst_field_llvm, src_field_llvm)

-    error = np.sum(np.abs(dst_field_py - dst_field_llvm))
+    error = np.sum(np.abs(dst_field_py - dst_field_llvm.get()))
     np.testing.assert_almost_equal(error, 0.0)
......