# benchmark_gpu.py
from typing import Union, List, Optional
from collections import namedtuple
from pathlib import Path
from jinja2 import Environment, PackageLoader, StrictUndefined
from pystencils.backends.cbackend import generate_c, get_headers
from pystencils.astnodes import KernelFunction
from pystencils.enums import Backend
from pystencils.typing import get_base_type
from pystencils.sympyextensions import prod
from pystencils.transformations import get_common_field
# from pystencils.gpucuda import BlockIndexing
from pystencils_benchmark.enums import Compiler

_env = Environment(loader=PackageLoader('pystencils_benchmark'), undefined=StrictUndefined, keep_trailing_newline=True,
                   trim_blocks=True, lstrip_blocks=True)


def generate_benchmark_gpu(kernel_asts: Union[KernelFunction, List[KernelFunction]],
                           path: Optional[Path] = None,
                           *,
                           compiler: Compiler = Compiler.GCC,
                           timing: bool = True,
                           cuda_block_size: tuple = (32, 1, 1)
                           ) -> None:
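    """Generate a self-contained GPU benchmark project for the given kernels.

    For every kernel a header is written to include/ and a CUDA source file
    to src/; a main.cu driver, the static timing helpers and a Makefile for
    the chosen compiler are generated alongside them.

    Args:
        kernel_asts: a single pystencils KernelFunction or a list of them
        path: output directory, defaults to the current working directory
        compiler: compiler toolchain the generated Makefile targets
        timing: add timing output to the generated benchmark
        cuda_block_size: CUDA block size, forwarded to kernel_main
    """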
if path is None:
path = Path('.')
else:
path.mkdir(parents=True, exist_ok=True)
src_path = path / 'src'
src_path.mkdir(parents=True, exist_ok=True)
include_path = path / 'include'
include_path.mkdir(parents=True, exist_ok=True)
if isinstance(kernel_asts, KernelFunction):
kernel_asts = [kernel_asts]
for kernel_ast in kernel_asts:
kernel_name = kernel_ast.function_name
header = kernel_header(kernel_ast)
with open(include_path / f'{kernel_name}.h', 'w+') as f:
f.write(header)
source = kernel_source(kernel_ast)
# TODO CUDA specific suffix
with open(src_path / f'{kernel_name}.cu', 'w+') as f:
f.write(source)
with open(src_path / 'main.cu', 'w+') as f:
f.write(kernel_main(kernel_asts,
timing=timing,
cuda_block_size=cuda_block_size))
copy_static_files(path)
    compiler_toolchain(path, compiler)


def compiler_toolchain(path: Path, compiler: Compiler) -> None:
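    """Render the Makefile and the compiler-specific .mk include into `path`."""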
name = compiler.name
jinja_context = {
'compiler': name,
'likwid': False,
}
files = ['Makefile', f'{name}.mk']
for file_name in files:
with open(path / file_name, 'w+') as f:
template = _env.get_template(file_name).render(**jinja_context)
            f.write(template)


def copy_static_files(path: Path) -> None:
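    """Render the static timing helpers into include/ and src/ below `path`."""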
src_path = path / 'src'
src_path.mkdir(parents=True, exist_ok=True)
include_path = path / 'include'
include_path.mkdir(parents=True, exist_ok=True)
files = ['timing.h', 'timing.c']
for file_name in files:
template = _env.get_template(file_name).render()
        if file_name.endswith('.h'):
            target_path = include_path / file_name
        elif file_name.endswith('.c'):
            # TODO CUDA specific suffix:
            target_path = (src_path / file_name).with_suffix('.cu')
        else:
            target_path = path / file_name
with open(target_path, 'w+') as f:
            f.write(template)


def kernel_main(kernels_ast: List[KernelFunction], *, timing: bool = True, cuda_block_size: tuple):
    """
    Return the CUDA source code of a benchmark main program for the given kernels.

    Args:
        kernels_ast: a list of pystencils AST objects, as returned by create_kernel
        timing: add timing output to the code, prints time per iteration to stdout
        cuda_block_size: intended CUDA block size; the block and grid sizes actually
            used in the generated kernel calls are derived from each kernel's indexing

    Returns:
        CUDA code as string
    """
Kernel = namedtuple('Kernel', ['name', 'constants', 'fields', 'call_parameters',
'call_argument_list', 'blocks', 'grid'])
kernels = []
includes = set()
for kernel in kernels_ast:
name = kernel.function_name
accessed_fields = {f.name: f for f in kernel.fields_accessed}
constants = []
fields = []
call_parameters = []
for p in kernel.get_parameters():
if not p.is_field_parameter:
constants.append((p.symbol.name, str(p.symbol.dtype)))
call_parameters.append(p.symbol.name)
else:
assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size"
field = accessed_fields[p.field_name]
dtype = str(get_base_type(p.symbol.dtype))
elements = prod(field.shape)
fields.append((p.field_name, dtype, elements))
call_parameters.append(p.field_name)
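        # Derive the launch configuration (block and grid sizes) from the
        # kernel's GPU indexing scheme and the common shape of its fields.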
common_shape = get_common_field(kernel.fields_accessed).shape
indexing = kernel.indexing
block_and_thread_numbers = indexing.call_parameters(common_shape)
block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])
kernels.append(Kernel(name=name, fields=fields, constants=constants, call_parameters=call_parameters,
call_argument_list=",".join(call_parameters),
blocks=block_and_thread_numbers['block'], grid=block_and_thread_numbers['grid']))
includes.add(name)
jinja_context = {
'kernels': kernels,
'includes': includes,
'timing': timing,
}
main = _env.get_template('gpu/main.c').render(**jinja_context)
    return main


def kernel_header(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
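    """Render the header (include guard plus function signature) for one kernel."""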
function_signature = generate_c(kernel_ast, dialect=dialect, signature_only=True)
header_guard = f'_{kernel_ast.function_name.upper()}_H'
jinja_context = {
'header_guard': header_guard,
'function_signature': function_signature,
'target': 'gpu'
}
header = _env.get_template('gpu/kernel.h').render(**jinja_context)
    return header


def kernel_source(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
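    """Render the CUDA source file for one kernel, including its required headers."""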
kernel_name = kernel_ast.function_name
function_source = generate_c(kernel_ast, dialect=dialect)
headers = {f'"{kernel_name}.h"', '<math.h>', '<stdint.h>'}
headers.update(get_headers(kernel_ast))
jinja_context = {
'function_source': function_source,
'headers': sorted(headers),
'timing': True,
'target': 'gpu'
}
source = _env.get_template('gpu/kernel.cu').render(**jinja_context)
return source
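

if __name__ == '__main__':
    # Minimal usage sketch, not part of the original module: build a trivial
    # GPU kernel on fixed-size fields and generate a benchmark project from it.
    # Assumes a pystencils 1.x style API (ps.fields, ps.Assignment,
    # ps.create_kernel with target=ps.Target.GPU); the benchmark requires
    # fixed loop sizes, hence the explicit field shape.
    import pystencils as ps

    src, dst = ps.fields("src, dst: double[1024, 1024]")
    update = ps.Assignment(dst.center, 2 * src.center)
    gpu_ast = ps.create_kernel(update, target=ps.Target.GPU)
    generate_benchmark_gpu(gpu_ast, path=Path('benchmark_gpu_example'))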