benchmark.py (5.20 KiB)

Christoph Alt authored:
For now the only way to set the cuda_block_size is to pass it in the `ps.KernelConfig` to the generated kernel function.
from typing import Union, List
from collections import namedtuple
from pathlib import Path
from pystencils.astnodes import KernelFunction
from pystencils.enums import Backend
from pystencils.typing import get_base_type
from pystencils.sympyextensions import prod
from pystencils.transformations import get_common_field
from pystencils_benchmark.common import (_env,
_kernel_source,
_kernel_header,
compiler_toolchain,
copy_static_files,
setup_directories)
from pystencils_benchmark.enums import Compiler
def _add_launch_bound(code: str, launch_bounds: tuple) -> str:
lb_str = f"__launch_bounds__({', '.join(str(lb) for lb in launch_bounds)}) "
splitted = code.split("void ")
prefix = splitted[0]
if code.startswith("void "):
# just in case that there is nothing before the first void
prefix = ""
return prefix + "void " + lb_str + "void ".join(splitted[1:])
def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]],
                       path: Path = None,
                       *,
                       compiler: Compiler = Compiler.NVCC,
                       timing: bool = True,
                       launch_bounds: tuple = None,
                       ) -> None:
    """Emit a self-contained CUDA benchmark project for the given kernel(s).

    For every kernel AST a header and a ``.cu`` source file are written
    (optionally decorated with ``__launch_bounds__``), followed by a
    ``main.cu`` driver, the static support files and the compiler toolchain
    configuration.

    Args:
        kernel_asts: a single pystencils kernel AST or a list of them
        path: target directory (forwarded to ``setup_directories``)
        compiler: compiler used for the generated toolchain files
        timing: forwarded to ``kernel_main`` to emit timing output
        launch_bounds: if given, inserted into each kernel declaration
    """
    src_path, include_path = setup_directories(path)

    kernel_list = [kernel_asts] if isinstance(kernel_asts, KernelFunction) else kernel_asts

    for ast in kernel_list:
        name = ast.function_name

        header_code = kernel_header(ast)
        if launch_bounds:
            header_code = _add_launch_bound(header_code, launch_bounds)
        with open(include_path / f'{name}.h', 'w+') as header_file:
            header_file.write(header_code)

        source_code = kernel_source(ast)
        if launch_bounds:
            source_code = _add_launch_bound(source_code, launch_bounds)
        # TODO CUDA specific suffix
        with open(src_path / f'{name}.cu', 'w+') as source_file:
            source_file.write(source_code)

    with open(src_path / 'main.cu', 'w+') as main_file:
        main_file.write(kernel_main(kernel_list, timing=timing))

    copy_static_files(path, source_file_suffix='.cu')
    compiler_toolchain(path, compiler, likwid=False)
def kernel_main(kernels_ast: List[KernelFunction], *, timing: bool = True):
    """
    Return C code of a benchmark program for the given kernel.

    Args:
        kernels_ast: A list of the pystencils AST object as returned by create_kernel for benchmarking
        timing: add timing output to the code, prints time per iteration to stdout

    Returns:
        C code as string
    """
    # Per-kernel record consumed by the jinja template below.
    Kernel = namedtuple('Kernel', ['name', 'constants', 'fields', 'call_parameters',
                                   'call_argument_list', 'blocks', 'grid'])
    kernels = []
    includes = set()
    for kernel in kernels_ast:
        name = kernel.function_name
        accessed_fields = {f.name: f for f in kernel.fields_accessed}
        constants = []
        fields = []
        call_parameters = []
        # Partition kernel parameters into scalar constants and field pointers.
        for p in kernel.get_parameters():
            if not p.is_field_parameter:
                constants.append((p.symbol.name, str(p.symbol.dtype)))
                call_parameters.append(p.symbol.name)
            else:
                assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size"
                field = accessed_fields[p.field_name]
                dtype = str(get_base_type(p.symbol.dtype))
                # Total element count so the template can allocate the buffer.
                elements = prod(field.shape)
                fields.append((p.field_name, dtype, elements))
                call_parameters.append(p.field_name)

        # Derive the CUDA launch configuration from the common field shape.
        # (The dead `block_and_thread_numbers = dict()` pre-initialization of
        # the original was removed — it was always overwritten here.)
        common_shape = get_common_field(kernel.fields_accessed).shape
        indexing = kernel.indexing
        block_and_thread_numbers = indexing.call_parameters(common_shape)
        block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
        block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])

        kernels.append(Kernel(name=name, fields=fields, constants=constants, call_parameters=call_parameters,
                              call_argument_list=",".join(call_parameters),
                              blocks=block_and_thread_numbers['block'], grid=block_and_thread_numbers['grid']))
        includes.add(name)

    jinja_context = {
        'kernels': kernels,
        'includes': includes,
        'timing': timing,
    }
    main = _env.get_template('gpu/main.c').render(**jinja_context)
    return main
def kernel_header(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
    """Render the C header declaring *kernel_ast* using the GPU header template."""
    extra_context = {'target': 'gpu'}
    return _kernel_header(kernel_ast,
                          dialect=dialect,
                          template_file='gpu/kernel.h',
                          additional_jinja_context=extra_context)
def kernel_source(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
    """Render the CUDA source implementing *kernel_ast* using the GPU kernel template."""
    extra_context = {'target': 'gpu'}
    return _kernel_source(kernel_ast,
                          dialect=dialect,
                          template_file='gpu/kernel.cu',
                          additional_jinja_context=extra_context)