# benchmark.py
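# Generates a standalone CUDA benchmark project (per-kernel sources and headers, a main.cu
# driver and a build setup) from one or more pystencils GPU kernel ASTs.
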
from typing import Union, List, Optional
from collections import namedtuple
from pathlib import Path

from pystencils.astnodes import KernelFunction
from pystencils.enums import Backend
from pystencils.typing import get_base_type
from pystencils.sympyextensions import prod
from pystencils.transformations import get_common_field

from pystencils_benchmark.common import (_env,
                                         _kernel_source,
                                         _kernel_header,
                                         compiler_toolchain,
                                         copy_static_files,
                                         setup_directories)
from pystencils_benchmark.enums import Compiler


def _add_launch_bound(code: str, launch_bounds: tuple) -> str:
    """Insert a CUDA __launch_bounds__ qualifier after the first 'void ' in the given code."""
    lb_str = f"__launch_bounds__({', '.join(str(lb) for lb in launch_bounds)}) "
    splitted = code.split("void ")
    # Keep everything before the first "void " (e.g. "__global__ ") as prefix;
    # it is empty when the code starts directly with "void ".
    prefix = splitted[0]
    return prefix + "void " + lb_str + "void ".join(splitted[1:])
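
# Hedged illustration of the rewrite performed by _add_launch_bound; the signature below is
# hypothetical, the real text comes from the pystencils code printer:
#   _add_launch_bound("__global__ void kernel(double * a)", (256,))
#   -> "__global__ void __launch_bounds__(256) kernel(double * a)"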


def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]],
                       path: Optional[Path] = None,
                       *,
                       compiler: Compiler = Compiler.NVCC,
                       timing: bool = True,
                       launch_bounds: Optional[tuple] = None,
                       ) -> None:

    src_path, include_path = setup_directories(path)

    if isinstance(kernel_asts, KernelFunction):
        kernel_asts = [kernel_asts]

    for kernel_ast in kernel_asts:
        kernel_name = kernel_ast.function_name

        header = kernel_header(kernel_ast)
        if launch_bounds:
            header = _add_launch_bound(header, launch_bounds)
        with open(include_path / f'{kernel_name}.h', 'w+') as f:
            f.write(header)

        source = kernel_source(kernel_ast)
        if launch_bounds:
            source = _add_launch_bound(source, launch_bounds)
        # TODO CUDA specific suffix
        with open(src_path / f'{kernel_name}.cu', 'w+') as f:
            f.write(source)

    with open(src_path / 'main.cu', 'w+') as f:
        f.write(kernel_main(kernel_asts, timing=timing))

    copy_static_files(path, source_file_suffix='.cu')
    compiler_toolchain(path, compiler, likwid=False)
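
# Minimal usage sketch (an assumption, not part of this module: it presumes a pystencils
# version whose create_kernel accepts target=Target.GPU; field and directory names are
# made up for illustration):
#
#   import pystencils as ps
#   src, dst = ps.fields("src, dst: double[2D]")
#   update = ps.Assignment(dst.center, 0.25 * (src[1, 0] + src[-1, 0] + src[0, 1] + src[0, -1]))
#   ast = ps.create_kernel(update, target=ps.Target.GPU)
#   generate_benchmark(ast, Path('gpu_benchmark'), compiler=Compiler.NVCC, launch_bounds=(256,))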


def kernel_main(kernels_ast: List[KernelFunction], *, timing: bool = True):
    """
    Return the CUDA source code of a benchmark main program for the given kernels.

    Args:
        kernels_ast: list of pystencils AST objects, as returned by create_kernel, that should be benchmarked
        timing: add timing output to the code; the time per iteration is printed to stdout
    Returns:
        CUDA C code as a string
    """
    Kernel = namedtuple('Kernel', ['name', 'constants', 'fields', 'call_parameters',
                                   'call_argument_list', 'blocks', 'grid'])
    kernels = []
    includes = set()
    for kernel in kernels_ast:
        name = kernel.function_name
        accessed_fields = {f.name: f for f in kernel.fields_accessed}
        constants = []
        fields = []
        call_parameters = []
        block_and_thread_numbers = dict()
        for p in kernel.get_parameters():
            if not p.is_field_parameter:
                constants.append((p.symbol.name, str(p.symbol.dtype)))
                call_parameters.append(p.symbol.name)
            else:
                assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size"
                field = accessed_fields[p.field_name]
                dtype = str(get_base_type(p.symbol.dtype))
                elements = prod(field.shape)

                fields.append((p.field_name, dtype, elements))
                call_parameters.append(p.field_name)

        # The launch configuration depends only on the common shape of the accessed fields,
        # so it is computed once per kernel rather than once per parameter.
        common_shape = get_common_field(kernel.fields_accessed).shape
        indexing = kernel.indexing
        block_and_thread_numbers = indexing.call_parameters(common_shape)
        block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
        block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])

        kernels.append(Kernel(name=name, fields=fields, constants=constants, call_parameters=call_parameters,
                              call_argument_list=",".join(call_parameters),
                              blocks=block_and_thread_numbers['block'], grid=block_and_thread_numbers['grid']))
        includes.add(name)

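    # Context handed to the gpu/main.c template: one entry per kernel with its field
    # allocations (name, dtype, element count), scalar constants, the flat argument list
    # for the kernel call and the CUDA launch configuration (blocks/grid).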
    jinja_context = {
        'kernels': kernels,
        'includes': includes,
        'timing': timing,
    }

    main = _env.get_template('gpu/main.c').render(**jinja_context)
    return main


def kernel_header(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
    return _kernel_header(kernel_ast,
                          dialect=dialect,
                          template_file='gpu/kernel.h',
                          additional_jinja_context={'target': 'gpu'})


def kernel_source(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
    return _kernel_source(kernel_ast,
                          dialect=dialect,
                          template_file='gpu/kernel.cu',
                          additional_jinja_context={'target': 'gpu'})