# benchmark_gpu.py — generates standalone GPU benchmark harnesses for pystencils kernels.
from typing import Union, List
from collections import namedtuple
from pathlib import Path
from jinja2 import Environment, PackageLoader, StrictUndefined

from pystencils.backends.cbackend import generate_c, get_headers
from pystencils.astnodes import KernelFunction
from pystencils.enums import Backend
from pystencils.typing import get_base_type
from pystencils.sympyextensions import prod
from pystencils.transformations import get_common_field
# from pystencils.gpucuda import BlockIndexing

from pystencils_benchmark.enums import Compiler

# Shared Jinja2 environment for all generated files: templates are loaded from the
# pystencils_benchmark package; StrictUndefined makes a missing context variable
# raise at render time instead of silently producing empty output.
_env = Environment(loader=PackageLoader('pystencils_benchmark'), undefined=StrictUndefined, keep_trailing_newline=True,
                   trim_blocks=True, lstrip_blocks=True)


def generate_benchmark_gpu(kernel_asts: Union[KernelFunction, List[KernelFunction]],
                           path: Union[Path, None] = None,
                           *,
                           compiler: Compiler = Compiler.GCC,
                           timing: bool = True,
                           cuda_block_size: tuple = (32, 1, 1)
                           ) -> None:
    """Generate a self-contained GPU benchmark project for the given kernel(s).

    Writes one header and one ``.cu`` source per kernel, a ``main.cu`` driver,
    the static timing helpers and a Makefile toolchain into *path*.

    Args:
        kernel_asts: a single KernelFunction or a list of them to benchmark
        path: target directory (created if missing); defaults to the cwd
        compiler: toolchain selected in the generated Makefile
        timing: emit per-iteration timing output in the driver
        cuda_block_size: CUDA block dimensions forwarded to the driver
    """
    if path is None:
        path = Path('.')
    else:
        path.mkdir(parents=True, exist_ok=True)
    src_path = path / 'src'
    src_path.mkdir(parents=True, exist_ok=True)
    include_path = path / 'include'
    include_path.mkdir(parents=True, exist_ok=True)

    # Normalize so a single AST and a list are handled uniformly below.
    if isinstance(kernel_asts, KernelFunction):
        kernel_asts = [kernel_asts]

    for kernel_ast in kernel_asts:
        kernel_name = kernel_ast.function_name

        # Plain 'w' — these files are only written, never read back here.
        with open(include_path / f'{kernel_name}.h', 'w') as f:
            f.write(kernel_header(kernel_ast))

        # TODO CUDA specific suffix
        with open(src_path / f'{kernel_name}.cu', 'w') as f:
            f.write(kernel_source(kernel_ast))

    with open(src_path / 'main.cu', 'w') as f:
        f.write(kernel_main(kernel_asts,
                            timing=timing,
                            cuda_block_size=cuda_block_size))

    copy_static_files(path)
    compiler_toolchain(path, compiler)


def compiler_toolchain(path: Path, compiler: Compiler) -> None:
    """Render the top-level Makefile and the *compiler*-specific include into *path*."""
    context = {
        'compiler': compiler.name,
        'likwid': False,
    }

    for template_name in ('Makefile', f'{compiler.name}.mk'):
        rendered = _env.get_template(template_name).render(**context)
        with open(path / template_name, 'w+') as f:
            f.write(rendered)


def copy_static_files(path: Path) -> None:
    """Render the static timing helpers into ``src/`` and ``include/`` below *path*."""
    src_path = path / 'src'
    include_path = path / 'include'
    for directory in (src_path, include_path):
        directory.mkdir(parents=True, exist_ok=True)

    for file_name in ('timing.h', 'timing.c'):
        rendered = _env.get_template(file_name).render()
        if file_name.endswith('h'):
            target_path = include_path / file_name
        elif file_name.endswith('c'):
            # TODO CUDA specific suffix:
            target_path = (src_path / file_name).with_suffix('.cu')
        else:
            target_path = path / file_name
        with open(target_path, 'w+') as f:
            f.write(rendered)


def kernel_main(kernels_ast: List[KernelFunction], *, timing: bool = True, cuda_block_size: tuple):
    """
    Return C code of a benchmark program for the given kernels.

    Args:
        kernels_ast: A list of the pystencils AST objects as returned by create_kernel for benchmarking
        timing: add timing output to the code, prints time per iteration to stdout
        cuda_block_size: defines the cuda block grid.
            NOTE(review): currently unused — the launch configuration is taken
            from each kernel's own indexing; confirm whether this parameter
            should override it.
    Returns:
        C code as string
    """
    Kernel = namedtuple('Kernel', ['name', 'constants', 'fields', 'call_parameters',
                                   'call_argument_list', 'blocks', 'grid'])
    kernels = []
    includes = set()
    for kernel in kernels_ast:
        name = kernel.function_name
        accessed_fields = {f.name: f for f in kernel.fields_accessed}
        constants = []
        fields = []
        call_parameters = []
        for p in kernel.get_parameters():
            if not p.is_field_parameter:
                # Scalar parameter: declared as a constant in the driver.
                constants.append((p.symbol.name, str(p.symbol.dtype)))
                call_parameters.append(p.symbol.name)
            else:
                assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size"
                field = accessed_fields[p.field_name]
                dtype = str(get_base_type(p.symbol.dtype))
                elements = prod(field.shape)

                fields.append((p.field_name, dtype, elements))
                call_parameters.append(p.field_name)

        # The launch configuration depends only on the kernel, not on any single
        # parameter — hoisted out of the parameter loop (it used to be recomputed
        # on every iteration and left an empty dict, i.e. a KeyError below, for a
        # kernel without parameters).
        common_shape = get_common_field(kernel.fields_accessed).shape
        launch_config = kernel.indexing.call_parameters(common_shape)
        blocks = tuple(int(i) for i in launch_config['block'])
        grid = tuple(int(i) for i in launch_config['grid'])

        kernels.append(Kernel(name=name, fields=fields, constants=constants, call_parameters=call_parameters,
                              call_argument_list=",".join(call_parameters),
                              blocks=blocks, grid=grid))
        includes.add(name)

    jinja_context = {
        'kernels': kernels,
        'includes': includes,
        'timing': timing,
    }

    main = _env.get_template('gpu/main.c').render(**jinja_context)
    return main


def kernel_header(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
    """Render the include-guarded header declaring the kernel's function signature."""
    guard = f'_{kernel_ast.function_name.upper()}_H'
    signature = generate_c(kernel_ast, dialect=dialect, signature_only=True)

    context = {
        'header_guard': guard,
        'function_signature': signature,
        'target': 'gpu',
    }

    return _env.get_template('gpu/kernel.h').render(**context)


def kernel_source(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
    """Render the CUDA source file implementing the generated kernel."""
    name = kernel_ast.function_name
    function_source = generate_c(kernel_ast, dialect=dialect)

    # The kernel's own header plus whatever the generated code requires.
    required_headers = {f'"{name}.h"', '<math.h>', '<stdint.h>'}
    required_headers.update(get_headers(kernel_ast))

    context = {
        'function_source': function_source,
        'headers': sorted(required_headers),
        'timing': True,
        'target': 'gpu',
    }

    return _env.get_template('gpu/kernel.cu').render(**context)