from typing import Optional, Union, List
from collections import namedtuple
from pathlib import Path

from jinja2 import Environment, PackageLoader, StrictUndefined

from pystencils.backends.cbackend import generate_c, get_headers
from pystencils.astnodes import KernelFunction
from pystencils.enums import Backend
from pystencils.data_types import get_base_type
from pystencils.sympyextensions import prod
from pystencils.transformations import get_common_shape
from pystencils.gpucuda import BlockIndexing
from pystencils_benchmark.enums import Compiler

# Shared template environment: templates ship inside the pystencils_benchmark
# package; StrictUndefined makes a missing context variable a hard error
# instead of rendering silently as empty text.
_env = Environment(loader=PackageLoader('pystencils_benchmark'),
                   undefined=StrictUndefined,
                   keep_trailing_newline=True,
                   trim_blocks=True,
                   lstrip_blocks=True)


def generate_benchmark_gpu(kernel_asts: Union[KernelFunction, List[KernelFunction]],
                           path: Optional[Path] = None,
                           *,
                           compiler: Compiler = Compiler.GCC) -> None:
    """Write a complete, self-contained GPU benchmark project for the given kernel(s).

    Creates ``src/`` and ``include/`` under *path*, emits one ``.cu`` source and one
    ``.h`` header per kernel, a ``main.cu`` driver, the static timing helpers, and a
    Makefile toolchain for *compiler*.

    Args:
        kernel_asts: a single pystencils :class:`KernelFunction` or a list of them.
        path: output directory; defaults to the current working directory.
        compiler: which compiler's Makefile include to generate.
    """
    if path is None:
        path = Path('.')
    else:
        path.mkdir(parents=True, exist_ok=True)
    src_path = path / 'src'
    src_path.mkdir(parents=True, exist_ok=True)
    include_path = path / 'include'
    include_path.mkdir(parents=True, exist_ok=True)

    # Normalize the single-kernel case so the loop below handles both forms.
    if isinstance(kernel_asts, KernelFunction):
        kernel_asts = [kernel_asts]

    for kernel_ast in kernel_asts:
        kernel_name = kernel_ast.function_name

        header = kernel_header(kernel_ast)
        with open(include_path / f'{kernel_name}.h', 'w+') as f:
            f.write(header)

        source = kernel_source(kernel_ast)
        # TODO CUDA specific suffix
        with open(src_path / f'{kernel_name}.cu', 'w+') as f:
            f.write(source)

    with open(src_path / 'main.cu', 'w+') as f:
        f.write(kernel_main(kernel_asts))

    copy_static_files(path)
    compiler_toolchain(path, compiler)


def compiler_toolchain(path: Path, compiler: Compiler) -> None:
    """Render the Makefile and the compiler-specific ``<name>.mk`` include into *path*."""
    name = compiler.name
    jinja_context = {
        'compiler': name,
    }
    files = ['Makefile', f'{name}.mk']
    for file_name in files:
        with open(path / file_name, 'w+') as f:
            template = _env.get_template(file_name).render(**jinja_context)
            f.write(template)


def copy_static_files(path: Path) -> None:
    """Render the static timing helpers into the project tree.

    ``timing.h`` goes to ``include/``; ``timing.c`` goes to ``src/`` but is renamed
    to ``timing.cu`` so the CUDA toolchain picks it up.
    """
    src_path = path / 'src'
    src_path.mkdir(parents=True, exist_ok=True)
    include_path = path / 'include'
    include_path.mkdir(parents=True, exist_ok=True)
    files = ['timing.h', 'timing.c']
    for file_name in files:
        template = _env.get_template(file_name).render()
        # Route by the last character of the file name ('h' → header, 'c' → source).
        if file_name[-1] == 'h':
            target_path = include_path / file_name
        elif file_name[-1] == 'c':
            target_path = src_path / file_name
            # TODO CUDA specific suffix
            target_path = target_path.with_suffix('.cu')
        else:
            target_path = path / file_name
        with open(target_path, 'w+') as f:
            f.write(template)


def kernel_main(kernels_ast: List[KernelFunction], timing: bool = True,
                cuda_block_size: tuple = (32, 1, 1)) -> str:
    """
    Return C code of a benchmark program for the given kernel.

    Args:
        kernels_ast: A list of the pystencils AST object as returned by create_kernel
            for benchmarking
        timing: add timing output to the code, prints time per iteration to stdout
        cuda_block_size: defines the cuda block grid
            NOTE(review): currently unused — the launch configuration is taken from
            each kernel's own indexing; kept for interface compatibility.
    Returns:
        C code as string
    """
    Kernel = namedtuple('Kernel', ['name', 'constants', 'fields', 'call_parameters',
                                   'call_argument_list', 'blocks', 'grid'])
    kernels = []
    includes = set()
    for kernel in kernels_ast:
        name = kernel.function_name
        accessed_fields = {f.name: f for f in kernel.fields_accessed}
        constants = []
        fields = []
        call_parameters = []
        for p in kernel.get_parameters():
            if not p.is_field_parameter:
                # Scalar parameter: passed through by name with its dtype.
                constants.append((p.symbol.name, str(p.symbol.dtype)))
                call_parameters.append(p.symbol.name)
            else:
                # Only plain field pointers are supported; shape/stride parameters
                # would mean the loop size is not fixed at generation time.
                assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size"
                field = accessed_fields[p.field_name]
                dtype = str(get_base_type(p.symbol.dtype))
                elements = prod(field.shape)
                fields.append((p.field_name, dtype, elements))
                call_parameters.append(p.field_name)

        # Derive the CUDA launch configuration from the kernel's own indexing
        # over the common iteration shape of all accessed fields.
        common_shape = get_common_shape(kernel.fields_accessed)
        indexing = kernel.indexing
        block_and_thread_numbers = indexing.call_parameters(common_shape)
        # Coerce sympy integers to plain ints so the template renders clean literals.
        block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
        block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])

        kernels.append(Kernel(name=name, fields=fields, constants=constants,
                              call_parameters=call_parameters,
                              call_argument_list=",".join(call_parameters),
                              blocks=block_and_thread_numbers['block'],
                              grid=block_and_thread_numbers['grid']))
        includes.add(name)

    jinja_context = {
        'kernels': kernels,
        'includes': includes,
        'timing': timing,
    }
    main = _env.get_template('gpu/main.c').render(**jinja_context)
    return main


def kernel_header(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
    """Return the header file contents (include guard + signature) for one kernel."""
    function_signature = generate_c(kernel_ast, dialect=dialect, signature_only=True)
    header_guard = f'_{kernel_ast.function_name.upper()}_H'

    jinja_context = {
        'header_guard': header_guard,
        'function_signature': function_signature,
        'target': 'gpu'
    }
    header = _env.get_template('gpu/kernel.h').render(**jinja_context)
    return header


def kernel_source(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
    """Return the ``.cu`` source file contents for one kernel, including its headers."""
    kernel_name = kernel_ast.function_name
    function_source = generate_c(kernel_ast, dialect=dialect)
    headers = {f'"{kernel_name}.h"', '<math.h>', '<stdint.h>'}
    headers.update(get_headers(kernel_ast))

    jinja_context = {
        'function_source': function_source,
        'headers': sorted(headers),
        'timing': True,
        'target': 'gpu'
    }
    source = _env.get_template('gpu/kernel.cu').render(**jinja_context)
    return source