From c51eee431ec28df93b22265fe1cd6ac795197956 Mon Sep 17 00:00:00 2001
From: markus holzer <markus.holzer@fau.de>
Date: Sun, 20 Feb 2022 07:58:05 +0100
Subject: [PATCH] Added CUDA benchmarks

---
 pystencils_benchmark/__init__.py              |   1 +
 pystencils_benchmark/benchmark.py             |   8 +-
 pystencils_benchmark/benchmark_gpu.py         | 170 ++++++++++++++++++
 pystencils_benchmark/enums.py                 |   1 +
 pystencils_benchmark/templates/NVCC.mk        |  12 ++
 .../templates/{ => cpu}/kernel.c              |   0
 .../templates/{ => cpu}/kernel.h              |   0
 .../templates/{ => cpu}/main.c                |   0
 pystencils_benchmark/templates/gpu/kernel.cu  |   8 +
 pystencils_benchmark/templates/gpu/kernel.h   |  11 ++
 pystencils_benchmark/templates/gpu/main.c     |  67 ++++++++
 tests/test_benchmark.py                       |  17 +-
 12 files changed, 290 insertions(+), 5 deletions(-)
 create mode 100644 pystencils_benchmark/benchmark_gpu.py
 create mode 100644 pystencils_benchmark/templates/NVCC.mk
 rename pystencils_benchmark/templates/{ => cpu}/kernel.c (100%)
 rename pystencils_benchmark/templates/{ => cpu}/kernel.h (100%)
 rename pystencils_benchmark/templates/{ => cpu}/main.c (100%)
 create mode 100644 pystencils_benchmark/templates/gpu/kernel.cu
 create mode 100644 pystencils_benchmark/templates/gpu/kernel.h
 create mode 100644 pystencils_benchmark/templates/gpu/main.c

diff --git a/pystencils_benchmark/__init__.py b/pystencils_benchmark/__init__.py
index 6f5f32c..8142abe 100644
--- a/pystencils_benchmark/__init__.py
+++ b/pystencils_benchmark/__init__.py
@@ -1,2 +1,3 @@
 from .enums import Compiler
 from .benchmark import generate_benchmark, kernel_header, kernel_source
+from .benchmark_gpu import generate_benchmark_gpu
diff --git a/pystencils_benchmark/benchmark.py b/pystencils_benchmark/benchmark.py
index df6bd9b..3247800 100644
--- a/pystencils_benchmark/benchmark.py
+++ b/pystencils_benchmark/benchmark.py
@@ -6,7 +6,7 @@ from jinja2 import Environment, PackageLoader, StrictUndefined
 import numpy as np
 
 from pystencils.backends.cbackend import generate_c, get_headers
-from pystencils.astnodes import KernelFunction, PragmaBlock
+from pystencils.astnodes import KernelFunction
 from pystencils.enums import Backend
 from pystencils.typing import get_base_type
 from pystencils.sympyextensions import prod
@@ -159,7 +159,7 @@ def kernel_main(kernels_ast: List[KernelFunction], *,
         'likwid': likwid,
     }
 
-    main = _env.get_template('main.c').render(**jinja_context)
+    main = _env.get_template('cpu/main.c').render(**jinja_context)
     return main
 
 
@@ -172,7 +172,7 @@ def kernel_header(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> s
         'function_signature': function_signature,
     }
 
-    header = _env.get_template('kernel.h').render(**jinja_context)
+    header = _env.get_template('cpu/kernel.h').render(**jinja_context)
     return header
 
 
@@ -188,5 +188,5 @@ def kernel_source(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> s
         'timing': True,
     }
 
-    source = _env.get_template('kernel.c').render(**jinja_context)
+    source = _env.get_template('cpu/kernel.c').render(**jinja_context)
     return source
diff --git a/pystencils_benchmark/benchmark_gpu.py b/pystencils_benchmark/benchmark_gpu.py
new file mode 100644
index 0000000..67ce0dd
--- /dev/null
+++ b/pystencils_benchmark/benchmark_gpu.py
@@ -0,0 +1,170 @@
+from typing import Union, List
+from collections import namedtuple
+from pathlib import Path
+from jinja2 import Environment, PackageLoader, StrictUndefined
+
+from pystencils.backends.cbackend import generate_c, get_headers
+from pystencils.astnodes import KernelFunction
+from pystencils.enums import Backend
+from pystencils.typing import get_base_type
+from pystencils.sympyextensions import prod
+from pystencils.transformations import get_common_shape
+from pystencils.gpucuda import BlockIndexing
+
+from pystencils_benchmark.enums import Compiler
+
+_env = Environment(loader=PackageLoader('pystencils_benchmark'), undefined=StrictUndefined, keep_trailing_newline=True,
+                   trim_blocks=True, lstrip_blocks=True)
+
+
+def generate_benchmark_gpu(kernel_asts: Union[KernelFunction, List[KernelFunction]],
+                           path: Path = None,
+                           *,
+                           compiler: Compiler = Compiler.NVCC) -> None:
+    if path is None:
+        path = Path('.')
+    else:
+        path.mkdir(parents=True, exist_ok=True)
+    src_path = path / 'src'
+    src_path.mkdir(parents=True, exist_ok=True)
+    include_path = path / 'include'
+    include_path.mkdir(parents=True, exist_ok=True)
+
+    if isinstance(kernel_asts, KernelFunction):
+        kernel_asts = [kernel_asts]
+
+    for kernel_ast in kernel_asts:
+        kernel_name = kernel_ast.function_name
+
+        header = kernel_header(kernel_ast)
+        with open(include_path / f'{kernel_name}.h', 'w+') as f:
+            f.write(header)
+
+        source = kernel_source(kernel_ast)
+        with open(src_path / f'{kernel_name}.c', 'w+') as f:
+            f.write(source)
+
+    with open(src_path / 'main.c', 'w+') as f:
+        f.write(kernel_main(kernel_asts))
+
+    copy_static_files(path)
+    compiler_toolchain(path, compiler)
+
+
+def compiler_toolchain(path: Path, compiler: Compiler) -> None:
+    name = compiler.name
+    jinja_context = {
+        'compiler': name,
+    }
+
+    files = ['Makefile', f'{name}.mk']
+    for file_name in files:
+        with open(path / file_name, 'w+') as f:
+            template = _env.get_template(file_name).render(**jinja_context)
+            f.write(template)
+
+
+def copy_static_files(path: Path) -> None:
+    src_path = path / 'src'
+    src_path.mkdir(parents=True, exist_ok=True)
+    include_path = path / 'include'
+    include_path.mkdir(parents=True, exist_ok=True)
+
+    files = ['timing.h', 'timing.c']
+    for file_name in files:
+        template = _env.get_template(file_name).render()
+        if file_name[-1] == 'h':
+            target_path = include_path / file_name
+        elif file_name[-1] == 'c':
+            target_path = src_path / file_name
+        else:
+            target_path = path / file_name
+        with open(target_path, 'w+') as f:
+            f.write(template)
+
+
+def kernel_main(kernels_ast: List[KernelFunction], timing: bool = True, cuda_block_size: tuple = (32, 1, 1)):
+    """
+    Return C code of a benchmark program for the given kernel.
+
+    Args:
+        kernels_ast: a list of pystencils AST objects, as returned by create_kernel, one per kernel to benchmark
+        timing: add timing output to the code, prints time per iteration to stdout
+        cuda_block_size: intended CUDA block size (currently unused; block and grid sizes are taken from the kernel's indexing)
+    Returns:
+        C code as string
+    """
+    Kernel = namedtuple('Kernel', ['name', 'constants', 'fields', 'call_parameters',
+                                   'call_argument_list', 'blocks', 'grid'])
+    kernels = []
+    includes = set()
+    for kernel in kernels_ast:
+        name = kernel.function_name
+        accessed_fields = {f.name: f for f in kernel.fields_accessed}
+        constants = []
+        fields = []
+        call_parameters = []
+        block_and_thread_numbers = dict()
+        for p in kernel.get_parameters():
+            if not p.is_field_parameter:
+                constants.append((p.symbol.name, str(p.symbol.dtype)))
+                call_parameters.append(p.symbol.name)
+            else:
+                assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size"
+                field = accessed_fields[p.field_name]
+                dtype = str(get_base_type(p.symbol.dtype))
+                elements = prod(field.shape)
+
+                fields.append((p.field_name, dtype, elements))
+                call_parameters.append(p.field_name)
+
+        common_shape = get_common_shape(kernel.fields_accessed)
+        indexing = kernel.indexing
+        block_and_thread_numbers = indexing.call_parameters(common_shape)
+        block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
+        block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])
+
+        kernels.append(Kernel(name=name, fields=fields, constants=constants, call_parameters=call_parameters,
+                              call_argument_list=",".join(call_parameters),
+                              blocks=block_and_thread_numbers['block'], grid=block_and_thread_numbers['grid']))
+        includes.add(name)
+
+    jinja_context = {
+        'kernels': kernels,
+        'includes': includes,
+        'timing': timing,
+    }
+
+    main = _env.get_template('gpu/main.c').render(**jinja_context)
+    return main
+
+
+def kernel_header(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
+    function_signature = generate_c(kernel_ast, dialect=dialect, signature_only=True)
+    header_guard = f'_{kernel_ast.function_name.upper()}_H'
+
+    jinja_context = {
+        'header_guard': header_guard,
+        'function_signature': function_signature,
+        'target': 'gpu'
+    }
+
+    header = _env.get_template('gpu/kernel.h').render(**jinja_context)
+    return header
+
+
+def kernel_source(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
+    kernel_name = kernel_ast.function_name
+    function_source = generate_c(kernel_ast, dialect=dialect)
+    headers = {f'"{kernel_name}.h"', '<math.h>', '<stdint.h>'}
+    headers.update(get_headers(kernel_ast))
+
+    jinja_context = {
+        'function_source': function_source,
+        'headers': sorted(headers),
+        'timing': True,
+        'target': 'gpu'
+    }
+
+    source = _env.get_template('gpu/kernel.cu').render(**jinja_context)
+    return source
diff --git a/pystencils_benchmark/enums.py b/pystencils_benchmark/enums.py
index ec56c8a..84cf49e 100644
--- a/pystencils_benchmark/enums.py
+++ b/pystencils_benchmark/enums.py
@@ -6,3 +6,4 @@ class Compiler(Enum):
     GCCdebug = auto()
     Clang = auto()
     ICC = auto()
+    NVCC = auto()
diff --git a/pystencils_benchmark/templates/NVCC.mk b/pystencils_benchmark/templates/NVCC.mk
new file mode 100644
index 0000000..71010ab
--- /dev/null
+++ b/pystencils_benchmark/templates/NVCC.mk
@@ -0,0 +1,12 @@
+CC = nvcc
+LINKER = $(CC)
+
+# More warnings, please
+#CFLAGS += -Wfloat-equal -Wundef -Wshadow -Wpointer-arith -Wcast-align -Wstrict-overflow=5 -Wwrite-strings -Waggregate-return
+# Maybe too many warnings
+#CFLAGS += -Wcast-qual -Wswitch-default -Wconversion -Wunreachable-code
+# Specific C flags
+CFLAGS := -use_fast_math
+DEFINES = -D_GNU_SOURCE -DNDEBUG
+INCLUDES =
+LIBS =
diff --git a/pystencils_benchmark/templates/kernel.c b/pystencils_benchmark/templates/cpu/kernel.c
similarity index 100%
rename from pystencils_benchmark/templates/kernel.c
rename to pystencils_benchmark/templates/cpu/kernel.c
diff --git a/pystencils_benchmark/templates/kernel.h b/pystencils_benchmark/templates/cpu/kernel.h
similarity index 100%
rename from pystencils_benchmark/templates/kernel.h
rename to pystencils_benchmark/templates/cpu/kernel.h
diff --git a/pystencils_benchmark/templates/main.c b/pystencils_benchmark/templates/cpu/main.c
similarity index 100%
rename from pystencils_benchmark/templates/main.c
rename to pystencils_benchmark/templates/cpu/main.c
diff --git a/pystencils_benchmark/templates/gpu/kernel.cu b/pystencils_benchmark/templates/gpu/kernel.cu
new file mode 100644
index 0000000..973369f
--- /dev/null
+++ b/pystencils_benchmark/templates/gpu/kernel.cu
@@ -0,0 +1,8 @@
+{% for header in headers %}
+#include {{header}}
+{% endfor %}
+
+#define RESTRICT __restrict__
+#define FUNC_PREFIX __global__
+
+{{function_source}}
diff --git a/pystencils_benchmark/templates/gpu/kernel.h b/pystencils_benchmark/templates/gpu/kernel.h
new file mode 100644
index 0000000..102d9c9
--- /dev/null
+++ b/pystencils_benchmark/templates/gpu/kernel.h
@@ -0,0 +1,11 @@
+#ifndef {{header_guard}}
+#define {{header_guard}}
+
+
+
+#define RESTRICT __restrict__
+#define FUNC_PREFIX __global__
+
+{{function_signature}};
+
+#endif
diff --git a/pystencils_benchmark/templates/gpu/main.c b/pystencils_benchmark/templates/gpu/main.c
new file mode 100644
index 0000000..b2f3571
--- /dev/null
+++ b/pystencils_benchmark/templates/gpu/main.c
@@ -0,0 +1,67 @@
+#include <assert.h>
+#include <math.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "timing.h"
+
+#define RESTRICT __restrict__
+#define FUNC_PREFIX __global__
+#include <cuda_runtime.h>
+
+// kernels
+{% for include in includes %}
+#include "{{ include }}.h"
+{% endfor %}
+
+int main(int argc, char **argv)
+{
+    if (argc < 2) {
+        printf("Usage: %s <n_repeat>\n", argv[0]);
+        return -1;
+    }
+    int n_repeat = atoi(argv[1]);
+    {% for kernel in kernels %}
+
+    { // Kernel: {{kernel.name}}
+        {% for field_name, dataType, elements in kernel.fields %}
+        {{dataType}} *{{field_name}};
+        cudaMalloc(&{{field_name}}, {{elements}}*sizeof({{dataType}}));
+        cudaMemset({{field_name}}, 0, {{elements}}*sizeof({{dataType}})); // byte-wise zero fill; cudaMemset cannot write a float pattern such as 0.23
+        {% endfor %}
+
+        dim3 blocks({{kernel.blocks[0]}}, {{kernel.blocks[1]}}, {{kernel.blocks[2]}});
+        dim3 grid({{kernel.grid[0]}}, {{kernel.grid[1]}}, {{kernel.grid[2]}});
+
+        for (int warmup = 1; warmup >= 0; --warmup) {
+            int repeat = 2;
+            if (warmup == 0) {
+                repeat = n_repeat;
+            }
+
+            {% if timing %}
+            double wcStartTime, cpuStartTime, wcEndTime, cpuEndTime;
+            timing(&wcStartTime, &cpuStartTime);
+            {% endif %}
+
+            for (; repeat > 0; --repeat)
+            {
+                {{kernel.name}}<<<grid, blocks>>>({{kernel.call_argument_list}});
+            }
+
+            {% if timing %}
+            cudaDeviceSynchronize(); // kernel launches are asynchronous; wait for completion before stopping the clock
+            timing(&wcEndTime, &cpuEndTime);
+
+            if (warmup == 0)
+                printf("%s\t%e\n", "{{kernel.name}}", (wcEndTime - wcStartTime) / n_repeat);
+            {% endif %}
+        }
+        {% for field_name, dataType, elements in kernel.fields %}
+        cudaFree({{field_name}});
+        {% endfor %}
+    }
+    {% endfor %}
+}
diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
index 70af02e..4ecd4d8 100755
--- a/tests/test_benchmark.py
+++ b/tests/test_benchmark.py
@@ -6,7 +6,7 @@ import tempfile
 import pytest
 import pystencils as ps
 from pathlib import Path
 
-from pystencils_benchmark import generate_benchmark, Compiler
+from pystencils_benchmark import generate_benchmark, Compiler, generate_benchmark_gpu
 
 compilers = (Compiler.GCC, Compiler.GCCdebug, Compiler.Clang)
@@ -39,3 +39,18 @@ def test_generate(compiler, config_kwarg):
     subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True)
 
 
+def test_generate_gpu():
+    compiler = Compiler.NVCC
+    a, b, c = ps.fields(a=np.ones(4000000), b=np.ones(4000000), c=np.ones(4000000))
+
+    @ps.kernel_config(ps.CreateKernelConfig(target=ps.Target.GPU))
+    def vadd():
+        a[0] @= b[0] + c[0]
+    kernel_vadd = ps.create_kernel(**vadd)
+
+    temp_dir = Path(tempfile.mkdtemp())  # writable scratch directory instead of a hard-coded user path
+    generate_benchmark_gpu([kernel_vadd], temp_dir, compiler=compiler)
+    subprocess.run(['make', '-C', f'{temp_dir}'], check=True)
+    subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True)
+
+
--
GitLab
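
A minimal end-to-end usage sketch of the new entry point (not part of the patch; it mirrors test_generate_gpu above and assumes a CUDA device, nvcc on the PATH, and a pystencils version providing ps.Target.GPU; variable names are illustrative):

    import subprocess
    import tempfile
    from pathlib import Path

    import numpy as np
    import pystencils as ps
    from pystencils_benchmark import Compiler, generate_benchmark_gpu

    # A trivial vector-add kernel on three 1D fields.
    a, b, c = ps.fields(a=np.ones(4000000), b=np.ones(4000000), c=np.ones(4000000))

    @ps.kernel_config(ps.CreateKernelConfig(target=ps.Target.GPU))
    def vadd():
        a[0] @= b[0] + c[0]

    kernel_vadd = ps.create_kernel(**vadd)

    # Emit the benchmark project (src/, include/, Makefile, NVCC.mk) and build it.
    target_dir = Path(tempfile.mkdtemp())
    generate_benchmark_gpu([kernel_vadd], target_dir, compiler=Compiler.NVCC)
    subprocess.run(['make', '-C', str(target_dir)], check=True)

    # The binary takes the number of timed repetitions and prints one
    # "<kernel name>\t<seconds per call>" line per kernel.
    subprocess.run([str(target_dir / 'benchmark-NVCC'), '10'], check=True)

The binary name follows the benchmark-<compiler name> pattern already used by the CPU templates.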