From c51eee431ec28df93b22265fe1cd6ac795197956 Mon Sep 17 00:00:00 2001
From: markus holzer <markus.holzer@fau.de>
Date: Sun, 20 Feb 2022 07:58:05 +0100
Subject: [PATCH] Added CUDA benchmarks

---
 pystencils_benchmark/__init__.py              |   1 +
 pystencils_benchmark/benchmark.py             |   8 +-
 pystencils_benchmark/benchmark_gpu.py         | 170 ++++++++++++++++++
 pystencils_benchmark/enums.py                 |   1 +
 pystencils_benchmark/templates/NVCC.mk        |  12 ++
 .../templates/{ => cpu}/kernel.c              |   0
 .../templates/{ => cpu}/kernel.h              |   0
 .../templates/{ => cpu}/main.c                |   0
 pystencils_benchmark/templates/gpu/kernel.cu  |   8 +
 pystencils_benchmark/templates/gpu/kernel.h   |  11 ++
 pystencils_benchmark/templates/gpu/main.c     |  66 +++++++
 tests/test_benchmark.py                       |  17 +-
 12 files changed, 289 insertions(+), 5 deletions(-)
 create mode 100644 pystencils_benchmark/benchmark_gpu.py
 create mode 100644 pystencils_benchmark/templates/NVCC.mk
 rename pystencils_benchmark/templates/{ => cpu}/kernel.c (100%)
 rename pystencils_benchmark/templates/{ => cpu}/kernel.h (100%)
 rename pystencils_benchmark/templates/{ => cpu}/main.c (100%)
 create mode 100644 pystencils_benchmark/templates/gpu/kernel.cu
 create mode 100644 pystencils_benchmark/templates/gpu/kernel.h
 create mode 100644 pystencils_benchmark/templates/gpu/main.c

diff --git a/pystencils_benchmark/__init__.py b/pystencils_benchmark/__init__.py
index 6f5f32c..8142abe 100644
--- a/pystencils_benchmark/__init__.py
+++ b/pystencils_benchmark/__init__.py
@@ -1,2 +1,3 @@
 from .enums import Compiler
 from .benchmark import generate_benchmark, kernel_header, kernel_source
+from .benchmark_gpu import generate_benchmark_gpu
diff --git a/pystencils_benchmark/benchmark.py b/pystencils_benchmark/benchmark.py
index df6bd9b..3247800 100644
--- a/pystencils_benchmark/benchmark.py
+++ b/pystencils_benchmark/benchmark.py
@@ -6,7 +6,7 @@ from jinja2 import Environment, PackageLoader, StrictUndefined
 import numpy as np
 
 from pystencils.backends.cbackend import generate_c, get_headers
-from pystencils.astnodes import KernelFunction, PragmaBlock
+from pystencils.astnodes import KernelFunction
 from pystencils.enums import Backend
 from pystencils.typing import get_base_type
 from pystencils.sympyextensions import prod
@@ -159,7 +159,7 @@ def kernel_main(kernels_ast: List[KernelFunction], *,
         'likwid': likwid,
     }
 
-    main = _env.get_template('main.c').render(**jinja_context)
+    main = _env.get_template('cpu/main.c').render(**jinja_context)
     return main
 
 
@@ -172,7 +172,7 @@ def kernel_header(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> s
         'function_signature': function_signature,
     }
 
-    header = _env.get_template('kernel.h').render(**jinja_context)
+    header = _env.get_template('cpu/kernel.h').render(**jinja_context)
     return header
 
 
@@ -188,5 +188,5 @@ def kernel_source(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> s
         'timing': True,
     }
 
-    source = _env.get_template('kernel.c').render(**jinja_context)
+    source = _env.get_template('cpu/kernel.c').render(**jinja_context)
     return source
diff --git a/pystencils_benchmark/benchmark_gpu.py b/pystencils_benchmark/benchmark_gpu.py
new file mode 100644
index 0000000..67ce0dd
--- /dev/null
+++ b/pystencils_benchmark/benchmark_gpu.py
@@ -0,0 +1,170 @@
+from typing import Union, List
+from collections import namedtuple
+from pathlib import Path
+from jinja2 import Environment, PackageLoader, StrictUndefined
+
+from pystencils.backends.cbackend import generate_c, get_headers
+from pystencils.astnodes import KernelFunction
+from pystencils.enums import Backend
+from pystencils.data_types import get_base_type
+from pystencils.sympyextensions import prod
+from pystencils.transformations import get_common_shape
+from pystencils.gpucuda import BlockIndexing
+
+from pystencils_benchmark.enums import Compiler
+
+_env = Environment(loader=PackageLoader('pystencils_benchmark'), undefined=StrictUndefined, keep_trailing_newline=True,
+                   trim_blocks=True, lstrip_blocks=True)
+
+
+def generate_benchmark_gpu(kernel_asts: Union[KernelFunction, List[KernelFunction]],
+                           path: Path = None,
+                           *,
+                           compiler: Compiler = Compiler.GCC) -> None:
+    if path is None:
+        path = Path('.')
+    else:
+        path.mkdir(parents=True, exist_ok=True)
+    src_path = path / 'src'
+    src_path.mkdir(parents=True, exist_ok=True)
+    include_path = path / 'include'
+    include_path.mkdir(parents=True, exist_ok=True)
+
+    if isinstance(kernel_asts, KernelFunction):
+        kernel_asts = [kernel_asts]
+
+    for kernel_ast in kernel_asts:
+        kernel_name = kernel_ast.function_name
+
+        header = kernel_header(kernel_ast)
+        with open(include_path / f'{kernel_name}.h', 'w+') as f:
+            f.write(header)
+
+        source = kernel_source(kernel_ast)
+        with open(src_path / f'{kernel_name}.c', 'w+') as f:
+            f.write(source)
+
+    with open(src_path / 'main.c', 'w+') as f:
+        f.write(kernel_main(kernel_asts))
+
+    copy_static_files(path)
+    compiler_toolchain(path, compiler)
+
+
+def compiler_toolchain(path: Path, compiler: Compiler) -> None:
+    name = compiler.name
+    jinja_context = {
+        'compiler': name,
+    }
+
+    files = ['Makefile', f'{name}.mk']
+    for file_name in files:
+        with open(path / file_name, 'w+') as f:
+            template = _env.get_template(file_name).render(**jinja_context)
+            f.write(template)
+
+
+def copy_static_files(path: Path) -> None:
+    src_path = path / 'src'
+    src_path.mkdir(parents=True, exist_ok=True)
+    include_path = path / 'include'
+    include_path.mkdir(parents=True, exist_ok=True)
+
+    files = ['timing.h', 'timing.c']
+    for file_name in files:
+        template = _env.get_template(file_name).render()
+        if file_name[-1] == 'h':
+            target_path = include_path / file_name
+        elif file_name[-1] == 'c':
+            target_path = src_path / file_name
+        else:
+            target_path = path / file_name
+        with open(target_path, 'w+') as f:
+            f.write(template)
+
+
+def kernel_main(kernels_ast: List[KernelFunction], timing: bool = True, cuda_block_size: tuple = (32, 1, 1)):
+    """
+    Return C code of a benchmark program for the given kernel.
+
+    Args:
+        kernels_ast: A list of the pystencils AST object as returned by create_kernel for benchmarking
+        timing: add timing output to the code, prints time per iteration to stdout
+        cuda_block_size: defines the cuda block grid
+    Returns:
+        C code as string
+    """
+    Kernel = namedtuple('Kernel', ['name', 'constants', 'fields', 'call_parameters',
+                                   'call_argument_list', 'blocks', 'grid'])
+    kernels = []
+    includes = set()
+    for kernel in kernels_ast:
+        name = kernel.function_name
+        accessed_fields = {f.name: f for f in kernel.fields_accessed}
+        constants = []
+        fields = []
+        call_parameters = []
+        block_and_thread_numbers = dict()
+        for p in kernel.get_parameters():
+            if not p.is_field_parameter:
+                constants.append((p.symbol.name, str(p.symbol.dtype)))
+                call_parameters.append(p.symbol.name)
+            else:
+                assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size"
+                field = accessed_fields[p.field_name]
+                dtype = str(get_base_type(p.symbol.dtype))
+                elements = prod(field.shape)
+
+                fields.append((p.field_name, dtype, elements))
+                call_parameters.append(p.field_name)
+
+            common_shape = get_common_shape(kernel.fields_accessed)
+            indexing = kernel.indexing
+            block_and_thread_numbers = indexing.call_parameters(common_shape)
+            block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
+            block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])
+
+        kernels.append(Kernel(name=name, fields=fields, constants=constants, call_parameters=call_parameters,
+                              call_argument_list=",".join(call_parameters),
+                              blocks=block_and_thread_numbers['block'], grid=block_and_thread_numbers['grid']))
+        includes.add(name)
+
+    jinja_context = {
+        'kernels': kernels,
+        'includes': includes,
+        'timing': timing,
+    }
+
+    main = _env.get_template('gpu/main.c').render(**jinja_context)
+    return main
+
+
+def kernel_header(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
+    function_signature = generate_c(kernel_ast, dialect=dialect, signature_only=True)
+    header_guard = f'_{kernel_ast.function_name.upper()}_H'
+
+    jinja_context = {
+        'header_guard': header_guard,
+        'function_signature': function_signature,
+        'target': 'gpu'
+    }
+
+    header = _env.get_template('gpu/kernel.h').render(**jinja_context)
+    return header
+
+
+def kernel_source(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
+    kernel_name = kernel_ast.function_name
+    function_source = generate_c(kernel_ast, dialect=dialect)
+    headers = {f'"{kernel_name}.h"', '<math.h>', '<stdint.h>'}
+    headers.update(get_headers(kernel_ast))
+
+    jinja_context = {
+        'function_source': function_source,
+        'headers': sorted(headers),
+        'timing': True,
+        'target': 'gpu'
+    }
+
+    source = _env.get_template('gpu/kernel.cu').render(**jinja_context)
+    return source
diff --git a/pystencils_benchmark/enums.py b/pystencils_benchmark/enums.py
index ec56c8a..84cf49e 100644
--- a/pystencils_benchmark/enums.py
+++ b/pystencils_benchmark/enums.py
@@ -6,3 +6,4 @@ class Compiler(Enum):
     GCCdebug = auto()
     Clang = auto()
     ICC = auto()
+    NVCC = auto()
diff --git a/pystencils_benchmark/templates/NVCC.mk b/pystencils_benchmark/templates/NVCC.mk
new file mode 100644
index 0000000..71010ab
--- /dev/null
+++ b/pystencils_benchmark/templates/NVCC.mk
@@ -0,0 +1,12 @@
+CC  = nvcc
+LINKER = $(CC)
+
+# More warning pls
+#CFLAGS   += -Wfloat-equal -Wundef -Wshadow -Wpointer-arith -Wcast-align  -Wstrict-overflow=5 -Wwrite-strings -Waggregate-return
+# Maybe too much warnings
+#CFLAGS   += -Wcast-qual -Wswitch-default -Wconversion -Wunreachable-code
+# Specific C flags
+CFLAGS   := -use_fast_math
+DEFINES  = -D_GNU_SOURCE -DNDEBUG
+INCLUDES =
+LIBS     =
diff --git a/pystencils_benchmark/templates/kernel.c b/pystencils_benchmark/templates/cpu/kernel.c
similarity index 100%
rename from pystencils_benchmark/templates/kernel.c
rename to pystencils_benchmark/templates/cpu/kernel.c
diff --git a/pystencils_benchmark/templates/kernel.h b/pystencils_benchmark/templates/cpu/kernel.h
similarity index 100%
rename from pystencils_benchmark/templates/kernel.h
rename to pystencils_benchmark/templates/cpu/kernel.h
diff --git a/pystencils_benchmark/templates/main.c b/pystencils_benchmark/templates/cpu/main.c
similarity index 100%
rename from pystencils_benchmark/templates/main.c
rename to pystencils_benchmark/templates/cpu/main.c
diff --git a/pystencils_benchmark/templates/gpu/kernel.cu b/pystencils_benchmark/templates/gpu/kernel.cu
new file mode 100644
index 0000000..973369f
--- /dev/null
+++ b/pystencils_benchmark/templates/gpu/kernel.cu
@@ -0,0 +1,8 @@
+{% for header in headers %}
+#include {{header}}
+{% endfor %}
+
+#define RESTRICT __restrict__
+#define FUNC_PREFIX __global__
+
+{{function_source}}
diff --git a/pystencils_benchmark/templates/gpu/kernel.h b/pystencils_benchmark/templates/gpu/kernel.h
new file mode 100644
index 0000000..102d9c9
--- /dev/null
+++ b/pystencils_benchmark/templates/gpu/kernel.h
@@ -0,0 +1,11 @@
+#ifndef {{header_guard}}
+#define {{header_guard}}
+
+
+
+#define RESTRICT __restrict__
+#define FUNC_PREFIX __global__
+
+{{function_signature}};
+
+#endif
diff --git a/pystencils_benchmark/templates/gpu/main.c b/pystencils_benchmark/templates/gpu/main.c
new file mode 100644
index 0000000..b2f3571
--- /dev/null
+++ b/pystencils_benchmark/templates/gpu/main.c
@@ -0,0 +1,66 @@
+#include <assert.h>
+#include <math.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "timing.h"
+
+#define RESTRICT __restrict__
+#define FUNC_PREFIX __global__
+#include <cuda_runtime.h>
+
+//kernels
+{% for include in includes %}
+#include "{{ include }}.h"
+{% endfor %}
+
+int main(int argc, char **argv)
+{
+    if(argc < 2) {
+        printf("Usage: %s <n_repeat>\n", argv[0]);
+        return -1;
+    }
+    int n_repeat = atoi(argv[1]);
+    {% for kernel in kernels %}
+
+    { // Kernel: {{kernel.name}}
+        {% for field_name, dataType, elements in kernel.fields %}
+          {{dataType}} *{{field_name}};
+          cudaMalloc(&{{field_name}}, {{elements}}*sizeof({{dataType}}));
+          cudaMemset({{field_name}},     0.23, {{elements}});
+        {% endfor %}
+
+        dim3 blocks({{kernel.blocks[0]}}, {{kernel.blocks[1]}}, {{kernel.blocks[2]}});
+        dim3 grid({{kernel.grid[0]}}, {{kernel.grid[1]}}, {{kernel.grid[2]}});
+
+        for(int warmup = 1; warmup >= 0; --warmup) {
+            int repeat = 2;
+            if(warmup == 0) {
+                repeat = n_repeat;
+            }
+
+            {% if timing %}
+            double wcStartTime, cpuStartTime, wcEndTime, cpuEndTime;
+            timing(&wcStartTime, &cpuStartTime);
+            {% endif %}
+
+            for (; repeat > 0; --repeat)
+            {
+                {{kernel.name}}<<<grid, blocks>>>({{kernel.call_argument_list}});
+            }
+
+            {% if timing %}
+            timing(&wcEndTime, &cpuEndTime);
+
+            if( warmup == 0)
+                printf("%s\t%e\n", "{{kernel.name}}",(wcEndTime - wcStartTime) / n_repeat );
+            {% endif %}
+        }
+        {% for field_name, dataType, elements in kernel.fields %}
+        cudaFree({{field_name}});
+        {% endfor %}
+    }
+    {% endfor %}
+}
diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
index 70af02e..4ecd4d8 100755
--- a/tests/test_benchmark.py
+++ b/tests/test_benchmark.py
@@ -6,7 +6,7 @@ import tempfile
 import pytest
 import pystencils as ps
 from pathlib import Path
-from pystencils_benchmark import generate_benchmark, Compiler
+from pystencils_benchmark import generate_benchmark, Compiler, generate_benchmark_gpu
 
 
 compilers = (Compiler.GCC, Compiler.GCCdebug, Compiler.Clang)
@@ -39,3 +39,18 @@ def test_generate(compiler, config_kwarg):
         subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True)
 
 
+def test_generate_gpu():
+    compiler = Compiler.NVCC
+    a, b, c = ps.fields(a=np.ones(4000000), b=np.ones(4000000), c=np.ones(4000000))
+
+    @ps.kernel_config(ps.CreateKernelConfig(target=ps.Target.GPU))
+    def vadd():
+        a[0] @= b[0] + c[0]
+    kernel_vadd = ps.create_kernel(**vadd)
+
+    temp_dir = Path('/home/markus/pystencils_benchmark_testfolder')
+    generate_benchmark_gpu([kernel_vadd], temp_dir, compiler=compiler)
+    subprocess.run(['make', '-C', f'{temp_dir}'], check=True)
+    subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True)
+
+
-- 
GitLab