Compare revisions

Commits on Source (2)
Showing 416 additions and 77 deletions
.gitlab-ci.yml
@@ -26,6 +26,7 @@ test:
   image: i10git.cs.fau.de:5005/pycodegen/pystencils-benchmark/pystencils-benchmark
   tags:
     - docker
+    - cuda
   script:
     - pip install tox
     - echo $TOX_ENV
Dockerfile
-FROM ubuntu:latest
+FROM nvidia/cuda:12.1.1-devel-ubuntu22.04
 LABEL maintainer="jan.hoenig@fau.de"
-LABEL version="0.1"
+LABEL version="0.2"
 LABEL description="Custom docker image for pystencils-benchmark"
 ARG DEBIAN_FRONTEND=noninteractive
MANIFEST.in
 include README.md
 include LICENSE.md
 include pystencils_benchmark/templates/*
+include pystencils_benchmark/templates/cpu/*
+include pystencils_benchmark/templates/gpu/*
pystencils_benchmark/__init__.py
 from .enums import Compiler
-from .benchmark import generate_benchmark, kernel_header, kernel_source
+from . import gpu
+from . import cpu
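
The user-visible effect of this split: generate_benchmark is no longer exported from the package root, and callers go through the per-target submodules instead. A minimal before/after sketch (consistent with the updated test at the end of this diff):

import pystencils_benchmark as pb

# before this change:
#   from pystencils_benchmark import generate_benchmark
# after this change:
pb.cpu.generate_benchmark  # the former package-level (CPU) generator
pb.gpu.generate_benchmark  # the new CUDA generator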
pystencils_benchmark/common.py (new file)
from pystencils.backends.cbackend import generate_c, get_headers
from pystencils.astnodes import KernelFunction
from pystencils.enums import Backend

from jinja2 import Environment, PackageLoader, StrictUndefined

from pystencils_benchmark.enums import Compiler

from pathlib import Path

_env = Environment(loader=PackageLoader('pystencils_benchmark'),
                   undefined=StrictUndefined,
                   keep_trailing_newline=True,
                   trim_blocks=True, lstrip_blocks=True)


def _kernel_header(kernel_ast: KernelFunction,
                   dialect: Backend = Backend.C,
                   *,
                   template_file: str,
                   additional_jinja_context: dict = None) -> str:
    function_signature = generate_c(kernel_ast, dialect=dialect, signature_only=True)
    header_guard = f'_{kernel_ast.function_name.upper()}_H'
    jinja_context = {
        'header_guard': header_guard,
        'function_signature': function_signature,
    }
    if additional_jinja_context is not None:
        jinja_context.update(additional_jinja_context)
    header = _env.get_template(template_file).render(**jinja_context)
    return header


def _kernel_source(kernel_ast: KernelFunction,
                   dialect: Backend = Backend.C,
                   *,
                   template_file: str,
                   additional_jinja_context: dict = None) -> str:
    kernel_name = kernel_ast.function_name
    function_source = generate_c(kernel_ast, dialect=dialect)
    headers = {f'"{kernel_name}.h"', '<math.h>', '<stdint.h>'}
    headers.update(get_headers(kernel_ast))
    jinja_context = {
        'function_source': function_source,
        'headers': sorted(headers),
        'timing': True,
    }
    if additional_jinja_context is not None:
        jinja_context.update(additional_jinja_context)
    source = _env.get_template(template_file).render(**jinja_context)
    return source


def compiler_toolchain(path: Path, compiler: Compiler, likwid: bool) -> None:
    name = compiler.name
    jinja_context = {
        'compiler': name,
        'likwid': likwid,
    }
    files = ['Makefile', f'{name}.mk']
    for file_name in files:
        with open(path / file_name, 'w+') as f:
            template = _env.get_template(file_name).render(**jinja_context)
            f.write(template)


def copy_static_files(path: Path, *, source_file_suffix='.c') -> None:
    src_path = path / 'src'
    src_path.mkdir(parents=True, exist_ok=True)
    include_path = path / 'include'
    include_path.mkdir(parents=True, exist_ok=True)
    files = ['timing.h', 'timing.c']
    for file_name in files:
        template = _env.get_template(file_name).render()
        if file_name[-1] == 'h':
            target_path = include_path / file_name
        elif file_name[-1] == 'c':
            target_path = src_path / file_name
            target_path = target_path.with_suffix(source_file_suffix)
        else:
            target_path = path / file_name
        with open(target_path, 'w+') as f:
            f.write(template)


def setup_directories(path: Path):
    if path is None:
        path = Path('.')
    else:
        path.mkdir(parents=True, exist_ok=True)
    src_path = path / 'src'
    src_path.mkdir(parents=True, exist_ok=True)
    include_path = path / 'include'
    include_path.mkdir(parents=True, exist_ok=True)
    return src_path, include_path
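
A minimal usage sketch of the helpers above (the path is illustrative): setup_directories creates <path>/src and <path>/include and returns both, and copy_static_files can retarget the shared timing source for CUDA builds via source_file_suffix.

from pathlib import Path
from pystencils_benchmark.common import copy_static_files, setup_directories

src, inc = setup_directories(Path('bench'))                 # creates bench/src and bench/include
copy_static_files(Path('bench'), source_file_suffix='.cu')  # writes bench/include/timing.h and bench/src/timing.cu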
pystencils_benchmark/cpu/__init__.py (new file)
from .benchmark import generate_benchmark
pystencils_benchmark/cpu/benchmark.py (moved from pystencils_benchmark/benchmark.py)
 from typing import Union, List
 from collections import namedtuple
 from pathlib import Path

-from jinja2 import Environment, PackageLoader, StrictUndefined
 import numpy as np

 from pystencils.backends.cbackend import generate_c, get_headers
 from pystencils.astnodes import KernelFunction, PragmaBlock
 from pystencils.enums import Backend
 from pystencils.typing import get_base_type
 from pystencils.sympyextensions import prod
 from pystencils.integer_functions import modulo_ceil

+from pystencils_benchmark.common import (_env,
+                                         _kernel_source,
+                                         _kernel_header,
+                                         compiler_toolchain,
+                                         copy_static_files,
+                                         setup_directories)
 from pystencils_benchmark.enums import Compiler

-_env = Environment(loader=PackageLoader('pystencils_benchmark'), undefined=StrictUndefined, keep_trailing_newline=True,
-                   trim_blocks=True, lstrip_blocks=True)


 def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]],
                        path: Path = None,
                        *,
@@ -26,14 +26,8 @@ def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]],
                        timing: bool = True,
                        likwid: bool = False
                        ) -> None:
-    if path is None:
-        path = Path('.')
-    else:
-        path.mkdir(parents=True, exist_ok=True)
-    src_path = path / 'src'
-    src_path.mkdir(parents=True, exist_ok=True)
-    include_path = path / 'include'
-    include_path.mkdir(parents=True, exist_ok=True)
+    src_path, include_path = setup_directories(path)

     if isinstance(kernel_asts, KernelFunction):
         kernel_asts = [kernel_asts]
@@ -56,39 +50,6 @@ def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]],

     compiler_toolchain(path, compiler, likwid)


-def compiler_toolchain(path: Path, compiler: Compiler, likwid: bool) -> None:
-    name = compiler.name
-    jinja_context = {
-        'compiler': name,
-        'likwid': likwid,
-    }
-    files = ['Makefile', f'{name}.mk']
-    for file_name in files:
-        with open(path / file_name, 'w+') as f:
-            template = _env.get_template(file_name).render(**jinja_context)
-            f.write(template)
-
-
-def copy_static_files(path: Path) -> None:
-    src_path = path / 'src'
-    src_path.mkdir(parents=True, exist_ok=True)
-    include_path = path / 'include'
-    include_path.mkdir(parents=True, exist_ok=True)
-    files = ['timing.h', 'timing.c']
-    for file_name in files:
-        template = _env.get_template(file_name).render()
-        if file_name[-1] == 'h':
-            target_path = include_path / file_name
-        elif file_name[-1] == 'c':
-            target_path = src_path / file_name
-        else:
-            target_path = path / file_name
-        with open(target_path, 'w+') as f:
-            f.write(template)


 def kernel_main(kernels_ast: List[KernelFunction], *,
                 timing: bool = True, likwid: bool = False) -> str:
     """
@@ -159,34 +120,13 @@ def kernel_main(kernels_ast: List[KernelFunction], *,
         'likwid': likwid,
     }
-    main = _env.get_template('main.c').render(**jinja_context)
+    main = _env.get_template('cpu/main.c').render(**jinja_context)
     return main


 def kernel_header(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
-    function_signature = generate_c(kernel_ast, dialect=dialect, signature_only=True)
-    header_guard = f'_{kernel_ast.function_name.upper()}_H'
-    jinja_context = {
-        'header_guard': header_guard,
-        'function_signature': function_signature,
-    }
-    header = _env.get_template('kernel.h').render(**jinja_context)
-    return header
+    return _kernel_header(kernel_ast, dialect=dialect, template_file='cpu/kernel.h')


 def kernel_source(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
-    kernel_name = kernel_ast.function_name
-    function_source = generate_c(kernel_ast, dialect=dialect)
-    headers = {f'"{kernel_name}.h"', '<math.h>', '<stdint.h>'}
-    headers.update(get_headers(kernel_ast))
-    jinja_context = {
-        'function_source': function_source,
-        'headers': sorted(headers),
-        'timing': True,
-    }
-    source = _env.get_template('kernel.c').render(**jinja_context)
-    return source
+    return _kernel_source(kernel_ast, dialect=dialect, template_file='cpu/kernel.c')
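
For reference, an end-to-end sketch of the relocated CPU entry point (mirrors the updated test at the bottom of this diff; field size and path are illustrative):

import numpy as np
import pystencils as ps
import pystencils_benchmark as pb
from pathlib import Path

a, b = ps.fields(a=np.ones(1000), b=np.ones(1000))

@ps.kernel_config(ps.CreateKernelConfig())
def vcopy():
    a[0] @= b[0]

pb.cpu.generate_benchmark(ps.create_kernel(**vcopy), Path('bench-cpu'), compiler=pb.Compiler.GCC)
# then: make -C bench-cpu && ./bench-cpu/benchmark-GCC 10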
pystencils_benchmark/enums.py
@@ -6,3 +6,4 @@ class Compiler(Enum):
     GCCdebug = auto()
     Clang = auto()
     ICC = auto()
+    NVCC = auto()
pystencils_benchmark/gpu/__init__.py (new file)
from .benchmark import generate_benchmark
pystencils_benchmark/gpu/benchmark.py (new file)
from typing import Union, List
from collections import namedtuple
from pathlib import Path

from pystencils.astnodes import KernelFunction
from pystencils.enums import Backend
from pystencils.typing import get_base_type
from pystencils.sympyextensions import prod
from pystencils.transformations import get_common_field

from pystencils_benchmark.common import (_env,
                                         _kernel_source,
                                         _kernel_header,
                                         compiler_toolchain,
                                         copy_static_files,
                                         setup_directories)
from pystencils_benchmark.enums import Compiler


def _add_launch_bound(code: str, launch_bounds: tuple) -> str:
    lb_str = f"__launch_bounds__({', '.join(str(lb) for lb in launch_bounds)}) "
    splitted = code.split("void ")
    prefix = splitted[0]
    if code.startswith("void "):
        # in case there is nothing before the first "void"
        prefix = ""
    return prefix + "void " + lb_str + "void ".join(splitted[1:])
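
# Illustrative trace of the helper above (input string is hypothetical): only the
# first "void" gains the qualifier; later occurrences are rejoined untouched.
#
#   _add_launch_bound('FUNC_PREFIX void vadd(double * RESTRICT a)', (256, 2))
#   -> 'FUNC_PREFIX void __launch_bounds__(256, 2) vadd(double * RESTRICT a)'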
def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]],
                       path: Path = None,
                       *,
                       compiler: Compiler = Compiler.NVCC,
                       timing: bool = True,
                       launch_bounds: tuple = None,
                       ) -> None:
    src_path, include_path = setup_directories(path)

    if isinstance(kernel_asts, KernelFunction):
        kernel_asts = [kernel_asts]

    for kernel_ast in kernel_asts:
        kernel_name = kernel_ast.function_name

        header = kernel_header(kernel_ast)
        if launch_bounds:
            header = _add_launch_bound(header, launch_bounds)
        with open(include_path / f'{kernel_name}.h', 'w+') as f:
            f.write(header)

        source = kernel_source(kernel_ast)
        if launch_bounds:
            source = _add_launch_bound(source, launch_bounds)
        # TODO CUDA specific suffix
        with open(src_path / f'{kernel_name}.cu', 'w+') as f:
            f.write(source)

    with open(src_path / 'main.cu', 'w+') as f:
        f.write(kernel_main(kernel_asts, timing=timing))

    copy_static_files(path, source_file_suffix='.cu')
    compiler_toolchain(path, compiler, likwid=False)
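
# End-to-end sketch (mirrors test_generate_gpu at the bottom of this diff;
# path and bounds are illustrative):
#
#   ast = ps.create_kernel(...)   # built with target=ps.Target.GPU
#   generate_benchmark(ast, Path('bench-gpu'), launch_bounds=(256,))
#   # then: make -C bench-gpu && ./bench-gpu/benchmark-NVCC 10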
def kernel_main(kernels_ast: List[KernelFunction], *, timing: bool = True) -> str:
    """
    Return CUDA source for a benchmark program that runs the given kernels.

    Args:
        kernels_ast: list of pystencils AST objects, as returned by create_kernel, to benchmark
        timing: add timing output to the code, prints time per iteration to stdout

    Returns:
        CUDA code as string
    """
    Kernel = namedtuple('Kernel', ['name', 'constants', 'fields', 'call_parameters',
                                   'call_argument_list', 'blocks', 'grid'])
    kernels = []
    includes = set()
    for kernel in kernels_ast:
        name = kernel.function_name
        accessed_fields = {f.name: f for f in kernel.fields_accessed}
        constants = []
        fields = []
        call_parameters = []
        block_and_thread_numbers = dict()

        for p in kernel.get_parameters():
            if not p.is_field_parameter:
                constants.append((p.symbol.name, str(p.symbol.dtype)))
                call_parameters.append(p.symbol.name)
            else:
                assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size"
                field = accessed_fields[p.field_name]
                dtype = str(get_base_type(p.symbol.dtype))
                elements = prod(field.shape)

                fields.append((p.field_name, dtype, elements))
                call_parameters.append(p.field_name)

        common_shape = get_common_field(kernel.fields_accessed).shape
        indexing = kernel.indexing
        block_and_thread_numbers = indexing.call_parameters(common_shape)
        block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
        block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])

        kernels.append(Kernel(name=name, fields=fields, constants=constants, call_parameters=call_parameters,
                              call_argument_list=",".join(call_parameters),
                              blocks=block_and_thread_numbers['block'], grid=block_and_thread_numbers['grid']))
        includes.add(name)

    jinja_context = {
        'kernels': kernels,
        'includes': includes,
        'timing': timing,
    }
    main = _env.get_template('gpu/main.c').render(**jinja_context)
    return main


def kernel_header(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
    return _kernel_header(kernel_ast,
                          dialect=dialect,
                          template_file='gpu/kernel.h',
                          additional_jinja_context={'target': 'gpu'})


def kernel_source(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
    return _kernel_source(kernel_ast,
                          dialect=dialect,
                          template_file='gpu/kernel.cu',
                          additional_jinja_context={'target': 'gpu'})
pystencils_benchmark/templates/Makefile
@@ -29,6 +29,8 @@ LIBS += -llikwid
 VPATH = $(SRC_DIR)
 ASM = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c))
 OBJ = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.c))
+# TODO CUDA specific SUFFIX
+OBJ += $(patsubst $(SRC_DIR)/%.cu, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.cu))

 CFLAGS := $(CFLAGS) $(DEFINES) $(INCLUDES)
@@ -43,6 +45,12 @@ $(BUILD_DIR)/%.o: %.c
 	$(Q)$(CC) -c $(CFLAGS) $< -o $@
 	$(Q)$(CC) $(CFLAGS) -MT $(@:.d=.o) -MM $< > $(BUILD_DIR)/$*.d

+# TODO CUDA specific SUFFIX
+$(BUILD_DIR)/%.o: %.cu
+	@echo "===> COMPILE $@"
+	$(Q)$(CC) -c $(CFLAGS) $< -o $@
+	$(Q)$(CC) $(CFLAGS) -MT $(@:.d=.o) -MM $< > $(BUILD_DIR)/$*.d
+
 $(BUILD_DIR)/%.s: %.c
 	@echo "===> GENERATE ASM $@"
 	$(Q)$(CC) -S $(CFLAGS) $< -o $@
pystencils_benchmark/templates/NVCC.mk (new file)
CC = nvcc
LINKER = $(CC)

# More warnings, please
#CFLAGS += -Wfloat-equal -Wundef -Wshadow -Wpointer-arith -Wcast-align -Wstrict-overflow=5 -Wwrite-strings -Waggregate-return
# Maybe too many warnings
#CFLAGS += -Wcast-qual -Wswitch-default -Wconversion -Wunreachable-code

# Specific C flags
CFLAGS := -use_fast_math
DEFINES = -D_GNU_SOURCE -DNDEBUG
INCLUDES =
LIBS =
pystencils_benchmark/templates/gpu/kernel.cu (new file)
{% for header in headers %}
#include {{header}}
{% endfor %}

#define RESTRICT __restrict__
#define FUNC_PREFIX __global__

{{function_source}}
pystencils_benchmark/templates/gpu/kernel.h (new file)
#ifndef {{header_guard}}
#define {{header_guard}}

#define RESTRICT __restrict__
#define FUNC_PREFIX __global__

{{function_signature}};

#endif
pystencils_benchmark/templates/gpu/main.c (new file)
#include <assert.h>
#include <math.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include "timing.h"

#define RESTRICT __restrict__
#define FUNC_PREFIX __global__
#include <cuda_runtime.h>

// kernels
{% for include in includes %}
#include "{{ include }}.h"
{% endfor %}

int main(int argc, char **argv)
{
  if(argc < 2) {
    printf("Usage: %s <n_repeat>\n", argv[0]);
    return -1;
  }
  int n_repeat = atoi(argv[1]);

  {% for kernel in kernels %}
  { // Kernel: {{kernel.name}}
    {% for field_name, dataType, elements in kernel.fields %}
    {{dataType}} *{{field_name}};
    cudaMalloc(&{{field_name}}, {{elements}}*sizeof({{dataType}}));
    /* cudaMemset works byte-wise, so the buffer is zero-initialised over its
       full size; a 0.23 fill as in the CPU benchmark would need a host-side
       buffer plus cudaMemcpy. */
    cudaMemset({{field_name}}, 0, {{elements}}*sizeof({{dataType}}));
    {% endfor %}

    {% for constantName, dataType in kernel.constants %}
    // Constant {{constantName}}
    {{dataType}} {{constantName}};
    {{constantName}} = 0.23;
    {% endfor %}

    dim3 blocks({{kernel.blocks[0]}}, {{kernel.blocks[1]}}, {{kernel.blocks[2]}});
    dim3 grid({{kernel.grid[0]}}, {{kernel.grid[1]}}, {{kernel.grid[2]}});

    for(int warmup = 1; warmup >= 0; --warmup) {
      int repeat = 2;
      if(warmup == 0) {
        repeat = n_repeat;
      }

      {% if timing %}
      double wcStartTime, cpuStartTime, wcEndTime, cpuEndTime;
      timing(&wcStartTime, &cpuStartTime);
      {% endif %}

      for (; repeat > 0; --repeat)
      {
        {{kernel.name}}<<<grid, blocks>>>({{kernel.call_argument_list}});
      }
      {% if timing %}
      /* kernel launches are asynchronous: synchronise before stopping the timer */
      cudaDeviceSynchronize();
      timing(&wcEndTime, &cpuEndTime);
      if( warmup == 0)
        printf("%s\t%e\n", "{{kernel.name}}", (wcEndTime - wcStartTime) / n_repeat );
      {% endif %}
    }

    {% for field_name, dataType, elements in kernel.fields %}
    cudaFree({{field_name}});
    {% endfor %}
  }
  {% endfor %}
}
setup.cfg
@@ -10,7 +10,10 @@ license = AGPLv3
 version = 0.0.1

 [options]
-packages = pystencils_benchmark
+packages =
+    pystencils_benchmark
+    pystencils_benchmark.gpu
+    pystencils_benchmark.cpu
 install_requires =
     jinja2 >= 3.0
     pystencils >= 0.3.4
tests/test_benchmark.py
@@ -6,7 +6,9 @@ import tempfile
 import pytest

 import pystencils as ps
 from pathlib import Path

-from pystencils_benchmark import generate_benchmark, Compiler
+from pystencils_benchmark import Compiler
+import pystencils_benchmark as pb

 compilers = (Compiler.GCC, Compiler.GCCdebug, Compiler.Clang)
@@ -16,6 +18,20 @@ config_kwargs = ({},
                  'assume_aligned': True}})


+def nvidia_gpu_available():
+    try:
+        return subprocess.call(['nvidia-smi']) == 0
+    except (FileNotFoundError,):
+        return False
+
+
+def nvcc_available():
+    try:
+        return subprocess.call(['nvcc', '--version']) == 0
+    except (FileNotFoundError,):
+        return False
+
+
 @pytest.mark.parametrize('compiler', compilers)
 @pytest.mark.parametrize('config_kwarg', config_kwargs)
 def test_generate(compiler, config_kwarg):
@@ -34,8 +50,36 @@ def test_generate(compiler, config_kwarg):

     with tempfile.TemporaryDirectory(dir=Path.cwd()) as temp_dir:
         temp_dir = Path(temp_dir)
-        generate_benchmark([kernel_vadd, kernel_daxpy], temp_dir, compiler=compiler)
+        pb.cpu.generate_benchmark([kernel_vadd, kernel_daxpy], temp_dir, compiler=compiler)
         subprocess.run(['make', '-C', f'{temp_dir}'], check=True)
         subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True)
+
+
+gpu_kwargs = ({}, {'launch_bounds': (256,)}, {'launch_bounds': (256, 2)})
+
+
+@pytest.mark.parametrize('kwargs', gpu_kwargs)
+def test_generate_gpu(kwargs):
+    compiler = Compiler.NVCC
+    a, b, c = ps.fields(a=np.ones(4000000), b=np.ones(4000000), c=np.ones(4000000))
+    alpha = sp.symbols('alpha')
+
+    @ps.kernel_config(ps.CreateKernelConfig(target=ps.Target.GPU))
+    def vadd():
+        a[0] @= b[0] + c[0]
+    kernel_vadd = ps.create_kernel(**vadd)
+
+    @ps.kernel_config(ps.CreateKernelConfig(target=ps.Target.GPU))
+    def daxpy():
+        b[0] @= alpha * a[0] + b[0]
+    kernel_daxpy = ps.create_kernel(**daxpy)
+
+    with tempfile.TemporaryDirectory(dir=Path.cwd()) as temp_dir:
+        temp_dir = Path(temp_dir)
+        pb.gpu.generate_benchmark([kernel_vadd, kernel_daxpy], temp_dir, compiler=compiler, **kwargs)
+        if not nvcc_available():
+            pytest.skip("nvcc is not available!")
+        subprocess.run(['make', '-C', f'{temp_dir}'], check=True)
+        if not nvidia_gpu_available():
+            pytest.skip("There is no GPU available!")
+        subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True)