From 51353a2bdf95069ce3a15a6de8d27cc208ad8992 Mon Sep 17 00:00:00 2001
From: Markus Holzer <markus.holzer@fau.de>
Date: Wed, 18 Sep 2024 13:53:32 +0200
Subject: [PATCH] Add CUDA support

---
 .gitlab-ci.yml                               |   1 +
 Dockerfile                                   |   4 +-
 MANIFEST.in                                  |   2 +
 pystencils_benchmark/__init__.py             |   3 +-
 pystencils_benchmark/common.py               | 102 ++++++++++++++
 pystencils_benchmark/cpu/__init__.py         |   1 +
 pystencils_benchmark/{ => cpu}/benchmark.py  |  82 ++---------
 pystencils_benchmark/enums.py                |   1 +
 pystencils_benchmark/gpu/__init__.py         |   1 +
 pystencils_benchmark/gpu/benchmark.py        | 132 ++++++++++++++++++
 pystencils_benchmark/templates/Makefile      |   8 ++
 pystencils_benchmark/templates/NVCC.mk       |  12 ++
 .../templates/{ => cpu}/kernel.c             |   0
 .../templates/{ => cpu}/kernel.h             |   0
 .../templates/{ => cpu}/main.c               |   0
 pystencils_benchmark/templates/gpu/kernel.cu |   8 ++
 pystencils_benchmark/templates/gpu/kernel.h  |  11 ++
 pystencils_benchmark/templates/gpu/main.c    |  72 ++++++++++
 setup.cfg                                    |   5 +-
 tests/test_benchmark.py                      |  48 ++++++-
 tests/test_launch_bounds.py                  |  19 +++
 ve_example/test.py                           |  14 +-
 22 files changed, 442 insertions(+), 84 deletions(-)
 create mode 100644 pystencils_benchmark/common.py
 create mode 100644 pystencils_benchmark/cpu/__init__.py
 rename pystencils_benchmark/{ => cpu}/benchmark.py (65%)
 create mode 100644 pystencils_benchmark/gpu/__init__.py
 create mode 100644 pystencils_benchmark/gpu/benchmark.py
 create mode 100644 pystencils_benchmark/templates/NVCC.mk
 rename pystencils_benchmark/templates/{ => cpu}/kernel.c (100%)
 rename pystencils_benchmark/templates/{ => cpu}/kernel.h (100%)
 rename pystencils_benchmark/templates/{ => cpu}/main.c (100%)
 create mode 100644 pystencils_benchmark/templates/gpu/kernel.cu
 create mode 100644 pystencils_benchmark/templates/gpu/kernel.h
 create mode 100644 pystencils_benchmark/templates/gpu/main.c
 create mode 100644 tests/test_launch_bounds.py

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 65e62e3..7a159bd 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -26,6 +26,7 @@ test:
   image: i10git.cs.fau.de:5005/pycodegen/pystencils-benchmark/pystencils-benchmark
   tags:
     - docker
+    - cuda
   script:
     - pip install tox
     - echo $TOX_ENV
diff --git a/Dockerfile b/Dockerfile
index 2b8f8a8..dcb5493 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,7 +1,7 @@
-FROM ubuntu:latest
+FROM nvidia/cuda:12.1.1-devel-ubuntu22.04
 
 LABEL maintainer="jan.hoenig@fau.de"
-LABEL version="0.1"
+LABEL version="0.2"
 LABEL description="Custom docker image for pystencils-benchmark"
 
 ARG DEBIAN_FRONTEND=noninteractive
diff --git a/MANIFEST.in b/MANIFEST.in
index ef395db..38e6285 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,3 +1,5 @@
 include README.md
 include LICENSE.md
 include pystencils_benchmark/templates/*
+include pystencils_benchmark/templates/cpu/*
+include pystencils_benchmark/templates/gpu/*
diff --git a/pystencils_benchmark/__init__.py b/pystencils_benchmark/__init__.py
index 6f5f32c..86d8f56 100644
--- a/pystencils_benchmark/__init__.py
+++ b/pystencils_benchmark/__init__.py
@@ -1,2 +1,3 @@
 from .enums import Compiler
-from .benchmark import generate_benchmark, kernel_header, kernel_source
+from . import gpu
+from . import cpu
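The package now exposes cpu and gpu subpackages instead of the single top-level generate_benchmark. A minimal usage sketch of the new CPU entry point (field size and kernel are illustrative; the pattern mirrors the updated tests further down):

import numpy as np
import pystencils as ps
import pystencils_benchmark as pb

a, b = ps.fields(a=np.ones(1024), b=np.ones(1024))

@ps.kernel_config(ps.CreateKernelConfig())
def copy():
    a[0] @= b[0]

# the old top-level call generate_benchmark(...) becomes:
pb.cpu.generate_benchmark([ps.create_kernel(**copy)], compiler=pb.Compiler.GCC)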
diff --git a/pystencils_benchmark/common.py b/pystencils_benchmark/common.py
new file mode 100644
index 0000000..70cabd6
--- /dev/null
+++ b/pystencils_benchmark/common.py
@@ -0,0 +1,102 @@
+from pystencils.backends.cbackend import generate_c, get_headers
+from pystencils.astnodes import KernelFunction
+from pystencils.enums import Backend
+from jinja2 import Environment, PackageLoader, StrictUndefined
+
+from pystencils_benchmark.enums import Compiler
+from pathlib import Path
+
+_env = Environment(loader=PackageLoader('pystencils_benchmark'),
+                   undefined=StrictUndefined,
+                   keep_trailing_newline=True,
+                   trim_blocks=True, lstrip_blocks=True)
+
+
+def _kernel_header(kernel_ast: KernelFunction,
+                   dialect: Backend = Backend.C,
+                   *,
+                   template_file: str,
+                   additional_jinja_context: dict = None) -> str:
+
+    function_signature = generate_c(kernel_ast, dialect=dialect, signature_only=True)
+    header_guard = f'_{kernel_ast.function_name.upper()}_H'
+
+    jinja_context = {
+        'header_guard': header_guard,
+        'function_signature': function_signature,
+    }
+    if additional_jinja_context is not None:
+        jinja_context.update(additional_jinja_context)
+
+    header = _env.get_template(template_file).render(**jinja_context)
+    return header
+
+
+def _kernel_source(kernel_ast: KernelFunction,
+                   dialect: Backend = Backend.C,
+                   *,
+                   template_file: str,
+                   additional_jinja_context: dict = None) -> str:
+
+    kernel_name = kernel_ast.function_name
+    function_source = generate_c(kernel_ast, dialect=dialect)
+    headers = {f'"{kernel_name}.h"', '<math.h>', '<stdint.h>'}
+    headers.update(get_headers(kernel_ast))
+
+    jinja_context = {
+        'function_source': function_source,
+        'headers': sorted(headers),
+        'timing': True,
+    }
+
+    if additional_jinja_context is not None:
+        jinja_context.update(additional_jinja_context)
+
+    source = _env.get_template(template_file).render(**jinja_context)
+    return source
+
+
+def compiler_toolchain(path: Path, compiler: Compiler, likwid: bool) -> None:
+    name = compiler.name
+    jinja_context = {
+        'compiler': name,
+        'likwid': likwid,
+    }
+
+    files = ['Makefile', f'{name}.mk']
+    for file_name in files:
+        with open(path / file_name, 'w+') as f:
+            template = _env.get_template(file_name).render(**jinja_context)
+            f.write(template)
+
+
+def copy_static_files(path: Path, *, source_file_suffix='.c') -> None:
+    src_path = path / 'src'
+    src_path.mkdir(parents=True, exist_ok=True)
+    include_path = path / 'include'
+    include_path.mkdir(parents=True, exist_ok=True)
+
+    files = ['timing.h', 'timing.c']
+    for file_name in files:
+        template = _env.get_template(file_name).render()
+        if file_name[-1] == 'h':
+            target_path = include_path / file_name
+        elif file_name[-1] == 'c':
+            target_path = src_path / file_name
+            target_path = target_path.with_suffix(source_file_suffix)
+        else:
+            target_path = path / file_name
+        with open(target_path, 'w+') as f:
+            f.write(template)
+
+
+def setup_directories(path: Path):
+    if path is None:
+        path = Path('.')
+    else:
+        path.mkdir(parents=True, exist_ok=True)
+    src_path = path / 'src'
+    src_path.mkdir(parents=True, exist_ok=True)
+    include_path = path / 'include'
+    include_path.mkdir(parents=True, exist_ok=True)
+    return src_path, include_path
diff --git a/pystencils_benchmark/cpu/__init__.py b/pystencils_benchmark/cpu/__init__.py
new file mode 100644
index 0000000..cfd889e
--- /dev/null
+++ b/pystencils_benchmark/cpu/__init__.py
@@ -0,0 +1 @@
+from .benchmark import generate_benchmark
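The two private helpers concentrate the Jinja rendering; the cpu and gpu front ends only pick a template path and optionally extend the render context. A sketch of the call pattern (kernel_ast stands for any pystencils KernelFunction; the gpu wrappers further down do exactly this):

from pystencils_benchmark.common import _kernel_header, _kernel_source

# kernel_ast: a pystencils KernelFunction, e.g. from ps.create_kernel(...)
header = _kernel_header(kernel_ast, template_file='gpu/kernel.h',
                        additional_jinja_context={'target': 'gpu'})
source = _kernel_source(kernel_ast, template_file='gpu/kernel.cu',
                        additional_jinja_context={'target': 'gpu'})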
diff --git a/pystencils_benchmark/benchmark.py b/pystencils_benchmark/cpu/benchmark.py
similarity index 65%
rename from pystencils_benchmark/benchmark.py
rename to pystencils_benchmark/cpu/benchmark.py
index df6bd9b..4258ee5 100644
--- a/pystencils_benchmark/benchmark.py
+++ b/pystencils_benchmark/cpu/benchmark.py
@@ -1,24 +1,24 @@
 from typing import Union, List
 from collections import namedtuple
 from pathlib import Path
-from jinja2 import Environment, PackageLoader, StrictUndefined
 
 import numpy as np
 
-from pystencils.backends.cbackend import generate_c, get_headers
 from pystencils.astnodes import KernelFunction, PragmaBlock
 from pystencils.enums import Backend
 from pystencils.typing import get_base_type
 from pystencils.sympyextensions import prod
 from pystencils.integer_functions import modulo_ceil
 
+from pystencils_benchmark.common import (_env,
+                                         _kernel_source,
+                                         _kernel_header,
+                                         compiler_toolchain,
+                                         copy_static_files,
+                                         setup_directories)
 from pystencils_benchmark.enums import Compiler
 
-_env = Environment(loader=PackageLoader('pystencils_benchmark'), undefined=StrictUndefined, keep_trailing_newline=True,
-                   trim_blocks=True, lstrip_blocks=True)
-
-
 def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]],
                        path: Path = None,
                        *,
@@ -26,14 +26,8 @@ def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]],
                        timing: bool = True,
                        likwid: bool = False
                        ) -> None:
-    if path is None:
-        path = Path('.')
-    else:
-        path.mkdir(parents=True, exist_ok=True)
-    src_path = path / 'src'
-    src_path.mkdir(parents=True, exist_ok=True)
-    include_path = path / 'include'
-    include_path.mkdir(parents=True, exist_ok=True)
+
+    src_path, include_path = setup_directories(path)
 
     if isinstance(kernel_asts, KernelFunction):
         kernel_asts = [kernel_asts]
@@ -56,39 +50,6 @@ def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]],
     compiler_toolchain(path, compiler, likwid)
 
 
-def compiler_toolchain(path: Path, compiler: Compiler, likwid: bool) -> None:
-    name = compiler.name
-    jinja_context = {
-        'compiler': name,
-        'likwid': likwid,
-    }
-
-    files = ['Makefile', f'{name}.mk']
-    for file_name in files:
-        with open(path / file_name, 'w+') as f:
-            template = _env.get_template(file_name).render(**jinja_context)
-            f.write(template)
-
-
-def copy_static_files(path: Path) -> None:
-    src_path = path / 'src'
-    src_path.mkdir(parents=True, exist_ok=True)
-    include_path = path / 'include'
-    include_path.mkdir(parents=True, exist_ok=True)
-
-    files = ['timing.h', 'timing.c']
-    for file_name in files:
-        template = _env.get_template(file_name).render()
-        if file_name[-1] == 'h':
-            target_path = include_path / file_name
-        elif file_name[-1] == 'c':
-            target_path = src_path / file_name
-        else:
-            target_path = path / file_name
-        with open(target_path, 'w+') as f:
-            f.write(template)
-
-
 def kernel_main(kernels_ast: List[KernelFunction], *,
                 timing: bool = True, likwid: bool = False) -> str:
     """
@@ -159,34 +120,13 @@ def kernel_main(kernels_ast: List[KernelFunction], *,
         'likwid': likwid,
     }
 
-    main = _env.get_template('main.c').render(**jinja_context)
+    main = _env.get_template('cpu/main.c').render(**jinja_context)
     return main
 
 
 def kernel_header(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
-    function_signature = generate_c(kernel_ast, dialect=dialect, signature_only=True)
-    header_guard = f'_{kernel_ast.function_name.upper()}_H'
-
-    jinja_context = {
-        'header_guard': header_guard,
-        'function_signature': function_signature,
-    }
-
-    header = _env.get_template('kernel.h').render(**jinja_context)
-    return header
+    return _kernel_header(kernel_ast, dialect=dialect, template_file='cpu/kernel.h')
 
 
 def kernel_source(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
-    kernel_name = kernel_ast.function_name
-    function_source = generate_c(kernel_ast, dialect=dialect)
-    headers = {f'"{kernel_name}.h"', '<math.h>', '<stdint.h>'}
-    headers.update(get_headers(kernel_ast))
-
-    jinja_context = {
-        'function_source': function_source,
-        'headers': sorted(headers),
-        'timing': True,
-    }
-
-    source = _env.get_template('kernel.c').render(**jinja_context)
-    return source
+    return _kernel_source(kernel_ast, dialect=dialect, template_file='cpu/kernel.c')
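The generated directory is a self-contained benchmark project: sources under src/, headers under include/, plus a Makefile and a <compiler>.mk. Building and running follows the same pattern as the test suite (directory name and repeat count are illustrative; kernel as in the sketch above):

import subprocess
from pathlib import Path
import pystencils_benchmark as pb

path = Path('benchmark_cpu')
pb.cpu.generate_benchmark([kernel], path, compiler=pb.Compiler.GCC)
subprocess.run(['make', '-C', str(path)], check=True)               # builds benchmark-GCC
subprocess.run([str(path / 'benchmark-GCC'), '10'], check=True)     # 10 repetitions per kernel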
diff --git a/pystencils_benchmark/enums.py b/pystencils_benchmark/enums.py
index ec56c8a..84cf49e 100644
--- a/pystencils_benchmark/enums.py
+++ b/pystencils_benchmark/enums.py
@@ -6,3 +6,4 @@ class Compiler(Enum):
     GCCdebug = auto()
     Clang = auto()
     ICC = auto()
+    NVCC = auto()
diff --git a/pystencils_benchmark/gpu/__init__.py b/pystencils_benchmark/gpu/__init__.py
new file mode 100644
index 0000000..cfd889e
--- /dev/null
+++ b/pystencils_benchmark/gpu/__init__.py
@@ -0,0 +1 @@
+from .benchmark import generate_benchmark
diff --git a/pystencils_benchmark/gpu/benchmark.py b/pystencils_benchmark/gpu/benchmark.py
new file mode 100644
index 0000000..1e9ce37
--- /dev/null
+++ b/pystencils_benchmark/gpu/benchmark.py
@@ -0,0 +1,132 @@
+from typing import Union, List
+from collections import namedtuple
+from pathlib import Path
+
+from pystencils.astnodes import KernelFunction
+from pystencils.enums import Backend
+from pystencils.typing import get_base_type
+from pystencils.sympyextensions import prod
+from pystencils.transformations import get_common_field
+
+from pystencils_benchmark.common import (_env,
+                                         _kernel_source,
+                                         _kernel_header,
+                                         compiler_toolchain,
+                                         copy_static_files,
+                                         setup_directories)
+from pystencils_benchmark.enums import Compiler
+
+
+def _add_launch_bound(code: str, launch_bounds: tuple) -> str:
+    lb_str = f"__launch_bounds__({', '.join(str(lb) for lb in launch_bounds)}) "
+    splitted = code.split("void ")
+    prefix = splitted[0]
+    if code.startswith("void "):
+        # in case there is nothing before the first "void "
+        prefix = ""
+    return prefix + "void " + lb_str + "void ".join(splitted[1:])
+
+
+def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]],
+                       path: Path = None,
+                       *,
+                       compiler: Compiler = Compiler.NVCC,
+                       timing: bool = True,
+                       launch_bounds: tuple = None,
+                       ) -> None:
+
+    src_path, include_path = setup_directories(path)
+
+    if isinstance(kernel_asts, KernelFunction):
+        kernel_asts = [kernel_asts]
+
+    for kernel_ast in kernel_asts:
+        kernel_name = kernel_ast.function_name
+
+        header = kernel_header(kernel_ast)
+        if launch_bounds:
+            header = _add_launch_bound(header, launch_bounds)
+        with open(include_path / f'{kernel_name}.h', 'w+') as f:
+            f.write(header)
+
+        source = kernel_source(kernel_ast)
+        if launch_bounds:
+            source = _add_launch_bound(source, launch_bounds)
+        # TODO CUDA specific suffix
+        with open(src_path / f'{kernel_name}.cu', 'w+') as f:
+            f.write(source)
+
+    with open(src_path / 'main.cu', 'w+') as f:
+        f.write(kernel_main(kernel_asts, timing=timing))
+
+    copy_static_files(path, source_file_suffix='.cu')
+    compiler_toolchain(path, compiler, likwid=False)
+
+
+def kernel_main(kernels_ast: List[KernelFunction], *, timing: bool = True):
+    """
+    Return CUDA code of a benchmark program for the given kernels.
+
+    Args:
+        kernels_ast: A list of pystencils AST objects as returned by create_kernel for benchmarking
+        timing: add timing output to the code, prints time per iteration to stdout
+    Returns:
+        CUDA code as string
+    """
+    Kernel = namedtuple('Kernel', ['name', 'constants', 'fields', 'call_parameters',
+                                   'call_argument_list', 'blocks', 'grid'])
+    kernels = []
+    includes = set()
+    for kernel in kernels_ast:
+        name = kernel.function_name
+        accessed_fields = {f.name: f for f in kernel.fields_accessed}
+        constants = []
+        fields = []
+        call_parameters = []
+        block_and_thread_numbers = dict()
+        for p in kernel.get_parameters():
+            if not p.is_field_parameter:
+                constants.append((p.symbol.name, str(p.symbol.dtype)))
+                call_parameters.append(p.symbol.name)
+            else:
+                assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size"
+                field = accessed_fields[p.field_name]
+                dtype = str(get_base_type(p.symbol.dtype))
+                elements = prod(field.shape)
+
+                fields.append((p.field_name, dtype, elements))
+                call_parameters.append(p.field_name)
+
+        common_shape = get_common_field(kernel.fields_accessed).shape
+        indexing = kernel.indexing
+        block_and_thread_numbers = indexing.call_parameters(common_shape)
+        block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
+        block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])
+
+        kernels.append(Kernel(name=name, fields=fields, constants=constants, call_parameters=call_parameters,
+                              call_argument_list=",".join(call_parameters),
+                              blocks=block_and_thread_numbers['block'], grid=block_and_thread_numbers['grid']))
+        includes.add(name)
+
+    jinja_context = {
+        'kernels': kernels,
+        'includes': includes,
+        'timing': timing,
+    }
+
+    main = _env.get_template('gpu/main.c').render(**jinja_context)
+    return main
+
+
+def kernel_header(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
+    return _kernel_header(kernel_ast,
+                          dialect=dialect,
+                          template_file='gpu/kernel.h',
+                          additional_jinja_context={'target': 'gpu'})
+
+
+def kernel_source(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
+    return _kernel_source(kernel_ast,
+                          dialect=dialect,
+                          template_file='gpu/kernel.cu',
+                          additional_jinja_context={'target': 'gpu'})
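The GPU front end mirrors the CPU one but defaults to NVCC and accepts the optional launch_bounds tuple, which is spliced into both header and source. A condensed usage sketch (same shape as the new GPU test below; sizes are illustrative):

from pathlib import Path
import numpy as np
import pystencils as ps
import pystencils_benchmark as pb

a, b, c = ps.fields(a=np.ones(1024), b=np.ones(1024), c=np.ones(1024))

@ps.kernel_config(ps.CreateKernelConfig(target=ps.Target.GPU))
def vadd():
    a[0] @= b[0] + c[0]

pb.gpu.generate_benchmark([ps.create_kernel(**vadd)], Path('benchmark_gpu'),
                          launch_bounds=(256, 2))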
diff --git a/pystencils_benchmark/templates/Makefile b/pystencils_benchmark/templates/Makefile
index 66b68b8..d62c513 100644
--- a/pystencils_benchmark/templates/Makefile
+++ b/pystencils_benchmark/templates/Makefile
@@ -29,6 +29,8 @@ LIBS += -llikwid
 VPATH = $(SRC_DIR)
 ASM = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c))
 OBJ = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.c))
+# TODO CUDA specific SUFFIX
+OBJ += $(patsubst $(SRC_DIR)/%.cu, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.cu))
 
 CFLAGS := $(CFLAGS) $(DEFINES) $(INCLUDES)
 
@@ -43,6 +45,12 @@ $(BUILD_DIR)/%.o: %.c
 	$(Q)$(CC) -c $(CFLAGS) $< -o $@
 	$(Q)$(CC) $(CFLAGS) -MT $(@:.d=.o) -MM $< > $(BUILD_DIR)/$*.d
 
+# TODO CUDA specific SUFFIX
+$(BUILD_DIR)/%.o: %.cu
+	@echo "===> COMPILE $@"
+	$(Q)$(CC) -c $(CFLAGS) $< -o $@
+	$(Q)$(CC) $(CFLAGS) -MT $(@:.d=.o) -MM $< > $(BUILD_DIR)/$*.d
+
 $(BUILD_DIR)/%.s: %.c
 	@echo "===> GENERATE ASM $@"
 	$(Q)$(CC) -S $(CFLAGS) $< -o $@
diff --git a/pystencils_benchmark/templates/NVCC.mk b/pystencils_benchmark/templates/NVCC.mk
new file mode 100644
index 0000000..71010ab
--- /dev/null
+++ b/pystencils_benchmark/templates/NVCC.mk
@@ -0,0 +1,12 @@
+CC = nvcc
+LINKER = $(CC)
+
+# More warnings, please
+#CFLAGS += -Wfloat-equal -Wundef -Wshadow -Wpointer-arith -Wcast-align -Wstrict-overflow=5 -Wwrite-strings -Waggregate-return
+# Maybe too many warnings
+#CFLAGS += -Wcast-qual -Wswitch-default -Wconversion -Wunreachable-code
+# Specific C flags
+CFLAGS := -use_fast_math
+DEFINES = -D_GNU_SOURCE -DNDEBUG
+INCLUDES =
+LIBS =
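compiler_toolchain keys the toolchain file off Compiler.name, so NVCC support only needed the enum entry plus this NVCC.mk template. A sketch of how the build files get written (directory name is illustrative):

from pathlib import Path
from pystencils_benchmark.common import compiler_toolchain
from pystencils_benchmark.enums import Compiler

# renders 'Makefile' and 'NVCC.mk' into the target directory,
# so a plain `make` there builds with CC = nvcc
compiler_toolchain(Path('benchmark_gpu'), Compiler.NVCC, likwid=False)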
diff --git a/pystencils_benchmark/templates/kernel.c b/pystencils_benchmark/templates/cpu/kernel.c
similarity index 100%
rename from pystencils_benchmark/templates/kernel.c
rename to pystencils_benchmark/templates/cpu/kernel.c
diff --git a/pystencils_benchmark/templates/kernel.h b/pystencils_benchmark/templates/cpu/kernel.h
similarity index 100%
rename from pystencils_benchmark/templates/kernel.h
rename to pystencils_benchmark/templates/cpu/kernel.h
diff --git a/pystencils_benchmark/templates/main.c b/pystencils_benchmark/templates/cpu/main.c
similarity index 100%
rename from pystencils_benchmark/templates/main.c
rename to pystencils_benchmark/templates/cpu/main.c
diff --git a/pystencils_benchmark/templates/gpu/kernel.cu b/pystencils_benchmark/templates/gpu/kernel.cu
new file mode 100644
index 0000000..973369f
--- /dev/null
+++ b/pystencils_benchmark/templates/gpu/kernel.cu
@@ -0,0 +1,8 @@
+{% for header in headers %}
+#include {{header}}
+{% endfor %}
+
+#define RESTRICT __restrict__
+#define FUNC_PREFIX __global__
+
+{{function_source}}
diff --git a/pystencils_benchmark/templates/gpu/kernel.h b/pystencils_benchmark/templates/gpu/kernel.h
new file mode 100644
index 0000000..102d9c9
--- /dev/null
+++ b/pystencils_benchmark/templates/gpu/kernel.h
@@ -0,0 +1,11 @@
+#ifndef {{header_guard}}
+#define {{header_guard}}
+
+
+
+#define RESTRICT __restrict__
+#define FUNC_PREFIX __global__
+
+{{function_signature}};
+
+#endif
diff --git a/pystencils_benchmark/templates/gpu/main.c b/pystencils_benchmark/templates/gpu/main.c
new file mode 100644
index 0000000..1373048
--- /dev/null
+++ b/pystencils_benchmark/templates/gpu/main.c
@@ -0,0 +1,72 @@
+#include <assert.h>
+#include <math.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "timing.h"
+
+#define RESTRICT __restrict__
+#define FUNC_PREFIX __global__
+#include <cuda_runtime.h>
+
+//kernels
+{% for include in includes %}
+#include "{{ include }}.h"
+{% endfor %}
+
+int main(int argc, char **argv)
+{
+    if(argc < 2) {
+        printf("Usage: %s <n_repeat>\n", argv[0]);
+        return -1;
+    }
+    int n_repeat = atoi(argv[1]);
+    {% for kernel in kernels %}
+
+    { // Kernel: {{kernel.name}}
+        {% for field_name, dataType, elements in kernel.fields %}
+        {{dataType}} *{{field_name}};
+        cudaMalloc(&{{field_name}}, {{elements}}*sizeof({{dataType}}));
+        // cudaMemset writes bytes, so pass an int value and a byte count
+        cudaMemset({{field_name}}, 0, {{elements}}*sizeof({{dataType}}));
+        {% endfor %}
+
+        {% for constantName, dataType in kernel.constants %}
+        // Constant {{constantName}}
+        {{dataType}} {{constantName}};
+        {{constantName}} = 0.23;
+        {% endfor %}
+
+        dim3 blocks({{kernel.blocks[0]}}, {{kernel.blocks[1]}}, {{kernel.blocks[2]}});
+        dim3 grid({{kernel.grid[0]}}, {{kernel.grid[1]}}, {{kernel.grid[2]}});
+
+        for(int warmup = 1; warmup >= 0; --warmup) {
+            int repeat = 2;
+            if(warmup == 0) {
+                repeat = n_repeat;
+            }
+
+            {% if timing %}
+            double wcStartTime, cpuStartTime, wcEndTime, cpuEndTime;
+            timing(&wcStartTime, &cpuStartTime);
+            {% endif %}
+
+            for (; repeat > 0; --repeat)
+            {
+                {{kernel.name}}<<<grid, blocks>>>({{kernel.call_argument_list}});
+            }
+
+            {% if timing %}
+            cudaDeviceSynchronize(); // kernel launches are asynchronous; sync before reading the clock
+            timing(&wcEndTime, &cpuEndTime);
+
+            if(warmup == 0)
+                printf("%s\t%e\n", "{{kernel.name}}", (wcEndTime - wcStartTime) / n_repeat);
+            {% endif %}
+        }
+        {% for field_name, dataType, elements in kernel.fields %}
+        cudaFree({{field_name}});
+        {% endfor %}
+    }
+    {% endfor %}
+}
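The dim3 block and grid literals in main.c are fixed at generation time: kernel_main asks the kernel's GPU indexing object to size the launch for the common field shape, exactly as in gpu/benchmark.py above. The relevant step in isolation (kernel_ast assumed to be a GPU kernel from ps.create_kernel; the example values assume a 4,000,000-element 1D field with 256 threads per block):

from pystencils.transformations import get_common_field

common_shape = get_common_field(kernel_ast.fields_accessed).shape
params = kernel_ast.indexing.call_parameters(common_shape)
block = tuple(int(i) for i in params['block'])  # e.g. (256, 1, 1)
grid = tuple(int(i) for i in params['grid'])    # e.g. (15625, 1, 1)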
diff --git a/setup.cfg b/setup.cfg
index 4063797..8bc42e9 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -10,7 +10,10 @@ license = AGPLv3
 version = 0.0.1
 
 [options]
-packages = pystencils_benchmark
+packages =
+    pystencils_benchmark
+    pystencils_benchmark.gpu
+    pystencils_benchmark.cpu
 install_requires =
     jinja2 >= 3.0
     pystencils >= 0.3.4
diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
index 70af02e..b47f9c5 100755
--- a/tests/test_benchmark.py
+++ b/tests/test_benchmark.py
@@ -6,7 +6,9 @@ import tempfile
 import pytest
 import pystencils as ps
 from pathlib import Path
-from pystencils_benchmark import generate_benchmark, Compiler
+
+from pystencils_benchmark import Compiler
+import pystencils_benchmark as pb
 
 compilers = (Compiler.GCC, Compiler.GCCdebug, Compiler.Clang)
 
@@ -16,6 +18,20 @@ config_kwargs = ({},
                                         'assume_aligned': True}})
 
 
+def nvidia_gpu_available():
+    try:
+        return subprocess.call(['nvidia-smi']) == 0
+    except (FileNotFoundError,):
+        return False
+
+
+def nvcc_available():
+    try:
+        return subprocess.call(['nvcc', '--version']) == 0
+    except (FileNotFoundError,):
+        return False
+
+
 @pytest.mark.parametrize('compiler', compilers)
 @pytest.mark.parametrize('config_kwarg', config_kwargs)
 def test_generate(compiler, config_kwarg):
@@ -34,8 +50,36 @@ def test_generate(compiler, config_kwarg):
 
     with tempfile.TemporaryDirectory(dir=Path.cwd()) as temp_dir:
         temp_dir = Path(temp_dir)
-        generate_benchmark([kernel_vadd, kernel_daxpy], temp_dir, compiler=compiler)
+        pb.cpu.generate_benchmark([kernel_vadd, kernel_daxpy], temp_dir, compiler=compiler)
         subprocess.run(['make', '-C', f'{temp_dir}'], check=True)
         subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True)
+
+
+gpu_kwargs = ({}, {'launch_bounds': (256,)}, {'launch_bounds': (256, 2)})
+
+
+@pytest.mark.parametrize('kwargs', gpu_kwargs)
+def test_generate_gpu(kwargs):
+    compiler = Compiler.NVCC
+    a, b, c = ps.fields(a=np.ones(4000000), b=np.ones(4000000), c=np.ones(4000000))
+    alpha = sp.symbols('alpha')
+
+    @ps.kernel_config(ps.CreateKernelConfig(target=ps.Target.GPU))
+    def vadd():
+        a[0] @= b[0] + c[0]
+    kernel_vadd = ps.create_kernel(**vadd)
+
+    @ps.kernel_config(ps.CreateKernelConfig(target=ps.Target.GPU))
+    def daxpy():
+        b[0] @= alpha * a[0] + b[0]
+    kernel_daxpy = ps.create_kernel(**daxpy)
+
+    with tempfile.TemporaryDirectory(dir=Path.cwd()) as temp_dir:
+        temp_dir = Path(temp_dir)
+        pb.gpu.generate_benchmark([kernel_vadd, kernel_daxpy], temp_dir, compiler=compiler, **kwargs)
+        if not nvcc_available():
+            pytest.skip("nvcc is not available!")
+        subprocess.run(['make', '-C', f'{temp_dir}'], check=True)
+        if not nvidia_gpu_available():
+            pytest.skip("There is no GPU available!")
+        subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True)
diff --git a/tests/test_launch_bounds.py b/tests/test_launch_bounds.py
new file mode 100644
index 0000000..48af06d
--- /dev/null
+++ b/tests/test_launch_bounds.py
@@ -0,0 +1,19 @@
+import numpy as np
+import pystencils as ps
+from pystencils_benchmark.gpu.benchmark import kernel_header, _add_launch_bound, kernel_source
+
+
+def test_launch_bounds():
+    a, b, c = ps.fields(a=np.ones(4000000), b=np.ones(4000000), c=np.ones(4000000))
+
+    @ps.kernel_config(ps.CreateKernelConfig(target=ps.Target.GPU))
+    def vadd():
+        a[0] @= b[0] + c[0]
+    kernel_vadd = ps.create_kernel(**vadd)
+    launch_bounds = (256, 2)
+    header = kernel_header(kernel_vadd)
+    header = _add_launch_bound(header, launch_bounds)
+    assert "void __launch_bounds__(256, 2)" in header
+    source = kernel_source(kernel_vadd)
+    source = _add_launch_bound(source, launch_bounds)
+    assert "void __launch_bounds__(256, 2)" in source
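_add_launch_bound is plain string surgery on the generated code: it re-inserts the qualifier right after the first "void ", which is why the test asserts on the combined substring. A quick illustration (the signature string is hypothetical):

from pystencils_benchmark.gpu.benchmark import _add_launch_bound

sig = "FUNC_PREFIX void kernel_vadd(double * RESTRICT const _data_a);"
print(_add_launch_bound(sig, (256, 2)))
# FUNC_PREFIX void __launch_bounds__(256, 2) kernel_vadd(double * RESTRICT const _data_a);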
diff --git a/ve_example/test.py b/ve_example/test.py
index ee5e32b..9bb91d6 100755
--- a/ve_example/test.py
+++ b/ve_example/test.py
@@ -4,15 +4,16 @@ import subprocess
 import numpy as np
 import sympy as sp
 import pystencils as ps
-from pystencils_benchmark import generate_benchmark, Compiler
+import pystencils_benchmark as pb
 from pathlib import Path
 
 
-def generate(path: Path, compiler: Compiler):
+def generate(path: Path, compiler: pb.Compiler):
     a, b, c = ps.fields(a=np.ones(4000000), b=np.ones(4000000), c=np.ones(4000000))
     alpha = sp.symbols('alpha')
 
     kernels = []
+
     @ps.kernel_config(ps.CreateKernelConfig())
     def vadd():
         a[0] @= b[0] + c[0]
@@ -33,20 +34,20 @@ def generate(path: Path, compiler: Compiler):
         b[0] @= alpha * a[0] + b[0]
     kernels.append(ps.create_kernel(**daxpy_vector))
 
-    generate_benchmark(kernels, path, compiler=compiler)
+    pb.cpu.generate_benchmark(kernels, path, compiler=compiler)
 
 
 def make(path: Path):
     subprocess.run(['make'], check=True)
 
 
-def execute(path: Path, compiler: Compiler):
+def execute(path: Path, compiler: pb.Compiler):
     subprocess.run([f'./benchmark-{compiler.name}', '100'], check=True)
 
 
 def main():
-    compiler = Compiler.GCCdebug
-    path = Path.cwd()
+    compiler = pb.Compiler.GCCdebug
+    path = Path.cwd() / 'generated'
     generate(path, compiler)
     make(path)
    execute(path, compiler)
@@ -54,4 +55,3 @@ def main():
 
 if __name__ == '__main__':
     main()
-
-- 
GitLab