Merge branch 'Enable_OSACA_usage' into 'master'

Enable OSACA usage. See merge request !165.

Merge branch 'Enable_OSACA_usage' into 'master'
2d758462 · Jan Hönig · cfd770cf · 0391c91d · 2d758462 · 2d758462
Commit 2d758462 authored 5 years ago by Jan Hönig
--- a/pystencils/kerncraft_coupling/generate_benchmark.py
+++ b/pystencils/kerncraft_coupling/generate_benchmark.py
-import os
 import subprocess
+import warnings
+import tempfile
+from pathlib import Path

-from jinja2 import Template
+from jinja2 import Environment, PackageLoader, StrictUndefined

 from pystencils.astnodes import PragmaBlock
 from pystencils.backends.cbackend import generate_c, get_headers
@@ -10,116 +12,6 @@ from pystencils.data_types import get_base_type
 from pystencils.include import get_pystencils_include_path
 from pystencils.sympyextensions import prod

-benchmark_template = Template("""
-#include "kerncraft.h"
-#include <stdlib.h>
-#include <stdint.h>
-#include <stdbool.h>
-#include <math.h>
-#include <stdio.h>
-
-{{ includes }}
-
-{%- if likwid %}
-#include <likwid.h>
-{%- endif %}
-
-#define RESTRICT __restrict__
-#define FUNC_PREFIX
-void dummy(void *);
-void timing(double* wcTime, double* cpuTime);
-extern int var_false;
-
-
-{{kernel_code}}
-
-
-int main(int argc, char **argv)
-{
-  {%- if likwid %}
-  likwid_markerInit();
-  {%- endif %}
-
-  {%- for field_name, dataType, size in fields %}
-
-  // Initialization {{field_name}}
-  double * {{field_name}} = (double *) aligned_malloc(sizeof({{dataType}}) * {{size}}, 64);
-  for (unsigned long long i = 0; i < {{size}}; ++i)
-    {{field_name}}[i] = 0.23;
-
-  if(var_false)
-    dummy({{field_name}});
-
-  {%- endfor %}
-
-
-
-  {%- for constantName, dataType in constants %}
-
-  // Constant {{constantName}}
-  {{dataType}} {{constantName}};
-  {{constantName}} = 0.23;
-  if(var_false)
-      dummy(& {{constantName}});
-
-  {%- endfor %}
-
-  {%- if likwid and openmp %}
-  #pragma omp parallel
-  {
-  likwid_markerRegisterRegion("loop");
-  #pragma omp barrier
-  {%- elif likwid %}
-  likwid_markerRegisterRegion("loop");
-  {%- endif %}
-
-  for(int warmup = 1; warmup >= 0; --warmup) {
-    int repeat = 2;
-    if(warmup == 0) {
-      repeat = atoi(argv[1]);
-      {%- if likwid %}
-      likwid_markerStartRegion("loop");
-      {%- endif %}
-    }
-    
-    {%- if timing %}
-    double wcStartTime, cpuStartTime, wcEndTime, cpuEndTime;
-    timing(&wcStartTime, &cpuStartTime);
-    {%- endif %}
-    
-    for (; repeat > 0; --repeat)
-    {
-      {{kernelName}}({{call_argument_list}});
-
-      // Dummy calls
-      {%- for field_name, dataType, size in fields %}
-      if(var_false) dummy((void*){{field_name}});
-      {%- endfor %}
-      {%- for constantName, dataType in constants %}
-      if(var_false) dummy((void*)&{{constantName}});
-      {%- endfor %}
-    }
-    {%- if timing %}
-    timing(&wcEndTime, &cpuEndTime);
-    if( warmup == 0)
-        printf("%e\\n", (wcEndTime - wcStartTime) / atoi(argv[1]) );
-    {%- endif %}
-
-  }
-
-  {%- if likwid %}
-  likwid_markerStopRegion("loop");
-  {%- if openmp %}
-  }
-  {%- endif %}
-  {%- endif %}
-
-  {%- if likwid %}
-  likwid_markerClose();
-  {%- endif %}
-}
-""")
-

 def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
    """Return C code of a benchmark program for the given kernel.
@@ -157,7 +49,7 @@ def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
        if len(ast.body.args) > 0 and isinstance(ast.body.args[0], PragmaBlock):
            ast.body.args[0].pragma_line = ''

-    args = {
+    jinja_context = {
        'likwid': likwid,
        'openmp': openmp,
        'kernel_code': generate_c(ast, dialect='c'),
@@ -168,16 +60,20 @@ def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
        'includes': includes,
        'timing': timing,
    }
-    return benchmark_template.render(**args)

+    env = Environment(loader=PackageLoader('pystencils.kerncraft_coupling'), undefined=StrictUndefined)
+
+    return env.get_template('benchmark.c').render(**jinja_context)

-def run_c_benchmark(ast, inner_iterations, outer_iterations=3):
+
+def run_c_benchmark(ast, inner_iterations, outer_iterations=3, path=None):
    """Runs the given kernel with outer loop in C

    Args:
-        ast:
+        ast: pystencils ast which is used to compile the benchmark file
        inner_iterations: timings are recorded around this many iterations
        outer_iterations: number of timings recorded
+        path: path where the benchmark file is stored. If None a tmp folder is created

    Returns:
        list of times per iterations for each outer iteration
@@ -185,26 +81,40 @@ def run_c_benchmark(ast, inner_iterations, outer_iterations=3):
    import kerncraft

    benchmark_code = generate_benchmark(ast, timing=True)
-    with open('bench.c', 'w') as f:
+
+    if path is None:
+        path = tempfile.mkdtemp()
+
+    if isinstance(path, str):
+        path = Path(path)
+
+    with open(path / 'bench.c', 'w') as f:
        f.write(benchmark_code)

-    kerncraft_path = os.path.dirname(kerncraft.__file__)
+    kerncraft_path = Path(kerncraft.__file__).parent

    extra_flags = ['-I' + get_pystencils_include_path(),
-                   '-I' + os.path.join(kerncraft_path, 'headers')]
+                   '-I' + str(kerncraft_path / 'headers')]

    compiler_config = get_compiler_config()
    compile_cmd = [compiler_config['command']] + compiler_config['flags'].split()
    compile_cmd += [*extra_flags,
-                    os.path.join(kerncraft_path, 'headers', 'timing.c'),
-                    os.path.join(kerncraft_path, 'headers', 'dummy.c'),
-                    'bench.c',
-                    '-o', 'bench',
+                    kerncraft_path / 'headers' / 'timing.c',
+                    kerncraft_path / 'headers' / 'dummy.c',
+                    path / 'bench.c',
+                    '-o', path / 'bench',
                    ]
    run_compile_step(compile_cmd)

+    time_pre_estimation_per_iteration = float(subprocess.check_output(['./' / path / 'bench', str(10)]))
+    benchmark_time_limit = 20
+    if benchmark_time_limit / time_pre_estimation_per_iteration < inner_iterations:
+        warn = (f"A benchmark run with {inner_iterations} inner_iterations will probably take longer than "
+                f"{benchmark_time_limit} seconds for this kernel")
+        warnings.warn(warn)
+
    results = []
    for _ in range(outer_iterations):
-        benchmark_time = float(subprocess.check_output(['./bench', str(inner_iterations)]))
+        benchmark_time = float(subprocess.check_output(['./' / path / 'bench', str(inner_iterations)]))
        results.append(benchmark_time)
    return results
--- a/pystencils/kerncraft_coupling/kerncraft_interface.py
+++ b/pystencils/kerncraft_coupling/kerncraft_interface.py
 import warnings
+import fcntl
 from collections import defaultdict
 from tempfile import TemporaryDirectory
 from typing import Optional

-import kerncraft
+from jinja2 import Environment, PackageLoader, StrictUndefined
+
 import sympy as sp
 from kerncraft.kerncraft import KernelCode
 from kerncraft.machinemodel import MachineModel

-from pystencils.astnodes import (
-    KernelFunction, LoopOverCoordinate, ResolvedFieldAccess, SympyAssignment)
+from pystencils.astnodes import (KernelFunction, LoopOverCoordinate, ResolvedFieldAccess, SympyAssignment)
 from pystencils.field import get_layout_from_strides
-from pystencils.kerncraft_coupling.generate_benchmark import generate_benchmark
 from pystencils.sympyextensions import count_operations_in_ast
 from pystencils.transformations import filtered_tree_iteration
 from pystencils.utils import DotDict
+from pystencils.backends.cbackend import generate_c, get_headers
+from pystencils.cpu.kernelcreation import add_openmp


 class PyStencilsKerncraftKernel(KernelCode):
@@ -34,8 +36,10 @@ class PyStencilsKerncraftKernel(KernelCode):
            assumed_layout: either 'SoA' or 'AoS' - if fields have symbolic sizes the layout of the index
                    coordinates is not known. In this case either a structures of array (SoA) or
                    array of structures (AoS) layout is assumed
+            debug_print: print debug information
+            filename: used for caching
        """
-        kerncraft.kernel.Kernel.__init__(self, machine)
+        super(KernelCode, self).__init__(machine=machine)

        # Initialize state
        self.asm_block = None
@@ -96,7 +100,7 @@ class PyStencilsKerncraftKernel(KernelCode):
        for field in fields_accessed:
            layout = get_layout_tuple(field)
            permuted_shape = list(field.shape[i] for i in layout)
-            self.set_variable(field.name, str(field.dtype), tuple(permuted_shape))
+            self.set_variable(field.name, tuple([str(field.dtype)]), tuple(permuted_shape))

        # Scalars may be safely ignored
        # for param in ast.get_parameters():
@@ -129,24 +133,64 @@ class PyStencilsKerncraftKernel(KernelCode):
            print("-----------------------------  FLOPS -------------------------------")
            pprint(self._flops)

-    def as_code(self, type_='iaca', openmp=False, as_filename=False):
+    def get_kernel_header(self, name='pystencils_kernel'):
+        file_name = "pystencils_kernel.h"
+        file_path = self.get_intermediate_location(file_name, machine_and_compiler_dependent=False)
+        lock_mode, lock_fp = self.lock_intermediate(file_path)
+
+        if lock_mode == fcntl.LOCK_EX:
+            function_signature = generate_c(self.kernel_ast, dialect='c', signature_only=True)
+
+            jinja_context = {
+                'function_signature': function_signature,
+            }
+
+            env = Environment(loader=PackageLoader('pystencils.kerncraft_coupling'), undefined=StrictUndefined)
+            file_header = env.get_template('kernel.h').render(**jinja_context)
+            with open(file_path, 'w') as f:
+                f.write(file_header)
+
+            fcntl.flock(lock_fp, fcntl.LOCK_SH)  # degrade to shared lock
+
+        return file_path, lock_fp
+
+    def get_kernel_code(self, openmp=False, name='pystencils_kernl'):
        """
        Generate and return compilable source code.

        Args:
-            type_: can be iaca or likwid.
            openmp: if true, openmp code will be generated
-            as_filename:
+            name: kernel name
        """
-        code = generate_benchmark(self.kernel_ast, likwid=type_ == 'likwid', openmp=openmp)
-        if as_filename:
-            fp, already_available = self._get_intermediate_file(f'kernel_{type_}.c',
-                                                                machine_and_compiler_dependent=False)
-            if not already_available:
-                fp.write(code)
-            return fp.name
-        else:
-            return code
+        filename = 'pystencils_kernl'
+        if openmp:
+            filename += '-omp'
+        filename += '.c'
+        file_path = self.get_intermediate_location(filename, machine_and_compiler_dependent=False)
+        lock_mode, lock_fp = self.lock_intermediate(file_path)
+
+        if lock_mode == fcntl.LOCK_EX:
+            header_list = get_headers(self.kernel_ast)
+            includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])
+
+            if openmp:
+                add_openmp(self.kernel_ast)
+
+            kernel_code = generate_c(self.kernel_ast, dialect='c')
+
+            jinja_context = {
+                'includes': includes,
+                'kernel_code': kernel_code,
+            }
+
+            env = Environment(loader=PackageLoader('pystencils.kerncraft_coupling'), undefined=StrictUndefined)
+            file_header = env.get_template('kernel.c').render(**jinja_context)
+            with open(file_path, 'w') as f:
+                f.write(file_header)
+
+            fcntl.flock(lock_fp, fcntl.LOCK_SH)  # degrade to shared lock
+
+        return file_path, lock_fp


 class KerncraftParameters(DotDict):
@@ -161,6 +205,7 @@ class KerncraftParameters(DotDict):
        self['iterations'] = 10
        self['unit'] = 'cy/CL'
        self['ignore_warnings'] = True
+        self['incore_model'] = 'OSACA'


 # ------------------------------------------- Helper functions ---------------------------------------------------------

--- a/pystencils/kerncraft_coupling/templates/benchmark.c
+++ b/pystencils/kerncraft_coupling/templates/benchmark.c
+
+#include "kerncraft.h"
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <stdio.h>
+
+{{ includes }}
+
+{%- if likwid %}
+#include <likwid.h>
+{%- endif %}
+
+#define RESTRICT __restrict__
+#define FUNC_PREFIX
+void dummy(void *);
+void timing(double* wcTime, double* cpuTime);
+extern int var_false;
+
+
+{{kernel_code}}
+
+
+int main(int argc, char **argv)
+{
+  {%- if likwid %}
+  likwid_markerInit();
+  {%- endif %}
+
+  {%- for field_name, dataType, size in fields %}
+
+  // Initialization {{field_name}}
+  double * {{field_name}} = (double *) aligned_malloc(sizeof({{dataType}}) * {{size}}, 64);
+  for (unsigned long long i = 0; i < {{size}}; ++i)
+    {{field_name}}[i] = 0.23;
+
+  if(var_false)
+    dummy({{field_name}});
+
+  {%- endfor %}
+
+
+
+  {%- for constantName, dataType in constants %}
+
+  // Constant {{constantName}}
+  {{dataType}} {{constantName}};
+  {{constantName}} = 0.23;
+  if(var_false)
+      dummy(& {{constantName}});
+
+  {%- endfor %}
+
+  {%- if likwid and openmp %}
+  #pragma omp parallel
+  {
+  likwid_markerRegisterRegion("loop");
+  #pragma omp barrier
+  {%- elif likwid %}
+  likwid_markerRegisterRegion("loop");
+  {%- endif %}
+
+  for(int warmup = 1; warmup >= 0; --warmup) {
+    int repeat = 2;
+    if(warmup == 0) {
+      repeat = atoi(argv[1]);
+      {%- if likwid %}
+      likwid_markerStartRegion("loop");
+      {%- endif %}
+    }
+    
+    {%- if timing %}
+    double wcStartTime, cpuStartTime, wcEndTime, cpuEndTime;
+    timing(&wcStartTime, &cpuStartTime);
+    {%- endif %}
+    
+    for (; repeat > 0; --repeat)
+    {
+      {{kernelName}}({{call_argument_list}});
+
+      // Dummy calls
+      {%- for field_name, dataType, size in fields %}
+      if(var_false) dummy((void*){{field_name}});
+      {%- endfor %}
+      {%- for constantName, dataType in constants %}
+      if(var_false) dummy((void*)&{{constantName}});
+      {%- endfor %}
+    }
+    {%- if timing %}
+    timing(&wcEndTime, &cpuEndTime);
+    if( warmup == 0)
+        printf("%e\n", (wcEndTime - wcStartTime) / atoi(argv[1]) );
+    {%- endif %}
+
+  }
+
+  {%- if likwid %}
+  likwid_markerStopRegion("loop");
+  {%- if openmp %}
+  }
+  {%- endif %}
+  {%- endif %}
+
+  {%- if likwid %}
+  likwid_markerClose();
+  {%- endif %}
+}
--- a/pystencils/kerncraft_coupling/templates/kernel.c
+++ b/pystencils/kerncraft_coupling/templates/kernel.c
+
+#include "kerncraft.h"
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <stdio.h>
+
+{{ includes }}
+
+#define RESTRICT __restrict__
+#define FUNC_PREFIX
+void dummy(void *);
+void timing(double* wcTime, double* cpuTime);
+extern int var_false;
+
+
+{{kernel_code}}
\ No newline at end of file
--- a/pystencils/kerncraft_coupling/templates/kernel.h
+++ b/pystencils/kerncraft_coupling/templates/kernel.h
+#define FUNC_PREFIX
+
+{{function_signature}}
\ No newline at end of file
--- a/pystencils_tests/kerncraft_inputs/Example_SandyBridgeEP_E5-2680.yml
+++ b/pystencils_tests/kerncraft_inputs/Example_SandyBridgeEP_E5-2680.yml
--- a/pystencils_tests/kerncraft_inputs/default_machine_file.yaml
+++ b/pystencils_tests/kerncraft_inputs/default_machine_file.yaml
-kerncraft version: 0.7.3
-clock: 2.7 GHz
-cores per socket: 8
-cores per NUMA domain: 8
-NUMA domains per socket: 1
-model type: Intel Core SandyBridge EP processor
-model name: Intel(R) Xeon(R) CPU E5-2680 0 @ 2.70GHz
-sockets: 2
-threads per core: 2
-cacheline size: 64 B
-compiler:
-    !!omap
-    - icc: -O3 -xAVX -fno-alias -qopenmp
-    - clang: -O3 -march=corei7-avx -mtune=corei7-avx -D_POSIX_C_SOURCE=200112L -fopenmp
-    - gcc: -O3 -march=corei7-avx -D_POSIX_C_SOURCE=200112L -fopenmp
-micro-architecture: SNB
-FLOPs per cycle:
-    SP: {total: 16, ADD: 8, MUL: 8}
-    DP: {total: 8, ADD: 4, MUL: 4}
-overlapping model:
-    ports: ["0", "0DV", "1", "2", "3", "4", "5"]
-    performance counter metric:
-        Max(UOPS_DISPATCHED_PORT_PORT_0:PMC[0-3],
-            UOPS_DISPATCHED_PORT_PORT_1:PMC[0-3],
-            UOPS_DISPATCHED_PORT_PORT_4:PMC[0-3],
-            UOPS_DISPATCHED_PORT_PORT_5:PMC[0-3])
-non-overlapping model:
-    ports: ["2D", "3D"]
-    performance counter metric: T_OL + T_L1L2 + T_L2L3 + T_L3MEM
-write-allocate: True
-memory hierarchy:
-    - level: L1
-      cache per group: {
-         'sets': 64, 'ways': 8, 'cl_size': 64, # 32 kB
-         'replacement_policy': 'LRU',
-         'write_allocate': True, 'write_back': True,
-         'load_from': 'L2', 'store_to': 'L2'}
-      cores per group: 1
-      threads per group: 2
-      groups: 16
-      performance counter metrics:
-          accesses: MEM_UOPS_RETIRED_LOADS:PMC[0-3]
-          misses: L1D_REPLACEMENT:PMC[0-3]
-          evicts: L1D_M_EVICT:PMC[0-3]
-    - level: L2
-      cache per group: {
-         'sets': 512, 'ways': 8, 'cl_size': 64, # 256 kB
-         'replacement_policy': 'LRU',
-         'write_allocate': True, 'write_back': True,
-         'load_from': 'L3', 'store_to': 'L3'}
-      cores per group: 1
-      threads per group: 2
-      groups: 16
-      non-overlap upstream throughput: [32 B/cy, 'half-duplex']
-      performance counter metrics:
-          accesses: L1D_REPLACEMENT:PMC[0-3]
-          misses: L2_LINES_IN_ALL:PMC[0-3]
-          evicts: L2_TRANS_L2_WB:PMC[0-3]
-    - level: L3
-      cache per group: {
-         'sets': 20480, 'ways': 16, 'cl_size': 64, # 20 MB
-         'replacement_policy': 'LRU',
-         'write_allocate': True, 'write_back': True}
-      cores per group: 8
-      threads per group: 16
-      groups: 2
-      non-overlap upstream throughput: [32 B/cy, 'half-duplex']
-      performance counter metrics:
-          accesses: L2_LINES_IN_ALL:PMC[0-3]
-          misses: (CAS_COUNT_RD:MBOX0C[01] + CAS_COUNT_RD:MBOX1C[01] +
-                   CAS_COUNT_RD:MBOX2C[01] + CAS_COUNT_RD:MBOX3C[01])
-          evicts: (CAS_COUNT_WR:MBOX0C[01] + CAS_COUNT_WR:MBOX1C[01] +
-                   CAS_COUNT_WR:MBOX2C[01] + CAS_COUNT_WR:MBOX3C[01])
-    - level: MEM
-      cores per group: 8
-      non-overlap upstream throughput: ['full socket memory bandwidth', 'half-duplex']
-      size per group: null
-      threads per group: 16
-benchmarks:
-  kernels:
-    copy:
-      FLOPs per iteration: 0
-      read streams: {bytes: 8.00 B, streams: 1}
-      read+write streams: {bytes: 0.00 B, streams: 0}
-      write streams: {bytes: 8.00 B, streams: 1}
-    daxpy:
-      FLOPs per iteration: 2
-      read streams: {bytes: 16.00 B, streams: 2}
-      read+write streams: {bytes: 8.00 B, streams: 1}
-      write streams: {bytes: 8.00 B, streams: 1}
-    load:
-      FLOPs per iteration: 0
-      read streams: {bytes: 8.00 B, streams: 1}
-      read+write streams: {bytes: 0.00 B, streams: 0}
-      write streams: {bytes: 0.00 B, streams: 0}
-    triad:
-      FLOPs per iteration: 2
-      read streams: {bytes: 24.00 B, streams: 3}
-      read+write streams: {bytes: 0.00 B, streams: 0}
-      write streams: {bytes: 8.00 B, streams: 1}
-    update:
-      FLOPs per iteration: 0
-      read streams: {bytes: 8.00 B, streams: 1}
-      read+write streams: {bytes: 8.00 B, streams: 1}
-      write streams: {bytes: 8.00 B, streams: 1}
-  measurements:
-    L1:
-      1:
-        cores: [1, 2, 3, 4, 5, 6, 7, 8]
-        results:
-          copy: [81.98 GB/s, 163.75 GB/s, 245.62 GB/s, 327.69 GB/s, 409.41 GB/s, 489.83
-              GB/s, 571.67 GB/s, 653.50 GB/s]
-          daxpy: [71.55 GB/s, 143.01 GB/s, 214.86 GB/s, 286.26 GB/s, 355.60 GB/s,
-            426.71 GB/s, 497.45 GB/s, 568.97 GB/s]
-          load: [61.92 GB/s, 122.79 GB/s, 183.01 GB/s, 244.30 GB/s, 306.76 GB/s, 368.46
-              GB/s, 427.41 GB/s, 490.88 GB/s]
-          triad: [81.61 GB/s, 163.25 GB/s, 244.92 GB/s, 326.65 GB/s, 406.69 GB/s,
-            487.76 GB/s, 569.10 GB/s, 650.39 GB/s]
-          update: [84.03 GB/s, 168.02 GB/s, 252.10 GB/s, 335.94 GB/s, 419.90 GB/s,
-            503.88 GB/s, 587.86 GB/s, 671.88 GB/s]
-        size per core: [16.00 kB, 16.00 kB, 16.00 kB, 16.00 kB, 16.00 kB, 16.00 kB,
-          16.00 kB, 16.00 kB]
-        size per thread: [16.00 kB, 16.00 kB, 16.00 kB, 16.00 kB, 16.00 kB, 16.00
-            kB, 16.00 kB, 16.00 kB]
-        threads: [1, 2, 3, 4, 5, 6, 7, 8]
-        threads per core: 1
-        total size: [16.00 kB, 32.00 kB, 48.00 kB, 64.00 kB, 80.00 kB, 96.00 kB, 112.00
-            kB, 128.00 kB]
-      2:
-        cores: [1, 2, 3, 4, 5, 6, 7, 8]
-        results:
-          copy: [79.53 GB/s, 158.70 GB/s, 238.20 GB/s, 317.62 GB/s, 397.09 GB/s, 476.33
-              GB/s, 555.69 GB/s, 634.96 GB/s]
-          daxpy: [70.94 GB/s, 141.90 GB/s, 212.97 GB/s, 283.91 GB/s, 354.93 GB/s,
-            425.85 GB/s, 496.74 GB/s, 567.40 GB/s]
-          load: [57.01 GB/s, 114.11 GB/s, 171.11 GB/s, 228.13 GB/s, 285.15 GB/s, 342.11
-              GB/s, 399.11 GB/s, 456.11 GB/s]
-          triad: [79.48 GB/s, 159.03 GB/s, 238.53 GB/s, 318.04 GB/s, 392.11 GB/s,
-            477.10 GB/s, 538.36 GB/s, 636.02 GB/s]
-          update: [82.75 GB/s, 165.55 GB/s, 248.50 GB/s, 331.32 GB/s, 414.06 GB/s,
-            496.82 GB/s, 579.83 GB/s, 662.36 GB/s]
-        size per core: [16.00 kB, 16.00 kB, 16.00 kB, 16.00 kB, 16.00 kB, 16.00 kB,
-          16.00 kB, 16.00 kB]
-        size per thread: [8.00 kB, 8.00 kB, 8.00 kB, 8.00 kB, 8.00 kB, 8.00 kB, 8.00
-            kB, 8.00 kB]
-        threads: [2, 4, 6, 8, 10, 12, 14, 16]
-        threads per core: 2
-        total size: [16.00 kB, 32.00 kB, 48.00 kB, 64.00 kB, 80.00 kB, 96.00 kB, 112.00
-            kB, 128.00 kB]
-    L2:
-      1:
-        cores: [1, 2, 3, 4, 5, 6, 7, 8]
-        results:
-          copy: [41.28 GB/s, 81.96 GB/s, 120.28 GB/s, 160.70 GB/s, 203.22 GB/s, 239.97
-              GB/s, 271.13 GB/s, 307.01 GB/s]
-          daxpy: [48.85 GB/s, 98.62 GB/s, 143.29 GB/s, 197.76 GB/s, 230.58 GB/s, 284.98
-              GB/s, 334.22 GB/s, 385.72 GB/s]
-          load: [38.51 GB/s, 76.67 GB/s, 114.73 GB/s, 152.90 GB/s, 188.69 GB/s, 223.64
-              GB/s, 265.21 GB/s, 289.41 GB/s]
-          triad: [40.92 GB/s, 83.49 GB/s, 124.48 GB/s, 165.24 GB/s, 206.74 GB/s, 237.90
-              GB/s, 274.96 GB/s, 329.09 GB/s]
-          update: [50.37 GB/s, 100.05 GB/s, 145.43 GB/s, 196.82 GB/s, 244.07 GB/s,
-            301.62 GB/s, 336.88 GB/s, 403.78 GB/s]
-        size per core: [128.00 kB, 128.00 kB, 128.00 kB, 128.00 kB, 128.00 kB, 128.00
-            kB, 128.00 kB, 128.00 kB]
-        size per thread: [128.00 kB, 128.00 kB, 128.00 kB, 128.00 kB, 128.00 kB, 128.00
-            kB, 128.00 kB, 128.00 kB]
-        threads: [1, 2, 3, 4, 5, 6, 7, 8]
-        threads per core: 1
-        total size: [128.00 kB, 256.00 kB, 384.00 kB, 512.00 kB, 640.00 kB, 768.00
-            kB, 0.90 MB, 1.02 MB]
-      2:
-        cores: [1, 2, 3, 4, 5, 6, 7, 8]
-        results:
-          copy: [42.17 GB/s, 83.47 GB/s, 124.57 GB/s, 163.78 GB/s, 202.56 GB/s, 242.80
-              GB/s, 276.95 GB/s, 311.36 GB/s]
-          daxpy: [50.87 GB/s, 98.72 GB/s, 152.12 GB/s, 193.48 GB/s, 251.36 GB/s, 301.72
-              GB/s, 352.55 GB/s, 365.28 GB/s]
-          load: [39.62 GB/s, 79.03 GB/s, 118.03 GB/s, 157.85 GB/s, 196.48 GB/s, 237.44
-              GB/s, 276.81 GB/s, 309.71 GB/s]
-          triad: [44.80 GB/s, 88.35 GB/s, 125.13 GB/s, 169.94 GB/s, 209.60 GB/s, 260.15
-              GB/s, 300.75 GB/s, 333.08 GB/s]
-          update: [49.80 GB/s, 100.70 GB/s, 150.56 GB/s, 196.44 GB/s, 251.90 GB/s,
-            280.93 GB/s, 352.74 GB/s, 399.27 GB/s]
-        size per core: [128.00 kB, 128.00 kB, 128.00 kB, 128.00 kB, 128.00 kB, 128.00
-            kB, 128.00 kB, 128.00 kB]
-        size per thread: [64.00 kB, 64.00 kB, 64.00 kB, 64.00 kB, 64.00 kB, 64.00
-            kB, 64.00 kB, 64.00 kB]
-        threads: [2, 4, 6, 8, 10, 12, 14, 16]
-        threads per core: 2
-        total size: [128.00 kB, 256.00 kB, 384.00 kB, 512.00 kB, 640.00 kB, 768.00
-            kB, 0.90 MB, 1.02 MB]
-    L3:
-      1:
-        cores: [1, 2, 3, 4, 5, 6, 7, 8]
-        results:
-          copy: [23.21 GB/s, 46.01 GB/s, 67.96 GB/s, 90.17 GB/s, 111.47 GB/s, 133.14
-              GB/s, 153.84 GB/s, 174.92 GB/s]
-          daxpy: [30.35 GB/s, 60.32 GB/s, 90.00 GB/s, 119.71 GB/s, 148.87 GB/s, 178.39
-              GB/s, 207.10 GB/s, 236.25 GB/s]
-          load: [23.35 GB/s, 46.52 GB/s, 69.57 GB/s, 92.60 GB/s, 115.77 GB/s, 138.89
-              GB/s, 161.82 GB/s, 184.11 GB/s]
-          triad: [25.18 GB/s, 50.08 GB/s, 74.33 GB/s, 98.78 GB/s, 122.66 GB/s, 146.78
-              GB/s, 170.52 GB/s, 194.47 GB/s]
-          update: [32.67 GB/s, 64.65 GB/s, 95.98 GB/s, 127.29 GB/s, 157.67 GB/s, 188.22
-              GB/s, 217.41 GB/s, 246.99 GB/s]
-        size per core: [1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25
-            MB, 1.25 MB]
-        size per thread: [1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25
-            MB, 1.25 MB]
-        threads: [1, 2, 3, 4, 5, 6, 7, 8]
-        threads per core: 1
-        total size: [1.25 MB, 2.50 MB, 3.75 MB, 5.00 MB, 6.25 MB, 7.50 MB, 8.75 MB,
-          10.00 MB]
-      2:
-        cores: [1, 2, 3, 4, 5, 6, 7, 8]
-        results:
-          copy: [23.83 GB/s, 47.25 GB/s, 69.84 GB/s, 92.61 GB/s, 114.31 GB/s, 136.48
-              GB/s, 157.55 GB/s, 178.99 GB/s]
-          daxpy: [31.52 GB/s, 62.72 GB/s, 93.43 GB/s, 124.29 GB/s, 154.55 GB/s, 185.18
-              GB/s, 215.10 GB/s, 245.24 GB/s]
-          load: [27.63 GB/s, 54.93 GB/s, 81.57 GB/s, 108.63 GB/s, 134.91 GB/s, 161.72
-              GB/s, 188.15 GB/s, 214.94 GB/s]
-          triad: [25.90 GB/s, 51.76 GB/s, 76.73 GB/s, 102.29 GB/s, 126.17 GB/s, 152.10
-              GB/s, 176.71 GB/s, 200.64 GB/s]
-          update: [34.10 GB/s, 67.67 GB/s, 100.62 GB/s, 133.50 GB/s, 165.61 GB/s,
-            197.74 GB/s, 228.73 GB/s, 259.05 GB/s]
-        size per core: [1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25
-            MB, 1.25 MB]
-        size per thread: [625.00 kB, 625.00 kB, 625.00 kB, 625.00 kB, 625.00 kB, 625.00
-            kB, 625.00 kB, 625.00 kB]
-        threads: [2, 4, 6, 8, 10, 12, 14, 16]
-        threads per core: 2
-        total size: [1.25 MB, 2.50 MB, 3.75 MB, 5.00 MB, 6.25 MB, 7.50 MB, 8.75 MB,
-          10.00 MB]
-    MEM:
-      1:
-        cores: [1, 2, 3, 4, 5, 6, 7, 8]
-        results:
-          copy: [11.60 GB/s, 21.29 GB/s, 25.94 GB/s, 27.28 GB/s, 27.47 GB/s, 27.36
-              GB/s, 27.21 GB/s, 27.12 GB/s]
-          daxpy: [17.33 GB/s, 31.89 GB/s, 38.65 GB/s, 40.50 GB/s, 40.81 GB/s, 40.62
-              GB/s, 40.59 GB/s, 40.26 GB/s]
-          load: [12.01 GB/s, 23.04 GB/s, 32.79 GB/s, 40.21 GB/s, 43.39 GB/s, 44.14
-              GB/s, 44.42 GB/s, 44.40 GB/s]
-          triad: [12.73 GB/s, 24.27 GB/s, 30.43 GB/s, 31.46 GB/s, 31.77 GB/s, 31.74
-              GB/s, 31.65 GB/s, 31.52 GB/s]
-          update: [18.91 GB/s, 32.43 GB/s, 37.28 GB/s, 39.98 GB/s, 40.99 GB/s, 40.92
-              GB/s, 40.61 GB/s, 40.34 GB/s]
-        size per core: [40.00 MB, 20.00 MB, 13.33 MB, 10.00 MB, 8.00 MB, 6.67 MB,
-          5.71 MB, 5.00 MB]
-        size per thread: [40.00 MB, 20.00 MB, 13.33 MB, 10.00 MB, 8.00 MB, 6.67 MB,
-          5.71 MB, 5.00 MB]
-        threads: [1, 2, 3, 4, 5, 6, 7, 8]
-        threads per core: 1
-        total size: [40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB]
-      2:
-        cores: [1, 2, 3, 4, 5, 6, 7, 8]
-        results:
-          copy: [10.92 GB/s, 20.62 GB/s, 25.34 GB/s, 26.22 GB/s, 26.32 GB/s, 26.31
-              GB/s, 26.22 GB/s, 26.16 GB/s]
-          daxpy: [17.15 GB/s, 31.96 GB/s, 38.12 GB/s, 39.19 GB/s, 39.38 GB/s, 39.16
-              GB/s, 39.06 GB/s, 38.87 GB/s]
-          load: [13.49 GB/s, 25.92 GB/s, 36.16 GB/s, 41.56 GB/s, 43.34 GB/s, 43.40
-              GB/s, 43.01 GB/s, 42.66 GB/s]
-          triad: [12.38 GB/s, 23.17 GB/s, 28.69 GB/s, 29.98 GB/s, 30.50 GB/s, 30.59
-              GB/s, 30.75 GB/s, 30.70 GB/s]
-          update: [19.67 GB/s, 34.93 GB/s, 39.93 GB/s, 40.79 GB/s, 40.43 GB/s, 40.03
-              GB/s, 39.62 GB/s, 39.33 GB/s]
-        size per core: [40.00 MB, 20.00 MB, 13.33 MB, 10.00 MB, 8.00 MB, 6.67 MB,
-          5.71 MB, 5.00 MB]
-        size per thread: [20.00 MB, 10.00 MB, 6.67 MB, 5.00 MB, 4.00 MB, 3.33 MB,
-          2.86 MB, 2.50 MB]
-        threads: [2, 4, 6, 8, 10, 12, 14, 16]
-        threads per core: 2
-        total size: [40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB]
-
--- a/pystencils_tests/test_kerncraft_coupling.py
+++ b/pystencils_tests/test_kerncraft_coupling.py
-import os
-
 import numpy as np
 import pytest
 import sympy as sp
-import kerncraft
+from pathlib import Path
+
+from kerncraft.kernel import KernelCode
+from kerncraft.machinemodel import MachineModel
+from kerncraft.models import ECM, ECMData, Benchmark

 from pystencils import Assignment, Field
 from pystencils.cpu import create_kernel
 from pystencils.kerncraft_coupling import KerncraftParameters, PyStencilsKerncraftKernel
-from pystencils.kerncraft_coupling.generate_benchmark import generate_benchmark
+from pystencils.kerncraft_coupling.generate_benchmark import generate_benchmark, run_c_benchmark
+from pystencils.timeloop import TimeLoop

-SCRIPT_FOLDER = os.path.dirname(os.path.realpath(__file__))
-INPUT_FOLDER = os.path.join(SCRIPT_FOLDER, "kerncraft_inputs")
+SCRIPT_FOLDER = Path(__file__).parent
+INPUT_FOLDER = SCRIPT_FOLDER / "kerncraft_inputs"


 @pytest.mark.kerncraft
 def test_compilation():
-    machine_file_path = os.path.join(INPUT_FOLDER, "default_machine_file.yaml")
-    machine = kerncraft.machinemodel.MachineModel(path_to_yaml=machine_file_path)
+    machine_file_path = INPUT_FOLDER / "Example_SandyBridgeEP_E5-2680.yml"
+    machine = MachineModel(path_to_yaml=machine_file_path)

-    kernel_file_path = os.path.join(INPUT_FOLDER, "2d-5pt.c")
+    kernel_file_path = INPUT_FOLDER / "2d-5pt.c"
    with open(kernel_file_path) as kernel_file:
-        reference_kernel = kerncraft.kernel.KernelCode(kernel_file.read(), machine=machine, filename=kernel_file_path)
-        reference_kernel.as_code('likwid')
+        reference_kernel = KernelCode(kernel_file.read(), machine=machine, filename=kernel_file_path)
+        reference_kernel.get_kernel_header(name='test_kernel')
+        reference_kernel.get_kernel_code(name='test_kernel')
+        reference_kernel.get_main_code(kernel_function_name='test_kernel')

    size = [30, 50, 3]
    arr = np.zeros(size)
@@ -38,31 +43,31 @@ def test_compilation():

 @pytest.mark.kerncraft
 def analysis(kernel, model='ecmdata'):
-    machine_file_path = os.path.join(INPUT_FOLDER, "default_machine_file.yaml")
-    machine = kerncraft.machinemodel.MachineModel(path_to_yaml=machine_file_path)
+    machine_file_path = INPUT_FOLDER / "Example_SandyBridgeEP_E5-2680.yml"
+    machine = MachineModel(path_to_yaml=machine_file_path)
    if model == 'ecmdata':
-        model = kerncraft.models.ECMData(kernel, machine, KerncraftParameters())
+        model = ECMData(kernel, machine, KerncraftParameters())
    elif model == 'ecm':
-        model = kerncraft.models.ECM(kernel, machine, KerncraftParameters())
+        model = ECM(kernel, machine, KerncraftParameters())
        # model.analyze()
        # model.plot()
    elif model == 'benchmark':
-        model = kerncraft.models.Benchmark(kernel, machine, KerncraftParameters())
+        model = Benchmark(kernel, machine, KerncraftParameters())
    else:
-        model = kerncraft.models.ECM(kernel, machine, KerncraftParameters())
+        model = ECM(kernel, machine, KerncraftParameters())
    model.analyze()
    return model


 @pytest.mark.kerncraft
-def test_3d_7pt_iaca():
-    # Make sure you use the intel compiler
+def test_3d_7pt_osaca():
+
    size = [20, 200, 200]
-    kernel_file_path = os.path.join(INPUT_FOLDER, "3d-7pt.c")
-    machine_file_path = os.path.join(INPUT_FOLDER, "default_machine_file.yaml")
-    machine = kerncraft.machinemodel.MachineModel(path_to_yaml=machine_file_path)
+    kernel_file_path = INPUT_FOLDER / "3d-7pt.c"
+    machine_file_path = INPUT_FOLDER / "Example_SandyBridgeEP_E5-2680.yml"
+    machine_model = MachineModel(path_to_yaml=machine_file_path)
    with open(kernel_file_path) as kernel_file:
-        reference_kernel = kerncraft.kernel.KernelCode(kernel_file.read(), machine=machine, filename=kernel_file_path)
+        reference_kernel = KernelCode(kernel_file.read(), machine=machine_model, filename=kernel_file_path)
    reference_kernel.set_constant('M', size[0])
    reference_kernel.set_constant('N', size[1])
    assert size[1] == size[2]
@@ -76,7 +81,7 @@ def test_3d_7pt_iaca():

    update_rule = Assignment(b[0, 0, 0], s * rhs)
    ast = create_kernel([update_rule])
-    k = PyStencilsKerncraftKernel(ast, machine)
+    k = PyStencilsKerncraftKernel(ast, machine=machine_model)
    analysis(k, model='ecm')
    assert reference_kernel._flops == k._flops
    # assert reference.results['cl throughput'] == analysis.results['cl throughput']
@@ -85,9 +90,9 @@ def test_3d_7pt_iaca():
 @pytest.mark.kerncraft
 def test_2d_5pt():
    size = [30, 50, 3]
-    kernel_file_path = os.path.join(INPUT_FOLDER, "2d-5pt.c")
+    kernel_file_path = INPUT_FOLDER / "2d-5pt.c"
    with open(kernel_file_path) as kernel_file:
-        reference_kernel = kerncraft.kernel.KernelCode(kernel_file.read(), machine=None, filename=kernel_file_path)
+        reference_kernel = KernelCode(kernel_file.read(), machine=None, filename=kernel_file_path)
    reference = analysis(reference_kernel)

    arr = np.zeros(size)
@@ -107,9 +112,9 @@ def test_2d_5pt():
 @pytest.mark.kerncraft
 def test_3d_7pt():
    size = [30, 50, 50]
-    kernel_file_path = os.path.join(INPUT_FOLDER, "3d-7pt.c")
+    kernel_file_path = INPUT_FOLDER / "3d-7pt.c"
    with open(kernel_file_path) as kernel_file:
-        reference_kernel = kerncraft.kernel.KernelCode(kernel_file.read(), machine=None, filename=kernel_file_path)
+        reference_kernel = KernelCode(kernel_file.read(), machine=None, filename=kernel_file_path)
    reference_kernel.set_constant('M', size[0])
    reference_kernel.set_constant('N', size[1])
    assert size[1] == size[2]
@@ -128,3 +133,29 @@ def test_3d_7pt():

    for e1, e2 in zip(reference.results['cycles'], result.results['cycles']):
        assert e1 == e2
+
+
+@pytest.mark.kerncraft
+def test_benchmark():
+    size = [30, 50, 50]
+    arr = np.zeros(size)
+    a = Field.create_from_numpy_array('a', arr, index_dimensions=0)
+    b = Field.create_from_numpy_array('b', arr, index_dimensions=0)
+    s = sp.Symbol("s")
+    rhs = a[0, -1, 0] + a[0, 1, 0] + a[-1, 0, 0] + a[1, 0, 0] + a[0, 0, -1] + a[0, 0, 1]
+
+    update_rule = Assignment(b[0, 0, 0], s * rhs)
+    ast = create_kernel([update_rule])
+
+    c_benchmark_run = run_c_benchmark(ast, inner_iterations=1000, outer_iterations=1)
+
+    kernel = ast.compile()
+    a = np.full(size, fill_value=0.23)
+    b = np.full(size, fill_value=0.23)
+
+    timeloop = TimeLoop(steps=1)
+    timeloop.add_call(kernel, {'a': a, 'b': b, 's': 0.23})
+
+    timeloop_time = timeloop.benchmark(number_of_time_steps_for_estimation=1)
+
+    np.testing.assert_almost_equal(c_benchmark_run, timeloop_time, decimal=4)