From 9da89cd1cb7646d663361737ecd5ae8e964fb478 Mon Sep 17 00:00:00 2001
From: Julian Hammer <julian.hammer@fau.de>
Date: Fri, 6 Nov 2020 15:44:34 +0100
Subject: [PATCH] Updated Kerncraft Coupling

---
 .../kerncraft_coupling/kerncraft_interface.py | 222 +++++++++++++++++-
 1 file changed, 214 insertions(+), 8 deletions(-)

diff --git a/pystencils/pystencils/kerncraft_coupling/kerncraft_interface.py b/pystencils/pystencils/kerncraft_coupling/kerncraft_interface.py
index 0aa4dcd24..0f156a01b 100644
--- a/pystencils/pystencils/kerncraft_coupling/kerncraft_interface.py
+++ b/pystencils/pystencils/kerncraft_coupling/kerncraft_interface.py
@@ -1,4 +1,8 @@
 from tempfile import TemporaryDirectory
+import fcntl
+import textwrap
+from copy import deepcopy
+import warnings
 
 import sympy as sp
 from collections import defaultdict
@@ -6,14 +10,18 @@ import kerncraft
 import kerncraft.kernel
 from typing import Optional
 from kerncraft.machinemodel import MachineModel
+from jinja2 import Template
 
 from pystencils.kerncraft_coupling.generate_benchmark import generate_benchmark
-from pystencils.astnodes import LoopOverCoordinate, SympyAssignment, ResolvedFieldAccess, KernelFunction
+from pystencils.backends.cbackend import generate_c, get_headers
+from pystencils.astnodes import \
+    LoopOverCoordinate, SympyAssignment, ResolvedFieldAccess, KernelFunction
 from pystencils.field import get_layout_from_strides
 from pystencils.sympyextensions import count_operations_in_ast
 from pystencils.transformations import filtered_tree_iteration
 from pystencils.utils import DotDict
-import warnings
+from pystencils.astnodes import PragmaBlock
+
 
 
 class PyStencilsKerncraftKernel(kerncraft.kernel.KernelCode):
@@ -39,6 +47,7 @@ class PyStencilsKerncraftKernel(kerncraft.kernel.KernelCode):
         # Initialize state
         self.asm_block = None
         self._filename = filename
+        self._keep_intermediates = False
 
         self.kernel_ast = ast
         self.temporary_dir = TemporaryDirectory()
@@ -94,7 +103,7 @@ class PyStencilsKerncraftKernel(kerncraft.kernel.KernelCode):
         for field in fields_accessed:
             layout = get_layout_tuple(field)
             permuted_shape = list(field.shape[i] for i in layout)
-            self.set_variable(field.name, str(field.dtype), tuple(permuted_shape))
+            self.set_variable(field.name, (str(field.dtype),), tuple(permuted_shape))
 
         # Scalars may be safely ignored
         # for param in ast.get_parameters():
@@ -127,14 +136,211 @@ class PyStencilsKerncraftKernel(kerncraft.kernel.KernelCode):
             print("-----------------------------  FLOPS -------------------------------")
             pprint(self._flops)
 
-    def as_code(self, type_='iaca', openmp=False):
+    def get_kernel_header(self, name='kernel'):
+        """
+        Write the kernel signature to kernel.h, reusing a cached copy if one exists.
+
+        :return: tuple of header file name and file pointer of the lock file
+        """
+        header_name = 'kernel.h'
+        header_path = self.get_intermediate_location(
+            header_name, machine_and_compiler_dependent=False)
+        mode, lock_fp = self.lock_intermediate(header_path)
+        if mode != fcntl.LOCK_SH:
+            # exclusive lock (fcntl.LOCK_EX): header needs to be (re)generated
+            declaration = generate_c(self.kernel_ast, signature_only=True)
+            with open(header_path, 'w') as header_file:
+                header_file.write(declaration)
+            self.release_exclusive_lock(lock_fp)  # degrade to shared lock
+        else:
+            # shared lock: use the cached header on disk
+            with open(header_path) as header_file:
+                header_file.read()
+
+        return header_name, lock_fp
+
+    def get_kernel_code(self, openmp=False, name='kernel'):
+        """
+        Generate compilable C source with the kernel function from the AST and store it.
+
+        :param openmp: include openmp pragmas (not supported yet, must be False)
+        :param name: name of kernel function (currently not used)
+        :return: tuple of file path of the source file and file pointer of the lock file
+        """
+        assert not openmp, "openmp is currently not support by pystencils"
+        filename = 'kernel'
+        if openmp:
+            filename += '-omp'
+        filename += '.c'
+        file_path = self.get_intermediate_location(
+            filename, machine_and_compiler_dependent=False)
+        lock_mode, lock_fp = self.lock_intermediate(file_path)
+
+        if lock_mode == fcntl.LOCK_SH:
+            # use cache
+            with open(file_path) as f:
+                code = f.read()
+        else:  # lock_mode == fcntl.LOCK_EX
+            # needs update
+
+            kernel_template = Template(textwrap.dedent("""
+            #include <stdlib.h>
+            #include <stdint.h>
+            #include <stdbool.h>
+            #include <math.h>
+
+            #define RESTRICT __restrict__
+            #define FUNC_PREFIX
+
+            {{ includes }}
+
+            {{kernel_code}}
+            """))
+            header_list = get_headers(self.kernel_ast)
+            includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])
+
+            ast = deepcopy(self.kernel_ast)
+            # Strip "#pragma omp parallel" from within the kernel (on a copy, so that
+            # self.kernel_ast stays untouched), because the main function takes care of that
+            if len(ast.body.args) > 0 and isinstance(ast.body.args[0], PragmaBlock):
+                ast.body.args[0].pragma_line = ''
+
+            code = kernel_template.render(
+                kernel_code=generate_c(ast, dialect='c'),  # print the stripped copy
+                includes=includes)
+
+            # Store to file
+            with open(file_path, 'w') as f:
+                f.write(code)
+            self.release_exclusive_lock(lock_fp)  # degrade to shared lock
+
+        return file_path, lock_fp
+
+    CODE_TEMPLATE = textwrap.dedent("""
+        #include <likwid.h>
+        #include <stdlib.h>
+        #include <stdint.h>
+        #include <stdbool.h>
+        #include <math.h>
+        #include "kerncraft.h"
+        #include "kernel.h"
+
+        #define RESTRICT __restrict__
+        #define FUNC_PREFIX
+        void dummy(void *);
+        extern int var_false;
+
+        int main(int argc, char **argv) {
+          {%- for constantName, dataType in constants %}
+          // Constant {{constantName}}
+          {{dataType}} {{constantName}};
+          {{constantName}} = 0.23;
+          {%- endfor %}
+
+          // Declaring arrays
+          {%- for field_name, dataType, size in fields %}
+
+          // Initialization {{field_name}}
+          {{dataType}} * {{field_name}} = ({{dataType}} *) aligned_malloc(sizeof({{dataType}}) * {{size}}, 64);
+          // TODO initialize in parallel context in same order as they are touched
+          for (unsigned long long i = 0; i < {{size}}; ++i)
+            {{field_name}}[i] = 0.23;
+          {%- endfor %}
+
+          likwid_markerInit();
+          #pragma omp parallel
+          {
+            likwid_markerRegisterRegion("loop");
+            #pragma omp barrier
+
+            // Initializing arrays in same order as touched in kernel loop nest
+            //INIT_ARRAYS;
+
+            // Dummy call
+            {%- for field_name, dataType, size in fields %}
+            if(var_false) dummy({{field_name}});
+            {%- endfor %}
+            {%- for constantName, dataType in constants %}
+            if(var_false) dummy(&{{constantName}});
+            {%- endfor %}
+
+            for(int warmup = 1; warmup >= 0; --warmup) {
+              int repeat = 2;
+              if(warmup == 0) {
+                repeat = atoi(argv[1]);
+                likwid_markerStartRegion("loop");
+              }
+
+              for(; repeat > 0; --repeat) {
+                {{kernelName}}({{call_argument_list}});
+
+                {%- for field_name, dataType, size in fields %}
+                if(var_false) dummy({{field_name}});
+                {%- endfor %}
+                {%- for constantName, dataType in constants %}
+                if(var_false) dummy(&{{constantName}});
+                {%- endfor %}
+              }
+
+            }
+            likwid_markerStopRegion("loop");
+          }
+          likwid_markerClose();
+          return 0;
+        }
+        """)  # Jinja2 source of the benchmark main(); arrays use each field's own dataType
+
+    def get_main_code(self, kernel_function_name='kernel'):
         """
-        Generate and return compilable source code.
+        Generate the benchmark driver main.c from the kernel AST, reusing a cached copy if present.
 
-        :param type: can be iaca or likwid.
-        :param openmp: if true, openmp code will be generated
+        :return: tuple of file path of main.c and file pointer of the lock file
         """
-        return generate_benchmark(self.kernel_ast, likwid=type_ == 'likwid', openmp=openmp)
+        # TODO produce nicer code, including help text and other "comfort features".
+        from pystencils.data_types import get_base_type  # local: not imported at module level
+        from pystencils.sympyextensions import prod  # local: not imported at module level
+
+        ast = self.kernel_ast
+        assert ast is not None, "AST does not exist, this could be due to running " \
+                                "based on a kernel description rather than code."
+
+        file_path = self.get_intermediate_location('main.c', machine_and_compiler_dependent=False)
+        lock_mode, lock_fp = self.lock_intermediate(file_path)
+
+        if lock_mode == fcntl.LOCK_SH:
+            # use cache
+            with open(file_path) as f:
+                code = f.read()
+        else:  # lock_mode == fcntl.LOCK_EX
+            # needs update
+            accessed_fields = {f.name: f for f in ast.fields_accessed}
+            constants = []
+            fields = []
+            call_parameters = []
+            for p in ast.get_parameters():
+                if not p.is_field_parameter:
+                    constants.append((p.symbol.name, str(p.symbol.dtype)))
+                    call_parameters.append(p.symbol.name)
+                else:
+                    assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size"
+                    field = accessed_fields[p.field_name]
+                    dtype = str(get_base_type(p.symbol.dtype))
+                    fields.append((p.field_name, dtype, prod(field.shape)))
+                    call_parameters.append(p.field_name)
+
+            # Generate code; keyword names must match the placeholders in CODE_TEMPLATE
+            code = Template(self.CODE_TEMPLATE).render(
+                kernelName=ast.function_name,
+                fields=fields,
+                constants=constants,
+                call_argument_list=','.join(call_parameters))
+
+            # Store to file
+            with open(file_path, 'w') as f:
+                f.write(code)
+            self.release_exclusive_lock(lock_fp)  # degrade to shared lock
+
+        return file_path, lock_fp
 
 
 class KerncraftParameters(DotDict):
-- 
GitLab