changes for lbmpy tests

5e3a72de · Jan Laukemann · 1754ef27 · 5e3a72de · 5e3a72de
Commit 5e3a72de authored 5 years ago by Jan Laukemann
--- a/pystencils/kerncraft_coupling/generate_benchmark.py
+++ b/pystencils/kerncraft_coupling/generate_benchmark.py
 from jinja2 import Template
+import numpy as np
 import os
 import subprocess
 from pystencils.include import get_pystencils_include_path
@@ -8,6 +9,25 @@ from pystencils.sympyextensions import prod
 from pystencils.data_types import get_base_type
 from pystencils.astnodes import PragmaBlock
+# Preamble template (includes plus the RESTRICT / FUNC_PREFIX macro definitions).
+# generate_benchmark() renders it in front of the generated kernel code when it
+# is called with main=False and kernel=True; the optional likwid include is
+# controlled by the `likwid` template variable.
+include_template = Template("""
+#include "kerncraft.h"
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <stdio.h>
+{{ includes }}
+{%- if likwid %}
+#include <likwid.h>
+{%- endif %}
+#define RESTRICT __restrict__
+#define FUNC_PREFIX
+""")
 benchmark_template = Template("""
 #include "kerncraft.h"
 #include <stdlib.h>
@@ -28,9 +48,42 @@ void dummy(void *);
 void timing(double* wcTime, double* cpuTime);
 extern int var_false;
 {{kernel_code}}
+void *aligned_malloc_with_offset( unsigned long size, unsigned long alignment, unsigned long offset )
+{
+   // With 0 alignment this function makes no sense
+   // use normal malloc instead
+   //WALBERLA_ASSERT_GREATER( alignment, 0 );
+   // Tests if alignment is power of two (assuming alignment>0)
+   //WALBERLA_ASSERT( !(alignment & (alignment - 1)) );
+   //WALBERLA_ASSERT_LESS( offset, alignment );
+   if( offset == 0 )
+      return aligned_malloc( size, alignment );
+   void *pa;  // pointer to allocated memory
+   void *ptr; // pointer to usable aligned memory
+   pa=malloc( (size+2*alignment-1 )+sizeof(void *));
+   if(!pa)
+      return 0;
+   // Find next aligned position, starting at pa+sizeof(void*)-1
+   ptr=(void*)( ((size_t)pa+sizeof(void *)+alignment-1) & ~(alignment-1));
+   ptr=(void*) ( (char*)(ptr) + alignment - offset);
+   // Store pointer to real allocated chunk just before usable chunk
+   *((void **)ptr-1)=pa;
+   //WALBERLA_ASSERT_EQUAL( ((size_t)ptr+offset) % alignment, 0 );
+   return ptr;
+}
 int main(int argc, char **argv)
 {
@@ -41,7 +94,12 @@ int main(int argc, char **argv)
  {%- for field_name, dataType, size in fields %}
  // Initialization {{field_name}}
+  {%- if aligned %}
+  double * {{field_name}} = (double *) aligned_malloc_with_offset(sizeof({{dataType}}) * {{size}}, 32, 8);
+  {%- else %}
  double * {{field_name}} = (double *) aligned_malloc(sizeof({{dataType}}) * {{size}}, 64);
+  {%- endif %}
  for (unsigned long long i = 0; i < {{size}}; ++i)
    {{field_name}}[i] = 0.23;
@@ -69,6 +127,10 @@ int main(int argc, char **argv)
  #pragma omp barrier
  {%- elif likwid %}
  likwid_markerRegisterRegion("loop");
+  {%- elif openmp %}
+  #pragma omp parallel
+  {
+  #pragma omp barrier
  {%- endif %}
  for(int warmup = 1; warmup >= 0; --warmup) {
@@ -107,10 +169,10 @@ int main(int argc, char **argv)
  {%- if likwid %}
  likwid_markerStopRegion("loop");
+  {%- endif %}
  {%- if openmp %}
  }
  {%- endif %}
-  {%- endif %}
  {%- if likwid %}
  likwid_markerClose();
@@ -119,13 +181,12 @@ int main(int argc, char **argv)
 """)
-def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
+def generate_benchmark(ast, openmp=False, timing=False, main=True, kernel=True, assume_aligned=False):
    """Return C code of a benchmark program for the given kernel.
    Args:
        ast: the pystencils AST object as returned by create_kernel
-        likwid: if True likwid markers are added to the code
+        openmp: if True, main() opens an OpenMP parallel region around the benchmark loop
-        openmp: relevant only if likwid=True, to generated correct likwid initialization code
        timing: add timing output to the code, prints time per iteration to stdout
    Returns:
@@ -143,7 +204,13 @@ def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
            assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size"
            field = accessed_fields[p.field_name]
            dtype = str(get_base_type(p.symbol.dtype))
-            fields.append((p.field_name, dtype, prod(field.shape)))
+            if not assume_aligned:
+                fields.append((p.field_name, dtype, prod(field.shape)))
+            else:
+                index = np.where(field.strides == np.amax(field.strides))[0][0]
+                size = field.shape[index] * field.strides[index]
+                fields.append((p.field_name, dtype, int(size)))
            call_parameters.append(p.field_name)
    header_list = get_headers(ast)
@@ -151,14 +218,20 @@ def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
    # Strip "#pragma omp parallel" from within kernel, because main function takes care of that
    # when likwid and openmp are enabled
-    if likwid and openmp:
+    #if openmp:
-        if len(ast.body.args) > 0 and isinstance(ast.body.args[0], PragmaBlock):
+    #    if len(ast.body.args) > 0 and isinstance(ast.body.args[0], PragmaBlock):
-            ast.body.args[0].pragma_line = ''
+    #        ast.body.args[0].pragma_line = ''
+    if not main and kernel:
+        return include_template.render(
+                {
+                    'includes': includes,
+                    'likwid': False
+                }) + generate_c(ast, dialect='c')
    args = {
-        'likwid': likwid,
+        'aligned': assume_aligned,
+        'likwid': False,
        'openmp': openmp,
-        'kernel_code': generate_c(ast, dialect='c'),
+        'kernel_code': generate_c(ast, dialect='c') if kernel else '',
        'kernelName': ast.function_name,
        'fields': fields,
        'constants': constants,
@@ -169,7 +242,7 @@ def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
    return benchmark_template.render(**args)
-def run_c_benchmark(ast, inner_iterations, outer_iterations=3):
+def run_c_benchmark(ast, inner_iterations, outer_iterations=3, assume_aligned=False, openmp=False):
    """Runs the given kernel with outer loop in C
    Args:
@@ -182,7 +255,7 @@ def run_c_benchmark(ast, inner_iterations, outer_iterations=3):
    """
    import kerncraft
-    benchmark_code = generate_benchmark(ast, timing=True)
+    benchmark_code = generate_benchmark(ast, timing=True, assume_aligned=assume_aligned, openmp=openmp)
    with open('bench.c', 'w') as f:
        f.write(benchmark_code)
@@ -190,17 +263,28 @@ def run_c_benchmark(ast, inner_iterations, outer_iterations=3):
    extra_flags = ['-I' + get_pystencils_include_path(),
                   '-I' + os.path.join(kerncraft_path, 'headers')]
+    if False:
+        extra_flags.append(os.environ['LIKWID_INC'])
+        extra_flags.append(os.environ['LIKWID_LIB'])
    compiler_config = get_compiler_config()
    compile_cmd = [compiler_config['command']] + compiler_config['flags'].split()
+    compile_asm_cmd = [compiler_config['command']] + compiler_config['flags'].split()
    compile_cmd += [*extra_flags,
                    os.path.join(kerncraft_path, 'headers', 'timing.c'),
                    os.path.join(kerncraft_path, 'headers', 'dummy.c'),
                    'bench.c',
                    '-o', 'bench',
                    ]
-    run_compile_step(compile_cmd)
+    print(" ".join(compile_cmd))
+    compile_asm_cmd += [*extra_flags,
+                    os.path.join(kerncraft_path, 'headers', 'timing.c'),
+                    os.path.join(kerncraft_path, 'headers', 'dummy.c'),
+                    'bench.c', '-S',
+                    ]
+    run_compile_step(compile_cmd)
+    run_compile_step(compile_asm_cmd)
    results = []
    for _ in range(outer_iterations):
        benchmark_time = float(subprocess.check_output(['./bench', str(inner_iterations)]))

--- a/pystencils/kerncraft_coupling/kerncraft_interface.py
+++ b/pystencils/kerncraft_coupling/kerncraft_interface.py
@@ -21,10 +21,22 @@ class PyStencilsKerncraftKernel(KernelCode):
    Implementation of kerncraft's kernel interface for pystencils CPU kernels.
    Analyses a list of equations assuming they will be executed on a CPU
    """
-    LIKWID_BASE = '/usr/local/likwid'
+    LIKWID_BASE = '/mnt/opt/likwid-4.3.4'
+    @staticmethod
+    def get_number_of_kernels(ast: KernelFunction):
+        """Return how many innermost loops the given AST contains.
+
+        The AST is walked with filtered_tree_iteration for LoopOverCoordinate
+        nodes (using SympyAssignment as stop_type); every node whose
+        ``is_innermost_loop`` flag is set counts once.
+        """
+        loop_nodes = filtered_tree_iteration(ast, LoopOverCoordinate, stop_type=SympyAssignment)
+        return sum(1 for loop in loop_nodes if loop.is_innermost_loop)
+    @staticmethod
+    def get_steps_for_loops(ast: KernelFunction):
+        """Return ``step * 8`` for every innermost loop of the AST, in traversal order.
+
+        NOTE(review): the factor 8 looks like a byte size (sizeof(double)?) that
+        turns the loop step into a pointer increment - confirm.
+        """
+        scaled_steps = []
+        for loop in filtered_tree_iteration(ast, LoopOverCoordinate, stop_type=SympyAssignment):
+            if loop.is_innermost_loop:
+                scaled_steps.append(loop.step * 8)
+        return scaled_steps
    def __init__(self, ast: KernelFunction, machine: Optional[MachineModel] = None,
-                 assumed_layout='SoA', debug_print=False, filename=None):
+                 assumed_layout='SoA', debug_print=False, filename=None, loop_idx=False):
        """Create a kerncraft kernel using a pystencils AST
        Args:
@@ -50,11 +62,12 @@ class PyStencilsKerncraftKernel(KernelCode):
        if len(inner_loops) == 0:
            raise ValueError("No loop found in pystencils AST")
        else:
-            if len(inner_loops) > 1:
+            if len(inner_loops) > 1 and loop_idx is False:
-                warnings.warn("pystencils AST contains multiple inner loops. "
+                warnings.warn(("pystencils AST contains multiple inner loops ({}). "
-                              "Only one can be analyzed - choosing first one")
+                              "Only one can be analyzed - choosing first one").format(
-            inner_loop = inner_loops[0]
+                                len(inner_loops)))
+                loop_idx = 0
+            inner_loop = inner_loops[loop_idx]
        self._loop_stack = []
        cur_node = inner_loop
        while cur_node is not None:
@@ -95,7 +108,7 @@ class PyStencilsKerncraftKernel(KernelCode):
        for field in fields_accessed:
            layout = get_layout_tuple(field)
            permuted_shape = list(field.shape[i] for i in layout)
-            self.set_variable(field.name, str(field.dtype), tuple(permuted_shape))
+            self.set_variable(field.name, (str(field.dtype),), tuple(permuted_shape))
        # Scalars may be safely ignored
        # for param in ast.get_parameters():
@@ -128,6 +141,39 @@ class PyStencilsKerncraftKernel(KernelCode):
            print("-----------------------------  FLOPS -------------------------------")
            pprint(self._flops)
+    def get_main_code(self, as_filename=False, kernel_function_name='kernel'):
+        """Generate the benchmark program without the kernel function itself.
+
+        The code comes from generate_benchmark, called with ``openmp=True`` and
+        ``kernel=False``.
+
+        Args:
+            as_filename: if True, the code is written to the intermediate file 'main.c'
+                         (unless that file is already available) and the file name is
+                         returned instead of the code
+            kernel_function_name: not used by this implementation
+
+        Returns:
+            the generated source code, or the name of the file that contains it
+        """
+        main_source = generate_benchmark(self.kernel_ast, openmp=True, kernel=False)
+        if not as_filename:
+            return main_source
+        out_file, is_cached = self._get_intermediate_file('main.c',
+                                                          machine_and_compiler_dependent=False)
+        if not is_cached:
+            out_file.write(main_source)
+        return out_file.name
+    def get_kernel_code(self, openmp=False, as_filename=False, name='kernel'):
+        """Generate the kernel source code (include preamble plus kernel, no main function).
+
+        The code comes from generate_benchmark, called with ``main=False`` and
+        ``kernel=True``.
+
+        Args:
+            openmp: forwarded to generate_benchmark; also selects the name of the
+                    intermediate file
+            as_filename: if True, the code is written to an intermediate file (unless
+                         that file is already available) and the file name is returned
+                         instead of the code
+            name: not used by this implementation
+
+        Returns:
+            the generated source code, or the name of the file that contains it
+        """
+        kernel_source = generate_benchmark(self.kernel_ast, openmp=openmp, main=False, kernel=True)
+        if not as_filename:
+            return kernel_source
+        file_name = 'kernel_{}.c'.format('-omp' if openmp else '')
+        out_file, is_cached = self._get_intermediate_file(file_name,
+                                                          machine_and_compiler_dependent=False)
+        if not is_cached:
+            out_file.write(kernel_source)
+        return out_file.name
    def as_code(self, type_='iaca', openmp=False, as_filename=False):
        """
        Generate and return compilable source code.
@@ -137,6 +183,11 @@ class PyStencilsKerncraftKernel(KernelCode):
            openmp: if true, openmp code will be generated
            as_filename:
        """
+        warnings.warn(
+            ('as_code is deprecated and works only for kerncraft <= 0.8.0. Please upgrade your '
+                + 'kerncraft version and use self.get_kernel_code'),
+            FutureWarning
+        )
-        code = generate_benchmark(self.kernel_ast, likwid=type_ == 'likwid', openmp=openmp)
+        # generate_benchmark no longer has a `likwid` parameter (likwid markers are
+        # always disabled there); passing the keyword would raise a TypeError.
+        code = generate_benchmark(self.kernel_ast, openmp=openmp)
        if as_filename:
            fp, already_available = self._get_intermediate_file('kernel_{}.c'.format(type_),
@@ -149,14 +200,14 @@ class PyStencilsKerncraftKernel(KernelCode):
 class KerncraftParameters(DotDict):
-    def __init__(self, **kwargs):
+    def __init__(self, ptr_inc='auto', **kwargs):
        super(KerncraftParameters, self).__init__(**kwargs)
        self['asm_block'] = 'auto'
        self['asm_increment'] = 0
        self['cores'] = 1
        self['cache_predictor'] = 'SIM'
-        self['verbose'] = 0
+        self['verbose'] = 1
-        self['pointer_increment'] = 'auto'
+        self['pointer_increment'] = ptr_inc
        self['iterations'] = 10
        self['unit'] = 'cy/CL'
        self['ignore_warnings'] = True