diff --git a/pystencils/kerncraft_coupling/generate_benchmark.py b/pystencils/kerncraft_coupling/generate_benchmark.py
index f63be049a1fcfb2e6a5867bc8f7dc94123759165..ec89c59d268e09079043286f2fcfa84b07ce546f 100644
--- a/pystencils/kerncraft_coupling/generate_benchmark.py
+++ b/pystencils/kerncraft_coupling/generate_benchmark.py
@@ -1,4 +1,5 @@
 from jinja2 import Template
+import numpy as np
 import os
 import subprocess
 from pystencils.include import get_pystencils_include_path
@@ -8,6 +9,25 @@ from pystencils.sympyextensions import prod
 from pystencils.data_types import get_base_type
 from pystencils.astnodes import PragmaBlock
 
+include_template = Template("""
+#include "kerncraft.h"
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <stdio.h>
+
+{{ includes }}
+{%- if likwid %}
+#include <likwid.h>
+{%- endif %}
+
+#define RESTRICT __restrict__
+#define FUNC_PREFIX
+
+
+""")
+
 benchmark_template = Template("""
 #include "kerncraft.h"
 #include <stdlib.h>
@@ -28,9 +48,42 @@ void dummy(void *);
 void timing(double* wcTime, double* cpuTime);
 extern int var_false;
 
-
 {{kernel_code}}
 
 
+void *aligned_malloc_with_offset( unsigned long size, unsigned long alignment, unsigned long offset )
+{
+    // With 0 alignment this function makes no sense
+    // use normal malloc instead
+    //WALBERLA_ASSERT_GREATER( alignment, 0 );
+    // Tests if alignment is power of two (assuming alignment>0)
+    //WALBERLA_ASSERT( !(alignment & (alignment - 1)) );
+
+    //WALBERLA_ASSERT_LESS( offset, alignment );
+
+    if( offset == 0 )
+        return aligned_malloc( size, alignment );
+
+
+    void *pa;  // pointer to allocated memory
+    void *ptr; // pointer to usable aligned memory
+
+    pa=malloc( (size+2*alignment-1 )+sizeof(void *));
+    if(!pa)
+        return 0;
+
+    // Find next aligned position, starting at pa+sizeof(void*)-1
+    ptr=(void*)( ((size_t)pa+sizeof(void *)+alignment-1) & ~(alignment-1));
+    ptr=(void*) ( (char*)(ptr) + alignment - offset);
+
+    // Store pointer to real allocated chunk just before usable chunk
+    *((void **)ptr-1)=pa;
+
+    //WALBERLA_ASSERT_EQUAL( ((size_t)ptr+offset) % alignment, 0 );
+
+    return ptr;
+}
+
+
 int main(int argc, char **argv)
 {
@@ -41,7 +94,12 @@ int main(int argc, char **argv)
   {%- for field_name, dataType, size in fields %}
 
   // Initialization {{field_name}}
+  {%- if aligned %}
+  double * {{field_name}} = (double *) aligned_malloc_with_offset(sizeof({{dataType}}) * {{size}}, 32, 8);
+  {%- else %}
   double * {{field_name}} = (double *) aligned_malloc(sizeof({{dataType}}) * {{size}}, 64);
+  {%- endif %}
+
   for (unsigned long long i = 0; i < {{size}}; ++i)
     {{field_name}}[i] = 0.23;
 
@@ -69,6 +127,10 @@ int main(int argc, char **argv)
   #pragma omp barrier
   {%- elif likwid %}
   likwid_markerRegisterRegion("loop");
+  {%- elif openmp %}
+  #pragma omp parallel
+  {
+  #pragma omp barrier
   {%- endif %}
 
   for(int warmup = 1; warmup >= 0; --warmup) {
@@ -107,10 +169,10 @@ int main(int argc, char **argv)
 
   {%- if likwid %}
   likwid_markerStopRegion("loop");
+  {%- endif %}
   {%- if openmp %}
   }
   {%- endif %}
-  {%- endif %}
 
   {%- if likwid %}
   likwid_markerClose();
@@ -119,13 +181,12 @@ int main(int argc, char **argv)
 """)
 
 
-def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
+def generate_benchmark(ast, openmp=False, timing=False, main=True, kernel=True, assume_aligned=False):
     """Return C code of a benchmark program for the given kernel.
 
     Args:
         ast: the pystencils AST object as returned by create_kernel
-        likwid: if True likwid markers are added to the code
-        openmp: relevant only if likwid=True, to generated correct likwid initialization code
+        openmp: if True, main() runs the benchmark loop inside an OpenMP parallel region
         timing: add timing output to the code, prints time per iteration to stdout
 
     Returns:
@@ -143,7 +204,13 @@ def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
             assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size"
             field = accessed_fields[p.field_name]
             dtype = str(get_base_type(p.symbol.dtype))
-            fields.append((p.field_name, dtype, prod(field.shape)))
+            if not assume_aligned:
+                fields.append((p.field_name, dtype, prod(field.shape)))
+            else:
+                index = np.where(field.strides == np.amax(field.strides))[0][0]
+                size = field.shape[index] * field.strides[index]
+                fields.append((p.field_name, dtype, int(size)))
+
             call_parameters.append(p.field_name)
 
     header_list = get_headers(ast)
@@ -151,14 +218,20 @@ def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
 
     # Strip "#pragma omp parallel" from within kernel, because main function takes care of that
     # when likwid and openmp are enabled
-    if likwid and openmp:
-        if len(ast.body.args) > 0 and isinstance(ast.body.args[0], PragmaBlock):
-            ast.body.args[0].pragma_line = ''
-
+    #if openmp:
+    #    if len(ast.body.args) > 0 and isinstance(ast.body.args[0], PragmaBlock):
+    #        ast.body.args[0].pragma_line = ''
+    if not main and kernel:
+        return include_template.render(
+            {
+                'includes': includes,
+                'likwid': False
+            }) + generate_c(ast, dialect='c')
     args = {
-        'likwid': likwid,
+        'aligned': assume_aligned,
+        'likwid': False,
         'openmp': openmp,
-        'kernel_code': generate_c(ast, dialect='c'),
+        'kernel_code': generate_c(ast, dialect='c') if kernel else '',
         'kernelName': ast.function_name,
         'fields': fields,
         'constants': constants,
@@ -169,7 +242,7 @@ def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
     return benchmark_template.render(**args)
 
 
-def run_c_benchmark(ast, inner_iterations, outer_iterations=3):
+def run_c_benchmark(ast, inner_iterations, outer_iterations=3, assume_aligned=False, openmp=False):
     """Runs the given kernel with outer loop in C
 
     Args:
@@ -182,7 +255,7 @@ def run_c_benchmark(ast, inner_iterations, outer_iterations=3):
     """
     import kerncraft
 
-    benchmark_code = generate_benchmark(ast, timing=True)
+    benchmark_code = generate_benchmark(ast, timing=True, assume_aligned=assume_aligned, openmp=openmp)
     with open('bench.c', 'w') as f:
         f.write(benchmark_code)
 
@@ -190,17 +263,28 @@ def run_c_benchmark(ast, inner_iterations, outer_iterations=3):
 
     extra_flags = ['-I' + get_pystencils_include_path(),
                    '-I' + os.path.join(kerncraft_path, 'headers')]
+    if False:
+        extra_flags.append(os.environ['LIKWID_INC'])
+        extra_flags.append(os.environ['LIKWID_LIB'])
 
     compiler_config = get_compiler_config()
     compile_cmd = [compiler_config['command']] + compiler_config['flags'].split()
+    compile_asm_cmd = [compiler_config['command']] + compiler_config['flags'].split()
     compile_cmd += [*extra_flags,
                     os.path.join(kerncraft_path, 'headers', 'timing.c'),
                     os.path.join(kerncraft_path, 'headers', 'dummy.c'),
                     'bench.c',
                     '-o', 'bench',
                     ]
-    run_compile_step(compile_cmd)
+    print(" ".join(compile_cmd))
+    compile_asm_cmd += [*extra_flags,
+                        os.path.join(kerncraft_path, 'headers', 'timing.c'),
+                        os.path.join(kerncraft_path, 'headers', 'dummy.c'),
+                        'bench.c', '-S',
+                        ]
 
+    run_compile_step(compile_cmd)
+    run_compile_step(compile_asm_cmd)
     results = []
     for _ in range(outer_iterations):
         benchmark_time = float(subprocess.check_output(['./bench', str(inner_iterations)]))
diff --git a/pystencils/kerncraft_coupling/kerncraft_interface.py b/pystencils/kerncraft_coupling/kerncraft_interface.py
index b58ccb96a28af75994837183b400807fbde9acb1..9184f266c0291e3678b9680029b906e56cd9af97 100644
--- a/pystencils/kerncraft_coupling/kerncraft_interface.py
+++ b/pystencils/kerncraft_coupling/kerncraft_interface.py
@@ -21,10 +21,22 @@ class PyStencilsKerncraftKernel(KernelCode):
     Implementation of kerncraft's kernel interface for pystencils CPU kernels.
     Analyses a list of equations assuming they will be executed on a CPU
     """
-    LIKWID_BASE = '/usr/local/likwid'
+    LIKWID_BASE = '/mnt/opt/likwid-4.3.4'
+
+    @staticmethod
+    def get_number_of_kernels(ast: KernelFunction):
+        return len([l for l in filtered_tree_iteration(ast, LoopOverCoordinate,
+                                                        stop_type=SympyAssignment) if l.is_innermost_loop])
+
+    @staticmethod
+    def get_steps_for_loops(ast: KernelFunction):
+        inner_loops = [l for l in filtered_tree_iteration(ast, LoopOverCoordinate, stop_type=SympyAssignment)
+                       if l.is_innermost_loop]
+        steps = [l.step * 8 for l in inner_loops]
+        return steps
 
     def __init__(self, ast: KernelFunction, machine: Optional[MachineModel] = None,
-                 assumed_layout='SoA', debug_print=False, filename=None):
+                 assumed_layout='SoA', debug_print=False, filename=None, loop_idx=False):
         """Create a kerncraft kernel using a pystencils AST
 
         Args:
@@ -50,11 +62,12 @@ class PyStencilsKerncraftKernel(KernelCode):
         if len(inner_loops) == 0:
             raise ValueError("No loop found in pystencils AST")
         else:
-            if len(inner_loops) > 1:
-                warnings.warn("pystencils AST contains multiple inner loops. "
-                              "Only one can be analyzed - choosing first one")
-            inner_loop = inner_loops[0]
-
+            if len(inner_loops) > 1 and loop_idx is False:
+                warnings.warn(("pystencils AST contains multiple inner loops ({}). "
+                               "Only one can be analyzed - choosing first one").format(
+                    len(inner_loops)))
+                loop_idx = 0
+            inner_loop = inner_loops[loop_idx]
         self._loop_stack = []
         cur_node = inner_loop
         while cur_node is not None:
@@ -95,7 +108,7 @@ class PyStencilsKerncraftKernel(KernelCode):
         for field in fields_accessed:
             layout = get_layout_tuple(field)
             permuted_shape = list(field.shape[i] for i in layout)
-            self.set_variable(field.name, str(field.dtype), tuple(permuted_shape))
+            self.set_variable(field.name, (str(field.dtype),), tuple(permuted_shape))
 
         # Scalars may be safely ignored
         # for param in ast.get_parameters():
@@ -128,6 +141,39 @@ class PyStencilsKerncraftKernel(KernelCode):
             print("-----------------------------  FLOPS -------------------------------")
             pprint(self._flops)
 
+    def get_main_code(self, as_filename=False, kernel_function_name='kernel'):
+        """
+        Generate and return compilable source code from AST.
+        """
+        code = generate_benchmark(self.kernel_ast, openmp=True, kernel=False)
+        if as_filename:
+            fp, already_available = self._get_intermediate_file('main.c',
+                                                                machine_and_compiler_dependent=False)
+            if not already_available:
+                fp.write(code)
+            return fp.name
+        else:
+            return code
+
+
+    def get_kernel_code(self, openmp=False, as_filename=False, name='kernel'):
+        """
+        Generate and return compilable source code.
+
+        Args:
+            openmp: if true, openmp code will be generated
+            as_filename: if true, return the name of the intermediate file holding the code instead of the code
+        """
+        code = generate_benchmark(self.kernel_ast, openmp=openmp, main=False, kernel=True)
+        if as_filename:
+            fp, already_available = self._get_intermediate_file('kernel_{}.c'.format('-omp' if openmp else ''),
+                                                                machine_and_compiler_dependent=False)
+            if not already_available:
+                fp.write(code)
+            return fp.name
+        else:
+            return code
+
     def as_code(self, type_='iaca', openmp=False, as_filename=False):
         """
         Generate and return compilable source code.
@@ -137,6 +183,11 @@ class PyStencilsKerncraftKernel(KernelCode):
             openmp: if true, openmp code will be generated
             as_filename:
         """
-        code = generate_benchmark(self.kernel_ast, likwid=type_ == 'likwid', openmp=openmp)
+        warnings.warn(
+            ('as_code is deprecated and works only for kerncraft <= 0.8.0. Please upgrade your ' +
+             'kerncraft version and use self.get_kernel_code'),
+            FutureWarning
+        )
+        code = generate_benchmark(self.kernel_ast, openmp=openmp)
         if as_filename:
             fp, already_available = self._get_intermediate_file('kernel_{}.c'.format(type_),
@@ -149,14 +200,14 @@ class PyStencilsKerncraftKernel(KernelCode):
 
 
 class KerncraftParameters(DotDict):
-    def __init__(self, **kwargs):
+    def __init__(self, ptr_inc='auto', **kwargs):
         super(KerncraftParameters, self).__init__(**kwargs)
         self['asm_block'] = 'auto'
         self['asm_increment'] = 0
         self['cores'] = 1
         self['cache_predictor'] = 'SIM'
-        self['verbose'] = 0
-        self['pointer_increment'] = 'auto'
+        self['verbose'] = 1
+        self['pointer_increment'] = ptr_inc
         self['iterations'] = 10
         self['unit'] = 'cy/CL'
         self['ignore_warnings'] = True