Skip to content
Snippets Groups Projects
Commit 5e3a72de authored by Jan Laukemann
Browse files

changes for lbmpy tests

parent 1754ef27
No related branches found
No related tags found
No related merge requests found
from jinja2 import Template from jinja2 import Template
import numpy as np
import os import os
import subprocess import subprocess
from pystencils.include import get_pystencils_include_path from pystencils.include import get_pystencils_include_path
...@@ -8,6 +9,25 @@ from pystencils.sympyextensions import prod ...@@ -8,6 +9,25 @@ from pystencils.sympyextensions import prod
from pystencils.data_types import get_base_type from pystencils.data_types import get_base_type
from pystencils.astnodes import PragmaBlock from pystencils.astnodes import PragmaBlock
# Jinja2 template for the C preamble that is placed in front of generated
# kernel code. It is rendered with two variables:
#   includes -- text inserted verbatim after the fixed standard headers
#   likwid   -- when truthy, an additional `#include <likwid.h>` is emitted
# The preamble also defines RESTRICT as `__restrict__` and FUNC_PREFIX as empty.
# The `{%-` markers strip the whitespace preceding each Jinja tag, so the
# layout of the string below is significant and must not be reformatted.
include_template = Template("""
#include "kerncraft.h"
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <math.h>
#include <stdio.h>
{{ includes }}
{%- if likwid %}
#include <likwid.h>
{%- endif %}
#define RESTRICT __restrict__
#define FUNC_PREFIX
""")
benchmark_template = Template(""" benchmark_template = Template("""
#include "kerncraft.h" #include "kerncraft.h"
#include <stdlib.h> #include <stdlib.h>
...@@ -28,9 +48,42 @@ void dummy(void *); ...@@ -28,9 +48,42 @@ void dummy(void *);
void timing(double* wcTime, double* cpuTime); void timing(double* wcTime, double* cpuTime);
extern int var_false; extern int var_false;
{{kernel_code}} {{kernel_code}}
void *aligned_malloc_with_offset( unsigned long size, unsigned long alignment, unsigned long offset )
{
// With 0 alignment this function makes no sense
// use normal malloc instead
//WALBERLA_ASSERT_GREATER( alignment, 0 );
// Tests if alignment is power of two (assuming alignment>0)
//WALBERLA_ASSERT( !(alignment & (alignment - 1)) );
//WALBERLA_ASSERT_LESS( offset, alignment );
if( offset == 0 )
return aligned_malloc( size, alignment );
void *pa; // pointer to allocated memory
void *ptr; // pointer to usable aligned memory
pa=malloc( (size+2*alignment-1 )+sizeof(void *));
if(!pa)
return 0;
// Find next aligned position, starting at pa+sizeof(void*)-1
ptr=(void*)( ((size_t)pa+sizeof(void *)+alignment-1) & ~(alignment-1));
ptr=(void*) ( (char*)(ptr) + alignment - offset);
// Store pointer to real allocated chunk just before usable chunk
*((void **)ptr-1)=pa;
//WALBERLA_ASSERT_EQUAL( ((size_t)ptr+offset) % alignment, 0 );
return ptr;
}
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
...@@ -41,7 +94,12 @@ int main(int argc, char **argv) ...@@ -41,7 +94,12 @@ int main(int argc, char **argv)
{%- for field_name, dataType, size in fields %} {%- for field_name, dataType, size in fields %}
// Initialization {{field_name}} // Initialization {{field_name}}
{%- if aligned %}
double * {{field_name}} = (double *) aligned_malloc_with_offset(sizeof({{dataType}}) * {{size}}, 32, 8);
{%- else %}
double * {{field_name}} = (double *) aligned_malloc(sizeof({{dataType}}) * {{size}}, 64); double * {{field_name}} = (double *) aligned_malloc(sizeof({{dataType}}) * {{size}}, 64);
{%- endif %}
for (unsigned long long i = 0; i < {{size}}; ++i) for (unsigned long long i = 0; i < {{size}}; ++i)
{{field_name}}[i] = 0.23; {{field_name}}[i] = 0.23;
...@@ -69,6 +127,10 @@ int main(int argc, char **argv) ...@@ -69,6 +127,10 @@ int main(int argc, char **argv)
#pragma omp barrier #pragma omp barrier
{%- elif likwid %} {%- elif likwid %}
likwid_markerRegisterRegion("loop"); likwid_markerRegisterRegion("loop");
{%- elif openmp %}
#pragma omp parallel
{
#pragma omp barrier
{%- endif %} {%- endif %}
for(int warmup = 1; warmup >= 0; --warmup) { for(int warmup = 1; warmup >= 0; --warmup) {
...@@ -107,10 +169,10 @@ int main(int argc, char **argv) ...@@ -107,10 +169,10 @@ int main(int argc, char **argv)
{%- if likwid %} {%- if likwid %}
likwid_markerStopRegion("loop"); likwid_markerStopRegion("loop");
{%- endif %}
{%- if openmp %} {%- if openmp %}
} }
{%- endif %} {%- endif %}
{%- endif %}
{%- if likwid %} {%- if likwid %}
likwid_markerClose(); likwid_markerClose();
...@@ -119,13 +181,12 @@ int main(int argc, char **argv) ...@@ -119,13 +181,12 @@ int main(int argc, char **argv)
""") """)
def generate_benchmark(ast, likwid=False, openmp=False, timing=False): def generate_benchmark(ast, openmp=False, timing=False, main=True, kernel=True, assume_aligned=False):
"""Return C code of a benchmark program for the given kernel. """Return C code of a benchmark program for the given kernel.
Args: Args:
ast: the pystencils AST object as returned by create_kernel ast: the pystencils AST object as returned by create_kernel
likwid: if True likwid markers are added to the code openmp: to generated correct likwid initialization code
openmp: relevant only if likwid=True, to generated correct likwid initialization code
timing: add timing output to the code, prints time per iteration to stdout timing: add timing output to the code, prints time per iteration to stdout
Returns: Returns:
...@@ -143,7 +204,13 @@ def generate_benchmark(ast, likwid=False, openmp=False, timing=False): ...@@ -143,7 +204,13 @@ def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size" assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size"
field = accessed_fields[p.field_name] field = accessed_fields[p.field_name]
dtype = str(get_base_type(p.symbol.dtype)) dtype = str(get_base_type(p.symbol.dtype))
fields.append((p.field_name, dtype, prod(field.shape))) if not assume_aligned:
fields.append((p.field_name, dtype, prod(field.shape)))
else:
index = np.where(field.strides == np.amax(field.strides))[0][0]
size = field.shape[index] * field.strides[index]
fields.append((p.field_name, dtype, int(size)))
call_parameters.append(p.field_name) call_parameters.append(p.field_name)
header_list = get_headers(ast) header_list = get_headers(ast)
...@@ -151,14 +218,20 @@ def generate_benchmark(ast, likwid=False, openmp=False, timing=False): ...@@ -151,14 +218,20 @@ def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
# Strip "#pragma omp parallel" from within kernel, because main function takes care of that # Strip "#pragma omp parallel" from within kernel, because main function takes care of that
# when likwid and openmp are enabled # when likwid and openmp are enabled
if likwid and openmp: #if openmp:
if len(ast.body.args) > 0 and isinstance(ast.body.args[0], PragmaBlock): # if len(ast.body.args) > 0 and isinstance(ast.body.args[0], PragmaBlock):
ast.body.args[0].pragma_line = '' # ast.body.args[0].pragma_line = ''
if not main and kernel:
return include_template.render(
{
'includes': includes,
'likwid': False
}) + generate_c(ast, dialect='c')
args = { args = {
'likwid': likwid, 'aligned': assume_aligned,
'likwid': False,
'openmp': openmp, 'openmp': openmp,
'kernel_code': generate_c(ast, dialect='c'), 'kernel_code': generate_c(ast, dialect='c') if kernel else '',
'kernelName': ast.function_name, 'kernelName': ast.function_name,
'fields': fields, 'fields': fields,
'constants': constants, 'constants': constants,
...@@ -169,7 +242,7 @@ def generate_benchmark(ast, likwid=False, openmp=False, timing=False): ...@@ -169,7 +242,7 @@ def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
return benchmark_template.render(**args) return benchmark_template.render(**args)
def run_c_benchmark(ast, inner_iterations, outer_iterations=3): def run_c_benchmark(ast, inner_iterations, outer_iterations=3, assume_aligned=False, openmp=False):
"""Runs the given kernel with outer loop in C """Runs the given kernel with outer loop in C
Args: Args:
...@@ -182,7 +255,7 @@ def run_c_benchmark(ast, inner_iterations, outer_iterations=3): ...@@ -182,7 +255,7 @@ def run_c_benchmark(ast, inner_iterations, outer_iterations=3):
""" """
import kerncraft import kerncraft
benchmark_code = generate_benchmark(ast, timing=True) benchmark_code = generate_benchmark(ast, timing=True, assume_aligned=assume_aligned, openmp=openmp)
with open('bench.c', 'w') as f: with open('bench.c', 'w') as f:
f.write(benchmark_code) f.write(benchmark_code)
...@@ -190,17 +263,28 @@ def run_c_benchmark(ast, inner_iterations, outer_iterations=3): ...@@ -190,17 +263,28 @@ def run_c_benchmark(ast, inner_iterations, outer_iterations=3):
extra_flags = ['-I' + get_pystencils_include_path(), extra_flags = ['-I' + get_pystencils_include_path(),
'-I' + os.path.join(kerncraft_path, 'headers')] '-I' + os.path.join(kerncraft_path, 'headers')]
if False:
extra_flags.append(os.environ['LIKWID_INC'])
extra_flags.append(os.environ['LIKWID_LIB'])
compiler_config = get_compiler_config() compiler_config = get_compiler_config()
compile_cmd = [compiler_config['command']] + compiler_config['flags'].split() compile_cmd = [compiler_config['command']] + compiler_config['flags'].split()
compile_asm_cmd = [compiler_config['command']] + compiler_config['flags'].split()
compile_cmd += [*extra_flags, compile_cmd += [*extra_flags,
os.path.join(kerncraft_path, 'headers', 'timing.c'), os.path.join(kerncraft_path, 'headers', 'timing.c'),
os.path.join(kerncraft_path, 'headers', 'dummy.c'), os.path.join(kerncraft_path, 'headers', 'dummy.c'),
'bench.c', 'bench.c',
'-o', 'bench', '-o', 'bench',
] ]
run_compile_step(compile_cmd) print(" ".join(compile_cmd))
compile_asm_cmd += [*extra_flags,
os.path.join(kerncraft_path, 'headers', 'timing.c'),
os.path.join(kerncraft_path, 'headers', 'dummy.c'),
'bench.c', '-S',
]
run_compile_step(compile_cmd)
run_compile_step(compile_asm_cmd)
results = [] results = []
for _ in range(outer_iterations): for _ in range(outer_iterations):
benchmark_time = float(subprocess.check_output(['./bench', str(inner_iterations)])) benchmark_time = float(subprocess.check_output(['./bench', str(inner_iterations)]))
......
...@@ -21,10 +21,22 @@ class PyStencilsKerncraftKernel(KernelCode): ...@@ -21,10 +21,22 @@ class PyStencilsKerncraftKernel(KernelCode):
Implementation of kerncraft's kernel interface for pystencils CPU kernels. Implementation of kerncraft's kernel interface for pystencils CPU kernels.
Analyses a list of equations assuming they will be executed on a CPU Analyses a list of equations assuming they will be executed on a CPU
""" """
LIKWID_BASE = '/usr/local/likwid' LIKWID_BASE = '/mnt/opt/likwid-4.3.4'
@staticmethod
def get_number_of_kernels(ast: KernelFunction):
    """Count the innermost loops contained in a pystencils AST.

    Args:
        ast: the kernel AST to inspect

    Returns:
        Number of ``LoopOverCoordinate`` nodes whose ``is_innermost_loop``
        flag is set. The traversal uses ``SympyAssignment`` as stop type.
    """
    loop_nodes = filtered_tree_iteration(ast, LoopOverCoordinate, stop_type=SympyAssignment)
    return sum(1 for loop in loop_nodes if loop.is_innermost_loop)
@staticmethod
def get_steps_for_loops(ast: KernelFunction):
    """Return the scaled step of every innermost loop in a pystencils AST.

    Args:
        ast: the kernel AST to inspect

    Returns:
        A list with one entry per innermost ``LoopOverCoordinate`` node, in
        traversal order, each being ``loop.step * 8``.
    """
    loop_nodes = filtered_tree_iteration(ast, LoopOverCoordinate, stop_type=SympyAssignment)
    innermost = [loop for loop in loop_nodes if loop.is_innermost_loop]
    # NOTE(review): the factor 8 is presumably the size of a double in bytes,
    # turning an element step into a byte increment - confirm.
    return [loop.step * 8 for loop in innermost]
def __init__(self, ast: KernelFunction, machine: Optional[MachineModel] = None, def __init__(self, ast: KernelFunction, machine: Optional[MachineModel] = None,
assumed_layout='SoA', debug_print=False, filename=None): assumed_layout='SoA', debug_print=False, filename=None, loop_idx=False):
"""Create a kerncraft kernel using a pystencils AST """Create a kerncraft kernel using a pystencils AST
Args: Args:
...@@ -50,11 +62,12 @@ class PyStencilsKerncraftKernel(KernelCode): ...@@ -50,11 +62,12 @@ class PyStencilsKerncraftKernel(KernelCode):
if len(inner_loops) == 0: if len(inner_loops) == 0:
raise ValueError("No loop found in pystencils AST") raise ValueError("No loop found in pystencils AST")
else: else:
if len(inner_loops) > 1: if len(inner_loops) > 1 and loop_idx is False:
warnings.warn("pystencils AST contains multiple inner loops. " warnings.warn(("pystencils AST contains multiple inner loops ({}). "
"Only one can be analyzed - choosing first one") "Only one can be analyzed - choosing first one").format(
inner_loop = inner_loops[0] len(inner_loops)))
loop_idx = 0
inner_loop = inner_loops[loop_idx]
self._loop_stack = [] self._loop_stack = []
cur_node = inner_loop cur_node = inner_loop
while cur_node is not None: while cur_node is not None:
...@@ -95,7 +108,7 @@ class PyStencilsKerncraftKernel(KernelCode): ...@@ -95,7 +108,7 @@ class PyStencilsKerncraftKernel(KernelCode):
for field in fields_accessed: for field in fields_accessed:
layout = get_layout_tuple(field) layout = get_layout_tuple(field)
permuted_shape = list(field.shape[i] for i in layout) permuted_shape = list(field.shape[i] for i in layout)
self.set_variable(field.name, str(field.dtype), tuple(permuted_shape)) self.set_variable(field.name, (str(field.dtype),), tuple(permuted_shape))
# Scalars may be safely ignored # Scalars may be safely ignored
# for param in ast.get_parameters(): # for param in ast.get_parameters():
...@@ -128,6 +141,39 @@ class PyStencilsKerncraftKernel(KernelCode): ...@@ -128,6 +141,39 @@ class PyStencilsKerncraftKernel(KernelCode):
print("----------------------------- FLOPS -------------------------------") print("----------------------------- FLOPS -------------------------------")
pprint(self._flops) pprint(self._flops)
def get_main_code(self, as_filename=False, kernel_function_name='kernel'):
    """Generate the benchmark driver code for this kernel's AST.

    The code is produced by ``generate_benchmark`` with ``openmp=True`` and
    ``kernel=False``.

    Args:
        as_filename: if True, store the code in the intermediate file
            ``main.c`` (unless that file is already available) and return
            the file's name; otherwise return the code itself.
        kernel_function_name: accepted for interface compatibility; not
            used by this implementation.

    Returns:
        The generated C code as a string, or the name of the intermediate
        file when ``as_filename`` is True.
    """
    code = generate_benchmark(self.kernel_ast, openmp=True, kernel=False)
    if not as_filename:
        return code

    fp, already_available = self._get_intermediate_file('main.c',
                                                        machine_and_compiler_dependent=False)
    try:
        if not already_available:
            fp.write(code)
    finally:
        # Closing flushes buffered writes, so whoever opens the returned path
        # sees the complete file, and the handle is not leaked.
        # NOTE(review): assumes `_get_intermediate_file` returns an open
        # file object - confirm against the kerncraft base class.
        fp.close()
    return fp.name
def get_kernel_code(self, openmp=False, as_filename=False, name='kernel'):
    """Generate compilable source code for the kernel only (no ``main``).

    The code is produced by ``generate_benchmark`` with ``main=False`` and
    ``kernel=True``.

    Args:
        openmp: forwarded to ``generate_benchmark``; also selects the name of
            the intermediate file
        as_filename: if True, store the code in an intermediate file (unless
            that file is already available) and return the file's name;
            otherwise return the code itself.
        name: accepted for interface compatibility; not used by this
            implementation.

    Returns:
        The generated C code as a string, or the name of the intermediate
        file when ``as_filename`` is True.
    """
    code = generate_benchmark(self.kernel_ast, openmp=openmp, main=False, kernel=True)
    if not as_filename:
        return code

    file_name = 'kernel_{}.c'.format('-omp' if openmp else '')
    fp, already_available = self._get_intermediate_file(file_name,
                                                        machine_and_compiler_dependent=False)
    try:
        if not already_available:
            fp.write(code)
    finally:
        # Closing flushes buffered writes, so whoever opens the returned path
        # sees the complete file, and the handle is not leaked.
        # NOTE(review): assumes `_get_intermediate_file` returns an open
        # file object - confirm against the kerncraft base class.
        fp.close()
    return fp.name
def as_code(self, type_='iaca', openmp=False, as_filename=False): def as_code(self, type_='iaca', openmp=False, as_filename=False):
""" """
Generate and return compilable source code. Generate and return compilable source code.
...@@ -137,6 +183,11 @@ class PyStencilsKerncraftKernel(KernelCode): ...@@ -137,6 +183,11 @@ class PyStencilsKerncraftKernel(KernelCode):
openmp: if true, openmp code will be generated openmp: if true, openmp code will be generated
as_filename: as_filename:
""" """
warnings.warn(
('as_code is deprecated and works only for kerncraft <= 0.8.0. Please upgrade your'
+ 'kerncraft version and use self.get_kernel_code'),
FutureWarning
)
code = generate_benchmark(self.kernel_ast, likwid=type_ == 'likwid', openmp=openmp) code = generate_benchmark(self.kernel_ast, likwid=type_ == 'likwid', openmp=openmp)
if as_filename: if as_filename:
fp, already_available = self._get_intermediate_file('kernel_{}.c'.format(type_), fp, already_available = self._get_intermediate_file('kernel_{}.c'.format(type_),
...@@ -149,14 +200,14 @@ class PyStencilsKerncraftKernel(KernelCode): ...@@ -149,14 +200,14 @@ class PyStencilsKerncraftKernel(KernelCode):
class KerncraftParameters(DotDict): class KerncraftParameters(DotDict):
def __init__(self, **kwargs): def __init__(self, ptr_inc='auto', **kwargs):
super(KerncraftParameters, self).__init__(**kwargs) super(KerncraftParameters, self).__init__(**kwargs)
self['asm_block'] = 'auto' self['asm_block'] = 'auto'
self['asm_increment'] = 0 self['asm_increment'] = 0
self['cores'] = 1 self['cores'] = 1
self['cache_predictor'] = 'SIM' self['cache_predictor'] = 'SIM'
self['verbose'] = 0 self['verbose'] = 1
self['pointer_increment'] = 'auto' self['pointer_increment'] = ptr_inc
self['iterations'] = 10 self['iterations'] = 10
self['unit'] = 'cy/CL' self['unit'] = 'cy/CL'
self['ignore_warnings'] = True self['ignore_warnings'] = True
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment