Skip to content
Snippets Groups Projects
Commit 74bb2c23 authored by Markus Holzer's avatar Markus Holzer
Browse files

Merge branch 'kc_update' into 'master'

improved kc coupling

See merge request !184
parents f3e81539 e91f23f7
Branches
Tags
No related merge requests found
...@@ -2,21 +2,23 @@ import warnings ...@@ -2,21 +2,23 @@ import warnings
import fcntl import fcntl
from collections import defaultdict from collections import defaultdict
from tempfile import TemporaryDirectory from tempfile import TemporaryDirectory
from typing import Optional import textwrap
from jinja2 import Environment, PackageLoader, StrictUndefined
from jinja2 import Environment, PackageLoader, StrictUndefined, Template
import sympy as sp import sympy as sp
from kerncraft.kerncraft import KernelCode from kerncraft.kerncraft import KernelCode
from kerncraft.machinemodel import MachineModel from kerncraft.machinemodel import MachineModel
from pystencils.astnodes import (KernelFunction, LoopOverCoordinate, ResolvedFieldAccess, SympyAssignment) from pystencils.astnodes import \
KernelFunction, LoopOverCoordinate, ResolvedFieldAccess, SympyAssignment
from pystencils.backends.cbackend import generate_c, get_headers
from pystencils.field import get_layout_from_strides from pystencils.field import get_layout_from_strides
from pystencils.sympyextensions import count_operations_in_ast from pystencils.sympyextensions import count_operations_in_ast
from pystencils.transformations import filtered_tree_iteration from pystencils.transformations import filtered_tree_iteration
from pystencils.utils import DotDict from pystencils.utils import DotDict
from pystencils.backends.cbackend import generate_c, get_headers
from pystencils.cpu.kernelcreation import add_openmp from pystencils.cpu.kernelcreation import add_openmp
from pystencils.data_types import get_base_type
from pystencils.sympyextensions import prod
class PyStencilsKerncraftKernel(KernelCode): class PyStencilsKerncraftKernel(KernelCode):
...@@ -26,7 +28,7 @@ class PyStencilsKerncraftKernel(KernelCode): ...@@ -26,7 +28,7 @@ class PyStencilsKerncraftKernel(KernelCode):
""" """
LIKWID_BASE = '/usr/local/likwid' LIKWID_BASE = '/usr/local/likwid'
def __init__(self, ast: KernelFunction, machine: Optional[MachineModel] = None, def __init__(self, ast: KernelFunction, machine: MachineModel,
assumed_layout='SoA', debug_print=False, filename=None): assumed_layout='SoA', debug_print=False, filename=None):
"""Create a kerncraft kernel using a pystencils AST """Create a kerncraft kernel using a pystencils AST
...@@ -44,6 +46,7 @@ class PyStencilsKerncraftKernel(KernelCode): ...@@ -44,6 +46,7 @@ class PyStencilsKerncraftKernel(KernelCode):
# Initialize state # Initialize state
self.asm_block = None self.asm_block = None
self._filename = filename self._filename = filename
self._keep_intermediates = False
self.kernel_ast = ast self.kernel_ast = ast
self.temporary_dir = TemporaryDirectory() self.temporary_dir = TemporaryDirectory()
...@@ -96,14 +99,14 @@ class PyStencilsKerncraftKernel(KernelCode): ...@@ -96,14 +99,14 @@ class PyStencilsKerncraftKernel(KernelCode):
target_dict[fa.field.name].append(permuted_coord) target_dict[fa.field.name].append(permuted_coord)
# Variables (arrays) # Variables (arrays)
fields_accessed = ast.fields_accessed fields_accessed = self.kernel_ast.fields_accessed
for field in fields_accessed: for field in fields_accessed:
layout = get_layout_tuple(field) layout = get_layout_tuple(field)
permuted_shape = list(field.shape[i] for i in layout) permuted_shape = list(field.shape[i] for i in layout)
self.set_variable(field.name, tuple([str(field.dtype)]), tuple(permuted_shape)) self.set_variable(field.name, (str(field.dtype),), tuple(permuted_shape))
# Scalars may be safely ignored # Scalars may be safely ignored
# for param in ast.get_parameters(): # for param in self.kernel_ast.get_parameters():
# if not param.is_field_parameter: # if not param.is_field_parameter:
# # self.set_variable(param.symbol.name, str(param.symbol.dtype), None) # # self.set_variable(param.symbol.name, str(param.symbol.dtype), None)
# self.sources[param.symbol.name] = [None] # self.sources[param.symbol.name] = [None]
...@@ -138,7 +141,10 @@ class PyStencilsKerncraftKernel(KernelCode): ...@@ -138,7 +141,10 @@ class PyStencilsKerncraftKernel(KernelCode):
file_path = self.get_intermediate_location(file_name, machine_and_compiler_dependent=False) file_path = self.get_intermediate_location(file_name, machine_and_compiler_dependent=False)
lock_mode, lock_fp = self.lock_intermediate(file_path) lock_mode, lock_fp = self.lock_intermediate(file_path)
if lock_mode == fcntl.LOCK_EX: if lock_mode == fcntl.LOCK_SH:
# use cache
pass
else: # lock_mode == fcntl.LOCK_EX:
function_signature = generate_c(self.kernel_ast, dialect='c', signature_only=True) function_signature = generate_c(self.kernel_ast, dialect='c', signature_only=True)
jinja_context = { jinja_context = {
...@@ -150,13 +156,12 @@ class PyStencilsKerncraftKernel(KernelCode): ...@@ -150,13 +156,12 @@ class PyStencilsKerncraftKernel(KernelCode):
with open(file_path, 'w') as f: with open(file_path, 'w') as f:
f.write(file_header) f.write(file_header)
fcntl.flock(lock_fp, fcntl.LOCK_SH) # degrade to shared lock self.release_exclusive_lock(lock_fp) # degrade to shared lock
return file_path, lock_fp return file_path, lock_fp
def get_kernel_code(self, openmp=False, name='pystencils_kernl'): def get_kernel_code(self, openmp=False, name='pystencils_kernl'):
""" """
Generate and return compilable source code. Generate and return compilable source code from AST.
Args: Args:
openmp: if true, openmp code will be generated openmp: if true, openmp code will be generated
...@@ -169,7 +174,11 @@ class PyStencilsKerncraftKernel(KernelCode): ...@@ -169,7 +174,11 @@ class PyStencilsKerncraftKernel(KernelCode):
file_path = self.get_intermediate_location(filename, machine_and_compiler_dependent=False) file_path = self.get_intermediate_location(filename, machine_and_compiler_dependent=False)
lock_mode, lock_fp = self.lock_intermediate(file_path) lock_mode, lock_fp = self.lock_intermediate(file_path)
if lock_mode == fcntl.LOCK_EX: if lock_mode == fcntl.LOCK_SH:
# use cache
with open(file_path) as f:
code = f.read()
else: # lock_mode == fcntl.LOCK_EX:
header_list = get_headers(self.kernel_ast) header_list = get_headers(self.kernel_ast)
includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list]) includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])
...@@ -184,11 +193,136 @@ class PyStencilsKerncraftKernel(KernelCode): ...@@ -184,11 +193,136 @@ class PyStencilsKerncraftKernel(KernelCode):
} }
env = Environment(loader=PackageLoader('pystencils.kerncraft_coupling'), undefined=StrictUndefined) env = Environment(loader=PackageLoader('pystencils.kerncraft_coupling'), undefined=StrictUndefined)
file_header = env.get_template('kernel.c').render(**jinja_context) code = env.get_template('kernel.c').render(**jinja_context)
with open(file_path, 'w') as f: with open(file_path, 'w') as f:
f.write(file_header) f.write(code)
self.release_exclusive_lock(lock_fp) # degrade to shared lock
return file_path, lock_fp
CODE_TEMPLATE = Template(textwrap.dedent("""
#include <likwid.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <math.h>
#include "kerncraft.h"
#include "kernel.h"
#define RESTRICT __restrict__
#define FUNC_PREFIX
void dummy(void *);
extern int var_false;
int main(int argc, char **argv) {
{%- for constantName, dataType in constants %}
// Constant {{constantName}}
{{dataType}} {{constantName}};
{{constantName}} = 0.23;
{%- endfor %}
// Declaring arrays
{%- for field_name, dataType, size in fields %}
// Initialization {{field_name}}
double * {{field_name}} = (double *) aligned_malloc(sizeof({{dataType}}) * {{size}}, 64);
// TODO initialize in parallel context in same order as they are touched
for (unsigned long long i = 0; i < {{size}}; ++i)
{{field_name}}[i] = 0.23;
{%- endfor %}
likwid_markerInit();
#pragma omp parallel
{
likwid_markerRegisterRegion("loop");
#pragma omp barrier
// Initializing arrays in same order as touched in kernel loop nest
//INIT_ARRAYS;
// Dummy call
{%- for field_name, dataType, size in fields %}
if(var_false) dummy({{field_name}});
{%- endfor %}
{%- for constantName, dataType in constants %}
if(var_false) dummy(&{{constantName}});
{%- endfor %}
for(int warmup = 1; warmup >= 0; --warmup) {
int repeat = 2;
if(warmup == 0) {
repeat = atoi(argv[1]);
likwid_markerStartRegion("loop");
}
for(; repeat > 0; --repeat) {
{{kernelName}}({{call_argument_list}});
{%- for field_name, dataType, size in fields %}
if(var_false) dummy({{field_name}});
{%- endfor %}
{%- for constantName, dataType in constants %}
if(var_false) dummy(&{{constantName}});
{%- endfor %}
}
}
likwid_markerStopRegion("loop");
}
likwid_markerClose();
return 0;
}
"""))
fcntl.flock(lock_fp, fcntl.LOCK_SH) # degrade to shared lock def get_main_code(self, kernel_function_name='kernel'):
"""
Generate and return compilable source code from AST.
:return: tuple of filename and shared lock file pointer
"""
# TODO produce nicer code, including help text and other "comfort features".
assert self.kernel_ast is not None, "AST does not exist, this could be due to running " \
"based on a kernel description rather than code."
file_path = self.get_intermediate_location('main.c', machine_and_compiler_dependent=False)
lock_mode, lock_fp = self.lock_intermediate(file_path)
if lock_mode == fcntl.LOCK_SH:
# use cache
with open(file_path) as f:
code = f.read()
else: # lock_mode == fcntl.LOCK_EX
# needs update
accessed_fields = {f.name: f for f in self.kernel_ast.fields_accessed}
constants = []
fields = []
call_parameters = []
for p in self.kernel_ast.get_parameters():
if not p.is_field_parameter:
constants.append((p.symbol.name, str(p.symbol.dtype)))
call_parameters.append(p.symbol.name)
else:
assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size"
field = accessed_fields[p.field_name]
dtype = str(get_base_type(p.symbol.dtype))
fields.append((p.field_name, dtype, prod(field.shape)))
call_parameters.append(p.field_name)
header_list = get_headers(self.kernel_ast)
includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])
# Generate code
code = self.CODE_TEMPLATE.render(
kernelName=self.kernel_ast.function_name,
fields=fields,
constants=constants,
call_agument_list=','.join(call_parameters),
includes=includes)
# Store to file
with open(file_path, 'w') as f:
f.write(code)
self.release_exclusive_lock(lock_fp) # degrade to shared lock
return file_path, lock_fp return file_path, lock_fp
......
kerncraft version: 0.8.3.dev0 kerncraft version: 0.8.6.dev0
model name: Intel(R) Xeon(R) CPU E5-2680 0 @ 2.70GHz model name: Intel(R) Xeon(R) CPU E5-2680 0 @ 2.70GHz
model type: Intel Xeon SandyBridge EN/EP processor model type: Intel Xeon SandyBridge EN/EP processor
clock: 2.7 GHz clock: 2.7 GHz
...@@ -8,6 +8,7 @@ cores per socket: 8 ...@@ -8,6 +8,7 @@ cores per socket: 8
threads per core: 2 threads per core: 2
NUMA domains per socket: 1 NUMA domains per socket: 1
cores per NUMA domain: 8 cores per NUMA domain: 8
transparent hugepage: always
in-core model: !!omap in-core model: !!omap
- IACA: SNB - IACA: SNB
...@@ -20,17 +21,22 @@ FLOPs per cycle: ...@@ -20,17 +21,22 @@ FLOPs per cycle:
DP: {total: 8, ADD: 4, MUL: 4} DP: {total: 8, ADD: 4, MUL: 4}
compiler: !!omap compiler: !!omap
- icc: -O3 -xAVX -fno-alias -qopenmp - icc: -O3 -xAVX -fno-alias -qopenmp -ffreestanding -nolib-inline
- clang: -O3 -march=corei7-avx -mtune=corei7-avx -D_POSIX_C_SOURCE=200809L -fopenmp - clang: -O3 -march=corei7-avx -mtune=corei7-avx -D_POSIX_C_SOURCE=200809L -fopenmp -ffreestanding
- gcc: -O3 -march=corei7-avx -D_POSIX_C_SOURCE=200809L -fopenmp -lm - gcc: -O3 -march=corei7-avx -D_POSIX_C_SOURCE=200809L -fopenmp -lm -ffreestanding
overlapping model:
ports:
IACA: ['0', 0DV, '1', '2', '3', '4', '5']
OSACA: ['0', 0DV, '1', '2', '3', '4', '5']
LLVM-MCA: [SBDivider, SBFPDivider, SBPort0, SBPort1, SBPort23, SBPort4, SBPort5]
performance counter metric: Max(UOPS_DISPATCHED_PORT_PORT_0:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_1:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_4:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_5:PMC[0-3])
non-overlapping model: non-overlapping model:
ports: [2D, 3D] ports:
IACA: [2D, 3D]
OSACA: [2D, 3D]
LLVM-MCA: [SBPort23]
performance counter metric: T_nOL + T_L1L2 + T_L2L3 + T_L3MEM performance counter metric: T_nOL + T_L1L2 + T_L2L3 + T_L3MEM
overlapping model:
ports: ['0', 0DV, '1', '2', '3', '4', '5']
performance counter metric: Max(UOPS_DISPATCHED_PORT_PORT_0:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_1:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_4:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_5:PMC[0-3])
cacheline size: 64 B cacheline size: 64 B
memory hierarchy: memory hierarchy:
......
...@@ -42,9 +42,7 @@ def test_compilation(): ...@@ -42,9 +42,7 @@ def test_compilation():
@pytest.mark.kerncraft @pytest.mark.kerncraft
def analysis(kernel, model='ecmdata'): def analysis(kernel, machine, model='ecmdata'):
machine_file_path = INPUT_FOLDER / "Example_SandyBridgeEP_E5-2680.yml"
machine = MachineModel(path_to_yaml=machine_file_path)
if model == 'ecmdata': if model == 'ecmdata':
model = ECMData(kernel, machine, KerncraftParameters()) model = ECMData(kernel, machine, KerncraftParameters())
elif model == 'ecm': elif model == 'ecm':
...@@ -71,7 +69,7 @@ def test_3d_7pt_osaca(): ...@@ -71,7 +69,7 @@ def test_3d_7pt_osaca():
reference_kernel.set_constant('M', size[0]) reference_kernel.set_constant('M', size[0])
reference_kernel.set_constant('N', size[1]) reference_kernel.set_constant('N', size[1])
assert size[1] == size[2] assert size[1] == size[2]
analysis(reference_kernel, model='ecm') analysis(reference_kernel, machine_model, model='ecm')
arr = np.zeros(size) arr = np.zeros(size)
a = Field.create_from_numpy_array('a', arr, index_dimensions=0) a = Field.create_from_numpy_array('a', arr, index_dimensions=0)
...@@ -82,18 +80,22 @@ def test_3d_7pt_osaca(): ...@@ -82,18 +80,22 @@ def test_3d_7pt_osaca():
update_rule = Assignment(b[0, 0, 0], s * rhs) update_rule = Assignment(b[0, 0, 0], s * rhs)
ast = create_kernel([update_rule]) ast = create_kernel([update_rule])
k = PyStencilsKerncraftKernel(ast, machine=machine_model) k = PyStencilsKerncraftKernel(ast, machine=machine_model)
analysis(k, model='ecm') analysis(k, machine_model, model='ecm')
assert reference_kernel._flops == k._flops assert reference_kernel._flops == k._flops
# assert reference.results['cl throughput'] == analysis.results['cl throughput'] # assert reference.results['cl throughput'] == analysis.results['cl throughput']
@pytest.mark.kerncraft @pytest.mark.kerncraft
def test_2d_5pt(): def test_2d_5pt():
machine_file_path = INPUT_FOLDER / "Example_SandyBridgeEP_E5-2680.yml"
machine = MachineModel(path_to_yaml=machine_file_path)
size = [30, 50, 3] size = [30, 50, 3]
kernel_file_path = INPUT_FOLDER / "2d-5pt.c" kernel_file_path = INPUT_FOLDER / "2d-5pt.c"
with open(kernel_file_path) as kernel_file: with open(kernel_file_path) as kernel_file:
reference_kernel = KernelCode(kernel_file.read(), machine=None, filename=kernel_file_path) reference_kernel = KernelCode(kernel_file.read(), machine=machine,
reference = analysis(reference_kernel) filename=kernel_file_path)
reference = analysis(reference_kernel, machine)
arr = np.zeros(size) arr = np.zeros(size)
a = Field.create_from_numpy_array('a', arr, index_dimensions=1) a = Field.create_from_numpy_array('a', arr, index_dimensions=1)
...@@ -102,8 +104,8 @@ def test_2d_5pt(): ...@@ -102,8 +104,8 @@ def test_2d_5pt():
rhs = a[0, -1](0) + a[0, 1] + a[-1, 0] + a[1, 0] rhs = a[0, -1](0) + a[0, 1] + a[-1, 0] + a[1, 0]
update_rule = Assignment(b[0, 0], s * rhs) update_rule = Assignment(b[0, 0], s * rhs)
ast = create_kernel([update_rule]) ast = create_kernel([update_rule])
k = PyStencilsKerncraftKernel(ast) k = PyStencilsKerncraftKernel(ast, machine)
result = analysis(k) result = analysis(k, machine)
for e1, e2 in zip(reference.results['cycles'], result.results['cycles']): for e1, e2 in zip(reference.results['cycles'], result.results['cycles']):
assert e1 == e2 assert e1 == e2
...@@ -111,14 +113,18 @@ def test_2d_5pt(): ...@@ -111,14 +113,18 @@ def test_2d_5pt():
@pytest.mark.kerncraft @pytest.mark.kerncraft
def test_3d_7pt(): def test_3d_7pt():
machine_file_path = INPUT_FOLDER / "Example_SandyBridgeEP_E5-2680.yml"
machine = MachineModel(path_to_yaml=machine_file_path)
size = [30, 50, 50] size = [30, 50, 50]
kernel_file_path = INPUT_FOLDER / "3d-7pt.c" kernel_file_path = INPUT_FOLDER / "3d-7pt.c"
with open(kernel_file_path) as kernel_file: with open(kernel_file_path) as kernel_file:
reference_kernel = KernelCode(kernel_file.read(), machine=None, filename=kernel_file_path) reference_kernel = KernelCode(kernel_file.read(), machine=machine,
filename=kernel_file_path)
reference_kernel.set_constant('M', size[0]) reference_kernel.set_constant('M', size[0])
reference_kernel.set_constant('N', size[1]) reference_kernel.set_constant('N', size[1])
assert size[1] == size[2] assert size[1] == size[2]
reference = analysis(reference_kernel) reference = analysis(reference_kernel, machine)
arr = np.zeros(size) arr = np.zeros(size)
a = Field.create_from_numpy_array('a', arr, index_dimensions=0) a = Field.create_from_numpy_array('a', arr, index_dimensions=0)
...@@ -128,8 +134,8 @@ def test_3d_7pt(): ...@@ -128,8 +134,8 @@ def test_3d_7pt():
update_rule = Assignment(b[0, 0, 0], s * rhs) update_rule = Assignment(b[0, 0, 0], s * rhs)
ast = create_kernel([update_rule]) ast = create_kernel([update_rule])
k = PyStencilsKerncraftKernel(ast) k = PyStencilsKerncraftKernel(ast, machine)
result = analysis(k) result = analysis(k, machine)
for e1, e2 in zip(reference.results['cycles'], result.results['cycles']): for e1, e2 in zip(reference.results['cycles'], result.results['cycles']):
assert e1 == e2 assert e1 == e2
...@@ -163,6 +169,9 @@ def test_benchmark(): ...@@ -163,6 +169,9 @@ def test_benchmark():
@pytest.mark.kerncraft @pytest.mark.kerncraft
def test_kerncraft_generic_field(): def test_kerncraft_generic_field():
machine_file_path = INPUT_FOLDER / "Example_SandyBridgeEP_E5-2680.yml"
machine = MachineModel(path_to_yaml=machine_file_path)
a = fields('a: double[3D]') a = fields('a: double[3D]')
b = fields('b: double[3D]') b = fields('b: double[3D]')
s = sp.Symbol("s") s = sp.Symbol("s")
...@@ -170,4 +179,4 @@ def test_kerncraft_generic_field(): ...@@ -170,4 +179,4 @@ def test_kerncraft_generic_field():
update_rule = Assignment(b[0, 0, 0], s * rhs) update_rule = Assignment(b[0, 0, 0], s * rhs)
ast = create_kernel([update_rule]) ast = create_kernel([update_rule])
k = PyStencilsKerncraftKernel(ast, debug_print=True) k = PyStencilsKerncraftKernel(ast, machine, debug_print=True)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment