Skip to content
Snippets Groups Projects
Commit d7332d59 authored by Martin Bauer
Browse files

OpenMP support for staggered kernels

parent 5c9ab9a8
Branches TextureDeclaration.__str__
Tags
No related merge requests found
...@@ -3,7 +3,7 @@ from functools import partial ...@@ -3,7 +3,7 @@ from functools import partial
from pystencils.astnodes import SympyAssignment, Block, LoopOverCoordinate, KernelFunction from pystencils.astnodes import SympyAssignment, Block, LoopOverCoordinate, KernelFunction
from pystencils.transformations import resolve_buffer_accesses, resolve_field_accesses, make_loop_over_domain, \ from pystencils.transformations import resolve_buffer_accesses, resolve_field_accesses, make_loop_over_domain, \
add_types, get_optimal_loop_ordering, parse_base_pointer_info, move_constants_before_loop, \ add_types, get_optimal_loop_ordering, parse_base_pointer_info, move_constants_before_loop, \
split_inner_loop, get_base_buffer_index split_inner_loop, get_base_buffer_index, filtered_tree_iteration
from pystencils.data_types import TypedSymbol, BasicType, StructType, create_type from pystencils.data_types import TypedSymbol, BasicType, StructType, create_type
from pystencils.field import Field, FieldType from pystencils.field import Field, FieldType
import pystencils.astnodes as ast import pystencils.astnodes as ast
...@@ -152,7 +152,7 @@ def create_indexed_kernel(assignments: AssignmentOrAstNodeList, index_fields, fu ...@@ -152,7 +152,7 @@ def create_indexed_kernel(assignments: AssignmentOrAstNodeList, index_fields, fu
return ast_node return ast_node
def add_openmp(ast_node, schedule="static", num_threads=True, collapse=None): def add_openmp(ast_node, schedule="static", num_threads=True, collapse=None, assume_single_outer_loop=True):
"""Parallelize the outer loop with OpenMP. """Parallelize the outer loop with OpenMP.
Args: Args:
...@@ -160,6 +160,8 @@ def add_openmp(ast_node, schedule="static", num_threads=True, collapse=None): ...@@ -160,6 +160,8 @@ def add_openmp(ast_node, schedule="static", num_threads=True, collapse=None):
schedule: OpenMP scheduling policy e.g. 'static' or 'dynamic' schedule: OpenMP scheduling policy e.g. 'static' or 'dynamic'
num_threads: explicitly specify number of threads num_threads: explicitly specify number of threads
collapse: number of nested loops to include in parallel region (see OpenMP collapse) collapse: number of nested loops to include in parallel region (see OpenMP collapse)
assume_single_outer_loop: if True, an exception is raised if multiple outer loops are detected; for all but
optimized staggered kernels the single-outer-loop assumption should be true
""" """
if not num_threads: if not num_threads:
return return
...@@ -170,31 +172,34 @@ def add_openmp(ast_node, schedule="static", num_threads=True, collapse=None): ...@@ -170,31 +172,34 @@ def add_openmp(ast_node, schedule="static", num_threads=True, collapse=None):
wrapper_block = ast.PragmaBlock('#pragma omp parallel' + threads_clause, body.take_child_nodes()) wrapper_block = ast.PragmaBlock('#pragma omp parallel' + threads_clause, body.take_child_nodes())
body.append(wrapper_block) body.append(wrapper_block)
outer_loops = [l for l in body.atoms(ast.LoopOverCoordinate) if l.is_outermost_loop] outer_loops = [l for l in filtered_tree_iteration(body, LoopOverCoordinate, stop_type=SympyAssignment)
if l.is_outermost_loop]
assert outer_loops, "No outer loop found" assert outer_loops, "No outer loop found"
assert len(outer_loops) <= 1, "More than one outer loop found. Not clear where to put OpenMP pragma." if assume_single_outer_loop and len(outer_loops) > 1:
loop_to_parallelize = outer_loops[0] raise ValueError("More than one outer loop found, only one outer loop expected")
try:
loop_range = int(loop_to_parallelize.stop - loop_to_parallelize.start) for loop_to_parallelize in outer_loops:
except TypeError: try:
loop_range = None loop_range = int(loop_to_parallelize.stop - loop_to_parallelize.start)
except TypeError:
if num_threads is None: loop_range = None
import multiprocessing
num_threads = multiprocessing.cpu_count() if num_threads is None:
import multiprocessing
if loop_range is not None and loop_range < num_threads and not collapse: num_threads = multiprocessing.cpu_count()
contained_loops = [l for l in loop_to_parallelize.body.args if isinstance(l, LoopOverCoordinate)]
if len(contained_loops) == 1: if loop_range is not None and loop_range < num_threads and not collapse:
contained_loop = contained_loops[0] contained_loops = [l for l in loop_to_parallelize.body.args if isinstance(l, LoopOverCoordinate)]
try: if len(contained_loops) == 1:
contained_loop_range = int(contained_loop.stop - contained_loop.start) contained_loop = contained_loops[0]
if contained_loop_range > loop_range: try:
loop_to_parallelize = contained_loop contained_loop_range = int(contained_loop.stop - contained_loop.start)
except TypeError: if contained_loop_range > loop_range:
pass loop_to_parallelize = contained_loop
except TypeError:
prefix = "#pragma omp for schedule(%s)" % (schedule,) pass
if collapse:
prefix += " collapse(%d)" % (collapse, ) prefix = "#pragma omp for schedule(%s)" % (schedule,)
loop_to_parallelize.prefix_lines.append(prefix) if collapse:
prefix += " collapse(%d)" % (collapse, )
loop_to_parallelize.prefix_lines.append(prefix)
...@@ -245,13 +245,21 @@ def create_staggered_kernel(staggered_field, expressions, subexpressions=(), tar ...@@ -245,13 +245,21 @@ def create_staggered_kernel(staggered_field, expressions, subexpressions=(), tar
cpu_vectorize_info = kwargs.get('cpu_vectorize_info', None) cpu_vectorize_info = kwargs.get('cpu_vectorize_info', None)
if cpu_vectorize_info: if cpu_vectorize_info:
del kwargs['cpu_vectorize_info'] del kwargs['cpu_vectorize_info']
openmp = kwargs.get('cpu_openmp', None)
if openmp:
del kwargs['cpu_openmp']
ast = create_kernel(final_assignments, ghost_layers=ghost_layers, target=target, **kwargs) ast = create_kernel(final_assignments, ghost_layers=ghost_layers, target=target, **kwargs)
if target == 'cpu': if target == 'cpu':
remove_conditionals_in_staggered_kernel(ast) remove_conditionals_in_staggered_kernel(ast)
move_constants_before_loop(ast) move_constants_before_loop(ast)
omp_collapse = None
if blocking: if blocking:
loop_blocking(ast, blocking) omp_collapse = loop_blocking(ast, blocking)
if openmp:
from pystencils.cpu import add_openmp
add_openmp(ast, num_threads=openmp, collapse=omp_collapse, assume_single_outer_loop=False)
if cpu_vectorize_info is True: if cpu_vectorize_info is True:
vectorize(ast) vectorize(ast)
elif isinstance(cpu_vectorize_info, dict): elif isinstance(cpu_vectorize_info, dict):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment