Skip to content
Snippets Groups Projects
Commit 0f21a28f authored by Martin Bauer's avatar Martin Bauer
Browse files

Added support for inner/outer kernels to overlap communication

parent dc975ff4
Branches
Tags
No related merge requests found
...@@ -4,21 +4,23 @@ import copy ...@@ -4,21 +4,23 @@ import copy
from pystencils.astnodes import ResolvedFieldAccess from pystencils.astnodes import ResolvedFieldAccess
from pystencils.data_types import get_base_type from pystencils.data_types import get_base_type
from pystencils.backends.cbackend import generate_c, CustomSympyPrinter from pystencils.backends.cbackend import generate_c, CustomSympyPrinter
from pystencils.field import FieldType from pystencils.field import FieldType, Field
from pystencils.sympyextensions import prod from pystencils.sympyextensions import prod
temporary_fieldMemberTemplate = """
std::set< {type} *, field::SwapableCompare< {type} * > > cache_{original_field_name}_;"""
temporary_fieldTemplate = """ temporary_fieldTemplate = """
// Getting temporary field {tmp_field_name} // Getting temporary field {tmp_field_name}
static std::set< {type} *, field::SwapableCompare< {type} * > > cache_{original_field_name}; auto it = cache_{original_field_name}_.find( {original_field_name} );
auto it = cache_{original_field_name}.find( {original_field_name} ); if( it != cache_{original_field_name}_.end() )
if( it != cache_{original_field_name}.end() )
{{ {{
{tmp_field_name} = *it; {tmp_field_name} = *it;
}} }}
else else
{{ {{
{tmp_field_name} = {original_field_name}->cloneUninitialized(); {tmp_field_name} = {original_field_name}->cloneUninitialized();
cache_{original_field_name}.insert({tmp_field_name}); cache_{original_field_name}_.insert({tmp_field_name});
}} }}
""" """
...@@ -36,6 +38,7 @@ def get_field_fsize(field, field_accesses=()): ...@@ -36,6 +38,7 @@ def get_field_fsize(field, field_accesses=()):
elif field.index_dimensions == 0: elif field.index_dimensions == 0:
return 1 return 1
else: else:
assert len(field_accesses) > 0
assert field.index_dimensions == 1 assert field.index_dimensions == 1
max_idx_value = 0 max_idx_value = 0
for acc in field_accesses: for acc in field_accesses:
...@@ -97,7 +100,7 @@ def field_extraction_code(field_accesses, field_name, is_temporary, declaration_ ...@@ -97,7 +100,7 @@ def field_extraction_code(field_accesses, field_name, is_temporary, declaration_
# Determine size of f coordinate which is a template parameter # Determine size of f coordinate which is a template parameter
f_size = get_field_fsize(field, field_accesses) f_size = get_field_fsize(field, field_accesses)
dtype = get_base_type(field.dtype) dtype = get_base_type(field.dtype)
field_type = "cuda::GPUField<%s>" % (dtype,) if is_gpu else "GhostLayerField<%s, %d>" % (dtype, f_size) field_type = make_field_type(dtype, f_size, is_gpu)
if not is_temporary: if not is_temporary:
dtype = get_base_type(field.dtype) dtype = get_base_type(field.dtype)
...@@ -182,6 +185,7 @@ def generate_call(ctx, kernel_info, ghost_layers_to_include=0, cell_interval=Non ...@@ -182,6 +185,7 @@ def generate_call(ctx, kernel_info, ghost_layers_to_include=0, cell_interval=Non
""" """
assert isinstance(ghost_layers_to_include, str) or ghost_layers_to_include >= 0 assert isinstance(ghost_layers_to_include, str) or ghost_layers_to_include >= 0
ast = kernel_info.ast ast = kernel_info.ast
ast_params = kernel_info.ast.parameters
ghost_layers_to_include = sp.sympify(ghost_layers_to_include) ghost_layers_to_include = sp.sympify(ghost_layers_to_include)
if ast.ghost_layers is None: if ast.ghost_layers is None:
...@@ -195,10 +199,7 @@ def generate_call(ctx, kernel_info, ghost_layers_to_include=0, cell_interval=Non ...@@ -195,10 +199,7 @@ def generate_call(ctx, kernel_info, ghost_layers_to_include=0, cell_interval=Non
kernel_call_lines = [] kernel_call_lines = []
fields = {f.name: f for f in ast.fields_accessed} fields = {f.name: f for f in ast.fields_accessed}
spatial_shape_symbols = [] def get_start_coordinates(field_object):
def get_start_coordinates(parameter):
field_object = fields[parameter.field_name]
if cell_interval is None: if cell_interval is None:
return [-ghost_layers_to_include - required_ghost_layers] * field_object.spatial_dimensions return [-ghost_layers_to_include - required_ghost_layers] * field_object.spatial_dimensions
else: else:
...@@ -206,19 +207,43 @@ def generate_call(ctx, kernel_info, ghost_layers_to_include=0, cell_interval=Non ...@@ -206,19 +207,43 @@ def generate_call(ctx, kernel_info, ghost_layers_to_include=0, cell_interval=Non
return [sp.Symbol("{ci}.{coord}Min()".format(coord=coord, ci=cell_interval)) - required_ghost_layers return [sp.Symbol("{ci}.{coord}Min()".format(coord=coord, ci=cell_interval)) - required_ghost_layers
for coord in ('x', 'y', 'z')] for coord in ('x', 'y', 'z')]
def get_end_coordinates(parameter): def get_end_coordinates(field_object):
field_object = fields[parameter.field_name]
if cell_interval is None: if cell_interval is None:
shape_names = ['xSize()', 'ySize()', 'zSize()'][:field_object.spatial_dimensions] shape_names = ['xSize()', 'ySize()', 'zSize()'][:field_object.spatial_dimensions]
offset = 2 * ghost_layers_to_include + 2 * required_ghost_layers offset = 2 * ghost_layers_to_include + 2 * required_ghost_layers
return ["%s->%s + %s" % (parameter.field_name, e, offset) for e in shape_names] return ["%s->%s + %s" % (field_object.name, e, offset) for e in shape_names]
else: else:
assert ghost_layers_to_include == 0 assert ghost_layers_to_include == 0
coord_names = ['x', 'y', 'z'][:field_object.spatial_dimensions] coord_names = ['x', 'y', 'z'][:field_object.spatial_dimensions]
return ["{ci}.{coord}Size() + {gl}".format(coord=coord, ci=cell_interval, gl=required_ghost_layers) return ["{ci}.{coord}Size() + {gl}".format(coord=coord, ci=cell_interval, gl=2 * required_ghost_layers)
for coord in coord_names] for coord in coord_names]
for param in ast.parameters: def create_field_shape_code(field, param_name, gpu_copy=True):
result = []
type_str = get_base_type(Field.SHAPE_DTYPE).base_name
shapes = ["%s(%s)" % (type_str, c) for c in get_end_coordinates(field)]
max_values = ["%s->%sSizeWithGhostLayer()" % (field.name, coord) for coord in ['x', 'y', 'z']]
for shape, max_value in zip(shapes, max_values):
result.append("WALBERLA_ASSERT_GREATER_EQUAL(%s, %s);" % (max_value, shape))
if field.index_dimensions == 1:
shapes.append("%s(%s->fSize())" % (type_str, field.name))
elif field.index_dimensions > 1:
shapes.extend(["%s(%d)" % (type_str, e) for e in field.index_shape])
result.append("WALBERLA_ASSERT_EQUAL(int(%s->fSize()), %d);" %
(field.name, prod(field.index_shape)))
if is_cpu or not gpu_copy:
result.append("const %s %s [] = {%s};" % (type_str, param_name, ", ".join(shapes)))
else:
result.append("const %s %s_cpu [] = {%s};" % (type_str, param_name, ", ".join(shapes)))
result.append(
"WALBERLA_CUDA_CHECK( cudaMemcpyToSymbolAsync(internal_%s::%s, %s_cpu, %d * sizeof(%s), "
"0, cudaMemcpyHostToDevice, %s) );"
% (ast.function_name, param_name, param_name, len(shapes), type_str, stream))
return result
for param in ast_params:
if param.is_field_argument and FieldType.is_indexed(fields[param.field_name]): if param.is_field_argument and FieldType.is_indexed(fields[param.field_name]):
continue continue
...@@ -227,7 +252,7 @@ def generate_call(ctx, kernel_info, ghost_layers_to_include=0, cell_interval=Non ...@@ -227,7 +252,7 @@ def generate_call(ctx, kernel_info, ghost_layers_to_include=0, cell_interval=Non
if field.field_type == FieldType.BUFFER: if field.field_type == FieldType.BUFFER:
kernel_call_lines.append("%s %s = %s;" % (param.dtype, param.name, param.field_name)) kernel_call_lines.append("%s %s = %s;" % (param.dtype, param.name, param.field_name))
else: else:
coordinates = get_start_coordinates(param) coordinates = get_start_coordinates(field)
actual_gls = "int_c(%s->nrOfGhostLayers())" % (param.field_name, ) actual_gls = "int_c(%s->nrOfGhostLayers())" % (param.field_name, )
for c in set(coordinates): for c in set(coordinates):
kernel_call_lines.append("WALBERLA_ASSERT_GREATER_EQUAL(%s, -%s);" % kernel_call_lines.append("WALBERLA_ASSERT_GREATER_EQUAL(%s, -%s);" %
...@@ -255,36 +280,32 @@ def generate_call(ctx, kernel_info, ghost_layers_to_include=0, cell_interval=Non ...@@ -255,36 +280,32 @@ def generate_call(ctx, kernel_info, ghost_layers_to_include=0, cell_interval=Non
else: else:
kernel_call_lines.append("const %s %s_cpu [] = {%s};" % (type_str, param.name, ", ".join(strides))) kernel_call_lines.append("const %s %s_cpu [] = {%s};" % (type_str, param.name, ", ".join(strides)))
kernel_call_lines.append( kernel_call_lines.append(
"WALBERLA_CUDA_CHECK( cudaMemcpyToSymbolAsync(internal_%s::%s, %s_cpu, %d * sizeof(%s), 0, cudaMemcpyHostToDevice, %s) );" "WALBERLA_CUDA_CHECK( cudaMemcpyToSymbolAsync(internal_%s::%s, %s_cpu, %d * sizeof(%s), "
"0, cudaMemcpyHostToDevice, %s) );"
% (ast.function_name, param.name, param.name, len(strides), type_str, stream)) % (ast.function_name, param.name, param.name, len(strides), type_str, stream))
elif param.is_field_shape_argument: elif param.is_field_shape_argument:
field = fields[param.field_name] kernel_call_lines += create_field_shape_code(fields[param.field_name], param.name)
type_str = get_base_type(param.dtype).base_name
shapes = ["%s(%s)" % (type_str, c) for c in get_end_coordinates(param)]
spatial_shape_symbols = [sp.Symbol("%s_cpu[%d]" % (param.name, i)) for i in range(field.spatial_dimensions)]
max_values = ["%s->%sSizeWithGhostLayer()" % (field.name, coord) for coord in ['x', 'y', 'z']]
for shape, max_value in zip(shapes, max_values):
kernel_call_lines.append("WALBERLA_ASSERT_GREATER_EQUAL(%s, %s);" % (max_value, shape))
if field.index_dimensions == 1:
shapes.append("%s(%s->fSize())" % (type_str, field.name))
elif field.index_dimensions > 1:
shapes.extend(["%s(%d)" % (type_str, e) for e in field.index_shape])
kernel_call_lines.append("WALBERLA_ASSERT_EQUAL(int(%s->fSize()), %d);" %
(field.name, prod(field.index_shape)))
if is_cpu:
kernel_call_lines.append("const %s %s [] = {%s};" % (type_str, param.name, ", ".join(shapes)))
else:
kernel_call_lines.append("const %s %s_cpu [] = {%s};" % (type_str, param.name, ", ".join(shapes)))
kernel_call_lines.append(
"WALBERLA_CUDA_CHECK( cudaMemcpyToSymbolAsync(internal_%s::%s, %s_cpu, %d * sizeof(%s), 0, cudaMemcpyHostToDevice, %s) );"
% (ast.function_name, param.name, param.name, len(shapes), type_str, stream))
if not is_cpu: if not is_cpu:
spatial_shape_symbols = []
for param in ast_params:
if param.is_field_shape_argument:
spatial_shape_symbols = [sp.Symbol("%s_cpu[%d]" % (param.name, i))
for i in range(field.spatial_dimensions)]
if not spatial_shape_symbols:
for param in ast_params:
if not param.is_field_argument:
continue
field = fields[param.field_name]
if field.field_type == FieldType.GENERIC:
kernel_call_lines += create_field_shape_code(field, '_size', gpu_copy=False)
spatial_shape_symbols = [sp.Symbol("_size[%d]" % (i, )) for i in range(field.spatial_dimensions)]
break
indexing_dict = ast.indexing.call_parameters(spatial_shape_symbols) indexing_dict = ast.indexing.call_parameters(spatial_shape_symbols)
call_parameters = ", ".join([p.name for p in ast.parameters call_parameters = ", ".join([p.name for p in ast_params
if p.is_field_ptr_argument or not p.is_field_argument]) if p.is_field_ptr_argument or not p.is_field_argument])
sp_printer_c = CustomSympyPrinter() sp_printer_c = CustomSympyPrinter()
...@@ -296,7 +317,7 @@ def generate_call(ctx, kernel_info, ghost_layers_to_include=0, cell_interval=Non ...@@ -296,7 +317,7 @@ def generate_call(ctx, kernel_info, ghost_layers_to_include=0, cell_interval=Non
] ]
else: else:
kernel_call_lines.append("internal_%s::%s(%s);" % kernel_call_lines.append("internal_%s::%s(%s);" %
(ast.function_name, ast.function_name, ", ".join([p.name for p in ast.parameters]))) (ast.function_name, ast.function_name, ", ".join([p.name for p in ast_params])))
return "\n".join(kernel_call_lines) return "\n".join(kernel_call_lines)
...@@ -339,16 +360,36 @@ def generate_constructor_parameters(kernel_info, parameters_to_ignore=[]): ...@@ -339,16 +360,36 @@ def generate_constructor_parameters(kernel_info, parameters_to_ignore=[]):
return ", ".join(parameter_list + varying_parameters) return ", ".join(parameter_list + varying_parameters)
def generate_members(kernel_info, parameters_to_ignore=[]): @jinja2.contextfilter
def generate_members(ctx, kernel_info, parameters_to_ignore=None, only_fields=False):
if parameters_to_ignore is None:
parameters_to_ignore = []
ast = kernel_info.ast ast = kernel_info.ast
parameters_to_ignore += kernel_info.temporary_fields fields = {f.name: f for f in ast.fields_accessed}
params_to_skip = parameters_to_ignore + kernel_info.temporary_fields
is_gpu = ctx['target'] == 'gpu'
result = [] result = []
for param in ast.parameters: for param in ast.parameters:
if param.is_field_ptr_argument and param.field_name not in parameters_to_ignore: if only_fields and not param.is_field_argument:
continue
if param.is_field_ptr_argument and param.field_name not in params_to_skip:
result.append("BlockDataID %sID;" % (param.field_name, )) result.append("BlockDataID %sID;" % (param.field_name, ))
elif not param.is_field_argument and param.name not in parameters_to_ignore: elif not param.is_field_argument and param.name not in params_to_skip:
result.append("%s %s;" % (param.dtype, param.name,)) result.append("%s %s;" % (param.dtype, param.name,))
for field_name in kernel_info.temporary_fields:
f = fields[field_name]
if field_name in parameters_to_ignore:
continue
assert field_name.endswith('_tmp')
original_field_name = field_name[:-len('_tmp')]
f_size = get_field_fsize(f)
field_type = make_field_type(get_base_type(f.dtype), f_size, is_gpu)
result.append(temporary_fieldMemberTemplate.format(type=field_type, original_field_name=original_field_name))
return "\n".join(result) return "\n".join(result)
......
...@@ -51,7 +51,8 @@ class Sweep: ...@@ -51,7 +51,8 @@ class Sweep:
func = functools.partial(kernel_decorator, sweep_function, sweep=sweep) func = functools.partial(kernel_decorator, sweep_function, sweep=sweep)
cb = functools.partial(Sweep._generate_header_and_source, func, name, target, cb = functools.partial(Sweep._generate_header_and_source, func, name, target,
namespace, sweep._temporary_fields, sweep._field_swaps, optimization=optimization) namespace, sweep._temporary_fields, sweep._field_swaps, optimization=optimization,
staggered=False, varying_parameters=[])
file_names = [name + ".h", name + ('.cpp' if target == 'cpu' else '.cu')] file_names = [name + ".h", name + ('.cpp' if target == 'cpu' else '.cu')]
codegen.register(file_names, cb) codegen.register(file_names, cb)
...@@ -95,11 +96,14 @@ class Sweep: ...@@ -95,11 +96,14 @@ class Sweep:
env = Environment(loader=PackageLoader('pystencils_walberla')) env = Environment(loader=PackageLoader('pystencils_walberla'))
add_pystencils_filters_to_jinja_env(env) add_pystencils_filters_to_jinja_env(env)
representative_field = {p.field_name for p in ast.parameters if p.is_field_argument}.pop()
context = { context = {
'kernel': KernelInfo(ast, temporary_fields, field_swaps, varying_parameters), 'kernel': KernelInfo(ast, temporary_fields, field_swaps, varying_parameters),
'namespace': namespace, 'namespace': namespace,
'class_name': ast.function_name[0].upper() + ast.function_name[1:], 'class_name': ast.function_name[0].upper() + ast.function_name[1:],
'target': target, 'target': target,
'field': representative_field,
} }
header = env.get_template("Sweep.tmpl.h").render(**context) header = env.get_template("Sweep.tmpl.h").render(**context)
......
...@@ -49,10 +49,74 @@ namespace {{namespace}} { ...@@ -49,10 +49,74 @@ namespace {{namespace}} {
void {{class_name}}::operator() ( IBlock * block ) void {{class_name}}::operator() ( IBlock * block )
{ {
{{kernel|generate_block_data_to_field_extraction|indent(4)}} {{kernel|generate_block_data_to_field_extraction|indent(4)}}
{{kernel|generate_call|indent(4)}} {{kernel|generate_call(stream='stream_')|indent(4)}}
{{kernel|generate_swaps|indent(4)}} {{kernel|generate_swaps|indent(4)}}
} }
void {{class_name}}::inner( IBlock * block )
{
{{kernel|generate_block_data_to_field_extraction|indent(4)}}
CellInterval inner = {{field}}->xyzSize();
inner.expand(-1);
{{kernel|generate_call(stream='stream_', cell_interval='inner')|indent(4)}}
}
void {{class_name}}::outer( IBlock * block )
{
static std::vector<CellInterval> layers;
{%if target is equalto 'gpu'%}
static std::vector<cudaStream_t> streams;
{% endif %}
{{kernel|generate_block_data_to_field_extraction|indent(4)}}
if( layers.size() == 0 )
{
CellInterval ci;
{{field}}->getSliceBeforeGhostLayer(stencil::T, ci, 1, false);
layers.push_back(ci);
{{field}}->getSliceBeforeGhostLayer(stencil::B, ci, 1, false);
layers.push_back(ci);
{{field}}->getSliceBeforeGhostLayer(stencil::N, ci, 1, false);
ci.expand(Cell(0, 0, -1));
layers.push_back(ci);
{{field}}->getSliceBeforeGhostLayer(stencil::S, ci, 1, false);
ci.expand(Cell(0, 0, -1));
layers.push_back(ci);
{{field}}->getSliceBeforeGhostLayer(stencil::E, ci, 1, false);
ci.expand(Cell(0, -1, -1));
layers.push_back(ci);
{{field}}->getSliceBeforeGhostLayer(stencil::W, ci, 1, false);
ci.expand(Cell(0, -1, -1));
layers.push_back(ci);
{%if target is equalto 'gpu'%}
for( int i=0; i < layers.size(); ++i )
{
streams.push_back(cudaStream_t());
WALBERLA_CUDA_CHECK( cudaStreamCreate(&streams.back() ) );
}
{% endif %}
}
for( int i=0; i < layers.size(); ++i )
{
{{kernel|generate_call(stream='streams[i]', cell_interval="layers[i]")|indent(8)}}
WALBERLA_CUDA_CHECK(cudaStreamSynchronize(streams[i])); // TODO move out when no memcpy is needed to setup call
}
{{kernel|generate_swaps|indent(4)}}
}
} // namespace {{namespace}} } // namespace {{namespace}}
} // namespace walberla } // namespace walberla
......
...@@ -50,14 +50,20 @@ namespace {{namespace}} { ...@@ -50,14 +50,20 @@ namespace {{namespace}} {
class {{class_name}} class {{class_name}}
{ {
public: public:
{{class_name}}( {{kernel|generate_constructor_parameters}} ) {{class_name}}( {{kernel|generate_constructor_parameters}}{%if target is equalto 'gpu'%} , cudaStream_t stream = 0{% endif %})
: {{ kernel|generate_constructor_initializer_list }} : {{ kernel|generate_constructor_initializer_list }}, stream_(stream)
{}; {};
void operator() ( IBlock * block ); void operator() ( IBlock * block );
void inner( IBlock * block );
void outer( IBlock * block );
private: private:
{{kernel|generate_members|indent(4)}} {{kernel|generate_members|indent(4)}}
{%if target is equalto 'gpu'%}
cudaStream_t stream_;
{% endif %}
}; };
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment