From 0f21a28f4c9490826f32fd1e589fa25768a42c8d Mon Sep 17 00:00:00 2001
From: Martin Bauer <martin.bauer@fau.de>
Date: Thu, 18 Oct 2018 12:13:56 +0200
Subject: [PATCH] Added support for inner/outer kernels to overlap computation
 and communication
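
The generated sweep class now has inner() and outer() methods in addition to
operator()(): inner() runs the kernel on the block interior (the xyz size
shrunk by one cell per side), while outer() runs it on the six remaining
boundary slices, using one CUDA stream per slice on the GPU target, and then
performs the field swaps. The GPU constructor takes an optional cudaStream_t,
and the caches for temporary fields moved from static locals in the generated
function into class members.

A minimal usage sketch of the intended overlap, assuming a generated class
pystencils::MySweep, a D3Q19 UniformBufferedScheme and the usual block-forest
setup (these surrounding names are illustrative and not part of this patch):

    #include "blockforest/StructuredBlockForest.h"
    #include "blockforest/communication/UniformBufferedScheme.h"
    #include "stencil/D3Q19.h"

    #include "MySweep.h"   // header generated from Sweep.tmpl.h (name illustrative)

    using namespace walberla;

    void timestep( StructuredBlockForest & blocks,
                   blockforest::communication::UniformBufferedScheme< stencil::D3Q19 > & comm,
                   pystencils::MySweep & sweep )
    {
        comm.startCommunication();     // post the ghost-layer exchange
        for( auto & block : blocks )
            sweep.inner( &block );     // interior cells, independent of ghost layers
        comm.wait();                   // ghost layers are valid from here on
        for( auto & block : blocks )
            sweep.outer( &block );     // outermost cell layers + field swaps
    }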

---
 jinja_filters.py         | 131 +++++++++++++++++++++++++--------------
 sweep.py                 |   6 +-
 templates/Sweep.tmpl.cpp |  66 +++++++++++++++++++-
 templates/Sweep.tmpl.h   |  10 ++-
 4 files changed, 164 insertions(+), 49 deletions(-)

diff --git a/jinja_filters.py b/jinja_filters.py
index b5c64ba..8e82794 100644
--- a/jinja_filters.py
+++ b/jinja_filters.py
@@ -4,21 +4,23 @@ import copy
 from pystencils.astnodes import ResolvedFieldAccess
 from pystencils.data_types import get_base_type
 from pystencils.backends.cbackend import generate_c, CustomSympyPrinter
-from pystencils.field import FieldType
+from pystencils.field import FieldType, Field
 from pystencils.sympyextensions import prod
 
+temporary_fieldMemberTemplate = """
+std::set< {type} *, field::SwapableCompare< {type} * > > cache_{original_field_name}_;"""
+
 temporary_fieldTemplate = """
 // Getting temporary field {tmp_field_name}
-static std::set< {type} *, field::SwapableCompare< {type} * > > cache_{original_field_name};
-auto it = cache_{original_field_name}.find( {original_field_name} );
-if( it != cache_{original_field_name}.end() )
+auto it = cache_{original_field_name}_.find( {original_field_name} );
+if( it != cache_{original_field_name}_.end() )
 {{
     {tmp_field_name} = *it;
 }}
 else 
 {{
     {tmp_field_name} = {original_field_name}->cloneUninitialized();
-    cache_{original_field_name}.insert({tmp_field_name});
+    cache_{original_field_name}_.insert({tmp_field_name});
 }}
 """
 
@@ -36,6 +38,7 @@ def get_field_fsize(field, field_accesses=()):
     elif field.index_dimensions == 0:
         return 1
     else:
+        assert len(field_accesses) > 0
         assert field.index_dimensions == 1
         max_idx_value = 0
         for acc in field_accesses:
@@ -97,7 +100,7 @@ def field_extraction_code(field_accesses, field_name, is_temporary, declaration_
     # Determine size of f coordinate which is a template parameter
     f_size = get_field_fsize(field, field_accesses)
     dtype = get_base_type(field.dtype)
-    field_type = "cuda::GPUField<%s>" % (dtype,) if is_gpu else "GhostLayerField<%s, %d>" % (dtype, f_size)
+    field_type = make_field_type(dtype, f_size, is_gpu)
 
     if not is_temporary:
         dtype = get_base_type(field.dtype)
@@ -182,6 +185,7 @@ def generate_call(ctx, kernel_info, ghost_layers_to_include=0, cell_interval=Non
     """
     assert isinstance(ghost_layers_to_include, str) or ghost_layers_to_include >= 0
     ast = kernel_info.ast
+    ast_params = kernel_info.ast.parameters
 
     ghost_layers_to_include = sp.sympify(ghost_layers_to_include)
     if ast.ghost_layers is None:
@@ -195,10 +199,7 @@ def generate_call(ctx, kernel_info, ghost_layers_to_include=0, cell_interval=Non
     kernel_call_lines = []
     fields = {f.name: f for f in ast.fields_accessed}
 
-    spatial_shape_symbols = []
-
-    def get_start_coordinates(parameter):
-        field_object = fields[parameter.field_name]
+    def get_start_coordinates(field_object):
         if cell_interval is None:
             return [-ghost_layers_to_include - required_ghost_layers] * field_object.spatial_dimensions
         else:
@@ -206,19 +207,43 @@ def generate_call(ctx, kernel_info, ghost_layers_to_include=0, cell_interval=Non
             return [sp.Symbol("{ci}.{coord}Min()".format(coord=coord, ci=cell_interval)) - required_ghost_layers
                     for coord in ('x', 'y', 'z')]
 
-    def get_end_coordinates(parameter):
-        field_object = fields[parameter.field_name]
+    def get_end_coordinates(field_object):
         if cell_interval is None:
             shape_names = ['xSize()', 'ySize()', 'zSize()'][:field_object.spatial_dimensions]
             offset = 2 * ghost_layers_to_include + 2 * required_ghost_layers
-            return ["%s->%s + %s" % (parameter.field_name, e, offset) for e in shape_names]
+            return ["%s->%s + %s" % (field_object.name, e, offset) for e in shape_names]
         else:
             assert ghost_layers_to_include == 0
             coord_names = ['x', 'y', 'z'][:field_object.spatial_dimensions]
-            return ["{ci}.{coord}Size() + {gl}".format(coord=coord, ci=cell_interval, gl=required_ghost_layers)
+            return ["{ci}.{coord}Size() + {gl}".format(coord=coord, ci=cell_interval, gl=2 * required_ghost_layers)
                     for coord in coord_names]
 
-    for param in ast.parameters:
+    def create_field_shape_code(field, param_name, gpu_copy=True):
+        result = []
+        type_str = get_base_type(Field.SHAPE_DTYPE).base_name
+        shapes = ["%s(%s)" % (type_str, c) for c in get_end_coordinates(field)]
+
+        max_values = ["%s->%sSizeWithGhostLayer()" % (field.name, coord) for coord in ['x', 'y', 'z']]
+        for shape, max_value in zip(shapes, max_values):
+            result.append("WALBERLA_ASSERT_GREATER_EQUAL(%s, %s);" % (max_value, shape))
+
+        if field.index_dimensions == 1:
+            shapes.append("%s(%s->fSize())" % (type_str, field.name))
+        elif field.index_dimensions > 1:
+            shapes.extend(["%s(%d)" % (type_str, e) for e in field.index_shape])
+            result.append("WALBERLA_ASSERT_EQUAL(int(%s->fSize()),  %d);" %
+                          (field.name, prod(field.index_shape)))
+        if is_cpu or not gpu_copy:
+            result.append("const %s %s [] = {%s};" % (type_str, param_name, ", ".join(shapes)))
+        else:
+            result.append("const %s %s_cpu [] = {%s};" % (type_str, param_name, ", ".join(shapes)))
+            result.append(
+                "WALBERLA_CUDA_CHECK( cudaMemcpyToSymbolAsync(internal_%s::%s, %s_cpu, %d * sizeof(%s), "
+                "0, cudaMemcpyHostToDevice, %s) );"
+                % (ast.function_name, param_name, param_name, len(shapes), type_str, stream))
+        return result
+
+    for param in ast_params:
         if param.is_field_argument and FieldType.is_indexed(fields[param.field_name]):
             continue
 
@@ -227,7 +252,7 @@ def generate_call(ctx, kernel_info, ghost_layers_to_include=0, cell_interval=Non
             if field.field_type == FieldType.BUFFER:
                 kernel_call_lines.append("%s %s = %s;" % (param.dtype, param.name, param.field_name))
             else:
-                coordinates = get_start_coordinates(param)
+                coordinates = get_start_coordinates(field)
                 actual_gls = "int_c(%s->nrOfGhostLayers())" % (param.field_name, )
                 for c in set(coordinates):
                     kernel_call_lines.append("WALBERLA_ASSERT_GREATER_EQUAL(%s, -%s);" %
@@ -255,36 +280,32 @@ def generate_call(ctx, kernel_info, ghost_layers_to_include=0, cell_interval=Non
             else:
                 kernel_call_lines.append("const %s %s_cpu [] = {%s};" % (type_str, param.name, ", ".join(strides)))
                 kernel_call_lines.append(
-                    "WALBERLA_CUDA_CHECK( cudaMemcpyToSymbolAsync(internal_%s::%s, %s_cpu, %d * sizeof(%s), 0, cudaMemcpyHostToDevice, %s) );"
+                    "WALBERLA_CUDA_CHECK( cudaMemcpyToSymbolAsync(internal_%s::%s, %s_cpu, %d * sizeof(%s), "
+                    "0, cudaMemcpyHostToDevice, %s) );"
                     % (ast.function_name, param.name, param.name, len(strides), type_str, stream))
 
         elif param.is_field_shape_argument:
-            field = fields[param.field_name]
-            type_str = get_base_type(param.dtype).base_name
-            shapes = ["%s(%s)" % (type_str, c) for c in get_end_coordinates(param)]
-            spatial_shape_symbols = [sp.Symbol("%s_cpu[%d]" % (param.name, i)) for i in range(field.spatial_dimensions)]
-
-            max_values = ["%s->%sSizeWithGhostLayer()" % (field.name, coord) for coord in ['x', 'y', 'z']]
-            for shape, max_value in zip(shapes, max_values):
-                kernel_call_lines.append("WALBERLA_ASSERT_GREATER_EQUAL(%s, %s);" % (max_value, shape))
-
-            if field.index_dimensions == 1:
-                shapes.append("%s(%s->fSize())" % (type_str, field.name))
-            elif field.index_dimensions > 1:
-                shapes.extend(["%s(%d)" % (type_str, e) for e in field.index_shape])
-                kernel_call_lines.append("WALBERLA_ASSERT_EQUAL(int(%s->fSize()),  %d);" %
-                                         (field.name, prod(field.index_shape)))
-            if is_cpu:
-                kernel_call_lines.append("const %s %s [] = {%s};" % (type_str, param.name, ", ".join(shapes)))
-            else:
-                kernel_call_lines.append("const %s %s_cpu [] = {%s};" % (type_str, param.name, ", ".join(shapes)))
-                kernel_call_lines.append(
-                    "WALBERLA_CUDA_CHECK( cudaMemcpyToSymbolAsync(internal_%s::%s, %s_cpu, %d * sizeof(%s), 0, cudaMemcpyHostToDevice, %s) );"
-                    % (ast.function_name, param.name, param.name, len(shapes), type_str, stream))
+            kernel_call_lines += create_field_shape_code(fields[param.field_name], param.name)
 
     if not is_cpu:
+        spatial_shape_symbols = []
+        for param in ast_params:
+            if param.is_field_shape_argument:
+                spatial_shape_symbols = [sp.Symbol("%s_cpu[%d]" % (param.name, i))
+                                         for i in range(fields[param.field_name].spatial_dimensions)]
+
+        if not spatial_shape_symbols:
+            for param in ast_params:
+                if not param.is_field_argument:
+                    continue
+                field = fields[param.field_name]
+                if field.field_type == FieldType.GENERIC:
+                    kernel_call_lines += create_field_shape_code(field, '_size', gpu_copy=False)
+                    spatial_shape_symbols = [sp.Symbol("_size[%d]" % (i, )) for i in range(field.spatial_dimensions)]
+                    break
+
         indexing_dict = ast.indexing.call_parameters(spatial_shape_symbols)
-        call_parameters = ", ".join([p.name for p in ast.parameters
+        call_parameters = ", ".join([p.name for p in ast_params
                                      if p.is_field_ptr_argument or not p.is_field_argument])
         sp_printer_c = CustomSympyPrinter()
 
@@ -296,7 +317,7 @@ def generate_call(ctx, kernel_info, ghost_layers_to_include=0, cell_interval=Non
         ]
     else:
         kernel_call_lines.append("internal_%s::%s(%s);" %
-                                 (ast.function_name, ast.function_name, ", ".join([p.name for p in ast.parameters])))
+                                 (ast.function_name, ast.function_name, ", ".join([p.name for p in ast_params])))
     return "\n".join(kernel_call_lines)
 
 
@@ -339,16 +360,36 @@ def generate_constructor_parameters(kernel_info, parameters_to_ignore=[]):
     return ", ".join(parameter_list + varying_parameters)
 
 
-def generate_members(kernel_info, parameters_to_ignore=[]):
+@jinja2.contextfilter
+def generate_members(ctx, kernel_info, parameters_to_ignore=None, only_fields=False):
+    if parameters_to_ignore is None:
+        parameters_to_ignore = []
+
     ast = kernel_info.ast
-    parameters_to_ignore += kernel_info.temporary_fields
+    fields = {f.name: f for f in ast.fields_accessed}
+    params_to_skip = parameters_to_ignore + kernel_info.temporary_fields
+    is_gpu = ctx['target'] == 'gpu'
 
     result = []
     for param in ast.parameters:
-        if param.is_field_ptr_argument and param.field_name not in parameters_to_ignore:
+        if only_fields and not param.is_field_argument:
+            continue
+
+        if param.is_field_ptr_argument and param.field_name not in params_to_skip:
             result.append("BlockDataID %sID;" % (param.field_name, ))
-        elif not param.is_field_argument and param.name not in parameters_to_ignore:
+        elif not param.is_field_argument and param.name not in params_to_skip:
             result.append("%s %s;" % (param.dtype, param.name,))
+
+    for field_name in kernel_info.temporary_fields:
+        f = fields[field_name]
+        if field_name in parameters_to_ignore:
+            continue
+        assert field_name.endswith('_tmp')
+        original_field_name = field_name[:-len('_tmp')]
+        f_size = get_field_fsize(f)
+        field_type = make_field_type(get_base_type(f.dtype), f_size, is_gpu)
+        result.append(temporary_fieldMemberTemplate.format(type=field_type, original_field_name=original_field_name))
+
     return "\n".join(result)
 
 
diff --git a/sweep.py b/sweep.py
index b9b2373..87354a2 100644
--- a/sweep.py
+++ b/sweep.py
@@ -51,7 +51,8 @@ class Sweep:
 
         func = functools.partial(kernel_decorator, sweep_function, sweep=sweep)
         cb = functools.partial(Sweep._generate_header_and_source, func, name, target,
-                               namespace, sweep._temporary_fields, sweep._field_swaps, optimization=optimization)
+                               namespace, sweep._temporary_fields, sweep._field_swaps, optimization=optimization,
+                               staggered=False, varying_parameters=[])
 
         file_names = [name + ".h", name + ('.cpp' if target == 'cpu' else '.cu')]
         codegen.register(file_names, cb)
@@ -95,11 +96,14 @@ class Sweep:
         env = Environment(loader=PackageLoader('pystencils_walberla'))
         add_pystencils_filters_to_jinja_env(env)
 
+        representative_field = min(p.field_name for p in ast.parameters if p.is_field_argument)  # deterministic pick
+
         context = {
             'kernel': KernelInfo(ast, temporary_fields, field_swaps, varying_parameters),
             'namespace': namespace,
             'class_name': ast.function_name[0].upper() + ast.function_name[1:],
             'target': target,
+            'field': representative_field,
         }
 
         header = env.get_template("Sweep.tmpl.h").render(**context)
diff --git a/templates/Sweep.tmpl.cpp b/templates/Sweep.tmpl.cpp
index b19175a..ddb43db 100644
--- a/templates/Sweep.tmpl.cpp
+++ b/templates/Sweep.tmpl.cpp
@@ -49,10 +49,74 @@ namespace {{namespace}} {
 void {{class_name}}::operator() ( IBlock * block )
 {
     {{kernel|generate_block_data_to_field_extraction|indent(4)}}
-    {{kernel|generate_call|indent(4)}}
+    {{kernel|generate_call(stream='stream_')|indent(4)}}
     {{kernel|generate_swaps|indent(4)}}
 }
 
+
+
+void {{class_name}}::inner( IBlock * block )
+{
+    {{kernel|generate_block_data_to_field_extraction|indent(4)}}
+
+    CellInterval inner = {{field}}->xyzSize();
+    inner.expand(-1); // exclude the outermost cell layer, which is handled by outer()
+
+    {{kernel|generate_call(stream='stream_', cell_interval='inner')|indent(4)}}
+}
+
+
+void {{class_name}}::outer( IBlock * block )
+{
+    static std::vector<CellInterval> layers;
+    {%if target is equalto 'gpu'%}
+    static std::vector<cudaStream_t> streams;
+    {% endif %}
+
+    {{kernel|generate_block_data_to_field_extraction|indent(4)}}
+
+    if( layers.size() == 0 )
+    {
+        CellInterval ci;
+
+        {{field}}->getSliceBeforeGhostLayer(stencil::T, ci, 1, false);
+        layers.push_back(ci);
+        {{field}}->getSliceBeforeGhostLayer(stencil::B, ci, 1, false);
+        layers.push_back(ci);
+
+        {{field}}->getSliceBeforeGhostLayer(stencil::N, ci, 1, false);
+        ci.expand(Cell(0, 0, -1)); // skip cells already covered by the T/B layers
+        layers.push_back(ci);
+        {{field}}->getSliceBeforeGhostLayer(stencil::S, ci, 1, false);
+        ci.expand(Cell(0, 0, -1)); // skip cells already covered by the T/B layers
+        layers.push_back(ci);
+
+        {{field}}->getSliceBeforeGhostLayer(stencil::E, ci, 1, false);
+        ci.expand(Cell(0, -1, -1)); // skip cells already covered by the T/B and N/S layers
+        layers.push_back(ci);
+        {{field}}->getSliceBeforeGhostLayer(stencil::W, ci, 1, false);
+        ci.expand(Cell(0, -1, -1)); // skip cells already covered by the T/B and N/S layers
+        layers.push_back(ci);
+
+        {%if target is equalto 'gpu'%}
+        for( uint_t i = 0; i < layers.size(); ++i )
+        {
+            streams.push_back(cudaStream_t());
+            WALBERLA_CUDA_CHECK( cudaStreamCreate(&streams.back() ) );
+        }
+        {% endif %}
+    }
+
+    for( uint_t i = 0; i < layers.size(); ++i )
+    {
+        {{kernel|generate_call(stream='streams[i]', cell_interval="layers[i]")|indent(8)}}
+        {%if target is equalto 'gpu'%}WALBERLA_CUDA_CHECK( cudaStreamSynchronize(streams[i]) ); // TODO move out when no memcpy is needed to set up the call{% endif %}
+    }
+
+    {{kernel|generate_swaps|indent(4)}}
+}
+
+
 } // namespace {{namespace}}
 } // namespace walberla
 
diff --git a/templates/Sweep.tmpl.h b/templates/Sweep.tmpl.h
index 6639cec..bb687f9 100644
--- a/templates/Sweep.tmpl.h
+++ b/templates/Sweep.tmpl.h
@@ -50,14 +50,20 @@ namespace {{namespace}} {
 class {{class_name}}
 {
 public:
-    {{class_name}}( {{kernel|generate_constructor_parameters}} )
-        : {{ kernel|generate_constructor_initializer_list }}
+    {{class_name}}( {{kernel|generate_constructor_parameters}}{%if target is equalto 'gpu'%} , cudaStream_t stream = 0{% endif %})
+        : {{ kernel|generate_constructor_initializer_list }}{%if target is equalto 'gpu'%}, stream_(stream){% endif %}
     {};
 
     void operator() ( IBlock * block );
 
+    void inner( IBlock * block );
+    void outer( IBlock * block );
+
 private:
     {{kernel|generate_members|indent(4)}}
+    {%if target is equalto 'gpu'%}
+    cudaStream_t stream_;
+    {% endif %}
 };
 
 
-- 
GitLab