From 495e92b9c2c88ed0cfcde350bbc7b45dae875d11 Mon Sep 17 00:00:00 2001
From: et31efoj <markus.holzer@fau.de>
Date: Thu, 7 Feb 2019 18:09:20 +0100
Subject: [PATCH] Implemented an improved version of the communication for
 periodic BC

The communication of the ghost layers used to communicate just all
values in between one time step to make sure that everything is correct.
Furthermore the communication was only valid for pull stream steps.
The improved communication distinguishes automatically between pull and
push and communicates only values which are needed. With this improvement
it was possible to implement the EsoTwist streaming scheme.
---
 pystencils/boundaries/boundaryconditions.py   |  2 +-
 .../datahandling/serial_datahandling.py       | 35 +++++--
 pystencils/gpucuda/periodicity.py             | 93 ++++++++++++++++---
 pystencils/transformations.py                 | 13 +--
 pystencils_tests/test_cudagpu.py              |  2 +-
 5 files changed, 117 insertions(+), 28 deletions(-)

diff --git a/pystencils/boundaries/boundaryconditions.py b/pystencils/boundaries/boundaryconditions.py
index 10870c4cd..6628be820 100644
--- a/pystencils/boundaries/boundaryconditions.py
+++ b/pystencils/boundaries/boundaryconditions.py
@@ -1,6 +1,6 @@
-from typing import List, Tuple, Any
 from pystencils import Assignment
 from pystencils.boundaries.boundaryhandling import BoundaryOffsetInfo
+from typing import List, Tuple, Any
 from pystencils.data_types import create_type
 
 
diff --git a/pystencils/datahandling/serial_datahandling.py b/pystencils/datahandling/serial_datahandling.py
index 7e4d19cd8..dd2b70b89 100644
--- a/pystencils/datahandling/serial_datahandling.py
+++ b/pystencils/datahandling/serial_datahandling.py
@@ -250,7 +250,14 @@ class SerialDataHandling(DataHandling):
     def synchronization_function_gpu(self, names, stencil_name=None, **_):
         return self.synchronization_function(names, stencil_name, 'gpu')
 
-    def synchronization_function(self, names, stencil=None, target=None, **_):
+    def get_communication_before_sweep(self, names, stencil_name=None, update_rule=None, **_):
+        return self.synchronization_function(names, stencil_name, self.default_target, update_rule, 'pull')
+
+    def get_communication_after_sweep(self, names, stencil_name=None, update_rule=None, **_):
+        return self.synchronization_function(names, stencil_name, self.default_target, update_rule, 'push')
+
+    def synchronization_function(self, names, stencil=None, target=None, update_rule=None,
+                                 reads_writes="pull", **_):
         if target is None:
             target = self.default_target
         assert target in ('cpu', 'gpu')
@@ -265,11 +272,11 @@ class SerialDataHandling(DataHandling):
         elif (stencil is None and self.dim == 3) or (stencil is not None and stencil.startswith('D3')):
             directions = itertools.product(*[neighbors] * 3)
         else:
-            raise ValueError("Invalid stencil")
+            directions = itertools.product(*[neighbors] * 1)
 
         for direction in directions:
             use_direction = True
-            if direction == (0, 0) or direction == (0, 0, 0):
+            if direction == (0,) or direction == (0, 0) or direction == (0, 0, 0):
                 use_direction = False
             for component, periodicity in zip(direction, self._periodicity):
                 if not periodicity and component != 0:
@@ -290,15 +297,31 @@ class SerialDataHandling(DataHandling):
 
             if len(filtered_stencil) > 0:
                 if target == 'cpu':
-                    from pystencils.slicing import get_periodic_boundary_functor
-                    result.append(get_periodic_boundary_functor(filtered_stencil, ghost_layers=gls))
+                    if update_rule is None:
+                        from pystencils.slicing import get_periodic_boundary_functor
+                        result.append(get_periodic_boundary_functor(filtered_stencil, ghost_layers=gls))
+                    else:
+                        from pystencils.gpucuda.periodicity import get_periodic_boundary_functor as boundary_func
+                        result.append(boundary_func(filtered_stencil, self._domainSize,
+                                                    index_dimensions=self.fields[name].index_dimensions,
+                                                    index_dim_shape=values_per_cell,
+                                                    dtype=self.fields[name].dtype.numpy_dtype,
+                                                    ghost_layers=gls,
+                                                    target='cpu',
+                                                    update_rule=update_rule,
+                                                    reads_writes=reads_writes))
+
                 else:
                     from pystencils.gpucuda.periodicity import get_periodic_boundary_functor as boundary_func
+
                     result.append(boundary_func(filtered_stencil, self._domainSize,
                                                 index_dimensions=self.fields[name].index_dimensions,
                                                 index_dim_shape=values_per_cell,
                                                 dtype=self.fields[name].dtype.numpy_dtype,
-                                                ghost_layers=gls))
+                                                ghost_layers=gls,
+                                                target='gpu',
+                                                update_rule=update_rule,
+                                                reads_writes=reads_writes))
 
         if target == 'cpu':
             def result_functor():
diff --git a/pystencils/gpucuda/periodicity.py b/pystencils/gpucuda/periodicity.py
index 5657d4618..ba6232322 100644
--- a/pystencils/gpucuda/periodicity.py
+++ b/pystencils/gpucuda/periodicity.py
@@ -1,11 +1,15 @@
 import numpy as np
-from pystencils import Field, Assignment
+from pystencils import Field, Assignment, AssignmentCollection
 from pystencils.slicing import normalize_slice, get_periodic_boundary_src_dst_slices
-from pystencils.gpucuda import make_python_function
-from pystencils.gpucuda.kernelcreation import create_cuda_kernel
 
+from pystencils.gpucuda import make_python_function as make_python_function_gpu
+from pystencils.cpu import make_python_function as make_python_function_cpu
+from pystencils import create_kernel
+from lbmpy.methods.abstractlbmethod import LbmCollisionRule
 
-def create_copy_kernel(domain_size, from_slice, to_slice, index_dimensions=0, index_dim_shape=1, dtype=np.float64):
+
+def create_copy_kernel(domain_size, from_slice, to_slice, stencil, index_dimensions=0, index_dim_shape=1,
+                       dtype=np.float64, target=None, update_rule_assignments=None, reads_writes="pull"):
     """Copies a rectangular part of an array to another non-overlapping part"""
     if index_dimensions not in (0, 1):
         raise NotImplementedError("Works only for one or zero index coordinates")
@@ -18,25 +22,86 @@ def create_copy_kernel(domain_size, from_slice, to_slice, index_dimensions=0, in
     assert offset == [s1.stop - s2.stop for s1, s2 in zip(normalized_from_slice, normalized_to_slice)], \
         "Slices have to have same size"
 
-    update_eqs = []
-    for i in range(index_dim_shape):
-        eq = Assignment(f(i), f[tuple(offset)](i))
-        update_eqs.append(eq)
+    update_eqs = set()
+    if update_rule_assignments is None:
+        for i in range(index_dim_shape):
+            eq = Assignment(f(i), f[tuple(offset)](i))
+            update_eqs.add(eq)
+    else:
+        for update_rule_assignment in update_rule_assignments:
+            if reads_writes == "pull":
+                field_accesses = update_rule_assignment.rhs.atoms(Field.Access)
+            else:
+                field_accesses = update_rule_assignment.lhs.atoms(Field.Access)
+
+            for field_access in field_accesses:
+                if 0 in stencil:
+                    for stencil_part, off in zip(stencil, field_access.offsets):
+                        if (stencil_part < 0 and off < 0) or (stencil_part > 0 and off > 0):
+                            if reads_writes == "pull":
+                                eq = Assignment(f(field_access.index[0]), f[tuple(offset)](field_access.index[0]))
+                                update_eqs.add(eq)
+                            if reads_writes == "push":
+                                eq = Assignment(f[tuple(offset)](field_access.index[0]), f(field_access.index[0]))
+                                update_eqs.add(eq)
+                else:
+                    if stencil == field_access.offsets:
+                        if reads_writes == "pull":
+                            eq = Assignment(f(field_access.index[0]), f[tuple(offset)](field_access.index[0]))
+                            update_eqs.add(eq)
+                        if reads_writes == "push":
+                            eq = Assignment(f[tuple(offset)](field_access.index[0]), f(field_access.index[0]))
+                            update_eqs.add(eq)
+
+    kernels = []
+    if len(update_eqs) > 0:
+        ast = create_kernel(list(update_eqs), iteration_slice=to_slice,
+                            skip_independence_check=True, target=target)
+        if target == 'cpu':
+            kernels.append(make_python_function_cpu(ast))
+        else:
+            kernels.append(make_python_function_gpu(ast))
 
-    ast = create_cuda_kernel(update_eqs, iteration_slice=to_slice, skip_independence_check=True)
-    return make_python_function(ast)
+    return kernels
 
 
 def get_periodic_boundary_functor(stencil, domain_size, index_dimensions=0, index_dim_shape=1, ghost_layers=1,
-                                  thickness=None, dtype=float):
+                                  thickness=None, dtype=float, target=None, update_rule=None, reads_writes="pull"):
+
+    # sort stencil entry's by number of zeros to make sure, that surfaces, edges and corners are
+    # communicated in that order. Otherwise values (e.g. corner values) will be overwritten
+    stencil.sort(key=number_of_zeros, reverse=True)
+
     src_dst_slice_tuples = get_periodic_boundary_src_dst_slices(stencil, ghost_layers, thickness)
     kernels = []
-    index_dimensions = index_dimensions
-    for src_slice, dst_slice in src_dst_slice_tuples:
-        kernels.append(create_copy_kernel(domain_size, src_slice, dst_slice, index_dimensions, index_dim_shape, dtype))
+
+    if isinstance(update_rule, Assignment):
+        update_rule_assignments = [update_rule]
+    elif isinstance(update_rule, (AssignmentCollection, LbmCollisionRule)):
+        update_rule_assignments = update_rule.all_assignments
+    else:
+        update_rule_assignments = None
+
+    # by default we only get the communication pattern for pull stream
+    if reads_writes is not "pull" or reads_writes is not "push":
+        assert "not a valid argument for the definition of read or/and write access"
+
+    for i, (src_slice, dst_slice) in enumerate(src_dst_slice_tuples):
+        for j in create_copy_kernel(domain_size, src_slice, dst_slice, stencil[i], index_dimensions,
+                                    index_dim_shape, dtype, target, update_rule_assignments, reads_writes):
+            kernels.append(j)
 
     def functor(pdfs, **_):
         for kernel in kernels:
             kernel(pdfs=pdfs)
 
     return functor
+
+
+def number_of_zeros(stencil_entry):
+    count_zeros = 0
+    for tmp in stencil_entry:
+        if tmp == 0:
+            count_zeros += 1
+
+    return count_zeros
diff --git a/pystencils/transformations.py b/pystencils/transformations.py
index 4490e3e1a..0346b7d93 100644
--- a/pystencils/transformations.py
+++ b/pystencils/transformations.py
@@ -175,12 +175,13 @@ def make_loop_over_domain(body, function_name, iteration_slice=None, ghost_layer
 
     if iteration_slice is not None:
         iteration_slice = normalize_slice(iteration_slice, shape)
-
-    if ghost_layers is None:
-        required_ghost_layers = max([fa.required_ghost_layers for fa in field_accesses])
-        ghost_layers = [(required_ghost_layers, required_ghost_layers)] * len(loop_order)
-    if isinstance(ghost_layers, int):
-        ghost_layers = [(ghost_layers, ghost_layers)] * len(loop_order)
+        ghost_layers = 0
+    else:
+        if ghost_layers is None:
+            required_ghost_layers = max([fa.required_ghost_layers for fa in field_accesses])
+            ghost_layers = [(required_ghost_layers, required_ghost_layers)] * len(loop_order)
+        if isinstance(ghost_layers, int):
+            ghost_layers = [(ghost_layers, ghost_layers)] * len(loop_order)
 
     current_body = body
     for i, loop_coordinate in enumerate(reversed(loop_order)):
diff --git a/pystencils_tests/test_cudagpu.py b/pystencils_tests/test_cudagpu.py
index 3afddb128..4a682526d 100644
--- a/pystencils_tests/test_cudagpu.py
+++ b/pystencils_tests/test_cudagpu.py
@@ -135,7 +135,7 @@ def test_periodicity():
     arr_gpu = gpuarray.to_gpu(arr_cpu)
 
     periodicity_stencil = [(1, 0), (-1, 0), (1, 1)]
-    periodic_gpu_kernel = periodic_gpu(periodicity_stencil, (5, 5), 1, 2)
+    periodic_gpu_kernel = periodic_gpu(periodicity_stencil, (5, 5), 1, 2, target='gpu')
     periodic_cpu_kernel = periodic_cpu(periodicity_stencil)
 
     cpu_result = np.copy(arr_cpu)
-- 
GitLab