From c9f3c097ed5bce9adcc3f257b55a500e4dd9b90f Mon Sep 17 00:00:00 2001
From: Rafael Ravedutti <rafaelravedutti@gmail.com>
Date: Fri, 1 Dec 2023 19:20:00 +0100
Subject: [PATCH] Pack, communicate and unpack everything together in
 synchronize()

Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
---
 runtime/domain/regular_6d_stencil.cpp |  46 +++++++--
 runtime/domain/regular_6d_stencil.hpp |   5 +
 runtime/pairs.cpp                     |  74 ++++++--------
 runtime/pairs.hpp                     |   6 ++
 src/pairs/sim/comm.py                 | 136 +++++++++++++++++++++++---
 5 files changed, 201 insertions(+), 66 deletions(-)

diff --git a/runtime/domain/regular_6d_stencil.cpp b/runtime/domain/regular_6d_stencil.cpp
index 4e7631f..f8f2472 100644
--- a/runtime/domain/regular_6d_stencil.cpp
+++ b/runtime/domain/regular_6d_stencil.cpp
@@ -118,7 +118,6 @@ void Regular6DStencil::communicateData(
     int dim, int elem_size,
     const real_t *send_buf, const int *send_offsets, const int *nsend,
     real_t *recv_buf, const int *recv_offsets, const int *nrecv) {
-    //std::cout << "communicateData" << std::endl;
 
     const real_t *send_prev = &send_buf[send_offsets[dim * 2 + 0] * elem_size];
     const real_t *send_next = &send_buf[send_offsets[dim * 2 + 1] * elem_size];
@@ -126,10 +125,6 @@ void Regular6DStencil::communicateData(
     real_t *recv_next = &recv_buf[recv_offsets[dim * 2 + 1] * elem_size];
 
     if(prev[dim] != rank) {
-        //std::cout << rank << ": send " << nsend[dim * 2 + 0] << " elems to " << prev[dim] << ", recv " << nrecv[dim * 2 + 0] << " elems from " << next[dim] << std::endl;
-        //MPI_Send(send_prev, nsend[dim * 2 + 0] * elem_size, MPI_DOUBLE, prev[dim], 0, MPI_COMM_WORLD);
-        //MPI_Recv(recv_prev, nrecv[dim * 2 + 0] * elem_size, MPI_DOUBLE, next[dim], 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-
         MPI_Sendrecv(
             send_prev, nsend[dim * 2 + 0] * elem_size, MPI_DOUBLE, prev[dim], 0,
             recv_prev, nrecv[dim * 2 + 0] * elem_size, MPI_DOUBLE, next[dim], 0,
@@ -142,10 +137,6 @@ void Regular6DStencil::communicateData(
     }
 
     if(next[dim] != rank) {
-        //std::cout << rank << ": send " << nsend[dim * 2 + 1] << " elems to " << next[dim] << ", recv " << nrecv[dim * 2 + 1] << " elems from " << prev[dim] << std::endl;
-        //MPI_Send(send_next, nsend[dim * 2 + 1] * elem_size, MPI_DOUBLE, next[dim], 0, MPI_COMM_WORLD);
-        //MPI_Recv(recv_next, nrecv[dim * 2 + 1] * elem_size, MPI_DOUBLE, prev[dim], 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-
         MPI_Sendrecv(
             send_next, nsend[dim * 2 + 1] * elem_size, MPI_DOUBLE, next[dim], 0,
             recv_next, nrecv[dim * 2 + 1] * elem_size, MPI_DOUBLE, prev[dim], 0,
@@ -158,4 +149,41 @@ void Regular6DStencil::communicateData(
     }
 }
 
+void Regular6DStencil::communicateAllData(
+    int ndims, int elem_size,
+    const real_t *send_buf, const int *send_offsets, const int *nsend,
+    real_t *recv_buf, const int *recv_offsets, const int *nrecv) {
+
+    for(int d = 0; d < ndims; d++) {
+        const real_t *send_prev = &send_buf[send_offsets[d * 2 + 0] * elem_size];
+        const real_t *send_next = &send_buf[send_offsets[d * 2 + 1] * elem_size];
+        real_t *recv_prev = &recv_buf[recv_offsets[d * 2 + 0] * elem_size];
+        real_t *recv_next = &recv_buf[recv_offsets[d * 2 + 1] * elem_size];
+
+        if(prev[d] != rank) {
+            MPI_Sendrecv(
+                send_prev, nsend[d * 2 + 0] * elem_size, MPI_DOUBLE, prev[d], 0,
+                recv_prev, nrecv[d * 2 + 0] * elem_size, MPI_DOUBLE, next[d], 0,
+                MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+
+        } else {
+            for(int i = 0; i < nsend[d * 2 + 0] * elem_size; i++) {
+                recv_prev[i] = send_prev[i];
+            }
+        }
+
+        if(next[d] != rank) {
+            MPI_Sendrecv(
+                send_next, nsend[d * 2 + 1] * elem_size, MPI_DOUBLE, next[d], 0,
+                recv_next, nrecv[d * 2 + 1] * elem_size, MPI_DOUBLE, prev[d], 0,
+                MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+
+        } else {
+            for(int i = 0; i < nsend[d * 2 + 1] * elem_size; i++) {
+                recv_next[i] = send_next[i];
+            }
+        }
+    }
+}
+
 }
diff --git a/runtime/domain/regular_6d_stencil.hpp b/runtime/domain/regular_6d_stencil.hpp
index 1c12ff4..8a43b2d 100644
--- a/runtime/domain/regular_6d_stencil.hpp
+++ b/runtime/domain/regular_6d_stencil.hpp
@@ -54,6 +54,11 @@ public:
         int dim, int elem_size,
         const real_t *send_buf, const int *send_offsets, const int *nsend,
         real_t *recv_buf, const int *recv_offsets, const int *nrecv);
+
+    void communicateAllData(
+        int ndims, int elem_size,
+        const real_t *send_buf, const int *send_offsets, const int *nsend,
+        real_t *recv_buf, const int *recv_offsets, const int *nrecv);
 };
 
 }
diff --git a/runtime/pairs.cpp b/runtime/pairs.cpp
index 9ec1f23..93c714c 100644
--- a/runtime/pairs.cpp
+++ b/runtime/pairs.cpp
@@ -305,7 +305,6 @@ void PairsSimulation::communicateSizes(int dim, const int *send_sizes, int *recv
     array_flags->setHostFlag(nrecv_id);
     array_flags->clearDeviceFlag(nrecv_id);
     this->getDomainPartitioner()->communicateSizes(dim, send_sizes, recv_sizes);
-    //PAIRS_DEBUG("send_sizes=[%d, %d], recv_sizes=[%d, %d]\n", send_sizes[dim * 2 + 0], send_sizes[dim * 2 + 1], recv_sizes[dim * 2 + 0], recv_sizes[dim * 2 + 1]);
 }
 
 void PairsSimulation::communicateData(
@@ -342,59 +341,42 @@ void PairsSimulation::communicateData(
         dim, elem_size, send_buf, send_offsets, nsend, recv_buf, recv_offsets, nrecv);
 
     copyArrayToDevice(recv_buf_id, Ignore, nrecv_all * elem_size * sizeof(real_t));
+}
 
-    /*
-    // Debug messages
-    const int elems_to_print = 5;
-
-    // Send buffer debug
-    for(int i = 0; i < 2; i++) {
-        int nsnd = nsend[dim * 2 + i];
-
-        PAIRS_DEBUG("send_buf=[");
-        for(int j = 0; j < MIN(elems_to_print, nsnd); j++) {
-            for(int k = 0; k < elem_size; k++) {
-                PAIRS_DEBUG("%f,", send_buf[(send_offsets[dim * 2 + i] + j) * elem_size + k]);
-            }
-        }
+void PairsSimulation::communicateAllData(
+    int ndims, int elem_size,
+    const real_t *send_buf, const int *send_offsets, const int *nsend,
+    real_t *recv_buf, const int *recv_offsets, const int *nrecv) {
 
-        if(elems_to_print * 2 < nsnd) {
-            PAIRS_DEBUG("\b ... ");
-        }
+    auto send_buf_id = getArrayByHostPointer(send_buf).getId();
+    auto recv_buf_id = getArrayByHostPointer(recv_buf).getId();
+    auto send_offsets_id = getArrayByHostPointer(send_offsets).getId();
+    auto recv_offsets_id = getArrayByHostPointer(recv_offsets).getId();
+    auto nsend_id = getArrayByHostPointer(nsend).getId();
+    auto nrecv_id = getArrayByHostPointer(nrecv).getId();
 
-        for(int j = MAX(elems_to_print, nsnd - elems_to_print); j < nsnd; j++) {
-            for(int k = 0; k < elem_size; k++) {
-                PAIRS_DEBUG("%f,", send_buf[(send_offsets[dim * 2 + i] + j) * elem_size + k]);
-            }
-        }
+    copyArrayToHost(send_offsets_id, ReadOnly);
+    copyArrayToHost(recv_offsets_id, ReadOnly);
+    copyArrayToHost(nsend_id, ReadOnly);
+    copyArrayToHost(nrecv_id, ReadOnly);
 
-        PAIRS_DEBUG("\b]\n");
+    int nsend_all = 0;
+    int nrecv_all = 0;
+    for(int d = 0; d <= ndims; d++) {
+        nsend_all += nsend[d * 2 + 0];
+        nsend_all += nsend[d * 2 + 1];
+        nrecv_all += nrecv[d * 2 + 0];
+        nrecv_all += nrecv[d * 2 + 1];
     }
 
-    // Receive buffer debug
-    for(int i = 0; i < 2; i++) {
-        int nrec = nrecv[dim * 2 + i];
-
-        PAIRS_DEBUG("recv_buf=[");
-        for(int j = 0; j < MIN(elems_to_print, nrec); j++) {
-            for(int k = 0; k < elem_size; k++) {
-                PAIRS_DEBUG("%f,", recv_buf[(recv_offsets[dim * 2 + i] + j) * elem_size + k]);
-            }
-        }
-
-        if(elems_to_print * 2 < nrec) {
-            PAIRS_DEBUG("\b ... ");
-        }
+    copyArrayToHost(send_buf_id, Ignore, nsend_all * elem_size * sizeof(real_t));
 
-        for(int j = MAX(elems_to_print, nrec - elems_to_print); j < nrec; j++) {
-            for(int k = 0; k < elem_size; k++) {
-                PAIRS_DEBUG("%f,", recv_buf[(recv_offsets[dim * 2 + i] + j) * elem_size + k]);
-            }
-        }
+    array_flags->setHostFlag(recv_buf_id);
+    array_flags->clearDeviceFlag(recv_buf_id);
+    this->getDomainPartitioner()->communicateAllData(
+        ndims, elem_size, send_buf, send_offsets, nsend, recv_buf, recv_offsets, nrecv);
 
-        PAIRS_DEBUG("\b]\n");
-    }
-    */
+    copyArrayToDevice(recv_buf_id, Ignore, nrecv_all * elem_size * sizeof(real_t));
 }
 
 void PairsSimulation::fillCommunicationArrays(int *neighbor_ranks, int *pbc, real_t *subdom) {
diff --git a/runtime/pairs.hpp b/runtime/pairs.hpp
index ba987e2..0cfbefb 100644
--- a/runtime/pairs.hpp
+++ b/runtime/pairs.hpp
@@ -229,11 +229,17 @@ public:
 
     Regular6DStencil *getDomainPartitioner() { return dom_part; }
     void communicateSizes(int dim, const int *send_sizes, int *recv_sizes);
+
     void communicateData(
         int dim, int elem_size,
         const real_t *send_buf, const int *send_offsets, const int *nsend,
         real_t *recv_buf, const int *recv_offsets, const int *nrecv);
 
+    void communicateAllData(
+        int ndims, int elem_size,
+        const real_t *send_buf, const int *send_offsets, const int *nsend,
+        real_t *recv_buf, const int *recv_offsets, const int *nrecv);
+
     void fillCommunicationArrays(int neighbor_ranks[], int pbc[], real_t subdom[]);
 
     // Device functions
diff --git a/src/pairs/sim/comm.py b/src/pairs/sim/comm.py
index 242c887..3f552ed 100644
--- a/src/pairs/sim/comm.py
+++ b/src/pairs/sim/comm.py
@@ -41,18 +41,31 @@ class Comm:
     def synchronize(self):
         # Every property that is not constant across timesteps and have neighbor accesses during any
         # interaction kernel (i.e. property[j] in force calculation kernel)
-        prop_list = [self.sim.property(p) for p in ['position', 'linear_velocity', 'angular_velocity'] if self.sim.property(p) is not None]
-        for step in range(self.dom_part.number_of_steps()):
-            PackGhostParticles(self, step, prop_list)
-            CommunicateData(self, step, prop_list)
-            UnpackGhostParticles(self, step, prop_list)
+        prop_names = ['position', 'linear_velocity', 'angular_velocity']
+        prop_list = [ self.sim.property(p) for p in prop_names if self.sim.property(p) is not None]
+
+        PackAllGhostParticles(self, prop_list)
+        CommunicateAllData(self, prop_list)
+        UnpackAllGhostParticles(self, prop_list)
 
     @pairs_inline
     def borders(self):
         # Every property that has neighbor accesses during any interaction kernel (i.e. property[j]
         # exists in any force calculation kernel)
-        # We ignore normal because there should be no halfspace ghosts
-        prop_list = [self.sim.property(p) for p in ['uid', 'type', 'mass', 'radius', 'position', 'linear_velocity', 'angular_velocity', 'shape'] if self.sim.property(p) is not None]
+        # We ignore normal because there should be no ghost half-spaces
+        prop_names = [
+            'uid',
+            'type',
+            'mass',
+            'radius',
+            'position',
+            'linear_velocity',
+            'angular_velocity',
+            'shape'
+        ]
+
+        prop_list = [self.sim.property(p) for p in prop_names if self.sim.property(p) is not None]
+
         Assign(self.sim, self.nsend_all, 0)
         Assign(self.sim, self.sim.nghost, 0)
 
@@ -75,12 +88,15 @@ class Comm:
             PackGhostParticles(self, step, prop_list)
             CommunicateData(self, step, prop_list)
             UnpackGhostParticles(self, step, prop_list)
-            Assign(self.sim, self.sim.nghost, self.sim.nghost + sum([self.nrecv[j] for j in self.dom_part.step_indexes(step)]))
+
+            step_nrecv = sum([self.nrecv[j] for j in self.dom_part.step_indexes(step)])
+            Assign(self.sim, self.sim.nghost, self.sim.nghost + step_nrecv)
 
     @pairs_inline
     def exchange(self):
         # Every property except volatiles
         prop_list = self.sim.properties.non_volatiles()
+
         for step in range(self.dom_part.number_of_steps()):
             Assign(self.sim, self.nsend_all, 0)
             Assign(self.sim, self.sim.nghost, 0)
@@ -130,9 +146,31 @@ class CommunicateData(Lowerable):
     @pairs_inline
     def lower(self):
         elem_size = sum([Types.number_of_elements(self.sim, p.type()) for p in self.prop_list])
-        Call_Void(self.sim, "pairs->communicateData", [self.step, elem_size,
-                                                       self.comm.send_buffer, self.comm.send_offsets, self.comm.nsend,
-                                                       self.comm.recv_buffer, self.comm.recv_offsets, self.comm.nrecv])
+
+        Call_Void(self.sim,
+                  "pairs->communicateData",
+                  [self.step, elem_size,
+                   self.comm.send_buffer, self.comm.send_offsets, self.comm.nsend,
+                   self.comm.recv_buffer, self.comm.recv_offsets, self.comm.nrecv])
+
+
+class CommunicateAllData(Lowerable):
+    def __init__(self, comm, prop_list):
+        super().__init__(comm.sim)
+        self.comm = comm
+        self.prop_list = prop_list
+        self.sim.add_statement(self)
+
+    @pairs_inline
+    def lower(self):
+        elem_size = sum([Types.number_of_elements(self.sim, p.type()) for p in self.prop_list])
+
+        Call_Void(
+            self.sim,
+            "pairs->communicateAllData",
+            [self.comm.dom_part.number_of_steps(), elem_size,
+             self.comm.send_buffer, self.comm.send_offsets, self.comm.nsend,
+             self.comm.recv_buffer, self.comm.recv_offsets, self.comm.nrecv])
 
 
 class DetermineGhostParticles(Lowerable):
@@ -281,6 +319,82 @@ class UnpackGhostParticles(Lowerable):
                     p_offset += 1
 
 
+class PackAllGhostParticles(Lowerable):
+    def __init__(self, comm, prop_list):
+        super().__init__(comm.sim)
+        self.comm = comm
+        self.prop_list = prop_list
+        self.sim.add_statement(self)
+
+    def get_elems_per_particle(self):
+        return sum([Types.number_of_elements(self.sim, p.type()) for p in self.prop_list])
+
+    @pairs_device_block
+    def lower(self):
+        send_buffer = self.comm.send_buffer
+        send_buffer.set_stride(1, self.get_elems_per_particle())
+        send_map = self.comm.send_map
+        send_mult = self.comm.send_mult
+        self.sim.module_name(f"pack_all_ghost_particles" + "_".join([str(p.id()) for p in self.prop_list]))
+
+        for i in For(self.sim, 0, self.comm.nsend_all):
+            p_offset = 0
+            m = send_map[i]
+            for p in self.prop_list:
+                if not Types.is_scalar(p.type()):
+                    nelems = Types.number_of_elements(self.sim, p.type())
+                    for e in range(nelems):
+                        src = p[m][e]
+                        if p == self.sim.position():
+                            src += send_mult[i][e] * self.sim.grid.length(e)
+
+                        Assign(self.sim, send_buffer[i][p_offset + e], src)
+
+                    p_offset += nelems
+
+                else:
+                    cast_fn = lambda x: Cast(self.sim, x, Types.Real) if p.type() != Types.Real else x
+                    Assign(self.sim, send_buffer[i][p_offset], cast_fn(p[m]))
+                    p_offset += 1
+
+
+class UnpackAllGhostParticles(Lowerable):
+    def __init__(self, comm, prop_list):
+        super().__init__(comm.sim)
+        self.comm = comm
+        self.prop_list = prop_list
+        self.sim.add_statement(self)
+
+    def get_elems_per_particle(self):
+        return sum([Types.number_of_elements(self.sim, p.type()) for p in self.prop_list])
+
+    @pairs_device_block
+    def lower(self):
+        nlocal = self.sim.nlocal
+        dom_part = self.comm.dom_part
+        recv_buffer = self.comm.recv_buffer
+        recv_buffer.set_stride(1, self.get_elems_per_particle())
+        self.sim.module_name(f"unpack_all_ghost_particles" + "_".join([str(p.id()) for p in self.prop_list]))
+
+        nrecv_size = sum([len(dom_part.step_indexes(s)) for s in range(dom_part.number_of_steps())])
+        nrecv_all = sum([self.comm.nrecv[j] for j in range(nrecv_size)])
+
+        for i in For(self.sim, 0, nrecv_all):
+            p_offset = 0
+            for p in self.prop_list:
+                if not Types.is_scalar(p.type()):
+                    nelems = Types.number_of_elements(self.sim, p.type())
+                    for e in range(nelems):
+                        Assign(self.sim, p[nlocal + i][e], recv_buffer[i][p_offset + e])
+
+                    p_offset += nelems
+
+                else:
+                    cast_fn = lambda x: Cast(self.sim, x, p.type()) if p.type() != Types.Real else x
+                    Assign(self.sim, p[nlocal + i], cast_fn(recv_buffer[i][p_offset]))
+                    p_offset += 1
+
+
 class RemoveExchangedParticles_part1(Lowerable):
     def __init__(self, comm):
         super().__init__(comm.sim)
-- 
GitLab