diff --git a/CMakeLists.txt b/CMakeLists.txt
index e5e64f58adc9305ca50a519910012e440bf6a805..b7fd6a678cd6af86a50c3e3daf7fa029936af84e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -28,7 +28,7 @@ option(ENABLE_GPU_DIRECT "ENABLE_GPU_DIRECT" ON)
 option(GENERATE_WHOLE_PROGRAM "GENERATE_WHOLE_PROGRAM" OFF)
 
 if(USE_WALBERLA)
-    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_WALBERLA_LOAD_BALANCING ")
+    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_WALBERLA_LOAD_BALANCING -DUSE_WALBERLA")
     find_package(waLBerla REQUIRED)
     add_subdirectory(${walberla_SOURCE_DIR} ${walberla_BINARY_DIR} EXCLUDE_FROM_ALL)
     waLBerla_import()
@@ -39,7 +39,6 @@ set(GEN_HEADER_DIR ${CMAKE_CURRENT_BINARY_DIR}/runtime/interfaces)
 
 set(RUNTIME_COMMON_FILES
     runtime/pairs.cpp
-    # runtime/boundary_weights.cpp  # avoid compiling this for now. TODO: generate the host and device functions
     runtime/copper_fcc_lattice.cpp
     runtime/create_body.cpp
     runtime/dem_sc_grid.cpp
@@ -49,8 +48,9 @@ set(RUNTIME_COMMON_FILES
     runtime/timing.cpp
     runtime/vtk.cpp
     runtime/domain/regular_6d_stencil.cpp)
 
 set(RUNTIME_WALBERLA_FILES
+    # runtime/boundary_weights.cpp  # avoid compiling this for now. TODO: generate the host and device functions
     runtime/domain/block_forest.cpp)
 
 set(PYTHON_EXECUTABLE ${PYTHON_EXECUTABLE} CACHE STRING "Python executable not set.")
diff --git a/examples/dem_sd.py b/examples/dem_sd.py
index 5bea2fd80c594e2da27cf99393804b9d5cd40c86..d1a78d946e990dec7f4e55d3b8878efef3b65077 100644
--- a/examples/dem_sd.py
+++ b/examples/dem_sd.py
@@ -115,8 +115,8 @@ psim.add_property('linear_velocity', pairs.vector())
 psim.add_property('angular_velocity', pairs.vector())
 psim.add_property('force', pairs.vector(), volatile=True)
 psim.add_property('torque', pairs.vector(), volatile=True)
-# psim.add_property('hydrodynamic_force', pairs.vector())
-# psim.add_property('hydrodynamic_torque', pairs.vector())
+psim.add_property('hydrodynamic_force', pairs.vector(), reduce=True)
+psim.add_property('hydrodynamic_torque', pairs.vector(), reduce=True)
 # psim.add_property('old_hydrodynamic_force', pairs.vector())
 # psim.add_property('old_hydrodynamic_torque', pairs.vector())
 psim.add_property('radius', pairs.real(), 1.0)
@@ -130,12 +130,12 @@ psim.add_feature_property('type', 'friction_dynamic', pairs.real(), [frictionDyn
 
 psim.set_domain([0.0, 0.0, 0.0, domainSize_SI[0], domainSize_SI[1], domainSize_SI[2]])
 # psim.set_domain_partitioner(pairs.block_forest(), initDomainFromWalberla=True)
-# psim.set_domain_partitioner(pairs.block_forest())
-psim.set_domain_partitioner(pairs.regular_domain_partitioner_xy())
+psim.set_domain_partitioner(pairs.block_forest())
+# psim.set_domain_partitioner(pairs.regular_domain_partitioner())
 psim.pbc([False, False, False])
-psim.dem_sc_grid(
-    domainSize_SI[0], domainSize_SI[1], domainSize_SI[2], generationSpacing_SI,
-    diameter_SI, minDiameter_SI, maxDiameter_SI, initialVelocity_SI, densityParticle_SI, ntypes)
+# psim.dem_sc_grid(
+#     domainSize_SI[0], domainSize_SI[1], domainSize_SI[2], generationSpacing_SI,
+#     diameter_SI, minDiameter_SI, maxDiameter_SI, initialVelocity_SI, densityParticle_SI, ntypes)
 
 #psim.read_particle_data(
 #    "data/spheres.input",
diff --git a/examples/main.cpp b/examples/main.cpp
index a663ba8157b13cfb42610a4cdf3c8db59570ac30..5f8473b5e5c7fee202399084ef44b92ec3851c8a 100644
--- a/examples/main.cpp
+++ b/examples/main.cpp
@@ -1,41 +1,32 @@
 #include <iostream>
 //---
 #include "dem_sd.hpp"
-// #include "core/mpi/Broadcast.h"
-// #include <blockforest/BlockForest.h>
-// #include <blockforest/Initialization.h>
-// #include <blockforest/loadbalancing/DynamicCurve.h>
-// #include <blockforest/loadbalancing/DynamicDiffusive.h>
-// #include <blockforest/loadbalancing/DynamicParMetis.h>
-// #include <blockforest/loadbalancing/InfoCollection.h>
-// #include <blockforest/loadbalancing/PODPhantomData.h>
-// #include <blockforest/loadbalancing/level_determination/MinMaxLevelDetermination.h>
-// #include <blockforest/loadbalancing/weight_assignment/MetisAssignmentFunctor.h>
-// #include <blockforest/loadbalancing/weight_assignment/WeightAssignmentFunctor.h>
+#include <blockforest/BlockForest.h>
+#include <blockforest/Initialization.h>
 
 int main(int argc, char **argv) {
     auto pairs_sim = std::make_shared<PairsSimulation>();
-    // auto pairs_acc = std::make_shared<PairsAccessor>(pairs_sim);
+    auto pairs_acc = std::make_shared<PairsAccessor>(pairs_sim);
 
     // Create forest (make sure to use_domain(forest)) ----------------------------------------------
-    // walberla::math::AABB domain(0, 0, 0, 0.1, 0.1, 0.1);
-    // std::shared_ptr<walberla::mpi::MPIManager> mpiManager = walberla::mpi::MPIManager::instance();
-    // mpiManager->initializeMPI(&argc, &argv);
-    // mpiManager->useWorldComm();
-    // auto rank = mpiManager->rank();
-    // auto procs = mpiManager->numProcesses();
-    // auto block_config = walberla::Vector3<int>(2, 2, 1);
-    // auto ref_level = 0;
-    // std::shared_ptr<walberla::BlockForest> forest = walberla::blockforest::createBlockForest(
-    //         domain, block_config, walberla::Vector3<bool>(false, false, false), procs, ref_level);
+    walberla::math::AABB domain(0, 0, 0, 0.1, 0.1, 0.1);
+    std::shared_ptr<walberla::mpi::MPIManager> mpiManager = walberla::mpi::MPIManager::instance();
+    mpiManager->initializeMPI(&argc, &argv);
+    mpiManager->useWorldComm();
+    auto rank = mpiManager->rank();
+    auto procs = mpiManager->numProcesses();
+    auto block_config = walberla::Vector3<int>(2, 2, 2);
+    auto ref_level = 0;
+    std::shared_ptr<walberla::BlockForest> forest = walberla::blockforest::createBlockForest(
+            domain, block_config, walberla::Vector3<bool>(false, false, false), procs, ref_level);
     //-----------------------------------------------------------------------------------------------
 
     // initialize pairs data structures ----------------------------------------------
     pairs_sim->initialize();
 
     // either create new domain or use an existing one ----------------------------------------
-    pairs_sim->create_domain(argc, argv);
-    // pairs_sim->use_domain(forest);
+    // pairs_sim->create_domain(argc, argv);
+    pairs_sim->use_domain(forest);
 
     // create planes and particles ------------------------------------------------------------
     pairs_sim->create_halfspace(0,     0,      0,      1, 0, 0,        0, 13);
@@ -45,21 +36,23 @@ int main(int argc, char **argv) {
     pairs_sim->create_halfspace(0.1,   0.1,    0.1,    0, -1, 0,       0, 13);
     pairs_sim->create_halfspace(0.1,   0.1,    0.1,    0, 0, -1,       0, 13);
 
-    pairs::id_t pUid = pairs_sim->create_sphere(0.03,   0.03,   0.08,   0.5, 0.5, 0 ,   1000, 0.0045, 0, 0);
-    pairs_sim->create_sphere(0.07,   0.07,   0.08,   -0.5, -0.5, 0 , 1000, 0.0045, 0, 0);
+    pairs::id_t pUid = pairs_sim->create_sphere(0.0499,   0.0499,   0.07,   0.5, 0.5, 0 ,   1000, 0.0045, 0, 0);
+    pairs::id_t pUid2 = pairs_sim->create_sphere(0.0499,   0.0499,   0.0499,   0.5, 0.5, 0 ,   1000, 0.0045, 0, 0);
 
     // Tracking a particle ------------------------------------------------------------------------
     // if (pUid != pairs_acc->getInvalidUid()){
     //     std::cout<< "Particle " << pUid << " is created in rank " << rank << std::endl;
     // }
     
-    // walberla::mpi::allReduceInplace(pUid, walberla::mpi::SUM);
+    walberla::mpi::allReduceInplace(pUid, walberla::mpi::SUM);
+    walberla::mpi::allReduceInplace(pUid2, walberla::mpi::SUM);
 
     // if (pUid != pairs_acc->getInvalidUid()){
     //     std::cout<< "Particle " << pUid << " will be tracked by rank " << rank << std::endl;
     // }
 
-    // auto pIsInMyRank = [&](pairs::id_t uid){return pairs_acc->uidToIdx(uid) != pairs_acc->getInvalidIdx();};
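+    // Helpers to check whether a given uid is currently on this rank as a local particle or as a ghost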
+    auto pIsLocalInMyRank = [&](pairs::id_t uid){return pairs_acc->uidToIdx(uid) != pairs_acc->getInvalidIdx();};
+    auto pIsGhostInMyRank = [&](pairs::id_t uid){return pairs_acc->uidToIdxGhost(uid) != pairs_acc->getInvalidIdx();};
 
 
     // TODO: make sure linkedCellWidth is larger than max diameter in the system
@@ -75,19 +68,59 @@ int main(int argc, char **argv) {
     //     }
     // }
 
-    // if (pIsInMyRank(pUid)){
+    // if (pIsLocalInMyRank(pUid)){
     //     int idx = pairs_acc->uidToIdx(pUid);
     //     pairs_acc->setPosition(idx, walberla::Vector3<double>(0.01, 0.01, 0.08));
     //     pairs_acc->setLinearVelocity(idx, walberla::Vector3<double>(0.5, 0.5, 0.5));
     // }
 
-    for (int i=0; i<10000; ++i){
-        // if ((i%200==0) && (pIsInMyRank(pUid))){
+    for (int t=0; t<1; ++t){
+        // if ((t%200==0) && (pIsLocalInMyRank(pUid))){
         //     int idx = pairs_acc->uidToIdx(pUid);
         //     std::cout<< "Tracked particle is now in rank " << rank << " --- " << pairs_acc->getPosition(idx)<< std::endl;
         // }
-
-        pairs_sim->do_timestep(i);
+
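+        // Set hydrodynamic force/torque on the ranks that currently own the particles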
+        if (pIsLocalInMyRank(pUid)){
+            int idx = pairs_acc->uidToIdx(pUid);
+            pairs_acc->setHydrodynamicForce(idx, walberla::Vector3<double>(1, 1, 1));
+            pairs_acc->setHydrodynamicTorque(idx, walberla::Vector3<double>(2, 2, 2));
+        }
+        if (pIsLocalInMyRank(pUid2)){
+            int idx = pairs_acc->uidToIdx(pUid2);
+            pairs_acc->setHydrodynamicForce(idx, walberla::Vector3<double>(10, 10, 10));
+            pairs_acc->setHydrodynamicTorque(idx, walberla::Vector3<double>(20, 20, 20));
+        }
+
+        pairs_sim->do_timestep(t);
+
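+        // Ghost copies may also accumulate hydrodynamic contributions; set them here so that the
+        // reverse communication below reduces them onto the owning rank.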
+        if (pIsGhostInMyRank(pUid)){
+            int idx = pairs_acc->uidToIdxGhost(pUid);
+            pairs_acc->setHydrodynamicForce(idx, walberla::Vector3<double>(1, 1, 1));
+            pairs_acc->setHydrodynamicTorque(idx, walberla::Vector3<double>(3, 3, 3));
+        }
+        if (pIsGhostInMyRank(pUid2)){
+            int idx = pairs_acc->uidToIdxGhost(pUid2);
+            pairs_acc->setHydrodynamicForce(idx, walberla::Vector3<double>(2, 2, 2));
+            pairs_acc->setHydrodynamicTorque(idx, walberla::Vector3<double>(4, 4, 4));
+        }
+
+        std::cout << "reverse comm and reduce" << std::endl;
+        pairs_sim->reverse_comm();
+
+        if (pIsLocalInMyRank(pUid)){
+            int idx = pairs_acc->uidToIdx(pUid);
+            auto forceSum = pairs_acc->getHydrodynamicForce(idx);
+            auto torqueSum = pairs_acc->getHydrodynamicTorque(idx);
+            std::cout << pUid << " @@@@ reduced force = " << forceSum << std::endl;
+            std::cout << pUid << " @@@@ reduced torque = " << torqueSum << std::endl;
+        }
+        if (pIsLocalInMyRank(pUid2)){
+            int idx = pairs_acc->uidToIdx(pUid2);
+            auto forceSum = pairs_acc->getHydrodynamicForce(idx);
+            auto torqueSum = pairs_acc->getHydrodynamicTorque(idx);
+            std::cout << pUid2 << " @@@@ reduced force = " << forceSum << std::endl;
+            std::cout << pUid2 << " @@@@ reduced torque = " << torqueSum << std::endl;
+        }
     }
 
     pairs_sim->end();
diff --git a/runtime/boundary_weights.cpp b/runtime/boundary_weights.cpp
index f2c84f8a10869b87e99794a0eaca61b6283c05b7..2174da809af1e7f83cf3fe47adc4d2280494ca7e 100644
--- a/runtime/boundary_weights.cpp
+++ b/runtime/boundary_weights.cpp
@@ -8,13 +8,13 @@
 #include "pairs_common.hpp"
 
 // Always include last generated interfaces
-#include "last_generated.hpp"
+#include "interfaces/last_generated.hpp"
 
 #ifdef PAIRS_TARGET_CUDA
 
 #define REDUCE_BLOCK_SIZE   64
 
-void __global__ reduceBoundaryWeights(
+__global__ void reduceBoundaryWeights(
     real_t *position, int start, int end, int particle_capacity,
     real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax, int *d_weights) {
 
diff --git a/runtime/boundary_weights.hpp b/runtime/boundary_weights.hpp
index cd079d139052f82a98315e9ccfdfdd8d19130ab4..bb310add32dd93ddb892c1838b6e136ae25123f9 100644
--- a/runtime/boundary_weights.hpp
+++ b/runtime/boundary_weights.hpp
@@ -15,9 +15,10 @@ namespace pairs {
 void compute_boundary_weights(
     PairsRuntime *ps,
     real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax,
-    long unsigned int *comp_weight, long unsigned int *comm_weight){
+    long unsigned int *comp_weight, long unsigned int *comm_weight)
+    {
         std::cerr<< "TODO: boundary weights should be generated" << std::endl;
         exit(-1);
-    };
+    }
 
 }
diff --git a/runtime/domain/ParticleDataHandling.hpp b/runtime/domain/ParticleDataHandling.hpp
index e9cc12d2863208d2f3146f3b129baafcf5821e8a..f66bb70434d793e0dee825436e0f4d3096193d2b 100644
--- a/runtime/domain/ParticleDataHandling.hpp
+++ b/runtime/domain/ParticleDataHandling.hpp
@@ -191,7 +191,7 @@ public:
 
         // TODO: Check if there is enough particle capacity for the new particles, when there is not,
         // all properties and arrays which have particle_capacity as one of their dimensions must be reallocated
-        PAIRS_ASSERT(nlocal + nrecv < particle_capacity);
+        // PAIRS_ASSERT(nlocal + nrecv < particle_capacity);
 
         for(int i = 0; i < nrecv; ++i) {
             for(auto &prop: ps->getProperties()) {
diff --git a/runtime/domain/block_forest.cpp b/runtime/domain/block_forest.cpp
index fcb19886f2f253027501fd5c621be4019401804d..fcc532917744dbaf474de43b9b65acd018b4c762 100644
--- a/runtime/domain/block_forest.cpp
+++ b/runtime/domain/block_forest.cpp
@@ -68,7 +68,8 @@ void BlockForest::updateNeighborhood() {
         auto block = static_cast<walberla::blockforest::Block *>(&iblock);
         auto& block_info = (*info)[block->getId()];
 
-        if(block_info.computationalWeight > 0) {
+        // don't check computationalWeight for now (TODO: compute_boundary_weights)
+        // if(block_info.computationalWeight > 0) {
             for(uint neigh = 0; neigh < block->getNeighborhoodSize(); ++neigh) {
                 auto neighbor_rank = walberla::int_c(block->getNeighborProcess(neigh));
 
@@ -79,8 +80,8 @@ void BlockForest::updateNeighborhood() {
                     auto begin = blocks_pushed[neighbor_rank].begin();
                     auto end = blocks_pushed[neighbor_rank].end();
 
-                    if(neighbor_info.computationalWeight > 0 &&
-                       find_if(begin, end, [neighbor_block](const auto &nbh) {
+                    // if(neighbor_info.computationalWeight > 0 &&
+                    if(find_if(begin, end, [neighbor_block](const auto &nbh) {
                             return nbh == neighbor_block; }) == end) {
 
                         neighborhood[neighbor_rank].push_back(neighbor_aabb);
@@ -88,7 +89,7 @@ void BlockForest::updateNeighborhood() {
                     }
                 // }
             }
-        }
+        // }
     }
 
     for(auto& nbh: neighborhood) {
@@ -133,21 +134,21 @@ void BlockForest::updateWeights() {
         auto block = static_cast<walberla::blockforest::Block *>(&iblock);
         auto aabb = block->getAABB();
         auto& block_info = (*info)[block->getId()];
-
-        pairs::compute_boundary_weights(
-            this->ps,
-            aabb.xMin(), aabb.xMax(), aabb.yMin(), aabb.yMax(), aabb.zMin(), aabb.zMax(),
-            &(block_info.computationalWeight), &(block_info.communicationWeight));
+        // TODO: Generate boundary weights
+        // pairs::compute_boundary_weights(
+        //     this->ps,
+        //     aabb.xMin(), aabb.xMax(), aabb.yMin(), aabb.yMax(), aabb.zMin(), aabb.zMax(),
+        //     &(block_info.computationalWeight), &(block_info.communicationWeight));
 
         for(int branch = 0; branch < 8; ++branch) {
             const auto b_id = walberla::BlockID(block->getId(), branch);
             const auto b_aabb = forest->getAABBFromBlockId(b_id);
             auto& b_info = (*info)[b_id];
 
-            pairs::compute_boundary_weights(
-                this->ps,
-                b_aabb.xMin(), b_aabb.xMax(), b_aabb.yMin(), b_aabb.yMax(), b_aabb.zMin(), b_aabb.zMax(),
-                &(b_info.computationalWeight), &(b_info.communicationWeight));
+            // pairs::compute_boundary_weights(
+            //     this->ps,
+            //     b_aabb.xMin(), b_aabb.xMax(), b_aabb.yMin(), b_aabb.yMax(), b_aabb.zMin(), b_aabb.zMax(),
+            //     &(b_info.computationalWeight), &(b_info.communicationWeight));
         }
     }
 
@@ -270,6 +271,9 @@ void BlockForest::initialize(int *argc, char ***argv) {
     auto procs = mpiManager->numProcesses();
     auto block_config = balance_workload ? walberla::Vector3<int>(1, 1, 1) :
                                            getBlockConfig(procs, gridsize[0], gridsize[1], gridsize[2]);
+
+    if(rank == 0) std::cout << "block_config = " << block_config << std::endl;
+
     auto ref_level = balance_workload ? getInitialRefinementLevel(procs) : 0;
 
     forest = walberla::blockforest::createBlockForest(
@@ -452,6 +456,14 @@ void BlockForest::communicateData(
     }
 }
 
+void BlockForest::communicateDataReverse(
+    int dim, int elem_size,
+    const real_t *send_buf, const int *send_offsets, const int *nsend,
+    real_t *recv_buf, const int *recv_offsets, const int *nrecv) {
+
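+    // The block forest reuses the forward exchange for reverse communication; the caller already passes
+    // buffers and counts whose send/receive roles were swapped by the generated reverse_comm module.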
+    this->communicateData(dim, elem_size, send_buf, send_offsets, nsend, recv_buf, recv_offsets, nrecv);
+}
+
 void BlockForest::communicateAllData(
     int ndims, int elem_size,
     const real_t *send_buf, const int *send_offsets, const int *nsend,
diff --git a/runtime/domain/block_forest.hpp b/runtime/domain/block_forest.hpp
index d607616995f4af4b41f3840b1e86fbb6a7294367..1c7bf537901f0681b2ddab336f39dafda1abd61a 100644
--- a/runtime/domain/block_forest.hpp
+++ b/runtime/domain/block_forest.hpp
@@ -79,6 +79,11 @@ public:
         const real_t *send_buf, const int *send_offsets, const int *nsend,
         real_t *recv_buf, const int *recv_offsets, const int *nrecv);
 
+    void communicateDataReverse(
+        int dim, int elem_size,
+        const real_t *send_buf, const int *send_offsets, const int *nsend,
+        real_t *recv_buf, const int *recv_offsets, const int *nrecv);
+
     void communicateAllData(
         int ndims, int elem_size,
         const real_t *send_buf, const int *send_offsets, const int *nsend,
diff --git a/runtime/domain/domain_partitioning.hpp b/runtime/domain/domain_partitioning.hpp
index 77af9b8569e8dafbc0015974ab3fb6ec0383d635..15c56f3a509e784fa88bf2c9bc45abe926fd5916 100644
--- a/runtime/domain/domain_partitioning.hpp
+++ b/runtime/domain/domain_partitioning.hpp
@@ -50,10 +50,17 @@ public:
         int dim, int elem_size,
         const real_t *send_buf, const int *send_offsets, const int *nsend,
         real_t *recv_buf, const int *recv_offsets, const int *nrecv) = 0;
+
+    virtual void communicateDataReverse(
+        int dim, int elem_size,
+        const real_t *send_buf, const int *send_offsets, const int *nsend,
+        real_t *recv_buf, const int *recv_offsets, const int *nrecv) = 0;
+
     virtual void communicateAllData(
         int ndims, int elem_size,
         const real_t *send_buf, const int *send_offsets, const int *nsend,
         real_t *recv_buf, const int *recv_offsets, const int *nrecv) = 0;
+
     virtual void finalize() = 0;
 };
 
diff --git a/runtime/domain/regular_6d_stencil.cpp b/runtime/domain/regular_6d_stencil.cpp
index 649e18624899655dc5730d27e51dc5cc15d80918..6da8b4cb18518b4deff9fccf89308d65f84c85d5 100644
--- a/runtime/domain/regular_6d_stencil.cpp
+++ b/runtime/domain/regular_6d_stencil.cpp
@@ -167,6 +167,35 @@ void Regular6DStencil::communicateData(
     }
 }
 
+void Regular6DStencil::communicateDataReverse(
+    int dim, int elem_size,
+    const real_t *send_buf, const int *send_offsets, const int *nsend,
+    real_t *recv_buf, const int *recv_offsets, const int *nrecv) {
+
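+    // Reverse counterpart of communicateData: ghost contributions travel back to their owners, so the
+    // buffer packed for the 'prev' side is delivered to the next neighbor and vice versa.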
+    const real_t *send_prev = &send_buf[send_offsets[dim * 2 + 0] * elem_size];
+    const real_t *send_next = &send_buf[send_offsets[dim * 2 + 1] * elem_size];
+    real_t *recv_prev = &recv_buf[recv_offsets[dim * 2 + 0] * elem_size];
+    real_t *recv_next = &recv_buf[recv_offsets[dim * 2 + 1] * elem_size];
+
+    if(prev[dim] != rank) {
+        MPI_Sendrecv(
+            send_prev, nsend[dim * 2 + 0] * elem_size, MPI_DOUBLE, next[dim], 0,
+            recv_prev, nrecv[dim * 2 + 0] * elem_size, MPI_DOUBLE, prev[dim], 0,
+            MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+    } else {
+        pairs::copy_in_device(recv_prev, send_prev, nsend[dim * 2 + 0] * elem_size * sizeof(real_t));
+    }
+
+    if(next[dim] != rank) {
+        MPI_Sendrecv(
+            send_next, nsend[dim * 2 + 1] * elem_size, MPI_DOUBLE, prev[dim], 0,
+            recv_next, nrecv[dim * 2 + 1] * elem_size, MPI_DOUBLE, next[dim], 0,
+            MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+    } else {
+        pairs::copy_in_device(recv_next, send_next, nsend[dim * 2 + 1] * elem_size * sizeof(real_t));
+    }
+}
+
 void Regular6DStencil::communicateAllData(
     int ndims, int elem_size,
     const real_t *send_buf, const int *send_offsets, const int *nsend,
diff --git a/runtime/domain/regular_6d_stencil.hpp b/runtime/domain/regular_6d_stencil.hpp
index 5b91d7a53c73f75edb4671deb9dba9069b68a9c6..7ed18677451710577fb49421fdcaaf83507e8ff7 100644
--- a/runtime/domain/regular_6d_stencil.hpp
+++ b/runtime/domain/regular_6d_stencil.hpp
@@ -67,6 +67,11 @@ public:
         const real_t *send_buf, const int *send_offsets, const int *nsend,
         real_t *recv_buf, const int *recv_offsets, const int *nrecv);
 
+    void communicateDataReverse(
+        int dim, int elem_size,
+        const real_t *send_buf, const int *send_offsets, const int *nsend,
+        real_t *recv_buf, const int *recv_offsets, const int *nrecv);
+
     void communicateAllData(
         int ndims, int elem_size,
         const real_t *send_buf, const int *send_offsets, const int *nsend,
diff --git a/runtime/pairs.cpp b/runtime/pairs.cpp
index a0b767c4322f84bea978243c3febe3440dbba344..40c1381bdab04b269345b5d126040420aaed9653 100644
--- a/runtime/pairs.cpp
+++ b/runtime/pairs.cpp
@@ -43,7 +43,7 @@ void PairsRuntime::initDomain(
 #endif
 
     else {
-        PAIRS_ERROR("Domain partitioning type not implemented!\n");
+        PAIRS_ERROR("(initDomain) Domain partitioning type not implemented!\n");
         exit(-1);
     }
 
@@ -426,7 +426,7 @@ void PairsRuntime::communicateData(
             nrecv_all += nrecv[n];
         }
     }
 
     copyArrayToHost(send_buf_id, Ignore, nsend_all * elem_size * sizeof(real_t));
     array_flags->setHostFlag(recv_buf_id);
     array_flags->clearDeviceFlag(recv_buf_id);
@@ -446,6 +446,69 @@ void PairsRuntime::communicateData(
     #endif
 }
 
+void PairsRuntime::communicateDataReverse(
+    int dim, int elem_size,
+    const real_t *send_buf, const int *send_offsets, const int *nsend,
+    real_t *recv_buf, const int *recv_offsets, const int *nrecv) {
+
+    const real_t *send_buf_ptr = send_buf;
+    real_t *recv_buf_ptr = recv_buf;
+    auto send_buf_array = getArrayByHostPointer(send_buf);
+    auto recv_buf_array = getArrayByHostPointer(recv_buf);
+    auto send_buf_id = send_buf_array.getId();
+    auto recv_buf_id = recv_buf_array.getId();
+    auto send_offsets_id = getArrayByHostPointer(send_offsets).getId();
+    auto recv_offsets_id = getArrayByHostPointer(recv_offsets).getId();
+    auto nsend_id = getArrayByHostPointer(nsend).getId();
+    auto nrecv_id = getArrayByHostPointer(nrecv).getId();
+
+    this->getTimers()->start(DeviceTransfers);
+    copyArrayToHost(send_offsets_id, ReadOnly);
+    copyArrayToHost(recv_offsets_id, ReadOnly);
+    copyArrayToHost(nsend_id, ReadOnly);
+    copyArrayToHost(nrecv_id, ReadOnly);
+
+    #ifdef ENABLE_CUDA_AWARE_MPI
+    send_buf_ptr = (real_t *) send_buf_array.getDevicePointer();
+    recv_buf_ptr = (real_t *) recv_buf_array.getDevicePointer();
+    #else
+    int nsend_all = 0;
+    int nrecv_all = 0;
+    if(this->dom_part_type == RegularPartitioning || this->dom_part_type == RegularXYPartitioning){
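+        // sum the counts for dimensions dim..2, since the reverse sweep processes dimensions in descending order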
+        for(int d = 2; d >= dim; d--) {
+            nsend_all += nsend[d * 2 + 0];
+            nsend_all += nsend[d * 2 + 1];
+            nrecv_all += nrecv[d * 2 + 0];
+            nrecv_all += nrecv[d * 2 + 1];
+        }
+    }
+    else if (this->dom_part_type == BlockForestPartitioning){
+        int nranks = this->getDomainPartitioner()->getNumberOfNeighborRanks();
+        for (int n=0; n<nranks; ++n){   // blockforest doesn't need reverse loop
+            nsend_all += nsend[n];
+            nrecv_all += nrecv[n];
+        }
+    }
+
+    copyArrayToHost(send_buf_id, Ignore, nsend_all * elem_size * sizeof(real_t));
+    array_flags->setHostFlag(recv_buf_id);
+    array_flags->clearDeviceFlag(recv_buf_id);
+    #endif
+
+    this->getTimers()->stop(DeviceTransfers);
+
+    this->getTimers()->start(Communication);
+    this->getDomainPartitioner()->communicateDataReverse(
+        dim, elem_size, send_buf_ptr, send_offsets, nsend, recv_buf_ptr, recv_offsets, nrecv);
+    this->getTimers()->stop(Communication);
+
+    #ifndef ENABLE_CUDA_AWARE_MPI
+    this->getTimers()->start(DeviceTransfers);
+    copyArrayToDevice(recv_buf_id, Ignore, nrecv_all * elem_size * sizeof(real_t));
+    this->getTimers()->stop(DeviceTransfers);
+    #endif
+}
+
 void PairsRuntime::communicateAllData(
     int ndims, int elem_size,
     const real_t *send_buf, const int *send_offsets, const int *nsend,
diff --git a/runtime/pairs.hpp b/runtime/pairs.hpp
index 68fb2d68a3cb77a8548b17517442fad7b62ab970..56d4022e471dbb40733f88a1fbf5065c0c0cce16 100644
--- a/runtime/pairs.hpp
+++ b/runtime/pairs.hpp
@@ -322,6 +322,11 @@ public:
         const real_t *send_buf, const int *send_offsets, const int *nsend,
         real_t *recv_buf, const int *recv_offsets, const int *nrecv);
 
+    void communicateDataReverse(
+        int dim, int elem_size,
+        const real_t *send_buf, const int *send_offsets, const int *nsend,
+        real_t *recv_buf, const int *recv_offsets, const int *nrecv);
+
     void communicateAllData(
         int ndims, int elem_size,
         const real_t *send_buf, const int *send_offsets, const int *nsend,
diff --git a/src/pairs/analysis/devices.py b/src/pairs/analysis/devices.py
index d4552eba7183ab9a5371cb69e6bc6e07b22ed592..5a15a13ea57ba34ce0f019ec0f202e040e5bd396 100644
--- a/src/pairs/analysis/devices.py
+++ b/src/pairs/analysis/devices.py
@@ -27,7 +27,7 @@ class MarkCandidateLoops(Visitor):
                             if isinstance(branch_stmt, For):
                                 possible_candidates.append(branch_stmt)
 
-                if isinstance(stmt, For):
+                if isinstance(stmt, For) and not stmt.not_kernel:
                     possible_candidates.append(stmt)
 
         for stmt in possible_candidates:
diff --git a/src/pairs/code_gen/cgen.py b/src/pairs/code_gen/cgen.py
index 8af9bbadca2c22537df913a3fcee31c771f7d8ba..9eef2d33dd7b6702075db8f619733505eb928857 100644
--- a/src/pairs/code_gen/cgen.py
+++ b/src/pairs/code_gen/cgen.py
@@ -172,7 +172,8 @@ class CGen:
 
         self.print("// Module headers")
         for module in self.sim.modules():
-            self.print(f"void {module.name}(PairsRuntime *pairs_runtime, struct PairsObjects *pobj);")
+            if module.name != "main":
+                self.print(f"void {module.name}(PairsRuntime *pairs_runtime, struct PairsObjects *pobj);")
 
         self.print("")
 
@@ -188,7 +189,13 @@ class CGen:
         self.print.add_indent(4)
         self.print("PairsAccessor(const std::shared_ptr<PairsSimulation> &ps_): ps(ps_){}")
         self.print("")
-        self.print("int size() const { return ps->pobj->nlocal; }")
+        self.print("int size() const { return ps->pobj->nlocal + ps->pobj->nghost; }")
+        self.print("")
+
+        self.print("int nlocal() const { return ps->pobj->nlocal; }")
+        self.print("")
+
+        self.print("int nghost() const { return ps->pobj->nghost; }")
         self.print("")
 
         self.print("int getInvalidIdx(){return -1;}")
@@ -199,7 +206,19 @@ class CGen:
 
         self.print('''int uidToIdx(pairs::id_t uid){
         int idx = getInvalidIdx();
-        for(int i=0; i<ps->pobj->nlocal; ++i){
+        for(int i=0; i<this->nlocal(); ++i){
+            if (getUid(i) == uid){
+                idx = i;
+                break;
+            }
+        }
+        return idx;''')
+        self.print("}")
+        self.print("")
+
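+        # uidToIdxGhost searches the ghost index range [nlocal, nlocal + nghost), complementing uidToIdx above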
+        self.print('''int uidToIdxGhost(pairs::id_t uid){
+        int idx = getInvalidIdx();
+        for(int i=this->nlocal(); i<this->size(); ++i){
             if (getUid(i) == uid){
                 idx = i;
                 break;
@@ -337,6 +356,7 @@ class CGen:
         self.print.start()
         self.generate_preamble()
         self.generate_pairs_object_structure()
+        self.generate_module_headers()
 
         for kernel in self.sim.kernels():
             self.generate_kernel(kernel)
@@ -346,7 +366,7 @@ class CGen:
 
         self.print.end()
 
-    def generate_library(self, initialize_module, create_domain_module, setup_sim_module,  do_timestep_module):
+    def generate_library(self, initialize_module, create_domain_module, setup_sim_module,  do_timestep_module, reverse_comm_module):
         self.generate_interfaces()
         # Generate CUDA/CPP file with modules
         ext = ".cu" if self.target.is_gpu() else ".cpp"
@@ -374,7 +394,7 @@ class CGen:
             self.generate_kernel(kernel)
 
         for module in self.sim.modules():
-            if module.name not in ['initialize', 'create_domain', 'setup_sim', 'do_timestep']:
+            if module.name not in ['initialize', 'create_domain', 'setup_sim', 'do_timestep', 'reverse_comm']:
                 self.generate_module(module)
 
         self.print.end()
@@ -436,6 +456,11 @@ class CGen:
         self.print("}")
         self.print("")
 
+        self.print("int rank(){")
+        self.print("    return pairs_runtime->getDomainPartitioner()->getRank();")
+        self.print("}")
+        self.print("")
+
         self.print("void setup_sim() {")
         self.print.add_indent(4)
         self.generate_statement(setup_sim_module.block)
@@ -451,6 +476,13 @@ class CGen:
         self.print("}")
         self.print("")
 
+        self.print("void reverse_comm() {")
+        self.print.add_indent(4)
+        self.generate_statement(reverse_comm_module.block)
+        self.print.add_indent(-4)
+        self.print("}")
+        self.print("")
+
         self.print("void end() {")
         self.print("    pairs::print_timers(pairs_runtime);")
         self.print("    pairs::print_stats(pairs_runtime, pobj->nlocal, pobj->nghost);")
@@ -461,7 +493,7 @@ class CGen:
         self.print.add_indent(-4)
         self.print("};")
 
-        # self.generate_host_pairs_accessor_class()
+        self.generate_host_pairs_accessor_class()
         
         self.print.end()
         self.generate_full_object_names = False
@@ -512,17 +544,16 @@ class CGen:
 
                 if device_cond and var.device_flag:
                     self.print(f"{type_kw} *{var.name()} = pobj->rv_{var.name()}.getDevicePointer();")
+                elif var.force_read:
+                    self.print(f"{type_kw} {var.name()} = pobj->{var.name()};")
                 else:
                     self.print(f"{type_kw} *{var.name()} = &(pobj->{var.name()});")
 
             for array in module.arrays():
                 type_kw = Types.c_keyword(self.sim, array.type())
                 name = array.name() if not device_cond else f"d_{array.name()}"
-                # self.generate_full_object_names = True
                 if not array.is_static() or (array.is_static() and not device_cond):
                     self.print(f"{type_kw} *{array.name()} = pobj->{name};")
-                    # self.print(f"{type_kw} *{array.name()} = {self.generate_object_reference(array, device=device_cond)};")
-                # self.generate_full_object_names = False
 
                 if array in module.host_references():
                     self.print(f"{type_kw} *h_{array.name()} = pobj->{array.name()};")
@@ -615,13 +646,13 @@ class CGen:
         self.print.add_indent(4)
         self.kernel_context = True
 
-        if has_resizes:
-            self.print(f"printf(\"{kernel.name} @@@@@@@@ before kernel: resizes[0] = %d\\n\", resizes[0]);")
+        # if has_resizes:
+            # self.print(f"printf(\"{kernel.name} @@@@@@@@ before kernel: resizes[0] = %d\\n\", resizes[0]);")
 
         self.generate_statement(kernel.block)
 
-        if has_resizes:
-            self.print(f"printf(\"{kernel.name} @@@@@@@@ after kernel: resizes[0] = %d\\n\", resizes[0]);")
+        # if has_resizes:
+            # self.print(f"printf(\"{kernel.name} @@@@@@@@ after kernel: resizes[0] = %d\\n\", resizes[0]);")
 
         self.kernel_context = False
         self.print.add_indent(-4)
@@ -663,9 +694,9 @@ class CGen:
             if ast_node.check_for_resize():
                 resize = self.generate_expression(ast_node.resize)
                 capacity = self.generate_expression(ast_node.capacity)
-                self.print(f"printf (\" %d -- before AtomicInc: nsend = %d -- send_capacity = %d -- resizes[0] = %d\\n\", {Printer.line_id}, {elem}, {capacity}, {resize});")
+                # self.print(f"printf (\" %d -- before AtomicInc: nsend = %d -- send_capacity = %d -- resizes[0] = %d\\n\", {Printer.line_id}, {elem}, {capacity}, {resize});")
                 self.print(f"pairs::{prefix}atomic_add_resize_check(&({elem}), {value}, &({resize}), {capacity});")
-                self.print(f"printf (\" %d -- after AtomicInc: nsend = %d -- send_capacity = %d -- resizes[0] = %d\\n\", {Printer.line_id}, {elem}, {capacity}, {resize});")
+                # self.print(f"printf (\" %d -- after AtomicInc: nsend = %d -- send_capacity = %d -- resizes[0] = %d\\n\", {Printer.line_id}, {elem}, {capacity}, {resize});")
 
             else:
                 self.print(f"pairs::{prefix}atomic_add(&({elem}), {value});")
@@ -866,9 +897,9 @@ class CGen:
                 self.print(f"pairs_runtime->copyArrayTo{ctx_suffix}({array_id}, {action}, {size}); // {array_name}")
 
             else:
-                self.print(f"std::cout<< \"{Printer.line_id} -- before {array_name} copyArrayTo{ctx_suffix}({action}) === \" <<  pobj->{array_name}[0]  << \" \" << pobj->{array_name}[1]  << \" \" << pobj->{array_name}[2]  << std::endl;")
+                # self.print(f"std::cout<< \"{Printer.line_id} -- before {array_name} copyArrayTo{ctx_suffix}({action}) === \" <<  pobj->{array_name}[0]  << \" \" << pobj->{array_name}[1]  << \" \" << pobj->{array_name}[2]  << std::endl;")
                 self.print(f"pairs_runtime->copyArrayTo{ctx_suffix}({array_id}, {action}); // {array_name}")
-                self.print(f"std::cout<< \"{Printer.line_id} -- after {array_name} copyArrayTo{ctx_suffix}({action}) === \" <<  pobj->{array_name}[0]  << \" \" << pobj->{array_name}[1]  << \" \" << pobj->{array_name}[2]  << std::endl;")
+                # self.print(f"std::cout<< \"{Printer.line_id} -- after {array_name} copyArrayTo{ctx_suffix}({action}) === \" <<  pobj->{array_name}[0]  << \" \" << pobj->{array_name}[1]  << \" \" << pobj->{array_name}[2]  << std::endl;")
 
         if isinstance(ast_node, CopyContactProperty):
             prop_id = ast_node.contact_prop().id()
@@ -1151,7 +1182,7 @@ class CGen:
             var = self.generate_expression(ast_node.var)
             # Dereferences are ignored for write variables when full objects
             # are generated since they can be directly written into
-            return var if self.generate_full_object_names else f"(*{var})"
+            return var if (self.generate_full_object_names or ast_node.var.force_read) else f"(*{var})"
 
         if isinstance(ast_node, DeviceStaticRef):
             elem = self.generate_expression(ast_node.elem)
diff --git a/src/pairs/ir/loops.py b/src/pairs/ir/loops.py
index 997fda5c6fa73938a74c8fb52f453caa1bd60a21..228bd40e9a69edd70ef36b33dacf82a8aba34fb0 100644
--- a/src/pairs/ir/loops.py
+++ b/src/pairs/ir/loops.py
@@ -39,7 +39,7 @@ class Iter(ASTTerm):
 
 
 class For(ASTNode):
-    def __init__(self, sim, range_min, range_max, block=None):
+    def __init__(self, sim, range_min, range_max, block=None, not_kernel=False):
         super().__init__(sim)
         self.iterator = Iter(sim, self)
         self.min = Lit.cvt(sim, range_min)
@@ -47,6 +47,7 @@ class For(ASTNode):
         self.block = Block(sim, []) if block is None else block
         self.kernel = None
         self._kernel_candidate = False
+        self.not_kernel = not_kernel
 
     def __str__(self):
         return f"For<{self.iterator}, {self.min} ... {self.max}>"
diff --git a/src/pairs/ir/properties.py b/src/pairs/ir/properties.py
index b13d6f48e8face187f328130a95a14eb1ff0717d..21ed08c918529f509047c0075a2670a5922af5b0 100644
--- a/src/pairs/ir/properties.py
+++ b/src/pairs/ir/properties.py
@@ -16,8 +16,8 @@ class Properties:
         self.props = []
         self.defs = {}
 
-    def add(self, p_name, p_type, p_value, p_volatile, p_layout=Layouts.AoS):
-        p = Property(self.sim, p_name, p_type, p_value, p_volatile, p_layout)
+    def add(self, p_name, p_type, p_value, p_volatile, p_layout=Layouts.AoS, p_reduce=False):
+        p = Property(self.sim, p_name, p_type, p_value, p_volatile, p_layout, p_reduce)
         self.props.append(p)
         self.defs[p_name] = p_value
         return p
@@ -27,6 +27,9 @@ class Properties:
 
     def all(self):
         return self.props
+
+    def reduction_props(self):
+        return [p for p in self.props if p.reduce is True]
 
     def volatiles(self):
         return [p for p in self.props if p.volatile is True]
@@ -51,7 +54,7 @@ class Properties:
 class Property(ASTNode):
     last_prop_id = 0
 
-    def __init__(self, sim, name, dtype, default, volatile, layout=Layouts.AoS):
+    def __init__(self, sim, name, dtype, default, volatile, layout=Layouts.AoS, reduce=False):
         super().__init__(sim)
         self.prop_id = Property.last_prop_id
         self.prop_name = name
@@ -59,6 +62,7 @@ class Property(ASTNode):
         self.prop_layout = layout
         self.default_value = default
         self.volatile = volatile
+        self.reduce = reduce
         self.device_flag = False
         Property.last_prop_id += 1
 
diff --git a/src/pairs/ir/variables.py b/src/pairs/ir/variables.py
index 9d36cf6a531c7e8fcf5853db45c2d1e9f449bd14..00bcc698214a3938b5ae5c3b9b6917114f596c5c 100644
--- a/src/pairs/ir/variables.py
+++ b/src/pairs/ir/variables.py
@@ -49,6 +49,7 @@ class Var(ASTTerm):
         self.mutable = True
         self.var_bonded_arrays = []
         self.device_flag = False
+        self.force_read = False
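+        # when force_read is set, generated modules read this variable by value instead of through a pointer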
 
         if temp:
             DeclareVariable(sim, self)
@@ -104,6 +105,10 @@ class Deref(ASTTerm):
     def var(self):
         return self._var
 
+    def copy(self, deep=False):
+        # Terminal copies are just themselves
+        return self
+
     def type(self):
         return self._var.type()
 
diff --git a/src/pairs/sim/comm.py b/src/pairs/sim/comm.py
index 988e71d0bcb2f325b5582c819fd010e221ab50ca..119e432bc77f8bf2dfaee3ca80129f95af6ff4b4 100644
--- a/src/pairs/sim/comm.py
+++ b/src/pairs/sim/comm.py
@@ -23,7 +23,7 @@ class Comm:
         self.nsend_all        = sim.add_var('nsend_all', Types.Int32)
         self.send_capacity    = sim.add_var('send_capacity', Types.Int32, 200000)
         self.recv_capacity    = sim.add_var('recv_capacity', Types.Int32, 200000)
-        self.elem_capacity    = sim.add_var('elem_capacity', Types.Int32, 40)
+        self.elem_capacity    = sim.add_var('elem_capacity', Types.Int32, 100)
         self.nsend            = sim.add_array('nsend', [dom_part.nranks_capacity], Types.Int32)
         self.send_offsets     = sim.add_array('send_offsets', [dom_part.nranks_capacity], Types.Int32)
         self.send_buffer      = sim.add_array('send_buffer', [self.send_capacity, self.elem_capacity], Types.Real, arr_sync=False)
@@ -41,6 +41,13 @@ class Comm:
         self.contact_soffsets = sim.add_array('contact_soffsets', [dom_part.nranks_capacity], Types.Int32)
         self.contact_roffsets = sim.add_array('contact_roffsets', [dom_part.nranks_capacity], Types.Int32)
 
+        self.nsend_reverse            = sim.add_array('nsend_reverse', [dom_part.nranks_capacity], Types.Int32)
+        self.send_offsets_reverse     = sim.add_array('send_offsets_reverse', [dom_part.nranks_capacity], Types.Int32)
+        self.send_buffer_reverse      = sim.add_array('send_buffer_reverse', [self.send_capacity, self.elem_capacity], Types.Real, arr_sync=False)
+        self.nrecv_reverse            = sim.add_array('nrecv_reverse', [dom_part.nranks_capacity], Types.Int32)
+        self.recv_offsets_reverse     = sim.add_array('recv_offsets_reverse', [dom_part.nranks_capacity], Types.Int32)
+        self.recv_buffer_reverse      = sim.add_array('recv_buffer_reverse', [self.recv_capacity, self.elem_capacity], Types.Real, arr_sync=False)
+
     @pairs_inline
     def synchronize(self):
         # Every property that is not constant across timesteps and have neighbor accesses during any
@@ -51,6 +58,33 @@ class Comm:
         PackAllGhostParticles(self, prop_list)
         CommunicateAllData(self, prop_list)
         UnpackAllGhostParticles(self, prop_list)
+
+    @pairs_host_block
+    def reverse_comm(self, reduce=False):
+        self.sim.module_name("reverse_comm")
+        self.prop_list = self.sim.properties.reduction_props()
+
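+        # Walk the communication steps in reverse order, reusing the forward send/recv metadata with the
+        # roles of send and receive swapped, so that ghost data flows back to the owning ranks.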
+        for step in range(self.dom_part.number_of_steps() - 1, -1, -1):
+            if self.sim._target.is_gpu():
+                CopyArray(self.sim, self.nsend, Contexts.Host, Actions.ReadOnly)
+                CopyArray(self.sim, self.nrecv, Contexts.Host, Actions.ReadOnly)
+                CopyArray(self.sim, self.send_offsets, Contexts.Host, Actions.ReadOnly)
+                CopyArray(self.sim, self.recv_offsets, Contexts.Host, Actions.ReadOnly)
+
+                # CopyArray(self.sim, self.nsend_reverse, Contexts.Host, Actions.WriteOnly)
+                # CopyArray(self.sim, self.nrecv_reverse, Contexts.Host, Actions.WriteOnly)
+                # CopyArray(self.sim, self.send_offsets_reverse, Contexts.Host, Actions.WriteOnly)
+                # CopyArray(self.sim, self.recv_offsets_reverse, Contexts.Host, Actions.WriteOnly)
+
+            for j in self.dom_part.step_indexes(step):
+                Assign(self.sim, self.nsend_reverse[j], self.nrecv[j])
+                Assign(self.sim, self.nrecv_reverse[j], self.nsend[j])
+                Assign(self.sim, self.send_offsets_reverse[j], self.recv_offsets[j])
+                Assign(self.sim, self.recv_offsets_reverse[j], self.send_offsets[j])
+
+            PackGhostParticlesReverse(self, step, self.prop_list)
+            CommunicateDataReverse(self, step, self.prop_list)
+            UnpackGhostParticlesReverse(self, step, self.prop_list, reduce)
 
     @pairs_inline
     def borders(self):
@@ -180,7 +214,24 @@ class CommunicateData(Lowerable):
                    self.comm.send_buffer, self.comm.send_offsets, self.comm.nsend,
                    self.comm.recv_buffer, self.comm.recv_offsets, self.comm.nrecv])
 
+class CommunicateDataReverse(Lowerable):
+    def __init__(self, comm, step, prop_list):
+        super().__init__(comm.sim)
+        self.comm = comm
+        self.step = step
+        self.prop_list = prop_list
+        self.sim.add_statement(self)
 
+    @pairs_inline
+    def lower(self):
+        elem_size = sum([Types.number_of_elements(self.sim, p.type()) for p in self.prop_list])
+
+        Call_Void(self.sim,
+                  "pairs_runtime->communicateDataReverse",
+                  [self.step, elem_size,
+                   self.comm.send_buffer_reverse, self.comm.send_offsets_reverse, self.comm.nsend_reverse,
+                   self.comm.recv_buffer_reverse, self.comm.recv_offsets_reverse, self.comm.nrecv_reverse])
+
 class CommunicateContactHistoryData(Lowerable):
     def __init__(self, comm, step):
         super().__init__(comm.sim)
@@ -329,6 +380,45 @@ class PackGhostParticles(Lowerable):
                     Assign(self.sim, send_buffer[i][p_offset], cast_fn(p[m]))
                     p_offset += 1
 
+class PackGhostParticlesReverse(Lowerable):
+    def __init__(self, comm, step, prop_list):
+        super().__init__(comm.sim)
+        self.comm = comm
+        self.step = step
+        self.prop_list = prop_list
+        self.sim.add_statement(self)
+
+    def get_elems_per_particle(self):
+        return sum([Types.number_of_elements(self.sim, p.type()) for p in self.prop_list])
+
+    @pairs_device_block
+    def lower(self):
+        nlocal = self.sim.nlocal
+        nghost = self.sim.nghost
+        send_buffer_reverse = self.comm.send_buffer_reverse
+        send_buffer_reverse.set_stride(1, self.get_elems_per_particle())
+
+        self.sim.module_name(f"pack_ghost_particles_reverse{self.step}_" + "_".join([str(p.id()) for p in self.prop_list]))
+
+        start = self.comm.send_offsets_reverse[self.comm.dom_part.first_step_index(self.step)]
+        end = ScalarOp.inline(start + self.comm.dom_part.reduce_sum_step_indexes(self.step, self.comm.nsend_reverse))
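+        # ghost particles are stored after the nlocal owned particles, hence the "nlocal + i" indexing below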
+        for i in For(self.sim, start, end):
+            p_offset = 0
+            m = nlocal + i
+            for p in self.prop_list:
+                if not Types.is_scalar(p.type()):
+                    nelems = Types.number_of_elements(self.sim, p.type())
+                    for e in range(nelems):
+                        src = p[m][e]
+                        Assign(self.sim, send_buffer_reverse[i][p_offset + e], src)
+
+                    p_offset += nelems
+
+                else:
+                    cast_fn = lambda x: Cast(self.sim, x, Types.Real) if p.type() != Types.Real else x
+                    Assign(self.sim, send_buffer_reverse[i][p_offset], cast_fn(p[m]))
+                    p_offset += 1
+
             
 class UnpackGhostParticles(Lowerable):
     def __init__(self, comm, step, prop_list):
@@ -365,6 +455,49 @@ class UnpackGhostParticles(Lowerable):
                     Assign(self.sim, p[nlocal + i], cast_fn(recv_buffer[i][p_offset]))
                     p_offset += 1
 
+class UnpackGhostParticlesReverse(Lowerable):
+    def __init__(self, comm, step, prop_list, reduce=False):
+        super().__init__(comm.sim)
+        self.comm = comm
+        self.step = step
+        self.prop_list = prop_list
+        self.reduce = reduce
+        self.sim.add_statement(self)
+
+
+    def get_elems_per_particle(self):
+        return sum([Types.number_of_elements(self.sim, p.type()) for p in self.prop_list])
+
+    @pairs_device_block
+    def lower(self):
+        send_map = self.comm.send_map
+        recv_buffer_reverse = self.comm.recv_buffer_reverse
+        recv_buffer_reverse.set_stride(1, self.get_elems_per_particle())
+        self.sim.module_name(f"unpack_ghost_particles_reverse{self.step}_" + "_".join([str(p.id()) for p in self.prop_list]))
+
+        start = self.comm.recv_offsets_reverse[self.comm.dom_part.first_step_index(self.step)]
+        end = ScalarOp.inline(start + self.comm.dom_part.reduce_sum_step_indexes(self.step, self.comm.nrecv_reverse))
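+        # send_map records the local index each forward-sent entry originated from, so received values are
+        # written back (or accumulated, when reduce=True) onto the owning particle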
+        for i in For(self.sim, start, end):
+            p_offset = 0
+            m = send_map[i]
+            for p in self.prop_list:
+                if not Types.is_scalar(p.type()):
+                    nelems = Types.number_of_elements(self.sim, p.type())
+                    for e in range(nelems):
+                        if self.reduce:
+                            Assign(self.sim, p[m][e], p[m][e] + recv_buffer_reverse[i][p_offset + e])
+                        else:
+                            Assign(self.sim, p[m][e], recv_buffer_reverse[i][p_offset + e])
+
+                    p_offset += nelems
+
+                else:
+                    cast_fn = lambda x: Cast(self.sim, x, p.type()) if p.type() != Types.Real else x
+                    if self.reduce:
+                        Assign(self.sim, p[m], p[m] + cast_fn(recv_buffer_reverse[i][p_offset]))
+                    else:
+                        Assign(self.sim, p[m], cast_fn(recv_buffer_reverse[i][p_offset]))
+                    p_offset += 1
 
 class PackAllGhostParticles(Lowerable):
     def __init__(self, comm, prop_list):
diff --git a/src/pairs/sim/domain_partitioning.py b/src/pairs/sim/domain_partitioning.py
index e781d235fe82a5dea66e1854ed3cbe33294f801f..cf30351a6ca22b621b4f91cbc745d9e7a003cd13 100644
--- a/src/pairs/sim/domain_partitioning.py
+++ b/src/pairs/sim/domain_partitioning.py
@@ -87,6 +87,8 @@ class DimensionRanges:
 class BlockForest:
     def __init__(self, sim):
         self.sim                = sim
+        self.reduce_step        = sim.add_var('reduce_step', Types.Int32)   # this var is treated as a tmp (workaround for gpu)
+        self.reduce_step.force_read = True
         self.rank               = sim.add_var('rank', Types.Int32)
         self.nranks             = sim.add_var('nranks', Types.Int32)
         self.nranks_capacity    = sim.add_var('nranks_capacity', Types.Int32, init_value=27)
@@ -108,7 +110,7 @@ class BlockForest:
         return 1
 
     def step_indexes(self, step):
-        yield from For(self.sim, 0, self.nranks)
+        yield from For(self.sim, 0, self.nranks, not_kernel=True)
 
     def first_step_index(self, step):
         return 0
@@ -117,11 +119,11 @@ class BlockForest:
         return self.reduce_sum_step_indexes(0, array)
 
     def reduce_sum_step_indexes(self, step, array):
-        nsend_sum = self.sim.add_temp_var(0)
-        for i in For(self.sim, 0, self.nranks):
-            Assign(self.sim, nsend_sum, nsend_sum + array[i])
-
-        return nsend_sum
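+        # accumulate into the persistent reduce_step variable (see the GPU workaround note above) instead of a fresh temporary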
+        Assign(self.sim, self.reduce_step, 0)
+        for i in For(self.sim, 0, self.nranks, not_kernel=True):
+            Assign(self.sim, self.reduce_step, ScalarOp.inline(self.reduce_step + array[i]))
+
+        return self.reduce_step
 
     def initialize(self):
         grid_array = [(self.sim.grid.min(d), self.sim.grid.max(d)) for d in range(self.sim.ndims())]
@@ -131,23 +133,25 @@ class BlockForest:
         Call_Void(self.sim, "pairs_runtime->updateDomain", [])
         Assign(self.sim, self.rank, Call_Int(self.sim, "pairs_runtime->getDomainPartitioner()->getRank", []))
         Assign(self.sim, self.nranks, Call_Int(self.sim, "pairs_runtime->getNumberOfNeighborRanks", []))
-        Assign(self.sim, self.ntotal_aabbs, Call_Int(self.sim, "pairs_runtime->getNumberOfNeighborAABBs", []))
-
-        for _ in Filter(self.sim, self.nranks_capacity < self.nranks):
-            Assign(self.sim, self.nranks_capacity, self.nranks + 10)
-            self.ranks.realloc()
-            self.naabbs.realloc()
-            self.aabb_offsets.realloc()
-
-        for _ in Filter(self.sim, self.aabb_capacity < self.ntotal_aabbs):
-            Assign(self.sim, self.aabb_capacity, self.ntotal_aabbs + 20)
-            self.aabbs.realloc()
-
-        Call_Void(self.sim, "pairs_runtime->copyRuntimeArray", ['ranks', self.ranks, self.nranks])
-        Call_Void(self.sim, "pairs_runtime->copyRuntimeArray", ['naabbs', self.naabbs, self.nranks])
-        Call_Void(self.sim, "pairs_runtime->copyRuntimeArray", ['aabb_offsets', self.aabb_offsets, self.nranks])
-        Call_Void(self.sim, "pairs_runtime->copyRuntimeArray", ['aabbs', self.aabbs, self.ntotal_aabbs * 6])
-        Call_Void(self.sim, "pairs_runtime->copyRuntimeArray", ['subdom', self.subdom, self.sim.ndims() * 2])
+
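+        # only copy the neighborhood arrays when this rank actually has neighbor ranks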
+        for _ in Filter(self.sim, ScalarOp.neq(self.nranks, 0)):
+            Assign(self.sim, self.ntotal_aabbs, Call_Int(self.sim, "pairs_runtime->getNumberOfNeighborAABBs", []))
+
+            for _ in Filter(self.sim, self.nranks_capacity < self.nranks):
+                Assign(self.sim, self.nranks_capacity, self.nranks + 10)
+                self.ranks.realloc()
+                self.naabbs.realloc()
+                self.aabb_offsets.realloc()
+
+            for _ in Filter(self.sim, self.aabb_capacity < self.ntotal_aabbs):
+                Assign(self.sim, self.aabb_capacity, self.ntotal_aabbs + 20)
+                self.aabbs.realloc()
+
+            Call_Void(self.sim, "pairs_runtime->copyRuntimeArray", ['ranks', self.ranks, self.nranks])
+            Call_Void(self.sim, "pairs_runtime->copyRuntimeArray", ['naabbs', self.naabbs, self.nranks])
+            Call_Void(self.sim, "pairs_runtime->copyRuntimeArray", ['aabb_offsets', self.aabb_offsets, self.nranks])
+            Call_Void(self.sim, "pairs_runtime->copyRuntimeArray", ['aabbs', self.aabbs, self.ntotal_aabbs * 6])
+            Call_Void(self.sim, "pairs_runtime->copyRuntimeArray", ['subdom', self.subdom, self.sim.ndims() * 2])
 
     def ghost_particles(self, step, position, offset=0.0):
         ''' TODO :  If we have pbc, a sinlge particle can be a ghost particle multiple times (at different locations) for the same neighbor block,
diff --git a/src/pairs/sim/simulation.py b/src/pairs/sim/simulation.py
index 15332c4fab3fa8ffd53826c8f08cde38a491805f..c23c377c3b62e4bb83494037df47e9b39411ba5d 100644
--- a/src/pairs/sim/simulation.py
+++ b/src/pairs/sim/simulation.py
@@ -200,9 +200,9 @@ class Simulation:
         assert len(pbc_config) == self.dims, "PBC must be specified for each dimension."
         self._pbc = pbc_config
 
-    def add_property(self, prop_name, prop_type, value=0.0, volatile=False):
+    def add_property(self, prop_name, prop_type, value=0.0, volatile=False, reduce=False):
         assert self.property(prop_name) is None, f"Property already defined: {prop_name}"
-        return self.properties.add(prop_name, prop_type, value, volatile)
+        return self.properties.add(prop_name, prop_type, value, volatile, p_reduce=reduce)
 
     def add_position(self, prop_name, value=[0.0, 0.0, 0.0], volatile=False, layout=Layouts.AoS):
         assert self.property(prop_name) is None, f"Property already defined: {prop_name}"
@@ -442,6 +442,8 @@ class Simulation:
 
         # Initialize communication instance with specified domain-partitioner
         comm = Comm(self, self._dom_part)
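+        # Build the reverse-communication module up front so it can be added to the timestep loop and exposed through the generated library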
+        reverse_comm_module = comm.reverse_comm(reduce=True)
+
         # Params that determine when a method must be called only when reneighboring
         every_reneighbor_params = {'every': self.reneighbor_frequency}
 
@@ -473,7 +475,14 @@ class Simulation:
             timestep_procedures.append(ResetContactHistoryUsageStatus(self, self._contact_history))
 
         # Reset volatile properties and add computational kernels
-        timestep_procedures += [ResetVolatileProperties(self)] + self.functions
+        timestep_procedures += [ResetVolatileProperties(self)]
+
+        # add computational kernels
+        timestep_procedures += self.functions
+
+        # For whole-program generation, reverse_comm can be added wherever it is needed in the timestep loop (e.g. after the computational kernels), as done here:
+        if self._generate_whole_program:
+            timestep_procedures += [reverse_comm_module]
 
         # Clear unused contact history
         if self._use_contact_history:
@@ -543,12 +552,12 @@ class Simulation:
             setup_sim_module = Module(self, name='setup_sim', block=setup_sim)
             do_timestep_module = Module(self, name='do_timestep', block=timestep.as_block())
 
-            modules_list = [initialize_module, create_domain_module, setup_sim_module, do_timestep_module]
+            modules_list = [initialize_module, create_domain_module, setup_sim_module, do_timestep_module, reverse_comm_module]
 
             transformations = Transformations(modules_list, self._target)
             transformations.apply_all()
 
             # Generate library
-            self.code_gen.generate_library(initialize_module, create_domain_module, setup_sim_module, do_timestep_module)
+            self.code_gen.generate_library(initialize_module, create_domain_module, setup_sim_module, do_timestep_module, reverse_comm_module)
 
         self.code_gen.generate_interfaces()