diff --git a/examples/modular/sd_1.cpp b/examples/modular/sd_1.cpp
index a705c2538af87405a97abbcb79f9f3b5d48b920d..9e9e73a4b8c0bb91434e4e2263eab4d50fd25093 100644
--- a/examples/modular/sd_1.cpp
+++ b/examples/modular/sd_1.cpp
@@ -12,12 +12,12 @@ int main(int argc, char **argv) {
 
     pairs_runtime->initDomain(&argc, &argv, 0, 0, 0, 1, 1, 1); 
 
-    pairs::create_halfspace(pairs_runtime, 0,0,0,  1, 0, 0,     0, 13);
-    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 1, 0,     0, 13);
-    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 0, 1,     0, 13);
-    pairs::create_halfspace(pairs_runtime, 1,1,1,  -1, 0, 0,    0, 13);
-    pairs::create_halfspace(pairs_runtime, 1,1,1,  0, -1, 0,    0, 13);
-    pairs::create_halfspace(pairs_runtime, 1,1,1,  0, 0, -1,    0, 13);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  1, 0, 0,     0, pairs::flags::INFINITE | pairs::flags::FIXED);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 1, 0,     0, pairs::flags::INFINITE | pairs::flags::FIXED);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 0, 1,     0, pairs::flags::INFINITE | pairs::flags::FIXED);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  -1, 0, 0,    0, pairs::flags::INFINITE | pairs::flags::FIXED);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  0, -1, 0,    0, pairs::flags::INFINITE | pairs::flags::FIXED);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  0, 0, -1,    0, pairs::flags::INFINITE | pairs::flags::FIXED);
     pairs::create_sphere(pairs_runtime, 0.6, 0.6, 0.7,      -2, -2, 0,  1000, 0.05, 0, 0);
     pairs::create_sphere(pairs_runtime, 0.4, 0.4, 0.68,    2, 2, 0,    1000, 0.05, 0, 0);
 
diff --git a/examples/modular/sd_2.cpp b/examples/modular/sd_2.cpp
index cc5eb453c92e5727cc716a5628c29d4787c1a170..3443143f99b4343e7b820042dc27fa1c3cbdb75a 100644
--- a/examples/modular/sd_2.cpp
+++ b/examples/modular/sd_2.cpp
@@ -33,12 +33,12 @@ int main(int argc, char **argv) {
     auto pairs_runtime = pairs_sim->getPairsRuntime();
     pairs_runtime->useDomain(forest);
 
-    pairs::create_halfspace(pairs_runtime, 0,0,0,  1, 0, 0,     0, 13);
-    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 1, 0,     0, 13);
-    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 0, 1,     0, 13);
-    pairs::create_halfspace(pairs_runtime, 1,1,1,  -1, 0, 0,    0, 13);
-    pairs::create_halfspace(pairs_runtime, 1,1,1,  0, -1, 0,    0, 13);
-    pairs::create_halfspace(pairs_runtime, 1,1,1,  0, 0, -1,    0, 13);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  1, 0, 0,     0, pairs::flags::INFINITE | pairs::flags::FIXED);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 1, 0,     0, pairs::flags::INFINITE | pairs::flags::FIXED);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 0, 1,     0, pairs::flags::INFINITE | pairs::flags::FIXED);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  -1, 0, 0,    0, pairs::flags::INFINITE | pairs::flags::FIXED);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  0, -1, 0,    0, pairs::flags::INFINITE | pairs::flags::FIXED);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  0, 0, -1,    0, pairs::flags::INFINITE | pairs::flags::FIXED);
     pairs::create_sphere(pairs_runtime, 0.6, 0.6, 0.7,      -2, -2, 0,  1000, 0.05, 0, 0);
     pairs::create_sphere(pairs_runtime, 0.4, 0.4, 0.68,    2, 2, 0,    1000, 0.05, 0, 0);
 
diff --git a/examples/modular/sd_3_CPU.cpp b/examples/modular/sd_3_CPU.cpp
index 44766dd1c6950723fc4d96801cf964dd2407a957..224624075b1d35537559ea792b6160d31cd70584 100644
--- a/examples/modular/sd_3_CPU.cpp
+++ b/examples/modular/sd_3_CPU.cpp
@@ -18,12 +18,12 @@ int main(int argc, char **argv) {
     auto pairs_runtime = pairs_sim->getPairsRuntime();
     pairs_runtime->initDomain(&argc, &argv, 0, 0, 0, 1, 1, 1);
 
-    pairs::create_halfspace(pairs_runtime, 0,0,0,  1, 0, 0,     0, 13);
-    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 1, 0,     0, 13);
-    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 0, 1,     0, 13);
-    pairs::create_halfspace(pairs_runtime, 1,1,1,  -1, 0, 0,    0, 13);
-    pairs::create_halfspace(pairs_runtime, 1,1,1,  0, -1, 0,    0, 13);
-    pairs::create_halfspace(pairs_runtime, 1,1,1,  0, 0, -1,    0, 13);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  1, 0, 0,     0, pairs::flags::INFINITE | pairs::flags::FIXED);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 1, 0,     0, pairs::flags::INFINITE | pairs::flags::FIXED);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 0, 1,     0, pairs::flags::INFINITE | pairs::flags::FIXED);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  -1, 0, 0,    0, pairs::flags::INFINITE | pairs::flags::FIXED);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  0, -1, 0,    0, pairs::flags::INFINITE | pairs::flags::FIXED);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  0, 0, -1,    0, pairs::flags::INFINITE | pairs::flags::FIXED);
 
     pairs::id_t pUid = pairs::create_sphere(pairs_runtime ,0.6, 0.6, 0.7,      0, 0, 0,  1000, 0.05, 0, 0);
     pairs::create_sphere(pairs_runtime, 0.4, 0.4, 0.76,    2, 2, 0,    1000, 0.05, 0, 0);
diff --git a/examples/modular/sd_3_GPU.cu b/examples/modular/sd_3_GPU.cu
index 5b1789618c0877663cc077b4ff966604677158f6..fcb16b1dfccca3d1ecc20848c82346e19f37b2e8 100644
--- a/examples/modular/sd_3_GPU.cu
+++ b/examples/modular/sd_3_GPU.cu
@@ -49,12 +49,12 @@ int main(int argc, char **argv) {
     auto pairs_runtime = pairs_sim->getPairsRuntime();
     pairs_runtime->initDomain(&argc, &argv, 0, 0, 0, 1, 1, 1);
 
-    pairs::create_halfspace(pairs_runtime, 0,0,0,  1, 0, 0,     0, 13);
-    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 1, 0,     0, 13);
-    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 0, 1,     0, 13);
-    pairs::create_halfspace(pairs_runtime, 1,1,1,  -1, 0, 0,    0, 13);
-    pairs::create_halfspace(pairs_runtime, 1,1,1,  0, -1, 0,    0, 13);
-    pairs::create_halfspace(pairs_runtime, 1,1,1,  0, 0, -1,    0, 13);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  1, 0, 0,     0, pairs::flags::INFINITE | pairs::flags::FIXED);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 1, 0,     0, pairs::flags::INFINITE | pairs::flags::FIXED);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 0, 1,     0, pairs::flags::INFINITE | pairs::flags::FIXED);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  -1, 0, 0,    0, pairs::flags::INFINITE | pairs::flags::FIXED);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  0, -1, 0,    0, pairs::flags::INFINITE | pairs::flags::FIXED);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  0, 0, -1,    0, pairs::flags::INFINITE | pairs::flags::FIXED);
 
     pairs::id_t pUid = pairs::create_sphere(pairs_runtime, 0.6, 0.6, 0.7,      0, 0, 0,  1000, 0.05, 1, 0);
     pairs::create_sphere(pairs_runtime, 0.4, 0.4, 0.76,    2, 2, 0,    1000, 0.05, 1, 0);
diff --git a/examples/modular/sd_4.cpp b/examples/modular/sd_4.cpp
index 791a30415998357e50b7165968a9e4248a616e91..80a872335525664500d3599b29caf26f466b3bea 100644
--- a/examples/modular/sd_4.cpp
+++ b/examples/modular/sd_4.cpp
@@ -46,12 +46,12 @@ int main(int argc, char **argv) {
 
     pairs_runtime->getDomainPartitioner()->initWorkloadBalancer(pairs::Hilbert, 100, 1000);
 
-    pairs::create_halfspace(pairs_runtime, 0,0,0,  1, 0, 0,     0, 13);
-    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 1, 0,     0, 13);
-    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 0, 1,     0, 13);
-    pairs::create_halfspace(pairs_runtime, 20,20,20,  -1, 0, 0,    0, 13);
-    pairs::create_halfspace(pairs_runtime, 20,20,20,  0, -1, 0,    0, 13);
-    pairs::create_halfspace(pairs_runtime, 20,20,20,  0, 0, -1,    0, 13);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  1, 0, 0,     0, pairs::flags::INFINITE | pairs::flags::FIXED);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 1, 0,     0, pairs::flags::INFINITE | pairs::flags::FIXED);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 0, 1,     0, pairs::flags::INFINITE | pairs::flags::FIXED);
+    pairs::create_halfspace(pairs_runtime, 20,20,20,  -1, 0, 0,    0, pairs::flags::INFINITE | pairs::flags::FIXED);
+    pairs::create_halfspace(pairs_runtime, 20,20,20,  0, -1, 0,    0, pairs::flags::INFINITE | pairs::flags::FIXED);
+    pairs::create_halfspace(pairs_runtime, 20,20,20,  0, 0, -1,    0, pairs::flags::INFINITE | pairs::flags::FIXED);
 
     double diameter_min = 0.3;
     double diameter_max = 0.3;
@@ -72,6 +72,23 @@ int main(int argc, char **argv) {
     int rebalance_freq = 200;
     double dt = 1e-3;
 
+
+    // Stats
+    // ------------------------------------------------------------------------------
+    int rank = pairs_sim->rank();
+    int world_size = pairs_runtime->getDomainPartitioner()->getWorldSize();
+
+    int num_local_aabbs = pairs_runtime->getDomainPartitioner()->getNumberOfLocalAABBs();
+    int num_neigh_aabbs = pairs_runtime->getDomainPartitioner()->getNumberOfNeighborAABBs();
+    int num_neigh_ranks = pairs_runtime->getDomainPartitioner()->getNumberOfNeighborRanks();
+    uint64_t nlocal = pairs_sim->nlocal();
+    uint64_t nghost = pairs_sim->nghost();
+
+    std::cout << "rank (" << rank << "): \t nlocal = " << nlocal << " nghost = " << nghost << 
+         " local_aabbs = " << num_local_aabbs << 
+         " neigh_aabbs = " << num_neigh_aabbs << 
+         " neigh_ranks = " << num_neigh_ranks << std::endl;
+
     pairs::vtk_write_subdom(pairs_runtime, "output/subdom_init", 0);
     
     for (int t=0; t<num_timesteps; ++t){
@@ -96,5 +113,8 @@ int main(int argc, char **argv) {
         }
     }
 
+    pairs::log_timers(pairs_runtime);
+    
+    ac->end();
     pairs_sim->end();
 }
\ No newline at end of file
diff --git a/runtime/pairs.cpp b/runtime/pairs.cpp
index 8803b8ea71e72f1ba06b27f3bc7523ba0e265977..b296b89c5fef1626fdf06967bba48aaf03a050f4 100644
--- a/runtime/pairs.cpp
+++ b/runtime/pairs.cpp
@@ -428,6 +428,9 @@ void PairsRuntime::communicateData(
     #ifdef ENABLE_CUDA_AWARE_MPI
     send_buf_ptr = (real_t *) send_buf_array.getDevicePointer();
     recv_buf_ptr = (real_t *) recv_buf_array.getDevicePointer();
+    auto recv_buf_id = recv_buf_array.getId();
+    array_flags->setDeviceFlag(recv_buf_id);
+    array_flags->clearHostFlag(recv_buf_id);
     #else
     int nsend_all = 0;
     int nrecv_all = 0;
@@ -488,6 +491,9 @@ void PairsRuntime::communicateDataReverse(
     #ifdef ENABLE_CUDA_AWARE_MPI
     send_buf_ptr = (real_t *) send_buf_array.getDevicePointer();
     recv_buf_ptr = (real_t *) recv_buf_array.getDevicePointer();
+    auto recv_buf_id = recv_buf_array.getId();
+    array_flags->setDeviceFlag(recv_buf_id);
+    array_flags->clearHostFlag(recv_buf_id);
     #else
     int nsend_all = 0;
     int nrecv_all = 0;
@@ -548,6 +554,9 @@ void PairsRuntime::communicateAllData(
     #ifdef ENABLE_CUDA_AWARE_MPI
     send_buf_ptr = (real_t *) send_buf_array.getDevicePointer();
     recv_buf_ptr = (real_t *) recv_buf_array.getDevicePointer();
+    auto recv_buf_id = recv_buf_array.getId();
+    array_flags->setDeviceFlag(recv_buf_id);
+    array_flags->clearHostFlag(recv_buf_id);
     #else
     int nsend_all = 0;
     int nrecv_all = 0;
@@ -612,6 +621,9 @@ void PairsRuntime::communicateContactHistoryData(
     #ifdef ENABLE_CUDA_AWARE_MPI
     send_buf_ptr = (real_t *) send_buf_array.getDevicePointer();
     recv_buf_ptr = (real_t *) recv_buf_array.getDevicePointer();
+    auto recv_buf_id = recv_buf_array.getId();
+    array_flags->setDeviceFlag(recv_buf_id);
+    array_flags->clearHostFlag(recv_buf_id);
     #else
     auto send_buf_id = send_buf_array.getId();
     auto recv_buf_id = recv_buf_array.getId();
@@ -658,6 +670,9 @@ void PairsRuntime::allReduceInplaceSum(real_t *red_buffer, int num_elems){
 
     #ifdef ENABLE_CUDA_AWARE_MPI
     buff_ptr = (real_t *) buff_array.getDevicePointer();
+    auto buf_id = buff_array.getId();
+    array_flags->setDeviceFlag(buf_id);
+    array_flags->clearHostFlag(buf_id);
     #else
     copyArrayToHost(buff_array, Ignore, num_elems * sizeof(real_t));
     #endif
diff --git a/src/pairs/code_gen/cgen.py b/src/pairs/code_gen/cgen.py
index 78c395871525672fa4d213ae885422deb63885dd..7708ca01d8d11c931b3646a3bd7b9a852aa4f63b 100644
--- a/src/pairs/code_gen/cgen.py
+++ b/src/pairs/code_gen/cgen.py
@@ -690,9 +690,9 @@ class CGen:
                     operator = matrix_op.operator()
 
                     if operator.is_unary():
-                        self.print(f"const {self.real_type()} {matrix_op.name()}_{dim} = {operator.symbol()}({lhs});")
+                        self.print(f"const {self.real_type()} {matrix_op.name()}_{i} = {operator.symbol()}({lhs});")
                     else:
-                        self.print(f"const {self.real_type()} {matrix_op.name()}_{dim} = {lhs} {operator.symbol()} {rhs};")
+                        self.print(f"const {self.real_type()} {matrix_op.name()}_{i} = {lhs} {operator.symbol()} {rhs};")
 
             if isinstance(ast_node.elem, Vector):
                 vector = ast_node.elem
diff --git a/src/pairs/code_gen/interface.py b/src/pairs/code_gen/interface.py
index 872914962de053931e4c9cd0afdc88e804ed000a..fa3158b87ae556f32a20ed64dda0e26d0acc0b71 100644
--- a/src/pairs/code_gen/interface.py
+++ b/src/pairs/code_gen/interface.py
@@ -116,7 +116,6 @@ class InterfaceModules:
         # Reset volatile includes the new locals
         self.sim.add_statement(ResetVolatileProperties(self.sim))  
 
-
     @pairs_interface_block
     def reneighbor(self):
         self.sim.module_name('reneighbor')
diff --git a/src/pairs/ir/kernel.py b/src/pairs/ir/kernel.py
index 4f477ca1b50a7cdce7b9a13894cd96f10e26f769..73764bc547ee419c0fb3a5143bd1fccfb7a400c3 100644
--- a/src/pairs/ir/kernel.py
+++ b/src/pairs/ir/kernel.py
@@ -215,16 +215,16 @@ class Kernel(ASTNode):
 
 
 class KernelLaunch(ASTNode):
-    def __init__(self, sim, kernel, iterator, range_min, range_max):
+    def __init__(self, sim, kernel, iterator, range_min, range_max, serial=False):
         assert isinstance(kernel, Kernel), "KernelLaunch(): given parameter is not of type Kernel!"
         super().__init__(sim)
         self._kernel = kernel
         self._iterator = iterator
         self._range_min = range_min
         self._range_max = range_max
-        self._threads_per_block = Lit.cvt(sim, 32)
+        self._threads_per_block = Lit.cvt(sim, 1) if serial else Lit.cvt(sim, 32)
         self._nelems = (range_max - range_min) 
-        self._nblocks = (self._nelems + self._threads_per_block - 1) / self._threads_per_block
+        self._nblocks = Lit.cvt(sim, 1) if serial else (self._nelems + self._threads_per_block - 1) / self._threads_per_block
 
     @property
     def kernel(self):
diff --git a/src/pairs/ir/loops.py b/src/pairs/ir/loops.py
index 78e7a6685c1cc6903daf4e696c2f94ecc9097f46..5544b7a530220f16a4f48d485f7c5b4382a8a359 100644
--- a/src/pairs/ir/loops.py
+++ b/src/pairs/ir/loops.py
@@ -49,7 +49,7 @@ class Iter(ASTTerm):
 
 
 class For(ASTNode):
-    def __init__(self, sim, range_min, range_max, block=None, not_kernel=False):
+    def __init__(self, sim, range_min, range_max, block=None, not_kernel=False, serial=False):
         super().__init__(sim)
         self.iterator = Iter(sim, self)
         self.min = Lit.cvt(sim, range_min)
@@ -58,6 +58,7 @@ class For(ASTNode):
         self.kernel = None
         self._kernel_candidate = False
         self.not_kernel = not_kernel
+        self.serial = serial
 
     def __str__(self):
         return f"For<{self.iterator}, {self.min} ... {self.max}>"
diff --git a/src/pairs/mapping/funcs.py b/src/pairs/mapping/funcs.py
index bfa6f87776849f9f80cf07450d758a8a1be6a5c6..12157e84b440ee6a8fa01811ab31c130b3b672f9 100644
--- a/src/pairs/mapping/funcs.py
+++ b/src/pairs/mapping/funcs.py
@@ -13,7 +13,7 @@ from pairs.ir.types import Types
 from pairs.mapping.keywords import Keywords
 from pairs.sim.flags import Flags
 from pairs.sim.interaction import ParticleInteraction
-from pairs.sim.global_interaction import  GlobalLocalInteraction, GlobalGlobalInteraction, GlobalReduction, SortGlobals, PackGlobals, ResetReductionProps, ReduceGlobals, UnpackGlobals
+from pairs.sim.global_interaction import  GlobalLocalInteraction, GlobalGlobalInteraction, GlobalReduction, DetermineGlobalsToCompute, SortGlobals, PackGlobals, ResetReductionProps, ReduceGlobals, UnpackGlobals
 from pairs.ir.module import Module
 from pairs.ir.block import Block, pairs_block
 from pairs.sim.lowerable import Lowerable
@@ -363,13 +363,20 @@ def compute(sim, func, cutoff_radius=None, symbols={}, parameters={}, compute_gl
             # If compute_globals is enabled, global interactions and reductions become 
             # part of the same module as local interactions
             global_reduction = GlobalReduction(sim, func.__name__, particle_interaction)
-            
+
+            Assign(sim, global_reduction.nglobal_red, 0)
+            Assign(sim, global_reduction.min_idx, 0)
+            Assign(sim, global_reduction.min_uid, 0)
             SortGlobals(global_reduction)           # Sort global bodies with respect to their uid
+            
             PackGlobals(global_reduction)           # Pack reduction properties in uid order in an intermediate buffer
             ResetReductionProps(global_reduction)   # Reset reduction properties to be prepared for local reduction 
+
+            Assign(sim, global_reduction.nglobal_computed, 0)
+            DetermineGlobalsToCompute(global_reduction)      # Determine global bodies that intersect with the current process
             
             # Compute local contirbutions on global bodies
-            global_local_interactions = GlobalLocalInteraction(sim, func.__name__, nparams)
+            global_local_interactions = GlobalLocalInteraction(sim, func.__name__, nparams, global_reduction)
             for interaction_data in global_local_interactions:
                 ir = BuildParticleIR(sim, symbols, parameters)
                 ir.add_symbols({
diff --git a/src/pairs/sim/global_interaction.py b/src/pairs/sim/global_interaction.py
index cfbc030c8c35f03e315cb24c7f5c436b506d92e6..265bfeca866ea3b68049e6a7360c47abda254da1 100644
--- a/src/pairs/sim/global_interaction.py
+++ b/src/pairs/sim/global_interaction.py
@@ -11,14 +11,16 @@ from pairs.ir.actions import Actions
 from pairs.ir.sizeof import Sizeof
 from pairs.ir.functions import Call_Void
 from pairs.ir.cast import Cast
+from pairs.ir.atomic import AtomicInc
 from pairs.sim.flags import Flags
 from pairs.sim.lowerable import Lowerable
 from pairs.sim.interaction import ParticleInteraction
 
 
 class GlobalLocalInteraction(ParticleInteraction):
-    def __init__(self, sim, module_name, nbody, cutoff_radius=None, use_cell_lists=False):
+    def __init__(self, sim, module_name, nbody, global_reduction, cutoff_radius=None, use_cell_lists=False):
         super().__init__(sim, module_name, nbody, cutoff_radius, use_cell_lists)
+        self.global_reduction = global_reduction
 
     @pairs_device_block
     def lower(self):
@@ -27,22 +29,18 @@ class GlobalLocalInteraction(ParticleInteraction):
             if self.include_shape(ishape):
                 for jshape in range(self.maxs): # shape of locals
                     if self.include_interaction(ishape, jshape):
-                        # Globals are presenet in all ranks so they should not interact with ghosts
-                        for j in ParticleFor(self.sim):
-                            # Loop over the global cell
-                            for p in For(self.sim, 0, self.cell_lists.cell_sizes[0]):
-                                i = self.cell_lists.cell_particles[0][p]
-                                # TODO: Skip if the bounding box of the global body doesn't intersect the subdom of this rank
+                        # Loop over global bodies that must be computed
+                        for p in For(self.sim, 0, self.global_reduction.nglobal_computed, not_kernel=True):
+                            i = self.global_reduction.globals_computed[p]
+                            # Globals are presenet in all ranks so they should not interact with ghosts
+                            for j in ParticleFor(self.sim):
+                                # Here we make make sure not to interact with other global bodies, otherwise
+                                # their contributions will get reduced again over all ranks
                                 for _ in Filter(self.sim, ScalarOp.and_op(
-                                    ScalarOp.cmp(self.sim.particle_shape[i], self.sim.get_shape_id(ishape)),
-                                    self.sim.particle_flags[i] & (Flags.Infinite | Flags.Global))):
-                                    # Here we make make sure not to interact with other global bodies, otherwise
-                                    # their contributions will get reduced again over all ranks
-                                    for _ in Filter(self.sim, ScalarOp.and_op(
-                                        ScalarOp.cmp(self.sim.particle_shape[j], self.sim.get_shape_id(jshape)),
-                                        ScalarOp.not_op(self.sim.particle_flags[j] & (Flags.Infinite | Flags.Global)))):
-                                        for _ in Filter(self.sim, ScalarOp.neq(i, j)):
-                                            self.compute_interaction(i, j, ishape, jshape, atomic=True)
+                                    ScalarOp.cmp(self.sim.particle_shape[j], self.sim.get_shape_id(jshape)),
+                                    ScalarOp.not_op(self.sim.particle_flags[j] & (Flags.Infinite | Flags.Global)))):
+                                    for _ in Filter(self.sim, ScalarOp.neq(i, j)):
+                                        self.compute_interaction(i, j, ishape, jshape, atomic=True)
 
 
 class GlobalGlobalInteraction(ParticleInteraction):
@@ -64,7 +62,7 @@ class GlobalGlobalInteraction(ParticleInteraction):
                     i = self.cell_lists.cell_particles[0][p]
                     for _ in Filter(self.sim, ScalarOp.and_op(
                         ScalarOp.cmp(self.sim.particle_shape[i], self.sim.get_shape_id(ishape)),
-                        self.sim.particle_flags[i] & (Flags.Infinite | Flags.Global))):
+                        self.sim.particle_flags[i] & Flags.Global)):
                         for jshape in range(self.maxs):
                             if self.include_interaction(ishape, jshape):
                                 # Loop over the global cell
@@ -77,12 +75,15 @@ class GlobalGlobalInteraction(ParticleInteraction):
                                         for _ in Filter(self.sim, ScalarOp.neq(i, j)):
                                             self.compute_interaction(i, j, ishape, jshape)
 
+
 class GlobalReduction:
     def __init__(self, sim, module_name, particle_interaction):
         self.sim = sim
         self.module_name            = module_name
         self.particle_interaction   = particle_interaction
         self.nglobal_red            = sim.add_var('nglobal_red', Types.Int32)               # Number of global particles that need reduction
+        self.min_uid                = sim.add_var('min_uid', Types.UInt64)
+        self.min_idx                = sim.add_var('min_idx', Types.Int32)
         self.nglobal_capacity       = sim.add_var('nglobal_capacity', Types.Int32, 64)
         self.global_elem_capacity   = sim.add_var('global_elem_capacity', Types.Int32, 100)
         self.red_buffer             = sim.add_array('red_buffer', [self.nglobal_capacity, self.global_elem_capacity], Types.Real, arr_sync=False) 
@@ -90,6 +91,8 @@ class GlobalReduction:
         self.sorted_idx             = sim.add_array('sorted_idx', [self.nglobal_capacity], Types.Int32, arr_sync=False)
         self.unsorted_idx           = sim.add_array('unsorted_idx', [self.nglobal_capacity], Types.Int32, arr_sync=False)
         self.removed_idx            = sim.add_array('removed_idx', [self.nglobal_capacity], Types.Boolean, arr_sync=False)
+        self.nglobal_computed       = sim.add_var('nglobal_computed', Types.Int32)
+        self.globals_computed       = sim.add_array('globals_computed', [self.nglobal_capacity], Types.Int32, arr_sync=False)
 
         self.red_props = set()
         for ishape in range(self.sim.max_shapes()):
@@ -99,26 +102,44 @@ class GlobalReduction:
                         self.red_props.add(app.prop())
 
     def global_particles(self):
-        for p in For(self.sim, 0, self.sim.cell_lists.cell_sizes[0]):
+        for p in For(self.sim, 0, self.sim.cell_lists.cell_sizes[0], serial=True):
             i = self.sim.cell_lists.cell_particles[0][p]
             for ishape in range(self.sim.max_shapes()):
                 if self.particle_interaction.include_shape(ishape):
                     for _ in Filter(self.sim, ScalarOp.and_op(
                         ScalarOp.cmp(self.sim.particle_shape[i], self.sim.get_shape_id(ishape)),
-                        self.sim.particle_flags[i] & (Flags.Infinite | Flags.Global))):
-                        yield i
+                        self.sim.particle_flags[i] & Flags.Global)):
+                        yield ishape, i
 
     def get_elems_per_particle(self):
         return sum([Types.number_of_elements(self.sim, p.type()) for p in self.red_props])
     
 
+class DetermineGlobalsToCompute(Lowerable):
+    def __init__(self, global_reduction):
+        super().__init__(global_reduction.sim)
+        self.global_reduction = global_reduction
+        self.sim.add_statement(self)
+
+    @pairs_device_block
+    def lower(self):
+        self.sim.module_name(f"{self.global_reduction.module_name}_determine_globals_to_compute")
+        n = self.global_reduction.nglobal_computed
+        globals_computed = self.global_reduction.globals_computed
+        
+        for ishape, i in self.global_reduction.global_particles():
+            for _ in Filter(self.sim, self.global_reduction.particle_interaction.intersects_subdom(ishape, i)):
+                Assign(self.sim, globals_computed[n], i)
+                Assign(self.sim, n, n+1)
+
+
 class SortGlobals(Lowerable):
     def __init__(self, global_reduction):
         super().__init__(global_reduction.sim)
         self.global_reduction = global_reduction
         self.sim.add_statement(self)
 
-    @pairs_host_block
+    @pairs_device_block
     def lower(self):
         self.sim.module_name(f"{self.global_reduction.module_name}_sort_globals")
         nglobal_capacity    = self.global_reduction.nglobal_capacity
@@ -126,23 +147,28 @@ class SortGlobals(Lowerable):
         unsorted_idx        = self.global_reduction.unsorted_idx
         sorted_idx          = self.global_reduction.sorted_idx
         removed_idx         = self.global_reduction.removed_idx
+        min_uid             = self.global_reduction.min_uid
+        min_idx             = self.global_reduction.min_idx
         uid                 = self.sim.particle_uid
         self.sim.check_resize(nglobal_capacity, nglobal_red)
 
-        Assign(self.sim, nglobal_red, 0)
-        for i in self.global_reduction.global_particles():
+        # The two kernels below are serialized on device to avoid unnecessary device transfers 
+        # NOTE: The number of global bodies is very small so serializing on device pays off.
+
+        for _, i in self.global_reduction.global_particles():
             Assign(self.sim, unsorted_idx[nglobal_red], i)
             Assign(self.sim, sorted_idx[nglobal_red], 0)
             Assign(self.sim, removed_idx[nglobal_red], 0)
-            Assign(self.sim, nglobal_red, nglobal_red +1)
-
-        min_uid = self.sim.add_temp_var(0, Types.UInt64)
-        min_idx = self.sim.add_temp_var(0)
+            AtomicInc(self.sim, nglobal_red, 1)
 
         # Here we sort indices of global bodies with respect to their uid's.
         # The sorted uid's will be in identical order on all ranks. This ensures that the
         # reduced properties are mapped correctly to each global body during inplace reduction.
-        for i in For(self.sim, 0, nglobal_red):
+        for i in For(self.sim, 0, nglobal_red, serial=True):
+            # FIXME: nglobal_red is readonly in this kernel. But it's a resize variable above.
+            # The line below enforces it to be passed by pointer since it's a device variable.
+            AtomicInc(self.sim, nglobal_red, 0) 
+
             Assign(self.sim, min_uid, -1)   # TODO: Lit max: UINT64_MAX
             Assign(self.sim, min_idx, 0)
             for j in For(self.sim, 0, nglobal_red):
diff --git a/src/pairs/sim/interaction.py b/src/pairs/sim/interaction.py
index 1c350cc04174f9c60580865a84e5bf3a48038dc6..0ca80a537ab3cff6617b6b43e96e8b842c375c89 100644
--- a/src/pairs/sim/interaction.py
+++ b/src/pairs/sim/interaction.py
@@ -428,6 +428,37 @@ class ParticleInteraction(Lowerable):
         self.sim.add_statement(Filter(self.sim, interaction_data.cutoff_condition, self.blocks[ishape*self.maxs + jshape]))
         self.apply_reductions(i, ishape, jshape, atomic)
 
+    def intersects_subdom(self, ishape, i):
+        ishape_id = self.sim.get_shape_id(ishape)
+
+        position = self.sim.position()
+        rotation_matrix = self.sim.property('rotation_matrix')
+        rm = [Abs(self.sim, rotation_matrix[i][d]) for d in range(self.sim.ndims() ** 2)]
+
+        if ishape_id == Shapes.Sphere:
+            radius = self.sim.property('radius')
+            aabb_max = position[i] + radius[i]
+            aabb_min = position[i] - radius[i]
+
+        if ishape_id == Shapes.Box:
+            edge_length = self.sim.property('edge_length')
+            l = edge_length[i] * 0.5
+            bound = Vector(self.sim, [  
+                rm[0]*l[0] + rm[1]*l[1] + rm[2]*l[2], 
+                rm[3]*l[0] + rm[4]*l[1] + rm[5]*l[2],
+                rm[6]*l[0] + rm[7]*l[1] + rm[8]*l[2]])
+            aabb_max = position[i] + bound
+            aabb_min = position[i] - bound
+
+        intersects = None
+        for dim in range(self.sim.ndims()):
+            dom_min = self.cell_lists.dom_part.min(dim)
+            dom_max = self.cell_lists.dom_part.max(dim)
+            d_cond = ScalarOp.and_op(aabb_min[dim] < dom_max, aabb_max[dim] >= dom_min)
+            intersects = d_cond if intersects is None else ScalarOp.and_op(intersects, d_cond)
+
+        return intersects
+
     @pairs_block
     def lower(self):
         self.sim.module_name(f"{self.module_name}_local_interactions")
diff --git a/src/pairs/transformations/devices.py b/src/pairs/transformations/devices.py
index 1eb6f1d36a7d97036de2336257508f0879dcd825..4c4ec154881fcbd71e57724036fdc62412c46fff 100644
--- a/src/pairs/transformations/devices.py
+++ b/src/pairs/transformations/devices.py
@@ -103,7 +103,7 @@ class AddDeviceKernels(Mutator):
     def mutate_For(self, ast_node):
         if ast_node.is_kernel_candidate() and self._device_module:
             kernel = self.create_kernel(ast_node.sim, ast_node.iterator, ast_node.max, ast_node.block)
-            ast_node = KernelLaunch(ast_node.sim, kernel, ast_node.iterator, ast_node.min, ast_node.max)
+            ast_node = KernelLaunch(ast_node.sim, kernel, ast_node.iterator, ast_node.min, ast_node.max, ast_node.serial)
 
         else:
             ast_node.block = self.mutate(ast_node.block)