diff --git a/examples/modular/sd_1.cpp b/examples/modular/sd_1.cpp index a705c2538af87405a97abbcb79f9f3b5d48b920d..9e9e73a4b8c0bb91434e4e2263eab4d50fd25093 100644 --- a/examples/modular/sd_1.cpp +++ b/examples/modular/sd_1.cpp @@ -12,12 +12,12 @@ int main(int argc, char **argv) { pairs_runtime->initDomain(&argc, &argv, 0, 0, 0, 1, 1, 1); - pairs::create_halfspace(pairs_runtime, 0,0,0, 1, 0, 0, 0, 13); - pairs::create_halfspace(pairs_runtime, 0,0,0, 0, 1, 0, 0, 13); - pairs::create_halfspace(pairs_runtime, 0,0,0, 0, 0, 1, 0, 13); - pairs::create_halfspace(pairs_runtime, 1,1,1, -1, 0, 0, 0, 13); - pairs::create_halfspace(pairs_runtime, 1,1,1, 0, -1, 0, 0, 13); - pairs::create_halfspace(pairs_runtime, 1,1,1, 0, 0, -1, 0, 13); + pairs::create_halfspace(pairs_runtime, 0,0,0, 1, 0, 0, 0, pairs::flags::INFINITE | pairs::flags::FIXED); + pairs::create_halfspace(pairs_runtime, 0,0,0, 0, 1, 0, 0, pairs::flags::INFINITE | pairs::flags::FIXED); + pairs::create_halfspace(pairs_runtime, 0,0,0, 0, 0, 1, 0, pairs::flags::INFINITE | pairs::flags::FIXED); + pairs::create_halfspace(pairs_runtime, 1,1,1, -1, 0, 0, 0, pairs::flags::INFINITE | pairs::flags::FIXED); + pairs::create_halfspace(pairs_runtime, 1,1,1, 0, -1, 0, 0, pairs::flags::INFINITE | pairs::flags::FIXED); + pairs::create_halfspace(pairs_runtime, 1,1,1, 0, 0, -1, 0, pairs::flags::INFINITE | pairs::flags::FIXED); pairs::create_sphere(pairs_runtime, 0.6, 0.6, 0.7, -2, -2, 0, 1000, 0.05, 0, 0); pairs::create_sphere(pairs_runtime, 0.4, 0.4, 0.68, 2, 2, 0, 1000, 0.05, 0, 0); diff --git a/examples/modular/sd_2.cpp b/examples/modular/sd_2.cpp index cc5eb453c92e5727cc716a5628c29d4787c1a170..3443143f99b4343e7b820042dc27fa1c3cbdb75a 100644 --- a/examples/modular/sd_2.cpp +++ b/examples/modular/sd_2.cpp @@ -33,12 +33,12 @@ int main(int argc, char **argv) { auto pairs_runtime = pairs_sim->getPairsRuntime(); pairs_runtime->useDomain(forest); - pairs::create_halfspace(pairs_runtime, 0,0,0, 1, 0, 0, 0, 13); - pairs::create_halfspace(pairs_runtime, 0,0,0, 0, 1, 0, 0, 13); - pairs::create_halfspace(pairs_runtime, 0,0,0, 0, 0, 1, 0, 13); - pairs::create_halfspace(pairs_runtime, 1,1,1, -1, 0, 0, 0, 13); - pairs::create_halfspace(pairs_runtime, 1,1,1, 0, -1, 0, 0, 13); - pairs::create_halfspace(pairs_runtime, 1,1,1, 0, 0, -1, 0, 13); + pairs::create_halfspace(pairs_runtime, 0,0,0, 1, 0, 0, 0, pairs::flags::INFINITE | pairs::flags::FIXED); + pairs::create_halfspace(pairs_runtime, 0,0,0, 0, 1, 0, 0, pairs::flags::INFINITE | pairs::flags::FIXED); + pairs::create_halfspace(pairs_runtime, 0,0,0, 0, 0, 1, 0, pairs::flags::INFINITE | pairs::flags::FIXED); + pairs::create_halfspace(pairs_runtime, 1,1,1, -1, 0, 0, 0, pairs::flags::INFINITE | pairs::flags::FIXED); + pairs::create_halfspace(pairs_runtime, 1,1,1, 0, -1, 0, 0, pairs::flags::INFINITE | pairs::flags::FIXED); + pairs::create_halfspace(pairs_runtime, 1,1,1, 0, 0, -1, 0, pairs::flags::INFINITE | pairs::flags::FIXED); pairs::create_sphere(pairs_runtime, 0.6, 0.6, 0.7, -2, -2, 0, 1000, 0.05, 0, 0); pairs::create_sphere(pairs_runtime, 0.4, 0.4, 0.68, 2, 2, 0, 1000, 0.05, 0, 0); diff --git a/examples/modular/sd_3_CPU.cpp b/examples/modular/sd_3_CPU.cpp index 44766dd1c6950723fc4d96801cf964dd2407a957..224624075b1d35537559ea792b6160d31cd70584 100644 --- a/examples/modular/sd_3_CPU.cpp +++ b/examples/modular/sd_3_CPU.cpp @@ -18,12 +18,12 @@ int main(int argc, char **argv) { auto pairs_runtime = pairs_sim->getPairsRuntime(); pairs_runtime->initDomain(&argc, &argv, 0, 0, 0, 1, 1, 1); - pairs::create_halfspace(pairs_runtime, 0,0,0, 1, 0, 0, 0, 13); - pairs::create_halfspace(pairs_runtime, 0,0,0, 0, 1, 0, 0, 13); - pairs::create_halfspace(pairs_runtime, 0,0,0, 0, 0, 1, 0, 13); - pairs::create_halfspace(pairs_runtime, 1,1,1, -1, 0, 0, 0, 13); - pairs::create_halfspace(pairs_runtime, 1,1,1, 0, -1, 0, 0, 13); - pairs::create_halfspace(pairs_runtime, 1,1,1, 0, 0, -1, 0, 13); + pairs::create_halfspace(pairs_runtime, 0,0,0, 1, 0, 0, 0, pairs::flags::INFINITE | pairs::flags::FIXED); + pairs::create_halfspace(pairs_runtime, 0,0,0, 0, 1, 0, 0, pairs::flags::INFINITE | pairs::flags::FIXED); + pairs::create_halfspace(pairs_runtime, 0,0,0, 0, 0, 1, 0, pairs::flags::INFINITE | pairs::flags::FIXED); + pairs::create_halfspace(pairs_runtime, 1,1,1, -1, 0, 0, 0, pairs::flags::INFINITE | pairs::flags::FIXED); + pairs::create_halfspace(pairs_runtime, 1,1,1, 0, -1, 0, 0, pairs::flags::INFINITE | pairs::flags::FIXED); + pairs::create_halfspace(pairs_runtime, 1,1,1, 0, 0, -1, 0, pairs::flags::INFINITE | pairs::flags::FIXED); pairs::id_t pUid = pairs::create_sphere(pairs_runtime ,0.6, 0.6, 0.7, 0, 0, 0, 1000, 0.05, 0, 0); pairs::create_sphere(pairs_runtime, 0.4, 0.4, 0.76, 2, 2, 0, 1000, 0.05, 0, 0); diff --git a/examples/modular/sd_3_GPU.cu b/examples/modular/sd_3_GPU.cu index 5b1789618c0877663cc077b4ff966604677158f6..fcb16b1dfccca3d1ecc20848c82346e19f37b2e8 100644 --- a/examples/modular/sd_3_GPU.cu +++ b/examples/modular/sd_3_GPU.cu @@ -49,12 +49,12 @@ int main(int argc, char **argv) { auto pairs_runtime = pairs_sim->getPairsRuntime(); pairs_runtime->initDomain(&argc, &argv, 0, 0, 0, 1, 1, 1); - pairs::create_halfspace(pairs_runtime, 0,0,0, 1, 0, 0, 0, 13); - pairs::create_halfspace(pairs_runtime, 0,0,0, 0, 1, 0, 0, 13); - pairs::create_halfspace(pairs_runtime, 0,0,0, 0, 0, 1, 0, 13); - pairs::create_halfspace(pairs_runtime, 1,1,1, -1, 0, 0, 0, 13); - pairs::create_halfspace(pairs_runtime, 1,1,1, 0, -1, 0, 0, 13); - pairs::create_halfspace(pairs_runtime, 1,1,1, 0, 0, -1, 0, 13); + pairs::create_halfspace(pairs_runtime, 0,0,0, 1, 0, 0, 0, pairs::flags::INFINITE | pairs::flags::FIXED); + pairs::create_halfspace(pairs_runtime, 0,0,0, 0, 1, 0, 0, pairs::flags::INFINITE | pairs::flags::FIXED); + pairs::create_halfspace(pairs_runtime, 0,0,0, 0, 0, 1, 0, pairs::flags::INFINITE | pairs::flags::FIXED); + pairs::create_halfspace(pairs_runtime, 1,1,1, -1, 0, 0, 0, pairs::flags::INFINITE | pairs::flags::FIXED); + pairs::create_halfspace(pairs_runtime, 1,1,1, 0, -1, 0, 0, pairs::flags::INFINITE | pairs::flags::FIXED); + pairs::create_halfspace(pairs_runtime, 1,1,1, 0, 0, -1, 0, pairs::flags::INFINITE | pairs::flags::FIXED); pairs::id_t pUid = pairs::create_sphere(pairs_runtime, 0.6, 0.6, 0.7, 0, 0, 0, 1000, 0.05, 1, 0); pairs::create_sphere(pairs_runtime, 0.4, 0.4, 0.76, 2, 2, 0, 1000, 0.05, 1, 0); diff --git a/examples/modular/sd_4.cpp b/examples/modular/sd_4.cpp index 791a30415998357e50b7165968a9e4248a616e91..80a872335525664500d3599b29caf26f466b3bea 100644 --- a/examples/modular/sd_4.cpp +++ b/examples/modular/sd_4.cpp @@ -46,12 +46,12 @@ int main(int argc, char **argv) { pairs_runtime->getDomainPartitioner()->initWorkloadBalancer(pairs::Hilbert, 100, 1000); - pairs::create_halfspace(pairs_runtime, 0,0,0, 1, 0, 0, 0, 13); - pairs::create_halfspace(pairs_runtime, 0,0,0, 0, 1, 0, 0, 13); - pairs::create_halfspace(pairs_runtime, 0,0,0, 0, 0, 1, 0, 13); - pairs::create_halfspace(pairs_runtime, 20,20,20, -1, 0, 0, 0, 13); - pairs::create_halfspace(pairs_runtime, 20,20,20, 0, -1, 0, 0, 13); - pairs::create_halfspace(pairs_runtime, 20,20,20, 0, 0, -1, 0, 13); + pairs::create_halfspace(pairs_runtime, 0,0,0, 1, 0, 0, 0, pairs::flags::INFINITE | pairs::flags::FIXED); + pairs::create_halfspace(pairs_runtime, 0,0,0, 0, 1, 0, 0, pairs::flags::INFINITE | pairs::flags::FIXED); + pairs::create_halfspace(pairs_runtime, 0,0,0, 0, 0, 1, 0, pairs::flags::INFINITE | pairs::flags::FIXED); + pairs::create_halfspace(pairs_runtime, 20,20,20, -1, 0, 0, 0, pairs::flags::INFINITE | pairs::flags::FIXED); + pairs::create_halfspace(pairs_runtime, 20,20,20, 0, -1, 0, 0, pairs::flags::INFINITE | pairs::flags::FIXED); + pairs::create_halfspace(pairs_runtime, 20,20,20, 0, 0, -1, 0, pairs::flags::INFINITE | pairs::flags::FIXED); double diameter_min = 0.3; double diameter_max = 0.3; @@ -72,6 +72,23 @@ int main(int argc, char **argv) { int rebalance_freq = 200; double dt = 1e-3; + + // Stats + // ------------------------------------------------------------------------------ + int rank = pairs_sim->rank(); + int world_size = pairs_runtime->getDomainPartitioner()->getWorldSize(); + + int num_local_aabbs = pairs_runtime->getDomainPartitioner()->getNumberOfLocalAABBs(); + int num_neigh_aabbs = pairs_runtime->getDomainPartitioner()->getNumberOfNeighborAABBs(); + int num_neigh_ranks = pairs_runtime->getDomainPartitioner()->getNumberOfNeighborRanks(); + uint64_t nlocal = pairs_sim->nlocal(); + uint64_t nghost = pairs_sim->nghost(); + + std::cout << "rank (" << rank << "): \t nlocal = " << nlocal << " nghost = " << nghost << + " local_aabbs = " << num_local_aabbs << + " neigh_aabbs = " << num_neigh_aabbs << + " neigh_ranks = " << num_neigh_ranks << std::endl; + pairs::vtk_write_subdom(pairs_runtime, "output/subdom_init", 0); for (int t=0; t<num_timesteps; ++t){ @@ -96,5 +113,8 @@ int main(int argc, char **argv) { } } + pairs::log_timers(pairs_runtime); + + ac->end(); pairs_sim->end(); } \ No newline at end of file diff --git a/runtime/pairs.cpp b/runtime/pairs.cpp index 8803b8ea71e72f1ba06b27f3bc7523ba0e265977..b296b89c5fef1626fdf06967bba48aaf03a050f4 100644 --- a/runtime/pairs.cpp +++ b/runtime/pairs.cpp @@ -428,6 +428,9 @@ void PairsRuntime::communicateData( #ifdef ENABLE_CUDA_AWARE_MPI send_buf_ptr = (real_t *) send_buf_array.getDevicePointer(); recv_buf_ptr = (real_t *) recv_buf_array.getDevicePointer(); + auto recv_buf_id = recv_buf_array.getId(); + array_flags->setDeviceFlag(recv_buf_id); + array_flags->clearHostFlag(recv_buf_id); #else int nsend_all = 0; int nrecv_all = 0; @@ -488,6 +491,9 @@ void PairsRuntime::communicateDataReverse( #ifdef ENABLE_CUDA_AWARE_MPI send_buf_ptr = (real_t *) send_buf_array.getDevicePointer(); recv_buf_ptr = (real_t *) recv_buf_array.getDevicePointer(); + auto recv_buf_id = recv_buf_array.getId(); + array_flags->setDeviceFlag(recv_buf_id); + array_flags->clearHostFlag(recv_buf_id); #else int nsend_all = 0; int nrecv_all = 0; @@ -548,6 +554,9 @@ void PairsRuntime::communicateAllData( #ifdef ENABLE_CUDA_AWARE_MPI send_buf_ptr = (real_t *) send_buf_array.getDevicePointer(); recv_buf_ptr = (real_t *) recv_buf_array.getDevicePointer(); + auto recv_buf_id = recv_buf_array.getId(); + array_flags->setDeviceFlag(recv_buf_id); + array_flags->clearHostFlag(recv_buf_id); #else int nsend_all = 0; int nrecv_all = 0; @@ -612,6 +621,9 @@ void PairsRuntime::communicateContactHistoryData( #ifdef ENABLE_CUDA_AWARE_MPI send_buf_ptr = (real_t *) send_buf_array.getDevicePointer(); recv_buf_ptr = (real_t *) recv_buf_array.getDevicePointer(); + auto recv_buf_id = recv_buf_array.getId(); + array_flags->setDeviceFlag(recv_buf_id); + array_flags->clearHostFlag(recv_buf_id); #else auto send_buf_id = send_buf_array.getId(); auto recv_buf_id = recv_buf_array.getId(); @@ -658,6 +670,9 @@ void PairsRuntime::allReduceInplaceSum(real_t *red_buffer, int num_elems){ #ifdef ENABLE_CUDA_AWARE_MPI buff_ptr = (real_t *) buff_array.getDevicePointer(); + auto buf_id = buff_array.getId(); + array_flags->setDeviceFlag(buf_id); + array_flags->clearHostFlag(buf_id); #else copyArrayToHost(buff_array, Ignore, num_elems * sizeof(real_t)); #endif diff --git a/src/pairs/code_gen/cgen.py b/src/pairs/code_gen/cgen.py index 78c395871525672fa4d213ae885422deb63885dd..7708ca01d8d11c931b3646a3bd7b9a852aa4f63b 100644 --- a/src/pairs/code_gen/cgen.py +++ b/src/pairs/code_gen/cgen.py @@ -690,9 +690,9 @@ class CGen: operator = matrix_op.operator() if operator.is_unary(): - self.print(f"const {self.real_type()} {matrix_op.name()}_{dim} = {operator.symbol()}({lhs});") + self.print(f"const {self.real_type()} {matrix_op.name()}_{i} = {operator.symbol()}({lhs});") else: - self.print(f"const {self.real_type()} {matrix_op.name()}_{dim} = {lhs} {operator.symbol()} {rhs};") + self.print(f"const {self.real_type()} {matrix_op.name()}_{i} = {lhs} {operator.symbol()} {rhs};") if isinstance(ast_node.elem, Vector): vector = ast_node.elem diff --git a/src/pairs/code_gen/interface.py b/src/pairs/code_gen/interface.py index 872914962de053931e4c9cd0afdc88e804ed000a..fa3158b87ae556f32a20ed64dda0e26d0acc0b71 100644 --- a/src/pairs/code_gen/interface.py +++ b/src/pairs/code_gen/interface.py @@ -116,7 +116,6 @@ class InterfaceModules: # Reset volatile includes the new locals self.sim.add_statement(ResetVolatileProperties(self.sim)) - @pairs_interface_block def reneighbor(self): self.sim.module_name('reneighbor') diff --git a/src/pairs/ir/kernel.py b/src/pairs/ir/kernel.py index 4f477ca1b50a7cdce7b9a13894cd96f10e26f769..73764bc547ee419c0fb3a5143bd1fccfb7a400c3 100644 --- a/src/pairs/ir/kernel.py +++ b/src/pairs/ir/kernel.py @@ -215,16 +215,16 @@ class Kernel(ASTNode): class KernelLaunch(ASTNode): - def __init__(self, sim, kernel, iterator, range_min, range_max): + def __init__(self, sim, kernel, iterator, range_min, range_max, serial=False): assert isinstance(kernel, Kernel), "KernelLaunch(): given parameter is not of type Kernel!" super().__init__(sim) self._kernel = kernel self._iterator = iterator self._range_min = range_min self._range_max = range_max - self._threads_per_block = Lit.cvt(sim, 32) + self._threads_per_block = Lit.cvt(sim, 1) if serial else Lit.cvt(sim, 32) self._nelems = (range_max - range_min) - self._nblocks = (self._nelems + self._threads_per_block - 1) / self._threads_per_block + self._nblocks = Lit.cvt(sim, 1) if serial else (self._nelems + self._threads_per_block - 1) / self._threads_per_block @property def kernel(self): diff --git a/src/pairs/ir/loops.py b/src/pairs/ir/loops.py index 78e7a6685c1cc6903daf4e696c2f94ecc9097f46..5544b7a530220f16a4f48d485f7c5b4382a8a359 100644 --- a/src/pairs/ir/loops.py +++ b/src/pairs/ir/loops.py @@ -49,7 +49,7 @@ class Iter(ASTTerm): class For(ASTNode): - def __init__(self, sim, range_min, range_max, block=None, not_kernel=False): + def __init__(self, sim, range_min, range_max, block=None, not_kernel=False, serial=False): super().__init__(sim) self.iterator = Iter(sim, self) self.min = Lit.cvt(sim, range_min) @@ -58,6 +58,7 @@ class For(ASTNode): self.kernel = None self._kernel_candidate = False self.not_kernel = not_kernel + self.serial = serial def __str__(self): return f"For<{self.iterator}, {self.min} ... {self.max}>" diff --git a/src/pairs/mapping/funcs.py b/src/pairs/mapping/funcs.py index bfa6f87776849f9f80cf07450d758a8a1be6a5c6..12157e84b440ee6a8fa01811ab31c130b3b672f9 100644 --- a/src/pairs/mapping/funcs.py +++ b/src/pairs/mapping/funcs.py @@ -13,7 +13,7 @@ from pairs.ir.types import Types from pairs.mapping.keywords import Keywords from pairs.sim.flags import Flags from pairs.sim.interaction import ParticleInteraction -from pairs.sim.global_interaction import GlobalLocalInteraction, GlobalGlobalInteraction, GlobalReduction, SortGlobals, PackGlobals, ResetReductionProps, ReduceGlobals, UnpackGlobals +from pairs.sim.global_interaction import GlobalLocalInteraction, GlobalGlobalInteraction, GlobalReduction, DetermineGlobalsToCompute, SortGlobals, PackGlobals, ResetReductionProps, ReduceGlobals, UnpackGlobals from pairs.ir.module import Module from pairs.ir.block import Block, pairs_block from pairs.sim.lowerable import Lowerable @@ -363,13 +363,20 @@ def compute(sim, func, cutoff_radius=None, symbols={}, parameters={}, compute_gl # If compute_globals is enabled, global interactions and reductions become # part of the same module as local interactions global_reduction = GlobalReduction(sim, func.__name__, particle_interaction) - + + Assign(sim, global_reduction.nglobal_red, 0) + Assign(sim, global_reduction.min_idx, 0) + Assign(sim, global_reduction.min_uid, 0) SortGlobals(global_reduction) # Sort global bodies with respect to their uid + PackGlobals(global_reduction) # Pack reduction properties in uid order in an intermediate buffer ResetReductionProps(global_reduction) # Reset reduction properties to be prepared for local reduction + + Assign(sim, global_reduction.nglobal_computed, 0) + DetermineGlobalsToCompute(global_reduction) # Determine global bodies that intersect with the current process # Compute local contirbutions on global bodies - global_local_interactions = GlobalLocalInteraction(sim, func.__name__, nparams) + global_local_interactions = GlobalLocalInteraction(sim, func.__name__, nparams, global_reduction) for interaction_data in global_local_interactions: ir = BuildParticleIR(sim, symbols, parameters) ir.add_symbols({ diff --git a/src/pairs/sim/global_interaction.py b/src/pairs/sim/global_interaction.py index cfbc030c8c35f03e315cb24c7f5c436b506d92e6..265bfeca866ea3b68049e6a7360c47abda254da1 100644 --- a/src/pairs/sim/global_interaction.py +++ b/src/pairs/sim/global_interaction.py @@ -11,14 +11,16 @@ from pairs.ir.actions import Actions from pairs.ir.sizeof import Sizeof from pairs.ir.functions import Call_Void from pairs.ir.cast import Cast +from pairs.ir.atomic import AtomicInc from pairs.sim.flags import Flags from pairs.sim.lowerable import Lowerable from pairs.sim.interaction import ParticleInteraction class GlobalLocalInteraction(ParticleInteraction): - def __init__(self, sim, module_name, nbody, cutoff_radius=None, use_cell_lists=False): + def __init__(self, sim, module_name, nbody, global_reduction, cutoff_radius=None, use_cell_lists=False): super().__init__(sim, module_name, nbody, cutoff_radius, use_cell_lists) + self.global_reduction = global_reduction @pairs_device_block def lower(self): @@ -27,22 +29,18 @@ class GlobalLocalInteraction(ParticleInteraction): if self.include_shape(ishape): for jshape in range(self.maxs): # shape of locals if self.include_interaction(ishape, jshape): - # Globals are presenet in all ranks so they should not interact with ghosts - for j in ParticleFor(self.sim): - # Loop over the global cell - for p in For(self.sim, 0, self.cell_lists.cell_sizes[0]): - i = self.cell_lists.cell_particles[0][p] - # TODO: Skip if the bounding box of the global body doesn't intersect the subdom of this rank + # Loop over global bodies that must be computed + for p in For(self.sim, 0, self.global_reduction.nglobal_computed, not_kernel=True): + i = self.global_reduction.globals_computed[p] + # Globals are presenet in all ranks so they should not interact with ghosts + for j in ParticleFor(self.sim): + # Here we make make sure not to interact with other global bodies, otherwise + # their contributions will get reduced again over all ranks for _ in Filter(self.sim, ScalarOp.and_op( - ScalarOp.cmp(self.sim.particle_shape[i], self.sim.get_shape_id(ishape)), - self.sim.particle_flags[i] & (Flags.Infinite | Flags.Global))): - # Here we make make sure not to interact with other global bodies, otherwise - # their contributions will get reduced again over all ranks - for _ in Filter(self.sim, ScalarOp.and_op( - ScalarOp.cmp(self.sim.particle_shape[j], self.sim.get_shape_id(jshape)), - ScalarOp.not_op(self.sim.particle_flags[j] & (Flags.Infinite | Flags.Global)))): - for _ in Filter(self.sim, ScalarOp.neq(i, j)): - self.compute_interaction(i, j, ishape, jshape, atomic=True) + ScalarOp.cmp(self.sim.particle_shape[j], self.sim.get_shape_id(jshape)), + ScalarOp.not_op(self.sim.particle_flags[j] & (Flags.Infinite | Flags.Global)))): + for _ in Filter(self.sim, ScalarOp.neq(i, j)): + self.compute_interaction(i, j, ishape, jshape, atomic=True) class GlobalGlobalInteraction(ParticleInteraction): @@ -64,7 +62,7 @@ class GlobalGlobalInteraction(ParticleInteraction): i = self.cell_lists.cell_particles[0][p] for _ in Filter(self.sim, ScalarOp.and_op( ScalarOp.cmp(self.sim.particle_shape[i], self.sim.get_shape_id(ishape)), - self.sim.particle_flags[i] & (Flags.Infinite | Flags.Global))): + self.sim.particle_flags[i] & Flags.Global)): for jshape in range(self.maxs): if self.include_interaction(ishape, jshape): # Loop over the global cell @@ -77,12 +75,15 @@ class GlobalGlobalInteraction(ParticleInteraction): for _ in Filter(self.sim, ScalarOp.neq(i, j)): self.compute_interaction(i, j, ishape, jshape) + class GlobalReduction: def __init__(self, sim, module_name, particle_interaction): self.sim = sim self.module_name = module_name self.particle_interaction = particle_interaction self.nglobal_red = sim.add_var('nglobal_red', Types.Int32) # Number of global particles that need reduction + self.min_uid = sim.add_var('min_uid', Types.UInt64) + self.min_idx = sim.add_var('min_idx', Types.Int32) self.nglobal_capacity = sim.add_var('nglobal_capacity', Types.Int32, 64) self.global_elem_capacity = sim.add_var('global_elem_capacity', Types.Int32, 100) self.red_buffer = sim.add_array('red_buffer', [self.nglobal_capacity, self.global_elem_capacity], Types.Real, arr_sync=False) @@ -90,6 +91,8 @@ class GlobalReduction: self.sorted_idx = sim.add_array('sorted_idx', [self.nglobal_capacity], Types.Int32, arr_sync=False) self.unsorted_idx = sim.add_array('unsorted_idx', [self.nglobal_capacity], Types.Int32, arr_sync=False) self.removed_idx = sim.add_array('removed_idx', [self.nglobal_capacity], Types.Boolean, arr_sync=False) + self.nglobal_computed = sim.add_var('nglobal_computed', Types.Int32) + self.globals_computed = sim.add_array('globals_computed', [self.nglobal_capacity], Types.Int32, arr_sync=False) self.red_props = set() for ishape in range(self.sim.max_shapes()): @@ -99,26 +102,44 @@ class GlobalReduction: self.red_props.add(app.prop()) def global_particles(self): - for p in For(self.sim, 0, self.sim.cell_lists.cell_sizes[0]): + for p in For(self.sim, 0, self.sim.cell_lists.cell_sizes[0], serial=True): i = self.sim.cell_lists.cell_particles[0][p] for ishape in range(self.sim.max_shapes()): if self.particle_interaction.include_shape(ishape): for _ in Filter(self.sim, ScalarOp.and_op( ScalarOp.cmp(self.sim.particle_shape[i], self.sim.get_shape_id(ishape)), - self.sim.particle_flags[i] & (Flags.Infinite | Flags.Global))): - yield i + self.sim.particle_flags[i] & Flags.Global)): + yield ishape, i def get_elems_per_particle(self): return sum([Types.number_of_elements(self.sim, p.type()) for p in self.red_props]) +class DetermineGlobalsToCompute(Lowerable): + def __init__(self, global_reduction): + super().__init__(global_reduction.sim) + self.global_reduction = global_reduction + self.sim.add_statement(self) + + @pairs_device_block + def lower(self): + self.sim.module_name(f"{self.global_reduction.module_name}_determine_globals_to_compute") + n = self.global_reduction.nglobal_computed + globals_computed = self.global_reduction.globals_computed + + for ishape, i in self.global_reduction.global_particles(): + for _ in Filter(self.sim, self.global_reduction.particle_interaction.intersects_subdom(ishape, i)): + Assign(self.sim, globals_computed[n], i) + Assign(self.sim, n, n+1) + + class SortGlobals(Lowerable): def __init__(self, global_reduction): super().__init__(global_reduction.sim) self.global_reduction = global_reduction self.sim.add_statement(self) - @pairs_host_block + @pairs_device_block def lower(self): self.sim.module_name(f"{self.global_reduction.module_name}_sort_globals") nglobal_capacity = self.global_reduction.nglobal_capacity @@ -126,23 +147,28 @@ class SortGlobals(Lowerable): unsorted_idx = self.global_reduction.unsorted_idx sorted_idx = self.global_reduction.sorted_idx removed_idx = self.global_reduction.removed_idx + min_uid = self.global_reduction.min_uid + min_idx = self.global_reduction.min_idx uid = self.sim.particle_uid self.sim.check_resize(nglobal_capacity, nglobal_red) - Assign(self.sim, nglobal_red, 0) - for i in self.global_reduction.global_particles(): + # The two kernels below are serialized on device to avoid unnecessary device transfers + # NOTE: The number of global bodies is very small so serializing on device pays off. + + for _, i in self.global_reduction.global_particles(): Assign(self.sim, unsorted_idx[nglobal_red], i) Assign(self.sim, sorted_idx[nglobal_red], 0) Assign(self.sim, removed_idx[nglobal_red], 0) - Assign(self.sim, nglobal_red, nglobal_red +1) - - min_uid = self.sim.add_temp_var(0, Types.UInt64) - min_idx = self.sim.add_temp_var(0) + AtomicInc(self.sim, nglobal_red, 1) # Here we sort indices of global bodies with respect to their uid's. # The sorted uid's will be in identical order on all ranks. This ensures that the # reduced properties are mapped correctly to each global body during inplace reduction. - for i in For(self.sim, 0, nglobal_red): + for i in For(self.sim, 0, nglobal_red, serial=True): + # FIXME: nglobal_red is readonly in this kernel. But it's a resize variable above. + # The line below enforces it to be passed by pointer since it's a device variable. + AtomicInc(self.sim, nglobal_red, 0) + Assign(self.sim, min_uid, -1) # TODO: Lit max: UINT64_MAX Assign(self.sim, min_idx, 0) for j in For(self.sim, 0, nglobal_red): diff --git a/src/pairs/sim/interaction.py b/src/pairs/sim/interaction.py index 1c350cc04174f9c60580865a84e5bf3a48038dc6..0ca80a537ab3cff6617b6b43e96e8b842c375c89 100644 --- a/src/pairs/sim/interaction.py +++ b/src/pairs/sim/interaction.py @@ -428,6 +428,37 @@ class ParticleInteraction(Lowerable): self.sim.add_statement(Filter(self.sim, interaction_data.cutoff_condition, self.blocks[ishape*self.maxs + jshape])) self.apply_reductions(i, ishape, jshape, atomic) + def intersects_subdom(self, ishape, i): + ishape_id = self.sim.get_shape_id(ishape) + + position = self.sim.position() + rotation_matrix = self.sim.property('rotation_matrix') + rm = [Abs(self.sim, rotation_matrix[i][d]) for d in range(self.sim.ndims() ** 2)] + + if ishape_id == Shapes.Sphere: + radius = self.sim.property('radius') + aabb_max = position[i] + radius[i] + aabb_min = position[i] - radius[i] + + if ishape_id == Shapes.Box: + edge_length = self.sim.property('edge_length') + l = edge_length[i] * 0.5 + bound = Vector(self.sim, [ + rm[0]*l[0] + rm[1]*l[1] + rm[2]*l[2], + rm[3]*l[0] + rm[4]*l[1] + rm[5]*l[2], + rm[6]*l[0] + rm[7]*l[1] + rm[8]*l[2]]) + aabb_max = position[i] + bound + aabb_min = position[i] - bound + + intersects = None + for dim in range(self.sim.ndims()): + dom_min = self.cell_lists.dom_part.min(dim) + dom_max = self.cell_lists.dom_part.max(dim) + d_cond = ScalarOp.and_op(aabb_min[dim] < dom_max, aabb_max[dim] >= dom_min) + intersects = d_cond if intersects is None else ScalarOp.and_op(intersects, d_cond) + + return intersects + @pairs_block def lower(self): self.sim.module_name(f"{self.module_name}_local_interactions") diff --git a/src/pairs/transformations/devices.py b/src/pairs/transformations/devices.py index 1eb6f1d36a7d97036de2336257508f0879dcd825..4c4ec154881fcbd71e57724036fdc62412c46fff 100644 --- a/src/pairs/transformations/devices.py +++ b/src/pairs/transformations/devices.py @@ -103,7 +103,7 @@ class AddDeviceKernels(Mutator): def mutate_For(self, ast_node): if ast_node.is_kernel_candidate() and self._device_module: kernel = self.create_kernel(ast_node.sim, ast_node.iterator, ast_node.max, ast_node.block) - ast_node = KernelLaunch(ast_node.sim, kernel, ast_node.iterator, ast_node.min, ast_node.max) + ast_node = KernelLaunch(ast_node.sim, kernel, ast_node.iterator, ast_node.min, ast_node.max, ast_node.serial) else: ast_node.block = self.mutate(ast_node.block)