diff --git a/Makefile b/Makefile
index 384f6fb5bd9a28619e71fac66f8f123c378ba180..f9ea605eb59fce79b4eac99a7fe676a57302f252 100644
--- a/Makefile
+++ b/Makefile
@@ -11,7 +11,10 @@ LIKWID_INC ?= -I/usr/local/include
 LIKWID_DEFINES ?= -DLIKWID_PERFMON
 LIKWID_LIB ?= -L/usr/local/lib
 LIKWID_FLAGS = -llikwid ${LIKWID_INC} ${LIKWID_DEFINES} ${LIKWID_LIB}
-CFLAGS=-Ofast -xHost -qopt-zmm-usage=high ${LIKWID_FLAGS}
+#CUDA_FLAGS=
+CUDA_FLAGS=-DENABLE_CUDA_AWARE_MPI
+CFLAGS=-Ofast -march=core-avx2 ${CUDA_FLAGS} ${LIKWID_FLAGS}
+#CFLAGS=-Ofast -xHost -qopt-zmm-usage=high ${LIKWID_FLAGS}
 #CFLAGS=-Ofast -xCORE-AVX512 -qopt-zmm-usage=high ${LIKWID_FLAGS}
 CUDA_BIN_PATH:="$(shell dirname ${NVCC_PATH})"
 CUDA_PATH:="$(shell dirname ${CUDA_BIN_PATH})"
@@ -54,8 +57,8 @@ $(CPU_BIN): $(CPU_SRC) $(OBJ_PATH)/pairs.o $(OBJ_PATH)/regular_6d_stencil.o $(OB
 	$(CC) $(CFLAGS) -o $(CPU_BIN) $(CPU_SRC) $(OBJ_PATH)/pairs.o $(OBJ_PATH)/regular_6d_stencil.o $(OBJ_PATH)/dummy.o $(DEBUG_FLAGS)
 
 $(GPU_BIN): $(GPU_SRC) $(OBJ_PATH)/pairs.o $(OBJ_PATH)/regular_6d_stencil.o
-	$(NVCC) -c -o $(OBJ_PATH)/cuda_runtime.o runtime/devices/cuda.cu ${DEBUG_FLAGS}
-	$(NVCC) -c -o $(OBJ_PATH)/$(GPU_BIN).o $(GPU_SRC) -DDEBUG
+	$(NVCC) $(CUDA_FLAGS) -c -o $(OBJ_PATH)/cuda_runtime.o runtime/devices/cuda.cu ${DEBUG_FLAGS}
+	$(NVCC) $(CUDA_FLAGS) -c -o $(OBJ_PATH)/$(GPU_BIN).o $(GPU_SRC) -DDEBUG
 	$(CC) -o $(GPU_BIN) $(CFLAGS) $(OBJ_PATH)/$(GPU_BIN).o $(OBJ_PATH)/cuda_runtime.o $(OBJ_PATH)/pairs.o $(OBJ_PATH)/regular_6d_stencil.o -lcudart -L$(CUDA_PATH)/lib64
 
 clean:
diff --git a/runtime/devices/device.hpp b/runtime/devices/device.hpp
index 092cfaedc92f0b93d4f2f8ed7411fa87bebbaf96..384891a4753dc4f3d0f4ee21631fbc755182f2d3 100644
--- a/runtime/devices/device.hpp
+++ b/runtime/devices/device.hpp
@@ -1,6 +1,8 @@
 #include <iostream>
 #include <cstddef>
 
+#include "../pairs_common.hpp"
+
 #pragma once
 
 #ifndef PAIRS_TARGET_CUDA
@@ -27,6 +29,12 @@ inline __host__ int host_atomic_add(int *addr, int val) {
     return *addr - val;
 }
 
+inline __host__ real_t host_atomic_add(real_t *addr, real_t val) {
+    real_t tmp = *addr;
+    *addr += val;
+    return tmp;
+}
+
 inline __host__ int host_atomic_add_resize_check(int *addr, int val, int *resize, int capacity) {
     const int add_res = *addr + val;
     if(add_res >= capacity) {
@@ -38,7 +46,27 @@ inline __host__ int host_atomic_add_resize_check(int *addr, int val, int *resize
 }
 
 #ifdef PAIRS_TARGET_CUDA
+#if __CUDA_ARCH__ < 600
+__device__ double atomicAdd_double(double* address, double val) {
+    unsigned long long int * ull_addr = (unsigned long long int*) address;
+    unsigned long long int old = *ull_addr, assumed;
+
+    do {
+        assumed = old;
+        old = atomicCAS(ull_addr, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
+        // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
+    } while (assumed != old);
+
+    return __longlong_as_double(old);
+}
+#else
+__device__ double atomicAdd_double(double* address, double val) {
+    return atomicAdd(address, val);
+}
+#endif
+
 __device__ int atomic_add(int *addr, int val) { return atomicAdd(addr, val); }
+__device__ real_t atomic_add(real_t *addr, real_t val) { return atomicAdd_double(addr, val); }
 __device__ int atomic_add_resize_check(int *addr, int val, int *resize, int capacity) {
     const int add_res = *addr + val;
     if(add_res >= capacity) {
@@ -50,6 +78,7 @@ __device__ int atomic_add_resize_check(int *addr, int val, int *resize, int capa
 }
 #else
 inline int atomic_add(int *addr, int val) { return host_atomic_add(addr, val); }
+inline real_t atomic_add(real_t *addr, real_t val) { return host_atomic_add(addr, val); }
 inline int atomic_add_resize_check(int *addr, int val, int *resize, int capacity) {
     return host_atomic_add_resize_check(addr, val, resize, capacity);
 }
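Editor's note on the new `real_t` overloads: the host fallback originally declared an `int` return type, which silently truncated the returned `real_t`; this is fixed to `real_t` above. Like CUDA's `atomicAdd`, `host_atomic_add` returns the value the accumulator held *before* the addition. A minimal standalone sketch of that contract (assuming `real_t` is `double`, as typedef'd in `pairs_common.hpp`):

```cpp
#include <cassert>

typedef double real_t;

// Mirrors the host fallback added above: fetch-and-add semantics,
// i.e. the *old* value comes back, matching CUDA's atomicAdd().
inline real_t host_atomic_add(real_t *addr, real_t val) {
    real_t tmp = *addr;
    *addr += val;
    return tmp;
}

int main() {
    real_t force = 1.5;
    real_t old = host_atomic_add(&force, 2.5);
    assert(old == 1.5);    // previous value is returned
    assert(force == 4.0);  // accumulator was updated
    return 0;
}
```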
diff --git a/runtime/domain/regular_6d_stencil.cpp b/runtime/domain/regular_6d_stencil.cpp
index 80d617a881a43ac089e4643825a89669f7468745..84bbf2c381e227c59c34641605ab8d9fd4d61efc 100644
--- a/runtime/domain/regular_6d_stencil.cpp
+++ b/runtime/domain/regular_6d_stencil.cpp
@@ -155,9 +155,17 @@ void Regular6DStencil::communicateData(
             MPI_COMM_WORLD, &send_requests[0]);
         */
     } else {
+        #ifdef ENABLE_CUDA_AWARE_MPI
+        cudaMemcpy(
+            recv_prev,
+            send_prev,
+            nsend[dim * 2 + 0] * elem_size * sizeof(real_t),
+            cudaMemcpyDeviceToDevice);
+        #else
         for(int i = 0; i < nsend[dim * 2 + 0] * elem_size; i++) {
             recv_prev[i] = send_prev[i];
         }
+        #endif
     }
 
     if(next[dim] != rank) {
@@ -176,9 +184,17 @@ void Regular6DStencil::communicateData(
             MPI_COMM_WORLD, &send_requests[1]);
         */
     } else {
+        #ifdef ENABLE_CUDA_AWARE_MPI
+        cudaMemcpy(
+            recv_next,
+            send_next,
+            nsend[dim * 2 + 1] * elem_size * sizeof(real_t),
+            cudaMemcpyDeviceToDevice);
+        #else
         for(int i = 0; i < nsend[dim * 2 + 1] * elem_size; i++) {
             recv_next[i] = send_next[i];
         }
+        #endif
     }
 
     //MPI_Waitall(2, recv_requests, MPI_STATUSES_IGNORE);
@@ -215,9 +231,17 @@ void Regular6DStencil::communicateAllData(
             MPI_COMM_WORLD, &recv_requests[d * 2 + 0]);
         */
     } else {
+        #ifdef ENABLE_CUDA_AWARE_MPI
+        cudaMemcpy(
+            recv_prev,
+            send_prev,
+            nsend[d * 2 + 0] * elem_size * sizeof(real_t),
+            cudaMemcpyDeviceToDevice);
+        #else
         for (int i = 0; i < nsend[d * 2 + 0] * elem_size; i++) {
             recv_prev[i] = send_prev[i];
         }
+        #endif
     }
 
     if (next[d] != rank) {
@@ -236,9 +260,17 @@ void Regular6DStencil::communicateAllData(
             MPI_COMM_WORLD, &recv_requests[d * 2 + 1]);
         */
     } else {
+        #ifdef ENABLE_CUDA_AWARE_MPI
+        cudaMemcpy(
+            recv_next,
+            send_next,
+            nsend[d * 2 + 1] * elem_size * sizeof(real_t),
+            cudaMemcpyDeviceToDevice);
+        #else
         for (int i = 0; i < nsend[d * 2 + 1] * elem_size; i++) {
             recv_next[i] = send_next[i];
         }
+        #endif
     }
 }
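Editor's note: all four hunks apply the same pattern. When a rank is its own neighbor along a dimension, the halo exchange degenerates into a local buffer copy; with `ENABLE_CUDA_AWARE_MPI` the buffers are device pointers, so the copy must stay on the device. A minimal sketch of that dispatch (the helper name `self_copy` is illustrative, not from the codebase):

```cpp
#include <cstddef>
#ifdef ENABLE_CUDA_AWARE_MPI
#include <cuda_runtime.h>
#endif

typedef double real_t;

// Copy a self-communication buffer; where the buffer lives decides how.
void self_copy(real_t *recv, const real_t *send, size_t count) {
#ifdef ENABLE_CUDA_AWARE_MPI
    // Buffers are device-resident: device-to-device copy, no host staging.
    cudaMemcpy(recv, send, count * sizeof(real_t), cudaMemcpyDeviceToDevice);
#else
    // Buffers are host-resident: plain element-wise copy.
    for(size_t i = 0; i < count; i++)
        recv[i] = send[i];
#endif
}
```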
diff --git a/runtime/pairs.cpp b/runtime/pairs.cpp
index 0b53437b13ffecdc4b80646d3b354aede3a284dd..67c73f889f0327b54fa2abe74c2132cbc2098aea 100644
--- a/runtime/pairs.cpp
+++ b/runtime/pairs.cpp
@@ -346,8 +346,12 @@ void PairsSimulation::communicateData(
     const real_t *send_buf, const int *send_offsets, const int *nsend,
     real_t *recv_buf, const int *recv_offsets, const int *nrecv) {
 
-    auto send_buf_id = getArrayByHostPointer(send_buf).getId();
-    auto recv_buf_id = getArrayByHostPointer(recv_buf).getId();
+    const real_t *send_buf_ptr = send_buf;
+    real_t *recv_buf_ptr = recv_buf;
+    auto send_buf_array = getArrayByHostPointer(send_buf);
+    auto recv_buf_array = getArrayByHostPointer(recv_buf);
+    auto send_buf_id = send_buf_array.getId();
+    auto recv_buf_id = recv_buf_array.getId();
     auto send_offsets_id = getArrayByHostPointer(send_offsets).getId();
     auto recv_offsets_id = getArrayByHostPointer(recv_offsets).getId();
     auto nsend_id = getArrayByHostPointer(nsend).getId();
@@ -359,6 +363,10 @@ void PairsSimulation::communicateData(
     copyArrayToHost(nsend_id, ReadOnly);
     copyArrayToHost(nrecv_id, ReadOnly);
 
+    #ifdef ENABLE_CUDA_AWARE_MPI
+    send_buf_ptr = send_buf_array.getDevicePointer();
+    recv_buf_ptr = recv_buf_array.getDevicePointer();
+    #else
     int nsend_all = 0;
     int nrecv_all = 0;
     for(int d = 0; d <= dim; d++) {
@@ -374,23 +382,28 @@
     int rcv_offset = recv_offsets[dim * 2 + 0] * elem_size * sizeof(real_t);
     int snd_size = (nsend[dim * 2 + 0] + nsend[dim * 2 + 1]) * elem_size * sizeof(real_t);
     int rcv_size = (nrecv[dim * 2 + 0] + nrecv[dim * 2 + 1]) * elem_size * sizeof(real_t);
+
+    copyArraySliceToHost(send_buf_array, Ignore, snd_offset, snd_size * elem_size * sizeof(real_t));
     */
 
-    //copyArraySliceToHost(send_buf_array, Ignore, snd_offset, snd_size * elem_size * sizeof(real_t));
     copyArrayToHost(send_buf_id, Ignore, nsend_all * elem_size * sizeof(real_t));
     array_flags->setHostFlag(recv_buf_id);
     array_flags->clearDeviceFlag(recv_buf_id);
+    #endif
+
     this->getTimers()->stop(DeviceTransfers);
 
     this->getTimers()->start(Communication);
     this->getDomainPartitioner()->communicateData(
-        dim, elem_size, send_buf, send_offsets, nsend, recv_buf, recv_offsets, nrecv);
+        dim, elem_size, send_buf_ptr, send_offsets, nsend, recv_buf_ptr, recv_offsets, nrecv);
     this->getTimers()->stop(Communication);
 
+    #ifndef ENABLE_CUDA_AWARE_MPI
     this->getTimers()->start(DeviceTransfers);
     //copyArraySliceToDevice(recv_buf_array, Ignore, rcv_offset, rcv_size * elem_size * sizeof(real_t));
     copyArrayToDevice(recv_buf_id, Ignore, nrecv_all * elem_size * sizeof(real_t));
     this->getTimers()->stop(DeviceTransfers);
+    #endif
 }
 
 void PairsSimulation::communicateAllData(
@@ -398,8 +411,12 @@ void PairsSimulation::communicateAllData(
     const real_t *send_buf, const int *send_offsets, const int *nsend,
     real_t *recv_buf, const int *recv_offsets, const int *nrecv) {
 
-    auto send_buf_id = getArrayByHostPointer(send_buf).getId();
-    auto recv_buf_id = getArrayByHostPointer(recv_buf).getId();
+    const real_t *send_buf_ptr = send_buf;
+    real_t *recv_buf_ptr = recv_buf;
+    auto send_buf_array = getArrayByHostPointer(send_buf);
+    auto recv_buf_array = getArrayByHostPointer(recv_buf);
+    auto send_buf_id = send_buf_array.getId();
+    auto recv_buf_id = recv_buf_array.getId();
     auto send_offsets_id = getArrayByHostPointer(send_offsets).getId();
     auto recv_offsets_id = getArrayByHostPointer(recv_offsets).getId();
     auto nsend_id = getArrayByHostPointer(nsend).getId();
@@ -411,6 +428,10 @@ void PairsSimulation::communicateAllData(
     copyArrayToHost(nsend_id, ReadOnly);
     copyArrayToHost(nrecv_id, ReadOnly);
 
+    #ifdef ENABLE_CUDA_AWARE_MPI
+    send_buf_ptr = send_buf_array.getDevicePointer();
+    recv_buf_ptr = recv_buf_array.getDevicePointer();
+    #else
     int nsend_all = 0;
     int nrecv_all = 0;
     for(int d = 0; d <= ndims; d++) {
@@ -423,16 +444,20 @@ void PairsSimulation::communicateAllData(
     copyArrayToHost(send_buf_id, Ignore, nsend_all * elem_size * sizeof(real_t));
     array_flags->setHostFlag(recv_buf_id);
     array_flags->clearDeviceFlag(recv_buf_id);
+    #endif
+
     this->getTimers()->stop(DeviceTransfers);
 
     this->getTimers()->start(Communication);
     this->getDomainPartitioner()->communicateAllData(
-        ndims, elem_size, send_buf, send_offsets, nsend, recv_buf, recv_offsets, nrecv);
+        ndims, elem_size, send_buf_ptr, send_offsets, nsend, recv_buf_ptr, recv_offsets, nrecv);
     this->getTimers()->stop(Communication);
 
+    #ifndef ENABLE_CUDA_AWARE_MPI
    this->getTimers()->start(DeviceTransfers);
     copyArrayToDevice(recv_buf_id, Ignore, nrecv_all * elem_size * sizeof(real_t));
     this->getTimers()->stop(DeviceTransfers);
+    #endif
 }
 
 void PairsSimulation::communicateContactHistoryData(
@@ -440,8 +465,12 @@ void PairsSimulation::communicateContactHistoryData(
     const real_t *send_buf, const int *contact_soffsets, const int *nsend_contact,
     real_t *recv_buf, int *contact_roffsets, int *nrecv_contact) {
 
-    auto send_buf_id = getArrayByHostPointer(send_buf).getId();
-    auto recv_buf_id = getArrayByHostPointer(recv_buf).getId();
+    const real_t *send_buf_ptr = send_buf;
+    real_t *recv_buf_ptr = recv_buf;
+    auto send_buf_array = getArrayByHostPointer(send_buf);
+    auto recv_buf_array = getArrayByHostPointer(recv_buf);
+    auto send_buf_id = send_buf_array.getId();
+    auto recv_buf_id = recv_buf_array.getId();
     auto contact_soffsets_id = getArrayByHostPointer(contact_soffsets).getId();
     auto contact_roffsets_id = getArrayByHostPointer(contact_roffsets).getId();
     auto nsend_contact_id = getArrayByHostPointer(nsend_contact).getId();
@@ -459,9 +488,15 @@ void PairsSimulation::communicateContactHistoryData(
         nsend_all += nsend_contact[d * 2 + 1];
     }
 
+    #ifdef ENABLE_CUDA_AWARE_MPI
+    send_buf_ptr = send_buf_array.getDevicePointer();
+    recv_buf_ptr = recv_buf_array.getDevicePointer();
+    #else
     copyArrayToHost(send_buf_id, Ignore, nsend_all * sizeof(real_t));
     array_flags->setHostFlag(recv_buf_id);
     array_flags->clearDeviceFlag(recv_buf_id);
+    #endif
+
     this->getTimers()->stop(DeviceTransfers);
 
     this->getTimers()->start(Communication);
@@ -477,13 +512,18 @@
     }
 
     this->getDomainPartitioner()->communicateData(
-        dim, 1, send_buf, contact_soffsets, nsend_contact, recv_buf, contact_roffsets, nrecv_contact);
+        dim, 1,
+        send_buf_ptr, contact_soffsets, nsend_contact,
+        recv_buf_ptr, contact_roffsets, nrecv_contact);
+
     this->getTimers()->stop(Communication);
 
+    #ifndef ENABLE_CUDA_AWARE_MPI
     this->getTimers()->start(DeviceTransfers);
     copyArrayToDevice(recv_buf_id, Ignore, nrecv_all * sizeof(real_t));
     copyArrayToDevice(contact_roffsets_id, Ignore);
     this->getTimers()->stop(DeviceTransfers);
+    #endif
 }
 
 void PairsSimulation::fillCommunicationArrays(int *neighbor_ranks, int *pbc, real_t *subdom) {
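Editor's note: the effect of these `#ifdef` blocks is that with CUDA-aware MPI the staging copies (`copyArrayToHost` before the exchange, `copyArrayToDevice` after) are skipped entirely, and the device pointers are handed to MPI directly. A minimal standalone sketch of that idea (assumes an MPI build with CUDA support, e.g. Open MPI configured with `--with-cuda`; run with at least two ranks):

```cpp
#include <mpi.h>
#include <cuda_runtime.h>

int main(int argc, char **argv) {
    MPI_Init(&argc, &argv);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    double *d_buf;  // device buffer, never mirrored on the host
    cudaMalloc(&d_buf, 4 * sizeof(double));

    // A CUDA-aware MPI accepts device pointers in the usual calls:
    if(rank == 0)
        MPI_Send(d_buf, 4, MPI_DOUBLE, 1, 0, MPI_COMM_WORLD);
    else if(rank == 1)
        MPI_Recv(d_buf, 4, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);

    cudaFree(d_buf);
    MPI_Finalize();
    return 0;
}
```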
diff --git a/runtime/pairs_common.hpp b/runtime/pairs_common.hpp
index dfb6c2e2e5aec371258f9db803a169c6e528e636..4272bc438164739b5d10455a1e1281b650632a11 100644
--- a/runtime/pairs_common.hpp
+++ b/runtime/pairs_common.hpp
@@ -9,6 +9,12 @@ typedef double real_t;
 //typedef float real_t;
 //#endif
 
+#ifndef PAIRS_TARGET_CUDA
+#   ifdef ENABLE_CUDA_AWARE_MPI
+#       undef ENABLE_CUDA_AWARE_MPI
+#   endif
+#endif
+
 typedef int array_t;
 typedef int property_t;
 typedef int layout_t;
diff --git a/src/pairs/ir/apply.py b/src/pairs/ir/apply.py
index 5271b1161d621ee3675f0d6313bd9e4855c94acf..a6589d5e8551c6b331febd457f925c7dab3d47bf 100644
--- a/src/pairs/ir/apply.py
+++ b/src/pairs/ir/apply.py
@@ -1,5 +1,6 @@
 from pairs.ir.assign import Assign
 from pairs.ir.ast_node import ASTNode
+from pairs.ir.atomic import AtomicInc
 from pairs.ir.block import pairs_inline
 from pairs.ir.branches import Filter
 from pairs.ir.lit import Lit
@@ -116,4 +117,9 @@ class Apply(Lowerable):
                 ScalarOp.and_op(self._j < self.sim.nlocal,
                                 ScalarOp.cmp(self.sim.particle_flags[self._j] & Flags.Fixed, 0))):
-                Assign(self.sim, self._prop[self._j], self._prop[self._j] - self._expr_j)
+                if Types.is_scalar(self._prop.type()):
+                    AtomicInc(self.sim, self._prop[self._j], -self._expr_j)
+
+                else:
+                    for d in range(Types.number_of_elements(self.sim, self._prop.type())):
+                        AtomicInc(self.sim, self._prop[self._j][d], -(self._expr_j[d]))
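Editor's note on switching the `j`-side update from `Assign` to `AtomicInc`: several threads can apply a reaction term to the same neighbor `j` concurrently, so the subtraction has to be a single atomic read-modify-write. A small self-contained illustration (written in C++ rather than the generated code; the CAS loop mirrors the `atomicCAS` fallback added to `device.hpp`):

```cpp
#include <atomic>
#include <thread>
#include <vector>
#include <iostream>

int main() {
    std::atomic<double> force_j{0.0};

    // Each worker plays the role of AtomicInc(prop[j], -expr_j) with expr_j == 1.0.
    auto worker = [&force_j] {
        for(int i = 0; i < 100000; i++) {
            double old = force_j.load();
            // CAS loop: retry until the subtraction applies atomically.
            while(!force_j.compare_exchange_weak(old, old - 1.0)) {}
        }
    };

    std::vector<std::thread> pool;
    for(int t = 0; t < 4; t++) pool.emplace_back(worker);
    for(auto &th : pool) th.join();

    // Always -400000; plain loads and stores could lose updates.
    std::cout << force_j.load() << "\n";
    return 0;
}
```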
diff --git a/src/pairs/ir/ast_term.py b/src/pairs/ir/ast_term.py
index 4c2c7b407a5357650f152b8f49dcd07c4b458a55..a72133d2bc1761a20c9588c1fbbcf8d39f5b7a26 100644
--- a/src/pairs/ir/ast_term.py
+++ b/src/pairs/ir/ast_term.py
@@ -22,6 +22,9 @@ class ASTTerm(ASTNode):
     def __rsub__(self, other):
         return self._class_type(self.sim, other, self, Operators.Sub)
 
+    def __neg__(self):
+        return self._class_type(self.sim, self, None, Operators.USub)
+
     def __mul__(self, other):
         return self._class_type(self.sim, self, other, Operators.Mul)
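Editor's note: this `__neg__` overload is what makes the `-self._expr_j` and `-(self._expr_j[d])` expressions in the `apply.py` hunk above legal — unary minus on an `ASTTerm` now builds an `Operators.USub` node instead of raising a `TypeError`.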