diff --git a/Makefile b/Makefile
index 384f6fb5bd9a28619e71fac66f8f123c378ba180..f9ea605eb59fce79b4eac99a7fe676a57302f252 100644
--- a/Makefile
+++ b/Makefile
@@ -11,7 +11,10 @@ LIKWID_INC ?= -I/usr/local/include
 LIKWID_DEFINES ?= -DLIKWID_PERFMON
 LIKWID_LIB ?= -L/usr/local/lib
 LIKWID_FLAGS = -llikwid ${LIKWID_INC} ${LIKWID_DEFINES} ${LIKWID_LIB}
-CFLAGS=-Ofast -xHost -qopt-zmm-usage=high ${LIKWID_FLAGS}
+#CUDA_FLAGS=
+CUDA_FLAGS=-DENABLE_CUDA_AWARE_MPI
+CFLAGS=-Ofast -march=core-avx2 ${CUDA_FLAGS} ${LIKWID_FLAGS}
+#CFLAGS=-Ofast -xHost -qopt-zmm-usage=high ${LIKWID_FLAGS}
 #CFLAGS=-Ofast -xCORE-AVX512 -qopt-zmm-usage=high ${LIKWID_FLAGS}
 CUDA_BIN_PATH:="$(shell dirname ${NVCC_PATH})"
 CUDA_PATH:="$(shell dirname ${CUDA_BIN_PATH})"
@@ -54,8 +57,8 @@ $(CPU_BIN): $(CPU_SRC) $(OBJ_PATH)/pairs.o $(OBJ_PATH)/regular_6d_stencil.o $(OB
 	$(CC) $(CFLAGS) -o $(CPU_BIN) $(CPU_SRC) $(OBJ_PATH)/pairs.o $(OBJ_PATH)/regular_6d_stencil.o $(OBJ_PATH)/dummy.o $(DEBUG_FLAGS)
 
 $(GPU_BIN): $(GPU_SRC) $(OBJ_PATH)/pairs.o $(OBJ_PATH)/regular_6d_stencil.o 
-	$(NVCC) -c -o $(OBJ_PATH)/cuda_runtime.o runtime/devices/cuda.cu ${DEBUG_FLAGS}
-	$(NVCC) -c -o $(OBJ_PATH)/$(GPU_BIN).o $(GPU_SRC) -DDEBUG
+	$(NVCC) $(CUDA_FLAGS) -c -o $(OBJ_PATH)/cuda_runtime.o runtime/devices/cuda.cu ${DEBUG_FLAGS}
+	$(NVCC) $(CUDA_FLAGS) -c -o $(OBJ_PATH)/$(GPU_BIN).o $(GPU_SRC) -DDEBUG
 	$(CC) -o $(GPU_BIN) $(CFLAGS) $(OBJ_PATH)/$(GPU_BIN).o $(OBJ_PATH)/cuda_runtime.o $(OBJ_PATH)/pairs.o $(OBJ_PATH)/regular_6d_stencil.o -lcudart -L$(CUDA_PATH)/lib64
 
 clean:
diff --git a/runtime/devices/device.hpp b/runtime/devices/device.hpp
index 092cfaedc92f0b93d4f2f8ed7411fa87bebbaf96..384891a4753dc4f3d0f4ee21631fbc755182f2d3 100644
--- a/runtime/devices/device.hpp
+++ b/runtime/devices/device.hpp
@@ -1,6 +1,8 @@
 #include <iostream>
 #include <cstddef>
 
+#include "../pairs_common.hpp"
+
 #pragma once
 
 #ifndef PAIRS_TARGET_CUDA
@@ -27,6 +29,12 @@ inline __host__ int host_atomic_add(int *addr, int val) {
     return *addr - val;
 }
 
+inline __host__ real_t host_atomic_add(real_t *addr, real_t val) {
+    real_t tmp = *addr;
+    *addr += val;
+    return tmp;
+}
+
 inline __host__ int host_atomic_add_resize_check(int *addr, int val, int *resize, int capacity) {
     const int add_res = *addr + val;
     if(add_res >= capacity) {
@@ -38,7 +46,27 @@ inline __host__ int host_atomic_add_resize_check(int *addr, int val, int *resize
 }
 
 #ifdef PAIRS_TARGET_CUDA
+#if __CUDA_ARCH__ < 600
+__device__ double atomicAdd_double(double* address, double val) {
+    unsigned long long int * ull_addr = (unsigned long long int*) address;
+    unsigned long long int old = *ull_addr, assumed;
+
+    do {
+        assumed = old;
+        old = atomicCAS(ull_addr, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
+    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
+    } while (assumed != old);
+
+    return __longlong_as_double(old);
+}
+#else
+__device__ double atomicAdd_double(double* address, double val) {
+    return atomicAdd(address, val);
+}
+#endif
+
 __device__ int atomic_add(int *addr, int val) { return atomicAdd(addr, val); }
+__device__ real_t atomic_add(real_t *addr, real_t val) { return atomicAdd_double(addr, val); }
 __device__ int atomic_add_resize_check(int *addr, int val, int *resize, int capacity) {
     const int add_res = *addr + val;
     if(add_res >= capacity) {
@@ -50,6 +78,7 @@ __device__ int atomic_add_resize_check(int *addr, int val, int *resize, int capa
 }
 #else
 inline int atomic_add(int *addr, int val) { return host_atomic_add(addr, val); }
+inline real_t atomic_add(real_t *addr, real_t val) { return host_atomic_add(addr, val); }
 inline int atomic_add_resize_check(int *addr, int val, int *resize, int capacity) {
     return host_atomic_add_resize_check(addr, val, resize, capacity);
 }
diff --git a/runtime/domain/regular_6d_stencil.cpp b/runtime/domain/regular_6d_stencil.cpp
index 80d617a881a43ac089e4643825a89669f7468745..84bbf2c381e227c59c34641605ab8d9fd4d61efc 100644
--- a/runtime/domain/regular_6d_stencil.cpp
+++ b/runtime/domain/regular_6d_stencil.cpp
@@ -155,9 +155,17 @@ void Regular6DStencil::communicateData(
             MPI_COMM_WORLD, &send_requests[0]);
         */
     } else {
+        #ifdef ENABLE_CUDA_AWARE_MPI
+        cudaMemcpy(
+            recv_prev,
+            send_prev,
+            nsend[dim * 2 + 0] * elem_size * sizeof(real_t),
+            cudaMemcpyDeviceToDevice);
+        #else
         for(int i = 0; i < nsend[dim * 2 + 0] * elem_size; i++) {
             recv_prev[i] = send_prev[i];
         }
+        #endif
     }
 
     if(next[dim] != rank) {
@@ -176,9 +184,17 @@ void Regular6DStencil::communicateData(
             MPI_COMM_WORLD, &send_requests[1]);
         */
     } else {
+        #ifdef ENABLE_CUDA_AWARE_MPI
+        cudaMemcpy(
+            recv_next,
+            send_next,
+            nsend[dim * 2 + 1] * elem_size * sizeof(real_t),
+            cudaMemcpyDeviceToDevice);
+        #else
         for(int i = 0; i < nsend[dim * 2 + 1] * elem_size; i++) {
             recv_next[i] = send_next[i];
         }
+        #endif
     }
 
     //MPI_Waitall(2, recv_requests, MPI_STATUSES_IGNORE);
@@ -215,9 +231,17 @@ void Regular6DStencil::communicateAllData(
                 MPI_COMM_WORLD, &recv_requests[d * 2 + 0]);
             */
         } else {
+            #ifdef ENABLE_CUDA_AWARE_MPI
+            cudaMemcpy(
+                recv_prev,
+                send_prev,
+                nsend[d * 2 + 0] * elem_size * sizeof(real_t),
+                cudaMemcpyDeviceToDevice);
+            #else
             for (int i = 0; i < nsend[d * 2 + 0] * elem_size; i++) {
                 recv_prev[i] = send_prev[i];
             }
+            #endif
         }
 
         if (next[d] != rank) {
@@ -236,9 +260,17 @@ void Regular6DStencil::communicateAllData(
                 MPI_COMM_WORLD, &recv_requests[d * 2 + 1]);
             */
         } else {
+            #ifdef ENABLE_CUDA_AWARE_MPI
+            cudaMemcpy(
+                recv_next,
+                send_next,
+                nsend[d * 2 + 1] * elem_size * sizeof(real_t),
+                cudaMemcpyDeviceToDevice);
+            #else
             for (int i = 0; i < nsend[d * 2 + 1] * elem_size; i++) {
                 recv_next[i] = send_next[i];
             }
+            #endif
         }
     }
 
diff --git a/runtime/pairs.cpp b/runtime/pairs.cpp
index 0b53437b13ffecdc4b80646d3b354aede3a284dd..67c73f889f0327b54fa2abe74c2132cbc2098aea 100644
--- a/runtime/pairs.cpp
+++ b/runtime/pairs.cpp
@@ -346,8 +346,12 @@ void PairsSimulation::communicateData(
     const real_t *send_buf, const int *send_offsets, const int *nsend,
     real_t *recv_buf, const int *recv_offsets, const int *nrecv) {
 
-    auto send_buf_id = getArrayByHostPointer(send_buf).getId();
-    auto recv_buf_id = getArrayByHostPointer(recv_buf).getId();
+    const real_t *send_buf_ptr = send_buf;
+    real_t *recv_buf_ptr = recv_buf;
+    auto send_buf_array = getArrayByHostPointer(send_buf);
+    auto recv_buf_array = getArrayByHostPointer(recv_buf);
+    auto send_buf_id = send_buf_array.getId();
+    auto recv_buf_id = recv_buf_array.getId();
     auto send_offsets_id = getArrayByHostPointer(send_offsets).getId();
     auto recv_offsets_id = getArrayByHostPointer(recv_offsets).getId();
     auto nsend_id = getArrayByHostPointer(nsend).getId();
@@ -359,6 +363,10 @@ void PairsSimulation::communicateData(
     copyArrayToHost(nsend_id, ReadOnly);
     copyArrayToHost(nrecv_id, ReadOnly);
 
+    #ifdef ENABLE_CUDA_AWARE_MPI
+    send_buf_ptr = send_buf_array.getDevicePointer();
+    recv_buf_ptr = recv_buf_array.getDevicePointer();
+    #else
     int nsend_all = 0;
     int nrecv_all = 0;
     for(int d = 0; d <= dim; d++) {
@@ -374,23 +382,28 @@ void PairsSimulation::communicateData(
     int rcv_offset = recv_offsets[dim * 2 + 0] * elem_size * sizeof(real_t);
     int snd_size = (nsend[dim * 2 + 0] + nsend[dim * 2 + 1]) * elem_size * sizeof(real_t);
     int rcv_size = (nrecv[dim * 2 + 0] + nrecv[dim * 2 + 1]) * elem_size * sizeof(real_t);
+
+    copyArraySliceToHost(send_buf_array, Ignore, snd_offset, snd_size);
     */
 
-    //copyArraySliceToHost(send_buf_array, Ignore, snd_offset, snd_size * elem_size * sizeof(real_t));
     copyArrayToHost(send_buf_id, Ignore, nsend_all * elem_size * sizeof(real_t));
     array_flags->setHostFlag(recv_buf_id);
     array_flags->clearDeviceFlag(recv_buf_id);
+    #endif
+
     this->getTimers()->stop(DeviceTransfers);
 
     this->getTimers()->start(Communication);
     this->getDomainPartitioner()->communicateData(
-        dim, elem_size, send_buf, send_offsets, nsend, recv_buf, recv_offsets, nrecv);
+        dim, elem_size, send_buf_ptr, send_offsets, nsend, recv_buf_ptr, recv_offsets, nrecv);
     this->getTimers()->stop(Communication);
 
+    #ifndef ENABLE_CUDA_AWARE_MPI
     this->getTimers()->start(DeviceTransfers);
     //copyArraySliceToDevice(recv_buf_array, Ignore, rcv_offset, rcv_size * elem_size * sizeof(real_t));
     copyArrayToDevice(recv_buf_id, Ignore, nrecv_all * elem_size * sizeof(real_t));
     this->getTimers()->stop(DeviceTransfers);
+    #endif
 }
 
 void PairsSimulation::communicateAllData(
@@ -398,8 +411,12 @@ void PairsSimulation::communicateAllData(
     const real_t *send_buf, const int *send_offsets, const int *nsend,
     real_t *recv_buf, const int *recv_offsets, const int *nrecv) {
 
-    auto send_buf_id = getArrayByHostPointer(send_buf).getId();
-    auto recv_buf_id = getArrayByHostPointer(recv_buf).getId();
+    const real_t *send_buf_ptr = send_buf;
+    real_t *recv_buf_ptr = recv_buf;
+    auto send_buf_array = getArrayByHostPointer(send_buf);
+    auto recv_buf_array = getArrayByHostPointer(recv_buf);
+    auto send_buf_id = send_buf_array.getId();
+    auto recv_buf_id = recv_buf_array.getId();
     auto send_offsets_id = getArrayByHostPointer(send_offsets).getId();
     auto recv_offsets_id = getArrayByHostPointer(recv_offsets).getId();
     auto nsend_id = getArrayByHostPointer(nsend).getId();
@@ -411,6 +428,10 @@ void PairsSimulation::communicateAllData(
     copyArrayToHost(nsend_id, ReadOnly);
     copyArrayToHost(nrecv_id, ReadOnly);
 
+    #ifdef ENABLE_CUDA_AWARE_MPI
+    send_buf_ptr = send_buf_array.getDevicePointer();
+    recv_buf_ptr = recv_buf_array.getDevicePointer();
+    #else
     int nsend_all = 0;
     int nrecv_all = 0;
     for(int d = 0; d <= ndims; d++) {
@@ -423,16 +444,20 @@ void PairsSimulation::communicateAllData(
     copyArrayToHost(send_buf_id, Ignore, nsend_all * elem_size * sizeof(real_t));
     array_flags->setHostFlag(recv_buf_id);
     array_flags->clearDeviceFlag(recv_buf_id);
+    #endif
+
     this->getTimers()->stop(DeviceTransfers);
 
     this->getTimers()->start(Communication);
     this->getDomainPartitioner()->communicateAllData(
-        ndims, elem_size, send_buf, send_offsets, nsend, recv_buf, recv_offsets, nrecv);
+        ndims, elem_size, send_buf_ptr, send_offsets, nsend, recv_buf_ptr, recv_offsets, nrecv);
     this->getTimers()->stop(Communication);
 
+    #ifndef ENABLE_CUDA_AWARE_MPI
     this->getTimers()->start(DeviceTransfers);
     copyArrayToDevice(recv_buf_id, Ignore, nrecv_all * elem_size * sizeof(real_t));
     this->getTimers()->stop(DeviceTransfers);
+    #endif
 }
 
 void PairsSimulation::communicateContactHistoryData(
@@ -440,8 +465,12 @@ void PairsSimulation::communicateContactHistoryData(
     const real_t *send_buf, const int *contact_soffsets, const int *nsend_contact,
     real_t *recv_buf, int *contact_roffsets, int *nrecv_contact) {
 
-    auto send_buf_id = getArrayByHostPointer(send_buf).getId();
-    auto recv_buf_id = getArrayByHostPointer(recv_buf).getId();
+    const real_t *send_buf_ptr = send_buf;
+    real_t *recv_buf_ptr = recv_buf;
+    auto send_buf_array = getArrayByHostPointer(send_buf);
+    auto recv_buf_array = getArrayByHostPointer(recv_buf);
+    auto send_buf_id = send_buf_array.getId();
+    auto recv_buf_id = recv_buf_array.getId();
     auto contact_soffsets_id = getArrayByHostPointer(contact_soffsets).getId();
     auto contact_roffsets_id = getArrayByHostPointer(contact_roffsets).getId();
     auto nsend_contact_id = getArrayByHostPointer(nsend_contact).getId();
@@ -459,9 +488,15 @@ void PairsSimulation::communicateContactHistoryData(
         nsend_all += nsend_contact[d * 2 + 1];
     }
 
+    #ifdef ENABLE_CUDA_AWARE_MPI
+    send_buf_ptr = send_buf_array.getDevicePointer();
+    recv_buf_ptr = recv_buf_array.getDevicePointer();
+    #else
     copyArrayToHost(send_buf_id, Ignore, nsend_all * sizeof(real_t));
     array_flags->setHostFlag(recv_buf_id);
     array_flags->clearDeviceFlag(recv_buf_id);
+    #endif
+
     this->getTimers()->stop(DeviceTransfers);
 
     this->getTimers()->start(Communication);
@@ -477,13 +512,18 @@ void PairsSimulation::communicateContactHistoryData(
     }
 
     this->getDomainPartitioner()->communicateData(
-        dim, 1, send_buf, contact_soffsets, nsend_contact, recv_buf, contact_roffsets, nrecv_contact);
+        dim, 1,
+        send_buf_ptr, contact_soffsets, nsend_contact,
+        recv_buf_ptr, contact_roffsets, nrecv_contact);
+
     this->getTimers()->stop(Communication);
 
+    #ifndef ENABLE_CUDA_AWARE_MPI
     this->getTimers()->start(DeviceTransfers);
     copyArrayToDevice(recv_buf_id, Ignore, nrecv_all * sizeof(real_t));
     copyArrayToDevice(contact_roffsets_id, Ignore);
     this->getTimers()->stop(DeviceTransfers);
+    #endif
 }
 
 void PairsSimulation::fillCommunicationArrays(int *neighbor_ranks, int *pbc, real_t *subdom) {
diff --git a/runtime/pairs_common.hpp b/runtime/pairs_common.hpp
index dfb6c2e2e5aec371258f9db803a169c6e528e636..4272bc438164739b5d10455a1e1281b650632a11 100644
--- a/runtime/pairs_common.hpp
+++ b/runtime/pairs_common.hpp
@@ -9,6 +9,12 @@ typedef double real_t;
 //typedef float real_t;
 //#endif
 
+#ifndef PAIRS_TARGET_CUDA
+#   ifdef ENABLE_CUDA_AWARE_MPI
+#       undef ENABLE_CUDA_AWARE_MPI
+#   endif
+#endif
+
 typedef int array_t;
 typedef int property_t;
 typedef int layout_t;
diff --git a/src/pairs/ir/apply.py b/src/pairs/ir/apply.py
index 5271b1161d621ee3675f0d6313bd9e4855c94acf..a6589d5e8551c6b331febd457f925c7dab3d47bf 100644
--- a/src/pairs/ir/apply.py
+++ b/src/pairs/ir/apply.py
@@ -1,5 +1,6 @@
 from pairs.ir.assign import Assign
 from pairs.ir.ast_node import ASTNode
+from pairs.ir.atomic import AtomicInc
 from pairs.ir.block import pairs_inline
 from pairs.ir.branches import Filter
 from pairs.ir.lit import Lit
@@ -116,4 +117,9 @@ class Apply(Lowerable):
                             ScalarOp.and_op(self._j < self.sim.nlocal,
                                             ScalarOp.cmp(self.sim.particle_flags[self._j] & Flags.Fixed, 0))):
 
-                Assign(self.sim, self._prop[self._j], self._prop[self._j] - self._expr_j)
+                if Types.is_scalar(self._prop.type()):
+                    AtomicInc(self.sim, self._prop[self._j], -self._expr_j)
+
+                else:
+                    for d in range(Types.number_of_elements(self.sim, self._prop.type())):
+                        AtomicInc(self.sim, self._prop[self._j][d], -(self._expr_j[d]))
diff --git a/src/pairs/ir/ast_term.py b/src/pairs/ir/ast_term.py
index 4c2c7b407a5357650f152b8f49dcd07c4b458a55..a72133d2bc1761a20c9588c1fbbcf8d39f5b7a26 100644
--- a/src/pairs/ir/ast_term.py
+++ b/src/pairs/ir/ast_term.py
@@ -22,6 +22,9 @@ class ASTTerm(ASTNode):
     def __rsub__(self, other):
         return self._class_type(self.sim, other, self, Operators.Sub)
 
+    def __neg__(self):
+        return self._class_type(self.sim, self, None, Operators.USub)
+
     def __mul__(self, other):
         return self._class_type(self.sim, self, other, Operators.Mul)