diff --git a/examples/dem.py b/examples/dem.py
index 965b89d2df475f67bfeafe0dc2ecd33f718cbfb9..60e889b40e2bda6763f9818a938e59ec25c46355 100644
--- a/examples/dem.py
+++ b/examples/dem.py
@@ -122,9 +122,7 @@ minDiameter_SI = diameter_SI * 0.9
 maxDiameter_SI = diameter_SI * 1.1
 linkedCellWidth = 1.01 * maxDiameter_SI
 
-skin = 0.0
 ntypes = 1
-
 lnDryResCoeff = math.log(restitutionCoefficient);
 frictionStatic = 0.0
 frictionDynamic = frictionCoefficient
@@ -182,8 +180,7 @@ psim.setup(update_mass_and_inertia, {'densityParticle_SI': densityParticle_SI,
                                      'infinity': math.inf })
 
 #psim.compute_half()
-#psim.build_cell_lists(linkedCellWidth)
-psim.build_neighbor_lists(linkedCellWidth + skin)
+psim.build_cell_lists(linkedCellWidth)
 psim.vtk_output(f"output/dem_{target}", frequency=visSpacing)
 
 psim.compute(gravity,
@@ -193,7 +190,7 @@ psim.compute(gravity,
                       'pi': math.pi })
 
 psim.compute(linear_spring_dashpot,
-             linkedCellWidth + skin,
+             linkedCellWidth,
              symbols={'dt': dt_SI,
                       'pi': math.pi,
                       'kappa': kappa,
diff --git a/runtime/devices/cuda.cu b/runtime/devices/cuda.cu
index 687bb76600376008c95e6eda496c763c29b9938d..3e4f0672bf04fc924c8ce1683d01e999e5426dd0 100644
--- a/runtime/devices/cuda.cu
+++ b/runtime/devices/cuda.cu
@@ -42,6 +42,18 @@ __host__ void copy_to_host(const void *d_ptr, void *h_ptr, size_t count) {
     CUDA_ASSERT(cudaMemcpy(h_ptr, d_ptr, count, cudaMemcpyDeviceToHost));
 }
 
+__host__ void copy_slice_to_device(const void *h_ptr, void *d_ptr, size_t offset, size_t count) {
+    // Copies `count` bytes from host to device, starting `offset` bytes into both buffers.
+    const void *h_ptr_start = ((const char *) h_ptr) + offset;  // source stays const-qualified
+    CUDA_ASSERT(cudaMemcpy(((char *) d_ptr) + offset, h_ptr_start, count, cudaMemcpyHostToDevice));
+}
+
+__host__ void copy_slice_to_host(const void *d_ptr, void *h_ptr, size_t offset, size_t count) {
+    // Copies `count` bytes from device to host, starting `offset` bytes into both buffers.
+    const void *d_ptr_start = ((const char *) d_ptr) + offset;  // source stays const-qualified
+    CUDA_ASSERT(cudaMemcpy(((char *) h_ptr) + offset, d_ptr_start, count, cudaMemcpyDeviceToHost));
+}
+
 __host__ void copy_static_symbol_to_device(void *h_ptr, const void *d_ptr, size_t count) {
     CUDA_ASSERT(cudaMemcpyToSymbol(d_ptr, h_ptr, count));
 }
diff --git a/runtime/devices/device.hpp b/runtime/devices/device.hpp
index e48d0af95f9d32fc9318e16a12af0b5192f346ed..092cfaedc92f0b93d4f2f8ed7411fa87bebbaf96 100644
--- a/runtime/devices/device.hpp
+++ b/runtime/devices/device.hpp
@@ -17,6 +17,8 @@ __host__ void device_free(void *ptr);
 __host__ void device_synchronize();
 __host__ void copy_to_device(const void *h_ptr, void *d_ptr, size_t count);
 __host__ void copy_to_host(const void *d_ptr, void *h_ptr, size_t count);
+__host__ void copy_slice_to_device(const void *h_ptr, void *d_ptr, size_t offset, size_t count);  // host -> device; in the CUDA backend, `count` bytes at byte `offset` of both buffers
+__host__ void copy_slice_to_host(const void *d_ptr, void *h_ptr, size_t offset, size_t count);  // device -> host; in the CUDA backend, `count` bytes at byte `offset` of both buffers
 __host__ void copy_static_symbol_to_device(void *h_ptr, const void *d_ptr, size_t count);
 __host__ void copy_static_symbol_to_host(void *d_ptr, const void *h_ptr, size_t count);
 
diff --git a/runtime/devices/dummy.cpp b/runtime/devices/dummy.cpp
index 909d50ee7cddce3e3cc8131616e2051f7ea2b859..3b21e6850c08dd94124148c6bef73fe3c126b898 100644
--- a/runtime/devices/dummy.cpp
+++ b/runtime/devices/dummy.cpp
@@ -8,6 +8,8 @@ void device_free(void *ptr) {}
 void device_synchronize() {}
 void copy_to_device(void const *h_ptr, void *d_ptr, size_t count) {}
 void copy_to_host(void const *d_ptr, void *h_ptr, size_t count) {}
+void copy_slice_to_device(void const *h_ptr, void *d_ptr, size_t offset, size_t count) {}  // no-op: this backend performs no transfer
+void copy_slice_to_host(void const *d_ptr, void *h_ptr, size_t offset, size_t count) {}  // no-op: this backend performs no transfer
 void copy_static_symbol_to_device(void *h_ptr, const void *d_ptr, size_t count) {}
 void copy_static_symbol_to_host(void *d_ptr, const void *h_ptr, size_t count) {}
 
diff --git a/runtime/pairs.cpp b/runtime/pairs.cpp
index 7a11d6561ebebd6a9bcb6561378bc4a609d924b7..0b53437b13ffecdc4b80646d3b354aede3a284dd 100644
--- a/runtime/pairs.cpp
+++ b/runtime/pairs.cpp
@@ -158,6 +158,31 @@ FeatureProperty &PairsSimulation::getFeaturePropertyByName(std::string name) {
     return *fp;
 }
 
+void PairsSimulation::copyArraySliceToDevice(
+    Array &array, action_t action, size_t offset, size_t size) {
+    // Host -> device copy of a slice of `array`; `offset` and `size` are forwarded unchanged to pairs::copy_slice_to_device.
+    int array_id = array.getId();
+
+    if(action == Ignore || action == WriteAfterRead || action == ReadOnly) {
+        if(action == Ignore || !array_flags->isDeviceFlagSet(array_id)) {
+            if(!array.isStatic()) {
+                PAIRS_DEBUG(
+                    "Copying array %s to device (offset=%zu, n=%zu)\n",
+                    array.getName().c_str(), offset, size);
+                // offset and size are size_t, hence %zu above (assumes PAIRS_DEBUG is printf-style -- TODO confirm).
+                pairs::copy_slice_to_device(
+                    array.getHostPointer(), array.getDevicePointer(), offset, size);
+            }
+        }
+    }
+    // Every action except ReadOnly clears the host flag; the device flag is always set afterwards.
+    if(action != ReadOnly) {
+        array_flags->clearHostFlag(array_id);
+    }
+
+    array_flags->setDeviceFlag(array_id);
+}
+
 void PairsSimulation::copyArrayToDevice(Array &array, action_t action, size_t size) {
     int array_id = array.getId();
 
@@ -180,6 +205,29 @@ void PairsSimulation::copyArrayToDevice(Array &array, action_t action, size_t si
     array_flags->setDeviceFlag(array_id);
 }
 
+void PairsSimulation::copyArraySliceToHost(Array &array, action_t action, size_t offset, size_t size) {
+    // Device -> host copy of a slice of `array`; `offset` and `size` are forwarded unchanged to pairs::copy_slice_to_host.
+    int array_id = array.getId();
+    if(action == Ignore || action == WriteAfterRead || action == ReadOnly) {
+        if(action == Ignore || !array_flags->isHostFlagSet(array_id)) {
+            if(!array.isStatic()) {
+                PAIRS_DEBUG(
+                    "Copying array %s to host (offset=%zu, n=%zu)\n",
+                    array.getName().c_str(), offset, size);
+                // offset and size are size_t, hence %zu above (assumes PAIRS_DEBUG is printf-style -- TODO confirm).
+                pairs::copy_slice_to_host(
+                    array.getDevicePointer(), array.getHostPointer(), offset, size);
+            }
+        }
+    }
+    // Every action except ReadOnly clears the device flag; the host flag is always set afterwards.
+    if(action != ReadOnly) {
+        array_flags->clearDeviceFlag(array_id);
+    }
+
+    array_flags->setHostFlag(array_id);
+}
+
 void PairsSimulation::copyArrayToHost(Array &array, action_t action, size_t size) {
     int array_id = array.getId();
 
@@ -320,6 +368,15 @@ void PairsSimulation::communicateData(
         nrecv_all += nrecv[d * 2 + 1];
     }
 
+    /*
+    // TODO: this is hard-coded for 6D regular stencil, change it
+    int snd_offset = send_offsets[dim * 2 + 0] * elem_size * sizeof(real_t);
+    int rcv_offset = recv_offsets[dim * 2 + 0] * elem_size * sizeof(real_t);
+    int snd_size = (nsend[dim * 2 + 0] + nsend[dim * 2 + 1]) * elem_size * sizeof(real_t);
+    int rcv_size = (nrecv[dim * 2 + 0] + nrecv[dim * 2 + 1]) * elem_size * sizeof(real_t);
+    */
+
+    //copyArraySliceToHost(send_buf_array, Ignore, snd_offset, snd_size * elem_size * sizeof(real_t)); // NOTE: snd_size as computed above is already in bytes; do not scale it again when enabling this
     copyArrayToHost(send_buf_id, Ignore, nsend_all * elem_size * sizeof(real_t));
     array_flags->setHostFlag(recv_buf_id);
     array_flags->clearDeviceFlag(recv_buf_id);
@@ -331,6 +388,7 @@ void PairsSimulation::communicateData(
     this->getTimers()->stop(Communication);
 
     this->getTimers()->start(DeviceTransfers);
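+    //copyArraySliceToDevice(recv_buf_array, Ignore, rcv_offset, rcv_size * elem_size * sizeof(real_t)); // NOTE: rcv_size as computed in the commented block earlier is already in bytes; do not scale it again when enabling this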
     copyArrayToDevice(recv_buf_id, Ignore, nrecv_all * elem_size * sizeof(real_t));
     this->getTimers()->stop(DeviceTransfers);
 }
diff --git a/runtime/pairs.hpp b/runtime/pairs.hpp
index 415d63467100f3846dce2dee0fced17b014929ee..8944dfda738602867d7a9fb768cf81fd7e083d90 100644
--- a/runtime/pairs.hpp
+++ b/runtime/pairs.hpp
@@ -93,6 +93,7 @@ public:
     }
 
     void copyArrayToDevice(Array &array, action_t action, size_t size);
+    void copyArraySliceToDevice(Array &array, action_t action, size_t offset, size_t size);
 
     void copyArrayToHost(array_t id, action_t action) {
         auto& array = getArray(id);
@@ -104,6 +105,7 @@ public:
     }
 
     void copyArrayToHost(Array &array, action_t action, size_t size);
+    void copyArraySliceToHost(Array &array, action_t action, size_t offset, size_t size);
 
     // Properties
     Property &getProperty(property_t id);