diff --git a/runtime/boundary_weights.hpp b/runtime/boundary_weights.hpp
index 6f625474a24d626889bd43ceb37aeff03552b359..f784d04fbbb378fc349c7d88f0fb9ea67d0c6e57 100644
--- a/runtime/boundary_weights.hpp
+++ b/runtime/boundary_weights.hpp
@@ -15,6 +15,12 @@
 
 #pragma once
 
+#ifdef PAIRS_TARGET_CUDA
+int cuda_compute_boundary_weights(
+    real_t *position, int start, int end, int particle_capacity,
+    real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax);
+#endif
+
 namespace pairs {
 
 void compute_boundary_weights(
@@ -27,15 +33,16 @@ void compute_boundary_weights(
     const int nghost = ps->getNumberOfGhostParticles();
     auto position_prop = ps->getPropertyByName("position");
 
-    //ps->copyPropertyToDevice(position_prop, Ignore);
+    #ifndef PAIRS_TARGET_CUDA
+    auto position_ptr = position_prop->getHostPointer();
 
     *comp_weight = 0;
     *comm_weight = 0;
 
     for(int i = 0; i < nlocal; i++) {
-        real_t pos_x = pairs_interface::get_position(position_ptr, i, 0, particle_capacity);
-        real_t pos_y = pairs_interface::get_position(position_ptr, i, 1, particle_capacity);
-        real_t pos_z = pairs_interface::get_position(position_ptr, i, 2, particle_capacity);
+        real_t pos_x = pairs_host_interface::get_position(position_ptr, i, 0, particle_capacity);
+        real_t pos_y = pairs_host_interface::get_position(position_ptr, i, 1, particle_capacity);
+        real_t pos_z = pairs_host_interface::get_position(position_ptr, i, 2, particle_capacity);
 
         if( pos_x > xmin && pos_x <= xmax &&
             pos_y > ymin && pos_y <= ymax &&
@@ -45,9 +52,9 @@ void compute_boundary_weights(
     }
 
     for(int i = nlocal; i < nlocal + nghost; i++) {
-        real_t pos_x = pairs_interface::get_position(position_ptr, i, 0, particle_capacity);
-        real_t pos_y = pairs_interface::get_position(position_ptr, i, 1, particle_capacity);
-        real_t pos_z = pairs_interface::get_position(position_ptr, i, 2, particle_capacity);
+        real_t pos_x = pairs_host_interface::get_position(position_ptr, i, 0, particle_capacity);
+        real_t pos_y = pairs_host_interface::get_position(position_ptr, i, 1, particle_capacity);
+        real_t pos_z = pairs_host_interface::get_position(position_ptr, i, 2, particle_capacity);
 
         if( pos_x > xmin && pos_x <= xmax &&
             pos_y > ymin && pos_y <= ymax &&
@@ -55,6 +62,17 @@ void compute_boundary_weights(
                 *comm_weight++;
         }
     }
+    #else
+    auto position_ptr = position_prop->getDevicePointer();
+
+    ps->copyPropertyToDevice(position_prop, ReadOnly);
+
+    *comp_weight = cuda_compute_boundary_weights(
+        position_ptr, 0, nlocal, particle_capacity, xmin, xmax, ymin, ymax, zmin, zmax);
+
+    *comm_weight = cuda_compute_boundary_weights(
+        position_ptr, nlocal, nlocal + nghost, particle_capacity, xmin, xmax, ymin, ymax, zmin, zmax);
+    #endif
 }
 
 }
diff --git a/runtime/devices/cuda.cu b/runtime/devices/cuda.cu
index 8bb7c59ef3bbefff0556847669faeae57744372e..0883728550bf5c764e760fa2a4b05e026e76dfab 100644
--- a/runtime/devices/cuda.cu
+++ b/runtime/devices/cuda.cu
@@ -3,6 +3,7 @@
 #include <cstring>
 
 #define CUDA_ASSERT(a) { pairs::cuda_assert((a), __FILE__, __LINE__); }
+#define REDUCE_BLOCK_SIZE   64
 
 namespace pairs {
 
@@ -71,4 +72,66 @@ __host__ void copy_static_symbol_to_host(void *d_ptr, const void *h_ptr, size_t
     //CUDA_ASSERT(cudaMemcpyFromSymbol(h_ptr, d_ptr, count));
 }
 
+int cuda_compute_boundary_weights(
+    real_t *position, int start, int end, int particle_capacity,
+    real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax) {
+
+    const int nblocks = (end - start + (REDUCE_BLOCK_SIZE - 1)) / REDUCE_BLOCK_SIZE;
+    int *h_weights = (int *) malloc(nblocks * sizeof(int));
+    int *d_weights = (int *) device_alloc(nblocks * sizeof(int));
+    int red = 0;
+
+    CUDA_ASSERT(cudaMemset(d_weights, 0, nblocks * sizeof(int)));
+
+    reduceBoundaryWeights(
+        position, start, particle_capacity, 
+    )
+    CUDA_ASSERT(cudaPeekAtLastError());
+    CUDA_ASSERT(cudaDeviceSynchronize());
+    CUDA_ASSERT(cudaMemcpy(h_weights, d_weights, nblocks * sizeof(int), cudaMemcpyDeviceToHost));
+
+    reduceBoundaryWeights<nblocks, REDUCE_BLOCK_SIZE>();
+
+    for(int i = 0; i < nblocks; i++) {
+        red += h_weights[i];
+    }
+
+    return red;
+}
+
+void __device__ reduceBoundaryWeights() {
+    __shared__ int red_data[REDUCE_BLOCK_SIZE];
+    int tid = threadIdx.x;
+    int i = blockIdx.x * blockDim.x + tid;
+    real_t pos_x = pairs_cuda_interface::get_position(position, start + i, 0, particle_capacity);
+    real_t pos_y = pairs_cuda_interface::get_position(position, start + i, 1, particle_capacity);
+    real_t pos_z = pairs_cuda_interface::get_position(position, start + i, 2, particle_capacity);
+
+    red_data[tid] = 0;
+
+    if(i < n) {
+        if( pos_x > xmin && pos_x <= xmax &&
+            pos_y > ymin && pos_y <= ymax &&
+            pos_z > zmin && pos_z <= zmax) {
+                red_data[tid] = 1;
+        }
+    }
+
+    __syncthreads();
+
+    int s = blockDim.x >> 1;
+    while(s > 0) {
+        if(tid < s) {
+            red_data[tid] += red_data[tid + s];
+        }
+
+        __syncthreads();
+        s >>= 1;
+    }
+
+    if(tid == 0) {
+        d_weights[blockIdx.x] = red_data[0];
+    }
+}
+
 }
diff --git a/runtime/domain/ParticleDataHanding.hpp b/runtime/domain/ParticleDataHanding.hpp
index 879b9ae83a943c94f7482de0d5c3abb0d89d7907..b36cd0e0db416a8e38d828efd29dd5e6efb5d657 100644
--- a/runtime/domain/ParticleDataHanding.hpp
+++ b/runtime/domain/ParticleDataHanding.hpp
@@ -74,7 +74,6 @@ private:
     void serializeImpl(Block *const block, const BlockDataID&, mpi::SendBuffer& buffer, const uint_t child, bool check_child) {
         auto ptr = buffer.allocate<uint_t>();
         double aabb_check[6];
-        int nparticles;
 
         if(check_child) {
             const auto child_id = BlockID(block->getId(), child);
@@ -95,28 +94,130 @@ private:
             aabb_check[5] = aabb.zMax();
         }
 
-        nparticles = md_serialize_particles(aabb_check);
+        for(auto& p: ps->getNonVolatileProperties()) {
+            ps->copyPropertyToHost(p, ReadOnly);
+        }
 
-        for(int i = 0; i < nparticles * 7; ++i) {
-            buffer << md_get_send_buffer_value(i);
+        auto position = ps->getPropertyByName("position");
+        int nlocal = ps->getNumberOfLocalParticles();
+        int i = 0;
+        int nserialized = 0;
+
+        while(i < nlocal) {
+            const real_t pos_x = position(i, 0);
+            const real_t pos_y = position(i, 1);
+            const real_t pos_z = position(i, 2);
+
+            if( pos_x > aabb_check[0] && pos_x <= aabb_check[1] &&
+                pos_y > aabb_check[2] && pos_y <= aabb_check[3] &&
+                pos_z > aabb_check[4] && pos_z <= aabb_check[5]) {
+
+                nlocal--;
+
+                for(auto &p: ps->getNonVolatileProperties()) {
+                    auto prop = ps->getProperty(p_id);
+                    auto prop_type = prop.getType();
+
+                    if(prop_type == Prop_Vector) {
+                        auto vector_ptr = ps->getAsVectorProperty(prop);
+                        constexpr int nelems = 3;
+
+                        for(int e = 0; e < nelems; e++) {
+                            buffer << vector_ptr(i, e);
+                            vector_ptr(i, e) = vector_ptr(nlocal, e);
+                        }
+                    } else if(prop_type == Prop_Matrix) {
+                        auto matrix_ptr = ps->getAsMatrixProperty(prop);
+                        constexpr int nelems = 9;
+
+                        for(int e = 0; e < nelems; e++) {
+                            buffer << matrix_ptr(i, e);
+                            matrix_ptr(i, e) = matrix_ptr(nlocal, e);
+                        }
+                    } else if(prop_type == Prop_Quaternion) {
+                        auto quat_ptr = ps->getAsQuaternionProperty(prop);
+                        constexpr int nelems = 4;
+
+                        for(int e = 0; e < nelems; e++) {
+                            buffer << quat_ptr(i, e);
+                            quat_ptr(i, e) = quat_ptr(nlocal, e);
+                        }
+                    } else if(prop_type == Prop_Integer) {
+                        auto int_ptr = ps->getAsIntegerProperty(prop);
+                        buffer << int_ptr(i);
+                        int_ptr(i) = int_ptr(nlocal);
+                    } else if(prop_type == Prop_Real) {
+                        auto float_ptr = ps->getAsFloatProperty(prop);
+                        buffer << float_ptr(i);
+                        float_ptr(i) = float_ptr(nlocal);
+                    } else {
+                        std::cerr << "serializeImpl(): Invalid property type!" << std::endl;
+                        return 0;
+                    }
+                }
+
+                // TODO: serialize contact history data as well
+                nserialized++;
+            }
         }
 
-        *ptr = (uint_t) nparticles;
+        ps->setNumberOfLocalParticles(nlocal);
+        *ptr = (uint_t) nserialized;
     }
 
     void deserializeImpl(IBlock *const, const BlockDataID&, mpi::RecvBuffer& buffer) {
-        uint_t nparticles;
-        buffer >> nparticles;
-
-        md_resize_recv_buffer_capacity((int) nparticles);
+        int nlocal = ps->getNumberOfLocalParticles();
+        uint_t nrecv;
+
+        buffer >> nrecv;
+
+        // TODO: Check if there is enough particle capacity for the new particles
+        // md_resize_recv_buffer_capacity((int) nparticles);
+
+        for(int i = 0; i < nrecv; ++i) {
+            for(auto &p: ps->getNonVolatileProperties()) {
+                auto prop = ps->getProperty(p_id);
+                auto prop_type = prop.getType();
+
+                if(prop_type == Prop_Vector) {
+                    auto vector_ptr = ps->getAsVectorProperty(prop);
+                    constexpr int nelems = 3;
+
+                    for(int e = 0; e < nelems; e++) {
+                        buffer >> vector_ptr(nlocal + i, e);
+                    }
+                } else if(prop_type == Prop_Matrix) {
+                    auto matrix_ptr = ps->getAsMatrixProperty(prop);
+                    constexpr int nelems = 9;
+
+                    for(int e = 0; e < nelems; e++) {
+                        buffer >> matrix_ptr(nlocal + i, e);
+                    }
+                } else if(prop_type == Prop_Quaternion) {
+                    auto quat_ptr = ps->getAsQuaternionProperty(prop);
+                    constexpr int nelems = 4;
+
+                    for(int e = 0; e < nelems; e++) {
+                        buffer >> quat_ptr(nlocal + i, e);
+                    }
+                 } else if(prop_type == Prop_Integer) {
+                    auto int_ptr = ps->getAsIntegerProperty(prop);
+                    buffer >> int_ptr(nlocal + i);
+                } else if(prop_type == Prop_Real) {
+                    auto float_ptr = ps->getAsFloatProperty(prop);
+                    buffer >> float_ptr(nlocal + i);
+                } else {
+                    std::cerr << "deserializeImpl(): Invalid property type!" << std::endl;
+                    return 0;
+                }
+            }
+        }
 
-        for(int i = 0; i < (int) nparticles * 7; ++i) {
-            double v;
-            buffer >> v;
-            md_set_recv_buffer_value(i, v);
+        for(auto& p: ps->getNonVolatileProperties()) {
+            ps->clearDeviceFlags(p);
         }
 
-        md_deserialize_particles((int) nparticles);
+        ps->setNumberOfLocalParticles(nlocal + nrecv);
     }
 };
 
diff --git a/runtime/domain/block_forest.cpp b/runtime/domain/block_forest.cpp
index 44d30197437bef211a450f6e6ebfc59d0ff0e04d..02a8fca8204d551e912c4112f99deb2ed71f724c 100644
--- a/runtime/domain/block_forest.cpp
+++ b/runtime/domain/block_forest.cpp
@@ -77,8 +77,6 @@ void BlockForest::updateNeighborhood() {
     }
 }
 
-
-
 void BlockForest::copyRuntimeArray(const std::string& name, void *dest, const int size) {
     void *src = name.compare('ranks') ? ranks.data() :
                 name.compare('naabbs') ? vec_naabbs.data() :
@@ -265,7 +263,8 @@ void BlockForest::initializeWorkloadBalancer() {
     forest->allowMultipleRefreshCycles(false);
     forest->checkForEarlyOutInRefresh(false);
     forest->checkForLateOutInRefresh(false);
-    forest->setRefreshMinTargetLevelDeterminationFunction(pe::amr::MinMaxLevelDetermination(info, regridMin, regridMax));
+    forest->setRefreshMinTargetLevelDeterminationFunction(
+        pe::amr::MinMaxLevelDetermination(info, regridMin, regridMax));
 
     for_each(algorithm.begin(), algorithm.end(), [](char& c) { c = (char) ::tolower(c); });