diff --git a/CMakeLists.txt b/CMakeLists.txt
index 17a08e09013f43a0b304f0c973bfc4651a380ebb..32f6d0b872bec9f1824d81eda3021796ebc0d457 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -9,9 +9,11 @@ endif()
 
 set(CUDA_ARCH ${CUDA_ARCH} CACHE STRING "CUDA_ARCH environment variable must be set.")
 
-if(NOT CUDA_ARCH)
-    set(CUDA_ARCH sm_80)
-endif()
+set(CMAKE_CUDA_ARCHITECTURES 80)
+set(CUDA_ARCH 80)
+# if(NOT CUDA_ARCH)
+#     set(CUDA_ARCH 80)
+# endif()
 
 string(TOLOWER "${TESTCASE}" TESTCASE)
 message(STATUS "Selected testcase: ${TESTCASE}")
@@ -38,6 +40,8 @@ set(CPU_SRC "${TESTCASE}.cpp")
 set(GPU_SRC "${TESTCASE}.cu")
 set(CPU_BIN "${TESTCASE}_cpu")
 set(GPU_BIN "${TESTCASE}_gpu")
+set(GEN_HEADER ${CMAKE_CURRENT_BINARY_DIR}/runtime/interfaces/last_generated.hpp)
+set(GEN_HEADER_DIR ${CMAKE_CURRENT_BINARY_DIR}/runtime/interfaces)
 
 set(RUNTIME_COMMON_FILES
     runtime/pairs.cpp
@@ -98,12 +102,13 @@ add_library(runtime_cpu STATIC runtime/devices/dummy.cpp)
 target_link_libraries(${CPU_BIN} runtime_cpu)
 
 add_custom_command(
-    OUTPUT ${CMAKE_BINARY_DIR}/${CPU_SRC}
+    OUTPUT ${CMAKE_BINARY_DIR}/${CPU_SRC} ${GEN_HEADER}
     COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/examples/${TESTCASE}.py cpu
     COMMENT "Generate CPU code"
     DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/examples/${TESTCASE}.py)
-
-add_custom_target(gen_cpu DEPENDS ${CMAKE_BINARY_DIR}/${CPU_SRC})
+    
+target_include_directories(${CPU_BIN} PRIVATE ${GEN_HEADER_DIR})
+add_custom_target(gen_cpu DEPENDS ${CMAKE_BINARY_DIR}/${CPU_SRC} ${GEN_HEADER})
 add_dependencies(${CPU_BIN} gen_cpu)
 
 if(COMPILE_CUDA)
@@ -128,21 +133,23 @@ if(COMPILE_CUDA)
     set_target_properties(runtime_gpu PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
 
     target_link_libraries(${GPU_BIN} runtime_gpu)
-    target_compile_options(${GPU_BIN} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-arch=${CUDA_ARCH}>)
-    target_include_directories(${GPU_BIN} PRIVATE ${CUDA_INCLUDE_DIRS})
+    target_compile_options(${GPU_BIN} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-arch=sm_${CUDA_ARCH}>)
+    target_include_directories(${GPU_BIN} PRIVATE ${CUDA_INCLUDE_DIRS} ${GEN_HEADER_DIR})
     set_target_properties(${GPU_BIN} PROPERTIES CUDA_ARCHITECTURES ${CUDA_ARCH})
+    target_compile_definitions(${GPU_BIN} PRIVATE PAIRS_TARGET_CUDA)
 
-    target_compile_options(runtime_gpu PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-arch=${CUDA_ARCH}>)
-    target_include_directories(runtime_gpu PRIVATE ${CUDA_INCLUDE_DIRS})
+    target_compile_options(runtime_gpu PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-arch=sm_${CUDA_ARCH}>)
+    target_include_directories(runtime_gpu PRIVATE ${CUDA_INCLUDE_DIRS} ${GEN_HEADER_DIR})
     set_target_properties(runtime_gpu PROPERTIES CUDA_ARCHITECTURES ${CUDA_ARCH})
+    target_compile_definitions(runtime_gpu PRIVATE PAIRS_TARGET_CUDA)
 
     add_custom_command(
-        OUTPUT ${GPU_SRC}
+        OUTPUT ${GPU_SRC} ${GEN_HEADER}
         COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/examples/${TESTCASE}.py gpu
         COMMENT "Generate GPU code"
         DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/examples/${TESTCASE}.py)
 
-    add_custom_target(gen_gpu DEPENDS ${GPU_SRC})
+    add_custom_target(gen_gpu DEPENDS ${GPU_SRC} ${GEN_HEADER})
     add_dependencies(${GPU_BIN} gen_gpu)
 endif()
 
diff --git a/runtime/boundary_weights.cpp b/runtime/boundary_weights.cpp
index 1a714d17794f9e5d89612b4034ee605106e466a3..35880c0be9edfe1407fffcfc02865ac281cce881 100644
--- a/runtime/boundary_weights.cpp
+++ b/runtime/boundary_weights.cpp
@@ -8,7 +8,7 @@
 #include "pairs_common.hpp"
 
 // Always include last generated interfaces
-#include "interfaces/last_generated.hpp"
+#include "last_generated.hpp"
 
 #ifdef PAIRS_TARGET_CUDA
 int cuda_compute_boundary_weights(
diff --git a/runtime/devices/cuda.cu b/runtime/devices/cuda.cu
index 0883728550bf5c764e760fa2a4b05e026e76dfab..e9de4c9ff93ece8a84a2234200b4d330725279da 100644
--- a/runtime/devices/cuda.cu
+++ b/runtime/devices/cuda.cu
@@ -1,6 +1,8 @@
 #include <cuda_runtime.h>
 #include <iostream>
 #include <cstring>
+#include "last_generated.hpp"
+#include "../pairs_common.hpp"
 
 #define CUDA_ASSERT(a) { pairs::cuda_assert((a), __FILE__, __LINE__); }
 #define REDUCE_BLOCK_SIZE   64
@@ -72,44 +74,23 @@ __host__ void copy_static_symbol_to_host(void *d_ptr, const void *h_ptr, size_t
     //CUDA_ASSERT(cudaMemcpyFromSymbol(h_ptr, d_ptr, count));
 }
 
-int cuda_compute_boundary_weights(
-    real_t *position, int start, int end, int particle_capacity,
-    real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax) {
-
-    const int nblocks = (end - start + (REDUCE_BLOCK_SIZE - 1)) / REDUCE_BLOCK_SIZE;
-    int *h_weights = (int *) malloc(nblocks * sizeof(int));
-    int *d_weights = (int *) device_alloc(nblocks * sizeof(int));
-    int red = 0;
-
-    CUDA_ASSERT(cudaMemset(d_weights, 0, nblocks * sizeof(int)));
 
-    reduceBoundaryWeights(
-        position, start, particle_capacity, 
-    )
-    CUDA_ASSERT(cudaPeekAtLastError());
-    CUDA_ASSERT(cudaDeviceSynchronize());
-    CUDA_ASSERT(cudaMemcpy(h_weights, d_weights, nblocks * sizeof(int), cudaMemcpyDeviceToHost));
-
-    reduceBoundaryWeights<nblocks, REDUCE_BLOCK_SIZE>();
-
-    for(int i = 0; i < nblocks; i++) {
-        red += h_weights[i];
-    }
-
-    return red;
-}
+void __global__ reduceBoundaryWeights(
+    real_t *position, int start, int end, int particle_capacity,
+    real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax, int *d_weights) {
 
-void __device__ reduceBoundaryWeights() {
     __shared__ int red_data[REDUCE_BLOCK_SIZE];
     int tid = threadIdx.x;
     int i = blockIdx.x * blockDim.x + tid;
-    real_t pos_x = pairs_cuda_interface::get_position(position, start + i, 0, particle_capacity);
-    real_t pos_y = pairs_cuda_interface::get_position(position, start + i, 1, particle_capacity);
-    real_t pos_z = pairs_cuda_interface::get_position(position, start + i, 2, particle_capacity);
+    int particle_idx = start + i;
 
     red_data[tid] = 0;
 
-    if(i < n) {
+    if(particle_idx < end) {
+        real_t pos_x = pairs_cuda_interface::get_position(position, particle_idx, 0, particle_capacity);
+        real_t pos_y = pairs_cuda_interface::get_position(position, particle_idx, 1, particle_capacity);
+        real_t pos_z = pairs_cuda_interface::get_position(position, particle_idx, 2, particle_capacity);
+
         if( pos_x > xmin && pos_x <= xmax &&
             pos_y > ymin && pos_y <= ymax &&
             pos_z > zmin && pos_z <= zmax) {
@@ -134,4 +115,30 @@ void __device__ reduceBoundaryWeights() {
     }
 }
 
+int cuda_compute_boundary_weights(
+    real_t *position, int start, int end, int particle_capacity,
+    real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax) {
+
+    const int nblocks = (end - start + (REDUCE_BLOCK_SIZE - 1)) / REDUCE_BLOCK_SIZE;
+    int *h_weights = (int *) malloc(nblocks * sizeof(int));
+    int *d_weights = (int *) device_alloc(nblocks * sizeof(int));
+    int red = 0;
+
+    CUDA_ASSERT(cudaMemset(d_weights, 0, nblocks * sizeof(int)));
+
+    reduceBoundaryWeights<<<nblocks, REDUCE_BLOCK_SIZE>>>(
+            position, start, end, particle_capacity, 
+            xmin, xmax, ymin, ymax, zmin, zmax, d_weights);
+
+    CUDA_ASSERT(cudaPeekAtLastError());
+    CUDA_ASSERT(cudaDeviceSynchronize());
+    CUDA_ASSERT(cudaMemcpy(h_weights, d_weights, nblocks * sizeof(int), cudaMemcpyDeviceToHost));
+
+    for(int i = 0; i < nblocks; i++) {
+        red += h_weights[i];
+    }
+
+    return red;
+}
+
 }
diff --git a/runtime/devices/device.hpp b/runtime/devices/device.hpp
index e340f12ba4276ac1b7d4e5d67fda245acb115d18..e2740b0c47631ac6f4f78fa9d144bbaa79fabd4f 100644
--- a/runtime/devices/device.hpp
+++ b/runtime/devices/device.hpp
@@ -8,6 +8,8 @@
 #ifndef PAIRS_TARGET_CUDA
 #   define __host__
 typedef int cudaError_t;
+#else
+#include <cuda_runtime.h>
 #endif
 
 namespace pairs {
@@ -71,24 +73,25 @@ inline __host__ int host_atomic_add_resize_check(int *addr, int val, int *resize
 }
 
 #ifdef PAIRS_TARGET_CUDA
-#if __CUDA_ARCH__ < 600
-__device__ double atomicAdd_double(double* address, double val) {
-    unsigned long long int * ull_addr = (unsigned long long int*) address;
-    unsigned long long int old = *ull_addr, assumed;
-
-    do {
-        assumed = old;
-        old = atomicCAS(ull_addr, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
-    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
-    } while (assumed != old);
-
-    return __longlong_as_double(old);
-}
-#else
+// #if __CUDA_ARCH__ < 600
+// #error "CUDA architecture is less than 600"
+// __device__ double atomicAdd_double(double* address, double val) {
+//     unsigned long long int * ull_addr = (unsigned long long int*) address;
+//     unsigned long long int old = *ull_addr, assumed;
+
+//     do {
+//         assumed = old;
+//         old = atomicCAS(ull_addr, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
+//     // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
+//     } while (assumed != old);
+
+//     return __longlong_as_double(old);
+// }
+// #else
 __device__ double atomicAdd_double(double* address, double val) {
     return atomicAdd(address, val);
 }
-#endif
+// #endif
 
 __device__ int atomic_add(int *addr, int val) { return atomicAdd(addr, val); }
 __device__ real_t atomic_add(real_t *addr, real_t val) { return atomicAdd_double(addr, val); }
diff --git a/runtime/domain/ParticleDataHandling.hpp b/runtime/domain/ParticleDataHandling.hpp
index f43c494593a06aebeac543f03fd195dd7817f0e7..b99e8cbde5fff5ad78898b277f4462b52062e9f7 100644
--- a/runtime/domain/ParticleDataHandling.hpp
+++ b/runtime/domain/ParticleDataHandling.hpp
@@ -1,6 +1,8 @@
 #include <blockforest/BlockForest.h>
 #include <blockforest/BlockDataHandling.h>
 
+#pragma once
+
 namespace pairs {
 
 class PairsRuntime;
diff --git a/runtime/interfaces/last_generated.hpp b/runtime/interfaces/last_generated.hpp
deleted file mode 100644
index c8bb6b29da34073cb58e82a8b54dc5d0f832f3ce..0000000000000000000000000000000000000000
--- a/runtime/interfaces/last_generated.hpp
+++ /dev/null
@@ -1,27 +0,0 @@
-#include "../pairs.hpp"
-
-namespace pairs_host_interface {
-
-int get_uid(int *uid, int i) { return uid[i]; }
-int get_shape(int *shape, int i) { return shape[i]; }
-int get_flags(int *flags, int i) { return flags[i]; }
-double get_position(double *position, int i, int j, int capacity) { return position[i * 3 + j]; }
-double get_mass(double *mass, int i) { return mass[i]; }
-double get_linear_velocity(double *linear_velocity, int i, int j, int capacity) { return linear_velocity[i * 3 + j]; }
-double get_force(double *force, int i, int j, int capacity) { return force[i * 3 + j]; }
-int get_type(int *type, int i) { return type[i]; }
-
-}
-
-namespace pairs_cuda_interface {
-
-__inline__ __device__ int get_uid(int *uid, int i) { return uid[i]; }
-__inline__ __device__ int get_shape(int *shape, int i) { return shape[i]; }
-__inline__ __device__ int get_flags(int *flags, int i) { return flags[i]; }
-__inline__ __device__ double get_position(double *position, int i, int j, int capacity) { return position[i * 3 + j]; }
-__inline__ __device__ double get_mass(double *mass, int i) { return mass[i]; }
-__inline__ __device__ double get_linear_velocity(double *linear_velocity, int i, int j, int capacity) { return linear_velocity[i * 3 + j]; }
-__inline__ __device__ double get_force(double *force, int i, int j, int capacity) { return force[i * 3 + j]; }
-__inline__ __device__ int get_type(int *type, int i) { return type[i]; }
-
-}
diff --git a/runtime/timers.hpp b/runtime/timers.hpp
index 02058924ada457213214511a4099a05cf87695ac..c4cdc943aa5faeed57b4971684277e0844d3e7da 100644
--- a/runtime/timers.hpp
+++ b/runtime/timers.hpp
@@ -3,6 +3,8 @@
 #include <iostream>
 #include <unordered_map>
 
+#pragma once
+
 using namespace std;
 
 namespace pairs {
diff --git a/src/pairs/code_gen/cgen.py b/src/pairs/code_gen/cgen.py
index 1a3103a40f8cad7936003fe0fb35781edd328ecf..d4a9d757850a74e01a9d488d12b1284c3bfa7754 100644
--- a/src/pairs/code_gen/cgen.py
+++ b/src/pairs/code_gen/cgen.py
@@ -84,7 +84,7 @@ class CGen:
         #self.print = Printer(f"runtime/interfaces/{self.ref}.hpp")
         self.print = Printer("runtime/interfaces/last_generated.hpp")
         self.print.start()
-        self.print("#include \"../pairs.hpp\"")
+        self.print("#pragma once")
         self.generate_interface_namespace('pairs_host_interface')
         self.generate_interface_namespace('pairs_cuda_interface', "__inline__ __device__")
         self.print.end()
@@ -226,13 +226,14 @@ class CGen:
             self.print(f"{tkw} {vname};")
 
             if self.target.is_gpu() and v.device_flag:
-                self.print(f"RuntimeVar<{tkw}> rv_{vname()};")
+                self.print(f"RuntimeVar<{tkw}> rv_{vname};")
 
         self.print.add_indent(-4)
         self.print("};")
         self.print("")
 
     def generate_program(self, ast_node):
+        self.generate_interfaces()
         ext = ".cu" if self.target.is_gpu() else ".cpp"
         self.print = Printer(self.ref + ext)
         self.print.start()
@@ -248,6 +249,7 @@ class CGen:
         self.print.end()
 
     def generate_library(self, initialize_module, create_domain_module, setup_sim_module,  do_timestep_module):
+        self.generate_interfaces()
         # Generate CUDA/CPP file with modules
         ext = ".cu" if self.target.is_gpu() else ".cpp"
         self.print = Printer(self.ref + ext)