diff --git a/.gitignore b/.gitignore
index fcbb18e667efb2eb74151572917015f302a33eff..4205259e44f53e38f85702960d2499ffcf11103b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,7 +2,7 @@
 functions.dot
 functions.pdf
 **/*.vtk
-build
+build*
 dist
 pairs.egg-info
 output*
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0ecd4d90006027630096f1c57c268bb5d494e7da
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,266 @@
+cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
+project(pairs CXX)
+# Set default build type if none is specified
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build (Debug, Release, etc.)" FORCE)
+endif()
+
+set(CMAKE_CXX_FLAGS_RELEASE "-O3")
+set(CMAKE_CXX_FLAGS_DEBUG "-g -O0 -DDEBUG")
+
+option(USE_MPI                      "USE_MPI" ON)
+option(COMPILE_CUDA                 "COMPILE_CUDA" OFF)
+option(GENERATE_WHOLE_PROGRAM       "Generate the whole program (i.e. including the 'main' function). No additional source files are needed." OFF)
+option(BUILD_APP                    "Build a stand-alone app which uses the P4IRS modular interface. Provide your source files with -DUSER_SOURCE_FILES" OFF)
+
+if(GENERATE_WHOLE_PROGRAM AND BUILD_APP)  # both options produce an executable, so they are mutually exclusive
+    message(FATAL_ERROR "You must choose either GENERATE_WHOLE_PROGRAM or BUILD_APP or neither.\n
+        Choose neither if you only want to use the P4IRS library in your project (in a separate build system).")
+endif()
+
+set(INPUT_SCRIPT ${INPUT_SCRIPT} CACHE PATH "The input python script triggering code generation")
+if(NOT EXISTS "${INPUT_SCRIPT}")  # quoted so an unset/empty INPUT_SCRIPT is still a well-formed EXISTS test
+    message(FATAL_ERROR "INPUT_SCRIPT doesn't exist! Specify it with -DINPUT_SCRIPT=/path/to/script.py")
+endif()
+get_filename_component(INPUT_SCRIPT_NAME "${INPUT_SCRIPT}" NAME_WE)  # script name without directory and extension
+
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")  # this project's own cmake/ dir, also when used as a subproject
+
+#================================================================================
+# Setup directories =============================================================
+#================================================================================
+file(COPY data DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
+
+set(OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/output)
+if(EXISTS ${OUTPUT_DIR})
+    file(REMOVE_RECURSE ${OUTPUT_DIR})
+endif()
+file(MAKE_DIRECTORY ${OUTPUT_DIR})
+
+#================================================================================
+# Generated header (internally used by runtime files) ===========================
+#================================================================================
+# TODO: Unify all interfaces
+set(GEN_INTERFACE_DIR ${CMAKE_CURRENT_BINARY_DIR}/internal_interfaces)
+set(GEN_INTERFACE_HEADER ${CMAKE_CURRENT_BINARY_DIR}/last_generated.hpp)
+file(MAKE_DIRECTORY ${GEN_INTERFACE_DIR})
+
+#================================================================================
+# RUNTIME_COMMON_FILES ==========================================================
+#================================================================================
+set(RUNTIME_COMMON_FILES
+    runtime/pairs.cpp
+    runtime/copper_fcc_lattice.cpp
+    runtime/create_body.cpp
+    runtime/dem_sc_grid.cpp
+    runtime/read_from_file.cpp
+    runtime/stats.cpp
+    runtime/thermo.cpp
+    runtime/timing.cpp
+    runtime/vtk.cpp
+    runtime/domain/regular_6d_stencil.cpp)
+
+#================================================================================
+# PAIRS_TARGET ==================================================================
+#================================================================================
+set(PAIRS_TARGET "pairs")
+
+# PAIRS dependencies 
+set(PAIRS_LINK_LIBRARIES)
+set(PAIRS_LINK_DIRS ${CMAKE_CURRENT_BINARY_DIR})
+set(PAIRS_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR})
+
+# The target can either be an executable or a static library
+if(GENERATE_WHOLE_PROGRAM OR BUILD_APP)
+    add_executable(${PAIRS_TARGET} ${RUNTIME_COMMON_FILES})
+else()
+    add_library(${PAIRS_TARGET} STATIC ${RUNTIME_COMMON_FILES})
+    list(APPEND PAIRS_LINK_LIBRARIES ${PAIRS_TARGET})
+endif()
+
+# Include P4IRS 'runtime' dir
+target_include_directories(${PAIRS_TARGET} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/runtime)
+list(APPEND PAIRS_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/runtime)
+
+set_target_properties(${PAIRS_TARGET} PROPERTIES 
+    CXX_STANDARD_REQUIRED ON
+    CXX_STANDARD 17
+)
+
+#================================================================================
+# USER_SOURCE_FILES =============================================================
+#================================================================================
+if(BUILD_APP)
+    set(USER_SOURCE_FILES "" CACHE STRING "List of source files to compile (semicolon-separated)")
+    if(NOT USER_SOURCE_FILES)
+        message(FATAL_ERROR "BUILD_APP is ON. You have to specify source files like this:\n
+            -DUSER_SOURCE_FILES=src/main.cpp;src/helper.cpp")
+    endif()
+
+    foreach(file ${USER_SOURCE_FILES})
+        if(NOT EXISTS ${file})
+            message(FATAL_ERROR "File '${file}' does not exist!")
+        endif()
+    endforeach()
+    target_sources(${PAIRS_TARGET} PRIVATE ${USER_SOURCE_FILES})
+endif()
+
+#================================================================================
+# waLBerla ======================================================================
+#================================================================================
+set(WALBERLA_DIR ${WALBERLA_DIR} CACHE PATH "Path to waLBerla source directory (required only when using BlockForest partitioning).")
+
+if(WALBERLA_DIR)
+    if(EXISTS "${WALBERLA_DIR}")
+        target_compile_definitions(${PAIRS_TARGET} PUBLIC USE_WALBERLA)
+    else()
+        message(FATAL_ERROR "Invalid WALBERLA_DIR: '${WALBERLA_DIR}' does not exist.")
+    endif()
+
+    set(RUNTIME_WALBERLA_FILES
+        runtime/domain/block_forest.cpp
+    )
+
+    # TODO: Generate the host/device functions for computing weights
+    if(COMPILE_CUDA)
+        list(APPEND RUNTIME_WALBERLA_FILES runtime/boundary_weights.cu)
+    else()
+        list(APPEND RUNTIME_WALBERLA_FILES runtime/boundary_weights.cpp)
+    endif()
+
+    target_sources(${PAIRS_TARGET} PRIVATE ${RUNTIME_WALBERLA_FILES})
+
+    ## Linking walberla modules
+    set(PAIRS_WALBERLA_DEPENDENCIES blockforest core pe)
+    find_package(waLBerla REQUIRED)
+    set(WALBERLA_LINK_LIBRARIES_KEYWORD PUBLIC)
+    target_link_modules(${PAIRS_TARGET} ${PAIRS_WALBERLA_DEPENDENCIES})     # This is a walberla helper function
+
+    ## TODO: PAIRS_LINK_DIRS and PAIRS_LINK_LIBRARIES for walberla modules *AND* their dependencies
+    ## This implementation only works if the consumer of the library is itself a walberla app (made within the build system of walberla)
+    list(APPEND PAIRS_LINK_LIBRARIES ${PAIRS_WALBERLA_DEPENDENCIES})
+endif()
+
+#================================================================================
+# Install pairs python package ==================================================
+#================================================================================
+set(PYTHON_EXECUTABLE ${PYTHON_EXECUTABLE} CACHE STRING "Python executable")
+
+if(NOT PYTHON_EXECUTABLE)
+    set(PYTHON_EXECUTABLE python3)
+endif()
+
+# Build and install the pairs python package at configure time; warn (do not abort) if either step fails
+execute_process(COMMAND ${PYTHON_EXECUTABLE} setup.py build OUTPUT_QUIET
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} RESULT_VARIABLE PAIRS_PY_BUILD_RESULT)
+execute_process(COMMAND ${PYTHON_EXECUTABLE} setup.py install --user OUTPUT_QUIET
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} RESULT_VARIABLE PAIRS_PY_INSTALL_RESULT)
+
+if(NOT PAIRS_PY_BUILD_RESULT EQUAL 0 OR NOT PAIRS_PY_INSTALL_RESULT EQUAL 0)
+    message(WARNING "Installing the pairs python package failed (build: '${PAIRS_PY_BUILD_RESULT}', install: '${PAIRS_PY_INSTALL_RESULT}'). Code generation may fail.")
+endif()
+
+#================================================================================
+# CUDA ==========================================================================
+#================================================================================
+if(COMPILE_CUDA)
+    find_package(CUDA REQUIRED)
+    enable_language(CUDA)
+    set(GEN_SOURCES "${CMAKE_CURRENT_BINARY_DIR}/${INPUT_SCRIPT_NAME}.cu")
+    set(CUDA_ARCH ${CUDA_ARCH} CACHE STRING "CUDA_ARCH environment variable must be set.")
+    set(TARGET_ARG "gpu")
+
+    # Default arch is 80
+    if(NOT CUDA_ARCH)
+        set(CUDA_ARCH 80)
+    endif()
+
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -rdc=true")
+
+    if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+        set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -g -G -O0 -DDEBUG")
+    else()
+        set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3")
+    endif()
+
+    if(NOT DEFINED ENABLE_GPU_DIRECT)
+        set(ENABLE_GPU_DIRECT ON CACHE BOOL "Enable GPU Direct (default: ON when COMPILE_CUDA is ON)")
+    else()
+        # Keep the user's choice (e.g. -DENABLE_GPU_DIRECT=OFF); this only attaches a type and docstring to the cache entry
+        set(ENABLE_GPU_DIRECT ${ENABLE_GPU_DIRECT} CACHE BOOL "Enable GPU Direct (user-defined)" FORCE)
+    endif()
+
+    if(ENABLE_GPU_DIRECT)
+        target_compile_definitions(${PAIRS_TARGET} PRIVATE ENABLE_CUDA_AWARE_MPI)
+    endif()
+
+    target_sources(${PAIRS_TARGET} PRIVATE runtime/devices/cuda.cu)
+    target_compile_definitions(${PAIRS_TARGET} PUBLIC PAIRS_TARGET_CUDA)
+    target_include_directories(${PAIRS_TARGET} PUBLIC ${CUDA_INCLUDE_DIRS})
+    list(APPEND PAIRS_INCLUDE_DIRS ${CUDA_INCLUDE_DIRS})
+
+    set_target_properties(${PAIRS_TARGET} PROPERTIES 
+        CUDA_RESOLVE_DEVICE_SYMBOLS ON
+        CUDA_STANDARD 17
+        CUDA_SEPARABLE_COMPILATION ON
+        CUDA_ARCHITECTURES ${CUDA_ARCH})
+
+    target_link_libraries(${PAIRS_TARGET} PUBLIC ${CUDA_LIBRARIES})
+    list(APPEND PAIRS_LINK_LIBRARIES ${CUDA_LIBRARIES})
+else()
+    if(ENABLE_GPU_DIRECT)
+        message(FATAL_ERROR "Invalid combination: ENABLE_GPU_DIRECT requires COMPILE_CUDA to be ON")
+    endif()
+
+    set(GEN_SOURCES "${CMAKE_CURRENT_BINARY_DIR}/${INPUT_SCRIPT_NAME}.cpp")
+    set(TARGET_ARG "cpu")
+    target_sources(${PAIRS_TARGET} PRIVATE runtime/devices/dummy.cpp)
+endif()
+
+#================================================================================
+# Generate code and add them to PAIRS_TARGET ====================================
+#================================================================================
+add_custom_command(
+    OUTPUT ${GEN_SOURCES} ${GEN_INTERFACE_HEADER}
+    COMMAND ${PYTHON_EXECUTABLE} ${INPUT_SCRIPT} ${TARGET_ARG}
+    COMMENT "Generating code with P4IRS"
+    DEPENDS ${INPUT_SCRIPT})
+
+add_custom_target(generated_code DEPENDS ${GEN_SOURCES} ${GEN_INTERFACE_HEADER})
+add_dependencies(${PAIRS_TARGET} generated_code)
+
+target_sources(${PAIRS_TARGET} PRIVATE ${GEN_SOURCES})
+target_include_directories(${PAIRS_TARGET} PRIVATE 
+    ${GEN_INTERFACE_DIR}            # Interface header USED INTERNALLY by pairs is located here
+    ${CMAKE_CURRENT_BINARY_DIR}     # Generated source and header FOR USER is located here
+)    
+
+#================================================================================
+# MPI ===========================================================================
+#================================================================================
+if(USE_MPI)
+    find_package(MPI REQUIRED)
+    target_include_directories(${PAIRS_TARGET} SYSTEM PUBLIC ${MPI_INCLUDE_PATH})  # target-scoped instead of directory-scoped
+    target_link_libraries(${PAIRS_TARGET} PUBLIC ${MPI_LIBRARIES})
+    list(APPEND PAIRS_LINK_LIBRARIES "${MPI_LIBRARIES}")    # exported to consumers through pairs-config.cmake
+    list(APPEND PAIRS_INCLUDE_DIRS "${MPI_INCLUDE_PATH}")   # exported to consumers through pairs-config.cmake
+endif()
+
+#================================================================================
+# LIKWID ========================================================================
+#================================================================================
+if(LIKWID_DIR)  # LIKWID instrumentation is enabled only when LIKWID_DIR is provided
+    target_compile_definitions(${PAIRS_TARGET} PRIVATE LIKWID_PERFMON)
+    target_compile_options(${PAIRS_TARGET} PRIVATE -pthread)
+    target_link_libraries(${PAIRS_TARGET} PRIVATE ${LIKWID_DIR}/lib/liblikwid.a)
+    list(APPEND PAIRS_LINK_LIBRARIES ${LIKWID_DIR}/lib/liblikwid.a)
+
+    target_include_directories(${PAIRS_TARGET} PRIVATE ${LIKWID_DIR}/include)  # target-scoped instead of directory-scoped
+    list(APPEND PAIRS_INCLUDE_DIRS "${LIKWID_DIR}/include")
+endif()
+
+#================================================================================
+# config file ===================================================================
+#================================================================================
+configure_file("${CMAKE_CURRENT_SOURCE_DIR}/cmake/pairs-config.cmake.in"  # template uses @VAR@ placeholders only
+    "${CMAKE_CURRENT_BINARY_DIR}/pairs-config.cmake" @ONLY)
diff --git a/Makefile b/Makefile
deleted file mode 100644
index 967a1a9f117d174c758b82a6af3c41c296ded89c..0000000000000000000000000000000000000000
--- a/Makefile
+++ /dev/null
@@ -1,87 +0,0 @@
-.PHONY: all build clean
-
-# General settings
-TESTCASE=md
-PYCMD=python3
-
-# C/C++ compiler settings
-CC=mpic++
-CFLAGS=-O3 -mavx2 -mfma -fopenmp ${MPI_FLAGS} ${LIKWID_FLAGS}
-#CFLAGS=-Ofast -xHost -qopt-zmm-usage=high ${MPI_FLAGS} ${LIKWID_FLAGS}
-#CFLAGS=-Ofast -xCORE-AVX512 -qopt-zmm-usage=high ${MPI_FLAGS} ${LIKWID_FLAGS}
-DEBUG_FLAGS=
-#DEBUG_FLAGS=-DDEBUG
-
-# CUDA settings
-NVCC=nvcc
-#NVCC_FLAGS=-O3 -mavx2 -mfma
-NVCC_FLAGS=-O3 -arch=sm_80 -mavx2 -mfma -ffast-math -funroll-loops --forward-unknown-to-host-compiler
-#NVCC_FLAGS=-O3 -arch=sm_80 -march=native -ffast-math -funroll-loops --forward-unknown-to-host-compiler
-NVCC_PATH:="$(shell which ${NVCC})"
-CUDA_FLAGS=-DENABLE_CUDA_AWARE_MPI
-CUDART_FLAGS=-lcudart -L /apps/SPACK/0.19.1/opt/linux-almalinux8-zen/gcc-8.5.0/nvhpc-23.7-bzxcokzjvx4stynglo4u2ffpljajzlam/Linux_x86_64/23.7/cuda/12.2/targets/x86_64-linux/lib
-
-# MPI settings
-MPI_PATH=/apps/SPACK/0.19.1/opt/linux-almalinux8-zen/intel-2021.10.0/openmpi-4.1.6-ijsnjhq77rjc256wlrp52m37rsq6miff
-MPI_FLAGS=-I${MPI_PATH}/include
-
-# Likwid settings
-LIKWID_INC ?= -I/usr/local/include
-LIKWID_DEFINES ?= -DLIKWID_PERFMON
-LIKWID_LIB ?= -L/usr/local/lib
-LIKWID_FLAGS = -llikwid ${LIKWID_INC} ${LIKWID_DEFINES} ${LIKWID_LIB}
-
-# Other
-CPU_OBJ_PATH=obj_cpu
-CPU_SRC="$(TESTCASE).cpp"
-CPU_BIN="$(TESTCASE)_cpu"
-GPU_OBJ_PATH=obj_gpu
-GPU_SRC="$(TESTCASE).cu"
-GPU_BIN="$(TESTCASE)_gpu"
-
-all: clean build $(CPU_BIN) $(GPU_BIN)
-	@echo "Everything was done!"
-
-build:
-	@echo "Building pairs package..."
-	$(PYCMD) setup.py build && $(PYCMD) setup.py install --user
-
-$(CPU_SRC):
-	@echo "Generating and compiling $(TESTCASE) example for CPU..."
-	@mkdir -p $(CPU_OBJ_PATH)
-	$(PYCMD) examples/$(TESTCASE).py cpu
-
-$(GPU_SRC):
-	@echo "Generating and compiling $(TESTCASE) example for GPU..."
-	@mkdir -p $(GPU_OBJ_PATH)
-	$(PYCMD) examples/$(TESTCASE).py gpu
-
-$(CPU_OBJ_PATH)/pairs.o: runtime/pairs.cpp
-	$(CC) -c -o $@ $< $(DEBUG_FLAGS) $(CFLAGS)
-
-$(CPU_OBJ_PATH)/regular_6d_stencil.o: runtime/domain/regular_6d_stencil.cpp
-	$(CC) -c -o $@ $< $(DEBUG_FLAGS) $(CFLAGS)
-
-$(CPU_OBJ_PATH)/dummy.o: runtime/devices/dummy.cpp
-	$(CC) -c -o $@ $< $(DEBUG_FLAGS) $(CFLAGS)
-
-$(GPU_OBJ_PATH)/pairs.o: runtime/pairs.cpp
-	$(CC) -c -o $@ $< $(DEBUG_FLAGS) $(MPI_FLAGS) $(CFLAGS) $(CUDA_FLAGS)
-
-$(GPU_OBJ_PATH)/regular_6d_stencil.o: runtime/domain/regular_6d_stencil.cpp
-	$(CC) -c -o $@ $< $(DEBUG_FLAGS) $(MPI_FLAGS) $(CFLAGS) $(CUDA_FLAGS)
-
-$(GPU_OBJ_PATH)/cuda_runtime.o: runtime/devices/cuda.cu
-	$(NVCC) $(NVCC_FLAGS) -c -o $@ $< $(DEBUG_FLAGS) $(MPI_FLAGS) $(CUDA_FLAGS)
-
-# Targets
-$(CPU_BIN): $(CPU_SRC) $(CPU_OBJ_PATH)/pairs.o $(CPU_OBJ_PATH)/regular_6d_stencil.o $(CPU_OBJ_PATH)/dummy.o
-	$(CC) $(CFLAGS) -o $(CPU_BIN) $(CPU_SRC) $(CPU_OBJ_PATH)/pairs.o $(CPU_OBJ_PATH)/regular_6d_stencil.o $(CPU_OBJ_PATH)/dummy.o $(DEBUG_FLAGS)
-
-$(GPU_BIN): $(GPU_SRC) $(GPU_OBJ_PATH)/pairs.o $(GPU_OBJ_PATH)/regular_6d_stencil.o $(GPU_OBJ_PATH)/cuda_runtime.o
-	$(NVCC) $(NVCC_FLAGS) -c -o $(GPU_OBJ_PATH)/$(GPU_BIN).o $(GPU_SRC) $(DEBUG_FLAGS) $(MPI_FLAGS) $(CUDA_FLAGS)
-	$(CC) -o $(GPU_BIN) $(GPU_OBJ_PATH)/$(GPU_BIN).o $(GPU_OBJ_PATH)/cuda_runtime.o $(GPU_OBJ_PATH)/pairs.o $(GPU_OBJ_PATH)/regular_6d_stencil.o $(CUDART_FLAGS) $(CUDA_FLAGS) $(CFLAGS)
-
-clean:
-	@echo "Cleaning..."
-	rm -rf build $(CPU_BIN) $(GPU_BIN) $(CPU_SRC) $(GPU_SRC) dist pairs.egg-info functions functions.pdf $(CPU_OBJ_PATH) $(GPU_OBJ_PATH)
diff --git a/README.md b/README.md
index bcef6a27a6d2ebb070f8098c59934bcc9fc73a43..882644c488aa6495080216615ba203927e6d0dd6 100644
--- a/README.md
+++ b/README.md
@@ -3,10 +3,6 @@
 P4IRS is an open-source, stand-alone compiler and domain-specific language for particle simulations which aims at generating optimized code for different target hardwares.
 It is released as a Python package and allows users to define kernels, integrators and other particle routines in a high-level and straightforward fashion without the need to implement any backend code.
 
-## Build instructions
-
-There is a Makefile which contains configurable environment variables such as `TESTCASE` compiler parameters evaluate P4IRS performance on different scenarios.
-`TESTCASE` refers to any of the files within the `examples` directory, such as `md` and `dem`.
 
 ## Usage
 
@@ -104,6 +100,64 @@ else:
 psim.generate()
 ```
 
+## Build instructions
+
+P4IRS can be built in 3 different modes using the CMake build system. Before we demonstrate each mode, ensure you have CMake, MPI and CUDA (if targeting GPU execution) available in your environment.
+
+In the following, we assume we have created and navigated to a build directory: `mkdir build; cd build` 
+
+**General CMake flags (applicable to all 3 modes):**  
+* Pass your input script to CMake using `-DINPUT_SCRIPT=path/to/script.py`  
+* Enable CUDA with `-DCOMPILE_CUDA=ON`
+* Enable support for BlockForest domain partitioning and dynamic load balancing by providing the path to waLBerla source directory `-DWALBERLA_DIR=path/to/walberla` (TODO: waLBerla as a submodule)
+
+
+### 1. Whole-program generation:
+---------------------
+In this mode, everything including the `main` function is generated by P4IRS.
+
+1. Set `generate_whole_program=True` in the input script
+2. Set the CMake flag `-DGENERATE_WHOLE_PROGRAM=ON`
+
+Example: Build [md.py](examples/whole-program-generation/md.py)
+```
+cmake -DINPUT_SCRIPT=../examples/whole-program-generation/md.py -DGENERATE_WHOLE_PROGRAM=ON ..
+``` 
+Now call `make` and an **executable** is built.
+
+
+### 2. Modular stand-alone app
+---------------------
+You can build a stand-alone C++ app which uses the P4IRS modular interface.
+
+1. Set `generate_whole_program=False` in the input script
+2. Set the CMake flag `-DBUILD_APP=ON`
+3. Provide the list of your source files to CMake (semicolon-separated): `-DUSER_SOURCE_FILES=path/to/main.cpp;path/to/helper.cpp`
+
+Example: Build the application [sd_1.cpp](examples/modular/sd_1.cpp) with [spring_dashpot.py](examples/modular/spring_dashpot.py)  
+Note: In this example we assume waLBerla has been already cloned next to the P4IRS directory.
+
+```
+cmake -DINPUT_SCRIPT=../examples/modular/spring_dashpot.py -DBUILD_APP=ON -DUSER_SOURCE_FILES=../examples/modular/sd_1.cpp -DWALBERLA_DIR=../../walberla ..
+```
+Now call `make` and an **executable** is built.
+
+
+### 3. P4IRS as a library
+---------------------
+In this mode, P4IRS is compiled as a library that can be integrated into other projects.
+
+1. Set `generate_whole_program=False` in the input script
+2. Ensure both `BUILD_APP` and `GENERATE_WHOLE_PROGRAM` are `OFF` (they are OFF by default)
+
+Configure CMake and call `make` as usual, and a **static library** is built. You can then include P4IRS and its dependencies in your build system as follows:
+```cmake
+find_package(pairs REQUIRED HINTS "path/to/pairs/build" NO_DEFAULT_PATH)
+target_include_directories(my_app PUBLIC ${PAIRS_INCLUDE_DIRS})
+target_link_directories(my_app PUBLIC ${PAIRS_LINK_DIRS})
+target_link_libraries(my_app PUBLIC ${PAIRS_LINK_LIBRARIES})
+```
+
 ## Citations
 
 TBD
diff --git a/cmake/FindwaLBerla.cmake b/cmake/FindwaLBerla.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..8f87e88a03902f1c1af3900a3d4d38b921996682
--- /dev/null
+++ b/cmake/FindwaLBerla.cmake
@@ -0,0 +1,13 @@
+if ( WALBERLA_DIR )
+    # WALBERLA_DIR has to point to the waLBerla source directory
+    # this command builds waLBerla (again) in the current build directory in the subfolder "walberla" (second argument)
+    add_subdirectory( ${WALBERLA_DIR} walberla EXCLUDE_FROM_ALL)
+    
+    waLBerla_import()
+    # Adds the 'src' and 'tests' directory of current app
+    list( APPEND WALBERLA_MODULE_DIRS "${CMAKE_SOURCE_DIR}/src" "${CMAKE_SOURCE_DIR}/tests" )
+    list( REMOVE_DUPLICATES  WALBERLA_MODULE_DIRS )
+    set ( WALBERLA_MODULE_DIRS  ${WALBERLA_MODULE_DIRS} CACHE INTERNAL "All folders that contain modules or tests" )
+else()
+    message( FATAL_ERROR "waLBerla not found - Use 'cmake -DWALBERLA_DIR=path_to_waLBerla_sources  pathToApplicationSources' "  )
+endif()
diff --git a/cmake/pairs-config.cmake.in b/cmake/pairs-config.cmake.in
new file mode 100644
index 0000000000000000000000000000000000000000..fdf10effa045063348fbe03d60608d6db3892cc2
--- /dev/null
+++ b/cmake/pairs-config.cmake.in
@@ -0,0 +1,6 @@
+set ( pairs_SOURCE_DIR @pairs_SOURCE_DIR@ )
+set ( pairs_BINARY_DIR @pairs_BINARY_DIR@ )
+
+set ( PAIRS_LINK_LIBRARIES @PAIRS_LINK_LIBRARIES@ )
+set ( PAIRS_LINK_DIRS @PAIRS_LINK_DIRS@ )
+set ( PAIRS_INCLUDE_DIRS @PAIRS_INCLUDE_DIRS@ )
diff --git a/data/planes.input b/data/planes.input
index 3c84ed90dba0e02f137fb4ae6308c48a56a53930..af3744e15f94a6ec863856ed5913c4fc18d344a3 100644
--- a/data/planes.input
+++ b/data/planes.input
@@ -1,2 +1,2 @@
-100000,0,1,0.0,0.0,0.0,0.0,0.0,1.0,13
-100001,0,1,0.8,0.015,0.2,0.0,0.0,-1.0,13
+0,  1,  0.0,    0.0,    0.0,    0.0,    0.0,    1.0,    13
+0,  1,  0.8,    0.015,  0.2,    0.0,    0.0,    -1.0,   13
diff --git a/data/sd_planes.input b/data/sd_planes.input
new file mode 100644
index 0000000000000000000000000000000000000000..42c7724c8b621dfca891624bc9c3d58aea6ca606
--- /dev/null
+++ b/data/sd_planes.input
@@ -0,0 +1,6 @@
+0, 1, 0, 0, 0, 1, 0, 0, 13
+0, 1, 0, 0, 0, 0, 1, 0, 13
+0, 1, 0, 0, 0, 0, 0, 1, 13
+0, 1, 10, 10, 10, -1, 0, 0, 13
+0, 1, 10, 10, 10, 0, -1, 0, 13
+0, 1, 10, 10, 10, 0, 0, -1, 13
diff --git a/examples/dem2.py b/examples/dem2.py
deleted file mode 100644
index 7ebb47b1a335c4e0728a62500ecb0e2adcd81402..0000000000000000000000000000000000000000
--- a/examples/dem2.py
+++ /dev/null
@@ -1,111 +0,0 @@
-import pairs
-import sys
-
-
-def linear_spring_dashpot(i, j):
-    penetration_depth = squared_distance(i, j) - radius[i] - radius[j]
-    skip_when(penetration_depth >= 0.0)
-
-    delta_ij = delta(i, j)
-    contact_normal = delta_ij / length(delta_ij)
-    k = radius[j] + 0.5 * penetration_depth
-    contact_point = position[j] + contact_normal * k
-
-    rel_vel = -velocity_wf[i] - velocity_wf[j]
-    rel_vel_n = dot(rel_vel, contact_normal) * contact_normal
-    rel_vel_t = rel_vel - rel_vel_n
-
-    fN = stiffness_norm[i, j] * (-penetration_depth) * contact_normal + damping_norm[i, j] * rel_vel_n;
-
-    tan_spring_disp = tangential_spring_displacement[i, j]
-    impact_vel_magnitude = impact_velocity_magnitude[i, j]
-    impact_magnitude = select(impact_vel_magnitude > 0.0, impact_vel_magnitude, length(rel_vel))
-    sticking = is_sticking[i, j]
-
-    rotated_tan_disp = tan_spring_disp - contact_normal * (contact_normal * tan_spring_disp)
-    new_tan_spring_disp = dt * rel_vel_t + \
-                          select(squared_length(rotated_tan_disp) <= 0.0,
-                                 zero_vector(),
-                                 rotated_tan_disp * length(tan_spring_disp) / length(rotated_tan_disp))
-
-    fTLS = stiffness_tan[i, j] * new_tan_spring_disp + damping_tan[i, j] * rel_vel_t
-    fTLS_len = length(fTLS)
-    fTLS_inv_len = 1.0 / fTLS_len
-    t = select(fTLS_len > 0, fTLS / fTLS_inv_len, zero_vector())
-
-    f_friction_abs_static = friction_static[i, j] * length(fN)
-    f_friction_abs_dynamic = friction_dynamic[i, j] * length(fN)
-    tan_vel_threshold = 1e-8
-
-    cond1 = sticking == 1 and length(rel_vel_t) < tan_vel_threshold and fTLS_len < f_friction_abs_static
-    cond2 = sticking == 1 and fTLS_len < f_friction_abs_dynamic
-    f_friction_abs = select(cond1, f_friction_abs_static, f_friction_abs_dynamic)
-    n_sticking = select(cond1 or cond2 or fTLS_len < f_friction_abs_dynamic, 1, 0)
-
-    if not cond1 and not cond2 and stiffness_tan[i, j] > 0.0:
-        tangential_spring_displacement[i, j] = \
-            (f_friction_abs * t - damping_tan[i, j] * rel_vel_t) / stiffness_tan[i, j]
-
-    else:
-        tangential_spring_displacement[i, j] = new_tan_spring_disp
-
-    impact_velocity_magnitude[i, j] = impact_magnitude
-    is_sticking[i, j] = n_sticking
-
-    fTabs = min(fTLS_len, f_friction_abs)
-    fT = fTabs * t
-    force[i] += fN + fT
-
-
-def euler(i):
-    velocity[i] += dt * force[i] / mass[i]
-    position[i] += dt * velocity[i]
-
-
-cmd = sys.argv[0]
-target = sys.argv[1] if len(sys.argv[1]) > 1 else "none"
-if target != 'cpu' and target != 'gpu':
-    print(f"Invalid target, use {cmd} <cpu/gpu>")
-
-
-dt = 0.005
-cutoff_radius = 2.5
-skin = 0.3
-ntypes = 4
-stiffness_norm = 1.0
-stiffness_tan = 1.0
-damping_norm = 1.0
-damping_tan = 1.0
-friction_static = 1.0
-friction_dynamic = 1.0
-
-psim = pairs.simulation("dem", debug=True)
-psim.add_position('position')
-psim.add_property('mass', pairs.double(), 1.0)
-psim.add_property('velocity', pairs.vector())
-psim.add_property('velocity_wf', pairs.vector())
-psim.add_property('force', pairs.vector(), vol=True)
-psim.add_property('radius', pairs.double(), 1.0)
-psim.add_feature('type', ntypes)
-psim.add_feature_property('type', 'stiffness_norm', pairs.double(), [stiffness_norm for i in range(ntypes * ntypes)])
-psim.add_feature_property('type', 'stiffness_tan', pairs.double(), [stiffness_tan for i in range(ntypes * ntypes)])
-psim.add_feature_property('type', 'damping_norm', pairs.double(), [damping_norm for i in range(ntypes * ntypes)])
-psim.add_feature_property('type', 'damping_tan', pairs.double(), [damping_tan for i in range(ntypes * ntypes)])
-psim.add_feature_property('type', 'friction_static', pairs.double(), [friction_static for i in range(ntypes * ntypes)])
-psim.add_feature_property('type', 'friction_dynamic', pairs.double(), [friction_dynamic for i in range(ntypes * ntypes)])
-psim.add_contact_property('is_sticking', pairs.int32(), False)
-psim.add_contact_property('tangential_spring_displacement', pairs.vector(), [0.0, 0.0, 0.0])
-psim.add_contact_property('impact_velocity_magnitude', pairs.double(), 0.0)
-
-psim.read_particle_data("data/fluidized_bed.input", ['mass', 'position', 'velocity'])
-psim.build_neighbor_lists(cutoff_radius + skin)
-psim.vtk_output(f"output/test_{target}")
-psim.compute(linear_spring_dashpot, cutoff_radius, symbols={'dt': dt})
-psim.compute(euler, symbols={'dt': dt})
-
-if target == 'gpu':
-    psim.target(pairs.target_gpu())
-else:
-    psim.target(pairs.target_cpu())
-
-psim.generate()
diff --git a/examples/lift.py b/examples/lift.py
deleted file mode 100644
index 2b48fdf02a0afdcc7075843cdab8a1852cee9e45..0000000000000000000000000000000000000000
--- a/examples/lift.py
+++ /dev/null
@@ -1,14 +0,0 @@
-from coupling.parse_cpp import parse_walberla_file
-from coupling.parse_cpp import get_class_method, print_tree
-
-filename = "mesa_pd/kernel/SpringDashpot.hpp"
-translation_unit = parse_walberla_file(filename)
-
-# subtree = get_subtree(tu.cursor, "walberla::mesa_pd::kernel")
-# print_tree(subtree)
-
-kernel = get_class_method(
-        translation_unit.cursor,
-        "walberla::mesa_pd::kernel::SpringDashpot",
-        "operator()")
-print_tree(kernel)
diff --git a/examples/lj_embedded.py b/examples/lj_embedded.py
deleted file mode 100644
index 160c6081bacc9e0a3cd6e8561ebff9ea2eb7bf15..0000000000000000000000000000000000000000
--- a/examples/lj_embedded.py
+++ /dev/null
@@ -1,32 +0,0 @@
-import pairs
-
-
-dt = 0.005
-cutoff_radius = 2.5
-skin = 0.3
-sigma = 1.0
-epsilon = 1.0
-sigma6 = sigma ** 6
-
-psim = pairs.simulation("lj")
-mass = psim.add_real_property('mass', 1.0)
-position = psim.add_vector_property('position')
-velocity = psim.add_vector_property('velocity')
-force = psim.add_vector_property('force', vol=True)
-psim.from_file("data/minimd_setup_4x4x4.input", ['mass', 'position', 'velocity'])
-psim.create_cell_lists(2.8, 2.8)
-psim.create_neighbor_lists()
-psim.periodic(2.8)
-psim.vtk_output("output/test")
-
-for i, j, delta, rsq in psim.particle_pairs(cutoff_radius, position):
-    sr2 = 1.0 / rsq
-    sr6 = sr2 * sr2 * sr2 * sigma6
-    f = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon
-    force[i].add(delta * f)
-
-for i in psim.particles():
-    velocity[i].add(dt * force[i] / mass[i])
-    position[i].add(dt * velocity[i])
-
-psim.generate()
diff --git a/examples/lj_onetype.py b/examples/lj_onetype.py
deleted file mode 100644
index e703122d5469e96458dd831aedbfa4cf9c456090..0000000000000000000000000000000000000000
--- a/examples/lj_onetype.py
+++ /dev/null
@@ -1,45 +0,0 @@
-import pairs
-import sys
-
-
-def lj(i, j):
-    sr2 = 1.0 / rsq
-    sr6 = sr2 * sr2 * sr2 * sigma6
-    force[i] += delta * 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon
-
-
-def euler(i):
-    velocity[i] += dt * force[i] / mass[i]
-    position[i] += dt * velocity[i]
-
-
-cmd = sys.argv[0]
-target = sys.argv[1] if len(sys.argv[1]) > 1 else "none"
-if target != 'cpu' and target != 'gpu':
-    print(f"Invalid target, use {cmd} <cpu/gpu>")
-
-
-dt = 0.005
-cutoff_radius = 2.5
-skin = 0.3
-sigma = 1.0
-epsilon = 1.0
-sigma6 = sigma ** 6
-
-psim = pairs.simulation("lj", debug=True)
-psim.add_real_property('mass', 1.0)
-psim.add_position('position')
-psim.add_vector_property('velocity')
-psim.add_vector_property('force', vol=True)
-psim.from_file("data/minimd_setup_32x32x32.input", ['mass', 'position', 'velocity'])
-psim.build_neighbor_lists(cutoff_radius + skin)
-psim.vtk_output(f"output/test_{target}")
-psim.compute(lj, cutoff_radius, {'sigma6': sigma6, 'epsilon': epsilon})
-psim.compute(euler, symbols={'dt': dt})
-
-if target == 'gpu':
-    psim.target(pairs.target_gpu())
-else:
-    psim.target(pairs.target_cpu())
-
-psim.generate()
diff --git a/examples/modular/force_reduction.cpp b/examples/modular/force_reduction.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e884b3d0acab91189ffd4465364d9e24319e8992
--- /dev/null
+++ b/examples/modular/force_reduction.cpp
@@ -0,0 +1,125 @@
+#include <iostream>
+//---
+#include "force_reduction.hpp"
+
+int main(int argc, char **argv) {
+
+    auto pairs_sim = std::make_shared<PairsSimulation>();    
+    pairs_sim->initialize();
+    auto ac = std::make_shared<PairsAccessor>(pairs_sim.get());
+    
+    // Set domain
+    auto pairs_runtime = pairs_sim->getPairsRuntime();
+    pairs_runtime->initDomain(&argc, &argv, 0, 0, 0, 0.1, 0.1, 0.1);
+
+    // Create bodies
+    pairs::id_t pUid = pairs::create_sphere(pairs_runtime, 0.0499,   0.0499,   0.07,   0.5, 0.5, 0 ,   1000, 0.0045, 0, 0);
+    
+    // setup_sim after creating all bodies
+    pairs_sim->setup_sim();
+    pairs_sim->update_mass_and_inertia();
+
+    // Track particle
+    //-------------------------------------------------------------------------------------------
+    if (pUid != ac->getInvalidUid()){
+        std::cout<< "Particle " << pUid << " is created in rank " << pairs_sim->rank() << std::endl;
+    }
+
+    MPI_Allreduce(MPI_IN_PLACE, &pUid, 1, MPI_LONG_LONG_INT, MPI_SUM, MPI_COMM_WORLD);
+
+    if (pUid != ac->getInvalidUid()){
+        std::cout<< "Particle " << pUid << " will be tracked by rank " << pairs_sim->rank() << std::endl;
+    }
+
+    // Communicate particles (exchange/ghost)
+    //-------------------------------------------------------------------------------------------
+    pairs_sim->communicate(0);
+    ac->update();
+        
+    // Helper lambdas for demo
+    //-------------------------------------------------------------------------------------------
+    auto pIsLocalInMyRank = [&](pairs::id_t uid){return ac->uidToIdxLocal(uid) != ac->getInvalidIdx();};
+    auto pIsGhostInMyRank = [&](pairs::id_t uid){return ac->uidToIdxGhost(uid) != ac->getInvalidIdx();};
+
+    // Check which rank owns the particle, and which ranks have it as a ghost
+    //-------------------------------------------------------------------------------------------
+    ac->syncUid(PairsAccessor::Host);
+    if (pIsLocalInMyRank(pUid)){
+        std::cout<< "Particle " << pUid << " is local in rank " << pairs_sim->rank() << std::endl;
+    }
+    if (pIsGhostInMyRank(pUid)){
+        std::cout<< "Particle " << pUid << " is ghost in rank " << pairs_sim->rank() << std::endl;
+    }
+
+    // Start timestep loop
+    //-------------------------------------------------------------------------------------------
+    int num_timesteps = 1;
+    for (int t=0; t<num_timesteps; ++t){
+        ac->syncUid(PairsAccessor::Host);
+
+        // Add local contribution
+        //-------------------------------------------------------------------------------------------
+        if (pIsLocalInMyRank(pUid)){
+            int idx = ac->uidToIdxLocal(pUid);
+            pairs::Vector3<double> local_force(0.1, 0.1, 0.1);
+            pairs::Vector3<double> local_torque(0.2, 0.2, 0.2);
+
+            std::cout << "Force on particle " << pUid << " from local rank [" << pairs_sim->rank() << "] : (" 
+                        << local_force[0] << ", " << local_force[1] << ", " << local_force[2] << ")" <<  std::endl;
+
+            ac->setHydrodynamicForce(idx, local_force);
+            ac->setHydrodynamicTorque(idx, local_torque);
+            ac->syncHydrodynamicForce(PairsAccessor::Host, true);
+            ac->syncHydrodynamicTorque(PairsAccessor::Host, true);
+        }
+
+        // Add neighbor contributions
+        //-------------------------------------------------------------------------------------------
+        if (pIsGhostInMyRank(pUid)){
+            int idx = ac->uidToIdxGhost(pUid);
+            pairs::Vector3<double> ghost_force(pairs_sim->rank()*10, 1, 1);
+            pairs::Vector3<double> ghost_torque(pairs_sim->rank()*20, 2, 2);
+
+            std::cout << "Force on particle " << pUid << " from neighbor rank [" << pairs_sim->rank() << "] : (" 
+                        << ghost_force[0] << ", " << ghost_force[1] << ", " << ghost_force[2] << ")" <<  std::endl;
+
+            ac->setHydrodynamicForce(idx, ghost_force);
+            ac->setHydrodynamicTorque(idx, ghost_torque);
+            ac->syncHydrodynamicForce(PairsAccessor::Host, true);
+            ac->syncHydrodynamicTorque(PairsAccessor::Host, true);
+        }
+        
+        // Do computations
+        //-------------------------------------------------------------------------------------------
+        pairs_sim->update_cells(t); 
+        pairs_sim->gravity(); 
+        pairs_sim->spring_dashpot();
+        pairs_sim->euler(5e-5);        
+        //-------------------------------------------------------------------------------------------
+
+        std::cout << "---- reverse_comm and reduce ----" << std::endl;
+        // reverse_comm() communicates data from ghost particles back to their owner ranks using
+        // information from the previous time that communicate() was called 
+        pairs_sim->reverse_comm();  
+
+        // Get the reduced force on the owner rank
+        //-------------------------------------------------------------------------------------------
+        if (pIsLocalInMyRank(pUid)){
+            int idx = ac->uidToIdxLocal(pUid);
+            ac->syncHydrodynamicForce(PairsAccessor::Host);
+            ac->syncHydrodynamicTorque(PairsAccessor::Host);
+            auto force_sum = ac->getHydrodynamicForce(idx);
+            auto torque_sum = ac->getHydrodynamicTorque(idx);
+
+            std::cout << "Reduced force on particle " << pUid << " in local rank [" << pairs_sim->rank() << "] : (" 
+                        << force_sum[0] << ", " << force_sum[1] << ", " << force_sum[2] << ")" <<  std::endl;
+        }
+        
+        // Usual communication 
+        //-------------------------------------------------------------------------------------------
+        pairs_sim->communicate(t);
+        ac->update();
+    }
+
+    pairs_sim->end();
+}
diff --git a/examples/modular/force_reduction.py b/examples/modular/force_reduction.py
new file mode 100644
index 0000000000000000000000000000000000000000..af9cea7058190b7fda485cc1ec3a6fd6cde8b4f1
--- /dev/null
+++ b/examples/modular/force_reduction.py
@@ -0,0 +1,113 @@
+import math
+import pairs
+import sys
+import os
+
+def update_mass_and_inertia(i):
+    rotation_matrix[i] = diagonal_matrix(1.0)
+    rotation[i] = default_quaternion()
+
+    if is_sphere(i):
+        inv_inertia[i] = inversed(diagonal_matrix(0.4 * mass[i] * radius[i] * radius[i]))
+
+    else:
+        mass[i] = infinity
+        inv_inertia[i] = 0.0
+
+def spring_dashpot(i, j):
+    delta_ij = -penetration_depth(i, j)
+    skip_when(delta_ij < 0.0)
+    
+    velocity_wf_i = linear_velocity[i] + cross(angular_velocity[i], contact_point(i, j) - position[i])
+    velocity_wf_j = linear_velocity[j] + cross(angular_velocity[j], contact_point(i, j) - position[j])
+    
+    rel_vel = -(velocity_wf_i - velocity_wf_j)
+    rel_vel_n = dot(rel_vel, contact_normal(i, j))
+    rel_vel_t = rel_vel - rel_vel_n * contact_normal(i, j)
+
+    fNabs = stiffness[i,j] * delta_ij + damping_norm[i,j] * rel_vel_n
+    fN = fNabs * contact_normal(i, j)
+
+    fTabs = min(damping_tan[i,j] * length(rel_vel_t), friction[i, j] * fNabs)
+    fT = fTabs * normalized(rel_vel_t)
+
+    partial_force = fN + fT
+    apply(force, partial_force)
+    apply(torque, cross(contact_point(i, j) - position, partial_force))
+
+def euler(i):
+    inv_mass = 1.0 / mass[i]
+    position[i] +=  0.5 * inv_mass * force[i] * dt * dt + linear_velocity[i] * dt
+    linear_velocity[i] += inv_mass * force[i] * dt
+    wdot = rotation_matrix[i] * (inv_inertia[i] * torque[i]) * transposed(rotation_matrix[i])
+    phi = angular_velocity[i] * dt + 0.5 * wdot * dt * dt
+    rotation[i] = quaternion(phi, length(phi)) * rotation[i]
+    rotation_matrix[i] = quaternion_to_rotation_matrix(rotation[i])
+    angular_velocity[i] += wdot * dt
+
+def gravity(i):
+    force[i][2] -= mass[i] * gravity_SI  # subtract the particle's weight from the accumulated z-force
+
+
+file_name = os.path.basename(__file__)
+file_name_without_extension = os.path.splitext(file_name)[0]
+
+psim = pairs.simulation(
+    file_name_without_extension,
+    [pairs.sphere(), pairs.halfspace()],
+    double_prec=True,
+    particle_capacity=1000000,
+    neighbor_capacity=20,
+    debug=True, 
+    generate_whole_program=False)
+
+
+target = sys.argv[1] if len(sys.argv) > 1 else "none"  # check the argument count so a missing target falls back to "none"
+
+if target == 'gpu':
+    psim.target(pairs.target_gpu())
+elif target == 'cpu':
+    psim.target(pairs.target_cpu())
+else:
+    print(f"Invalid target, use {sys.argv[0]} <cpu/gpu>")
+
+gravity_SI = 9.81
+diameter = 100      # required for linkedCellWidth. TODO: set linkedCellWidth at runtime
+linkedCellWidth = 1.01 * diameter
+ntypes = 2
+
+psim.add_position('position')
+psim.add_property('mass', pairs.real())
+psim.add_property('linear_velocity', pairs.vector())
+psim.add_property('angular_velocity', pairs.vector())
+psim.add_property('force', pairs.vector(), volatile=True)
+psim.add_property('torque', pairs.vector(), volatile=True)
+psim.add_property('radius', pairs.real())
+psim.add_property('normal', pairs.vector())
+psim.add_property('inv_inertia', pairs.matrix())
+psim.add_property('rotation_matrix', pairs.matrix())
+psim.add_property('rotation', pairs.quaternion())
+
+# Properties that get reduced during reverse communication
+psim.add_property('hydrodynamic_force', pairs.vector(), reduce=True)
+psim.add_property('hydrodynamic_torque', pairs.vector(), reduce=True)
+
+psim.add_feature('type', ntypes)
+psim.add_feature_property('type', 'stiffness', pairs.real(), [3000 for i in range(ntypes * ntypes)])
+psim.add_feature_property('type', 'damping_norm', pairs.real(), [10.0 for i in range(ntypes * ntypes)])
+psim.add_feature_property('type', 'damping_tan', pairs.real())
+psim.add_feature_property('type', 'friction', pairs.real())
+
+psim.set_domain_partitioner(pairs.block_forest())
+psim.pbc([False, False, False])
+psim.build_cell_lists(linkedCellWidth)
+
+# The order of user-defined functions is not important here since 
+# they are not used by other subroutines and are only callable individually 
+psim.compute(update_mass_and_inertia, symbols={'infinity': math.inf })
+psim.compute(spring_dashpot, linkedCellWidth)
+psim.compute(gravity, symbols={'gravity_SI': gravity_SI })
+psim.compute(euler, parameters={'dt': pairs.real()})
+
+psim.generate()
+
diff --git a/examples/modular/sd_1.cpp b/examples/modular/sd_1.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e56cfc5943c7626d97d6b872524655b9dd0dca0b
--- /dev/null
+++ b/examples/modular/sd_1.cpp
@@ -0,0 +1,47 @@
+#include <iostream>
+#include <memory>
+
+#include "spring_dashpot.hpp"
+
+int main(int argc, char **argv) {
+
+    auto pairs_sim = std::make_shared<PairsSimulation>();
+    pairs_sim->initialize();
+
+    auto pairs_runtime = pairs_sim->getPairsRuntime();
+
+    pairs_runtime->initDomain(&argc, &argv, 0, 0, 0, 1, 1, 1); 
+
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  1, 0, 0,     0, 13);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 1, 0,     0, 13);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 0, 1,     0, 13);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  -1, 0, 0,    0, 13);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  0, -1, 0,    0, 13);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  0, 0, -1,    0, 13);
+    pairs::create_sphere(pairs_runtime, 0.6, 0.6, 0.7,      -2, -2, 0,  1000, 0.05, 0, 0);
+    pairs::create_sphere(pairs_runtime, 0.4, 0.4, 0.68,    2, 2, 0,    1000, 0.05, 0, 0);
+
+    pairs_sim->setup_sim(0.1, 0.1, 0.1, 0.1);
+    pairs_sim->update_mass_and_inertia();
+
+    int num_timesteps = 2000;
+    int vtk_freq = 20;
+    double dt = 1e-3;
+    
+    for (int t=0; t<num_timesteps; ++t){
+        if ((t%500==0) && pairs_sim->rank()==0) std::cout << "Timestep: " << t << std::endl;
+
+        pairs_sim->communicate(t);
+        
+        pairs_sim->update_cells(t); 
+
+        pairs_sim->gravity(); 
+        pairs_sim->spring_dashpot(); 
+        pairs_sim->euler(dt); 
+
+        pairs::vtk_write_data(pairs_runtime, "output/sd_1_local", 0, pairs_sim->nlocal(), t, vtk_freq);
+        pairs::vtk_write_data(pairs_runtime, "output/sd_1_ghost", pairs_sim->nlocal(), pairs_sim->size(), t, vtk_freq);
+    }
+
+    pairs_sim->end();
+}
diff --git a/examples/modular/sd_2.cpp b/examples/modular/sd_2.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4bdb4c85a37efef0391ccecc9d2a1e1df6e3313d
--- /dev/null
+++ b/examples/modular/sd_2.cpp
@@ -0,0 +1,68 @@
+#include <iostream>
+#include <memory>
+
+#include <blockforest/BlockForest.h>
+#include <blockforest/Initialization.h>
+
+#include "spring_dashpot.hpp"
+
+int main(int argc, char **argv) {
+
+    auto pairs_sim = std::make_shared<PairsSimulation>();
+    pairs_sim->initialize();
+
+    // Create forest
+    // -------------------------------------------------------------------------------
+    walberla::math::AABB domain(0, 0, 0, 1, 1, 1);
+    std::shared_ptr<walberla::mpi::MPIManager> mpiManager = walberla::mpi::MPIManager::instance();
+    mpiManager->initializeMPI(&argc, &argv);
+    mpiManager->useWorldComm();
+    auto procs = mpiManager->numProcesses();
+
+    walberla::Vector3<int> block_config;
+    if (procs==1)        block_config = walberla::Vector3<int>(1, 1, 1);
+    else if (procs==4)   block_config = walberla::Vector3<int>(2, 2, 1);
+    else { std::cout << "Error: Check block_config" << std::endl; exit(-1);} 
+
+    auto ref_level = 0;
+    std::shared_ptr<walberla::BlockForest> forest = walberla::blockforest::createBlockForest(
+            domain, block_config, walberla::Vector3<bool>(false, false, false), procs, ref_level);
+
+    // Pass forest to P4IRS
+    // -------------------------------------------------------------------------------
+    auto pairs_runtime = pairs_sim->getPairsRuntime();
+    pairs_runtime->useDomain(forest);
+
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  1, 0, 0,     0, 13);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 1, 0,     0, 13);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 0, 1,     0, 13);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  -1, 0, 0,    0, 13);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  0, -1, 0,    0, 13);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  0, 0, -1,    0, 13);
+    pairs::create_sphere(pairs_runtime, 0.6, 0.6, 0.7,      -2, -2, 0,  1000, 0.05, 0, 0);
+    pairs::create_sphere(pairs_runtime, 0.4, 0.4, 0.68,    2, 2, 0,    1000, 0.05, 0, 0);
+
+    pairs_sim->setup_sim(0.1, 0.1, 0.1, 0.1);
+    pairs_sim->update_mass_and_inertia();
+
+    int num_timesteps = 2000;
+    int vtk_freq = 20;
+    double dt = 1e-3;
+
+    for (int t=0; t<num_timesteps; ++t){
+        if ((t%500==0) && pairs_sim->rank()==0) std::cout << "Timestep: " << t << std::endl;
+
+        pairs_sim->communicate(t);
+        
+        pairs_sim->update_cells(t); 
+
+        pairs_sim->gravity(); 
+        pairs_sim->spring_dashpot(); 
+        pairs_sim->euler(dt); 
+
+        pairs::vtk_write_data(pairs_runtime, "output/sd_2_local", 0, pairs_sim->nlocal(), t, vtk_freq);
+        pairs::vtk_write_data(pairs_runtime, "output/sd_2_ghost", pairs_sim->nlocal(), pairs_sim->size(), t, vtk_freq);
+    }
+
+    pairs_sim->end();
+}
diff --git a/examples/modular/sd_3_CPU.cpp b/examples/modular/sd_3_CPU.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a8e4a93bc5e816eb8eee900b7ef1d6769e7ec206
--- /dev/null
+++ b/examples/modular/sd_3_CPU.cpp
@@ -0,0 +1,95 @@
+#include <iostream>
+#include <memory>
+
+#include "spring_dashpot.hpp"
+
+void change_gravitational_force(std::shared_ptr<PairsAccessor> &ac, int idx){
+    pairs::Vector3<double> upward_gravity(0.0, 0.0, 2 * ac->getMass(idx) * 9.81); 
+    ac->setForce(idx, ac->getForce(idx) + upward_gravity);
+}
+
+int main(int argc, char **argv) {
+
+    auto pairs_sim = std::make_shared<PairsSimulation>();
+    pairs_sim->initialize();
+
+    auto ac = std::make_shared<PairsAccessor>(pairs_sim.get());
+    
+    auto pairs_runtime = pairs_sim->getPairsRuntime();
+    pairs_runtime->initDomain(&argc, &argv, 0, 0, 0, 1, 1, 1);
+
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  1, 0, 0,     0, 13);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 1, 0,     0, 13);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 0, 1,     0, 13);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  -1, 0, 0,    0, 13);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  0, -1, 0,    0, 13);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  0, 0, -1,    0, 13);
+
+    pairs::id_t pUid = pairs::create_sphere(pairs_runtime ,0.6, 0.6, 0.7,      0, 0, 0,  1000, 0.05, 0, 0);
+    pairs::create_sphere(pairs_runtime, 0.4, 0.4, 0.76,    2, 2, 0,    1000, 0.05, 0, 0);
+
+    MPI_Allreduce(MPI_IN_PLACE, &pUid, 1, MPI_LONG_LONG_INT, MPI_SUM, MPI_COMM_WORLD);
+
+    auto pIsLocalInMyRank = [&](pairs::id_t uid){return ac->uidToIdxLocal(uid) != ac->getInvalidIdx();};
+
+    pairs_sim->setup_sim(0.1, 0.1, 0.1, 0.1);
+    pairs_sim->update_mass_and_inertia();
+
+    pairs_sim->communicate(0);
+
+    int num_timesteps = 2000;
+    int vtk_freq = 20;
+    double dt = 1e-3;
+
+    for (int t=0; t<num_timesteps; ++t){
+
+        // Print position of particle pUid
+        //-------------------------------------------------------------------------------------------
+        if(pIsLocalInMyRank(pUid)){
+            std::cout << "Timestep (" << t << "): Particle " << pUid << " is in rank " << pairs_sim->rank() << std::endl;
+            int idx = ac->uidToIdxLocal(pUid);
+            std::cout << "Position = (" 
+                    << ac->getPosition(idx)[0] << ", "
+                    << ac->getPosition(idx)[1] << ", " 
+                    << ac->getPosition(idx)[2] << ")" << std::endl;
+
+        }
+
+        // Calculate forces
+        //-------------------------------------------------------------------------------------------
+        pairs_sim->update_cells(t);
+        pairs_sim->gravity(); 
+        pairs_sim->spring_dashpot(); 
+
+        // Change gravitational force on particle pUid
+        //-------------------------------------------------------------------------------------------
+        if(pIsLocalInMyRank(pUid)){
+            int idx = ac->uidToIdxLocal(pUid);
+
+            std::cout << "Force before changing = (" 
+                    << ac->getForce(idx)[0] << ", "
+                    << ac->getForce(idx)[1] << ", " 
+                    << ac->getForce(idx)[2] << ")" << std::endl;
+
+            change_gravitational_force(ac, idx);
+
+            std::cout << "Force after changing = (" 
+                    << ac->getForce(idx)[0] << ", "
+                    << ac->getForce(idx)[1] << ", " 
+                    << ac->getForce(idx)[2] << ")" << std::endl;
+        }
+
+        // Euler
+        //-------------------------------------------------------------------------------------------
+        pairs_sim->euler(dt);
+
+        // Communicate
+        //-------------------------------------------------------------------------------------------
+        pairs_sim->communicate(t);
+
+        pairs::vtk_write_data(pairs_runtime, "output/sd_3_CPU_local", 0, ac->nlocal(), t, vtk_freq);
+        pairs::vtk_write_data(pairs_runtime, "output/sd_3_CPU_ghost", ac->nlocal(), ac->size(), t, vtk_freq);
+    }
+
+    pairs_sim->end();
+}
\ No newline at end of file
diff --git a/examples/modular/sd_3_GPU.cu b/examples/modular/sd_3_GPU.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b44af846643ed9cf0a730b07c5f60543560e29b8
--- /dev/null
+++ b/examples/modular/sd_3_GPU.cu
@@ -0,0 +1,152 @@
+#include <iostream>
+#include <memory>
+#include <cuda_runtime.h>
+
+#include "spring_dashpot.hpp"
+
+void checkCudaError(cudaError_t err, const char* func) {
+    if (err != cudaSuccess) {
+        fprintf(stderr, "CUDA error in %s: %s\n", func, cudaGetErrorString(err));
+        exit(err);
+    }
+}
+
+__global__ void print_position(PairsAccessor ac, int idx){
+    printf("Position [from device] = (%f, %f, %f) \n", ac.getPosition(idx)[0], ac.getPosition(idx)[1], ac.getPosition(idx)[2]);
+}
+
+__global__ void change_gravitational_force(PairsAccessor ac, int idx){
+    printf("Force [from device] before setting = (%f, %f, %f) \n", ac.getForce(idx)[0], ac.getForce(idx)[1], ac.getForce(idx)[2]);
+
+    pairs::Vector3<double> upward_gravity(0.0, 0.0, 2 * ac.getMass(idx) * 9.81); 
+    ac.setForce(idx, ac.getForce(idx) + upward_gravity);
+
+    printf("Force [from device] after setting = (%f, %f, %f) \n", ac.getForce(idx)[0], ac.getForce(idx)[1], ac.getForce(idx)[2]);
+}
+
+void set_feature_properties(std::shared_ptr<PairsAccessor> &ac){
+    ac->setTypeStiffness(0,0, 0);
+    ac->setTypeStiffness(0,1, 1000);
+    ac->setTypeStiffness(1,0, 1000);
+    ac->setTypeStiffness(1,1, 3000);
+    ac->syncTypeStiffness();
+
+    ac->setTypeDampingNorm(0,0, 0);
+    ac->setTypeDampingNorm(0,1, 20);
+    ac->setTypeDampingNorm(1,0, 20);
+    ac->setTypeDampingNorm(1,1, 10);
+    ac->syncTypeDampingNorm();
+}
+
+int main(int argc, char **argv) {
+
+    auto pairs_sim = std::make_shared<PairsSimulation>();
+    pairs_sim->initialize();
+
+    // Create PairsAccessor after PairsSimulation is initialized
+    auto ac = std::make_shared<PairsAccessor>(pairs_sim.get());
+
+    auto pairs_runtime = pairs_sim->getPairsRuntime();
+    pairs_runtime->initDomain(&argc, &argv, 0, 0, 0, 1, 1, 1);
+
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  1, 0, 0,     0, 13);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 1, 0,     0, 13);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 0, 1,     0, 13);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  -1, 0, 0,    0, 13);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  0, -1, 0,    0, 13);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  0, 0, -1,    0, 13);
+
+    pairs::id_t pUid = pairs::create_sphere(pairs_runtime, 0.6, 0.6, 0.7,      0, 0, 0,  1000, 0.05, 1, 0);
+    pairs::create_sphere(pairs_runtime, 0.4, 0.4, 0.76,    2, 2, 0,    1000, 0.05, 1, 0);
+
+    set_feature_properties(ac);
+
+    MPI_Allreduce(MPI_IN_PLACE, &pUid, 1, MPI_LONG_LONG_INT, MPI_SUM, MPI_COMM_WORLD);
+
+    auto pIsLocalInMyRank = [&](pairs::id_t uid){return ac->uidToIdxLocal(uid) != ac->getInvalidIdx();};
+
+    pairs_sim->setup_sim(0.1, 0.1, 0.1, 0.1);
+    pairs_sim->update_mass_and_inertia();
+
+    pairs_sim->communicate(0);
+    // PairsAccessor requires an update when particles are communicated 
+    ac->update();
+
+    int num_timesteps = 2000;
+    int vtk_freq = 20;
+    double dt = 1e-3;
+
+    for (int t=0; t<num_timesteps; ++t){
+        // Up-to-date uids might be on host or device. So sync uid in Host before accessing them from host
+        ac->syncUid(PairsAccessor::Host);
+
+        // Print position of particle pUid
+        //-------------------------------------------------------------------------------------------
+        if(pIsLocalInMyRank(pUid)){
+            std::cout << "Timestep (" << t << "): Particle " << pUid << " is in rank " << pairs_sim->rank() << std::endl;
+            int idx = ac->uidToIdxLocal(pUid);
+
+            // Up-to-date position might be on host or device. 
+            // Sync position on Host before reading it from host:
+            ac->syncPosition(PairsAccessor::Host); 
+            std::cout << "Position [from host] = (" 
+                    << ac->getPosition(idx)[0] << ", "
+                    << ac->getPosition(idx)[1] << ", " 
+                    << ac->getPosition(idx)[2] << ")" << std::endl;
+            
+            // Sync position on Device before reading it from device:
+            ac->syncPosition(PairsAccessor::Device); 
+            print_position<<<1,1>>>(*ac, idx);
+            checkCudaError(cudaDeviceSynchronize(), "print_position");
+            
+            // There's no need to sync position here to continue the simulation, since position wasn't modified.
+        }
+
+        // Calculate forces
+        //-------------------------------------------------------------------------------------------
+        pairs_sim->update_cells(t);
+        pairs_sim->gravity(); 
+        pairs_sim->spring_dashpot(); 
+
+        // Change gravitational force on particle pUid
+        //-------------------------------------------------------------------------------------------
+        ac->syncUid(PairsAccessor::Host);
+
+        if(pIsLocalInMyRank(pUid)){
+            std::cout << "Force Timestep (" << t << "): Particle " << pUid << " is in rank " << pairs_sim->rank() << std::endl;
+            int idx = ac->uidToIdxLocal(pUid);
+
+            // Up-to-date force and mass might be on host or device. 
+            // So sync them in Device before accessing them on device. (No data will be transferred if they are already on device)
+            ac->syncForce(PairsAccessor::Device);
+            ac->syncMass(PairsAccessor::Device);
+
+            // Modify force from device:
+            change_gravitational_force<<<1,1>>>(*ac, idx);
+            checkCudaError(cudaDeviceSynchronize(), "change_gravitational_force");
+
+            // Force on device was modified.
+            // So sync force before continuing the simulation.
+            ac->syncForce(PairsAccessor::Host);
+            std::cout << "Force [from host] after changing = (" 
+                    << ac->getForce(idx)[0] << ", "
+                    << ac->getForce(idx)[1] << ", " 
+                    << ac->getForce(idx)[2] << ")" << std::endl;
+        }
+
+        // Euler
+        //-------------------------------------------------------------------------------------------
+        pairs_sim->euler(dt);
+
+        // Communicate
+        //-------------------------------------------------------------------------------------------
+        pairs_sim->communicate(t);
+        // PairsAccessor requires an update when particles are communicated
+        ac->update();
+
+        pairs::vtk_write_data(pairs_runtime, "output/dem_sd_local", 0, ac->nlocal(), t, vtk_freq);
+        pairs::vtk_write_data(pairs_runtime, "output/dem_sd_ghost", ac->nlocal(), ac->size(), t, vtk_freq);
+    }
+
+    pairs_sim->end();
+}
\ No newline at end of file
diff --git a/examples/modular/sd_4.cpp b/examples/modular/sd_4.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..80ec40313e13692d1b41fcee01f6b5ce7d4ef91b
--- /dev/null
+++ b/examples/modular/sd_4.cpp
@@ -0,0 +1,99 @@
+#include <iostream>
+#include <memory>
+#include <iomanip>
+
+#include "spring_dashpot.hpp"
+
+void set_feature_properties(std::shared_ptr<PairsAccessor> &ac){
+    ac->setTypeStiffness(0,0, 100000);
+    ac->setTypeStiffness(0,1, 100000);
+    ac->setTypeStiffness(1,0, 100000);
+    ac->setTypeStiffness(1,1, 100000);
+    ac->syncTypeStiffness();
+
+    ac->setTypeDampingNorm(0,0, 300);
+    ac->setTypeDampingNorm(0,1, 300);
+    ac->setTypeDampingNorm(1,0, 300);
+    ac->setTypeDampingNorm(1,1, 300);
+    ac->syncTypeDampingNorm();
+
+    ac->setTypeFriction(0,0, 0.5);
+    ac->setTypeFriction(0,1, 0.5);
+    ac->setTypeFriction(1,0, 0.5);
+    ac->setTypeFriction(1,1, 0.5);
+    ac->syncTypeFriction();
+
+    ac->setTypeDampingTan(0,0, 20);
+    ac->setTypeDampingTan(0,1, 20);
+    ac->setTypeDampingTan(1,0, 20);
+    ac->setTypeDampingTan(1,1, 20);
+    ac->syncTypeDampingTan();
+}
+
+int main(int argc, char **argv) {
+    auto pairs_sim = std::make_shared<PairsSimulation>();
+    pairs_sim->initialize();
+
+    auto ac = std::make_shared<PairsAccessor>(pairs_sim.get());
+    set_feature_properties(ac);
+
+    auto pairs_runtime = pairs_sim->getPairsRuntime();
+
+    pairs_runtime->initDomain(&argc, &argv, 
+                    0, 0, 0, 20, 20, 20,    // Domain bounds
+                    false, false, false,    // PBCs --------------> TODO: runtime pbc
+                    true                    // Enable dynamic load balancing (does initial refinement on a <1,1,1> blockforest)
+                ); 
+
+    pairs_runtime->getDomainPartitioner()->initWorkloadBalancer(pairs::Hilbert, 100, 1000);
+
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  1, 0, 0,     0, 13);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 1, 0,     0, 13);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 0, 1,     0, 13);
+    pairs::create_halfspace(pairs_runtime, 20,20,20,  -1, 0, 0,    0, 13);
+    pairs::create_halfspace(pairs_runtime, 20,20,20,  0, -1, 0,    0, 13);
+    pairs::create_halfspace(pairs_runtime, 20,20,20,  0, 0, -1,    0, 13);
+
+    double diameter_min = 0.3;
+    double diameter_max = 0.3;
+    double sphere_spacing = 0.4;
+    pairs::dem_sc_grid(pairs_runtime, 10, 10, 15,  sphere_spacing, diameter_min, diameter_min, diameter_max,    2,      100,    2);
+    
+    double lcw = diameter_max * 1.01;       // Linked-cell width
+    double interaction_radius = diameter_max;
+    pairs_sim->setup_sim(lcw, lcw, lcw, interaction_radius);
+
+    pairs_sim->update_mass_and_inertia();
+
+    int num_timesteps = 4000;
+    int vtk_freq = 20;
+    int rebalance_freq = 200;
+    double dt = 1e-3;
+
+    pairs::vtk_write_subdom(pairs_runtime, "output/subdom_init", 0);
+
+    
+    for (int t=0; t<num_timesteps; ++t){
+        if ((t % vtk_freq==0) && pairs_sim->rank()==0) std::cout << "Timestep: " << t << std::endl;
+        
+        if (t % rebalance_freq == 0){ 
+            pairs_sim->update_domain();
+        }
+        
+        pairs_sim->update_cells(t); 
+        
+        pairs_sim->gravity(); 
+        pairs_sim->spring_dashpot(); 
+        pairs_sim->euler(dt); 
+        
+        pairs_sim->communicate(t);
+
+        if (t % vtk_freq==0){
+            pairs::vtk_write_subdom(pairs_runtime, "output/subdom", t);
+            pairs::vtk_write_data(pairs_runtime, "output/sd_4_local", 0, pairs_sim->nlocal(), t);
+            pairs::vtk_write_data(pairs_runtime, "output/sd_4_ghost", pairs_sim->nlocal(), pairs_sim->size(), t);
+        }
+    }
+
+    pairs_sim->end();
+}
\ No newline at end of file
diff --git a/examples/modular/spring_dashpot.py b/examples/modular/spring_dashpot.py
new file mode 100644
index 0000000000000000000000000000000000000000..191c000ca61962af8ebae77ab4ec1b97b433d399
--- /dev/null
+++ b/examples/modular/spring_dashpot.py
@@ -0,0 +1,107 @@
+import math
+import pairs
+import sys
+import os
+
+def update_mass_and_inertia(i):
+    # Per-particle kernel: reset the orientation of particle i, then set its inverse inertia.
+    rotation_matrix[i] = diagonal_matrix(1.0)
+    rotation[i] = default_quaternion()
+    if is_sphere(i):
+        inv_inertia[i] = inversed(diagonal_matrix(0.4 * mass[i] * radius[i] * radius[i]))  # 0.4*m*r^2 = (2/5)*m*r^2
+
+    else:
+        mass[i] = infinity  # 'infinity' is bound to math.inf where this kernel is registered
+        inv_inertia[i] = 0.0
+
+def spring_dashpot(i, j):
+    # Pair kernel for (i, j): normal spring-dashpot force plus a capped tangential damping force.
+    delta_ij = -penetration_depth(i, j)
+    skip_when(delta_ij < 0.0)  # presumably drops this pair when delta_ij is negative -- confirm skip_when semantics
+    # Velocity of each body at the contact point: linear velocity plus angular_velocity x (contact point - position).
+    velocity_wf_i = linear_velocity[i] + cross(angular_velocity[i], contact_point(i, j) - position[i])
+    velocity_wf_j = linear_velocity[j] + cross(angular_velocity[j], contact_point(i, j) - position[j])
+    # Relative velocity (j minus i), split into its normal component and the tangential remainder.
+    rel_vel = -(velocity_wf_i - velocity_wf_j)
+    rel_vel_n = dot(rel_vel, contact_normal(i, j))
+    rel_vel_t = rel_vel - rel_vel_n * contact_normal(i, j)
+    # Normal force: stiffness * delta_ij + damping_norm * rel_vel_n, directed along the contact normal.
+    fNabs = stiffness[i,j] * delta_ij + damping_norm[i,j] * rel_vel_n
+    fN = fNabs * contact_normal(i, j)
+    # Tangential force: magnitude is the smaller of the damping term and friction * fNabs, along rel_vel_t.
+    fTabs = min(damping_tan[i,j] * length(rel_vel_t), friction[i, j] * fNabs)
+    fT = fTabs * normalized(rel_vel_t)
+    partial_force = fN + fT
+    apply(force, partial_force)
+    apply(torque, cross(contact_point(i, j) - position, partial_force))  # NOTE(review): 'position' is unindexed here, unlike above -- presumably resolved by apply(); confirm
+
+def euler(i):  # Per-particle kernel: advance particle i by one time step of size dt.
+    inv_mass = 1.0 / mass[i]
+    position[i] +=  0.5 * inv_mass * force[i] * dt * dt + linear_velocity[i] * dt  # x += v*dt + 0.5*(F/m)*dt^2, using the not-yet-updated velocity
+    linear_velocity[i] += inv_mass * force[i] * dt  # v += (F/m)*dt
+    wdot = rotation_matrix[i] * (inv_inertia[i] * torque[i]) * transposed(rotation_matrix[i])  # rate of change of angular_velocity (see last line)
+    phi = angular_velocity[i] * dt + 0.5 * wdot * dt * dt  # rotation increment accumulated over this step
+    rotation[i] = quaternion(phi, length(phi)) * rotation[i]
+    rotation_matrix[i] = quaternion_to_rotation_matrix(rotation[i])  # matrix form recomputed from the updated quaternion
+    angular_velocity[i] += wdot * dt
+
+def gravity(i):  # Per-particle kernel: subtract mass * gravity_SI from component 2 of the force on particle i.
+    force[i][2] -= mass[i] * gravity_SI  # gravity_SI is bound as a symbol where this kernel is registered
+
+
+file_name = os.path.basename(__file__)
+file_name_without_extension = os.path.splitext(file_name)[0]
+
+psim = pairs.simulation(
+    file_name_without_extension,
+    [pairs.sphere(), pairs.halfspace()],
+    double_prec=True,
+    particle_capacity=1000000,
+    neighbor_capacity=20,
+    debug=True, 
+    generate_whole_program=False)
+
+
+target = sys.argv[1] if len(sys.argv[1]) > 1 else "none"
+
+if target == 'gpu':
+    psim.target(pairs.target_gpu())
+elif target == 'cpu':
+    psim.target(pairs.target_cpu())
+else:
+    print(f"Invalid target, use {sys.argv[0]} <cpu/gpu>")
+
+psim.add_position('position')
+psim.add_property('mass', pairs.real())
+psim.add_property('linear_velocity', pairs.vector())
+psim.add_property('angular_velocity', pairs.vector())
+psim.add_property('force', pairs.vector(), volatile=True)
+psim.add_property('torque', pairs.vector(), volatile=True)
+psim.add_property('radius', pairs.real())
+psim.add_property('normal', pairs.vector())
+psim.add_property('inv_inertia', pairs.matrix())
+psim.add_property('rotation_matrix', pairs.matrix())
+psim.add_property('rotation', pairs.quaternion())
+
+ntypes = 2
+psim.add_feature('type', ntypes)
+psim.add_feature_property('type', 'stiffness', pairs.real(), [3000 for i in range(ntypes * ntypes)])
+psim.add_feature_property('type', 'damping_norm', pairs.real(), [10.0 for i in range(ntypes * ntypes)])
+psim.add_feature_property('type', 'damping_tan', pairs.real())
+psim.add_feature_property('type', 'friction', pairs.real())
+
+psim.set_domain_partitioner(pairs.block_forest())
+psim.pbc([False, False, False])
+psim.build_cell_lists()
+
+# The order of user-defined functions is not important here since 
+# they are not used by other subroutines and are only callable individually 
+psim.compute(update_mass_and_inertia, symbols={'infinity': math.inf })
+psim.compute(spring_dashpot)
+psim.compute(euler, parameters={'dt': pairs.real()})
+
+gravity_SI = 9.81
+psim.compute(gravity, symbols={'gravity_SI': gravity_SI })
+
+psim.generate()
+
diff --git a/examples/dem.py b/examples/whole-program-generation/linear_spring_dashpot.py
similarity index 91%
rename from examples/dem.py
rename to examples/whole-program-generation/linear_spring_dashpot.py
index 9cecf55eb8a8cef70cce3ff5927ffb1cfdbc8ebd..90348fd7ecb0acfe1a7fd6d84bf4b7bc5c4dbcf1 100644
--- a/examples/dem.py
+++ b/examples/whole-program-generation/linear_spring_dashpot.py
@@ -97,9 +97,6 @@ if target != 'cpu' and target != 'gpu':
 
 # Config file parameters
 domainSize_SI = [0.8, 0.015, 0.2]
-#domainSize_SI = [0.4, 0.4, 0.2] # node base
-#domainSize_SI = [0.6, 0.6, 0.2] # node base
-#domainSize_SI = [0.8, 0.8, 0.2] # node base
 diameter_SI = 0.0029
 gravity_SI = 9.81
 densityFluid_SI = 1000
@@ -112,7 +109,6 @@ restitutionCoefficient = 0.1
 collisionTime_SI = 5e-4
 poissonsRatio = 0.22
 timeSteps = 10000
-#timeSteps = 1000
 visSpacing = 100
 denseBottomLayer = False
 bottomLayerOffsetFactor = 1.0
@@ -128,13 +124,14 @@ frictionStatic = 0.0
 frictionDynamic = frictionCoefficient
 
 psim = pairs.simulation(
-    "dem",
+    "linear_spring_dashpot",
     [pairs.sphere(), pairs.halfspace()],
     timesteps=timeSteps,
     double_prec=True,
     use_contact_history=True,
     particle_capacity=1000000,
-    neighbor_capacity=20)
+    neighbor_capacity=20,
+    generate_whole_program=True)
 
 if target == 'gpu':
     psim.target(pairs.target_gpu())
@@ -167,29 +164,16 @@ psim.dem_sc_grid(
     domainSize_SI[0], domainSize_SI[1], domainSize_SI[2], generationSpacing_SI,
     diameter_SI, minDiameter_SI, maxDiameter_SI, initialVelocity_SI, densityParticle_SI, ntypes)
 
-#psim.read_particle_data(
-#    "data/spheres.input",
-#    "data/spheres_4x4x2.input",
-#    "data/spheres_6x6x2.input",
-#    "data/spheres_8x8x2.input",
-#    ['uid', 'type', 'mass', 'radius', 'position', 'linear_velocity', 'flags'],
-#    pairs.sphere())
-
-#psim.read_particle_data(
-#    "data/spheres_bottom.input",
-#    ['type', 'mass', 'radius', 'position', 'linear_velocity', 'flags'],
-#    pairs.sphere())
 
 psim.read_particle_data(
     "data/planes.input",
-    ['uid', 'type', 'mass', 'position', 'normal', 'flags'],
+    ['type', 'mass', 'position', 'normal', 'flags'],
     pairs.halfspace())
 
 psim.setup(update_mass_and_inertia, {'densityParticle_SI': densityParticle_SI,
                                      'pi': math.pi,
                                      'infinity': math.inf })
 
-#psim.compute_half()
 psim.build_cell_lists(linkedCellWidth)
 #psim.vtk_output(f"output/dem_{target}", frequency=visSpacing)
 
diff --git a/examples/md.py b/examples/whole-program-generation/md.py
similarity index 85%
rename from examples/md.py
rename to examples/whole-program-generation/md.py
index 08729e1e52994e47395cb13bfd4417ef0348b77f..22eb3c1f58d0fb7d8892232b04a9aa8b146cccf4 100644
--- a/examples/md.py
+++ b/examples/whole-program-generation/md.py
@@ -35,7 +35,12 @@ nz = 32
 rho = 0.8442
 temp = 1.44
 
-psim = pairs.simulation("md", [pairs.point_mass()], timesteps=200, double_prec=True)
+psim = pairs.simulation("md", 
+                        [pairs.point_mass()],
+                        timesteps=200, 
+                        double_prec=True, 
+                        debug=True,
+                        generate_whole_program=True)
 
 if target == 'gpu':
     psim.target(pairs.target_gpu())
@@ -55,9 +60,8 @@ psim.set_domain_partitioner(pairs.regular_domain_partitioner())
 psim.compute_thermo(100)
 
 psim.reneighbor_every(20)
-#psim.compute_half()
 psim.build_neighbor_lists(cutoff_radius + skin)
-#psim.vtk_output(f"output/md_{target}")
+# psim.vtk_output(f"output/md_{target}")
 
 psim.compute(initial_integrate, symbols={'dt': dt}, pre_step=True, skip_first=True)
 psim.compute(lennard_jones, cutoff_radius)
diff --git a/examples/whole-program-generation/spring_dashpot.py b/examples/whole-program-generation/spring_dashpot.py
new file mode 100644
index 0000000000000000000000000000000000000000..6212b0aa7fd41d3d78ad4655f3d9de8f8afa1750
--- /dev/null
+++ b/examples/whole-program-generation/spring_dashpot.py
@@ -0,0 +1,162 @@
+import math
+import pairs
+import sys
+import os
+
+def update_mass_and_inertia(i):
+    # Per-particle setup kernel: reset the orientation of particle i, then set its inverse inertia.
+    rotation_matrix[i] = diagonal_matrix(1.0)
+    rotation[i] = default_quaternion()
+    if is_sphere(i):
+        inv_inertia[i] = inversed(diagonal_matrix(0.4 * mass[i] * radius[i] * radius[i]))  # 0.4*m*r^2 = (2/5)*m*r^2
+
+    else:
+        mass[i] = infinity  # 'infinity' is bound to math.inf where this kernel is registered
+        inv_inertia[i] = 0.0
+
+def spring_dashpot(i, j):
+    # Pair kernel for (i, j): normal spring-dashpot force plus a capped tangential damping force.
+    delta_ij = -penetration_depth(i, j)
+    skip_when(delta_ij < 0.0)  # presumably drops this pair when delta_ij is negative -- confirm skip_when semantics
+    # Velocity of each body at the contact point: linear velocity plus angular_velocity x (contact point - position).
+    velocity_wf_i = linear_velocity[i] + cross(angular_velocity[i], contact_point(i, j) - position[i])
+    velocity_wf_j = linear_velocity[j] + cross(angular_velocity[j], contact_point(i, j) - position[j])
+    # Relative velocity (j minus i), split into its normal component and the tangential remainder.
+    rel_vel = -(velocity_wf_i - velocity_wf_j)
+    rel_vel_n = dot(rel_vel, contact_normal(i, j))
+    rel_vel_t = rel_vel - rel_vel_n * contact_normal(i, j)
+    # Normal force: stiffness * delta_ij + damping_norm * rel_vel_n, directed along the contact normal.
+    fNabs = stiffness[i,j] * delta_ij + damping_norm[i,j] * rel_vel_n
+    fN = fNabs * contact_normal(i, j)
+    # Tangential force: magnitude is the smaller of the damping term and friction * fNabs, along rel_vel_t.
+    fTabs = min(damping_tan[i,j] * length(rel_vel_t), friction[i, j] * fNabs)
+    fT = fTabs * normalized(rel_vel_t)
+    partial_force = fN + fT
+    apply(force, partial_force)
+    apply(torque, cross(contact_point(i, j) - position, partial_force))  # NOTE(review): 'position' is unindexed here, unlike above -- presumably resolved by apply(); confirm
+
+def euler(i):  # Per-particle kernel: advance particle i by one time step of size dt.
+    inv_mass = 1.0 / mass[i]
+    position[i] +=  0.5 * inv_mass * force[i] * dt * dt + linear_velocity[i] * dt  # x += v*dt + 0.5*(F/m)*dt^2, using the not-yet-updated velocity
+    linear_velocity[i] += inv_mass * force[i] * dt  # v += (F/m)*dt
+    wdot = rotation_matrix[i] * (inv_inertia[i] * torque[i]) * transposed(rotation_matrix[i])  # rate of change of angular_velocity (see last line)
+    phi = angular_velocity[i] * dt + 0.5 * wdot * dt * dt  # rotation increment accumulated over this step
+    rotation[i] = quaternion(phi, length(phi)) * rotation[i]
+    rotation_matrix[i] = quaternion_to_rotation_matrix(rotation[i])  # matrix form recomputed from the updated quaternion
+    angular_velocity[i] += wdot * dt
+
+def gravity(i):  # Per-particle kernel: subtract mass * gravity_SI from component 2 of the force on particle i.
+    force[i][2] -= mass[i] * gravity_SI  # gravity_SI is bound as a symbol where this kernel is registered
+
+
+# Domain size
+domainSize_SI=[10, 10, 10]
+
+# Parameters required for generating the initial grid of particles 'dem_sc_grid'
+generationSpacing_SI = 0.4
+diameter_SI = 0.3
+minDiameter_SI = diameter_SI
+maxDiameter_SI = diameter_SI
+initialVelocity_SI = 2
+densityParticle_SI = 100
+
+# Linked cell width 
+linkedCellWidth = 1.01 * maxDiameter_SI
+
+# Required symbol for the 'gravity' module
+gravity_SI = 9.81
+
+# Required symbol for the 'euler' module
+dt_SI = 1e-3
+
+# VTK frequency
+visSpacing = 20
+
+timeSteps = 2000
+
+# file_name_without_extension is the simulation identifier (in this case "spring_dashpot")
+# TODO: Integration with cmake
+file_name = os.path.basename(__file__)
+file_name_without_extension = os.path.splitext(file_name)[0]
+
+psim = pairs.simulation(
+    file_name_without_extension,
+    [pairs.sphere(), pairs.halfspace()],
+    timesteps=timeSteps,
+    double_prec=True,
+    particle_capacity=1000000,
+    neighbor_capacity=20,
+    debug=True, 
+    generate_whole_program=True)
+
+target = sys.argv[1] if len(sys.argv[1]) > 1 else "none"
+if target == 'gpu':
+    psim.target(pairs.target_gpu())
+elif target == 'cpu':
+    psim.target(pairs.target_cpu())
+else:
+    print(f"Invalid target, use {sys.argv[0]} <cpu/gpu>")
+
+
+# Register properties
+psim.add_position('position')
+psim.add_property('mass', pairs.real())
+psim.add_property('linear_velocity', pairs.vector())
+psim.add_property('angular_velocity', pairs.vector())
+psim.add_property('force', pairs.vector(), volatile=True)
+psim.add_property('torque', pairs.vector(), volatile=True)
+psim.add_property('radius', pairs.real())
+psim.add_property('normal', pairs.vector())
+psim.add_property('inv_inertia', pairs.matrix())
+psim.add_property('rotation_matrix', pairs.matrix())
+psim.add_property('rotation', pairs.quaternion())
+
+# Define the number of 'type' features and their pair-wise properties
+ntypes = 2
+stiffness_SI = [100000 for i in range(ntypes * ntypes)]
+dampingNorm_SI = [300 for i in range(ntypes * ntypes)]
+dampingTan_SI = [0.5 for i in range(ntypes * ntypes)]
+friction_SI = [20.0 for i in range(ntypes * ntypes)]
+
+# Register 'type' as a feature
+psim.add_feature('type', ntypes)
+
+# Register properties for the 'type' feature
+psim.add_feature_property('type', 'stiffness', pairs.real(), stiffness_SI)
+psim.add_feature_property('type', 'damping_norm', pairs.real(), dampingNorm_SI)
+psim.add_feature_property('type', 'damping_tan', pairs.real(), dampingTan_SI)
+psim.add_feature_property('type', 'friction', pairs.real(), friction_SI)
+
+# Define the domain and optimization strategies
+psim.set_domain([0.0, 0.0, 0.0, domainSize_SI[0], domainSize_SI[1], domainSize_SI[2]])
+psim.pbc([False, False, False])
+psim.set_domain_partitioner(pairs.block_forest())
+psim.set_workload_balancer(pairs.morton(), regrid_min=100, regrid_max=1000, rebalance_frequency=200)
+psim.build_cell_lists(linkedCellWidth)
+
+# Generate particles
+psim.dem_sc_grid(domainSize_SI[0], domainSize_SI[1], domainSize_SI[2], 
+                 generationSpacing_SI,
+                 diameter_SI, 
+                 minDiameter_SI, 
+                 maxDiameter_SI, 
+                 initialVelocity_SI, 
+                 densityParticle_SI, 
+                 ntypes)
+
+# Read planes from file
+psim.read_particle_data( "data/sd_planes.input", ['type', 'mass', 'position', 'normal', 'flags'], pairs.halfspace())
+
+psim.vtk_output(f"output/dem_{target}", frequency=visSpacing)
+
+# The user-defined 'setup' functions are executed only once before the timestep loop
+psim.setup(update_mass_and_inertia, symbols={'infinity': math.inf })
+
+# The user-defined 'compute' functions are added to the timestep loop in the order they are given to 'compute'
+psim.compute(spring_dashpot, linkedCellWidth)
+psim.compute(gravity, symbols={'gravity_SI': gravity_SI })
+psim.compute(euler, symbols={'dt': dt_SI})
+
+# Trigger code generation
+psim.generate()
+
diff --git a/runtime/array.hpp b/runtime/array.hpp
index 03a538543ee1f465104b101ea7bb353d527c4f78..d422842f96cbdb6343c11ca370d76a3f94be29c3 100644
--- a/runtime/array.hpp
+++ b/runtime/array.hpp
@@ -24,14 +24,14 @@ public:
         PAIRS_ASSERT(size_ > 0);
     }
 
-    array_t getId() { return id; }
-    std::string getName() { return name; }
-    void *getHostPointer() { return h_ptr; }
-    void *getDevicePointer() { return d_ptr; }
+    array_t getId() const { return id; }
+    std::string getName() const { return name; }
+    void *getHostPointer() const { return h_ptr; }
+    void *getDevicePointer() const { return d_ptr; }
     void setPointers(void *h_ptr_, void *d_ptr_) { h_ptr = h_ptr_, d_ptr = d_ptr_; }
-    void setSize(size_t size_) { size = size_; }
-    size_t getSize() { return size; };
-    bool isStatic() { return is_static; }
+    void setSize(size_t size_) { size = size_;}
+    size_t getSize() const { return size; }
+    bool isStatic() const { return is_static; }
 };
 
 }
diff --git a/runtime/boundary_weights.cpp b/runtime/boundary_weights.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3a67d29386c5d1d11b057b3a6435a3c8d7496626
--- /dev/null
+++ b/runtime/boundary_weights.cpp
@@ -0,0 +1,46 @@
+#include "boundary_weights.hpp"
+
+// Always include last generated interfaces
+#include "last_generated.hpp"
+namespace pairs {
+
+void compute_boundary_weights(
+    PairsRuntime *ps,
+    real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax,
+    long unsigned int *comp_weight, long unsigned int *comm_weight) {
+    // Host version: *comp_weight receives the number of local particles inside the half-open box.
+    const int capacity = ps->getTrackedVariableAsInteger("particle_capacity");
+    const int num_local = ps->getTrackedVariableAsInteger("nlocal");
+    auto pos_prop = ps->getPropertyByName("position");
+    auto flg_prop = ps->getPropertyByName("flags");
+
+    real_t *pos_data = static_cast<real_t *>(pos_prop.getHostPointer());
+    int *flag_data = static_cast<int *>(flg_prop.getHostPointer());
+    const auto skip_mask = pairs::flags::INFINITE | pairs::flags::GLOBAL;
+    long unsigned int inside = 0;
+
+    for(int p = 0; p < num_local; ++p) {
+        // Particles flagged INFINITE or GLOBAL never contribute to the weight.
+        if(!(pairs_host_interface::get_flags(flag_data, p) & skip_mask)) {
+            const real_t x = pairs_host_interface::get_position(pos_data, p, 0, capacity);
+            const real_t y = pairs_host_interface::get_position(pos_data, p, 1, capacity);
+            const real_t z = pairs_host_interface::get_position(pos_data, p, 2, capacity);
+            const bool in_x = x >= xmin && x < xmax;
+            const bool in_y = y >= ymin && y < ymax;
+            const bool in_z = z >= zmin && z < zmax;
+
+            if(in_x && in_y && in_z) { ++inside; }
+        }
+    }
+
+    *comp_weight = inside;
+
+    // TODO: Count the number of ghosts that must be communicated with this block.
+    // Note: The ghosts stored in this rank are NOT contained in the aabb of any of its blocks.
+    //       And neighbor blocks are going to change after rebalancing.
+    // const int nghost = ps->getTrackedVariableAsInteger("nghost");
+    *comm_weight = 0;
+
+}
+
+}
diff --git a/runtime/boundary_weights.cu b/runtime/boundary_weights.cu
new file mode 100644
index 0000000000000000000000000000000000000000..191139fa245dd5104afdb4793fbe0f27cdaa4441
--- /dev/null
+++ b/runtime/boundary_weights.cu
@@ -0,0 +1,108 @@
+#include "boundary_weights.hpp"
+// #include "devices/device.hpp"
+
+// Always include last generated interfaces
+#include "last_generated.hpp"
+#define CUDA_ASSERT(a) { pairs::cuda_assert((a), __FILE__, __LINE__); }
+
+namespace pairs {
+
+#define REDUCE_BLOCK_SIZE 64
+
+__global__ void reduceBoundaryWeights( real_t *position, int *flags, int start, int end, int particle_capacity,
+    real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax, int *d_weights) {
+    // Each block stores in d_weights[blockIdx.x] how many of its particles in [start, end) lie inside the box.
+    __shared__ int red_data[REDUCE_BLOCK_SIZE];
+    int tid = threadIdx.x;
+    int i = blockIdx.x * blockDim.x + tid;
+    int particle_idx = start + i;
+
+    red_data[tid] = 0;
+
+    if(particle_idx < end) {
+        // Flags are read at particle_idx, the same absolute index used for the position (not the range-relative i).
+        if (!(pairs_cuda_interface::get_flags(flags, particle_idx) & (pairs::flags::INFINITE | pairs::flags::GLOBAL))) {
+            real_t pos_x = pairs_cuda_interface::get_position(position, particle_idx, 0, particle_capacity);
+            real_t pos_y = pairs_cuda_interface::get_position(position, particle_idx, 1, particle_capacity);
+            real_t pos_z = pairs_cuda_interface::get_position(position, particle_idx, 2, particle_capacity);
+
+            if( pos_x >= xmin && pos_x < xmax &&
+                pos_y >= ymin && pos_y < ymax &&
+                pos_z >= zmin && pos_z < zmax) {
+                    red_data[tid] = 1;
+            }
+        }
+    }
+
+    __syncthreads();
+    // Shared-memory tree reduction; halving s assumes blockDim.x is a power of two not larger than REDUCE_BLOCK_SIZE.
+    int s = blockDim.x >> 1;
+    while(s > 0) {
+        if(tid < s) {
+            red_data[tid] += red_data[tid + s];
+        }
+
+        __syncthreads();
+        s >>= 1;
+    }
+
+    if(tid == 0) {
+        d_weights[blockIdx.x] = red_data[0];
+    }
+}
+
+int cuda_compute_boundary_weights(
+    real_t *position, int *flags, int start, int end, int particle_capacity,
+    real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax) {
+    // Returns how many particles in [start, end) lie inside the box; an empty or inverted range yields 0.
+    if (start >= end) return 0;
+    const int nblocks = (end - start + (REDUCE_BLOCK_SIZE - 1)) / REDUCE_BLOCK_SIZE;
+    int *h_weights = (int *) malloc(nblocks * sizeof(int));
+    int *d_weights = (int *) device_alloc(nblocks * sizeof(int));
+    int red = 0;
+
+    CUDA_ASSERT(cudaMemset(d_weights, 0, nblocks * sizeof(int)));
+    reduceBoundaryWeights<<<nblocks, REDUCE_BLOCK_SIZE>>>(
+            position, flags, start, end, particle_capacity,
+            xmin, xmax, ymin, ymax, zmin, zmax, d_weights);
+    CUDA_ASSERT(cudaPeekAtLastError());
+    CUDA_ASSERT(cudaDeviceSynchronize());
+    CUDA_ASSERT(cudaMemcpy(h_weights, d_weights, nblocks * sizeof(int), cudaMemcpyDeviceToHost));
+    // The kernel leaves one partial count per block; add them up on the host.
+    for(int i = 0; i < nblocks; i++) {
+        red += h_weights[i];
+    }
+    // Release the per-call scratch buffers, which were previously leaked on every call.
+    free(h_weights);
+    device_free(d_weights);  // NOTE(review): assumes device_free() is the runtime counterpart of device_alloc() -- confirm
+    return red;
+}
+
+void compute_boundary_weights(
+    PairsRuntime *ps,
+    real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax,
+    long unsigned int *comp_weight, long unsigned int *comm_weight) {
+    // CUDA version: *comp_weight receives the count returned by cuda_compute_boundary_weights for indices [0, nlocal).
+    const int particle_capacity = ps->getTrackedVariableAsInteger("particle_capacity");
+    const int nlocal = ps->getTrackedVariableAsInteger("nlocal");
+    const int nghost = ps->getTrackedVariableAsInteger("nghost");  // only referenced by the commented-out call below
+    auto position_prop = ps->getPropertyByName("position");
+    auto flags_prop = ps->getPropertyByName("flags");
+
+    // Device pointers are fetched because the counting is done by a CUDA kernel.
+    real_t *position_ptr = static_cast<real_t *>(position_prop.getDevicePointer());
+    int *flags_ptr = static_cast<int *>(flags_prop.getDevicePointer());
+    // Presumably brings the device copies of both properties up to date before the kernel reads them -- confirm.
+    ps->copyPropertyToDevice(position_prop.getId(), ReadOnly);
+    ps->copyPropertyToDevice(flags_prop.getId(), ReadOnly);
+
+    *comp_weight = cuda_compute_boundary_weights(
+        position_ptr, flags_ptr, 0, nlocal, particle_capacity, xmin, xmax, ymin, ymax, zmin, zmax);
+
+    // TODO: ghost-based communication weight (flags_ptr added so the call matches the current signature)
+    // *comm_weight = cuda_compute_boundary_weights(
+    //     position_ptr, flags_ptr, nlocal, nlocal + nghost, particle_capacity, xmin, xmax, ymin, ymax, zmin, zmax);
+    *comm_weight = 0;
+}
+
+}
diff --git a/runtime/boundary_weights.hpp b/runtime/boundary_weights.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e84348a0c8438255ca0090b765b56fd95f7deb10
--- /dev/null
+++ b/runtime/boundary_weights.hpp
@@ -0,0 +1,20 @@
+#pragma once
+
+#include "pairs.hpp"
+#include <iostream>
+#include <string.h>
+#include <fstream>
+#include <sstream>
+//---
+#include "pairs.hpp"
+#include "pairs_common.hpp"
+
+
+namespace pairs {
+// Writes to *comp_weight the number of local, non-INFINITE/GLOBAL particles inside [xmin,xmax) x [ymin,ymax) x [zmin,zmax).
+// Defined in boundary_weights.cpp (host) and boundary_weights.cu (CUDA); both currently set *comm_weight to 0.
+void compute_boundary_weights(
+    PairsRuntime *ps,
+    real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax,
+    long unsigned int *comp_weight, long unsigned int *comm_weight);
+}
diff --git a/runtime/contact_property.hpp b/runtime/contact_property.hpp
index 2d1e03d66362b73a365e5bebcae025680e190dfe..a64992f6835aa6eb2b7d1a6be512c960cd444562 100644
--- a/runtime/contact_property.hpp
+++ b/runtime/contact_property.hpp
@@ -26,17 +26,18 @@ public:
         PAIRS_ASSERT(type != Prop_Invalid && layout_ != Invalid && sx_ > 0 && sy_ > 0);
     }
 
-    property_t getId() { return id; }
-    std::string getName() { return name; }
-    void *getHostPointer() { return h_ptr; }
-    void *getDevicePointer() { return d_ptr; }
+    property_t getId() const { return id; }
+    std::string getName() const { return name; }
+    void *getHostPointer() const { return h_ptr; }
+    void *getDevicePointer() const { return d_ptr; }
     void setPointers(void *h_ptr_, void *d_ptr_) { h_ptr = h_ptr_, d_ptr = d_ptr_; }
     void setSizes(size_t sx_, size_t sy_) { sx = sx_, sy = sy_; }
-    size_t getTotalSize() { return sx * sy * getPrimitiveTypeSize(); };
-    PropertyType getType() { return type; }
-    layout_t getLayout() { return layout; }
-    size_t getPrimitiveTypeSize() {
+    size_t getTotalSize() const { return sx * sy * getPrimitiveTypeSize(); };
+    PropertyType getType() const { return type; }
+    layout_t getLayout() const { return layout; }
+    size_t getPrimitiveTypeSize() const {
         return  (type == Prop_Integer) ? sizeof(int) :
+                (type == Prop_UInt64) ? sizeof(uint64_t) :
                 (type == Prop_Real) ? sizeof(real_t) :
                 (type == Prop_Vector) ? sizeof(real_t) :
                 (type == Prop_Matrix) ? sizeof(real_t) :
diff --git a/runtime/copper_fcc_lattice.cpp b/runtime/copper_fcc_lattice.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1fd364f50fc9da952d30fa41c4e7596aad5a7f71
--- /dev/null
+++ b/runtime/copper_fcc_lattice.cpp
@@ -0,0 +1,142 @@
+#include <iostream>
+#include <math.h>
+//---
+#include "copper_fcc_lattice.hpp"
+
+namespace pairs {
+
+double myrandom(int* seed) {
+    // One generator step: *seed is advanced in place and the new state, scaled by AM, is returned.
+    const int quotient = (*seed) / IQ;
+    int next = IA * (*seed - quotient * IQ) - IR * quotient;
+    // A negative intermediate state is shifted back by IM.
+    if(next < 0) { next += IM; }
+    *seed = next;
+    return AM * next;
+}
+
+void random_reset(int *seed, int ibase, double *coord) {
+    // Derives *seed from a hash over the bytes of ibase and of the three doubles at coord, then warms up the RNG.
+    int i;
+    char *str = (char *) &ibase;
+    int n = sizeof(int);
+    unsigned int hash = 0;
+    // Fold the sizeof(int) bytes of ibase into the hash.
+    for (i = 0; i < n; i++) {
+        hash += str[i];
+        hash += (hash << 10);
+        hash ^= (hash >> 6);
+    }
+    // Fold the 3 * sizeof(double) bytes starting at coord into the hash.
+    str = (char *) coord;
+    n = 3 * sizeof(double);
+    for (i = 0; i < n; i++) {
+        hash += str[i];
+        hash += (hash << 10);
+        hash ^= (hash >> 6);
+    }
+    // Final mixing, applied once after both byte loops.
+    hash += (hash << 3);
+    hash ^= (hash >> 11);
+    hash += (hash << 15);
+
+    // keep 31 bits of unsigned int as new seed
+    // do not allow seed = 0, since will cause hang in gaussian()
+    // NOTE(review): 0x7ffffff has 27 bits set; a 31-bit mask would be 0x7fffffff -- confirm which is intended.
+    *seed = hash & 0x7ffffff;
+    if (!(*seed)) *seed = 1;
+
+    // warm up the RNG by drawing and discarding five values
+    for (i = 0; i < 5; i++) myrandom(seed);
+    //save = 0;
+}
+
+double copper_fcc_lattice(
+    PairsRuntime *ps, int nx, int ny, int nz, double xprd, double yprd, double zprd,
+    double rho, int ntypes) {
+    // Appends point masses on lattice sites with even i+j+k that fall inside this subdomain; returns the new particle count.
+    auto uids = ps->getAsUInt64Property(ps->getPropertyByName("uid"));
+    auto shapes = ps->getAsIntegerProperty(ps->getPropertyByName("shape"));
+    auto types = ps->getAsIntegerProperty(ps->getPropertyByName("type"));
+    auto flags = ps->getAsIntegerProperty(ps->getPropertyByName("flags"));
+    auto masses = ps->getAsFloatProperty(ps->getPropertyByName("mass"));
+    auto positions = ps->getAsVectorProperty(ps->getPropertyByName("position"));
+    auto velocities = ps->getAsVectorProperty(ps->getPropertyByName("linear_velocity"));
+    double xlo = 0.0, xhi = xprd;
+    double ylo = 0.0, yhi = yprd;
+    double zlo = 0.0, zhi = zprd;
+    int natoms = ps->getTrackedVariableAsInteger("nlocal");  // new particles are written after the existing nlocal entries
+    //int natoms_expected = 4 * nx * ny * nz;
+    // Lattice constant alat = (4 / rho)^(1/3); the index bounds below are in units of alat / 2.
+    double alat = pow((4.0 / rho), (1.0 / 3.0));
+    int ilo = (int) (xlo / (0.5 * alat) - 1);
+    int ihi = (int) (xhi / (0.5 * alat) + 1);
+    int jlo = (int) (ylo / (0.5 * alat) - 1);
+    int jhi = (int) (yhi / (0.5 * alat) + 1);
+    int klo = (int) (zlo / (0.5 * alat) - 1);
+    int khi = (int) (zhi / (0.5 * alat) + 1);
+    // Clamp each index range to [0, 2 * n - 1] for its dimension.
+    ilo = MAX(ilo, 0);
+    ihi = MIN(ihi, 2 * nx - 1);
+    jlo = MAX(jlo, 0);
+    jhi = MIN(jhi, 2 * ny - 1);
+    klo = MAX(klo, 0);
+    khi = MIN(khi, 2 * nz - 1);
+
+    double xtmp, ytmp, ztmp, vxtmp, vytmp, vztmp;
+    int i, j, k, m, n;
+    int sx = 0; int sy = 0; int sz = 0;
+    int ox = 0; int oy = 0; int oz = 0;
+    int subboxdim = 8;
+    // Sites are visited sub-box by sub-box: (ox, oy, oz) selects the sub-box, (sx, sy, sz) the offset inside it.
+    while(oz * subboxdim <= khi) {
+        k = oz * subboxdim + sz;
+        j = oy * subboxdim + sy;
+        i = ox * subboxdim + sx;
+        // Only sites with even i + j + k inside the clamped index ranges are candidates.
+        if(((i + j + k) % 2 == 0) &&
+            (i >= ilo) && (i <= ihi) &&
+            (j >= jlo) && (j <= jhi) &&
+            (k >= klo) && (k <= khi)) {
+
+            xtmp = 0.5 * alat * i;
+            ytmp = 0.5 * alat * j;
+            ztmp = 0.5 * alat * k;
+            // Sites the domain partitioner reports as outside this subdomain are skipped.
+            if(ps->getDomainPartitioner()->isWithinSubdomain(xtmp, ytmp, ztmp)) {
+                n = k * (2 * ny) * (2 * nx) + j * (2 * nx) + i + 1;  // RNG seed derived from the site indices
+                for(m = 0; m < 5; m++) { myrandom(&n); }  // five values are discarded before each velocity component
+                vxtmp = myrandom(&n);
+                for(m = 0; m < 5; m++){ myrandom(&n); }
+                vytmp = myrandom(&n);
+                for(m = 0; m < 5; m++) { myrandom(&n); }
+                vztmp = myrandom(&n);
+                // Fill slot natoms. NOTE(review): no capacity check is visible here -- confirm the caller guarantees room.
+                uids(natoms) = UniqueID::create(ps);
+                masses(natoms) = 1.0;
+                positions(natoms, 0) = xtmp;
+                positions(natoms, 1) = ytmp;
+                positions(natoms, 2) = ztmp;
+                velocities(natoms, 0) = vxtmp;
+                velocities(natoms, 1) = vytmp;
+                velocities(natoms, 2) = vztmp;
+                types(natoms) = rand() % ntypes;  // type comes from the C library rand(), not from myrandom
+                flags(natoms) = 0;
+                shapes(natoms) = 2; // point mass
+                natoms++;
+            }
+        }
+        // Advance to the next site: offsets inside the sub-box first, then the sub-box indices.
+        sx++;
+        // Each counter that reaches its limit wraps to 0 and carries into the next one; oz ends the outer loop.
+        if(sx == subboxdim) { sx = 0; sy++; }
+        if(sy == subboxdim) { sy = 0; sz++; }
+        if(sz == subboxdim) { sz = 0; ox++; }
+        if(ox * subboxdim > ihi) { ox = 0; oy++; }
+        if(oy * subboxdim > jhi) { oy = 0; oz++; }
+    }
+    // NOTE(review): the tracked "nlocal" is not updated in this function; presumably the caller stores the result -- confirm.
+    return natoms;  // int count, returned through the double return type
+}
+
+}
diff --git a/runtime/copper_fcc_lattice.hpp b/runtime/copper_fcc_lattice.hpp
index 48fec71f6cdc50a99dd09568d91a8390e38f551c..8c4a016acc5d891a33ab7e0f52db983a540d45dd 100644
--- a/runtime/copper_fcc_lattice.hpp
+++ b/runtime/copper_fcc_lattice.hpp
@@ -1,12 +1,8 @@
-#include <iostream>
-#include <math.h>
-//---
 #include "pairs.hpp"
+#include "unique_id.hpp"
 
 #pragma once
 
-namespace pairs {
-
 /* Park/Miller RNG w/out MASKING, so as to be like f90s version */
 #define IA 16807
 #define IM 2147483647
@@ -15,133 +11,12 @@ namespace pairs {
 #define IR 2836
 #define MASK 123459876
 
-double myrandom(int* seed) {
-    int k= (*seed) / IQ;
-    double ans;
-
-    *seed = IA * (*seed - k * IQ) - IR * k;
-    if(*seed < 0) *seed += IM;
-    ans = AM * (*seed);
-    return ans;
-}
-
-void random_reset(int *seed, int ibase, double *coord) {
-    int i;
-    char *str = (char *) &ibase;
-    int n = sizeof(int);
-    unsigned int hash = 0;
-
-    for (i = 0; i < n; i++) {
-        hash += str[i];
-        hash += (hash << 10);
-        hash ^= (hash >> 6);
-    }
-
-    str = (char *) coord;
-    n = 3 * sizeof(double);
-    for (i = 0; i < n; i++) {
-        hash += str[i];
-        hash += (hash << 10);
-        hash ^= (hash >> 6);
-    }
-
-    hash += (hash << 3);
-    hash ^= (hash >> 11);
-    hash += (hash << 15);
-
-    // keep 31 bits of unsigned int as new seed
-    // do not allow seed = 0, since will cause hang in gaussian()
-
-    *seed = hash & 0x7ffffff;
-    if (!(*seed)) *seed = 1;
-
-    // warm up the RNG
-
-    for (i = 0; i < 5; i++) myrandom(seed);
-    //save = 0;
-}
-
-double copper_fcc_lattice(PairsSimulation *ps, int nx, int ny, int nz, double xprd, double yprd, double zprd, double rho, int ntypes) {
-    auto shape = ps->getAsIntegerProperty(ps->getPropertyByName("shape"));
-    auto types = ps->getAsIntegerProperty(ps->getPropertyByName("type"));
-    auto flags = ps->getAsIntegerProperty(ps->getPropertyByName("flags"));
-    auto masses = ps->getAsFloatProperty(ps->getPropertyByName("mass"));
-    auto positions = ps->getAsVectorProperty(ps->getPropertyByName("position"));
-    auto velocities = ps->getAsVectorProperty(ps->getPropertyByName("linear_velocity"));
-    double xlo = 0.0, xhi = xprd;
-    double ylo = 0.0, yhi = yprd;
-    double zlo = 0.0, zhi = zprd;
-    int natoms = 0;
-    //int natoms_expected = 4 * nx * ny * nz;
-
-    double alat = pow((4.0 / rho), (1.0 / 3.0));
-    int ilo = (int) (xlo / (0.5 * alat) - 1);
-    int ihi = (int) (xhi / (0.5 * alat) + 1);
-    int jlo = (int) (ylo / (0.5 * alat) - 1);
-    int jhi = (int) (yhi / (0.5 * alat) + 1);
-    int klo = (int) (zlo / (0.5 * alat) - 1);
-    int khi = (int) (zhi / (0.5 * alat) + 1);
-
-    ilo = MAX(ilo, 0);
-    ihi = MIN(ihi, 2 * nx - 1);
-    jlo = MAX(jlo, 0);
-    jhi = MIN(jhi, 2 * ny - 1);
-    klo = MAX(klo, 0);
-    khi = MIN(khi, 2 * nz - 1);
-
-    double xtmp, ytmp, ztmp, vxtmp, vytmp, vztmp;
-    int i, j, k, m, n;
-    int sx = 0; int sy = 0; int sz = 0;
-    int ox = 0; int oy = 0; int oz = 0;
-    int subboxdim = 8;
-
-    while(oz * subboxdim <= khi) {
-        k = oz * subboxdim + sz;
-        j = oy * subboxdim + sy;
-        i = ox * subboxdim + sx;
-
-        if(((i + j + k) % 2 == 0) &&
-            (i >= ilo) && (i <= ihi) &&
-            (j >= jlo) && (j <= jhi) &&
-            (k >= klo) && (k <= khi)) {
-
-            xtmp = 0.5 * alat * i;
-            ytmp = 0.5 * alat * j;
-            ztmp = 0.5 * alat * k;
-
-            if(ps->getDomainPartitioner()->isWithinSubdomain(xtmp, ytmp, ztmp)) {
-                n = k * (2 * ny) * (2 * nx) + j * (2 * nx) + i + 1;
-                for(m = 0; m < 5; m++) { myrandom(&n); }
-                vxtmp = myrandom(&n);
-                for(m = 0; m < 5; m++){ myrandom(&n); }
-                vytmp = myrandom(&n);
-                for(m = 0; m < 5; m++) { myrandom(&n); }
-                vztmp = myrandom(&n);
-
-                masses(natoms) = 1.0;
-                positions(natoms, 0) = xtmp;
-                positions(natoms, 1) = ytmp;
-                positions(natoms, 2) = ztmp;
-                velocities(natoms, 0) = vxtmp;
-                velocities(natoms, 1) = vytmp;
-                velocities(natoms, 2) = vztmp;
-                types(natoms) = rand() % ntypes;
-                flags(natoms) = 0;
-                shape(natoms) = 2; // point mass
-                natoms++;
-            }
-        }
-
-        sx++;
-
-        if(sx == subboxdim) { sx = 0; sy++; }
-        if(sy == subboxdim) { sy = 0; sz++; }
-        if(sz == subboxdim) { sz = 0; ox++; }
-        if(ox * subboxdim > ihi) { ox = 0; oy++; }
-        if(oy * subboxdim > jhi) { oy = 0; oz++; }
-    }
+namespace pairs {
 
-    return natoms;
-}
+double myrandom(int* seed);
+void random_reset(int *seed, int ibase, double *coord);
+double copper_fcc_lattice(
+    PairsRuntime *ps, int nx, int ny, int nz, double xprd, double yprd, double zprd,
+    double rho, int ntypes);
 
 }
diff --git a/runtime/create_body.cpp b/runtime/create_body.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6c431f646db037670fa29b0d4e73ab3e35a420ae
--- /dev/null
+++ b/runtime/create_body.cpp
@@ -0,0 +1,75 @@
+#include "create_body.hpp"
+
+namespace pairs {
+
+// Appends a halfspace body to the local particle arrays; returns the uid of the body created, or 0 if the body is not created
+id_t create_halfspace(PairsRuntime *pr, 
+                    double x, double y, double z,   // written to the "position" property
+                    double nx, double ny, double nz,   // written to the "normal" property
+                    int type, int flag){
+    // TODO: increase capacity if exceeded (this function performs no capacity check before writing slot nlocal)
+    id_t uid = 0;   // 0 doubles as the "not created" return value
+    auto uids = pr->getAsUInt64Property(pr->getPropertyByName("uid"));   
+    auto shapes = pr->getAsIntegerProperty(pr->getPropertyByName("shape"));
+    auto types = pr->getAsIntegerProperty(pr->getPropertyByName("type"));
+    auto flags = pr->getAsIntegerProperty(pr->getPropertyByName("flags"));   // local accessor; flags::X below still names the flags scope
+    auto positions = pr->getAsVectorProperty(pr->getPropertyByName("position"));
+    auto normals = pr->getAsVectorProperty(pr->getPropertyByName("normal"));
+
+    if(pr->getDomainPartitioner()->isWithinSubdomain(x, y, z) || flag & (flags::INFINITE | flags::GLOBAL) ){   // & binds tighter than ||: INFINITE/GLOBAL bodies skip the subdomain test
+        int n = pr->getTrackedVariableAsInteger("nlocal");   // the new body goes into slot nlocal
+        uid = (flag & flags::GLOBAL) ? UniqueID::createGlobal(pr) : UniqueID::create(pr);   // GLOBAL bodies get their uid from createGlobal()
+        uids(n) = uid;
+        positions(n, 0) = x;
+        positions(n, 1) = y;
+        positions(n, 2) = z;
+        normals(n, 0) = nx;
+        normals(n, 1) = ny;
+        normals(n, 2) = nz;
+        types(n) = type;
+        flags(n) = flag;
+        shapes(n) = 1;   // shape id 1 = halfspace
+        pr->setTrackedVariableAsInteger("nlocal", n + 1);   // publish the enlarged local count
+    }
+
+    return uid;
+}
+
+// Appends a sphere to the local particle arrays; returns the uid of the body created, or 0 if the body is not created
+id_t create_sphere(PairsRuntime *pr, 
+                    double x, double y, double z,   // written to the "position" property
+                    double vx, double vy, double vz,   // written to the "linear_velocity" property
+                    double density, double radius, int type, int flag){
+    // TODO: increase capacity if exceeded (this function performs no capacity check before writing slot nlocal)
+    id_t uid = 0;   // 0 doubles as the "not created" return value
+    auto uids = pr->getAsUInt64Property(pr->getPropertyByName("uid"));   
+    auto shapes = pr->getAsIntegerProperty(pr->getPropertyByName("shape"));
+    auto types = pr->getAsIntegerProperty(pr->getPropertyByName("type"));
+    auto flags = pr->getAsIntegerProperty(pr->getPropertyByName("flags"));   // local accessor; flags::GLOBAL below still names the flags scope
+    auto masses = pr->getAsFloatProperty(pr->getPropertyByName("mass"));
+    auto radii = pr->getAsFloatProperty(pr->getPropertyByName("radius"));
+    auto positions = pr->getAsVectorProperty(pr->getPropertyByName("position"));
+    auto velocities = pr->getAsVectorProperty(pr->getPropertyByName("linear_velocity"));
+
+    if(pr->getDomainPartitioner()->isWithinSubdomain(x, y, z)) {   // NOTE(review): unlike create_halfspace, GLOBAL/INFINITE flags do not bypass this test, although the uid choice below handles GLOBAL — confirm intended
+        int n = pr->getTrackedVariableAsInteger("nlocal");   // the new body goes into slot nlocal
+        uid = (flag & flags::GLOBAL) ? UniqueID::createGlobal(pr) : UniqueID::create(pr);
+        uids(n) = uid;
+        radii(n) = radius;
+        masses(n) = ((4.0 / 3.0) * M_PI) * radius * radius * radius * density;   // sphere volume (4/3 * pi * r^3) times density
+        positions(n, 0) = x;
+        positions(n, 1) = y;
+        positions(n, 2) = z;
+        velocities(n, 0) = vx;
+        velocities(n, 1) = vy;
+        velocities(n, 2) = vz;
+        types(n) = type;
+        flags(n) = flag;
+        shapes(n) = 0;   // shape id 0 = sphere
+        pr->setTrackedVariableAsInteger("nlocal", n + 1);   // publish the enlarged local count
+    }
+    
+    return uid;
+}
+
+}
\ No newline at end of file
diff --git a/runtime/create_body.hpp b/runtime/create_body.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..995b1f6998940c09d484fad159ba0a382640a82b
--- /dev/null
+++ b/runtime/create_body.hpp
@@ -0,0 +1,18 @@
+#include "pairs.hpp"
+#include "unique_id.hpp"
+
+#pragma once
+
+namespace pairs {
+
+id_t create_halfspace(PairsRuntime *pr,   // returns the uid of the created halfspace, or 0 if none was created
+                    double x, double y, double z,   // stored as the body's position
+                    double nx, double ny, double nz,   // stored as the body's normal
+                    int type, int flag);   // flag: INFINITE or GLOBAL creates the body even outside the subdomain
+
+id_t create_sphere(PairsRuntime *pr,   // returns the uid of the created sphere, or 0 if none was created
+                    double x, double y, double z,   // stored as the body's position; must pass isWithinSubdomain for the sphere to be created
+                    double vx, double vy, double vz,   // stored as the body's linear velocity
+                    double density, double radius, int type, int flag);   // mass is derived as 4/3 * pi * radius^3 * density
+
+}
\ No newline at end of file
diff --git a/runtime/dem_sc_grid.cpp b/runtime/dem_sc_grid.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..119ec78d75cf50dbbc73a9033750bb791353e6b5
--- /dev/null
+++ b/runtime/dem_sc_grid.cpp
@@ -0,0 +1,134 @@
+#include <iostream>
+//---
+#include "dem_sc_grid.hpp"
+
+namespace pairs {
+
+namespace internal {
+
+static std::mt19937 generator; // file-local, default-constructed engine (a 64-bit alternative would be std::mt19937_64)
+
+std::mt19937 & get_generator() {
+    // Hands out a reference to the single file-local engine above, so all callers advance the same sequence
+    return generator;
+}
+
+}
+
+bool point_within_aabb(double point[], double aabb[]) {   // point = {x, y, z}; aabb is read as {xmin, ymin, zmin, xmax, ymax, zmax}
+    return point[0] >= aabb[0] && point[0] < aabb[3] &&   // each axis: lower bound inclusive,
+           point[1] >= aabb[1] && point[1] < aabb[4] &&   // upper bound exclusive
+           point[2] >= aabb[2] && point[2] < aabb[5];
+}
+
+int dem_sc_grid(PairsRuntime *ps, double xmax, double ymax, double zmax, double spacing, double diameter, double min_diameter, double max_diameter, double initial_velocity, double particle_density, int ntypes) {   // appends spheres on a simple-cubic grid inside [0,xmax) x [0,ymax) x [0,zmax); returns the resulting local particle count
+    auto uids = ps->getAsUInt64Property(ps->getPropertyByName("uid"));
+    auto shapes = ps->getAsIntegerProperty(ps->getPropertyByName("shape"));
+    auto types = ps->getAsIntegerProperty(ps->getPropertyByName("type"));
+    auto flags = ps->getAsIntegerProperty(ps->getPropertyByName("flags"));
+    auto masses = ps->getAsFloatProperty(ps->getPropertyByName("mass"));
+    auto radius = ps->getAsFloatProperty(ps->getPropertyByName("radius"));
+    auto positions = ps->getAsVectorProperty(ps->getPropertyByName("position"));
+    auto velocities = ps->getAsVectorProperty(ps->getPropertyByName("linear_velocity"));
+    int nparticles = ps->getTrackedVariableAsInteger("nlocal");   // new particles are appended after the existing locals
+
+    const double xmin = 0.0;
+    const double ymin = 0.0;
+    const double zmin = 0.0;
+
+    double gen_domain[] = {xmin, ymin, zmin, xmax, ymax, zmax};   // layout expected by point_within_aabb
+    double ref_point[] = {spacing * 0.5, spacing * 0.5, spacing * 0.5};   // grid points sit at ref_point + index * spacing
+    double sc_xmin = xmin - ref_point[0];
+    double sc_ymin = ymin - ref_point[1];
+    double sc_zmin = zmin - ref_point[2];
+
+    int iret = (int)(ceil(sc_xmin / spacing));   // start index per axis; with the zero minima above this is ceil(-0.5) = 0 for spacing > 0
+    int jret = (int)(ceil(sc_ymin / spacing));
+    int kret = (int)(ceil(sc_zmin / spacing));
+
+    int i = iret;
+    int j = jret;
+    int k = kret;
+
+    double point[3];
+    point[0] = ref_point[0] + i * spacing;
+    point[1] = ref_point[1] + j * spacing;
+    point[2] = ref_point[2] + k * spacing;
+
+    while(point_within_aabb(point, gen_domain)) {
+        auto pdiam = realRandom<real_t>(min_diameter, max_diameter);   // drawn for every grid point, before the subdomain test — presumably to keep the sequence independent of the decomposition; confirm
+
+        if(ps->getDomainPartitioner()->isWithinSubdomain(point[0], point[1], point[2])) {
+            real_t rad = pdiam * 0.5;
+            uids(nparticles) = UniqueID::create(ps);
+            radius(nparticles) = rad;
+            masses(nparticles) = ((4.0 / 3.0) * M_PI) * rad * rad * rad * particle_density;   // sphere volume times density
+            positions(nparticles, 0) = point[0];
+            positions(nparticles, 1) = point[1];
+            positions(nparticles, 2) = point[2];
+            velocities(nparticles, 0) = 0.1 * realRandom<real_t>(-initial_velocity, initial_velocity);   // each component: 0.1 times a realRandom(-v, v) draw
+            velocities(nparticles, 1) = 0.1 * realRandom<real_t>(-initial_velocity, initial_velocity);
+            velocities(nparticles, 2) = 0.1 * realRandom<real_t>(-initial_velocity, initial_velocity);
+            types(nparticles) = rand() % ntypes;   // uses C rand(), not the mt19937 engine; ntypes must be non-zero (modulo by zero otherwise)
+            flags(nparticles) = 0;
+            shapes(nparticles) = shapes::Sphere;
+
+            /* Disabled per-particle debug dump:
+            std::cout << uids(nparticles) << "," << types(nparticles) << "," << masses(nparticles) << "," << radius(nparticles) << ","
+                      << positions(nparticles, 0) << "," << positions(nparticles, 1) << "," << positions(nparticles, 2) << ","
+                      << velocities(nparticles, 0) << "," << velocities(nparticles, 1) << "," << velocities(nparticles, 2) << ","
+                      << flags(nparticles) << std::endl;
+            */
+
+            nparticles++;
+        }
+
+        ++i;   // advance along x first
+        point[0] = ref_point[0] + i * spacing;
+        point[1] = ref_point[1] + j * spacing;
+        point[2] = ref_point[2] + k * spacing;
+
+        if(!point_within_aabb(point, gen_domain)) {   // x left the domain: restart x, step y
+            i = iret;
+            j++;
+            point[0] = ref_point[0] + i * spacing;
+            point[1] = ref_point[1] + j * spacing;
+            point[2] = ref_point[2] + k * spacing;
+
+            if(!point_within_aabb(point, gen_domain)) {   // y left the domain too: restart y, step z
+                j = jret;
+                k++;
+                point[0] = ref_point[0] + i * spacing;
+                point[1] = ref_point[1] + j * spacing;
+                point[2] = ref_point[2] + k * spacing;
+
+                if(!point_within_aabb(point, gen_domain)) {   // z left the domain as well: the grid is complete
+                    break;
+                }
+            }
+        }
+    }
+
+    ps->setTrackedVariableAsInteger("nlocal", nparticles);   // publish the enlarged local count
+
+    int global_nparticles = nparticles;
+    if(ps->getDomainPartitioner()->getWorldSize() > 1) {   // the reduction is skipped when there is a single rank
+        MPI_Allreduce(&nparticles, &global_nparticles, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+    }
+
+    if(ps->getDomainPartitioner()->getRank() == 0) {   // summary is printed by rank 0 only
+        std::cout << "DEM Simple-Cubic Grid" << std::endl;
+        std::cout << "Domain size: <" << xmax << ", " << ymax << ", " << zmax << ">" << std::endl;
+        std::cout << "Spacing: " << spacing << std::endl;
+        std::cout << "Diameter: " << diameter   // the only use of the diameter parameter in this function
+                  << " (min = " << min_diameter << ", max = " << max_diameter << ")" << std::endl;
+        std::cout << "Initial velocity: " << initial_velocity << std::endl;
+        std::cout << "Particle density: " << particle_density << std::endl;
+        std::cout << "Number of types: " << ntypes << std::endl;
+        std::cout << "Number of particles: " << global_nparticles << std::endl;
+    }
+
+    return nparticles;   // local count (pre-existing locals included), not the global sum
+}
+
+}
diff --git a/runtime/dem_sc_grid.hpp b/runtime/dem_sc_grid.hpp
index 8c85ce6b77d84ddd9e8e52327e3323fefb29378a..9eb34620a56aea9703b676c941fada9b751f42e8 100644
--- a/runtime/dem_sc_grid.hpp
+++ b/runtime/dem_sc_grid.hpp
@@ -1,8 +1,9 @@
-#include <iostream>
 #include <math.h>
 #include <random>
 //---
 #include "pairs.hpp"
+#include "pairs_common.hpp"
+#include "unique_id.hpp"
 
 #pragma once
 
@@ -10,16 +11,11 @@ namespace pairs {
 
 namespace internal {
 
-static std::mt19937 generator; // static std::mt19937_64 generator;
+std::mt19937 & get_generator();
 
-std::mt19937 & get_generator() {
-    // std::mt19937_64
-    return generator;
 }
 
-}
-
-template< typename REAL_TYPE = real_t>
+template<typename REAL_TYPE = real_t>
 REAL_TYPE realRandom(
     const REAL_TYPE min = REAL_TYPE(0),
     const REAL_TYPE max = REAL_TYPE(1),
@@ -41,134 +37,23 @@ REAL_TYPE realRandom(
    return value;
 }
 
+template<typename REAL_TYPE>
+class RealRandom {
+public:
+    RealRandom(const std::mt19937::result_type& seed = std::mt19937::result_type()) {   // default seed is a value-initialised result_type, i.e. 0
+        generator_.seed(seed);   // each instance owns and seeds its own engine
+    }
 
+    REAL_TYPE operator()(const REAL_TYPE min = REAL_TYPE(0), const REAL_TYPE max = REAL_TYPE(1)) {   // defaults give the (0, 1) argument pair
+        return realRandom(min, max, generator_);   // same free function as above, but driven by this instance's engine
+    }
 
-template<typename REAL_TYPE> class RealRandom {
-public:
-   RealRandom(const std::mt19937::result_type& seed = std::mt19937::result_type()) { generator_.seed(seed); }
-   REAL_TYPE operator()(const REAL_TYPE min = REAL_TYPE(0), const REAL_TYPE max = REAL_TYPE(1) ) {
-      return realRandom(min, max, generator_);
-   }
 private:
    std::mt19937 generator_;
 };
 
-bool point_within_aabb(double point[], double aabb[]) {
-    return point[0] >= aabb[0] && point[0] < aabb[3] &&
-           point[1] >= aabb[1] && point[1] < aabb[4] &&
-           point[2] >= aabb[2] && point[2] < aabb[5];
-}
-
-int dem_sc_grid(PairsSimulation *ps, double xmax, double ymax, double zmax, double spacing, double diameter, double min_diameter, double max_diameter, double initial_velocity, double particle_density, int ntypes) {
-    auto uid = ps->getAsIntegerProperty(ps->getPropertyByName("uid"));
-    auto shape = ps->getAsIntegerProperty(ps->getPropertyByName("shape"));
-    auto types = ps->getAsIntegerProperty(ps->getPropertyByName("type"));
-    auto flags = ps->getAsIntegerProperty(ps->getPropertyByName("flags"));
-    auto masses = ps->getAsFloatProperty(ps->getPropertyByName("mass"));
-    auto radius = ps->getAsFloatProperty(ps->getPropertyByName("radius"));
-    auto positions = ps->getAsVectorProperty(ps->getPropertyByName("position"));
-    auto velocities = ps->getAsVectorProperty(ps->getPropertyByName("linear_velocity"));
-    int last_uid = 1;
-    int nparticles = 0;
-
-    const double xmin = 0.0;
-    const double ymin = 0.0;
-    const double zmin = diameter;
-
-    double gen_domain[] = {xmin, ymin, zmin, xmax, ymax, zmax};
-    double ref_point[] = {spacing * 0.5, spacing * 0.5, spacing * 0.5};
-    double sc_xmin = xmin - ref_point[0];
-    double sc_ymin = ymin - ref_point[1];
-    double sc_zmin = zmin - ref_point[2];
-
-    int iret = (int)(ceil(sc_xmin / spacing));
-    int jret = (int)(ceil(sc_ymin / spacing));
-    int kret = (int)(ceil(sc_zmin / spacing));
-
-    int i = iret;
-    int j = jret;
-    int k = kret;
-
-    double point[3];
-    point[0] = ref_point[0] + i * spacing;
-    point[1] = ref_point[1] + j * spacing;
-    point[2] = ref_point[2] + k * spacing;
-
-    while(point_within_aabb(point, gen_domain)) {
-        int particle_uid = last_uid;
-        auto diameter = realRandom<real_t>(min_diameter, max_diameter);
-
-        if(ps->getDomainPartitioner()->isWithinSubdomain(point[0], point[1], point[2])) {
-            real_t rad = diameter * 0.5;
-            uid(nparticles) = particle_uid;
-            radius(nparticles) = rad;
-            masses(nparticles) = ((4.0 / 3.0) * M_PI) * rad * rad * rad * particle_density;
-            positions(nparticles, 0) = point[0];
-            positions(nparticles, 1) = point[1];
-            positions(nparticles, 2) = point[2];
-            velocities(nparticles, 0) = 0.1 * realRandom<real_t>(-initial_velocity, initial_velocity);
-            velocities(nparticles, 1) = 0.1 * realRandom<real_t>(-initial_velocity, initial_velocity);
-            velocities(nparticles, 2) = -initial_velocity;
-            types(nparticles) = rand() % ntypes;
-            flags(nparticles) = 0;
-            shape(nparticles) = 0; // sphere
-
-            /*
-            std::cout << uid(nparticles) << "," << types(nparticles) << "," << masses(nparticles) << "," << radius(nparticles) << ","
-                      << positions(nparticles, 0) << "," << positions(nparticles, 1) << "," << positions(nparticles, 2) << ","
-                      << velocities(nparticles, 0) << "," << velocities(nparticles, 1) << "," << velocities(nparticles, 2) << ","
-                      << flags(nparticles) << std::endl;
-            */
-
-            nparticles++;
-        }
-
-        ++i;
-        point[0] = ref_point[0] + i * spacing;
-        point[1] = ref_point[1] + j * spacing;
-        point[2] = ref_point[2] + k * spacing;
-
-        if(!point_within_aabb(point, gen_domain)) {
-            i = iret;
-            j++;
-            point[0] = ref_point[0] + i * spacing;
-            point[1] = ref_point[1] + j * spacing;
-            point[2] = ref_point[2] + k * spacing;
-
-            if(!point_within_aabb(point, gen_domain)) {
-                j = jret;
-                k++;
-                point[0] = ref_point[0] + i * spacing;
-                point[1] = ref_point[1] + j * spacing;
-                point[2] = ref_point[2] + k * spacing;
-
-                if(!point_within_aabb(point, gen_domain)) {
-                    break;
-                }
-            }
-        }
-
-        last_uid++;
-    }
-
-    int global_nparticles = nparticles;
-    if(ps->getDomainPartitioner()->getWorldSize() > 1) {
-        MPI_Allreduce(&nparticles, &global_nparticles, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
-    }
+bool point_within_aabb(double point[], double aabb[]);
 
-    if(ps->getDomainPartitioner()->getRank() == 0) {
-        std::cout << "DEM Simple-Cubic Grid" << std::endl;
-        std::cout << "Domain size: <" << xmax << ", " << ymax << ", " << zmax << ">" << std::endl;
-        std::cout << "Spacing: " << spacing << std::endl;
-        std::cout << "Diameter: " << diameter
-                  << " (min = " << min_diameter << ", max = " << max_diameter << ")" << std::endl;
-        std::cout << "Initial velocity: " << initial_velocity << std::endl;
-        std::cout << "Particle density: " << particle_density << std::endl;
-        std::cout << "Number of types: " << ntypes << std::endl;
-        std::cout << "Number of particles: " << global_nparticles << std::endl;
-    }
-
-    return nparticles;
-}
+int dem_sc_grid(PairsRuntime *ps, double xmax, double ymax, double zmax, double spacing, double diameter, double min_diameter, double max_diameter, double initial_velocity, double particle_density, int ntypes);
 
 }
diff --git a/runtime/device_flags.hpp b/runtime/device_flags.hpp
index 4b5085fb6b61f8df1c3e1e129541266765745fdf..089e32f3122dff09e56e01120ca33816844f8030 100644
--- a/runtime/device_flags.hpp
+++ b/runtime/device_flags.hpp
@@ -14,7 +14,7 @@ private:
     static const int narrays_per_flag = 64;
 public:
     DeviceFlags(int narrays_) : narrays(narrays_) {
-        nflags = std::ceil((double) narrays_ / (double) narrays_per_flag);
+        nflags = static_cast<int>(std::ceil((double) narrays_ / (double) narrays_per_flag));
         hflags = new unsigned long long int[nflags];
         dflags = new unsigned long long int[nflags];
 
diff --git a/runtime/devices/cuda.cu b/runtime/devices/cuda.cu
index 8bb7c59ef3bbefff0556847669faeae57744372e..2cae5aa89accf7c720ea3f29d922184ab0264830 100644
--- a/runtime/devices/cuda.cu
+++ b/runtime/devices/cuda.cu
@@ -1,18 +1,13 @@
 #include <cuda_runtime.h>
 #include <iostream>
 #include <cstring>
+#include "../pairs_common.hpp"
+#include "device.hpp"
 
 #define CUDA_ASSERT(a) { pairs::cuda_assert((a), __FILE__, __LINE__); }
 
 namespace pairs {
 
-inline void cuda_assert(cudaError_t err, const char *file, int line) {
-    if(err != cudaSuccess) {
-        std::cerr << file << ":" << line << ": " << cudaGetErrorString(err) << std::endl;
-        exit(-1);
-    }
-}
-
 __host__ void *device_alloc(size_t size) {
     void *ptr;
     CUDA_ASSERT(cudaMalloc(&ptr, size));
@@ -71,4 +66,38 @@ __host__ void copy_static_symbol_to_host(void *d_ptr, const void *h_ptr, size_t
     //CUDA_ASSERT(cudaMemcpyFromSymbol(h_ptr, d_ptr, count));
 }
 
+#if __CUDA_ARCH__ < 600   // compare-and-swap fallback; NOTE(review): also selected when __CUDA_ARCH__ is undefined (it then evaluates as 0)
+__device__ double atomicAdd_double(double* address, double val) {
+    unsigned long long int * ull_addr = (unsigned long long int*) address;   // atomicCAS is applied to the 64-bit integer view of the double
+    unsigned long long int old = *ull_addr, assumed;
+
+    do {
+        assumed = old;
+        old = atomicCAS(ull_addr, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));   // store assumed + val only if the word still equals assumed
+        // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
+    } while (assumed != old);   // another thread changed the word in between: retry with the fresh value
+
+    return __longlong_as_double(old);   // value held before this thread's addition
+}
+#else   // otherwise forward to the built-in atomicAdd on double
+__device__ double atomicAdd_double(double* address, double val) {
+    return atomicAdd(address, val);
+}
+#endif
+
+__device__ int atomic_add(int *addr, int val) { return atomicAdd(addr, val); }   // device overload for int: forwards to CUDA atomicAdd and returns its result
+__device__ real_t atomic_add(real_t *addr, real_t val) { return atomicAdd_double(addr, val); }   // NOTE(review): addr is passed as double*, so this presumes real_t is double — confirm
+__device__ int atomic_add_resize_check(int *addr, int val, int *resize, int capacity) {   // adds val to *addr unless the sum would reach capacity; in that case records the needed size in *resize instead
+    const int add_res = *addr + val;   // NOTE(review): plain (non-atomic) read, so the check can race with concurrent adds — presumably tolerated; confirm
+    
+    // printf("atomic_add_resize_check::: add_res %d --- val %d --- capacity %d --- resize %d\n", add_res, val, capacity, *resize);
+    
+    if(add_res >= capacity) {
+        *resize = add_res;   // report the size that would have been needed
+        return *addr;   // counter is left untouched on this path
+    }
+
+    return atomic_add(addr, val);   // normal path: atomic add, returning atomic_add's result
+}
+
 }
diff --git a/runtime/devices/device.hpp b/runtime/devices/device.hpp
index 107b70ee91512ed9ccd336be2168e0b75ed5eab8..c5c406ec7c5c02634e119d4db9b6bbde3d9c0aac 100644
--- a/runtime/devices/device.hpp
+++ b/runtime/devices/device.hpp
@@ -8,6 +8,8 @@
 #ifndef PAIRS_TARGET_CUDA
 #   define __host__
 typedef int cudaError_t;
+#else
+#include <cuda_runtime.h>
 #endif
 
 namespace pairs {
@@ -71,42 +73,19 @@ inline __host__ int host_atomic_add_resize_check(int *addr, int val, int *resize
 }
 
 #ifdef PAIRS_TARGET_CUDA
-#if __CUDA_ARCH__ < 600
-__device__ double atomicAdd_double(double* address, double val) {
-    unsigned long long int * ull_addr = (unsigned long long int*) address;
-    unsigned long long int old = *ull_addr, assumed;
-
-    do {
-        assumed = old;
-        old = atomicCAS(ull_addr, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
-    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
-    } while (assumed != old);
-
-    return __longlong_as_double(old);
-}
-#else
-__device__ double atomicAdd_double(double* address, double val) {
-    return atomicAdd(address, val);
-}
-#endif
-
-__device__ int atomic_add(int *addr, int val) { return atomicAdd(addr, val); }
-__device__ real_t atomic_add(real_t *addr, real_t val) { return atomicAdd_double(addr, val); }
-__device__ int atomic_add_resize_check(int *addr, int val, int *resize, int capacity) {
-    const int add_res = *addr + val;
-    if(add_res >= capacity) {
-        *resize = add_res;
-        return *addr;
+inline void cuda_assert(cudaError_t err, const char *file, int line) {
+    if(err != cudaSuccess) {
+        std::cerr << file << ":" << line << ": " << cudaGetErrorString(err) << std::endl;
+        exit(-1);
     }
-
-    return atomic_add(addr, val);
 }
+__device__ double atomicAdd_double(double* address, double val);
+__device__ int atomic_add(int *addr, int val);
+__device__ real_t atomic_add(real_t *addr, real_t val);
+__device__ int atomic_add_resize_check(int *addr, int val, int *resize, int capacity);
 #else
-inline int atomic_add(int *addr, int val) { return host_atomic_add(addr, val); }
-inline int atomic_add(real_t *addr, real_t val) { return host_atomic_add(addr, val); }
-inline int atomic_add_resize_check(int *addr, int val, int *resize, int capacity) {
-    return host_atomic_add_resize_check(addr, val, resize, capacity);
-}
+int atomic_add(int *addr, int val);
+real_t atomic_add(real_t *addr, real_t val);
+int atomic_add_resize_check(int *addr, int val, int *resize, int capacity);
 #endif
-
 }
diff --git a/runtime/devices/dummy.cpp b/runtime/devices/dummy.cpp
index a0151fc0aecd0322f2cd55feb9699ace713ae52e..9b06d0b267e45fa0ece7b492c27c86c47a1525b5 100644
--- a/runtime/devices/dummy.cpp
+++ b/runtime/devices/dummy.cpp
@@ -19,4 +19,16 @@ void copy_in_device(void *d_ptr1, const void *d_ptr2, size_t count) {
     std::memcpy(d_ptr1, d_ptr2, count);
 }
 
+int atomic_add(int *addr, int val) {   // non-CUDA definition of the atomic_add(int*, int) declared in device.hpp
+    return host_atomic_add(addr, val);   // forwards to the host helper
+}
+
+real_t atomic_add(real_t *addr, real_t val) {   // non-CUDA definition of the real_t overload declared in device.hpp
+    return host_atomic_add(addr, val);   // forwards to the host helper
+}
+
+int atomic_add_resize_check(int *addr, int val, int *resize, int capacity) {   // non-CUDA definition of the declaration in device.hpp
+    return host_atomic_add_resize_check(addr, val, resize, capacity);   // forwards all four arguments to the host helper
+}
+
 }
diff --git a/runtime/domain/ParticleDataHandling.hpp b/runtime/domain/ParticleDataHandling.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c54bae6404434af71f43c3d6a98d32d0e0537574
--- /dev/null
+++ b/runtime/domain/ParticleDataHandling.hpp
@@ -0,0 +1,349 @@
+#include <blockforest/BlockForest.h>
+#include <blockforest/BlockDataHandling.h>
+
+#pragma once
+
+namespace pairs {
+
+class PairsRuntime;
+
+inline void relocate_particle(PairsRuntime *ps, int dst, int src){   // copies every non-volatile property of particle src into slot dst; inline because this definition lives in a header
+    for(auto &prop: ps->getProperties()) {
+        if(!prop.isVolatile()) {   // volatile properties are not copied
+            auto prop_type = prop.getType();
+
+            if(prop_type == pairs::Prop_Vector) {
+                auto vector_ptr = ps->getAsVectorProperty(prop);
+                constexpr int nelems = 3;   // vector: 3 components
+
+                for(int e = 0; e < nelems; e++) {
+                    vector_ptr(dst, e) = vector_ptr(src, e);
+                }
+            } else if(prop_type == pairs::Prop_Matrix) {
+                auto matrix_ptr = ps->getAsMatrixProperty(prop);
+                constexpr int nelems = 9;   // matrix: 9 components
+
+                for(int e = 0; e < nelems; e++) {
+                    matrix_ptr(dst, e) = matrix_ptr(src, e);
+                }
+            } else if(prop_type == pairs::Prop_Quaternion) {
+                auto quat_ptr = ps->getAsQuaternionProperty(prop);
+                constexpr int nelems = 4;   // quaternion: 4 components
+
+                for(int e = 0; e < nelems; e++) {
+                    quat_ptr(dst, e) = quat_ptr(src, e);
+                }
+            } else if(prop_type == pairs::Prop_Integer) {
+                auto int_ptr = ps->getAsIntegerProperty(prop);
+                int_ptr(dst) = int_ptr(src);
+            } else if(prop_type == pairs::Prop_UInt64) {
+                auto uint64_ptr = ps->getAsUInt64Property(prop);
+                uint64_ptr(dst) = uint64_ptr(src);
+            } else if(prop_type == pairs::Prop_Real) {
+                auto float_ptr = ps->getAsFloatProperty(prop);
+                float_ptr(dst) = float_ptr(src);
+            } else {
+                std::cerr << "relocate_particle(): Invalid property type!" << std::endl;
+                return;   // aborts the copy: properties after the unknown one are left untouched in dst
+            }
+        }
+    }
+}
+
+}
+
+namespace walberla {
+
+namespace internal {
+
+class ParticleDeleter {   // block datum whose destructor removes the local particles lying inside the block's AABB
+    friend bool operator==(const ParticleDeleter& lhs, const ParticleDeleter& rhs);
+
+public:
+    ParticleDeleter(pairs::PairsRuntime *ps_, const math::AABB& aabb_) : ps(ps_), aabb(aabb_) {}   // keeps the runtime pointer and a copy of the AABB
+
+    ~ParticleDeleter() {   // deletes matching locals, compacts the arrays, then resets nlocal and nghost
+        int nlocal = ps->getTrackedVariableAsInteger("nlocal");
+        auto position = ps->getAsVectorProperty(ps->getPropertyByName("position"));
+        auto flags = ps->getAsIntegerProperty(ps->getPropertyByName("flags"));
+
+        int ndeleted = 0;
+        int *goneIdx = new int[nlocal];   // indices of deleted particles in ascending order; only the first ndeleted entries are ever written
+        
+        for (int i=0; i<nlocal; ++i) {
+            if (flags(i) & (pairs::flags::INFINITE | pairs::flags::GLOBAL))  continue;   // INFINITE/GLOBAL bodies are never deleted here
+
+            const real_t pos_x = position(i, 0);
+            const real_t pos_y = position(i, 1);
+            const real_t pos_z = position(i, 2);
+
+            if( aabb.contains(pos_x, pos_y, pos_z)) {
+                goneIdx[ndeleted] = i;
+                ++ndeleted;
+            }
+        }
+
+        int beg = 0;   // next hole to fill (front of goneIdx)
+        int end = ndeleted - 1;   // highest deleted index not yet passed by i
+        int i = nlocal - 1;   // candidate survivor, scanned from the back
+        while ((beg <= end) && (i > goneIdx[beg])) {   // bounds test first, so goneIdx[beg] is only read for a valid, initialised entry
+            if(i == goneIdx[end]){   // i is itself deleted: nothing to move
+                --end;
+            }
+            else{
+                pairs::relocate_particle(ps, goneIdx[beg], i);   // move survivor i into the lowest open hole
+                ++beg;
+            }
+            --i;
+        }
+        
+        delete[] goneIdx;
+        
+        ps->setTrackedVariableAsInteger("nlocal", nlocal - ndeleted);
+        ps->setTrackedVariableAsInteger("nghost", 0);   // ghost count is reset as well
+    }
+
+private:
+    pairs::PairsRuntime *ps;   // raw pointer; never deleted by this class
+    math::AABB aabb;   // region whose particles are removed on destruction
+};
+
+inline bool operator==(const ParticleDeleter& lhs, const ParticleDeleter& rhs) {   // friend of ParticleDeleter, so it may read the private aabb
+    return lhs.aabb == rhs.aabb;   // equality is decided by the AABB alone; the runtime pointer is not compared
+}
+
+} // namespace internal
+
+class ParticleDataHandling : public blockforest::BlockDataHandling<internal::ParticleDeleter> {
+private:
+    pairs::PairsRuntime *ps;
+
+public:
+    ParticleDataHandling(pairs::PairsRuntime *ps_) : ps(ps_) {}   // only stores the runtime pointer
+    ~ParticleDataHandling() override = default;   // defaulted: the stored runtime pointer is not deleted
+
+    internal::ParticleDeleter *initialize(IBlock *const block) override {   // creates the per-block datum for the given block
+        return new internal::ParticleDeleter(ps, block->getAABB());   // heap-allocated; presumably released by the block framework — confirm
+    }
+
+    void serialize(IBlock *const block, const BlockDataID& id, mpi::SendBuffer& buffer) override {   // whole-block serialization
+        serializeImpl(static_cast<Block*>(block), id, buffer, 0, false);   // check_child = false: serializeImpl filters by the block's own AABB; child argument unused
+    }
+
+    internal::ParticleDeleter* deserialize(IBlock *const block) override {   // allocation step of deserialization
+        return initialize(block);   // same datum as a freshly initialized block
+    }
+
+    void deserialize(IBlock *const block, const BlockDataID& id, mpi::RecvBuffer& buffer) override {   // data step of deserialization
+        deserializeImpl(block, id, buffer);   // all deserialize variants with a buffer share this implementation
+    }
+
+    void serializeCoarseToFine(Block *const block, const BlockDataID& id, mpi::SendBuffer& buffer, const uint_t child) override {   // serialization for one child of a block
+        serializeImpl(block, id, buffer, child, true);   // check_child = true: serializeImpl filters by the AABB of the given child
+    }
+
+    void serializeFineToCoarse(Block *const block, const BlockDataID& id, mpi::SendBuffer& buffer) override {   // same call as serialize(), minus the cast
+        serializeImpl(block, id, buffer, 0, false);   // check_child = false: filters by the block's own AABB
+    }
+
+    internal::ParticleDeleter *deserializeCoarseToFine(Block *const block) override {   // allocation step for the coarse-to-fine case
+        return initialize(block);   // same datum as a freshly initialized block
+    }
+
+    internal::ParticleDeleter *deserializeFineToCoarse(Block *const block) override {   // allocation step for the fine-to-coarse case
+        return initialize(block);   // same datum as a freshly initialized block
+    }
+
+    void deserializeCoarseToFine(Block *const block, const BlockDataID& id, mpi::RecvBuffer& buffer) override {   // data step for the coarse-to-fine case
+        deserializeImpl(block, id, buffer);   // shared with the plain deserialize()
+    }
+
+    void deserializeFineToCoarse(Block *const block, const BlockDataID& id, mpi::RecvBuffer& buffer, const uint_t) override {   // the trailing unnamed uint_t argument is ignored
+        deserializeImpl(block, id, buffer);   // shared with the plain deserialize()
+    }
+
+    void serializeImpl(Block *const block, const BlockDataID&, mpi::SendBuffer& buffer, const uint_t child, bool check_child) {
+        auto ptr = buffer.allocate<uint_t>();
+        double aabb_check[6];
+
+        if(check_child) {
+            const auto child_id = BlockID(block->getId(), child);
+            const auto child_aabb = block->getForest().getAABBFromBlockId(child_id);
+            aabb_check[0] = child_aabb.xMin();
+            aabb_check[1] = child_aabb.xMax();
+            aabb_check[2] = child_aabb.yMin();
+            aabb_check[3] = child_aabb.yMax();
+            aabb_check[4] = child_aabb.zMin();
+            aabb_check[5] = child_aabb.zMax();
+        } else {
+            const auto aabb = block->getAABB();
+            aabb_check[0] = aabb.xMin();
+            aabb_check[1] = aabb.xMax();
+            aabb_check[2] = aabb.yMin();
+            aabb_check[3] = aabb.yMax();
+            aabb_check[4] = aabb.zMin();
+            aabb_check[5] = aabb.zMax();
+        }
+
+        int nlocal = ps->getTrackedVariableAsInteger("nlocal");
+        auto position = ps->getAsVectorProperty(ps->getPropertyByName("position"));
+        auto flags = ps->getAsIntegerProperty(ps->getPropertyByName("flags"));
+        int nserialized = 0;
+        int *goneIdx = new int[nlocal];
+
+        for (int i=0; i<nlocal; ++i) {
+            if (flags(i) & (pairs::flags::INFINITE | pairs::flags::GLOBAL)) continue;
+            const real_t pos_x = position(i, 0);
+            const real_t pos_y = position(i, 1);
+            const real_t pos_z = position(i, 2);
+
+            // Important: When rebalancing, it is assumed that all particles are within domain bounds.  
+            // If a particle's center of mass lies outside the domain, it won't be contained
+            // in any of the checked blocks during serialization. In that case, the particle  
+            // can become disassociated from its owner if the new block it should belong to is  
+            // not an immediate neighbor to its owner rank. (if it's in an immediate neighbor, it will be exchanged)
+            if( pos_x >= aabb_check[0] && pos_x < aabb_check[1] &&
+                pos_y >= aabb_check[2] && pos_y < aabb_check[3] &&
+                pos_z >= aabb_check[4] && pos_z < aabb_check[5]) {
+
+                goneIdx[nserialized] = i;
+                ++nserialized;
+                
+                for(auto &prop: ps->getProperties()) {
+                    if(!prop.isVolatile()) {
+                        auto prop_type = prop.getType();
+
+                        if(prop_type == pairs::Prop_Vector) {
+                            auto vector_ptr = ps->getAsVectorProperty(prop);
+                            constexpr int nelems = 3;
+
+                            for(int e = 0; e < nelems; e++) {
+                                buffer << vector_ptr(i, e);
+                            }
+                        } else if(prop_type == pairs::Prop_Matrix) {
+                            auto matrix_ptr = ps->getAsMatrixProperty(prop);
+                            constexpr int nelems = 9;
+
+                            for(int e = 0; e < nelems; e++) {
+                                buffer << matrix_ptr(i, e);
+                            }
+                        } else if(prop_type == pairs::Prop_Quaternion) {
+                            auto quat_ptr = ps->getAsQuaternionProperty(prop);
+                            constexpr int nelems = 4;
+
+                            for(int e = 0; e < nelems; e++) {
+                                buffer << quat_ptr(i, e);
+                            }
+                        } else if(prop_type == pairs::Prop_Integer) {
+                            auto int_ptr = ps->getAsIntegerProperty(prop);
+                                buffer << int_ptr(i);
+                        } else if(prop_type == pairs::Prop_UInt64) {
+                            auto uint64_ptr = ps->getAsUInt64Property(prop);
+                                buffer << uint64_ptr(i);
+                        } else if(prop_type == pairs::Prop_Real) {
+                            auto float_ptr = ps->getAsFloatProperty(prop);
+                                buffer << float_ptr(i);
+                        } else {
+                            std::cerr << "serializeImpl(): Invalid property type!" << std::endl;
+                            return;
+                        }
+                    }
+                }
+                // TODO: serialize contact history data as well
+            }
+        }
+
+        // Here we replace serialized particles with the remaining locals 
+        // (Traverse locals in reverse order and move them to empty slots)
+        // Ghosts are ignored since they become invalid after rebalancing
+        int beg = 0;
+        int end = nserialized - 1;
+        int i = nlocal - 1;
+        while ((i > goneIdx[beg]) && (beg <= end)) {
+            if(i == goneIdx[end]){
+                --end;
+            }
+            else{
+                pairs::relocate_particle(ps, goneIdx[beg], i);
+                ++beg;
+            }
+            --i;
+        }
+
+        delete[] goneIdx;
+
+        ps->setTrackedVariableAsInteger("nlocal", nlocal - nserialized);
+        ps->setTrackedVariableAsInteger("nghost", 0);
+        
+        *ptr = (uint_t) nserialized;
+    }
+
+    void deserializeImpl(IBlock *const, const BlockDataID&, mpi::RecvBuffer& buffer) {
+        int nlocal = ps->getTrackedVariableAsInteger("nlocal");
+        int particle_capacity = ps->getTrackedVariableAsInteger("particle_capacity");
+        real_t real_tmp;
+        int int_tmp;
+        uint_t nrecv;
+        uint64_t uint64_tmp;
+        // Unpacks the particles stored in `buffer` and appends them after the current locals.
+        buffer >> nrecv;  // leading value: number of particles contained in this buffer
+
+        // TODO: Check if there is enough particle capacity for the new particles, when there is not,
+        // all properties and arrays which have particle_capacity as one of their dimensions must be reallocated
+        PAIRS_ASSERT(nlocal + nrecv < particle_capacity);
+        // Non-volatile properties are read in ps->getProperties() order; particle i lands in slot nlocal + i.
+        for(int i = 0; i < nrecv; ++i) {
+            for(auto &prop: ps->getProperties()) {
+                if(!prop.isVolatile()) {
+                    auto prop_type = prop.getType();
+
+                    if(prop_type == pairs::Prop_Vector) {
+                        auto vector_ptr = ps->getAsVectorProperty(prop);
+                        constexpr int nelems = 3;
+
+                        for(int e = 0; e < nelems; e++) {
+                            buffer >> real_tmp;
+                            vector_ptr(nlocal + i, e) = real_tmp;
+                        }
+                    } else if(prop_type == pairs::Prop_Matrix) {
+                        auto matrix_ptr = ps->getAsMatrixProperty(prop);
+                        constexpr int nelems = 9;
+
+                        for(int e = 0; e < nelems; e++) {
+                            buffer >> real_tmp;
+                            matrix_ptr(nlocal + i, e) = real_tmp;
+                        }
+                    } else if(prop_type == pairs::Prop_Quaternion) {
+                        auto quat_ptr = ps->getAsQuaternionProperty(prop);
+                        constexpr int nelems = 4;
+
+                        for(int e = 0; e < nelems; e++) {
+                            buffer >> real_tmp;
+                            quat_ptr(nlocal + i, e) = real_tmp;
+                        }
+                     } else if(prop_type == pairs::Prop_Integer) {
+                        auto int_ptr = ps->getAsIntegerProperty(prop);
+                        buffer >> int_tmp;
+                        int_ptr(nlocal + i) = int_tmp;
+                    } else if(prop_type == pairs::Prop_UInt64) {
+                        auto uint64_ptr = ps->getAsUInt64Property(prop);
+                        buffer >> uint64_tmp;
+                        uint64_ptr(nlocal + i) = uint64_tmp;
+                    } else if(prop_type == pairs::Prop_Real) {
+                        auto float_ptr = ps->getAsFloatProperty(prop);
+                        buffer >> real_tmp;
+                        float_ptr(nlocal + i) = real_tmp;
+                    } else {
+                        // NOTE(review): returning here skips the nlocal update below and leaves the buffer partly read.
+                        std::cerr << "deserializeImpl(): Invalid property type!" << std::endl;
+                        return;
+                    }
+                }
+            }
+        }
+        ps->setTrackedVariableAsInteger("nlocal", nlocal + nrecv);  // publish the new local count
+        ps->setTrackedVariableAsInteger("nghost", 0);  // the ghost count is reset to zero
+    }
+};
+
+} // namespace walberla
diff --git a/runtime/domain/block_forest.cpp b/runtime/domain/block_forest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0851f2c1733243792909dff5e15b8a8ddc9bb2e6
--- /dev/null
+++ b/runtime/domain/block_forest.cpp
@@ -0,0 +1,489 @@
+#include <map>
+#include <mpi.h>
+#include <vector>
+//---
+#include <blockforest/BlockForest.h>
+#include <blockforest/Initialization.h>
+#include <blockforest/loadbalancing/DynamicCurve.h>
+#include <blockforest/loadbalancing/DynamicDiffusive.h>
+#include <blockforest/loadbalancing/DynamicParMetis.h>
+#include <blockforest/loadbalancing/InfoCollection.h>
+#include <blockforest/loadbalancing/PODPhantomData.h>
+#include <blockforest/loadbalancing/level_determination/MinMaxLevelDetermination.h>
+#include <blockforest/loadbalancing/weight_assignment/MetisAssignmentFunctor.h>
+#include <blockforest/loadbalancing/weight_assignment/WeightAssignmentFunctor.h>
+//---
+#include "../boundary_weights.hpp"
+#include "../pairs_common.hpp"
+#include "../devices/device.hpp"
+#include "regular_6d_stencil.hpp"
+#include "ParticleDataHandling.hpp"
+#include "../unique_id.hpp"
+
+namespace pairs {
+
+BlockForest::BlockForest(
+        PairsRuntime *ps_,
+        real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax, bool pbcx, bool pbcy, bool pbcz, bool balance_workload_) :
+        DomainPartitioner(xmin, xmax, ymin, ymax, zmin, zmax), ps(ps_), globalPBC{pbcx, pbcy, pbcz}, balance_workload(balance_workload_) {
+        // Two entries (min, max) per dimension; the values are written later by setBoundingBox().
+        subdom = new real_t[ndims * 2];
+}
+
+BlockForest::BlockForest(PairsRuntime *ps_, const std::shared_ptr<walberla::blockforest::BlockForest> &bf) :
+        // Listed in the order initialization really runs (base class, then members in declaration order).
+        DomainPartitioner(bf->getDomain().xMin(), bf->getDomain().xMax(),
+                        bf->getDomain().yMin(), bf->getDomain().yMax(),
+                        bf->getDomain().zMin(), bf->getDomain().zMax()),
+        forest(bf), ps(ps_),
+        globalPBC{bf->isXPeriodic(), bf->isYPeriodic(), bf->isZPeriodic()} {
+    subdom = new real_t[ndims * 2];  // (min, max) per dimension, filled by setBoundingBox()
+    mpiManager = walberla::mpi::MPIManager::instance();
+    world_size = mpiManager->numProcesses();
+    rank = mpiManager->rank();
+    this->info = make_shared<walberla::blockforest::InfoCollection>();
+}
+
+// Rebuilds the per-rank neighbor lists (ranks, naabbs, aabb_offsets, aabbs) from the forest's
+// current blocks; each neighbor block contributes its AABB only once per owning rank.
+void BlockForest::updateNeighborhood() {
+    std::map<int, std::vector<walberla::math::AABB>> neighborhood;
+    std::map<int, std::vector<walberla::BlockID>> blocks_pushed;
+    auto me = mpiManager->rank();
+    this->nranks = 0;
+    this->total_aabbs = 0;
+
+    ranks.clear();
+    naabbs.clear();
+    aabb_offsets.clear();
+    aabbs.clear();
+    for(auto& iblock: *forest) {
+        auto block = static_cast<walberla::blockforest::Block *>(&iblock);
+        for(unsigned int neigh = 0; neigh < block->getNeighborhoodSize(); ++neigh) {
+            auto neighbor_rank = walberla::int_c(block->getNeighborProcess(neigh));
+
+            // Neighbor blocks that belong to the same rank should be added to
+            // neighborhood only if there's PBC along any dim, otherwise they should be skipped.
+            // TODO: Make PBCs work with runtime load balancing
+            if((neighbor_rank != me) || globalPBC[0] || globalPBC[1] || globalPBC[2]) {
+                const walberla::BlockID& neighbor_id = block->getNeighborId(neigh);
+                walberla::math::AABB neighbor_aabb = block->getNeighborAABB(neigh);
+                auto& pushed = blocks_pushed[neighbor_rank];  // block ids already recorded for this rank
+                if(find_if(pushed.begin(), pushed.end(), [&neighbor_id](const auto &bp) { return bp == neighbor_id; }) == pushed.end()) {
+                    neighborhood[neighbor_rank].push_back(neighbor_aabb);
+                    pushed.push_back(neighbor_id);
+                }
+            }
+        }
+    }
+
+    for(const auto& nbh: neighborhood) {
+        const int neigh_rank = nbh.first;    // named to avoid shadowing the `rank` member
+        const auto& aabb_list = nbh.second;  // bound by reference: no per-rank vector copy
+        ranks.push_back(neigh_rank);
+        aabb_offsets.push_back(this->total_aabbs);
+        naabbs.push_back((int) aabb_list.size());
+
+        for(const auto &aabb: aabb_list) {
+            aabbs.push_back(aabb.xMin());
+            aabbs.push_back(aabb.xMax());
+            aabbs.push_back(aabb.yMin());
+            aabbs.push_back(aabb.yMax());
+            aabbs.push_back(aabb.zMin());
+            aabbs.push_back(aabb.zMax());
+            this->total_aabbs++;
+        }
+
+        this->nranks++;
+    }
+}
+
+// Copies `size` elements of the runtime array called `name` into dest; unknown names trip the assertion.
+void BlockForest::copyRuntimeArray(const std::string& name, void *dest, const int size) {
+    void *src = nullptr;
+    if(name == "ranks")             { src = static_cast<void *>(ranks.data()); }
+    else if(name == "naabbs")       { src = static_cast<void *>(naabbs.data()); }
+    else if(name == "aabb_offsets") { src = static_cast<void *>(aabb_offsets.data()); }
+    else if(name == "aabbs")        { src = static_cast<void *>(aabbs.data()); }
+    else if(name == "subdom")       { src = static_cast<void *>(subdom); }
+    PAIRS_ASSERT(src != nullptr);
+    const int tsize = ((name == "aabbs") || (name == "subdom")) ? sizeof(real_t) : sizeof(int);  // bytes per element
+    std::memcpy(dest, src, size * tsize);
+}
+
+void BlockForest::updateWeights() {
+    walberla::mpi::BufferSystem bs(mpiManager->comm(), 756);  // 756 is presumably the message tag — TODO confirm
+    // Recomputes the weights in `info` for the local blocks and their 8 children, then sends them to neighbor ranks.
+    info->clear();
+
+    int sum_block_locals = 0;
+    // Compute the weights for my blocks and their children
+    for(auto& iblock: *forest) {
+        auto block = static_cast<walberla::blockforest::Block *>(&iblock);
+        auto aabb = block->getAABB();
+        auto& block_info = (*info)[block->getId()];
+
+        pairs::compute_boundary_weights(
+            this->ps,
+            aabb.xMin(), aabb.xMax(), aabb.yMin(), aabb.yMax(), aabb.zMin(), aabb.zMax(),
+            &(block_info.computationalWeight), &(block_info.communicationWeight));
+
+        sum_block_locals += block_info.computationalWeight;
+        // Child ids are built from this block's id plus a branch index 0..7.
+        for(int branch = 0; branch < 8; ++branch) {
+            const auto b_id = walberla::BlockID(block->getId(), branch);
+            const auto b_aabb = forest->getAABBFromBlockId(b_id);
+            auto& b_info = (*info)[b_id];
+
+            pairs::compute_boundary_weights(
+                this->ps,
+                b_aabb.xMin(), b_aabb.xMax(), b_aabb.yMin(), b_aabb.yMax(), b_aabb.zMin(), b_aabb.zMax(),
+                &(b_info.computationalWeight), &(b_info.communicationWeight));
+        }
+    }
+
+    int non_globals = ps->getTrackedVariableAsInteger("nlocal") - UniqueID::getNumGlobals();
+    // The summed computational weights of the local blocks are compared with the non-global local count.
+    if(sum_block_locals!=non_globals){
+        std::cout << "Warning: " << non_globals - sum_block_locals << " particles in rank " << rank << 
+        " may get lost in the next rebalancing." << std::endl;
+    }
+
+    // Send the weights of my blocks and their children to the neighbors of my blocks
+    for(auto& iblock: *forest) {
+        auto block = static_cast<walberla::blockforest::Block *>(&iblock);
+        auto& block_info = (*info)[block->getId()];
+
+        for(int neigh = 0; neigh < block->getNeighborhoodSize(); ++neigh) {
+            bs.sendBuffer(block->getNeighborProcess(neigh)) <<
+                walberla::blockforest::InfoCollection::value_type(block->getId(), block_info);
+        }
+
+        for(int branch = 0; branch < 8; ++branch) {
+            const auto b_id = walberla::BlockID(block->getId(), branch);
+            auto& b_info = (*info)[b_id];
+
+            for(int neigh = 0; neigh < block->getNeighborhoodSize(); ++neigh) {
+                bs.sendBuffer(block->getNeighborProcess(neigh)) <<
+                    walberla::blockforest::InfoCollection::value_type(b_id, b_info);
+            }
+        }
+    }
+
+    bs.setReceiverInfoFromSendBufferState(false, true);
+    bs.sendAll();
+    // Insert every received (block id, info) pair into the local collection.
+    for(auto recv = bs.begin(); recv != bs.end(); ++recv) {
+        while(!recv.buffer().isEmpty()) {
+            walberla::blockforest::InfoCollectionPair val;
+            recv.buffer() >> val;
+            info->insert(val);
+        }
+    }
+}
+
+// Picks a factorization num_processes = x * y * z minimizing the integer-division estimate
+// (nx*ny)/x/y + (nx*nz)/x/z + (ny*nz)/y/z; ties keep the first candidate found.
+// Both loops include their upper bound so that factorizations with z == 1 (e.g. 2x2x1) are tried too.
+walberla::Vector3<int> BlockForest::getBlockConfig(int num_processes, int nx, int ny, int nz) {
+    const int bx_factor = 1;
+    const int by_factor = 1;
+    const int bz_factor = 1;
+    const int ax = nx * ny;
+    const int ay = nx * nz;
+    const int az = ny * nz;
+
+    int bestsurf = 2 * (ax + ay + az);
+    int x = 1, y = 1, z = 1;
+
+    for(int i = 1; i <= num_processes; ++i) {
+        if(num_processes % i == 0) {
+            const int rem_yz = num_processes / i;
+            for(int j = 1; j <= rem_yz; ++j) {
+                if(rem_yz % j == 0) {
+                    const int k = rem_yz / j;
+                    const int surf = (ax / i / j) + (ay / i / k) + (az / j / k);
+
+                    if(surf < bestsurf) {
+                        x = i; y = j; z = k;
+                        bestsurf = surf;
+                    }
+                }
+            }
+        }
+    }
+
+    return walberla::Vector3<int>(x * bx_factor, y * by_factor, z * bz_factor);
+}
+
+// Returns the smallest level L such that 8^L >= num_processes (0 when num_processes <= 1);
+// each additional level multiplies the block count by 8.
+int BlockForest::getInitialRefinementLevel(int num_processes) {
+    constexpr int children_per_split = 8;
+    int level = 0;
+
+    for(int leaves = 1; leaves < num_processes; leaves *= children_per_split) {
+        ++level;
+    }
+
+    return level;
+}
+
+// Stores in subdom the bounding box (xmin, xmax, ymin, ymax, zmin, zmax) enclosing every block of
+// the forest; all six entries are zero when forest->empty() is true.
+void BlockForest::setBoundingBox() {
+    for(int k = 0; k < 6; ++k) { subdom[k] = 0.0; }
+    if(forest->empty()) { return; }
+
+    auto bounds = forest->begin()->getAABB();
+    for(auto& iblock: *forest) {
+        bounds.merge(static_cast<walberla::blockforest::Block *>(&iblock)->getAABB());
+    }
+    subdom[0] = bounds.xMin();
+    subdom[1] = bounds.xMax();
+    subdom[2] = bounds.yMin();
+    subdom[3] = bounds.yMax();
+    subdom[4] = bounds.zMin();
+    subdom[5] = bounds.zMax();
+}
+
+void BlockForest::initialize(int *argc, char ***argv) {  // sets up MPI and creates the block forest
+    mpiManager = walberla::mpi::MPIManager::instance();
+    mpiManager->initializeMPI(argc, argv);
+    mpiManager->useWorldComm();
+    world_size = mpiManager->numProcesses();
+    rank = mpiManager->rank();
+    // The global domain box is built from grid_min / grid_max.
+    walberla::math::AABB domain(
+        grid_min[0], grid_min[1], grid_min[2], grid_max[0], grid_max[1], grid_max[2]);
+    // NOTE(review): gridsize is a fixed 32^3 that only feeds getBlockConfig() below.
+    int gridsize[3] = {32, 32, 32};
+    auto procs = mpiManager->numProcesses();
+    auto block_config = balance_workload ? walberla::Vector3<int>(1, 1, 1) :
+                                           getBlockConfig(procs, gridsize[0], gridsize[1], gridsize[2]);
+    // With load balancing: 1x1x1 root config and a level with 8^level >= procs; otherwise level 0.
+    auto ref_level = balance_workload ? getInitialRefinementLevel(procs) : 0;
+
+    walberla::Vector3<bool> pbc(globalPBC[0], globalPBC[1], globalPBC[2]);
+
+    forest = walberla::blockforest::createBlockForest(domain, block_config, pbc, procs, ref_level);
+
+    this->info = make_shared<walberla::blockforest::InfoCollection>();
+    // Only rank 0 prints the configuration summary.
+    if (rank==0) {
+        std::cout << "Domain: " << domain << std::endl;
+        std::cout << "PBC: " << pbc << std::endl;
+        std::cout << "Block config: " << block_config  << std::endl;
+        std::cout << "Initial refinement level: " << ref_level << std::endl;
+        std::cout << "Dynamic load balancing: " << (balance_workload ? "True" : "False") << std::endl;
+    }
+}
+
+// When balance_workload is set, rebalances the forest first; always refreshes neighbor data and bounds.
+void BlockForest::update() {
+    if(balance_workload) {
+        if(!forest->loadBalancingFunctionRegistered()) {
+            std::cerr << "Workload balancer is not initialized." << std::endl;
+            exit(-1);
+        }
+
+        this->updateWeights();
+        // Non-volatile properties are copied to the host before the forest is refreshed.
+        const int num_locals = ps->getTrackedVariableAsInteger("nlocal");
+        for(auto &prop: ps->getProperties()) {
+            if(prop.isVolatile()) { continue; }
+            const int elem_size = get_proptype_size(prop.getType());
+            ps->copyPropertyToHost(prop, pairs::WriteAfterRead, num_locals * elem_size);
+        }
+
+        if(rank == 0) { std::cout << "Rebalance" << std::endl; }
+        forest->refresh();
+    }
+
+    this->updateNeighborhood();
+    this->setBoundingBox();
+}
+
+// Configures the forest's refresh callbacks for the chosen algorithm and registers the particle data handler.
+void BlockForest::initWorkloadBalancer(LoadBalancingAlgorithms algorithm, size_t regridMin, size_t regridMax) {
+    if (rank==0) {
+        std::cout << "Load balancing algorithm: " << getAlgorithmName(algorithm) << std::endl;
+        std::cout << "regridMin = " << regridMin << ", regridMax = " << regridMax << std::endl;
+    }
+    this->balance_workload = true;  // balance_workload is set to true in case the forest has been initialized externally
+    real_t baseWeight = 1.0;
+    int maxBlocksPerProcess = 100;
+
+    // Metis-specific params
+    real_t metisipc2redist = 1.0;
+    string metisAlgorithm = "PART_GEOM_KWAY";
+    string metisWeightsToUse = "BOTH_WEIGHTS";
+    string metisEdgeSource = "EDGES_FROM_EDGE_WEIGHTS";
+
+    forest->recalculateBlockLevelsInRefresh(true);
+    forest->alwaysRebalanceInRefresh(true);
+    forest->reevaluateMinTargetLevelsAfterForcedRefinement(true);
+    forest->allowRefreshChangingDepth(true);
+
+    forest->allowMultipleRefreshCycles(false);
+    forest->checkForEarlyOutInRefresh(false);
+    forest->checkForLateOutInRefresh(false);
+
+    // TODO: Define another functor that makes use of communicationWeight as well
+    forest->setRefreshMinTargetLevelDeterminationFunction(
+        walberla::blockforest::MinMaxLevelDetermination(info, regridMin, regridMax));
+
+    if(algorithm == Morton) {
+        forest->setRefreshPhantomBlockDataAssignmentFunction(
+            walberla::blockforest::WeightAssignmentFunctor(info, baseWeight));
+        forest->setRefreshPhantomBlockDataPackFunction(
+            walberla::blockforest::WeightAssignmentFunctor::PhantomBlockWeightPackUnpackFunctor());
+        forest->setRefreshPhantomBlockDataUnpackFunction(
+            walberla::blockforest::WeightAssignmentFunctor::PhantomBlockWeightPackUnpackFunctor());
+
+        auto prepFunc = walberla::blockforest::DynamicCurveBalance<walberla::blockforest::WeightAssignmentFunctor::PhantomBlockWeight>(false, true, false);
+        prepFunc.setMaxBlocksPerProcess(maxBlocksPerProcess);
+        forest->setRefreshPhantomBlockMigrationPreparationFunction(prepFunc);
+
+    } else if(algorithm == Hilbert) {
+        // Same setup as Morton; only the first DynamicCurveBalance argument differs (true instead of false).
+        forest->setRefreshPhantomBlockDataAssignmentFunction(
+            walberla::blockforest::WeightAssignmentFunctor(info, baseWeight));
+        forest->setRefreshPhantomBlockDataPackFunction(
+            walberla::blockforest::WeightAssignmentFunctor::PhantomBlockWeightPackUnpackFunctor());
+        forest->setRefreshPhantomBlockDataUnpackFunction(
+            walberla::blockforest::WeightAssignmentFunctor::PhantomBlockWeightPackUnpackFunctor());
+        auto prepFunc = walberla::blockforest::DynamicCurveBalance<walberla::blockforest::WeightAssignmentFunctor::PhantomBlockWeight>(true, true, false);
+        prepFunc.setMaxBlocksPerProcess(maxBlocksPerProcess);
+        forest->setRefreshPhantomBlockMigrationPreparationFunction(prepFunc);
+
+    } else if(algorithm == Metis) {
+        forest->setRefreshPhantomBlockDataAssignmentFunction(
+            walberla::blockforest::MetisAssignmentFunctor(info, baseWeight));
+        forest->setRefreshPhantomBlockDataPackFunction(
+            walberla::blockforest::MetisAssignmentFunctor::PhantomBlockWeightPackUnpackFunctor());
+        forest->setRefreshPhantomBlockDataUnpackFunction(
+            walberla::blockforest::MetisAssignmentFunctor::PhantomBlockWeightPackUnpackFunctor());
+
+        auto alg = walberla::blockforest::DynamicParMetis::stringToAlgorithm(metisAlgorithm);
+        auto vWeight = walberla::blockforest::DynamicParMetis::stringToWeightsToUse(metisWeightsToUse);
+        auto eWeight = walberla::blockforest::DynamicParMetis::stringToEdgeSource(metisEdgeSource);
+        auto prepFunc = walberla::blockforest::DynamicParMetis(alg, vWeight, eWeight);
+
+        prepFunc.setipc2redist(metisipc2redist);
+        forest->setRefreshPhantomBlockMigrationPreparationFunction(prepFunc);
+
+    } else if(algorithm == Diffusive) {
+        forest->setRefreshPhantomBlockDataAssignmentFunction(
+            walberla::blockforest::WeightAssignmentFunctor(info, baseWeight));
+        forest->setRefreshPhantomBlockDataPackFunction(
+            walberla::blockforest::WeightAssignmentFunctor::PhantomBlockWeightPackUnpackFunctor());
+        forest->setRefreshPhantomBlockDataUnpackFunction(
+            walberla::blockforest::WeightAssignmentFunctor::PhantomBlockWeightPackUnpackFunctor());
+
+        auto prepFunc = walberla::blockforest::DynamicDiffusionBalance<walberla::blockforest::WeightAssignmentFunctor::PhantomBlockWeight>(1, 1, false);
+        forest->setRefreshPhantomBlockMigrationPreparationFunction(prepFunc);
+    } else {
+        std::cerr << "Invalid load balancing algorithm." << std::endl;
+        exit(-1);
+    }
+
+    forest->addBlockData(make_shared<walberla::ParticleDataHandling>(ps), "Interface");
+}
+
+void BlockForest::finalize() {
+    mpiManager->finalizeMPI();  // MPI shutdown is delegated to the MPIManager instance
+}
+
+// Returns 1 if (x, y, z) is contained in the AABB of any block in the forest, 0 otherwise.
+int BlockForest::isWithinSubdomain(real_t x, real_t y, real_t z) {
+    for(auto& iblock: *forest) {
+        const auto &bounds = static_cast<walberla::blockforest::Block *>(&iblock)->getAABB();
+        if(bounds.contains(x, y, z)) {
+            return 1;
+        }
+    }
+
+    return 0;
+}
+
+// Exchanges one int per entry of `ranks`: nsend[k] is sent to ranks[k] and nrecv[k] is received from it.
+// An entry equal to this rank is copied locally instead of going through MPI. `dim` is not used.
+void BlockForest::communicateSizes(int dim, const int *nsend, int *nrecv) {
+    std::vector<MPI_Request> pending_sends, pending_recvs;
+
+    for(size_t slot = 0; slot < ranks.size(); ++slot) {
+        const int peer = ranks[slot];
+        if(peer == rank) {
+            nrecv[slot] = nsend[slot];
+            continue;
+        }
+        MPI_Request send_req, recv_req;
+        MPI_Irecv(&nrecv[slot], 1, MPI_INT, peer, 0, MPI_COMM_WORLD, &recv_req);
+        MPI_Isend(&nsend[slot], 1, MPI_INT, peer, 0, MPI_COMM_WORLD, &send_req);
+        pending_sends.push_back(send_req);
+        pending_recvs.push_back(recv_req);
+    }
+
+    if(!pending_sends.empty()) {
+        MPI_Waitall(pending_sends.size(), pending_sends.data(), MPI_STATUSES_IGNORE);
+    }
+    if(!pending_recvs.empty()) {
+        MPI_Waitall(pending_recvs.size(), pending_recvs.data(), MPI_STATUSES_IGNORE);
+    }
+}
+
+void BlockForest::communicateData(
+    int dim, int elem_size,
+    const real_t *send_buf, const int *send_offsets, const int *nsend,
+    real_t *recv_buf, const int *recv_offsets, const int *nrecv) {
+    // For entry k of `ranks`: sends nsend[k] * elem_size values and receives nrecv[k] * elem_size; `dim` is unused.
+    std::vector<MPI_Request> send_requests;
+    std::vector<MPI_Request> recv_requests;
+    size_t nranks = 0;  // running index into ranks / offsets / counts (local, not the nranks member)
+
+    for(auto neigh_rank: ranks) {
+        const real_t *send_ptr = &send_buf[send_offsets[nranks] * elem_size];
+        real_t *recv_ptr = &recv_buf[recv_offsets[nranks] * elem_size];
+
+        if(neigh_rank != rank) {
+            MPI_Request send_req, recv_req;
+            // NOTE(review): MPI_DOUBLE is hard-coded; this presumes real_t is double — confirm.
+            MPI_Irecv(recv_ptr, nrecv[nranks] * elem_size, MPI_DOUBLE, neigh_rank, 0, MPI_COMM_WORLD, &recv_req);
+            MPI_Isend(send_ptr, nsend[nranks] * elem_size, MPI_DOUBLE, neigh_rank, 0, MPI_COMM_WORLD, &send_req);
+
+            send_requests.push_back(send_req);
+            recv_requests.push_back(recv_req);
+        } else {
+            pairs::copy_in_device(recv_ptr, send_ptr, nsend[nranks] * elem_size * sizeof(real_t));  // own rank: no MPI
+        }
+
+        nranks++;
+    }
+    // Wait for all non-blocking transfers posted above.
+    if(!send_requests.empty()) {
+        MPI_Waitall(send_requests.size(), send_requests.data(), MPI_STATUSES_IGNORE);
+    }
+
+    if(!recv_requests.empty()) {
+        MPI_Waitall(recv_requests.size(), recv_requests.data(), MPI_STATUSES_IGNORE);
+    }
+}
+
+void BlockForest::communicateDataReverse(
+    int dim, int elem_size,
+    const real_t *send_buf, const int *send_offsets, const int *nsend,
+    real_t *recv_buf, const int *recv_offsets, const int *nrecv) {
+    // The reverse exchange is forwarded unchanged to communicateData().
+    this->communicateData(dim, elem_size, send_buf, send_offsets, nsend, recv_buf, recv_offsets, nrecv);
+}
+
+void BlockForest::communicateAllData(
+    int ndims, int elem_size,
+    const real_t *send_buf, const int *send_offsets, const int *nsend,
+    real_t *recv_buf, const int *recv_offsets, const int *nrecv) {
+    // Forwards to communicateData() with dim fixed to 0; the ndims argument is not used.
+    this->communicateData(0, elem_size, send_buf, send_offsets, nsend, recv_buf, recv_offsets, nrecv);
+}
+
+}
diff --git a/runtime/domain/block_forest.hpp b/runtime/domain/block_forest.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d814d02c423358b99f11622bbe2ed7f88b557f97
--- /dev/null
+++ b/runtime/domain/block_forest.hpp
@@ -0,0 +1,96 @@
+#include <memory>
+#include <map>
+
+#include "../pairs_common.hpp"
+#include "domain_partitioning.hpp"
+
+#pragma once
+
+#define SMALL 0.00001
+
+namespace walberla {
+    namespace blockforest{
+        class BlockForest;
+        class BlockID;
+        class BlockInfo;
+        using InfoCollection = std::map<BlockID, BlockInfo>;
+    }
+
+    namespace mpi {
+        class MPIManager;
+    }
+
+    namespace math{
+        template<typename T> 
+        class Vector3;
+    }
+}
+namespace pairs {
+
+class PairsRuntime;
+
+// DomainPartitioner implementation backed by a waLBerla block forest (optionally load-balanced).
+class BlockForest : public DomainPartitioner {
+private:
+    std::shared_ptr<walberla::mpi::MPIManager> mpiManager;
+    std::shared_ptr<walberla::blockforest::BlockForest> forest;
+    std::shared_ptr<walberla::blockforest::InfoCollection> info;
+    std::vector<int> ranks;         // neighbor rank ids
+    std::vector<int> naabbs;        // number of AABBs stored for each neighbor rank
+    std::vector<int> aabb_offsets;  // index of each rank's first AABB
+    std::vector<real_t> aabbs;      // 6 values per AABB; real_t to match the sizeof(real_t) copy in copyRuntimeArray()
+    PairsRuntime *ps;
+    real_t *subdom;                 // (min, max) per dimension, owned by this object
+    const bool globalPBC[3];
+    int world_size = 0, rank = 0, nranks = 0, total_aabbs = 0;  // defined values before initialize()/update() run
+    bool balance_workload = false;
+
+public:
+    BlockForest(
+        PairsRuntime *ps_, real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax, bool pbcx, bool pbcy, bool pbcz, bool balance_workload_);
+
+    BlockForest(PairsRuntime *ps_, const std::shared_ptr<walberla::blockforest::BlockForest> &bf);
+
+    ~BlockForest() {
+        delete[] subdom;
+    }
+
+    void initialize(int *argc, char ***argv) override;
+    void initWorkloadBalancer(LoadBalancingAlgorithms algorithm, size_t regridMin, size_t regridMax) override;
+
+    void update() override;
+    void finalize() override;
+    int getWorldSize() const override { return world_size; }
+    int getRank() const override { return rank; }
+    int getNumberOfNeighborRanks() override { return this->nranks; }
+    int getNumberOfNeighborAABBs() override { return this->total_aabbs; }
+    double getSubdomMin(int dim) const override { return subdom[2*dim + 0];}
+    double getSubdomMax(int dim) const override { return subdom[2*dim + 1];}
+
+    void updateNeighborhood();
+    void updateWeights();
+    walberla::math::Vector3<int> getBlockConfig(int num_processes, int nx, int ny, int nz);
+    int getInitialRefinementLevel(int num_processes);
+    void setBoundingBox();
+    void rebalance();
+
+    int isWithinSubdomain(real_t x, real_t y, real_t z) override;
+    void copyRuntimeArray(const std::string& name, void *dest, const int size) override;
+    void communicateSizes(int dim, const int *send_sizes, int *recv_sizes) override;
+    void communicateData(
+        int dim, int elem_size,
+        const real_t *send_buf, const int *send_offsets, const int *nsend,
+        real_t *recv_buf, const int *recv_offsets, const int *nrecv) override;
+
+    void communicateDataReverse(
+        int dim, int elem_size,
+        const real_t *send_buf, const int *send_offsets, const int *nsend,
+        real_t *recv_buf, const int *recv_offsets, const int *nrecv) override;
+
+    void communicateAllData(
+        int ndims, int elem_size,
+        const real_t *send_buf, const int *send_offsets, const int *nsend,
+        real_t *recv_buf, const int *recv_offsets, const int *nrecv) override;
+};
+
+}
diff --git a/runtime/domain/domain_partitioning.hpp b/runtime/domain/domain_partitioning.hpp
index e08e5eebba5576d6094205a837d69c606e4f7662..3dfdaaebfa8c9f91f58705fd1b750f53569afc04 100644
--- a/runtime/domain/domain_partitioning.hpp
+++ b/runtime/domain/domain_partitioning.hpp
@@ -8,6 +8,7 @@ class Regular6DStencil;
 
 class DomainPartitioner {
     friend class Regular6DStencil;
+    friend class BlockForest;
 
 protected:
     real_t *grid_min;
@@ -36,13 +37,35 @@ public:
         delete[] grid_max;
     }
 
+    double getMin(int dim) const { return grid_min[dim]; }
+    double getMax(int dim) const { return grid_max[dim]; }
+    virtual double getSubdomMin(int dim) const = 0;
+    virtual double getSubdomMax(int dim) const = 0;
     virtual void initialize(int *argc, char ***argv) = 0;
-    virtual void fillArrays(int *neighbor_ranks, int *pbc, real_t *subdom) = 0;
+    virtual void initWorkloadBalancer(LoadBalancingAlgorithms algorithm, size_t regridMin, size_t regridMax) = 0;
+    virtual void update() = 0;
+    virtual int getWorldSize() const = 0;
+    virtual int getRank() const = 0;
+    virtual int getNumberOfNeighborAABBs() = 0;
+    virtual int getNumberOfNeighborRanks() = 0;
+    virtual int isWithinSubdomain(real_t x, real_t y, real_t z) = 0;
+    virtual void copyRuntimeArray(const std::string& name, void *dest, const int size) = 0;
     virtual void communicateSizes(int dim, const int *nsend, int *nrecv) = 0;
     virtual void communicateData(
         int dim, int elem_size,
         const real_t *send_buf, const int *send_offsets, const int *nsend,
         real_t *recv_buf, const int *recv_offsets, const int *nrecv) = 0;
+
+    virtual void communicateDataReverse(
+        int dim, int elem_size,
+        const real_t *send_buf, const int *send_offsets, const int *nsend,
+        real_t *recv_buf, const int *recv_offsets, const int *nrecv) = 0;
+
+    virtual void communicateAllData(
+        int ndims, int elem_size,
+        const real_t *send_buf, const int *send_offsets, const int *nsend,
+        real_t *recv_buf, const int *recv_offsets, const int *nrecv) = 0;
+        
     virtual void finalize() = 0;
 };
 
diff --git a/runtime/domain/regular_6d_stencil.cpp b/runtime/domain/regular_6d_stencil.cpp
index b01d2c76dc097391fd0e0204cc291f6ee65442b2..96ea998eb245ef6a3ed1add6302b57d8db78fc06 100644
--- a/runtime/domain/regular_6d_stencil.cpp
+++ b/runtime/domain/regular_6d_stencil.cpp
@@ -71,7 +71,7 @@ void Regular6DStencil::setBoundingBox() {
         MPI_Cart_shift(cartesian, d, 1, &(prev[d]), &(next[d]));
         pbc_prev[d] = (myloc[d] == 0) ? 1 : 0;
         pbc_next[d] = (myloc[d] == nranks[d] - 1) ? -1 : 0;
-        subdom_min[d] = this->grid_min[d] + rank_length[d] * (real_t)myloc[d];
+        subdom_min[d] = this->grid_min[d] + rank_length[d] * (real_t) myloc[d];
         subdom_max[d] = subdom_min[d] + rank_length[d];
     }
 
@@ -89,6 +89,10 @@ void Regular6DStencil::initialize(int *argc, char ***argv) {
     this->setBoundingBox();
 }
 
+void Regular6DStencil::initWorkloadBalancer(LoadBalancingAlgorithms algorithm, size_t regridMin, size_t regridMax) {}
+
+void Regular6DStencil::update() {}
+
 void Regular6DStencil::finalize() {
     MPI_Finalize();
 }
@@ -99,14 +103,24 @@ int Regular6DStencil::isWithinSubdomain(real_t x, real_t y, real_t z) {
            z >= subdom_min[2] && z < subdom_max[2] - SMALL;
 }
 
-void Regular6DStencil::fillArrays(int *neighbor_ranks, int *pbc, real_t *subdom) {
+void Regular6DStencil::copyRuntimeArray(const std::string& name, void *dest, const int size) {
     for(int d = 0; d < ndims; d++) {
-        neighbor_ranks[d * 2 + 0] = prev[d];
-        neighbor_ranks[d * 2 + 1] = next[d];
-        pbc[d * 2 + 0] = pbc_prev[d];
-        pbc[d * 2 + 1] = pbc_next[d];
-        subdom[d * 2 + 0] = subdom_min[d];
-        subdom[d * 2 + 1] = subdom_max[d];
+        if(name.compare("neighbor_ranks") == 0) {
+            int *neighbor_ranks = static_cast<int *>(dest);
+            neighbor_ranks[d * 2 + 0] = prev[d];
+            neighbor_ranks[d * 2 + 1] = next[d];
+        } else if(name.compare("pbc") == 0) {
+            int *pbc = static_cast<int *>(dest);
+            pbc[d * 2 + 0] = pbc_prev[d];
+            pbc[d * 2 + 1] = pbc_next[d];
+        } else if(name.compare("subdom") == 0) {
+            real_t *subdom = static_cast<real_t *>(dest);
+            subdom[d * 2 + 0] = subdom_min[d];
+            subdom[d * 2 + 1] = subdom_max[d];
+        } else {
+            std::cerr << "copyRuntimeArray(): Array \"" << name << "\" is invalid." << std::endl;
+            exit(-1);
+        }
     }
 }
 
@@ -131,8 +145,6 @@ void Regular6DStencil::communicateData(
     const real_t *send_buf, const int *send_offsets, const int *nsend,
     real_t *recv_buf, const int *recv_offsets, const int *nrecv) {
 
-    //MPI_Request recv_requests[2];
-    //MPI_Request send_requests[2];
     const real_t *send_prev = &send_buf[send_offsets[dim * 2 + 0] * elem_size];
     const real_t *send_next = &send_buf[send_offsets[dim * 2 + 1] * elem_size];
     real_t *recv_prev = &recv_buf[recv_offsets[dim * 2 + 0] * elem_size];
@@ -143,16 +155,6 @@ void Regular6DStencil::communicateData(
             send_prev, nsend[dim * 2 + 0] * elem_size, MPI_DOUBLE, prev[dim], 0,
             recv_prev, nrecv[dim * 2 + 0] * elem_size, MPI_DOUBLE, next[dim], 0,
             MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-
-        /*
-        MPI_Irecv(
-            recv_prev, nrecv[dim * 2 + 0] * elem_size, MPI_DOUBLE, prev[dim], 0,
-            MPI_COMM_WORLD, &recv_requests[0]);
-
-        MPI_Isend(
-            send_prev, nsend[dim * 2 + 0] * elem_size, MPI_DOUBLE, prev[dim], 0,
-            MPI_COMM_WORLD, &send_requests[0]);
-        */
     } else {
         pairs::copy_in_device(recv_prev, send_prev, nsend[dim * 2 + 0] * elem_size * sizeof(real_t));
     }
@@ -162,22 +164,38 @@ void Regular6DStencil::communicateData(
             send_next, nsend[dim * 2 + 1] * elem_size, MPI_DOUBLE, next[dim], 0,
             recv_next, nrecv[dim * 2 + 1] * elem_size, MPI_DOUBLE, prev[dim], 0,
             MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+    } else {
+        pairs::copy_in_device(recv_next, send_next, nsend[dim * 2 + 1] * elem_size * sizeof(real_t));
+    }
+}
 
-        /*
-        MPI_Irecv(
-            recv_next, nrecv[dim * 2 + 1] * elem_size, MPI_DOUBLE, next[dim], 0,
-            MPI_COMM_WORLD, &recv_requests[1]);
+void Regular6DStencil::communicateDataReverse(
+    int dim, int elem_size,
+    const real_t *send_buf, const int *send_offsets, const int *nsend,
+    real_t *recv_buf, const int *recv_offsets, const int *nrecv) {
 
-        MPI_Isend(
-            send_next, nsend[dim * 2 + 1] * elem_size, MPI_DOUBLE, next[dim], 0,
-            MPI_COMM_WORLD, &send_requests[1]);
-        */
+    const real_t *send_prev = &send_buf[send_offsets[dim * 2 + 0] * elem_size];
+    const real_t *send_next = &send_buf[send_offsets[dim * 2 + 1] * elem_size];
+    real_t *recv_prev = &recv_buf[recv_offsets[dim * 2 + 0] * elem_size];
+    real_t *recv_next = &recv_buf[recv_offsets[dim * 2 + 1] * elem_size];
+
+    if(prev[dim] != rank) {
+        MPI_Sendrecv(
+            send_prev, nsend[dim * 2 + 0] * elem_size, MPI_DOUBLE, next[dim], 0,
+            recv_prev, nrecv[dim * 2 + 0] * elem_size, MPI_DOUBLE, prev[dim], 0,
+            MPI_COMM_WORLD, MPI_STATUS_IGNORE);
     } else {
-        pairs::copy_in_device(recv_next, send_next, nsend[dim * 2 + 1] * elem_size * sizeof(real_t));
+        pairs::copy_in_device(recv_prev, send_prev, nsend[dim * 2 + 0] * elem_size * sizeof(real_t));
     }
 
-    //MPI_Waitall(2, recv_requests, MPI_STATUSES_IGNORE);
-    //MPI_Waitall(2, send_requests, MPI_STATUSES_IGNORE);
+    if(next[dim] != rank) {
+        MPI_Sendrecv(
+            send_next, nsend[dim * 2 + 1] * elem_size, MPI_DOUBLE, prev[dim], 0,
+            recv_next, nrecv[dim * 2 + 1] * elem_size, MPI_DOUBLE, next[dim], 0,
+            MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+    } else {
+        pairs::copy_in_device(recv_next, send_next, nsend[dim * 2 + 1] * elem_size * sizeof(real_t));
+    }
 }
 
 void Regular6DStencil::communicateAllData(
@@ -185,9 +203,6 @@ void Regular6DStencil::communicateAllData(
     const real_t *send_buf, const int *send_offsets, const int *nsend,
     real_t *recv_buf, const int *recv_offsets, const int *nrecv) {
 
-    //std::vector<MPI_Request> send_requests(ndims * 2, MPI_REQUEST_NULL);
-    //std::vector<MPI_Request> recv_requests(ndims * 2, MPI_REQUEST_NULL);
-
     for (int d = 0; d < ndims; d++) {
         const real_t *send_prev = &send_buf[send_offsets[d * 2 + 0] * elem_size];
         const real_t *send_next = &send_buf[send_offsets[d * 2 + 1] * elem_size];
@@ -199,16 +214,6 @@ void Regular6DStencil::communicateAllData(
                 send_prev, nsend[d * 2 + 0] * elem_size, MPI_DOUBLE, prev[d], 0,
                 recv_prev, nrecv[d * 2 + 0] * elem_size, MPI_DOUBLE, next[d], 0,
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-
-            /*
-            MPI_Isend(
-                send_prev, nsend[d * 2 + 0] * elem_size, MPI_DOUBLE, prev[d], 0,
-                MPI_COMM_WORLD, &send_requests[d * 2 + 0]);
-
-            MPI_Irecv(
-                recv_prev, nrecv[d * 2 + 0] * elem_size, MPI_DOUBLE, prev[d], 0,
-                MPI_COMM_WORLD, &recv_requests[d * 2 + 0]);
-            */
         } else {
             pairs::copy_in_device(recv_prev, send_prev, nsend[d * 2 + 0] * elem_size * sizeof(real_t));
         }
@@ -218,23 +223,10 @@ void Regular6DStencil::communicateAllData(
                 send_next, nsend[d * 2 + 1] * elem_size, MPI_DOUBLE, next[d], 0,
                 recv_next, nrecv[d * 2 + 1] * elem_size, MPI_DOUBLE, prev[d], 0,
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-
-            /*
-            MPI_Isend(
-                send_next, nsend[d * 2 + 1] * elem_size, MPI_DOUBLE, next[d], 0,
-                MPI_COMM_WORLD, &send_requests[d * 2 + 1]);
-
-            MPI_Irecv(
-                recv_next, nrecv[d * 2 + 1] * elem_size, MPI_DOUBLE, next[d], 0,
-                MPI_COMM_WORLD, &recv_requests[d * 2 + 1]);
-            */
         } else {
             pairs::copy_in_device(recv_next, send_next, nsend[d * 2 + 1] * elem_size * sizeof(real_t));
         }
     }
-
-    //MPI_Waitall(ndims * 2, send_requests.data(), MPI_STATUSES_IGNORE);
-    //MPI_Waitall(ndims * 2, recv_requests.data(), MPI_STATUSES_IGNORE);
 }
 
 }
diff --git a/runtime/domain/regular_6d_stencil.hpp b/runtime/domain/regular_6d_stencil.hpp
index 330af65a6ccb140cef8d283eac3ab183d0503c45..b4a9e5c6634c6f15c89041f539f0b955ecce992f 100644
--- a/runtime/domain/regular_6d_stencil.hpp
+++ b/runtime/domain/regular_6d_stencil.hpp
@@ -51,17 +51,30 @@ public:
     void setConfig();
     void setBoundingBox();
     void initialize(int *argc, char ***argv);
+    void initWorkloadBalancer(LoadBalancingAlgorithms algorithm, size_t regridMin, size_t regridMax);
+    void update();
     void finalize();
+
     int getWorldSize() const { return world_size; }
     int getRank() const { return rank; }
+    int getNumberOfNeighborRanks() { return 6; }
+    int getNumberOfNeighborAABBs() { return 6; }
+    double getSubdomMin(int dim) const { return subdom_min[dim];}
+    double getSubdomMax(int dim) const { return subdom_max[dim];}
+
     int isWithinSubdomain(real_t x, real_t y, real_t z);
-    void fillArrays(int *neighbor_ranks, int *pbc, real_t *subdom);
+    void copyRuntimeArray(const std::string& name, void *dest, const int size);
     void communicateSizes(int dim, const int *send_sizes, int *recv_sizes);
     void communicateData(
         int dim, int elem_size,
         const real_t *send_buf, const int *send_offsets, const int *nsend,
         real_t *recv_buf, const int *recv_offsets, const int *nrecv);
 
+    void communicateDataReverse(
+        int dim, int elem_size,
+        const real_t *send_buf, const int *send_offsets, const int *nsend,
+        real_t *recv_buf, const int *recv_offsets, const int *nrecv);
+        
     void communicateAllData(
         int ndims, int elem_size,
         const real_t *send_buf, const int *send_offsets, const int *nsend,
diff --git a/runtime/feature_property.hpp b/runtime/feature_property.hpp
index 025a514766a01643c21915e362a24481664af5fd..e3221b4cefcfdc20f7d70d612c2d7be3f90c89ae 100644
--- a/runtime/feature_property.hpp
+++ b/runtime/feature_property.hpp
@@ -22,13 +22,13 @@ public:
         nkinds(nkinds_),
         array_size(array_size_) {}
 
-    property_t getId() { return id; }
-    std::string getName() { return name; }
-    void *getHostPointer() { return h_ptr; }
-    void *getDevicePointer() { return d_ptr; }
-    PropertyType getType() { return type; }
-    size_t getNumberOfKinds() { return nkinds; }
-    size_t getArraySize() { return array_size; }
+    property_t getId() const { return id; }
+    std::string getName() const { return name; }
+    void *getHostPointer() const { return h_ptr; }
+    void *getDevicePointer() const { return d_ptr; }
+    PropertyType getType() const { return type; }
+    size_t getNumberOfKinds() const { return nkinds; }
+    size_t getArraySize() const { return array_size; }
 };
 
 }
diff --git a/runtime/math/MathTrait.h b/runtime/math/MathTrait.h
new file mode 100644
index 0000000000000000000000000000000000000000..44362c9f53fc0767ba707b39cb14ce3067cca176
--- /dev/null
+++ b/runtime/math/MathTrait.h
@@ -0,0 +1,633 @@
+#pragma once
+
+//*************************************************************************************************
+// Includes
+//*************************************************************************************************
+
+#include <cstddef>
+
+namespace pairs {
+
+
+//=================================================================================================
+//
+//  MATHEMATICAL TRAIT
+//
+//=================================================================================================
+
+//*************************************************************************************************
+/*!\class MathTrait
+ * \brief Base template for the MathTrait class.
+ * \ingroup math
+ *
+ * \section mathtrait_general General
+ *
+ * The MathTrait class template offers the possibility to select the resulting data type
+ * of a generic mathematical operation. In case of operations between built-in data types,
+ * the MathTrait class defines the more significant data type as the resulting data type.
+ * For this selection, signed data types are given a higher significance. In case of
+ * operations involving user-defined data types, the MathTrait template specifies the
+ * resulting data type of this operation.\n
+ * Specifying the resulting data type for a specific operation is done by specializing
+ * the MathTrait template for this particular type combination. In case a certain type
+ * combination is not defined in a MathTrait specialization, the base template is selected,
+ * which provides no operation result types, so using one stops the compilation process. Each
+ * specialization defines the data types \a HighType that represents the high-order data
+ * type of the two given data types and \a LowType that represents the low-order data type.
+ * Additionally, each specialization defines the types \a AddType, \a SubType, \a MultType
+ * and \a DivType, that represent the type of the resulting data type of the corresponding
+ * mathematical operation. The following example shows the specialization for operations
+ * between the double and the integer type:
+
+   \code
+   template<>
+   struct MathTrait< double, int >
+   {
+      typedef double  HighType;
+      typedef int     LowType;
+      typedef double  AddType;
+      typedef double  SubType;
+      typedef double  MultType;
+      typedef double  DivType;
+   };
+   \endcode
+
+ * By default, the MathTrait template provides specializations for the following built-in
+ * data types:
+ *
+ * <ul>
+ *    <li>integers</li>
+ *    <ul>
+ *       <li>unsigned char, signed char, char, wchar_t</li>
+ *       <li>unsigned short, short</li>
+ *       <li>unsigned int, int</li>
+ *       <li>unsigned long, long</li>
+ *       <li>std::size_t, std::ptrdiff_t (only when _WIN64 is defined)</li>
+ *    </ul>
+ *    <li>floating points</li>
+ *    <ul>
+ *       <li>float</li>
+ *       <li>double</li>
+ *       <li>long double</li>
+ *    </ul>
+ * </ul>
+ *
+ *
+ * \n \section specializations Creating custom specializations
+ *
+ * It is possible to specialize the MathTrait template for additional user-defined data types.
+ * However, it is possible that a specific mathematical operation is invalid for the particular
+ * type combination. In this case, the INVALID_NUMERICAL_TYPE can be used to fill the missing
+ * type definition. The INVALID_NUMERICAL_TYPE represents the resulting data type of an invalid
+ * numerical operation. It is left undefined to stop the compilation process in case it is
+ * instantiated. The following example shows the specialization of the MathTrait template for
+ * Matrix3 and Vector3. In this case, only the multiplication between the matrix and the vector
+ * is a valid numerical operation. Therefore for all other types the INVALID_NUMERICAL_TYPE is
+ * used.
+
+   \code
+   template< typename T1, typename T2 >
+   struct MathTrait< Matrix3<T1>, Vector3<T2> >
+   {
+      typedef INVALID_NUMERICAL_TYPE                          HighType;  // Invalid, no common high data type
+      typedef INVALID_NUMERICAL_TYPE                          LowType;   // Invalid, no common low data type
+      typedef INVALID_NUMERICAL_TYPE                          AddType;   // Invalid, cannot add a matrix and a vector
+      typedef INVALID_NUMERICAL_TYPE                          SubType;   // Invalid, cannot subtract a vector from a matrix
+      typedef Vector3< typename MathTrait<T1,T2>::MultType >  MultType;  // Multiplication between a matrix and a vector
+      typedef INVALID_NUMERICAL_TYPE                          DivType;   // Invalid, cannot divide a matrix by a vector
+   };
+   \endcode
+
+ * \n \section mathtrait_examples Examples
+ *
+ * The following example demonstrates the use of the MathTrait template, where depending on
+ * the two given data types the resulting data type is selected:
+
+   \code
+   template< typename T1, typename T2 >    // The two generic types
+   typename MathTrait<T1,T2>::HighType     // The resulting generic return type
+   add( T1 t1, T2 t2 )                     //
+   {                                       // The function 'add' returns the sum
+      return t1 + t2;                      // of the two given values
+   }                                       //
+   \endcode
+
+ * Additionally, the specializations of the MathTrait template enable arithmetic operations
+ * between any combination of the supported data types:
+
+   \code
+   typedef Vector3< Matrix3< float > >   VectorOfMatrices;  // Vector of single-precision matrices
+   typedef Vector3< Vector3< double > >  VectorOfVectors;   // Vector of double-precision vectors
+   typedef Vector3< double >             VectorOfScalars;   // Vector of double-precision scalars
+
+   VectorOfMatrices vm;  // Setup of a vector of matrices
+   VectorOfVectors  vv;  // Setup of a vector of vectors
+
+   // Calculation of the scalar product between the two vectors. The resulting data type
+   // is a plain 3-dimensional vector of scalar values of type double.
+   VectorOfScalars res = vm * vv;
+   \endcode
+ */
+//*************************************************************************************************
+
+// Generic fallback: looks strange, but is needed for compatibility with the Visual Studio compiler
+// and for backward compatibility with old PAIRS code.
+template< typename T1, typename T2 >
+struct MathTrait
+{
+   using HighType = T1;
+   using LowType = T2;
+   using High = T1;
+   using Low = T2;
+};
+
+template< typename T>
+struct MathTrait< T, T >
+{
+   using HighType = T;
+   using LowType = T;
+   using High = T;
+   using Low = T;
+};
+
+
+//=================================================================================================
+//
+//  MATHTRAIT SPECIALIZATION MACRO
+//
+//=================================================================================================
+
+//*************************************************************************************************
+/*! \cond internal */
+/*!\brief Macro for the creation of MathTrait specializations for the built-in data types.
+ * \ingroup math
+ *
+ * This macro is used for the setup of the MathTrait specializations for the built-in data
+ * types.
+ */
+#define PAIRS_CREATE_MATHTRAIT_SPECIALIZATION(T1,T2,HIGH,LOW) \
+   template<> \
+   struct MathTrait< T1, T2 > \
+   { \
+      typedef HIGH  HighType; \
+      typedef LOW   LowType;  \
+      typedef HIGH  High;     \
+      typedef LOW   Low;      \
+      typedef HIGH  AddType;  \
+      typedef HIGH  SubType;  \
+      typedef HIGH  MultType; \
+      typedef HIGH  DivType;  \
+   }
+/*! \endcond */
+//*************************************************************************************************
+
+
+
+
+//=================================================================================================
+//
+//  UNSIGNED CHAR SPECIALIZATIONS
+//
+//=================================================================================================
+
+//*************************************************************************************************
+/*! \cond internal */
+//                                  Type 1          Type 2          High type       Low type
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned char , unsigned char , unsigned char , unsigned char  );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned char , char          , char          , unsigned char  );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned char , signed char   , signed char   , unsigned char  );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned char , wchar_t       , wchar_t       , unsigned char  );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned char , unsigned short, unsigned short, unsigned char  );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned char , short         , short         , unsigned char  );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned char , unsigned int  , unsigned int  , unsigned char  );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned char , int           , int           , unsigned char  );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned char , unsigned long , unsigned long , unsigned char  );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned char , long          , long          , unsigned char  );
+#if defined(_WIN64)
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned char , std::size_t   , std::size_t   , unsigned char  );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned char , std::ptrdiff_t, std::ptrdiff_t, unsigned char  );
+#endif
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned char , float         , float         , unsigned char  );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned char , double        , double        , unsigned char  );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned char , long double   , long double   , unsigned char  );
+/*! \endcond */
+//*************************************************************************************************
+
+
+
+
+//=================================================================================================
+//
+//  CHAR SPECIALIZATIONS
+//
+//=================================================================================================
+
+//*************************************************************************************************
+/*! \cond internal */
+//                                  Type 1          Type 2          High type       Low type
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( char          , unsigned char , char          , unsigned char  );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( char          , char          , char          , char           );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( char          , signed char   , signed char   , char           );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( char          , wchar_t       , wchar_t       , char           );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( char          , unsigned short, unsigned short, char           );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( char          , short         , short         , char           );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( char          , unsigned int  , unsigned int  , char           );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( char          , int           , int           , char           );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( char          , unsigned long , unsigned long , char           );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( char          , long          , long          , char           );
+#if defined(_WIN64)
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( char          , std::size_t   , std::size_t   , char           );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( char          , std::ptrdiff_t, std::ptrdiff_t, char           );
+#endif
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( char          , float         , float         , char           );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( char          , double        , double        , char           );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( char          , long double   , long double   , char           );
+/*! \endcond */
+//*************************************************************************************************
+
+
+
+
+//=================================================================================================
+//
+//  SIGNED CHAR SPECIALIZATIONS
+//
+//=================================================================================================
+
+//*************************************************************************************************
+/*! \cond internal */
+//                                  Type 1          Type 2          High type       Low type
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( signed char   , unsigned char , signed char   , unsigned char  );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( signed char   , char          , signed char   , char           );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( signed char   , signed char   , signed char   , signed char    );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( signed char   , wchar_t       , wchar_t       , signed char    );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( signed char   , unsigned short, unsigned short, signed char    );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( signed char   , short         , short         , signed char    );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( signed char   , unsigned int  , unsigned int  , signed char    );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( signed char   , int           , int           , signed char    );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( signed char   , unsigned long , unsigned long , signed char    );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( signed char   , long          , long          , signed char    );
+#if defined(_WIN64)
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( signed char   , std::size_t   , std::size_t   , signed char    );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( signed char   , std::ptrdiff_t, std::ptrdiff_t, signed char    );
+#endif
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( signed char   , float         , float         , signed char    );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( signed char   , double        , double        , signed char    );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( signed char   , long double   , long double   , signed char    );
+/*! \endcond */
+//*************************************************************************************************
+
+
+
+
+//=================================================================================================
+//
+//  WCHAR_T SPECIALIZATIONS
+//
+//=================================================================================================
+
+//*************************************************************************************************
+/*! \cond internal */
+//                                  Type 1          Type 2          High type       Low type
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( wchar_t       , unsigned char , wchar_t       , unsigned char  );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( wchar_t       , char          , wchar_t       , char           );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( wchar_t       , signed char   , wchar_t       , signed char    );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( wchar_t       , wchar_t       , wchar_t       , wchar_t        );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( wchar_t       , unsigned short, unsigned short, wchar_t        );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( wchar_t       , short         , short         , wchar_t        );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( wchar_t       , unsigned int  , unsigned int  , wchar_t        );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( wchar_t       , int           , int           , wchar_t        );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( wchar_t       , unsigned long , unsigned long , wchar_t        );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( wchar_t       , long          , long          , wchar_t        );
+#if defined(_WIN64)
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( wchar_t       , std::size_t   , std::size_t   , wchar_t        );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( wchar_t       , std::ptrdiff_t, std::ptrdiff_t, wchar_t        );
+#endif
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( wchar_t       , float         , float         , wchar_t        );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( wchar_t       , double        , double        , wchar_t        );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( wchar_t       , long double   , long double   , wchar_t        );
+/*! \endcond */
+//*************************************************************************************************
+
+
+
+
+//=================================================================================================
+//
+//  UNSIGNED SHORT SPECIALIZATIONS
+//
+//=================================================================================================
+
+//*************************************************************************************************
+/*! \cond internal */
+//                                  Type 1          Type 2          High type       Low type
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned short, unsigned char , unsigned short, unsigned char  );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned short, char          , unsigned short, char           );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned short, signed char   , unsigned short, signed char    );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned short, wchar_t       , unsigned short, wchar_t        );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned short, unsigned short, unsigned short, unsigned short );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned short, short         , short         , unsigned short );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned short, unsigned int  , unsigned int  , unsigned short );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned short, int           , int           , unsigned short );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned short, unsigned long , unsigned long , unsigned short );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned short, long          , long          , unsigned short );
+#if defined(_WIN64)
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned short, std::size_t   , std::size_t   , unsigned short );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned short, std::ptrdiff_t, std::ptrdiff_t, unsigned short );
+#endif
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned short, float         , float         , unsigned short );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned short, double        , double        , unsigned short );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned short, long double   , long double   , unsigned short );
+/*! \endcond */
+//*************************************************************************************************
+
+
+
+
+//=================================================================================================
+//
+//  SHORT SPECIALIZATIONS
+//
+//=================================================================================================
+
+//*************************************************************************************************
+/*! \cond internal */
+//                                  Type 1          Type 2          High type       Low type
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( short         , unsigned char , short         , unsigned char  );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( short         , char          , short         , char           );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( short         , signed char   , short         , signed char    );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( short         , wchar_t       , short         , wchar_t        );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( short         , unsigned short, short         , unsigned short );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( short         , short         , short         , short          );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( short         , unsigned int  , unsigned int  , short          );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( short         , int           , int           , short          );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( short         , unsigned long , unsigned long , short          );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( short         , long          , long          , short          );
+#if defined(_WIN64)
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( short         , std::size_t   , std::size_t   , short          );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( short         , std::ptrdiff_t, std::ptrdiff_t, short          );
+#endif
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( short         , float         , float         , short          );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( short         , double        , double        , short          );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( short         , long double   , long double   , short          );
+/*! \endcond */
+//*************************************************************************************************
+
+
+
+
+//=================================================================================================
+//
+//  UNSIGNED INT SPECIALIZATIONS
+//
+//=================================================================================================
+
+//*************************************************************************************************
+/*! \cond internal */
+//                                  Type 1          Type 2          High type       Low type
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned int  , unsigned char , unsigned int  , unsigned char  );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned int  , char          , unsigned int  , char           );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned int  , signed char   , unsigned int  , signed char    );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned int  , wchar_t       , unsigned int  , wchar_t        );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned int  , unsigned short, unsigned int  , unsigned short );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned int  , short         , unsigned int  , short          );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned int  , unsigned int  , unsigned int  , unsigned int   );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned int  , int           , int           , unsigned int   );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned int  , unsigned long , unsigned long , unsigned int   );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned int  , long          , long          , unsigned int   );
+#if defined(_WIN64)
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned int  , std::size_t   , std::size_t   , unsigned int   );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned int  , std::ptrdiff_t, std::ptrdiff_t, unsigned int   );
+#endif
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned int  , float         , float         , unsigned int   );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned int  , double        , double        , unsigned int   );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned int  , long double   , long double   , unsigned int   );
+/*! \endcond */
+//*************************************************************************************************
+
+
+
+
+//=================================================================================================
+//
+//  INT SPECIALIZATIONS
+//
+//=================================================================================================
+
+//*************************************************************************************************
+/*! \cond internal */
+//                                  Type 1          Type 2          High type       Low type
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( int           , unsigned char , int           , unsigned char  );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( int           , char          , int           , char           );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( int           , signed char   , int           , signed char    );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( int           , wchar_t       , int           , wchar_t        );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( int           , unsigned short, int           , unsigned short );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( int           , short         , int           , short          );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( int           , unsigned int  , int           , unsigned int   );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( int           , int           , int           , int            );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( int           , unsigned long , unsigned long , int            );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( int           , long          , long          , int            );
+#if defined(_WIN64)
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( int           , std::size_t   , std::size_t   , int            );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( int           , std::ptrdiff_t, std::ptrdiff_t, int            );
+#endif
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( int           , float         , float         , int            );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( int           , double        , double        , int            );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( int           , long double   , long double   , int            );
+/*! \endcond */
+//*************************************************************************************************
+
+
+
+
+//=================================================================================================
+//
+//  UNSIGNED LONG SPECIALIZATIONS
+//
+//=================================================================================================
+
+//*************************************************************************************************
+/*! \cond internal */
+//                                  Type 1          Type 2          High type       Low type
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned long , unsigned char , unsigned long , unsigned char  );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned long , char          , unsigned long , char           );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned long , signed char   , unsigned long , signed char    );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned long , wchar_t       , unsigned long , wchar_t        );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned long , unsigned short, unsigned long , unsigned short );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned long , short         , unsigned long , short          );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned long , unsigned int  , unsigned long , unsigned int   );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned long , int           , unsigned long , int            );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned long , unsigned long , unsigned long , unsigned long  );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned long , long          , long          , unsigned long  );
+#if defined(_WIN64)
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned long , std::size_t   , std::size_t   , unsigned long  );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned long , std::ptrdiff_t, std::ptrdiff_t, unsigned long  );
+#endif
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned long , float         , float         , unsigned long  );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned long , double        , double        , unsigned long  );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( unsigned long , long double   , long double   , unsigned long  );
+/*! \endcond */
+//*************************************************************************************************
+
+
+
+
+//=================================================================================================
+//
+//  LONG SPECIALIZATIONS
+//
+//=================================================================================================
+
+//*************************************************************************************************
+/*! \cond internal */
+//                                  Type 1          Type 2          High type       Low type
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( long          , unsigned char , long          , unsigned char  );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( long          , char          , long          , char           );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( long          , signed char   , long          , signed char    );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( long          , wchar_t       , long          , wchar_t        );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( long          , unsigned short, long          , unsigned short );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( long          , short         , long          , short          );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( long          , unsigned int  , long          , unsigned int   );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( long          , int           , long          , int            );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( long          , unsigned long , long          , unsigned long  );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( long          , long          , long          , long           );
+#if defined(_WIN64)
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( long          , std::size_t   , std::size_t   , long           );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( long          , std::ptrdiff_t, std::ptrdiff_t, long           );
+#endif
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( long          , float         , float         , long           );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( long          , double        , double        , long           );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( long          , long double   , long double   , long           );
+/*! \endcond */
+//*************************************************************************************************
+
+
+
+
+//=================================================================================================
+//
+//  SIZE_T SPECIALIZATIONS
+//
+//=================================================================================================
+
+//*************************************************************************************************
+#if defined(_WIN64)
+/*! \cond internal */
+//                                  Type 1          Type 2          High type       Low type
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( std::size_t   , unsigned char , std::size_t   , unsigned char  );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( std::size_t   , char          , std::size_t   , char           );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( std::size_t   , signed char   , std::size_t   , signed char    );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( std::size_t   , wchar_t       , std::size_t   , wchar_t        );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( std::size_t   , unsigned short, std::size_t   , unsigned short );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( std::size_t   , short         , std::size_t   , short          );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( std::size_t   , unsigned int  , std::size_t   , unsigned int   );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( std::size_t   , int           , std::size_t   , int            );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( std::size_t   , unsigned long , std::size_t   , unsigned long  );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( std::size_t   , long          , std::size_t   , long           );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( std::size_t   , std::size_t   , std::size_t   , std::size_t    );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( std::size_t   , float         , float         , std::size_t    );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( std::size_t   , double        , double        , std::size_t    );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( std::size_t   , long double   , long double   , std::size_t    );
+/*! \endcond */
+#endif
+//*************************************************************************************************
+
+
+
+
+//=================================================================================================
+//
+//  FLOAT SPECIALIZATIONS
+//
+//=================================================================================================
+
+//*************************************************************************************************
+/*! \cond internal */
+//                                  Type 1          Type 2          High type       Low type
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( float         , unsigned char , float         , unsigned char  );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( float         , char          , float         , char           );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( float         , signed char   , float         , signed char    );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( float         , wchar_t       , float         , wchar_t        );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( float         , unsigned short, float         , unsigned short );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( float         , short         , float         , short          );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( float         , unsigned int  , float         , unsigned int   );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( float         , int           , float         , int            );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( float         , unsigned long , float         , unsigned long  );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( float         , long          , float         , long           );
+#if defined(_WIN64)
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( float         , std::size_t   , float         , std::size_t    );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( float         , std::ptrdiff_t, float         , std::ptrdiff_t );
+#endif
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( float         , float         , float         , float          );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( float         , double        , double        , float          );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( float         , long double   , long double   , float          );
+/*! \endcond */
+//*************************************************************************************************
+
+
+
+
+//=================================================================================================
+//
+//  DOUBLE SPECIALIZATIONS
+//
+//=================================================================================================
+
+//*************************************************************************************************
+/*! \cond internal */
+//                                  Type 1          Type 2          High type       Low type
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( double        , unsigned char , double        , unsigned char  );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( double        , char          , double        , char           );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( double        , signed char   , double        , signed char    );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( double        , wchar_t       , double        , wchar_t        );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( double        , unsigned short, double        , unsigned short );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( double        , short         , double        , short          );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( double        , unsigned int  , double        , unsigned int   );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( double        , int           , double        , int            );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( double        , unsigned long , double        , unsigned long  );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( double        , long          , double        , long           );
+#if defined(_WIN64)
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( double        , std::size_t   , double        , std::size_t    );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( double        , std::ptrdiff_t, double        , std::ptrdiff_t );
+#endif
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( double        , float         , double        , float          );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( double        , double        , double        , double         );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( double        , long double   , long double   , double         );
+/*! \endcond */
+//*************************************************************************************************
+
+
+
+
+//=================================================================================================
+//
+//  LONG DOUBLE SPECIALIZATIONS
+//
+//=================================================================================================
+
+//*************************************************************************************************
+/*! \cond internal */
+//                                  Type 1          Type 2          High type       Low type
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( long double   , unsigned char , long double   , unsigned char  );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( long double   , char          , long double   , char           );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( long double   , signed char   , long double   , signed char    );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( long double   , wchar_t       , long double   , wchar_t        );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( long double   , unsigned short, long double   , unsigned short );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( long double   , short         , long double   , short          );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( long double   , unsigned int  , long double   , unsigned int   );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( long double   , int           , long double   , int            );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( long double   , unsigned long , long double   , unsigned long  );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( long double   , long          , long double   , long           );
+#if defined(_WIN64)
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( long double   , std::size_t   , long double   , std::size_t    );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( long double   , std::ptrdiff_t, long double   , std::ptrdiff_t );
+#endif
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( long double   , float         , long double   , float          );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( long double   , double        , long double   , double         );
+PAIRS_CREATE_MATHTRAIT_SPECIALIZATION( long double   , long double   , long double   , long double    );
+/*! \endcond */
+//*************************************************************************************************
+
+#undef PAIRS_CREATE_MATHTRAIT_SPECIALIZATION
+
+}
diff --git a/runtime/math/Vector3.hpp b/runtime/math/Vector3.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..2e5b4f0b984953251d7a9791cc6a7354fc4b0bfb
--- /dev/null
+++ b/runtime/math/Vector3.hpp
@@ -0,0 +1,65 @@
+#pragma once
+#include <iostream>
+
+#include "../pairs_common.hpp"
+#include "MathTrait.h"
+
+
+namespace pairs {
+
+#define HIGH typename MathTrait<Type,Other>::High
+
+/**
+ * Three-component vector with element type `Type`.
+ *
+ * The components are stored in an array embedded in the object and default to
+ * Type(). Every member except the defaulted default constructor is marked
+ * PAIRS_ATTR_HOST_DEVICE.
+ */
+template< typename Type >
+class Vector3 {
+public:
+   Vector3() = default;
+
+   // If the constructor is called from device, v_ is automatically allocated on
+   // device because it's a static array embedded in the object itself
+   PAIRS_ATTR_HOST_DEVICE Vector3( Type x, Type y, Type z ) {
+      v_[0] = x;
+      v_[1] = y;
+      v_[2] = z;
+   }
+
+   /**
+    * Component-wise sum of two vectors whose element types may differ.
+    *
+    * The result element type is MathTrait<Type,Other>::High. Both operands are
+    * converted to that type before the addition, so the right-hand side is not
+    * truncated to Type when Other is the higher-order type.
+    */
+   template< typename Other >
+   PAIRS_ATTR_HOST_DEVICE inline Vector3<HIGH> operator+( const Vector3<Other>& rhs ) const {
+      using High = HIGH;
+      // rhs is read through operator[]: for Other != Type, Vector3<Other> is an
+      // unrelated class whose private v_ is not accessible from here.
+      return Vector3<High>(
+         static_cast<High>(v_[0]) + static_cast<High>(rhs[0]),
+         static_cast<High>(v_[1]) + static_cast<High>(rhs[1]),
+         static_cast<High>(v_[2]) + static_cast<High>(rhs[2]) );
+   }
+
+   // Unchecked mutable access to component `index` (no bounds check is performed).
+   PAIRS_ATTR_HOST_DEVICE Type& operator[]( int index ) {
+      return v_[index];
+   }
+
+   // Unchecked read-only access to component `index` (no bounds check is performed).
+   PAIRS_ATTR_HOST_DEVICE const Type& operator[] ( int index ) const {
+      return v_[index];
+   }
+
+private:
+   Type v_[3] = {Type(), Type(), Type()};
+};
+#undef HIGH
+
+}
diff --git a/runtime/pairs.cpp b/runtime/pairs.cpp
index f3d56730ab38e04fd4f9ea975643069d78cfe2db..6efead8d8cb598c1fcd481beb35c12464c235830 100644
--- a/runtime/pairs.cpp
+++ b/runtime/pairs.cpp
@@ -7,39 +7,61 @@
 #include "pairs.hpp"
 #include "pairs_common.hpp"
 #include "devices/device.hpp"
+#include "domain/block_forest.hpp"
 #include "domain/regular_6d_stencil.hpp"
 
 namespace pairs {
 
-void PairsSimulation::initDomain(
+void PairsRuntime::initDomain(
     int *argc, char ***argv,
-    real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax) {
+    real_t xmin, real_t ymin, real_t zmin, real_t xmax, real_t ymax, real_t zmax, 
+    bool pbcx, bool pbcy, bool pbcz, 
+    bool balance_workload) {
+
+    int mpi_initialized=0;
+    MPI_Initialized(&mpi_initialized);
+    
+    if(mpi_initialized){ 
+        PAIRS_ERROR("MPI is already initialized!\n"); 
+        exit(-1);
+    }
+    if(dom_part){ 
+        PAIRS_ERROR("DomainPartitioner already exists!\n"); 
+        exit(-1);
+    }
 
-    if(dom_part_type == Regular) {
+    if(dom_part_type == RegularPartitioning) {
         const int flags[] = {1, 1, 1};
         dom_part = new Regular6DStencil(xmin, xmax, ymin, ymax, zmin, zmax, flags);
-    } else if(dom_part_type == RegularXY) {
+    } else if(dom_part_type == RegularXYPartitioning) {
         const int flags[] = {1, 1, 0};
         dom_part = new Regular6DStencil(xmin, xmax, ymin, ymax, zmin, zmax, flags);
-    } else {
-        PAIRS_EXCEPTION("Domain partitioning type not implemented!\n");
+    } 
+    
+#ifdef USE_WALBERLA
+    else if(dom_part_type == BlockForestPartitioning) {
+        dom_part = new BlockForest(this, xmin, xmax, ymin, ymax, zmin, zmax, pbcx, pbcy, pbcz, balance_workload);
+    } 
+#endif
+
+    else {
+        PAIRS_ERROR("(initDomain) Domain partitioning type not implemented!\n");
+        exit(-1);
     }
 
     dom_part->initialize(argc, argv);
 }
 
-void PairsSimulation::addArray(Array array) {
-    int id = array.getId();
-    auto a = std::find_if(
-        arrays.begin(),
-        arrays.end(),
-        [id](Array _a) { return _a.getId() == id; });
+void PairsRuntime::addArray(Array array) {
+    PAIRS_ASSERT(
+        std::find_if(arrays.begin(), arrays.end(), [array](Array _a) {
+            return _a.getId() == array.getId();
+        }) == std::end(arrays));
 
-    PAIRS_ASSERT(a == std::end(arrays));
     arrays.push_back(array);
 }
 
-Array &PairsSimulation::getArray(array_t id) {
+Array &PairsRuntime::getArray(array_t id) {
     auto a = std::find_if(
         arrays.begin(),
         arrays.end(),
@@ -49,7 +71,7 @@ Array &PairsSimulation::getArray(array_t id) {
     return *a;
 }
 
-Array &PairsSimulation::getArrayByName(std::string name) {
+Array &PairsRuntime::getArrayByName(std::string name) {
     auto a = std::find_if(
         arrays.begin(),
         arrays.end(),
@@ -59,7 +81,7 @@ Array &PairsSimulation::getArrayByName(std::string name) {
     return *a;
 }
 
-Array &PairsSimulation::getArrayByHostPointer(const void *h_ptr) {
+Array &PairsRuntime::getArrayByHostPointer(const void *h_ptr) {
     auto a = std::find_if(
         arrays.begin(),
         arrays.end(),
@@ -69,18 +91,16 @@ Array &PairsSimulation::getArrayByHostPointer(const void *h_ptr) {
     return *a;
 }
 
-void PairsSimulation::addProperty(Property prop) {
-    int id = prop.getId();
-    auto p = std::find_if(
-        properties.begin(),
-        properties.end(),
-        [id](Property _p) { return _p.getId() == id; });
+void PairsRuntime::addProperty(Property prop) {
+    PAIRS_ASSERT(
+        std::find_if(properties.begin(), properties.end(), [prop](Property _p) {
+            return _p.getId() == prop.getId();
+        }) == std::end(properties));
 
-    PAIRS_ASSERT(p == std::end(properties));
     properties.push_back(prop);
 }
 
-Property &PairsSimulation::getProperty(property_t id) {
+Property &PairsRuntime::getProperty(property_t id) {
     auto p = std::find_if(
         properties.begin(),
         properties.end(),
@@ -90,7 +110,7 @@ Property &PairsSimulation::getProperty(property_t id) {
     return *p;
 }
 
-Property &PairsSimulation::getPropertyByName(std::string name) {
+Property &PairsRuntime::getPropertyByName(std::string name) {
     auto p = std::find_if(
         properties.begin(),
         properties.end(),
@@ -100,18 +120,19 @@ Property &PairsSimulation::getPropertyByName(std::string name) {
     return *p;
 }
 
-void PairsSimulation::addContactProperty(ContactProperty contact_prop) {
-    int id = contact_prop.getId();
-    auto cp = std::find_if(
-        contact_properties.begin(),
-        contact_properties.end(),
-        [id](ContactProperty _cp) { return _cp.getId() == id; });
+void PairsRuntime::addContactProperty(ContactProperty contact_prop) {
+    PAIRS_ASSERT(
+        std::find_if(
+            contact_properties.begin(),
+            contact_properties.end(),
+            [contact_prop](ContactProperty _cp) {
+                return _cp.getId() == contact_prop.getId();
+            }) == std::end(contact_properties));
 
-    PAIRS_ASSERT(cp == std::end(contact_properties));
     contact_properties.push_back(contact_prop);
 }
 
-ContactProperty &PairsSimulation::getContactProperty(property_t id) {
+ContactProperty &PairsRuntime::getContactProperty(property_t id) {
     auto cp = std::find_if(
         contact_properties.begin(),
         contact_properties.end(),
@@ -121,7 +142,7 @@ ContactProperty &PairsSimulation::getContactProperty(property_t id) {
     return *cp;
 }
 
-ContactProperty &PairsSimulation::getContactPropertyByName(std::string name) {
+ContactProperty &PairsRuntime::getContactPropertyByName(std::string name) {
     auto cp = std::find_if(
         contact_properties.begin(),
         contact_properties.end(),
@@ -131,18 +152,19 @@ ContactProperty &PairsSimulation::getContactPropertyByName(std::string name) {
     return *cp;
 }
 
-void PairsSimulation::addFeatureProperty(FeatureProperty feature_prop) {
-    int id = feature_prop.getId();
-    auto fp = std::find_if(
-        feature_properties.begin(),
-        feature_properties.end(),
-        [id](FeatureProperty _fp) { return _fp.getId() == id; });
+void PairsRuntime::addFeatureProperty(FeatureProperty feature_prop) {
+    PAIRS_ASSERT(
+        std::find_if(
+            feature_properties.begin(),
+            feature_properties.end(),
+            [feature_prop](FeatureProperty _fp) {
+                return _fp.getId() == feature_prop.getId();
+            }) == std::end(feature_properties));
 
-    PAIRS_ASSERT(fp == std::end(feature_properties));
     feature_properties.push_back(feature_prop);
 }
 
-FeatureProperty &PairsSimulation::getFeatureProperty(property_t id) {
+FeatureProperty &PairsRuntime::getFeatureProperty(property_t id) {
     auto fp = std::find_if(feature_properties.begin(),
                            feature_properties.end(),
                            [id](FeatureProperty _fp) { return _fp.getId() == id; });
@@ -150,7 +172,7 @@ FeatureProperty &PairsSimulation::getFeatureProperty(property_t id) {
     return *fp;
 }
 
-FeatureProperty &PairsSimulation::getFeaturePropertyByName(std::string name) {
+FeatureProperty &PairsRuntime::getFeaturePropertyByName(std::string name) {
     auto fp = std::find_if(feature_properties.begin(),
                            feature_properties.end(),
                            [name](FeatureProperty _fp) { return _fp.getName() == name; });
@@ -158,7 +180,7 @@ FeatureProperty &PairsSimulation::getFeaturePropertyByName(std::string name) {
     return *fp;
 }
 
-void PairsSimulation::copyArraySliceToDevice(
+void PairsRuntime::copyArraySliceToDevice(
     Array &array, action_t action, size_t offset, size_t size) {
 
     int array_id = array.getId();
@@ -167,7 +189,7 @@ void PairsSimulation::copyArraySliceToDevice(
         if(action == Ignore || !array_flags->isDeviceFlagSet(array_id)) {
             if(!array.isStatic()) {
                 PAIRS_DEBUG(
-                    "Copying array %s to device (offset=%d, n=%d)\n",
+                    "Copying array %s to device (offset=%zu, n=%zu)\n",
                     array.getName().c_str(), offset, size);
 
                 pairs::copy_slice_to_device(
@@ -183,16 +205,23 @@ void PairsSimulation::copyArraySliceToDevice(
     array_flags->setDeviceFlag(array_id);
 }
 
-void PairsSimulation::copyArrayToDevice(Array &array, action_t action, size_t size) {
+void PairsRuntime::copyArrayToDevice(Array &array, action_t action, size_t size) {
     int array_id = array.getId();
 
     if(action == Ignore || action == WriteAfterRead || action == ReadOnly) {
         if(action == Ignore || !array_flags->isDeviceFlagSet(array_id)) {
             if(array.isStatic()) {
-                PAIRS_DEBUG("Copying static array %s to device (n=%d)\n", array.getName().c_str(), size);
-                pairs::copy_static_symbol_to_device(array.getHostPointer(), array.getDevicePointer(), size);
+                PAIRS_DEBUG(
+                    "Copying static array %s to device (n=%zu)\n",
+                    array.getName().c_str(), size);
+
+                pairs::copy_static_symbol_to_device(
+                    array.getHostPointer(), array.getDevicePointer(), size);
             } else {
-                PAIRS_DEBUG("Copying array %s to device (n=%d)\n", array.getName().c_str(), size);
+                PAIRS_DEBUG(
+                    "Copying array %s to device (n=%zu)\n",
+                    array.getName().c_str(), size);
+
                 pairs::copy_to_device(array.getHostPointer(), array.getDevicePointer(), size);
             }
         }
@@ -205,14 +234,14 @@ void PairsSimulation::copyArrayToDevice(Array &array, action_t action, size_t si
     array_flags->setDeviceFlag(array_id);
 }
 
-void PairsSimulation::copyArraySliceToHost(Array &array, action_t action, size_t offset, size_t size) {
+void PairsRuntime::copyArraySliceToHost(Array &array, action_t action, size_t offset, size_t size) {
     int array_id = array.getId();
 
     if(action == Ignore || action == WriteAfterRead || action == ReadOnly) {
         if(action == Ignore || !array_flags->isHostFlagSet(array_id)) {
             if(!array.isStatic()) {
                 PAIRS_DEBUG(
-                    "Copying array %s to host (offset=%d, n=%d)\n",
+                    "Copying array %s to host (offset=%zu, n=%zu)\n",
                     array.getName().c_str(), offset, size);
 
                 pairs::copy_slice_to_host(
@@ -228,16 +257,19 @@ void PairsSimulation::copyArraySliceToHost(Array &array, action_t action, size_t
     array_flags->setHostFlag(array_id);
 }
 
-void PairsSimulation::copyArrayToHost(Array &array, action_t action, size_t size) {
+void PairsRuntime::copyArrayToHost(Array &array, action_t action, size_t size) {
     int array_id = array.getId();
 
     if(action == Ignore || action == WriteAfterRead || action == ReadOnly) {
         if(action == Ignore || !array_flags->isHostFlagSet(array_id)) {
             if(array.isStatic()) {
-                PAIRS_DEBUG("Copying static array %s to host (n=%d)\n", array.getName().c_str(), size);
-                pairs::copy_static_symbol_to_host(array.getDevicePointer(), array.getHostPointer(), size);
+                PAIRS_DEBUG(
+                    "Copying static array %s to host (n=%zu)\n", array.getName().c_str(), size);
+
+                pairs::copy_static_symbol_to_host(
+                    array.getDevicePointer(), array.getHostPointer(), size);
             } else {
-                PAIRS_DEBUG("Copying array %s to host (n=%d)\n", array.getName().c_str(), size);
+                PAIRS_DEBUG("Copying array %s to host (n=%zu)\n", array.getName().c_str(), size);
                 pairs::copy_to_host(array.getDevicePointer(), array.getHostPointer(), size);
             }
         }
@@ -250,12 +282,12 @@ void PairsSimulation::copyArrayToHost(Array &array, action_t action, size_t size
     array_flags->setHostFlag(array_id);
 }
 
-void PairsSimulation::copyPropertyToDevice(Property &prop, action_t action, size_t size) {
+void PairsRuntime::copyPropertyToDevice(Property &prop, action_t action, size_t size) {
     int prop_id = prop.getId();
 
     if(action == Ignore || action == WriteAfterRead || action == ReadOnly) {
         if(action == Ignore || !prop_flags->isDeviceFlagSet(prop_id)) {
-            PAIRS_DEBUG("Copying property %s to device (n=%d)\n", prop.getName().c_str(), size);
+            PAIRS_DEBUG("Copying property %s to device (n=%zu)\n", prop.getName().c_str(), size);
             pairs::copy_to_device(prop.getHostPointer(), prop.getDevicePointer(), size);
         }
     }
@@ -267,12 +299,12 @@ void PairsSimulation::copyPropertyToDevice(Property &prop, action_t action, size
     prop_flags->setDeviceFlag(prop_id);
 }
 
-void PairsSimulation::copyPropertyToHost(Property &prop, action_t action, size_t size) {
+void PairsRuntime::copyPropertyToHost(Property &prop, action_t action, size_t size) {
     int prop_id = prop.getId();
 
     if(action == Ignore || action == WriteAfterRead || action == ReadOnly) {
         if(action == Ignore || !prop_flags->isHostFlagSet(prop_id)) {
-            PAIRS_DEBUG("Copying property %s to host (n=%d)\n", prop.getName().c_str(), size);
+            PAIRS_DEBUG("Copying property %s to host (n=%zu)\n", prop.getName().c_str(), size);
             pairs::copy_to_host(prop.getDevicePointer(), prop.getHostPointer(), size);
         }
     }
@@ -284,15 +316,19 @@ void PairsSimulation::copyPropertyToHost(Property &prop, action_t action, size_t
     prop_flags->setHostFlag(prop_id);
 }
 
-void PairsSimulation::copyContactPropertyToDevice(
+void PairsRuntime::copyContactPropertyToDevice(
     ContactProperty &contact_prop, action_t action, size_t size) {
 
     int prop_id = contact_prop.getId();
 
     if(action == Ignore || action == WriteAfterRead || action == ReadOnly) {
         if(action == Ignore || !contact_prop_flags->isDeviceFlagSet(prop_id)) {
-            PAIRS_DEBUG("Copying contact property %s to device (n=%d)\n", contact_prop.getName().c_str(), size);
-            pairs::copy_to_device(contact_prop.getHostPointer(), contact_prop.getDevicePointer(), size);
+            PAIRS_DEBUG("Copying contact property %s to device (n=%lu)\n",
+                contact_prop.getName().c_str(), size);
+
+            pairs::copy_to_device(
+                contact_prop.getHostPointer(), contact_prop.getDevicePointer(), size);
+
             contact_prop_flags->setDeviceFlag(prop_id);
         }
     }
@@ -302,15 +338,19 @@ void PairsSimulation::copyContactPropertyToDevice(
     }
 }
 
-void PairsSimulation::copyContactPropertyToHost(
+void PairsRuntime::copyContactPropertyToHost(
     ContactProperty &contact_prop, action_t action, size_t size) {
 
     int prop_id = contact_prop.getId();
 
     if(action == Ignore || action == WriteAfterRead || action == ReadOnly) {
         if(!contact_prop_flags->isHostFlagSet(contact_prop.getId())) {
-            PAIRS_DEBUG("Copying contact property %s to host (n=%d)\n", contact_prop.getName().c_str(), size);
-            pairs::copy_to_host(contact_prop.getDevicePointer(), contact_prop.getHostPointer(), size);
+            PAIRS_DEBUG("Copying contact property %s to host (n=%lu)\n",
+                contact_prop.getName().c_str(), size);
+
+            pairs::copy_to_host(
+                contact_prop.getDevicePointer(), contact_prop.getHostPointer(), size);
+
             contact_prop_flags->setHostFlag(prop_id);
         }
     }
@@ -320,13 +360,17 @@ void PairsSimulation::copyContactPropertyToHost(
     }
 }
 
-void PairsSimulation::copyFeaturePropertyToDevice(FeatureProperty &feature_prop) {
+void PairsRuntime::copyFeaturePropertyToDevice(FeatureProperty &feature_prop) {
     const size_t n = feature_prop.getArraySize();
-    PAIRS_DEBUG("Copying feature property %s to device (n=%d)\n", feature_prop.getName().c_str(), n);
-    pairs::copy_static_symbol_to_device(feature_prop.getHostPointer(), feature_prop.getDevicePointer(), n);
+    // Debug trace; n is a size_t, so it is formatted with %zu rather than %lu.
+    PAIRS_DEBUG("Copying feature property %s to device (n=%zu)\n",
+        feature_prop.getName().c_str(), n);
+    // Hand the host pointer, the device pointer and n to the static-symbol copy routine.
+    pairs::copy_static_symbol_to_device(
+        feature_prop.getHostPointer(), feature_prop.getDevicePointer(), n);
 }
 
-void PairsSimulation::communicateSizes(int dim, const int *send_sizes, int *recv_sizes) {
+void PairsRuntime::communicateSizes(int dim, const int *send_sizes, int *recv_sizes) {
     auto nsend_id = getArrayByHostPointer(send_sizes).getId();
     auto nrecv_id = getArrayByHostPointer(recv_sizes).getId();
 
@@ -341,7 +385,7 @@ void PairsSimulation::communicateSizes(int dim, const int *send_sizes, int *recv
     this->getTimers()->stop(Communication);
 }
 
-void PairsSimulation::communicateData(
+void PairsRuntime::communicateData(
     int dim, int elem_size,
     const real_t *send_buf, const int *send_offsets, const int *nsend,
     real_t *recv_buf, const int *recv_offsets, const int *nrecv) {
@@ -369,13 +413,22 @@ void PairsSimulation::communicateData(
     #else
     int nsend_all = 0;
     int nrecv_all = 0;
-    for(int d = 0; d <= dim; d++) {
-        nsend_all += nsend[d * 2 + 0];
-        nsend_all += nsend[d * 2 + 1];
-        nrecv_all += nrecv[d * 2 + 0];
-        nrecv_all += nrecv[d * 2 + 1];
+    if(this->dom_part_type == RegularPartitioning || this->dom_part_type == RegularXYPartitioning){
+        for(int d = 0; d <= dim; d++) {
+            nsend_all += nsend[d * 2 + 0];
+            nsend_all += nsend[d * 2 + 1];
+            nrecv_all += nrecv[d * 2 + 0];
+            nrecv_all += nrecv[d * 2 + 1];
+        }
     }
-
+    else if (this->dom_part_type == BlockForestPartitioning){
+        int nranks = this->getDomainPartitioner()->getNumberOfNeighborRanks();
+        for (int n=0; n<nranks; ++n){
+            nsend_all += nsend[n];
+            nrecv_all += nrecv[n];
+        }
+    }
+    
     copyArrayToHost(send_buf_id, Ignore, nsend_all * elem_size * sizeof(real_t));
     array_flags->setHostFlag(recv_buf_id);
     array_flags->clearDeviceFlag(recv_buf_id);
@@ -395,7 +448,70 @@ void PairsSimulation::communicateData(
     #endif
 }
 
-void PairsSimulation::communicateAllData(
+void PairsRuntime::communicateDataReverse(
+    int dim, int elem_size,
+    const real_t *send_buf, const int *send_offsets, const int *nsend,
+    real_t *recv_buf, const int *recv_offsets, const int *nrecv) {
+    // Reverse-direction exchange: stages the buffers, then delegates to the domain partitioner.
+    const real_t *send_buf_ptr = send_buf;
+    real_t *recv_buf_ptr = recv_buf;
+    auto send_buf_array = getArrayByHostPointer(send_buf);
+    auto recv_buf_array = getArrayByHostPointer(recv_buf);
+    auto send_buf_id = send_buf_array.getId();
+    auto recv_buf_id = recv_buf_array.getId();
+    auto send_offsets_id = getArrayByHostPointer(send_offsets).getId();
+    auto recv_offsets_id = getArrayByHostPointer(recv_offsets).getId();
+    auto nsend_id = getArrayByHostPointer(nsend).getId();
+    auto nrecv_id = getArrayByHostPointer(nrecv).getId();
+    // Request the offset and count arrays on the host with the ReadOnly action (timed as DeviceTransfers).
+    this->getTimers()->start(DeviceTransfers);
+    copyArrayToHost(send_offsets_id, ReadOnly);
+    copyArrayToHost(recv_offsets_id, ReadOnly);
+    copyArrayToHost(nsend_id, ReadOnly);
+    copyArrayToHost(nrecv_id, ReadOnly);
+    // With ENABLE_CUDA_AWARE_MPI the device pointers are passed on; otherwise the buffers are staged on the host.
+    #ifdef ENABLE_CUDA_AWARE_MPI
+    send_buf_ptr = (real_t *) send_buf_array.getDevicePointer();
+    recv_buf_ptr = (real_t *) recv_buf_array.getDevicePointer();
+    #else
+    int nsend_all = 0;
+    int nrecv_all = 0;
+    if(this->dom_part_type == RegularPartitioning || this->dom_part_type == RegularXYPartitioning){
+        for(int d = 2; d >= dim; d--) {   // walks d from 2 down to dim, two entries (2d, 2d+1) per d
+            nsend_all += nsend[d * 2 + 0];
+            nsend_all += nsend[d * 2 + 1];
+            nrecv_all += nrecv[d * 2 + 0];
+            nrecv_all += nrecv[d * 2 + 1];
+        }
+    }
+    else if (this->dom_part_type == BlockForestPartitioning){
+        int nranks = this->getDomainPartitioner()->getNumberOfNeighborRanks();
+        for (int n=0; n<nranks; ++n){   // blockforest doesn't need reverse loop
+            nsend_all += nsend[n];
+            nrecv_all += nrecv[n];
+        }
+    }
+    // Copy nsend_all * elem_size reals of the send buffer to the host; set the host flag and clear the device flag of the receive buffer.
+    copyArrayToHost(send_buf_id, Ignore, nsend_all * elem_size * sizeof(real_t));
+    array_flags->setHostFlag(recv_buf_id);
+    array_flags->clearDeviceFlag(recv_buf_id);
+    #endif
+
+    this->getTimers()->stop(DeviceTransfers);
+    // The exchange itself is performed by the domain partitioner (timed as Communication).
+    this->getTimers()->start(Communication);
+    this->getDomainPartitioner()->communicateDataReverse(
+        dim, elem_size, send_buf_ptr, send_offsets, nsend, recv_buf_ptr, recv_offsets, nrecv);
+    this->getTimers()->stop(Communication);
+    // Without CUDA-aware MPI, copy nrecv_all * elem_size reals of the receive buffer back to the device.
+    #ifndef ENABLE_CUDA_AWARE_MPI
+    this->getTimers()->start(DeviceTransfers);
+    copyArrayToDevice(recv_buf_id, Ignore, nrecv_all * elem_size * sizeof(real_t));
+    this->getTimers()->stop(DeviceTransfers);
+    #endif
+}
+
+void PairsRuntime::communicateAllData(
     int ndims, int elem_size,
     const real_t *send_buf, const int *send_offsets, const int *nsend,
     real_t *recv_buf, const int *recv_offsets, const int *nrecv) {
@@ -423,11 +539,20 @@ void PairsSimulation::communicateAllData(
     #else
     int nsend_all = 0;
     int nrecv_all = 0;
-    for(int d = 0; d <= ndims; d++) {
-        nsend_all += nsend[d * 2 + 0];
-        nsend_all += nsend[d * 2 + 1];
-        nrecv_all += nrecv[d * 2 + 0];
-        nrecv_all += nrecv[d * 2 + 1];
+    if(this->dom_part_type == RegularPartitioning || this->dom_part_type == RegularXYPartitioning){
+        for(int d = 0; d <= ndims; d++) {
+            nsend_all += nsend[d * 2 + 0];
+            nsend_all += nsend[d * 2 + 1];
+            nrecv_all += nrecv[d * 2 + 0];
+            nrecv_all += nrecv[d * 2 + 1];
+        }
+    }
+    else if (this->dom_part_type == BlockForestPartitioning){
+        int nranks = this->getDomainPartitioner()->getNumberOfNeighborRanks();
+        for (int n=0; n<nranks; ++n){
+            nsend_all += nsend[n];
+            nrecv_all += nrecv[n];
+        }
     }
 
     copyArrayToHost(send_buf_id, Ignore, nsend_all * elem_size * sizeof(real_t));
@@ -449,7 +574,7 @@ void PairsSimulation::communicateAllData(
     #endif
 }
 
-void PairsSimulation::communicateContactHistoryData(
+void PairsRuntime::communicateContactHistoryData(
     int dim, int nelems_per_contact,
     const real_t *send_buf, const int *contact_soffsets, const int *nsend_contact,
     real_t *recv_buf, int *contact_roffsets, int *nrecv_contact) {
@@ -515,8 +640,8 @@ void PairsSimulation::communicateContactHistoryData(
     #endif
 }
 
-void PairsSimulation::fillCommunicationArrays(int *neighbor_ranks, int *pbc, real_t *subdom) {
-    this->getDomainPartitioner()->fillArrays(neighbor_ranks, pbc, subdom);
+void PairsRuntime::copyRuntimeArray(const std::string& name, void *dest, const int size) {
+    this->getDomainPartitioner()->copyRuntimeArray(name, dest, size);  // forwarded unchanged to the current domain partitioner
 }
 
 }
diff --git a/runtime/pairs.hpp b/runtime/pairs.hpp
index 8944dfda738602867d7a9fb768cf81fd7e083d90..e87dec06224d830f8f15fdc2e593278c00d100e6 100644
--- a/runtime/pairs.hpp
+++ b/runtime/pairs.hpp
@@ -12,32 +12,37 @@
 #include "property.hpp"
 #include "runtime_var.hpp"
 #include "timers.hpp"
+#include "tracked_variable.hpp"
 #include "devices/device.hpp"
+#include "domain/block_forest.hpp"
 #include "domain/regular_6d_stencil.hpp"
 
 #pragma once
 
-#define FLAGS_INFINITE  (1 << 0)
-#define FLAGS_GHOST     (1 << 1)
-#define FLAGS_FIXED     (1 << 2)
-#define FLAGS_GLOBAL    (1 << 3)
 
 namespace pairs {
 
-class PairsSimulation {
+class PairsRuntime {
 private:
-    Regular6DStencil *dom_part;
-    //DomainPartitioner *dom_part;
+    DomainPartitioner *dom_part;
     DomainPartitioners dom_part_type;
     std::vector<Property> properties;
     std::vector<ContactProperty> contact_properties;
     std::vector<FeatureProperty> feature_properties;
     std::vector<Array> arrays;
+    std::vector<TrackedVariable> tracked_variables;
     DeviceFlags *prop_flags, *contact_prop_flags, *array_flags;
     Timers<double> *timers;
+    int *nlocal, *nghost;
 
 public:
-    PairsSimulation(int nprops_, int ncontactprops_, int narrays_, DomainPartitioners dom_part_type_) {
+    PairsRuntime(
+        int nprops_,
+        int ncontactprops_,
+        int narrays_,
+        DomainPartitioners dom_part_type_) {
+
+        dom_part = nullptr;
         dom_part_type = dom_part_type_;
         prop_flags = new DeviceFlags(nprops_);
         contact_prop_flags = new DeviceFlags(ncontactprops_);
@@ -45,7 +50,7 @@ public:
         timers = new Timers<double>(1e-6);
     }
 
-    ~PairsSimulation() {
+    ~PairsRuntime() {
         dom_part->finalize();
         delete prop_flags;
         delete contact_prop_flags;
@@ -55,8 +60,40 @@ public:
 
     // Variables
     template<typename T>
-    RuntimeVar<T> addDeviceVariable(T *h_ptr) {
-       return RuntimeVar<T>(h_ptr); 
+    RuntimeVar<T> &addDeviceVariable(T *h_ptr) {
+        // TODO: Proper memory management for RuntimeVar variables (the object allocated below is never freed here)
+        RuntimeVar<T> *ret = new RuntimeVar<T>(h_ptr);
+        return *ret; 
+    }
+
+    void trackVariable(std::string variable_name, void *ptr) {   // registers ptr under variable_name
+        PAIRS_ASSERT(
+            std::find_if(tracked_variables.begin(), tracked_variables.end(),
+            [variable_name](TrackedVariable _v) {
+                return _v.getName() == variable_name;
+            }) == std::end(tracked_variables));
+        // The assertion above requires that no tracked variable already has this name.
+        tracked_variables.push_back(TrackedVariable(variable_name, ptr)); 
+    }
+
+    TrackedVariable &getTrackedVariable(std::string variable_name) {   // linear search by name; returns a reference into tracked_variables
+        auto v = std::find_if(
+            tracked_variables.begin(),
+            tracked_variables.end(),
+            [variable_name](TrackedVariable _v) { return _v.getName() == variable_name; });
+        // NOTE(review): a missing name is only caught by PAIRS_ASSERT; confirm how that macro behaves in non-DEBUG builds.
+        PAIRS_ASSERT(v != std::end(tracked_variables));
+        return *v;
+    }
+
+    void setTrackedVariableAsInteger(std::string variable_name, int value) {   // writes value through the tracked pointer
+        auto& tv = getTrackedVariable(variable_name);
+        *(static_cast<int *>(tv.getPointer())) = value;   // the stored void* is treated as int*; no type check is done here
+    }
+
+    int getTrackedVariableAsInteger(std::string variable_name) {   // reads an int through the tracked pointer
+        auto& tv = getTrackedVariable(variable_name);
+        return *(static_cast<int *>(tv.getPointer()));   // the stored void* is treated as int*; no type check is done here
     }
 
     // Arrays
@@ -108,6 +145,7 @@ public:
     void copyArraySliceToHost(Array &array, action_t action, size_t offset, size_t size);
 
     // Properties
+    std::vector<Property> &getProperties() { return properties; };
     Property &getProperty(property_t id);
     Property &getPropertyByName(std::string name);
     void addProperty(Property prop);
@@ -115,11 +153,11 @@ public:
     template<typename T_ptr>
     void addProperty(
         property_t id, std::string name, T_ptr **h_ptr, std::nullptr_t,
-        PropertyType type, layout_t layout, size_t sx, size_t sy = 1);
+        PropertyType type, layout_t layout, int vol, size_t sx, size_t sy = 1);
 
     template<typename T_ptr> void addProperty(
         property_t id, std::string name, T_ptr **h_ptr, T_ptr **d_ptr,
-        PropertyType type, layout_t layout, size_t sx, size_t sy = 1);
+        PropertyType type, layout_t layout, int vol, size_t sx, size_t sy = 1);
 
     template<typename T_ptr>
     void reallocProperty(property_t id, T_ptr **h_ptr, std::nullptr_t, size_t sx = 1, size_t sy = 1);
@@ -131,6 +169,10 @@ public:
         return static_cast<IntProperty&>(prop);
     }
 
+    inline UInt64Property &getAsUInt64Property(Property &prop) {   // views prop as a UInt64Property; its type tag is not checked here
+        return static_cast<UInt64Property&>(prop);
+    }
+
     inline FloatProperty &getAsFloatProperty(Property &prop) {
         return static_cast<FloatProperty&>(prop);
     }
@@ -151,6 +193,10 @@ public:
         return static_cast<IntProperty&>(getProperty(property));
     }
 
+    inline UInt64Property &getUInt64Property(property_t property) {   // looks the property up via getProperty() and views it as a UInt64Property
+        return static_cast<UInt64Property&>(getProperty(property));
+    }
+
     inline FloatProperty &getFloatProperty(property_t property) {
         return static_cast<FloatProperty&>(getProperty(property));
     }
@@ -176,6 +222,10 @@ public:
         copyPropertyToDevice(getProperty(id), action, size);
     }
 
+    void copyPropertyToDevice(Property &prop, action_t action) {   // convenience overload: size defaults to prop.getTotalSize()
+        copyPropertyToDevice(prop, action, prop.getTotalSize());
+    }
+    
     void copyPropertyToDevice(Property &prop, action_t action, size_t size);
 
     void copyPropertyToHost(property_t id, action_t action) {
@@ -193,6 +243,14 @@ public:
 
     void copyPropertyToHost(Property &prop, action_t action, size_t size);
 
+    DeviceFlags* getPropFlags(){   // returns the prop_flags pointer itself, not a copy of the flags
+        return prop_flags;
+    }
+
+    DeviceFlags* getArrayFlags(){   // returns the array_flags pointer itself, not a copy of the flags
+        return array_flags;
+    }
+    
     // Contact properties
     ContactProperty &getContactProperty(property_t id);
     ContactProperty &getContactPropertyByName(std::string name);
@@ -262,9 +320,15 @@ public:
     // Communication
     void initDomain(
         int *argc, char ***argv,
-        real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax);
+        real_t xmin, real_t ymin, real_t zmin, real_t xmax, real_t ymax, real_t zmax, 
+        bool pbcx = 0, bool pbcy = 0, bool pbcz = 0, bool balance_workload = 0);
 
-    Regular6DStencil *getDomainPartitioner() { return dom_part; }
+    template<typename Domain_T>
+    void useDomain(const std::shared_ptr<Domain_T> &domain_ptr);
+
+    void updateDomain() { dom_part->update(); }
+
+    DomainPartitioner *getDomainPartitioner() { return dom_part; }
     void communicateSizes(int dim, const int *send_sizes, int *recv_sizes);
 
     void communicateData(
@@ -272,6 +336,11 @@ public:
         const real_t *send_buf, const int *send_offsets, const int *nsend,
         real_t *recv_buf, const int *recv_offsets, const int *nrecv);
 
+    void communicateDataReverse(
+        int dim, int elem_size,
+        const real_t *send_buf, const int *send_offsets, const int *nsend,
+        real_t *recv_buf, const int *recv_offsets, const int *nrecv);
+        
     void communicateAllData(
         int ndims, int elem_size,
         const real_t *send_buf, const int *send_offsets, const int *nsend,
@@ -282,7 +351,9 @@ public:
         const real_t *send_buf, const int *contact_soffsets, const int *nsend_contact,
         real_t *recv_buf, int *contact_roffsets, int *nrecv_contact);
 
-    void fillCommunicationArrays(int neighbor_ranks[], int pbc[], real_t subdom[]);
+    void copyRuntimeArray(const std::string& name, void *dest, const int size);
+    int getNumberOfNeighborRanks() { return this->getDomainPartitioner()->getNumberOfNeighborRanks(); }
+    int getNumberOfNeighborAABBs() { return this->getDomainPartitioner()->getNumberOfNeighborAABBs(); }
 
     // Device functions
     void sync() { device_synchronize(); }
@@ -296,8 +367,38 @@ public:
     }
 };
 
+template<typename Domain_T>
+void PairsRuntime::useDomain(const std::shared_ptr<Domain_T> &domain_ptr){
+    // Installs a domain partitioner built from domain_ptr; prints an error and exits if one is already set.
+    if(dom_part){ 
+        PAIRS_ERROR("DomainPartitioner already exists!\n"); 
+        exit(-1);
+    }
+    // Neither regular partitioning mode supports this entry point; both print an error and exit.
+    if(dom_part_type == RegularPartitioning) {
+        PAIRS_ERROR("useDomain not implemented for Regular6DStencil!\n");
+        exit(-1);
+
+    } else if(dom_part_type == RegularXYPartitioning) {        
+        PAIRS_ERROR("useDomain not implemented for Regular6DStencil!\n");
+        exit(-1);
+
+    } 
+    // The BlockForest branch below is compiled only when USE_WALBERLA is defined.
+#ifdef USE_WALBERLA
+    else if(dom_part_type == BlockForestPartitioning) {
+        dom_part = new BlockForest(this, domain_ptr);
+    } 
+#endif
+    // Any other partitioning type (or BlockForestPartitioning without USE_WALBERLA) ends up here.
+    else {
+        PAIRS_ERROR("Domain partitioning type not implemented!\n");
+        exit(-1);
+    }
+}
+
 template<typename T_ptr>
-void PairsSimulation::addArray(array_t id, std::string name, T_ptr **h_ptr, std::nullptr_t, size_t size) {
+void PairsRuntime::addArray(array_t id, std::string name, T_ptr **h_ptr, std::nullptr_t, size_t size) {
     PAIRS_ASSERT(size > 0);
 
     *h_ptr = (T_ptr *) pairs::host_alloc(size);
@@ -306,7 +407,7 @@ void PairsSimulation::addArray(array_t id, std::string name, T_ptr **h_ptr, std:
 }
 
 template<typename T_ptr>
-void PairsSimulation::addArray(array_t id, std::string name, T_ptr **h_ptr, T_ptr **d_ptr, size_t size) {
+void PairsRuntime::addArray(array_t id, std::string name, T_ptr **h_ptr, T_ptr **d_ptr, size_t size) {
     PAIRS_ASSERT(size > 0);
 
     *h_ptr = (T_ptr *) pairs::host_alloc(size);
@@ -316,19 +417,19 @@ void PairsSimulation::addArray(array_t id, std::string name, T_ptr **h_ptr, T_pt
 }
 
 template<typename T_ptr>
-void PairsSimulation::addStaticArray(array_t id, std::string name, T_ptr *h_ptr, std::nullptr_t, size_t size) {
+void PairsRuntime::addStaticArray(array_t id, std::string name, T_ptr *h_ptr, std::nullptr_t, size_t size) {
     addArray(Array(id, name, h_ptr, nullptr, size, true));
 }
 
 template<typename T_ptr>
-void PairsSimulation::addStaticArray(array_t id, std::string name, T_ptr *h_ptr, T_ptr *d_ptr, size_t size) {
+void PairsRuntime::addStaticArray(array_t id, std::string name, T_ptr *h_ptr, T_ptr *d_ptr, size_t size) {
     addArray(Array(id, name, h_ptr, d_ptr, size, true));
 }
 
 template<typename T_ptr>
-void PairsSimulation::reallocArray(array_t id, T_ptr **h_ptr, std::nullptr_t, size_t size) {
+void PairsRuntime::reallocArray(array_t id, T_ptr **h_ptr, std::nullptr_t, size_t size) {
     // This should be a pointer (and not a reference) in order to be modified
-    auto a = std::find_if(arrays.begin(), arrays.end(), [id](Array a) { return a.getId() == id; });
+    auto a = std::find_if(arrays.begin(), arrays.end(), [id](Array _a) { return _a.getId() == id; });
     PAIRS_ASSERT(a != std::end(arrays));
     PAIRS_ASSERT(size > 0);
 
@@ -341,9 +442,9 @@ void PairsSimulation::reallocArray(array_t id, T_ptr **h_ptr, std::nullptr_t, si
 }
 
 template<typename T_ptr>
-void PairsSimulation::reallocArray(array_t id, T_ptr **h_ptr, T_ptr **d_ptr, size_t size) {
+void PairsRuntime::reallocArray(array_t id, T_ptr **h_ptr, T_ptr **d_ptr, size_t size) {
     // This should be a pointer (and not a reference) in order to be modified
-    auto a = std::find_if(arrays.begin(), arrays.end(), [id](Array a) { return a.getId() == id; });
+    auto a = std::find_if(arrays.begin(), arrays.end(), [id](Array _a) { return _a.getId() == id; });
     PAIRS_ASSERT(a != std::end(arrays));
     PAIRS_ASSERT(size > 0);
 
@@ -363,20 +464,22 @@ void PairsSimulation::reallocArray(array_t id, T_ptr **h_ptr, T_ptr **d_ptr, siz
 }
 
 template<typename T_ptr>
-void PairsSimulation::addProperty(
-    property_t id, std::string name, T_ptr **h_ptr, std::nullptr_t, PropertyType type, layout_t layout, size_t sx, size_t sy) {
+void PairsRuntime::addProperty(
+    property_t id, std::string name, T_ptr **h_ptr, std::nullptr_t,
+    PropertyType type, layout_t layout, int vol, size_t sx, size_t sy) {
 
     size_t size = sx * sy * sizeof(T_ptr);
     PAIRS_ASSERT(size > 0);
 
     *h_ptr = (T_ptr *) pairs::host_alloc(size);
     PAIRS_ASSERT(*h_ptr != nullptr);
-    addProperty(Property(id, name, *h_ptr, nullptr, type, layout, sx, sy));
+    addProperty(Property(id, name, *h_ptr, nullptr, type, layout, vol, sx, sy));
 }
 
 template<typename T_ptr>
-void PairsSimulation::addProperty(
-    property_t id, std::string name, T_ptr **h_ptr, T_ptr **d_ptr, PropertyType type, layout_t layout, size_t sx, size_t sy) {
+void PairsRuntime::addProperty(
+    property_t id, std::string name, T_ptr **h_ptr, T_ptr **d_ptr,
+    PropertyType type, layout_t layout, int vol, size_t sx, size_t sy) {
 
     size_t size = sx * sy * sizeof(T_ptr);
     PAIRS_ASSERT(size > 0);
@@ -384,11 +487,11 @@ void PairsSimulation::addProperty(
     *h_ptr = (T_ptr *) pairs::host_alloc(size);
     *d_ptr = (T_ptr *) pairs::device_alloc(size);
     PAIRS_ASSERT(*h_ptr != nullptr && *d_ptr != nullptr);
-    addProperty(Property(id, name, *h_ptr, *d_ptr, type, layout, sx, sy));
+    addProperty(Property(id, name, *h_ptr, *d_ptr, type, layout, vol, sx, sy));
 }
 
 template<typename T_ptr>
-void PairsSimulation::reallocProperty(property_t id, T_ptr **h_ptr, std::nullptr_t, size_t sx, size_t sy) {
+void PairsRuntime::reallocProperty(property_t id, T_ptr **h_ptr, std::nullptr_t, size_t sx, size_t sy) {
     // This should be a pointer (and not a reference) in order to be modified
     auto p = std::find_if(properties.begin(),
 		    	  properties.end(),
@@ -407,7 +510,7 @@ void PairsSimulation::reallocProperty(property_t id, T_ptr **h_ptr, std::nullptr
 }
 
 template<typename T_ptr>
-void PairsSimulation::reallocProperty(property_t id, T_ptr **h_ptr, T_ptr **d_ptr, size_t sx, size_t sy) {
+void PairsRuntime::reallocProperty(property_t id, T_ptr **h_ptr, T_ptr **d_ptr, size_t sx, size_t sy) {
     // This should be a pointer (and not a reference) in order to be modified
     auto p = std::find_if(properties.begin(),
 		    	  properties.end(),
@@ -433,7 +536,7 @@ void PairsSimulation::reallocProperty(property_t id, T_ptr **h_ptr, T_ptr **d_pt
 }
 
 template<typename T_ptr>
-void PairsSimulation::addContactProperty(
+void PairsRuntime::addContactProperty(
     property_t id, std::string name, T_ptr **h_ptr, std::nullptr_t, PropertyType type, layout_t layout, size_t sx, size_t sy) {
 
     size_t size = sx * sy * sizeof(T_ptr);
@@ -445,7 +548,7 @@ void PairsSimulation::addContactProperty(
 }
 
 template<typename T_ptr>
-void PairsSimulation::addContactProperty(
+void PairsRuntime::addContactProperty(
     property_t id, std::string name, T_ptr **h_ptr, T_ptr **d_ptr, PropertyType type, layout_t layout, size_t sx, size_t sy) {
 
     size_t size = sx * sy * sizeof(T_ptr);
@@ -458,7 +561,7 @@ void PairsSimulation::addContactProperty(
 }
 
 template<typename T_ptr>
-void PairsSimulation::reallocContactProperty(property_t id, T_ptr **h_ptr, std::nullptr_t, size_t sx, size_t sy) {
+void PairsRuntime::reallocContactProperty(property_t id, T_ptr **h_ptr, std::nullptr_t, size_t sx, size_t sy) {
     // This should be a pointer (and not a reference) in order to be modified
     auto cp = std::find_if(contact_properties.begin(),
 		    	   contact_properties.end(),
@@ -477,7 +580,7 @@ void PairsSimulation::reallocContactProperty(property_t id, T_ptr **h_ptr, std::
 }
 
 template<typename T_ptr>
-void PairsSimulation::reallocContactProperty(property_t id, T_ptr **h_ptr, T_ptr **d_ptr, size_t sx, size_t sy) {
+void PairsRuntime::reallocContactProperty(property_t id, T_ptr **h_ptr, T_ptr **d_ptr, size_t sx, size_t sy) {
     // This should be a pointer (and not a reference) in order to be modified
     auto cp = std::find_if(contact_properties.begin(),
 		    	   contact_properties.end(),
@@ -503,14 +606,14 @@ void PairsSimulation::reallocContactProperty(property_t id, T_ptr **h_ptr, T_ptr
 }
 
 template<typename T_ptr>
-void PairsSimulation::addFeatureProperty(property_t id, std::string name, T_ptr *h_ptr, std::nullptr_t, PropertyType type, int nkinds, int array_size) {
+void PairsRuntime::addFeatureProperty(property_t id, std::string name, T_ptr *h_ptr, std::nullptr_t, PropertyType type, int nkinds, int array_size) {
     PAIRS_ASSERT(nkinds > 0 && array_size > 0);
     PAIRS_ASSERT(h_ptr != nullptr);
     addFeatureProperty(FeatureProperty(id, name, h_ptr, nullptr, type, nkinds, array_size));
 }
 
 template<typename T_ptr>
-void PairsSimulation::addFeatureProperty(property_t id, std::string name, T_ptr *h_ptr, T_ptr *d_ptr, PropertyType type, int nkinds, int array_size) {
+void PairsRuntime::addFeatureProperty(property_t id, std::string name, T_ptr *h_ptr, T_ptr *d_ptr, PropertyType type, int nkinds, int array_size) {
     PAIRS_ASSERT(nkinds > 0 && array_size > 0);
     PAIRS_ASSERT(h_ptr != nullptr && d_ptr != nullptr);
     addFeatureProperty(FeatureProperty(id, name, h_ptr, d_ptr, type, nkinds, array_size));
diff --git a/runtime/pairs_common.hpp b/runtime/pairs_common.hpp
index c3cf60da1156d9ea6dc5c7e0e1c3bd581a62c8c5..74237423ee5d3de07462bcc40739484ef4fd9781 100644
--- a/runtime/pairs_common.hpp
+++ b/runtime/pairs_common.hpp
@@ -3,12 +3,39 @@
 
 #pragma once
 
+namespace pairs {
+
+#ifdef PAIRS_TARGET_CUDA   // with PAIRS_TARGET_CUDA the macros expand to __host__/__device__; otherwise they expand to nothing
+    #define PAIRS_ATTR_HOST __host__ 
+    #define PAIRS_ATTR_DEVICE __device__ 
+    #define PAIRS_ATTR_HOST_DEVICE __host__ __device__
+#else
+    #define PAIRS_ATTR_HOST
+    #define PAIRS_ATTR_DEVICE
+    #define PAIRS_ATTR_HOST_DEVICE
+#endif
+
+namespace flags{   // single-bit integer masks (bits 0 to 3), one per flag
+    constexpr int INFINITE = 1 << 0 ;
+    constexpr int GHOST    = 1 << 1 ;
+    constexpr int FIXED    = 1 << 2 ;
+    constexpr int GLOBAL   = 1 << 3 ;
+}
+
+namespace shapes{   // integer identifiers for the shape kinds
+    enum Type {
+        Sphere = 0,
+        Halfspace = 1,
+        PointMass = 2
+    };
+}
 //#ifdef USE_DOUBLE_PRECISION
 typedef double real_t;
 //#else
 //typedef float real_t;
 //#endif
 
+typedef uint64_t id_t;
 typedef int array_t;
 typedef int property_t;
 typedef int layout_t;
@@ -17,12 +44,25 @@ typedef int action_t;
 enum PropertyType {
     Prop_Invalid = -1,
     Prop_Integer = 0,
+    Prop_UInt64,   // takes the next implicit value after Prop_Integer, so each enumerator below is one higher than before this insertion
     Prop_Real,
     Prop_Vector,
     Prop_Matrix,
     Prop_Quaternion
 };
 
+constexpr size_t get_proptype_size(PropertyType type){   // sizeof-based size of one whole value of the given type; 0 for unlisted types such as Prop_Invalid
+    switch (type) {
+        case pairs::Prop_Integer:       return sizeof(int);
+        case pairs::Prop_UInt64:        return sizeof(uint64_t);
+        case pairs::Prop_Real:          return sizeof(real_t);
+        case pairs::Prop_Vector:        return 3*sizeof(real_t);
+        case pairs::Prop_Matrix:        return 9*sizeof(real_t);
+        case pairs::Prop_Quaternion:    return 4*sizeof(real_t);
+        default:             return 0;
+    }
+}
+
 enum DataLayout {
     Invalid = -1,
     AoS = 0,
@@ -38,7 +78,7 @@ enum Actions {
     Ignore = 5
 };
 
-enum Timers {
+enum TimerMarkers {
     All = 0,
     Communication = 1,
     DeviceTransfers = 2,
@@ -46,11 +86,28 @@ enum Timers {
 };
 
 enum DomainPartitioners {
-    Regular = 0,
-    RegularXY = 1,
-    BoxList = 2,
+    RegularPartitioning = 0,   // same numeric value as the former Regular
+    RegularXYPartitioning = 1,   // same numeric value as the former RegularXY
+    BlockForestPartitioning = 2   // reuses the numeric value 2 that BoxList had
+};
+
+enum LoadBalancingAlgorithms {   // load-balancing algorithm selectors with explicit values 0 to 3
+    Morton = 0,
+    Hilbert = 1,
+    Metis = 2,
+    Diffusive = 3
 };
 
+constexpr const char* getAlgorithmName(LoadBalancingAlgorithms alg) {   // name string for alg; "Invalid" for any value not listed below
+    switch (alg) {
+        case Morton:    return "Morton";
+        case Hilbert:   return "Hilbert";
+        case Metis:     return "Metis";
+        case Diffusive: return "Diffusive";
+        default:        return "Invalid";
+    }
+}
+
 #ifdef DEBUG
 #   include <assert.h>
 #   define PAIRS_DEBUG(...)     {                                                   \
@@ -78,3 +135,5 @@ enum DomainPartitioners {
 #define PAIRS_ERROR(...)        fprintf(stderr, __VA_ARGS__)
 #define MIN(a,b)                ((a) < (b) ? (a) : (b))
 #define MAX(a,b)                ((a) > (b) ? (a) : (b))
+
+}
\ No newline at end of file
diff --git a/runtime/property.hpp b/runtime/property.hpp
index 741594d1745edf923f63bf8eae422aa11218f41a..fd2c5e41c583892318e1d27be91d671abe103793 100644
--- a/runtime/property.hpp
+++ b/runtime/property.hpp
@@ -11,32 +11,39 @@ protected:
     void *h_ptr, *d_ptr;
     PropertyType type;
     layout_t layout;
+    int vol;
     size_t sx, sy;
 
 public:
-    Property(property_t id_, std::string name_, void *h_ptr_, void *d_ptr_, PropertyType type_, layout_t layout_, size_t sx_, size_t sy_=1) :
+    Property(
+        property_t id_, std::string name_, void *h_ptr_, void *d_ptr_,
+        PropertyType type_, layout_t layout_, int vol_, size_t sx_, size_t sy_=1) :
+
         id(id_),
         name(name_),
         h_ptr(h_ptr_),
         d_ptr(d_ptr_),
         type(type_),
         layout(layout_),
+        vol(vol_),
         sx(sx_), sy(sy_) {
 
         PAIRS_ASSERT(type != Prop_Invalid && layout_ != Invalid && sx_ > 0 && sy_ > 0);
     }
 
-    property_t getId() { return id; }
-    std::string getName() { return name; }
-    void *getHostPointer() { return h_ptr; }
-    void *getDevicePointer() { return d_ptr; }
+    property_t getId() const { return id; }
+    std::string getName() const { return name; }
+    void *getHostPointer() const { return h_ptr; }
+    void *getDevicePointer() const { return d_ptr; }
     void setPointers(void *h_ptr_, void *d_ptr_) { h_ptr = h_ptr_, d_ptr = d_ptr_; }
     void setSizes(size_t sx_, size_t sy_) { sx = sx_, sy = sy_; }
-    size_t getTotalSize() { return sx * sy * getPrimitiveTypeSize(); };
-    PropertyType getType() { return type; }
-    layout_t getLayout() { return layout; }
-    size_t getPrimitiveTypeSize() {
+    size_t getTotalSize() const { return sx * sy * getPrimitiveTypeSize(); };
+    PropertyType getType() const { return type; }
+    layout_t getLayout() const { return layout; }
+    int isVolatile() const { return vol != 0; }
+    size_t getPrimitiveTypeSize() const {
         return  (type == Prop_Integer) ? sizeof(int) :
+                (type == Prop_UInt64) ? sizeof(uint64_t) :
                 (type == Prop_Real) ? sizeof(real_t) :
                 (type == Prop_Vector) ? sizeof(real_t) :
                 (type == Prop_Matrix) ? sizeof(real_t) :
@@ -49,6 +56,11 @@ public:
     inline int &operator()(int i) { return static_cast<int *>(h_ptr)[i]; }
 };
 
+class UInt64Property : public Property {   // typed view over Property: element access as uint64_t
+public:
+    inline uint64_t &operator()(int i) { return static_cast<uint64_t *>(h_ptr)[i]; }   // i-th element of the host buffer; i is not range-checked
+};
+
 class FloatProperty : public Property {
 public:
     inline real_t &operator()(int i) { return static_cast<real_t *>(h_ptr)[i]; }
diff --git a/runtime/read_from_file.cpp b/runtime/read_from_file.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..25dcc97c2545474c1ea974824db665fc7e4af414
--- /dev/null
+++ b/runtime/read_from_file.cpp
@@ -0,0 +1,176 @@
+#include "read_from_file.hpp"
+
+
+namespace pairs {
+
+void read_grid_data(PairsRuntime *ps, const char *filename, real_t *grid_buffer) {
+    // Parses the first line of `filename` as comma-separated numbers into grid_buffer[0..].
+    std::ifstream grid_file(filename, std::ifstream::in);
+    if(!grid_file.is_open()) {
+        std::cerr << "Error: Could not open file \"" << filename << "\"" << std::endl;
+        exit(-1);
+    }
+
+    std::string first_line;
+    std::getline(grid_file, first_line);
+    std::stringstream fields(first_line);
+
+    // NOTE(review): the write index is unbounded (the assert below is disabled), so the
+    // caller has to provide room for every field on the line.
+    //PAIRS_ASSERT(i < ndims * 2);
+    std::string field;
+    for(int slot = 0; std::getline(fields, field, ','); slot++) {
+        grid_buffer[slot] = std::stod(field);
+    }
+
+    grid_file.close();
+}
+
+size_t read_particle_data(
+    PairsRuntime *ps, const char *filename, const property_t properties[],
+    size_t nprops, int shape_id, int start) {
+    // Loads one particle per CSV line into slots start, start + 1, ...; returns the slot after the last kept one.
+    std::ifstream in_file(filename, std::ifstream::in);
+    std::string line;
+    auto shape_ptr = ps->getAsIntegerProperty(ps->getPropertyByName("shape"));
+    auto uid_ptr = ps->getAsUInt64Property(ps->getPropertyByName("uid"));
+    int n = start;
+
+    if(!in_file.is_open()) {
+        std::cerr << "Error: Could not open file \"" << filename << "\"" << std::endl;
+        exit(-1);
+    }
+
+    while(std::getline(in_file, line)) {
+        std::stringstream line_stream(line);
+        std::string in0;
+        int within_domain = 1;
+        int i = 0;
+        int flags = 0;
+        // Stop after nprops properties: extra columns would index past the end of properties[].
+        while(static_cast<size_t>(i) < nprops && std::getline(line_stream, in0, ',')) {
+            property_t p_id = properties[i];
+            auto prop = ps->getProperty(p_id);
+            auto prop_type = prop.getType();
+
+            if(prop_type == Prop_Vector) {
+                auto vector_ptr = ps->getAsVectorProperty(prop);
+                std::string in1, in2;
+                std::getline(line_stream, in1, ',');
+                std::getline(line_stream, in2, ',');
+                real_t x = std::stod(in0);
+                real_t y = std::stod(in1);
+                real_t z = std::stod(in2);
+                vector_ptr(n, 0) = x;
+                vector_ptr(n, 1) = y;
+                vector_ptr(n, 2) = z;
+
+                if(prop.getName() == "position") {
+                    within_domain = ps->getDomainPartitioner()->isWithinSubdomain(x, y, z);
+                }
+            } else if(prop_type == Prop_Matrix) {
+                auto matrix_ptr = ps->getAsMatrixProperty(prop);
+                constexpr int nelems = 9;
+                std::string in_buf;
+
+                matrix_ptr(n, 0) = std::stod(in0);
+                for(int e = 1; e < nelems; e++) {
+                    std::getline(line_stream, in_buf, ',');
+                    matrix_ptr(n, e) = std::stod(in_buf);
+                }
+            } else if(prop_type == Prop_Quaternion) {
+                auto quat_ptr = ps->getAsQuaternionProperty(prop);
+                constexpr int nelems = 4;
+                std::string in_buf;
+
+                quat_ptr(n, 0) = std::stod(in0);
+                for(int e = 1; e < nelems; e++) {
+                    std::getline(line_stream, in_buf, ',');
+                    quat_ptr(n, e) = std::stod(in_buf);
+                }
+            } else if(prop_type == Prop_Integer) {
+                auto int_ptr = ps->getAsIntegerProperty(prop);
+                int_ptr(n) = std::stoi(in0);
+
+                if(prop.getName() == "flags") {
+                    flags = int_ptr(n);
+                }
+            } else if(prop_type == Prop_UInt64) {
+                auto uint64_ptr = ps->getAsUInt64Property(prop);
+                uint64_ptr(n) = std::stoull(in0);  // stoull: the value may exceed the int range
+                // uid is always generated below (UniqueID), so a uid column is rejected.
+                if(prop.getName() == "uid") {
+                    std::cerr << "Can't read uid from file." << std::endl;
+                    exit(-1);
+                }
+            } else if(prop_type == Prop_Real) {
+                auto float_ptr = ps->getAsFloatProperty(prop);
+                float_ptr(n) = std::stod(in0);
+            } else {
+                std::cerr << "read_particle_data(): Invalid property type!" << std::endl;
+                return 0;
+            }
+
+            i++;
+        }
+        // Keep the row only if inside the subdomain or flagged INFINITE/FIXED/GLOBAL; else slot n is reused.
+        if(within_domain || flags & (flags::INFINITE | flags::FIXED | flags::GLOBAL)) {
+            uid_ptr(n) = (flags & flags::GLOBAL) ? UniqueID::createGlobal(ps) : UniqueID::create(ps);
+            shape_ptr(n++) = shape_id;
+        }
+    }
+
+    return n;
+}
+
+/*
+size_t read_feature_data(PairsRuntime *ps, const char *filename, const int feature_id, const property_t properties[], size_t nprops) {
+    std::ifstream in_file(filename, std::ifstream::in);
+    std::string line;
+
+    if(in_file.is_open()) {
+        while(std::getline(in_file, line)) {
+            std::stringstream line_stream(line);
+            std::string istr, jstr, in0;
+            std::getline(line_stream, istr, ',');
+            std::getline(line_stream, jstr, ',');
+            int i = std::stoi(istr);
+            int j = std::stoi(jstr);
+
+            while(std::getline(line_stream, in0, ',')) {
+                property_t p_id = properties[i];
+                auto prop = ps->getProperty(p_id);
+                auto prop_type = prop.getType();
+
+                if(prop_type == Prop_Vector) {
+                    auto vector_ptr = ps->getAsVectorFeatureProperty(prop);
+                    std::string in1, in2;
+                    std::getline(line_stream, in1, ',');
+                    std::getline(line_stream, in2, ',');
+                    real_t x = std::stod(in0);
+                    real_t y = std::stod(in1);
+                    real_t z = std::stod(in2);
+                    vector_ptr(i, j, 0) = x;
+                    vector_ptr(i, j, 1) = y;
+                    vector_ptr(i, j, 2) = z;
+                } else if(prop_type == Prop_Integer) {
+                    auto int_ptr = ps->getAsIntegerFeatureProperty(prop);
+                    int_ptr(i, j) = std::stoi(in0);
+                } else if(prop_type == Prop_Real) {
+                    auto float_ptr = ps->getAsFloatFeatureProperty(prop);
+                    float_ptr(i, j) = std::stod(in0);
+                } else {
+                    std::cerr << "read_feature_data(): Invalid property type!" << std::endl;
+                    return 0;
+                }
+            }
+        }
+
+        in_file.close();
+    }
+
+    return n;
+}
+*/
+
+}
diff --git a/runtime/read_from_file.hpp b/runtime/read_from_file.hpp
index 0173b46778809d13019cfbb42da2a579d2a8c212..abe1acf3edf4133ce2d17bd7c7ea82144279bc48 100644
--- a/runtime/read_from_file.hpp
+++ b/runtime/read_from_file.hpp
@@ -5,163 +5,16 @@
 //---
 #include "pairs.hpp"
 #include "pairs_common.hpp"
+#include "unique_id.hpp"
 
 #pragma once
 
 namespace pairs {
 
-void read_grid_data(PairsSimulation *ps, const char *filename, real_t *grid_buffer) {
-    std::ifstream in_file(filename, std::ifstream::in);
-    std::string line;
+void read_grid_data(PairsRuntime *ps, const char *filename, real_t *grid_buffer);  // Fills grid_buffer from the first comma-separated line of `filename`; exits if the file cannot be opened.
 
-    if(in_file.is_open()) {
-        std::getline(in_file, line);
-        std::stringstream line_stream(line);
-        std::string in0;
-        int i = 0;
-
-        while(std::getline(line_stream, in0, ',')) {
-            //PAIRS_ASSERT(i < ndims * 2);
-            grid_buffer[i] = std::stod(in0);
-            i++;
-        }
-
-        in_file.close();
-    }
-}
-
-size_t read_particle_data(PairsSimulation *ps, const char *filename, const property_t properties[], size_t nprops, int shape_id, int start) {
-    std::ifstream in_file(filename, std::ifstream::in);
-    std::string line;
-    auto shape_ptr = ps->getAsIntegerProperty(ps->getPropertyByName("shape"));
-    size_t n = start;
-
-    if(in_file.is_open()) {
-        //std::getline(in_file, line);
-        while(std::getline(in_file, line)) {
-            std::stringstream line_stream(line);
-            std::string in0;
-            int within_domain = 1;
-            int i = 0;
-            int flags = 0;
-
-            while(std::getline(line_stream, in0, ',')) {
-                property_t p_id = properties[i];
-                auto prop = ps->getProperty(p_id);
-                auto prop_type = prop.getType();
-
-                if(prop_type == Prop_Vector) {
-                    auto vector_ptr = ps->getAsVectorProperty(prop);
-                    std::string in1, in2;
-                    std::getline(line_stream, in1, ',');
-                    std::getline(line_stream, in2, ',');
-                    real_t x = std::stod(in0);
-                    real_t y = std::stod(in1);
-                    real_t z = std::stod(in2);
-                    vector_ptr(n, 0) = x;
-                    vector_ptr(n, 1) = y;
-                    vector_ptr(n, 2) = z;
-
-                    if(prop.getName() == "position") {
-                        within_domain = ps->getDomainPartitioner()->isWithinSubdomain(x, y, z);
-                    }
-                } else if(prop_type == Prop_Matrix) {
-                    auto matrix_ptr = ps->getAsMatrixProperty(prop);
-                    constexpr int nelems = 9;
-                    std::string in_buf;
-
-                    matrix_ptr(n, 0) = std::stod(in0);
-                    for(int i = 1; i < nelems; i++) {
-                        std::getline(line_stream, in_buf, ',');
-                        matrix_ptr(n, i) = std::stod(in_buf);
-                    }
-                } else if(prop_type == Prop_Quaternion) {
-                    auto quat_ptr = ps->getAsQuaternionProperty(prop);
-                    constexpr int nelems = 4;
-                    std::string in_buf;
-
-                    quat_ptr(n, 0) = std::stod(in0);
-                    for(int i = 1; i < nelems; i++) {
-                        std::getline(line_stream, in_buf, ',');
-                        quat_ptr(n, i) = std::stod(in_buf);
-                    }
-                } else if(prop_type == Prop_Integer) {
-                    auto int_ptr = ps->getAsIntegerProperty(prop);
-                    int_ptr(n) = std::stoi(in0);
-
-                    if(prop.getName() == "flags") {
-                        flags = int_ptr(n);
-                    }
-                } else if(prop_type == Prop_Real) {
-                    auto float_ptr = ps->getAsFloatProperty(prop);
-                    float_ptr(n) = std::stod(in0);
-                } else {
-                    std::cerr << "read_particle_data(): Invalid property type!" << std::endl;
-                    return 0;
-                }
-
-                i++;
-            }
-
-            if(within_domain || flags & (FLAGS_INFINITE | FLAGS_FIXED | FLAGS_GLOBAL)) {
-                shape_ptr(n++) = shape_id;
-            }
-        }
-
-        in_file.close();
-    }
-
-    return n;
-}
-
-/*
-size_t read_feature_data(PairsSimulation *ps, const char *filename, const int feature_id, const property_t properties[], size_t nprops) {
-    std::ifstream in_file(filename, std::ifstream::in);
-    std::string line;
-
-    if(in_file.is_open()) {
-        while(std::getline(in_file, line)) {
-            std::stringstream line_stream(line);
-            std::string istr, jstr, in0;
-            std::getline(line_stream, istr, ',');
-            std::getline(line_stream, jstr, ',');
-            int i = std::stoi(istr);
-            int j = std::stoi(jstr);
-
-            while(std::getline(line_stream, in0, ',')) {
-                property_t p_id = properties[i];
-                auto prop = ps->getProperty(p_id);
-                auto prop_type = prop.getType();
-
-                if(prop_type == Prop_Vector) {
-                    auto vector_ptr = ps->getAsVectorFeatureProperty(prop);
-                    std::string in1, in2;
-                    std::getline(line_stream, in1, ',');
-                    std::getline(line_stream, in2, ',');
-                    real_t x = std::stod(in0);
-                    real_t y = std::stod(in1);
-                    real_t z = std::stod(in2);
-                    vector_ptr(i, j, 0) = x;
-                    vector_ptr(i, j, 1) = y;
-                    vector_ptr(i, j, 2) = z;
-                } else if(prop_type == Prop_Integer) {
-                    auto int_ptr = ps->getAsIntegerFeatureProperty(prop);
-                    int_ptr(i, j) = std::stoi(in0);
-                } else if(prop_type == Prop_Real) {
-                    auto float_ptr = ps->getAsFloatFeatureProperty(prop);
-                    float_ptr(i, j) = std::stod(in0);
-                } else {
-                    std::cerr << "read_feature_data(): Invalid property type!" << std::endl;
-                    return 0;
-                }
-            }
-        }
-
-        in_file.close();
-    }
-
-    return n;
-}
-*/
+size_t read_particle_data(  // Reads one particle per line of `filename`; exits if the file cannot be opened.
+    PairsRuntime *ps, const char *filename, const property_t properties[],  // properties: property of each column group, in file order
+    size_t nprops, int shape_id, int start);  // Returns the slot after the last stored particle (0 on an invalid property type).
 
 }
diff --git a/runtime/runtime_var.hpp b/runtime/runtime_var.hpp
index 7cf3aeaa9b6c32883299bfcb44044db457f4af48..7599b9771fab009be05df1a13ea6764e2c0d78ba 100644
--- a/runtime/runtime_var.hpp
+++ b/runtime/runtime_var.hpp
@@ -5,11 +5,12 @@
 namespace pairs {
 
 template<typename T>
-class RuntimeVar{
+class RuntimeVar {
 protected:
     T *h_ptr, *d_ptr;
 
 public:
+    RuntimeVar() = default;
     RuntimeVar(T *ptr) {
         h_ptr = ptr;
         d_ptr = (T *) pairs::device_alloc(sizeof(T));
diff --git a/runtime/stats.cpp b/runtime/stats.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d66c34162937c36d2827be8d1bc8c3852461013c
--- /dev/null
+++ b/runtime/stats.cpp
@@ -0,0 +1,29 @@
+#include "pairs.hpp"
+
+namespace pairs {
+
+void print_stats(PairsRuntime *ps, int nlocal, int nghost) {
+    // Reports the smallest and largest per-rank particle counts ("min / max") on rank 0.
+    int local_lo = nlocal, local_hi = nlocal;
+    int ghost_lo = nghost, ghost_hi = nghost;
+
+    // With a single rank the local values already are the extrema, so no reduction is needed.
+    const bool multi_rank = ps->getDomainPartitioner()->getWorldSize() > 1;
+    if(multi_rank) {
+        // Each reduction writes straight into its result; the inputs stay untouched.
+        MPI_Allreduce(&nlocal, &local_lo, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
+        MPI_Allreduce(&nlocal, &local_hi, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);
+        MPI_Allreduce(&nghost, &ghost_lo, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
+        MPI_Allreduce(&nghost, &ghost_hi, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);
+    }
+
+    // Only rank 0 prints.
+    if(ps->getDomainPartitioner()->getRank() != 0) {
+        return;
+    }
+
+    std::cout << "Number of local particles: " << local_lo << " / " << local_hi << std::endl;
+    std::cout << "Number of ghost particles: " << ghost_lo << " / " << ghost_hi << std::endl;
+}
+
+}
diff --git a/runtime/stats.hpp b/runtime/stats.hpp
index 413ffab8425bc31d877e263d29fdfa4f9f343405..e6c51c306f83b347534963aac7c24f7a42f1b58d 100644
--- a/runtime/stats.hpp
+++ b/runtime/stats.hpp
@@ -2,32 +2,8 @@
 
 #pragma once
 
-using namespace std;
-
 namespace pairs {
 
-void print_stats(PairsSimulation *ps, int nlocal, int nghost) {
-    int min_nlocal = nlocal;
-    int max_nlocal = nlocal;
-    int min_nghost = nghost;
-    int max_nghost = nghost;
-    int nglobal;
-
-    if(ps->getDomainPartitioner()->getWorldSize() > 1) {
-        MPI_Allreduce(&nlocal, &nglobal, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
-        min_nlocal = nglobal;
-        MPI_Allreduce(&nlocal, &nglobal, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);
-        max_nlocal = nglobal;
-        MPI_Allreduce(&nghost, &nglobal, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
-        min_nghost = nglobal;
-        MPI_Allreduce(&nghost, &nglobal, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);
-        max_nghost = nglobal;
-    }
-
-    if(ps->getDomainPartitioner()->getRank() == 0) {
-        std::cout << "Number of local particles: " << min_nlocal << " / " << max_nlocal << std::endl;
-        std::cout << "Number of ghost particles: " << min_nghost << " / " << max_nghost << std::endl;
-    }
-}
+void print_stats(PairsRuntime *ps, int nlocal, int nghost);  // Prints min / max of nlocal and nghost over all ranks (rank 0 only).
 
 }
diff --git a/runtime/thermo.cpp b/runtime/thermo.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..044f6d3639f8db286099a19b112b67b065f2cfa0
--- /dev/null
+++ b/runtime/thermo.cpp
@@ -0,0 +1,101 @@
+#include <iostream>
+#include <math.h>
+#include <mpi.h>
+//---
+#include "pairs.hpp"
+
+namespace pairs {
+
+double compute_thermo(
+    PairsRuntime *ps, int nlocal, double xprd, double yprd, double zprd, int print) {
+    // Returns sum(mass * |v|^2) over all ranks' local particles, divided by (3 * natoms - 3).
+    auto masses = ps->getAsFloatProperty(ps->getPropertyByName("mass"));
+    auto velocities = ps->getAsVectorProperty(ps->getPropertyByName("linear_velocity"));
+    const bool multi_rank = ps->getDomainPartitioner()->getWorldSize() > 1;
+
+    int natoms = nlocal;
+    if(multi_rank) {
+        MPI_Allreduce(&nlocal, &natoms, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+    }
+
+    // NOTE(review): dof_boltz is 0 for a single particle, which makes t_scale infinite.
+    const double mvv2e = 1.0;
+    const double dof_boltz = (natoms * 3 - 3);
+    const double t_scale = mvv2e / dof_boltz;
+    const double p_scale = 1.0 / 3 / xprd / yprd / zprd;
+    //const double e_scale = 0.5;
+
+    ps->copyPropertyToHost(masses, ReadOnly);
+    ps->copyPropertyToHost(velocities, ReadOnly);
+
+    double local_sum = 0.0;
+    for(int idx = 0; idx < nlocal; idx++) {
+        const auto vx = velocities(idx, 0);
+        const auto vy = velocities(idx, 1);
+        const auto vz = velocities(idx, 2);
+        local_sum += masses(idx) * (vx * vx + vy * vy + vz * vz);
+    }
+
+    double t = local_sum;
+    if(multi_rank) {
+        MPI_Allreduce(&local_sum, &t, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    }
+
+    t *= t_scale;
+    if(print == 1 && ps->getDomainPartitioner()->getRank() == 0) {
+        const double p = (t * dof_boltz) * p_scale;
+        std::cout << t << "\t" << p << std::endl;
+    }
+
+    return t;
+}
+
+void adjust_thermo(
+    PairsRuntime *ps, int nlocal, double xprd, double yprd, double zprd, double temp) {
+    // Subtracts the global mean velocity, then scales local velocities by sqrt(temp / t).
+    constexpr int ndims = 3;
+    auto velocities = ps->getAsVectorProperty(ps->getPropertyByName("linear_velocity"));
+    double vmean[ndims] = {0.0, 0.0, 0.0};
+    int natoms = nlocal;
+
+    // Per-component sums over the local particles.
+    for(int idx = 0; idx < nlocal; idx++) {
+        for(int d = 0; d < ndims; d++) {
+            vmean[d] += velocities(idx, d);
+        }
+    }
+
+    if(ps->getDomainPartitioner()->getWorldSize() > 1) {
+        // Replace the local count and sums with their totals over all ranks.
+        MPI_Allreduce(&nlocal, &natoms, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+        for(int d = 0; d < ndims; d++) {
+            double global_sum;
+            MPI_Allreduce(&vmean[d], &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+            vmean[d] = global_sum;
+        }
+    }
+
+    // Sums become means once divided by the particle count.
+    for(int d = 0; d < ndims; d++) {
+        vmean[d] /= natoms;
+    }
+
+    // Remove the mean from every local particle.
+    for(int idx = 0; idx < nlocal; idx++) {
+        for(int d = 0; d < ndims; d++) {
+            velocities(idx, d) -= vmean[d];
+        }
+    }
+
+    // print = 0: compute_thermo only returns the value here.
+    const double t = pairs::compute_thermo(ps, nlocal, xprd, yprd, zprd, 0);
+    const double factor = sqrt(temp / t);
+
+    for(int idx = 0; idx < nlocal; idx++) {
+        for(int d = 0; d < ndims; d++) {
+            velocities(idx, d) *= factor;
+        }
+    }
+}
+
+}
diff --git a/runtime/thermo.hpp b/runtime/thermo.hpp
index b09693ab9ca47ea71c650205f68f30bbb44bcd2b..6902b007603478ec2f4ca23136c58a88b025eef1 100644
--- a/runtime/thermo.hpp
+++ b/runtime/thermo.hpp
@@ -1,99 +1,13 @@
-#include <iostream>
-#include <math.h>
-#include <mpi.h>
-//---
 #include "pairs.hpp"
 
 #pragma once
 
 namespace pairs {
 
-double compute_thermo(PairsSimulation *ps, int nlocal, double xprd, double yprd, double zprd, int print) {
-    auto masses = ps->getAsFloatProperty(ps->getPropertyByName("mass"));
-    auto velocities = ps->getAsVectorProperty(ps->getPropertyByName("linear_velocity"));
-    int natoms = nlocal;
+double compute_thermo(  // Returns sum(mass * |v|^2) over all ranks divided by (3 * natoms - 3).
+    PairsRuntime *ps, int nlocal, double xprd, double yprd, double zprd, int print);  // print == 1: rank 0 also prints the result and a derived pressure value.
 
-    if(ps->getDomainPartitioner()->getWorldSize() > 1) {
-        int global_natoms;
-        MPI_Allreduce(&natoms, &global_natoms, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
-        natoms = global_natoms;
-    }
-
-    const double mvv2e = 1.0;
-    const double dof_boltz = (natoms * 3 - 3);
-    const double t_scale = mvv2e / dof_boltz;
-    const double p_scale = 1.0 / 3 / xprd / yprd / zprd;
-    //const double e_scale = 0.5;
-    double t = 0.0, p;
-
-    ps->copyPropertyToHost(masses, ReadOnly);
-    ps->copyPropertyToHost(velocities, ReadOnly);
-
-    for(int i = 0; i < nlocal; i++) {
-        t += masses(i) * (  velocities(i, 0) * velocities(i, 0) +
-                            velocities(i, 1) * velocities(i, 1) +
-                            velocities(i, 2) * velocities(i, 2)   );
-    }
-
-    if(ps->getDomainPartitioner()->getWorldSize() > 1) {
-        double global_t;
-        MPI_Allreduce(&t, &global_t, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-        t = global_t;
-    }
-
-    t = t * t_scale;
-    if(print == 1 && ps->getDomainPartitioner()->getRank() == 0) {
-        p = (t * dof_boltz) * p_scale;
-        std::cout << t << "\t" << p << std::endl;
-    }
-
-    return t;
-}
-
-void adjust_thermo(PairsSimulation *ps, int nlocal, double xprd, double yprd, double zprd, double temp) {
-    auto velocities = ps->getAsVectorProperty(ps->getPropertyByName("linear_velocity"));
-    double vxtot = 0.0;
-    double vytot = 0.0;
-    double vztot = 0.0;
-    double tmp;
-    int natoms = nlocal;
-
-    for(int i = 0; i < nlocal; i++) {
-        vxtot += velocities(i, 0);
-        vytot += velocities(i, 1);
-        vztot += velocities(i, 2);
-    }
-
-    if(ps->getDomainPartitioner()->getWorldSize() > 1) {
-        int global_natoms;
-        MPI_Allreduce(&natoms, &global_natoms, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
-        natoms = global_natoms;
-        MPI_Allreduce(&vxtot, &tmp, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-        vxtot = tmp / natoms;
-        MPI_Allreduce(&vytot, &tmp, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-        vytot = tmp / natoms;
-        MPI_Allreduce(&vztot, &tmp, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-        vztot = tmp / natoms;
-    } else {
-        vxtot /= natoms;
-        vytot /= natoms;
-        vztot /= natoms;
-    }
-
-    for(int i = 0; i < nlocal; i++) {
-        velocities(i, 0) -= vxtot;
-        velocities(i, 1) -= vytot;
-        velocities(i, 2) -= vztot;
-    }
-
-    double t = pairs::compute_thermo(ps, nlocal, xprd, yprd, zprd, 0);
-    double factor = sqrt(temp / t);
-
-    for(int i = 0; i < nlocal; i++) {
-        velocities(i, 0) *= factor;
-        velocities(i, 1) *= factor;
-        velocities(i, 2) *= factor;
-    }
-}
+void adjust_thermo(  // Removes the mean velocity, then scales velocities by sqrt(temp / compute_thermo(...)).
+    PairsRuntime *ps, int nlocal, double xprd, double yprd, double zprd, double temp);  // nlocal: number of local particles to adjust
 
 }
diff --git a/runtime/timers.hpp b/runtime/timers.hpp
index 02058924ada457213214511a4099a05cf87695ac..c4cdc943aa5faeed57b4971684277e0844d3e7da 100644
--- a/runtime/timers.hpp
+++ b/runtime/timers.hpp
@@ -3,6 +3,8 @@
 #include <iostream>
 #include <unordered_map>
 
+#pragma once
+
 using namespace std;
 
 namespace pairs {
diff --git a/runtime/timing.cpp b/runtime/timing.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0068d8117b89d2529e3f8dc10b24e1d2483672e8
--- /dev/null
+++ b/runtime/timing.cpp
@@ -0,0 +1,23 @@
+#include "pairs.hpp"
+
+using namespace std;
+
+namespace pairs {
+
+void register_timer(PairsRuntime *ps, int id, std::string name) {  // Registers `name` for timer `id`.
+    ps->getTimers()->add(id, name);  // forwards to the timers object owned by ps
+}
+
+void start_timer(PairsRuntime *ps, int id) {  // Starts timer `id`.
+    ps->getTimers()->start(id);  // forwards to the timers object owned by ps
+}
+
+void stop_timer(PairsRuntime *ps, int id) {  // Stops timer `id`.
+    ps->getTimers()->stop(id);  // forwards to the timers object owned by ps
+}
+
+void print_timers(PairsRuntime *ps) {  // Prints the timers by delegating to PairsRuntime::printTimers().
+    ps->printTimers();
+}
+
+}
diff --git a/runtime/timing.hpp b/runtime/timing.hpp
index 6c35e222e049fefd9aac991cf7e9e400ef3cc1ce..f7544603549232cce89c00e7071948da32511693 100644
--- a/runtime/timing.hpp
+++ b/runtime/timing.hpp
@@ -6,20 +6,9 @@ using namespace std;
 
 namespace pairs {
 
-void register_timer(PairsSimulation *ps, int id, std::string name) {
-    ps->getTimers()->add(id, name);
-}
-
-void start_timer(PairsSimulation *ps, int id) {
-    ps->getTimers()->start(id);
-}
-
-void stop_timer(PairsSimulation *ps, int id) {
-    ps->getTimers()->stop(id);
-}
-
-void print_timers(PairsSimulation *ps) {
-    ps->printTimers();
-}
+void register_timer(PairsRuntime *ps, int id, std::string name);  // Calls ps->getTimers()->add(id, name).
+void start_timer(PairsRuntime *ps, int id);  // Calls ps->getTimers()->start(id).
+void stop_timer(PairsRuntime *ps, int id);  // Calls ps->getTimers()->stop(id).
+void print_timers(PairsRuntime *ps);  // Calls ps->printTimers().
 
 }
diff --git a/runtime/tracked_variable.hpp b/runtime/tracked_variable.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d9855125818bea7fc72be668c7d01f1af9a5ff3f
--- /dev/null
+++ b/runtime/tracked_variable.hpp
@@ -0,0 +1,18 @@
+#include "pairs_common.hpp"
+
+#pragma once
+
+namespace pairs {
+
+class TrackedVariable {  // Pairs a variable name with an untyped pointer.
+protected:
+    std::string name;  // Name the variable is tracked under.
+    void *ptr;         // Address supplied by the creator; this class never dereferences or frees it.
+
+public:
+    TrackedVariable(std::string name_, void *ptr_) : name(name_), ptr(ptr_) {}
+    std::string getName() const { return name; }  // const so it can be called on const objects; returns a copy
+    void *getPointer() const { return ptr; }      // const so it can be called on const objects
+};
+
+}
diff --git a/runtime/unique_id.hpp b/runtime/unique_id.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..cfc95a7298335a5c10b535a3296340a6d6552a41
--- /dev/null
+++ b/runtime/unique_id.hpp
@@ -0,0 +1,38 @@
+#include "pairs.hpp"
+
+#pragma once
+
+namespace pairs {
+
+class UniqueID{  // Static-only ID generator; its state is the two counters below.
+public:
+    inline static id_t create(PairsRuntime *pr);  // rank * capacity + counter, then advances counter
+    inline static id_t createGlobal(PairsRuntime *pr);  // world size * capacity + globalCounter, then advances globalCounter
+    inline static id_t getNumGlobals();  // globalCounter - 1
+
+private:
+    static const id_t capacity = 1000000000;   // max number of particles per rank
+    inline static id_t counter = 1;  // next sequence number handed out by create()
+    inline static id_t globalCounter = 1;  // next sequence number handed out by createGlobal()
+
+};
+
+inline id_t UniqueID::getNumGlobals(){  // Number of createGlobal() calls made so far in this process.
+    return globalCounter - 1;  // globalCounter starts at 1 and grows by one per createGlobal()
+}
+
+inline id_t UniqueID::create(PairsRuntime *pr){
+    // Rank-local ID: rank * capacity plus a running counter that starts at 1.
+    // NOTE(review): nothing stops counter from reaching capacity and entering the next rank's range.
+    const id_t rank_base = static_cast<id_t>(pr->getDomainPartitioner()->getRank()) * capacity;
+    return rank_base + counter++;
+}
+
+inline id_t UniqueID::createGlobal(PairsRuntime *pr){
+    // Global ID: world size * capacity plus a running global counter that starts at 1.
+    // NOTE(review): each process advances its own globalCounter; presumably all ranks call this in the same order — confirm.
+    const id_t global_base = static_cast<id_t>(pr->getDomainPartitioner()->getWorldSize()) * capacity;
+    return global_base + globalCounter++;
+}
+
+}
diff --git a/runtime/vtk.cpp b/runtime/vtk.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b6235725c9d7d10cb38e4033a1e456b6b50ab32a
--- /dev/null
+++ b/runtime/vtk.cpp
@@ -0,0 +1,198 @@
+#include <iomanip>
+#include <iostream>
+#include <fstream>
+//---
+#include "pairs.hpp"
+
+namespace pairs {
+
+void vtk_write_aabb(PairsRuntime *ps, const char *filename, int num,
+    double xmin, double xmax,
+    double ymin, double ymax,
+    double zmin, double zmax){
+    // Writes the box [xmin,xmax] x [ymin,ymax] x [zmin,zmax] as ASCII VTK polydata
+    // (8 corner points, 6 quad faces) to "<filename>_<num>[r<rank>].vtk"; exits on open failure.
+    const int prec = 8;
+    std::ostringstream filename_oss;
+
+    filename_oss << filename << "_" << num;
+    if(ps->getDomainPartitioner()->getWorldSize() > 1) {
+        filename_oss << "r" << ps->getDomainPartitioner()->getRank() ;
+    }
+
+    filename_oss <<".vtk";
+    std::ofstream out_file(filename_oss.str());
+
+    out_file << std::fixed << std::setprecision(prec);
+    if(out_file.is_open()) {
+        out_file << "# vtk DataFile Version 2.0\n";
+        out_file << "Subdomains\n";
+        out_file << "ASCII\n";
+        out_file << "DATASET POLYDATA\n";
+        out_file << "POINTS 8 double\n";
+        // Corners 0-3 lie on the zmin face, corners 4-7 on the zmax face.
+        out_file << xmin << " " << ymin << " " << zmin << "\n";
+        out_file << xmax << " " << ymin << " " << zmin << "\n";
+        out_file << xmax << " " << ymax << " " << zmin << "\n";
+        out_file << xmin << " " << ymax << " " << zmin << "\n";
+        out_file << xmin << " " << ymin << " " << zmax << "\n";
+        out_file << xmax << " " << ymin << " " << zmax << "\n";
+        out_file << xmax << " " << ymax << " " << zmax << "\n";
+        out_file << xmin << " " << ymax << " " << zmax << "\n";
+
+        out_file << "POLYGONS 6 30\n";
+        // Six quads; each row is a vertex count followed by four corner indices (6 * 5 = 30 integers).
+        out_file << "4 0 1 2 3 \n";
+        out_file << "4 4 5 6 7 \n";
+        out_file << "4 0 1 5 4 \n";
+        out_file << "4 3 2 6 7 \n";
+        out_file << "4 0 4 7 3 \n";
+        out_file << "4 1 2 6 5 \n";
+
+        out_file << "\n\n";
+        out_file.close();
+    }
+    else {
+        std::cerr << "vtk_write_aabb: Failed to open " << filename_oss.str() << std::endl;
+        exit(-1);
+    }
+
+}
+
+void vtk_write_subdom(PairsRuntime *ps, const char *filename, int timestep, int frequency){
+    // Writes the subdomain box (getSubdomMin/getSubdomMax per axis) as ASCII VTK polydata.
+    const int prec = 8;
+    std::ostringstream filename_oss;
+    // Skip timesteps that are not multiples of frequency (frequency == 0 disables the filter).
+    if(frequency != 0 && timestep % frequency != 0) {
+        return;
+    }
+
+    filename_oss << filename << "_";
+    if(ps->getDomainPartitioner()->getWorldSize() > 1) {
+        filename_oss << "r" << ps->getDomainPartitioner()->getRank() << "_";
+    }
+
+    filename_oss << timestep << ".vtk";
+    std::ofstream out_file(filename_oss.str());
+
+    double aabb[3][2];  // aabb[axis][0] = lower bound, aabb[axis][1] = upper bound
+    for (int d=0; d<3; ++d){
+        aabb[d][0] = ps->getDomainPartitioner()->getSubdomMin(d);
+        aabb[d][1] = ps->getDomainPartitioner()->getSubdomMax(d);
+    }
+
+    out_file << std::fixed << std::setprecision(prec);
+    if(out_file.is_open()) {
+        out_file << "# vtk DataFile Version 2.0\n";
+        out_file << "Subdomains\n";
+        out_file << "ASCII\n";
+        out_file << "DATASET POLYDATA\n";
+        out_file << "POINTS 8 double\n";
+        // Corners 0-3 use the lower z bound, corners 4-7 the upper z bound.
+        out_file << aabb[0][0] << " " << aabb[1][0] << " " << aabb[2][0] << "\n";
+        out_file << aabb[0][1] << " " << aabb[1][0] << " " << aabb[2][0] << "\n";
+        out_file << aabb[0][1] << " " << aabb[1][1] << " " << aabb[2][0] << "\n";
+        out_file << aabb[0][0] << " " << aabb[1][1] << " " << aabb[2][0] << "\n";
+        out_file << aabb[0][0] << " " << aabb[1][0] << " " << aabb[2][1] << "\n";
+        out_file << aabb[0][1] << " " << aabb[1][0] << " " << aabb[2][1] << "\n";
+        out_file << aabb[0][1] << " " << aabb[1][1] << " " << aabb[2][1] << "\n";
+        out_file << aabb[0][0] << " " << aabb[1][1] << " " << aabb[2][1] << "\n";
+
+        out_file << "POLYGONS 6 30\n";
+        // Six quads; each row is a vertex count followed by four corner indices (6 * 5 = 30 integers).
+        out_file << "4 0 1 2 3 \n";
+        out_file << "4 4 5 6 7 \n";
+        out_file << "4 0 1 5 4 \n";
+        out_file << "4 3 2 6 7 \n";
+        out_file << "4 0 4 7 3 \n";
+        out_file << "4 1 2 6 5 \n";
+
+        out_file << "\n\n";
+        out_file.close();
+    }
+    else {
+        std::cerr << "vtk_write_subdom: Failed to open " << filename_oss.str() << std::endl;
+        exit(-1);
+    }
+}
+
+void vtk_write_data(
+    PairsRuntime *ps, const char *filename, int start, int end, int timestep, int frequency) {
+    // Writes position, mass and radius of particles [start, end) without the INFINITE flag
+    // as ASCII VTK polydata to "<filename>_[r<rank>_]<timestep>.vtk"; exits on open failure.
+    auto masses = ps->getAsFloatProperty(ps->getPropertyByName("mass"));
+    auto positions = ps->getAsVectorProperty(ps->getPropertyByName("position"));
+    auto flags = ps->getAsIntegerProperty(ps->getPropertyByName("flags"));
+    auto radius = ps->getAsFloatProperty(ps->getPropertyByName("radius"));
+    const int prec = 8;
+    int n = end - start;
+    std::ostringstream filename_oss;
+    // Skip timesteps that are not multiples of frequency (frequency == 0 disables the filter).
+    if(frequency != 0 && timestep % frequency != 0) {
+        return;
+    }
+
+    filename_oss << filename << "_";
+    if(ps->getDomainPartitioner()->getWorldSize() > 1) {
+        filename_oss << "r" << ps->getDomainPartitioner()->getRank() << "_";
+    }
+
+    filename_oss << timestep << ".vtk";
+    std::ofstream out_file(filename_oss.str());
+
+    ps->copyPropertyToHost(masses, ReadOnly);
+    ps->copyPropertyToHost(positions, ReadOnly);
+    ps->copyPropertyToHost(flags, ReadOnly);
+    ps->copyPropertyToHost(radius, ReadOnly);
+    // POINTS and POINT_DATA need the number of rows actually written, so INFINITE particles are subtracted.
+    for(int i = start; i < end; i++) {
+        if(flags(i) & flags::INFINITE) {
+            n--;
+        }
+    }
+
+    if(out_file.is_open()) {
+        out_file << "# vtk DataFile Version 2.0\n";
+        out_file << "Particle data\n";
+        out_file << "ASCII\n";
+        out_file << "DATASET POLYDATA\n";
+        out_file << "POINTS " << n << " double\n";
+
+        for(int i = start; i < end; i++) {
+            if(!(flags(i) & flags::INFINITE)) {
+                out_file << std::fixed << std::setprecision(prec) << positions(i, 0) << " ";
+                out_file << std::fixed << std::setprecision(prec) << positions(i, 1) << " ";
+                out_file << std::fixed << std::setprecision(prec) << positions(i, 2) << "\n";
+            }
+        }
+
+        out_file << "\n\n";
+        out_file << "POINT_DATA " << n << "\n";
+        out_file << "SCALARS mass double 1\n";
+        out_file << "LOOKUP_TABLE default\n";
+        for(int i = start; i < end; i++) {
+            if(!(flags(i) & flags::INFINITE)) {
+                out_file << std::fixed << std::setprecision(prec) << masses(i) << "\n";
+            }
+        }
+
+        out_file << "\n\n";
+        out_file << "SCALARS radius double 1\n";
+        out_file << "LOOKUP_TABLE default\n";
+        for(int i = start; i < end; i++) {
+            if(!(flags(i) & flags::INFINITE)) {
+                out_file << std::fixed << std::setprecision(prec) << radius(i) << "\n";
+            }
+        }
+
+        out_file << "\n\n";
+        out_file.close();
+    }
+    else {
+        std::cerr << "vtk_write_data: Failed to open " << filename_oss.str() << std::endl;
+        exit(-1);
+    }
+}
+
+}
diff --git a/runtime/vtk.hpp b/runtime/vtk.hpp
index f122565694634594e0733017df8624a280681fc0..dcd97c020f1b49cdf82df548083687e1874f7c51 100644
--- a/runtime/vtk.hpp
+++ b/runtime/vtk.hpp
@@ -1,88 +1,17 @@
-#include <iomanip>
-#include <iostream>
-#include <fstream>
-//---
 #include "pairs.hpp"
 
 #pragma once
 
 namespace pairs {
 
-void vtk_write_data(PairsSimulation *ps, const char *filename, int start, int end, int timestep, int frequency) {
-    std::string output_filename(filename);
-    auto masses = ps->getAsFloatProperty(ps->getPropertyByName("mass"));
-    auto positions = ps->getAsVectorProperty(ps->getPropertyByName("position"));
-    auto flags = ps->getAsIntegerProperty(ps->getPropertyByName("flags"));
-    const int prec = 8;
-    int n = end - start;
-    std::ostringstream filename_oss;
+// Declaration only; the definition is not part of this header.
+// NOTE(review): judging by the name and parameters this presumably writes the
+// axis-aligned box [xmin,xmax] x [ymin,ymax] x [zmin,zmax] to a VTK file, with
+// `num` used to tell output files apart -- confirm against the definition.
+void vtk_write_aabb(PairsRuntime *ps, const char *filename, int num,
+    double xmin, double xmax, 
+    double ymin, double ymax, 
+    double zmin, double zmax);
 
-    if(frequency != 0 && timestep % frequency != 0) {
-        return;
-    }
+// Declaration only; `frequency` defaults to 1 when the caller omits it.
+// NOTE(review): presumably dumps the local subdomain bounds for `timestep` to a
+// VTK file -- confirm against the definition.
+void vtk_write_subdom(PairsRuntime *ps, const char *filename, int timestep, int frequency=1);
 
-    filename_oss << filename << "_";
-    if(ps->getDomainPartitioner()->getWorldSize() > 1) {
-        filename_oss << "r" << ps->getDomainPartitioner()->getRank() << "_";
-    }
-
-    filename_oss << timestep << ".vtk";
-    std::ofstream out_file(filename_oss.str());
-
-    ps->copyPropertyToHost(masses, ReadOnly);
-    ps->copyPropertyToHost(positions, ReadOnly);
-    ps->copyPropertyToHost(flags, ReadOnly);
-
-    for(int i = start; i < end; i++) {
-        if(flags(i) & FLAGS_INFINITE) {
-            n--;
-        }
-    }
-
-    if(out_file.is_open()) {
-        out_file << "# vtk DataFile Version 2.0\n";
-        out_file << "Particle data\n";
-        out_file << "ASCII\n";
-        out_file << "DATASET UNSTRUCTURED_GRID\n";
-        out_file << "POINTS " << n << " double\n";
-
-        for(int i = start; i < end; i++) {
-            if(!(flags(i) & FLAGS_INFINITE)) {
-                out_file << std::fixed << std::setprecision(prec) << positions(i, 0) << " ";
-                out_file << std::fixed << std::setprecision(prec) << positions(i, 1) << " ";
-                out_file << std::fixed << std::setprecision(prec) << positions(i, 2) << "\n";
-            }
-        }
-
-        out_file << "\n\n";
-        out_file << "CELLS " << n << " " << (n * 2) << "\n";
-        for(int i = start; i < end; i++) {
-            if(!(flags(i) & FLAGS_INFINITE)) {
-                out_file << "1 " << (i - start) << "\n";
-            }
-        }
-
-        out_file << "\n\n";
-        out_file << "CELL_TYPES " << n << "\n";
-        for(int i = start; i < end; i++) {
-            if(!(flags(i) & FLAGS_INFINITE)) {
-                out_file << "1\n";
-            }
-        }
-
-        out_file << "\n\n";
-        out_file << "POINT_DATA " << n << "\n";
-        out_file << "SCALARS mass double\n";
-        out_file << "LOOKUP_TABLE default\n";
-        for(int i = start; i < end; i++) {
-            if(!(flags(i) & FLAGS_INFINITE)) {
-                out_file << std::fixed << std::setprecision(prec) << masses(i) << "\n";
-            }
-        }
-
-        out_file << "\n\n";
-        out_file.close();
-    }
-}
+// Declaration only (the former inline implementation was removed from this header);
+// `frequency` defaults to 1 when the caller omits it.
+// NOTE(review): `start`/`end` look like a half-open particle index range
+// [start, end) -- confirm against the definition.
+void vtk_write_data(
+    PairsRuntime *ps, const char *filename, int start, int end, int timestep, int frequency=1);
 
 }
diff --git a/src/pairs/__init__.py b/src/pairs/__init__.py
index e89e0a79cbb8f3e39143dc3461a9734a6dcf8d7b..6525a9814b0fc4729c0cf3e24e5a748c5db3ae4b 100644
--- a/src/pairs/__init__.py
+++ b/src/pairs/__init__.py
@@ -2,6 +2,7 @@ from pairs.ir.types import Types
 from pairs.code_gen.cgen import CGen
 from pairs.code_gen.target import Target
 from pairs.sim.domain_partitioners import DomainPartitioners
+from pairs.sim.load_balancing_algorithms import LoadBalancingAlgorithms
 from pairs.sim.shapes import Shapes
 from pairs.sim.simulation import Simulation
 
@@ -15,11 +16,12 @@ def simulation(
     use_contact_history=False,
     particle_capacity=800000,
     neighbor_capacity=100,
-    debug=False):
+    debug=False,
+    generate_whole_program=False):
 
     return Simulation(
         CGen(ref, debug), shapes, dims, timesteps, double_prec, use_contact_history,
-        particle_capacity, neighbor_capacity)
+        particle_capacity, neighbor_capacity, generate_whole_program)
 
 def target_cpu(parallel=False):
     if parallel:
@@ -65,3 +67,18 @@ def regular_domain_partitioner():
 
 def regular_domain_partitioner_xy():
     return DomainPartitioners.RegularXY
+
+def block_forest():
+    """Return the `DomainPartitioners.BlockForest` selector."""
+    return DomainPartitioners.BlockForest
+
+def morton():
+    """Return the `LoadBalancingAlgorithms.Morton` selector."""
+    return LoadBalancingAlgorithms.Morton
+
+def hilbert():
+    """Return the `LoadBalancingAlgorithms.Hilbert` selector."""
+    return LoadBalancingAlgorithms.Hilbert
+
+def metis():
+    """Return the `LoadBalancingAlgorithms.Metis` selector."""
+    return LoadBalancingAlgorithms.Metis
+
+def diffusive():
+    """Return the `LoadBalancingAlgorithms.Diffusive` selector."""
+    return LoadBalancingAlgorithms.Diffusive
\ No newline at end of file
diff --git a/src/pairs/analysis/__init__.py b/src/pairs/analysis/__init__.py
index 846843c3c72ee73a6359a74f397195eed453f8f2..7b200b201ef6b1126275c6656c98419b36e2d89a 100644
--- a/src/pairs/analysis/__init__.py
+++ b/src/pairs/analysis/__init__.py
@@ -2,18 +2,23 @@ import time
 from pairs.analysis.expressions import DetermineExpressionsTerminals, ResetInPlaceOperations, DetermineInPlaceOperations, ListDeclaredExpressions
 from pairs.analysis.blocks import DiscoverBlockVariants, DetermineExpressionsOwnership, DetermineParentBlocks
 from pairs.analysis.devices import FetchKernelReferences, MarkCandidateLoops
-from pairs.analysis.modules import FetchModulesReferences
+from pairs.analysis.modules import FetchModulesReferences, InferModulesReturnTypes
 
 
 class Analysis:
+    """Compiler analysis performed on P4IRS"""
+
     def __init__(self, ast):
-        self._ast = ast
+        # Accept either a single AST or a list of ASTs; a list is always stored.
+        self._ast_list = ast if isinstance(ast, list) else [ast]
 
     def apply(self, analysis):
+        """Run `analysis` on every stored AST and print the total elapsed time."""
         print(f"Performing analysis: {type(analysis).__name__}... ", end="")
         start = time.time()
-        analysis.set_ast(self._ast)
-        analysis.visit()
+
+        # The same analysis instance is re-targeted at each AST in turn.
+        for ast in self._ast_list:
+            analysis.set_ast(ast)
+            analysis.visit()
+
         elapsed = time.time() - start
         print(f"{elapsed:.2f}s elapsed.")
 
@@ -46,3 +51,6 @@ class Analysis:
 
     def mark_candidate_loops(self):
+        """Apply the `MarkCandidateLoops` analysis to every stored AST."""
         self.apply(MarkCandidateLoops())
+
+    def infer_modules_return_types(self):
+        """Apply the `InferModulesReturnTypes` analysis to every stored AST."""
+        self.apply(InferModulesReturnTypes())
\ No newline at end of file
diff --git a/src/pairs/analysis/devices.py b/src/pairs/analysis/devices.py
index d4552eba7183ab9a5371cb69e6bc6e07b22ed592..29e554e4606776693cdfd2dd784fc24d0b6995ea 100644
--- a/src/pairs/analysis/devices.py
+++ b/src/pairs/analysis/devices.py
@@ -12,30 +12,24 @@ from pairs.ir.vectors import VectorOp
 class MarkCandidateLoops(Visitor):
     def __init__(self, ast=None):
         super().__init__(ast)
+        # True while traversing the subtree of a module with `run_on_device` set.
+        self.device_module = False
+
+    def visit_For(self, ast_node):
+        """Classify a loop as kernel candidate or as a plain loop.
+
+        A loop becomes a kernel candidate when it sits inside a device module, is not
+        flagged `not_kernel`, and at least one of its bounds is not a literal. Any other
+        loop has its iterator marked as a reference candidate and its body is visited.
+        """
+        is_kernel_candidate = (
+            self.device_module
+            and not ast_node.not_kernel
+            and not (isinstance(ast_node.min, Lit) and isinstance(ast_node.max, Lit))
+        )
+
+        if not is_kernel_candidate:
+            ast_node.mark_iter_as_ref_candidate()
+            self.visit(ast_node.block)
+            return
+
+        # The body of a kernel candidate is not traversed any further.
+        ast_node.mark_as_kernel_candidate()
+
 
     def visit_Module(self, ast_node):
-        possible_candidates = []
-        for stmt in ast_node._block.stmts:
-            if stmt is not None:
-                if isinstance(stmt, Branch):
-                    for branch_stmt in stmt.block_if.stmts:
-                        if isinstance(branch_stmt, For):
-                            possible_candidates.append(branch_stmt)
-
-                    if stmt.block_else is not None:
-                        for branch_stmt in stmt.block_else.stmts:
-                            if isinstance(branch_stmt, For):
-                                possible_candidates.append(branch_stmt)
-
-                if isinstance(stmt, For):
-                    possible_candidates.append(stmt)
-
-        for stmt in possible_candidates:
-            if not isinstance(stmt.min, Lit) or not isinstance(stmt.max, Lit):
-                stmt.mark_as_kernel_candidate()
+        # Remember the enclosing state so it can be restored once this module's
+        # subtree has been traversed.
+        parent_runs_on_device = self.device_module
+        if ast_node.run_on_device:
+            self.device_module = True
 
         self.visit_children(ast_node)
-
+        # Restore the state of the enclosing module.
+        self.device_module = parent_runs_on_device
+        
 
 class FetchKernelReferences(Visitor):
     def __init__(self, ast=None):
@@ -205,3 +199,12 @@ class FetchKernelReferences(Visitor):
             # Variables only have a device version when changed within kernels
             if self.writing:
                 ast_node.device_flag = True
+
+    def visit_Parameter(self, ast_node):
+        """Register the parameter, with the current write state, on every kernel on the stack."""
+        for kernel in self.kernel_stack:
+            kernel.add_parameter(ast_node, self.writing)
+
+    def visit_Iter(self, ast_node):
+        """Register a reference-candidate iterator on every kernel on the stack."""
+        for kernel in self.kernel_stack:
+            # Iterators that are not reference candidates are not registered.
+            if not ast_node.is_ref_candidate():
+                continue
+            kernel.add_iter(ast_node, self.writing)
diff --git a/src/pairs/analysis/modules.py b/src/pairs/analysis/modules.py
index 4cf0bf8002af97ad66ff2057301235c7756ab5ad..fd7bd11393525e91f96cd0e3a8baaa451209a7d2 100644
--- a/src/pairs/analysis/modules.py
+++ b/src/pairs/analysis/modules.py
@@ -1,5 +1,17 @@
 from pairs.ir.visitor import Visitor
 
+class InferModulesReturnTypes(Visitor):
+    """Visitor that stores, on each module, the type of the expression it returns."""
+
+    def __init__(self, ast=None):
+        super().__init__(ast)
+        # Module whose subtree is currently being traversed; None outside any module.
+        self.current_module = None
+
+    def visit_Module(self, ast_node):
+        """Track `ast_node` as the current module while its children are visited."""
+        # Save and restore the enclosing module so that a Return visited after a
+        # nested module is attributed to the enclosing module, not the nested one.
+        enclosing_module = self.current_module
+        self.current_module = ast_node
+        self.visit_children(ast_node)
+        self.current_module = enclosing_module
+
+    def visit_Return(self, ast_node):
+        """Record the returned expression's type on the current module, if any."""
+        if self.current_module is not None:
+            self.current_module._return_type = ast_node.expr.type()
+        self.visit_children(ast_node)
+        
 
 class FetchModulesReferences(Visitor):
     def __init__(self, ast=None):
@@ -39,8 +51,16 @@ class FetchModulesReferences(Visitor):
             self.visit(ast_node.capacity)
 
     def visit_AtomicInc(self, ast_node):
+        # The target element is visited twice, first as a read and then as a write,
+        # so visit-once is switched off for those two visits and restored afterwards.
+        visit_once = self.visit_nodes_once
+        self.visit_nodes_once = False
+        # Force write after read for the same node (visited twice)
+        self.writing = False
+        self.visit(ast_node.elem)
         self.writing = True
         self.visit(ast_node.elem)
+        self.visit_nodes_once = visit_once
+
+
+        # The increment value is only read.
         self.writing = False
         self.visit(ast_node.value)
 
@@ -115,3 +135,8 @@ class FetchModulesReferences(Visitor):
         for m in self.module_stack:
             if not ast_node.temporary():
                 m.add_variable(ast_node, self.writing)
+
+    def visit_Parameter(self, ast_node):
+        """Register the parameter as read-only on every module on the stack."""
+        # Parameters are passed by value, so they are always added with write=False.
+        for enclosing in self.module_stack:
+            enclosing.add_parameter(ast_node, write=False)
\ No newline at end of file
diff --git a/src/pairs/code_gen/accessor.py b/src/pairs/code_gen/accessor.py
new file mode 100644
index 0000000000000000000000000000000000000000..34421cb64cde53f7a75a9079b39ee90c25b7b235
--- /dev/null
+++ b/src/pairs/code_gen/accessor.py
@@ -0,0 +1,497 @@
+from pairs.ir.types import Types
+from pairs.ir.features import FeatureProperty
+from pairs.ir.properties import Property
+
+class PairsAcessor:
+    def __init__(self, cgen):
+        """Bind this generator to the code generator `cgen`.
+
+        `sim`, `target`, `print` and `debug` are taken from `cgen`. Both function
+        attribute prefixes start out empty; `generate()` fills them for GPU targets.
+        """
+        self.host_attr = ""
+        self.host_device_attr = ""
+        self.debug = cgen.debug
+        self.print = cgen.print
+        self.target = cgen.target
+        self.sim = cgen.sim
+        
+    def generate(self):
+        """Emit the complete `PairsAccessor` C++ class through `self.print`.
+
+        For GPU targets the `pairs::internal` namespace with `DeviceProps` and
+        `HostProps` is emitted first, and the CUDA attribute prefixes are set before any
+        member function is generated. Getters, setters and sync functions are emitted
+        for every property of vector or scalar type and for every feature property.
+        """
+        out = self.print
+        out("")
+
+        if self.target.is_gpu():
+            out("namespace pairs::internal{")
+            out.add_indent(4)
+            self.DeviceProps_struct()
+            self.HostProps_struct()
+            out.add_indent(-4)
+            out("}")
+            out("")
+            # Member functions generated from here on carry CUDA attributes.
+            self.host_device_attr = "__host__ __device__ "
+            self.host_attr = "__host__ "
+
+        # Only Vector3 is included; no Quaternion or Matrix3 header is emitted.
+        out("#include \"math/Vector3.hpp\"")
+        out("")
+
+        out("class PairsAccessor {")
+        out("private:")
+        out.add_indent(4)
+        self.member_variables()
+        out.add_indent(-4)
+        out("public:")
+        out.add_indent(4)
+
+        # destructor() is not invoked here, so the generated class has no destructor.
+        for emit_member in (self.sync_ctx_enum, self.update, self.constructor):
+            emit_member()
+
+        for prop in self.sim.properties:
+            prop_type = prop.type()
+            if prop_type == Types.Vector or Types.is_scalar(prop_type):
+                self.get_property(prop)
+                self.set_property(prop)
+                self.sync_property(prop)
+
+        for feature_prop in self.sim.feature_properties:
+            self.get_property(feature_prop)
+            self.set_property(feature_prop)
+            self.sync_feature_property(feature_prop)
+
+        self.utility_funcs()
+
+        out.add_indent(-4)
+        out("};")
+        out("")
+
+    def DeviceProps_struct(self):
+        """Emit the C++ `DeviceProps` struct.
+
+        The struct holds `nlocal`, `nghost`, one `<name>_d` pointer per property and one
+        boolean `<name>_device_flag_d` pointer per property. Feature properties get no
+        members here, only explanatory comments.
+        """
+        emit = self.print
+        emit("struct DeviceProps{")
+        emit.add_indent(4)
+
+        for line in ("int nlocal;", "int nghost;", "", "//Property device pointers"):
+            emit(line)
+        for prop in self.sim.properties:
+            elem_type = Types.c_keyword(self.sim, prop.type())
+            emit(f"{elem_type} *{prop.name()}_d;")
+
+        emit("")
+        emit("//Property device flag pointers")
+        for prop in self.sim.properties:
+            flag_type = Types.c_keyword(self.sim, Types.Boolean)
+            emit(f"{flag_type} *{prop.name()}_device_flag_d;")
+
+        trailing_notes = (
+            "",
+            "//Feature properties on device are global",
+            "",
+            "//Feature properties have no flags on device since they can't be modified on device",
+        )
+        for line in trailing_notes:
+            emit(line)
+
+        emit.add_indent(-4)
+        emit("};")
+        emit("")
+
+    def HostProps_struct(self):
+        """Emit the C++ `HostProps` struct.
+
+        The struct only holds boolean flags, all initialised to `false`: a host flag and
+        a host-side copy of the device flag per property, plus a host flag per feature
+        property.
+        """
+        emit = self.print
+        emit("// HostProps only contains property flags, since properties themselves can be directly accessed through ps->pobj")
+        emit("// TODO: Move properties out of PairsObjects into DeviceProps and HostProps, so that all 3 structs have mutually exclusive members")
+        emit("struct HostProps{")
+        emit.add_indent(4)
+
+        emit("")
+        emit("//Property host pointers are in PairsObjects")
+
+        # Both per-property flag groups share the same layout and differ only in suffix.
+        for heading, suffix in (("//Property host flags", "host_flag"),
+                                ("//Property device flags", "device_flag_h")):
+            emit("")
+            emit(heading)
+            for prop in self.sim.properties:
+                flag_type = Types.c_keyword(self.sim, Types.Boolean)
+                emit(f"{flag_type} {prop.name()}_{suffix} = false;")
+
+        emit("")
+        emit("//Feature property host pointers are in PairsObjects")
+
+        emit("")
+        emit("//Feature property host flags")
+        for feature_prop in self.sim.feature_properties:
+            flag_type = Types.c_keyword(self.sim, Types.Boolean)
+            emit(f"{flag_type} {feature_prop.name()}_host_flag = false;")
+
+        emit("")
+        emit("//Feature properties have no device flags")
+
+        emit.add_indent(-4)
+        emit("};")
+        emit("")
+
+    def member_variables(self):
+        """Emit the private data members of `PairsAccessor`.
+
+        Every target gets the `ps` pointer; GPU targets additionally get the host-props
+        pointer and the two `DeviceProps` pointers (`dp_h`, `dp_d`).
+        """
+        members = ["PairsSimulation *ps;"]
+        if self.target.is_gpu():
+            members.extend((
+                "pairs::internal::HostProps *hp;",
+                "pairs::internal::DeviceProps *dp_h;",
+                "pairs::internal::DeviceProps *dp_d;",
+            ))
+
+        for declaration in members:
+            self.print(declaration)
+
+    def update(self):
+        """Emit `update()`.
+
+        For GPU targets the generated body refreshes `dp_h` (particle counts and the
+        per-property device pointers) from `ps->pobj` and copies the struct to `dp_d`.
+        For other targets the generated body is empty.
+        """
+        self.print(f"{self.host_attr}void update(){{")
+
+        if self.target.is_gpu():
+            statements = [
+                "dp_h->nlocal = ps->pobj->nlocal;",
+                "dp_h->nghost = ps->pobj->nghost;",
+            ]
+            for name in (prop.name() for prop in self.sim.properties):
+                statements.append(f"dp_h->{name}_d = ps->pobj->{name}_d;")
+            statements.append(
+                "cudaMemcpy(dp_d, dp_h, sizeof(pairs::internal::DeviceProps), cudaMemcpyHostToDevice);")
+
+            self.print.add_indent(4)
+            for statement in statements:
+                self.print(statement)
+            self.print.add_indent(-4)
+
+        self.print("}")
+        self.print("")
+
+    def constructor(self):
+        """Emit the `PairsAccessor(PairsSimulation *ps_)` constructor.
+
+        For GPU targets the generated constructor allocates `HostProps`, the host and
+        device copies of `DeviceProps`, and one zero-initialised device flag per
+        property, then calls `update()`. For other targets it only stores `ps_`.
+        """
+        if self.target.is_gpu():
+            self.print(f"{self.host_attr}PairsAccessor(PairsSimulation *ps_): ps(ps_){{")
+            self.print.add_indent(4)
+
+            self.print(f"hp = new pairs::internal::HostProps;")
+            self.print(f"dp_h = new pairs::internal::DeviceProps;")
+            self.print(f"cudaMalloc(&dp_d, sizeof(pairs::internal::DeviceProps));")
+
+            tkw = Types.c_keyword(self.sim, Types.Boolean)
+            for p in self.sim.properties:
+                pname = p.name()
+                self.print(f"cudaMalloc(&(dp_h->{pname}_device_flag_d), sizeof({tkw}));")
+                # cudaMalloc does not clear the allocation. Without this, the first
+                # generated sync*() would read an indeterminate flag from the device
+                # and could report a spurious out-of-sync error.
+                self.print(f"cudaMemset(dp_h->{pname}_device_flag_d, 0, sizeof({tkw}));")
+
+            self.print("this->update();")
+            self.print.add_indent(-4)
+            self.print("}")
+
+        else:
+            self.print("PairsAccessor(PairsSimulation *ps_): ps(ps_){}")
+
+        self.print("")
+
+    def destructor(self):
+        """Emit `~PairsAccessor()` for GPU targets; emit nothing for other targets.
+
+        The generated destructor frees every per-property device flag, deletes `hp` and
+        `dp_h`, and frees `dp_d`.
+        """
+        if self.target.is_gpu():
+            self.print(f"{self.host_attr}~PairsAccessor(){{")
+            self.print.add_indent(4)
+
+            for p in self.sim.properties:
+                pname = p.name()
+                self.print(f"cudaFree(dp_h->{pname}_device_flag_d);")
+
+            self.print(f"delete hp;")
+            self.print(f"delete dp_h;")
+            self.print(f"cudaFree(dp_d);")
+
+            self.print.add_indent(-4)
+            self.print("}")
+            self.print("")
+
+    def ifdef_else(self, ifdef, func1, args1, func2, args2):
+        """Emit an `#ifdef <ifdef>` / `#else` / `#endif` block.
+
+        `func1(*args1)` generates the `#ifdef` branch and `func2(*args2)` the `#else`
+        branch. The whole block is emitted one indentation level (4) deeper.
+        """
+        emit = self.print
+        emit.add_indent(4)
+        emit(f"#ifdef {ifdef}")
+        func1(*args1)
+        emit("#else")
+        func2(*args2)
+        emit("#endif")
+        emit.add_indent(-4)
+    
+    def generate_ref_name(self, prop, device):
+        """Return the C++ expression naming the storage of `prop`.
+
+        Host code (or any non-GPU target) goes through `ps->pobj`. Device code on a GPU
+        target uses `dp_d-><name>_d` for a `Property` and the bare `<name>_d` for a
+        `FeatureProperty`.
+        """
+        name = prop.name()
+
+        if not (self.target.is_gpu() and device):
+            return f"ps->pobj->{name}"
+
+        if isinstance(prop, Property):
+            return f"dp_d->{name}_d"
+
+        if isinstance(prop, FeatureProperty):
+            return f"{name}_d"
+
+        # Any other kind of object has no device reference.
+        return None
+
+    def getter_body(self, prop, device=False):
+        """Emit the `return` statement of a getter, one indentation level deeper.
+
+        Scalars are returned directly. Other types are returned by constructing the
+        accessor type from the consecutive elements stored for the entry.
+        """
+        self.print.add_indent(4)
+        accessor_type = Types.c_accessor_keyword(self.sim, prop.type())
+        storage = self.generate_ref_name(prop, device)
+
+        if isinstance(prop, Property):
+            entry = "i"
+        elif isinstance(prop, FeatureProperty):
+            # Feature properties are indexed by a pair of kinds, flattened row-major.
+            feature_name = prop.feature().name()
+            entry = f"({prop.feature().nkinds()}*{feature_name}1 + {feature_name}2)"
+
+        if not Types.is_scalar(prop.type()):
+            count = Types.number_of_elements(self.sim, prop.type())
+            components = ", ".join(f"{storage}[{entry}*{count} + {k}]" for k in range(count))
+            self.print(f"return {accessor_type}({components});")
+        else:
+            self.print(f"return {storage}[{entry}];")
+        self.print.add_indent(-4)
+
+    def get_property(self, prop):
+        """Emit the `get<Name>` member function for `prop`.
+
+        The function name is the CamelCase form of the property name, prefixed by the
+        feature name for feature properties. On GPU targets the body is emitted twice,
+        selected by `__CUDA_ARCH__`: once reading device storage, once host storage.
+        """
+        prop_name = prop.name()
+        return_type = Types.c_accessor_keyword(self.sim, prop.type())
+
+        if isinstance(prop, Property):
+            name_parts = prop_name.split('_')
+            params = "const size_t i"
+        elif isinstance(prop, FeatureProperty):
+            feature_name = prop.feature().name()
+            name_parts = feature_name.split('_') + prop_name.split('_')
+            params = f"const size_t {feature_name}1, const size_t {feature_name}2"
+
+        funcname = ''.join(part.capitalize() for part in name_parts)
+        self.print(f"{self.host_device_attr}{return_type} get{funcname}({params}) const{{")
+
+        if not self.target.is_gpu():
+            self.getter_body(prop, False)
+        else:
+            self.ifdef_else(
+                "__CUDA_ARCH__", self.getter_body, [prop, True], self.getter_body, [prop, False])
+
+        self.print("}")
+        self.print("")
+
+    def setter_body(self, prop, device=False):
+        """Emit the assignment statements of a setter, one indentation level deeper.
+
+        Scalars are assigned directly; other types are written element by element from
+        `value[n]`. On GPU targets the matching modification flag is set afterwards:
+        the device flag for the device variant, the host flag otherwise.
+        """
+        self.print.add_indent(4)
+        storage = self.generate_ref_name(prop, device)
+
+        if isinstance(prop, Property):
+            entry = "i"
+        elif isinstance(prop, FeatureProperty):
+            feature_name = prop.feature().name()
+            entry = f"({prop.feature().nkinds()}*{feature_name}1 + {feature_name}2)"
+
+        if not Types.is_scalar(prop.type()):
+            count = Types.number_of_elements(self.sim, prop.type())
+            for k in range(count):
+                self.print(f"{storage}[{entry}*{count} + {k}] = value[{k}];")
+        else:
+            self.print(f"{storage}[{entry}] = value;")
+
+        if self.target.is_gpu():
+            name = prop.name()
+            if device:
+                dirty_flag = f"*(dp_d->{name}_device_flag_d)"
+            else:
+                dirty_flag = f"hp->{name}_host_flag"
+            self.print(f"{dirty_flag} = true;")
+
+        self.print.add_indent(-4)
+
+    def set_property(self, prop):
+        """Emit the `set<Name>` member function for `prop`.
+
+        Property setters are host/device functions; on GPU targets their body is
+        emitted twice, selected by `__CUDA_ARCH__`. Feature property setters are
+        host-only and always write host storage.
+        """
+        prop_name = prop.name()
+        value_type = Types.c_accessor_keyword(self.sim, prop.type())
+        is_particle_prop = isinstance(prop, Property)
+        is_feature_prop = isinstance(prop, FeatureProperty)
+
+        if is_particle_prop:
+            funcname = ''.join(part.capitalize() for part in prop_name.split('_'))
+            self.print(f"{self.host_device_attr}void set{funcname}(const size_t i, const {value_type} &value){{")
+
+        elif is_feature_prop:
+            feature_name = prop.feature().name()
+            name_parts = feature_name.split('_') + prop_name.split('_')
+            funcname = ''.join(part.capitalize() for part in name_parts)
+            # Feature properties can only be set from host
+            self.print(f"{self.host_attr}void set{funcname}(const size_t {feature_name}1, const size_t {feature_name}2, const {value_type} &value){{")
+
+        on_gpu = self.target.is_gpu()
+        if on_gpu and is_particle_prop:
+            self.ifdef_else(
+                "__CUDA_ARCH__", self.setter_body, [prop, True], self.setter_body, [prop, False])
+        elif not on_gpu or is_feature_prop:
+            self.setter_body(prop, False)
+
+        self.print("}")
+        self.print("")
+    
+    def sync_ctx_enum(self):
+        """Emit the `SyncContext` enum (`Host = 0`, `Device`) followed by a blank line."""
+        enum_lines = (
+            "enum SyncContext{",
+            "    Host = 0,",
+            "    Device",
+            "};",
+            "",
+        )
+        for line in enum_lines:
+            self.print(line)
+
+    def sync_property(self, prop):
+        """Emit `sync<Name>(SyncContext sync_ctx = Host, bool overwrite = false)` for `prop`.
+
+        For non-GPU targets the generated function has an empty body. For GPU targets
+        the generated code, in order:
+          1. reads the device modification flag back to the host,
+          2. aborts if the host/device flags are inconsistent (see the checks below),
+          3. forwards the accessor flags to the runtime's property flags,
+          4. copies the property to the requested context,
+          5. clears both accessor flags on host and device.
+        """
+        pname = prop.name()
+        pid = prop.id()
+        splitname = pname.split('_')
+        funcname = ''.join(word.capitalize() for word in splitname)
+
+        self.print(f"{self.host_attr}void sync{funcname}(SyncContext sync_ctx = Host, bool overwrite = false){{")
+
+        if self.target.is_gpu():
+            self.print.add_indent(4)
+            # Step 1: fetch the device-side flag into its host mirror.
+            self.print(f"cudaMemcpy(&(hp->{pname}_device_flag_h), dp_h->{pname}_device_flag_d, sizeof(bool), cudaMemcpyDeviceToHost);")
+            self.print("")
+            
+            # Step 2: consistency checks. Case 1: modified on both sides. Cases 2 and 3:
+            # the accessor flag is set, the runtime flag for that side is not, and the
+            # caller did not pass overwrite=true.
+
+            self.print(f"if (hp->{pname}_host_flag && hp->{pname}_device_flag_h){{")
+            self.print(f"    PAIRS_ERROR(\"OUT OF SYNC 1! Both host and device versions of {pname} are in a modified state.\\n\");")
+            self.print("    exit(-1);")
+            self.print("}")
+            self.print(f"else if(sync_ctx==Host && overwrite==false){{")
+            self.print(f"   if (hp->{pname}_host_flag && !ps->pairs_runtime->getPropFlags()->isHostFlagSet({pid})){{")
+            self.print(f"       PAIRS_ERROR(\"OUT OF SYNC 2! Did you forget to sync{funcname}(Host) before calling set{funcname} from host? Use sync{funcname}(Host,true) if you want to overwrite {pname} values in host.\\n\");")
+            self.print("        exit(-1);")
+            self.print("    }")
+            self.print("}")
+            self.print(f"else if(sync_ctx==Device && overwrite==false){{")
+            self.print(f"   if (hp->{pname}_device_flag_h && !ps->pairs_runtime->getPropFlags()->isDeviceFlagSet({pid})){{")
+            self.print(f"       PAIRS_ERROR(\"OUT OF SYNC 3! Did you forget to sync{funcname}(Device) before calling set{funcname} from device? Use sync{funcname}(Device,true) if you want to overwrite {pname} values in device.\\n\");")
+            self.print("        exit(-1);")
+            self.print("    }")
+            self.print("}")
+            self.print("")
+
+            # Step 3: mark the modified side in the runtime's property flags and clear
+            # the flag of the other side.
+
+            self.print(f"if (hp->{pname}_host_flag){{")
+            self.print(f"    ps->pairs_runtime->getPropFlags()->setHostFlag({pid});")
+            self.print(f"    ps->pairs_runtime->getPropFlags()->clearDeviceFlag({pid});")
+            self.print("}")
+            
+            self.print(f"else if (hp->{pname}_device_flag_h){{")
+            self.print(f"    ps->pairs_runtime->getPropFlags()->setDeviceFlag({pid});")
+            self.print(f"    ps->pairs_runtime->getPropFlags()->clearHostFlag({pid});")
+            self.print("}")
+            self.print("")
+
+            # Step 4: copy size is (nlocal + nghost) entries of `nelems` elements each.
+            nelems = Types.number_of_elements(self.sim, prop.type())
+            tkw = Types.c_keyword(self.sim, prop.type())
+
+            self.print(f"if (sync_ctx==Device) {{")
+            self.print(f"    ps->pairs_runtime->copyPropertyToDevice({pid}, ReadOnly, (((ps->pobj->nlocal + ps->pobj->nghost) * {nelems}) * sizeof({tkw})));")
+            self.print("}")
+            self.print("")
+
+            self.print(f"if (sync_ctx==Host) {{")
+            self.print(f"    ps->pairs_runtime->copyPropertyToHost({pid}, ReadOnly, (((ps->pobj->nlocal + ps->pobj->nghost) * {nelems}) * sizeof({tkw})));")
+            self.print("}")
+            self.print("")
+
+            # Step 5: reset the accessor flags and push the cleared device flag back.
+            self.print(f"hp->{pname}_host_flag = false;")
+            self.print(f"hp->{pname}_device_flag_h = false;")
+            self.print(f"cudaMemcpy(dp_h->{pname}_device_flag_d, &(hp->{pname}_device_flag_h), sizeof(bool), cudaMemcpyHostToDevice);")
+
+            self.print.add_indent(-4)
+        self.print("}")
+        self.print("")
+        
+    def sync_feature_property(self, fp):
+        """Emit `sync<Feature><Property>(SyncContext sync_ctx = Host)` for `fp`.
+
+        For non-GPU targets the generated function has an empty body. For GPU targets
+        it copies the feature property to the device when it was modified on the host
+        and the device context is requested.
+        """
+        fp_id = fp.id()
+        fp_name = fp.name()
+        f_name = fp.feature().name()
+        splitname = f_name.split('_') + fp_name.split('_')
+        funcname = ''.join(word.capitalize() for word in splitname)
+
+        self.print(f"{self.host_attr}void sync{funcname}(SyncContext sync_ctx = Host){{")
+
+        if self.target.is_gpu():
+            self.print.add_indent(4)
+            self.print(f"if (hp->{fp_name}_host_flag && sync_ctx==Device) {{")
+            self.print(f"    ps->pairs_runtime->copyFeaturePropertyToDevice({fp_id});")
+            # Clear the host flag only after the device copy has been refreshed.
+            # Clearing it on a Host sync (the default argument) would make a later
+            # Device sync skip the upload and leave stale values on the device.
+            self.print(f"    hp->{fp_name}_host_flag = false;")
+            self.print("}")
+            self.print.add_indent(-4)
+
+        self.print("}")
+        self.print("")
+
+    def utility_funcs(self):
+        """Emit the size queries and uid lookup helpers of `PairsAccessor`.
+
+        `size()`, `nlocal()` and `nghost()` read `dp_d` in device code and `ps->pobj` in
+        host code on GPU targets, and `ps->pobj` otherwise. The `uidToIdx*` functions
+        are linear searches over `getUid(i)` that return `getInvalidIdx()` on a miss.
+        """
+        nlocal = "ps->pobj->nlocal"
+        nlocal_d = "dp_d->nlocal"
+        nghost = "ps->pobj->nghost"
+        nghost_d = "dp_d->nghost"
+        attr = self.host_device_attr
+
+        # (function name, expression in device code, expression in host code)
+        counters = (
+            ("size", f"{nlocal_d} + {nghost_d}", f"{nlocal} + {nghost}"),
+            ("nlocal", nlocal_d, nlocal),
+            ("nghost", nghost_d, nghost),
+        )
+        for func, device_expr, host_expr in counters:
+            if self.target.is_gpu():
+                self.print(f"{attr}int {func}() const {{")
+                self.print("    #ifdef __CUDA_ARCH__")
+                self.print(f"        return {device_expr};")
+                self.print("    #else")
+                self.print(f"        return {host_expr};")
+                self.print("    #endif")
+                self.print("}")
+                self.print("")
+            else:
+                # The one-line host variant is not followed by a blank line.
+                self.print(f"int {func}() const {{return {host_expr};}}")
+
+        self.print(f"{attr}int getInvalidIdx(){{return -1;}}")
+        self.print("")
+
+        self.print(f"{attr}pairs::id_t getInvalidUid(){{return 0;}}")
+        self.print("")
+
+        # (function name, first index searched, exclusive upper bound)
+        lookups = (
+            ("uidToIdx", "0", "size()"),
+            ("uidToIdxLocal", "0", "nlocal()"),
+            ("uidToIdxGhost", "nlocal()", "size()"),
+        )
+        for func, first, bound in lookups:
+            self.print(f"{attr}int {func}(pairs::id_t uid){{")
+            self.print("    int idx = getInvalidIdx();")
+            self.print(f"    for(int i={first}; i<{bound}; ++i){{")
+            self.print("        if (getUid(i) == uid){")
+            self.print("            idx = i;")
+            self.print("            break;")
+            self.print("        }")
+            self.print("    }")
+            self.print("    return idx;")
+            self.print("}")
+            self.print("")
diff --git a/src/pairs/code_gen/cgen.py b/src/pairs/code_gen/cgen.py
index a053d942e72fabf44dba726c8bd6c4e93a919f5e..76e22283330f531eed2f449a30eee798bf62c00e 100644
--- a/src/pairs/code_gen/cgen.py
+++ b/src/pairs/code_gen/cgen.py
@@ -9,7 +9,7 @@ from pairs.ir.cast import Cast
 from pairs.ir.contexts import Contexts
 from pairs.ir.declaration import Decl
 from pairs.ir.scalars import ScalarOp
-from pairs.ir.device import CopyArray, CopyContactProperty, CopyProperty, CopyVar, DeviceStaticRef, HostRef
+from pairs.ir.device import CopyArray, CopyContactProperty, CopyProperty, CopyFeatureProperty, CopyVar, DeviceStaticRef, HostRef
 from pairs.ir.features import FeatureProperty, FeaturePropertyAccess, RegisterFeatureProperty
 from pairs.ir.functions import Call
 from pairs.ir.kernel import KernelLaunch
@@ -26,12 +26,15 @@ from pairs.ir.properties import Property, PropertyAccess, RegisterProperty, Real
 from pairs.ir.select import Select
 from pairs.ir.sizeof import Sizeof
 from pairs.ir.types import Types
-from pairs.ir.utils import Print
+from pairs.ir.print import Print, PrintCode
 from pairs.ir.variables import Var, DeclareVariable, Deref
+from pairs.ir.parameters import Parameter
 from pairs.ir.vectors import Vector, VectorAccess, VectorOp, ZeroVector
+from pairs.ir.ret import Return
 from pairs.sim.domain_partitioners import DomainPartitioners
 from pairs.sim.timestep import Timestep
 from pairs.code_gen.printer import Printer
+from pairs.code_gen.accessor import PairsAcessor
 
 
 class CGen:
@@ -42,6 +45,7 @@ class CGen:
         self.target = None
         self.print = None
         self.kernel_context = False
+        self.generate_full_object_names = False
         self.ref = ref
         self.debug = debug
 
@@ -53,15 +57,104 @@ class CGen:
 
     def real_type(self):
         return Types.c_keyword(self.sim, Types.Real)
+    
+    # def generate_cmake_config_file(self):
+    #     self.print = Printer("pairs_cmake_params.txt")
+    #     self.print.start()
+    #     self.print(f"PAIRS_TARGET={self.ref}")
+    #     self.print(f"GENERATE_WHOLE_PROGRAM={'ON' if self.sim._generate_whole_program else 'OFF'}")
+    #     self.print(f"USE_WALBERLA={'ON' if self.sim._partitioner == DomainPartitioners.BlockForest else 'OFF'}")
+    #     # self.print(f"COMPILE_CUDA={'ON' if self.target.is_gpu() else 'OFF'}")
+    #     self.print.end()
+
+    def generate_object_reference(self, obj, device=False, index=None):
+        """Return the expression used in generated code to refer to `obj` (its device copy when `device` is True)."""
+        if device and (not self.target.is_gpu() or not obj.device_flag):
+            # No device copy exists for this object on this target; ideally this should never be called
+            return "nullptr"
+        # Device copies carry a "_d" suffix; non-scalar objects get "_<index>" appended when an index is given.
+        name = obj.name() if not device else f"{obj.name()}_d"
+        t = obj.type()
+        if not Types.is_scalar(t) and index is not None:
+            name += f"_{index}"
+        # Variables: with full object names, non-temporaries resolve to pobj members (rv_<name> for the device copy).
+        if isinstance(obj, Var):
+            if self.generate_full_object_names:
+                if not obj.temporary():
+                    if obj.device_flag and self.target.is_gpu() and device:
+                        return f"pobj->rv_{obj.name()}"
+                    else:
+                        return f"pobj->{name}"
+            return name
+        # Device feature properties are referenced by their bare name, never through pobj.
+        if isinstance(obj, FeatureProperty) and device and obj.device_flag:
+            return name
+        # Same for static device arrays; non-static ones fall through to the generic case below.
+        if isinstance(obj, Array) and device and obj.device_flag:
+            if obj.is_static():
+                return name
+
+        # Everything else is a pobj member when full object names are requested.
+        if self.generate_full_object_names:
+            return f"pobj->{name}"
+        else:
+            return name
 
-    def generate_program(self, ast_node):
-        ext = ".cu" if self.target.is_gpu() else ".cpp"
-        self.print = Printer(self.ref + ext)
+    def generate_object_address(self, obj, device=False, index=None):
+        """Return "&(<reference>)" for `obj`, or "nullptr" when a device copy is requested but unavailable."""
+        # A device address is only produced on GPU targets for objects whose device_flag is set.
+        if device and not (self.target.is_gpu() and obj.device_flag):
+            return "nullptr"
+        return f"&({self.generate_object_reference(obj, device, index)})"
+
+    def generate_interfaces(self):
+        """Write the property-accessor header: a host namespace, plus a CUDA namespace on GPU targets."""
+        #self.print = Printer(f"interfaces/{self.ref}.hpp")
+        self.print = Printer("internal_interfaces/last_generated.hpp")
         self.print.start()
+        self.print("#pragma once")
+        self.generate_interface_namespace('pairs_host_interface')
 
         if self.target.is_gpu():
-            self.print("#define PAIRS_TARGET_CUDA")
+            self.generate_interface_namespace('pairs_cuda_interface', "__inline__ __device__")
+        self.print.end()
+
+    def generate_interface_namespace(self, namespace, prefix=None):
+        """Print namespace `namespace` holding one get_<property> accessor function per simulation property."""
+        self.print("")
+        self.print(f"namespace {namespace} {{")
+        self.print("")
 
+        # One getter per property; `prefix` (if given) is prepended to the function signature.
+        for prop in self.sim.properties.all():
+            prop_name = prop.name()
+            t = prop.type()
+            tkw = Types.c_keyword(self.sim, t)
+            func_decl = "" if prefix is None else f"{prefix} "
+            if Types.is_scalar(t):
+                func_decl += f"{tkw} get_{prop_name}({tkw} *{prop_name}, int i) {{ return {prop_name}[i]; }}"
+            else:
+                nelems = Types.number_of_elements(self.sim, t)
+                func_decl += f"{tkw} get_{prop_name}({tkw} *{prop_name}, int i, int j, int capacity) {{ return {prop_name}["
+                # AoS: the nelems components of entry i are stored contiguously.
+                if prop.layout() == Layouts.AoS:
+                    func_decl += f"i * {nelems} + j"
+                else:
+                    # Any other layout: component j occupies its own block of `capacity` entries.
+                    func_decl += f"j * capacity + i"
+                func_decl += "]; }"
+
+            self.print(func_decl)
+
+        self.print("")
+        self.print("}")
+
+    def generate_preamble(self):
+        # self.print(f"#define APPLICATION_REFERENCE \"{self.ref}\"")
+
+        if self.target.is_gpu():
+            self.print("#include <math_constants.h>")
+             
         if self.target.is_openmp():
             self.print("#define PAIRS_TARGET_OPENMP")
             self.print("#include <omp.h>")
@@ -72,135 +165,346 @@ class CGen:
         self.print("#include <stdio.h>")
         self.print("#include <stdlib.h>")
         self.print("//---")
-        self.print("#include \"runtime/likwid-marker.h\"")
-        self.print("#include \"runtime/copper_fcc_lattice.hpp\"")
-        self.print("#include \"runtime/dem_sc_grid.hpp\"")
-        self.print("#include \"runtime/pairs.hpp\"")
-        self.print("#include \"runtime/read_from_file.hpp\"")
-        self.print("#include \"runtime/stats.hpp\"")
-        self.print("#include \"runtime/timing.hpp\"")
-        self.print("#include \"runtime/thermo.hpp\"")
-        self.print("#include \"runtime/vtk.hpp\"")
-
-        #if self.target.is_gpu():
-        #    self.print("#include \"runtime/devices/cuda.hpp\"")
-        #else:
-        #    self.print("#include \"runtime/devices/dummy.hpp\"")
-
+        self.print("#include \"likwid-marker.h\"")
+        self.print("#include \"copper_fcc_lattice.hpp\"")
+        self.print("#include \"create_body.hpp\"")
+        self.print("#include \"dem_sc_grid.hpp\"")
+        self.print("#include \"pairs.hpp\"")
+        self.print("#include \"read_from_file.hpp\"")
+        self.print("#include \"stats.hpp\"")
+        self.print("#include \"timing.hpp\"")
+        self.print("#include \"thermo.hpp\"")
+        self.print("#include \"vtk.hpp\"")
         self.print("")
         self.print("using namespace pairs;")
         self.print("")
 
+    def generate_module_header(self, module, definition=True):
+        """Print the C++ signature of `module`, ending in "{" for a definition or ";" for a declaration."""
+        module_params = []
+        # Non-interface modules receive the runtime and the object struct as leading arguments.
+        if not module.interface:
+            module_params += ["PairsRuntime *pairs_runtime", "struct PairsObjects *pobj"]
+        # "initialize" (when it creates the domain) and "set_domain" also take the command-line arguments.
+        if module.name=="initialize" and self.sim.create_domain_at_initialization:
+            module_params += ["int argc", "char **argv"]
+        if module.name=="set_domain":
+            module_params += ["int argc", "char **argv"]
+        # The module's own parameters follow, each printed as "<C type> <name>".
+        module_params += [f"{Types.c_keyword(self.sim, param.type())} {param.name()}" for param in module.parameters()]
+
+        print_params = ", ".join(module_params)
+        ending = "{" if definition else ";"
+        tkw = Types.c_keyword(self.sim, module.return_type)
+        self.print(f"{tkw} {module.name}({print_params}){ending}")
+
+    def generate_module_decls(self):
+        """Print forward declarations of all regular and user-defined modules inside namespace pairs::internal."""
+        self.print("")
+        self.print("namespace pairs::internal {")
+        self.print.add_indent(4)
+        # All modules except the interface ones are declared in the pairs::internal scope
+        for module in self.sim.modules() + self.sim.udf_modules():
+            assert not module.interface
+            self.generate_module_header(module, definition=False)
+
+        self.print.add_indent(-4)
+        self.print("}")
+        self.print("")
+
+    def generate_pairs_object_structure(self):
+        """Print the device __constant__ declarations (GPU targets only) followed by the PairsObjects struct."""
+        self.print("")
+        # Without whole-program generation the constants below are only declared ("extern").
+        externkw = "" if self.sim._generate_whole_program else "extern "
         if self.target.is_gpu():
             for array in self.sim.arrays.statics():
                 if array.device_flag:
                     t = array.type()
                     tkw = Types.c_keyword(self.sim, t)
                     size = self.generate_expression(ScalarOp.inline(array.alloc_size()))
-                    self.print(f"__constant__ {tkw} d_{array.name()}[{size}];")
+                    self.print(f"{externkw}__constant__ {tkw} {array.name()}_d[{size}];")
 
             for feature_prop in self.sim.feature_properties:
                 if feature_prop.device_flag:
                     t = feature_prop.type()
                     tkw = Types.c_keyword(self.sim, t)
                     size = feature_prop.array_size()
-                    self.print(f"__constant__ {tkw} d_{feature_prop.name()}[{size}];")
+                    self.print(f"{externkw}__constant__ {tkw} {feature_prop.name()}_d[{size}];")
 
         self.print("")
+        self.print("struct PairsObjects {")
+        self.print.add_indent(4)
+        self.print("// Arrays")
+        for a in self.sim.arrays.all():
+            ptr = a.name()
+            tkw = Types.c_keyword(self.sim, a.type())
+            # Static arrays are embedded by value; all other arrays are pointer members.
+            if a.is_static():
+                size = self.generate_expression(ScalarOp.inline(a.alloc_size()))
+                self.print(f"{tkw} {ptr}[{size}];")
+            else:
+                self.print(f"{tkw} *{ptr};")
+            # Static device arrays get no "_d" member: they are the __constant__ symbols printed above.
+            if self.target.is_gpu() and a.device_flag:
+                if a.is_static():
+                    continue
+                else:
+                    self.print(f"{tkw} *{ptr}_d;")
+
+        self.print("// Properties")
+        for p in self.sim.properties:
+            ptr = p.name()
+            tkw = Types.c_keyword(self.sim, p.type())
+            self.print(f"{tkw} *{ptr};")
+
+            if self.target.is_gpu() and p.device_flag:
+                self.print(f"{tkw} *{ptr}_d;")
+
+        self.print("// Contact properties")
+        for cp in self.sim.contact_properties:
+            ptr = cp.name()
+            tkw = Types.c_keyword(self.sim, cp.type())
+            self.print(f"{tkw} *{ptr};")
+
+            if self.target.is_gpu() and cp.device_flag:
+                self.print(f"{tkw} *{ptr}_d;")
+
+        self.print("// Feature properties")
+        for fp in self.sim.feature_properties:
+            ptr = fp.name()
+            array_size = fp.array_size()
+            tkw = Types.c_keyword(self.sim, fp.type())
+            self.print(f"{tkw} {ptr}[{array_size}];")
+
+        self.print("// Variables")
+        for v in self.sim.vars.all():
+            vname = v.name()
+            tkw = Types.c_keyword(self.sim, v.type())
+            self.print(f"{tkw} {vname};")
+            # Device-flagged variables additionally get a RuntimeVar wrapper member named rv_<name>.
+            if self.target.is_gpu() and v.device_flag:
+                self.print(f"RuntimeVar<{tkw}> rv_{vname};")
+
+        self.print.add_indent(-4)
+        self.print("};")
+        self.print("")
+
+    def generate_program(self, ast_node):
+        """Generate the interface header and one source file containing all modules plus main; `ast_node` is unused here."""
+        self.generate_interfaces()
+        ext = ".cu" if self.target.is_gpu() else ".cpp"
+        self.print = Printer(self.ref + ext)
+        self.print.start()
+        self.generate_preamble()
+        self.generate_pairs_object_structure()
+        self.generate_module_decls()
+        # Kernels and every module except main are defined inside namespace pairs::internal.
+        self.print("namespace pairs::internal {")
+        self.print.add_indent(4)
 
         for kernel in self.sim.kernels():
             self.generate_kernel(kernel)
 
         for module in self.sim.modules():
+            if module.name!='main':
+                self.generate_module(module)
+
+        self.print.add_indent(-4)
+        self.print("}")
+        # main is emitted last, after the namespace has been closed.
+        for module in self.sim.modules():
+            if module.name=='main':
+                self.generate_main(module)
+        self.print.end()
+
+    def generate_library(self):
+        """Generate the interface header, the <ref>.cu/.cpp module source and the <ref>.hpp library header."""
+        self.generate_interfaces()
+        # Generate CUDA/CPP file with modules
+        ext = ".cu" if self.target.is_gpu() else ".cpp"
+        self.print = Printer(self.ref + ext)
+        self.print.start()
+        self.generate_preamble()
+        self.print(f"#include \"{self.ref}.hpp\"")
+        self.print("")
+        # Definitions of the __constant__ device symbols for static arrays and feature properties.
+        if self.target.is_gpu():
+            for array in self.sim.arrays.statics():
+                if array.device_flag:
+                    t = array.type()
+                    tkw = Types.c_keyword(self.sim, t)
+                    size = self.generate_expression(ScalarOp.inline(array.alloc_size()))
+                    self.print(f"__constant__ {tkw} {array.name()}_d[{size}];")
+
+            for feature_prop in self.sim.feature_properties:
+                if feature_prop.device_flag:
+                    t = feature_prop.type()
+                    tkw = Types.c_keyword(self.sim, t)
+                    size = feature_prop.array_size()
+                    self.print(f"__constant__ {tkw} {feature_prop.name()}_d[{size}];")
+
+        self.print("")
+        self.print("namespace pairs::internal {")
+        self.print.add_indent(4)
+
+        for kernel in self.sim.kernels():
+            self.generate_kernel(kernel)
+
+        # All modules except the interface ones are defined in the pairs::internal scope
+        for module in self.sim.modules() + self.sim.udf_modules():
+            assert not module.interface
             self.generate_module(module)
 
+        self.print.add_indent(-4)
+        self.print("}")
+
         self.print.end()
 
-    def generate_module(self, module):
-        if module.name == 'main':
-            ndims = module.sim.ndims()
-            nprops = module.sim.properties.nprops()
-            ncontactprops = module.sim.contact_properties.nprops()
-            narrays = module.sim.arrays.narrays()
-            part = DomainPartitioners.c_keyword(module.sim.partitioner())
+        # Generate library header
+        self.print = Printer(self.ref + ".hpp")
+        self.print.start()
+        self.print("#pragma once")
+
+        self.generate_preamble()
+        self.generate_pairs_object_structure()
+        self.generate_module_decls()
+        # From here on object references are generated with the pobj-> prefix (see generate_object_reference).
+        self.generate_full_object_names = True
+        self.print("class PairsSimulation {")
+        self.print("private:")
+        self.print("    PairsRuntime *pairs_runtime;")
+        self.print("    struct PairsObjects *pobj;")
+        self.print("    friend class PairsAccessor;")
+        self.print("")
+        self.print("public:")
+        self.print.add_indent(4)
 
-            self.print("int main(int argc, char **argv) {")
-            self.print(f"    PairsSimulation *pairs = new PairsSimulation({nprops}, {ncontactprops}, {narrays}, {part});")
+        self.print("PairsRuntime* getPairsRuntime() {")
+        self.print("    return pairs_runtime;")
+        self.print("}")
 
-            if module.sim._enable_profiler:
-                self.print("    LIKWID_MARKER_INIT;")
+        # Only interface modules are generated in the PairsSimulation class
+        for module in self.sim.interface_modules():
+            self.generate_module(module)
 
-            self.generate_statement(module.block)
+        self.print.add_indent(-4)
+        self.print("};")
 
-            if module.sim._enable_profiler:
-                self.print("    LIKWID_MARKER_CLOSE;")
+        PairsAcessor(self).generate()
+
+        self.print.end()
+        self.generate_full_object_names = False
 
-            self.print("    pairs::print_timers(pairs);")
-            self.print("    pairs::print_stats(pairs, nlocal, nghost);")
-            self.print("    delete pairs;")
-            self.print("    return 0;")
-            self.print("}")
+    def generate_module_declerations(self, module):
+        """Print local C++ declarations binding the module's variables, arrays and properties to members of pobj."""
+        device_cond = module.run_on_device and self.target.is_gpu()
 
-        else:
-            module_params = "PairsSimulation *pairs"
-            for var in module.read_only_variables():
-                type_kw = Types.c_keyword(self.sim, var.type())
-                decl = f"{type_kw} {var.name()}"
-                module_params += f", {decl}"
-
-            for var in module.write_variables():
-                type_kw = Types.c_keyword(self.sim, var.type())
-                decl = f"{type_kw} *{var.name()}"
-                module_params += f", {decl}"
-
-            for array in module.arrays():
-                type_kw = Types.c_keyword(self.sim, array.type())
-                decl = f"{type_kw} *{array.name()}"
-                module_params += f", {decl}"
-
-                if array in module.host_references():
-                    decl = f"{type_kw} *h_{array.name()}"
-                    module_params += f", {decl}"
-
-            for prop in module.properties():
-                type_kw = Types.c_keyword(self.sim, prop.type())
-                decl = f"{type_kw} *{prop.name()}"
-                module_params += f", {decl}"
-
-                if prop in module.host_references():
-                    decl = f"{type_kw} *h_{prop.name()}"
-                    module_params += f", {decl}"
-
-            for contact_prop in module.contact_properties():
-                type_kw = Types.c_keyword(self.sim, contact_prop.type())
-                decl = f"{type_kw} *{contact_prop.name()}"
-                module_params += f", {decl}"
-
-                if contact_prop in module.host_references():
-                    decl = f"{type_kw} *h_{contact_prop.name()}"
-                    module_params += f", {decl}"
-
-            for feature_prop in module.feature_properties():
-                type_kw = Types.c_keyword(self.sim, feature_prop.type())
-                decl = f"{type_kw} *{feature_prop.name()}"
-                module_params += f", {decl}"
-
-                if feature_prop in module.host_references():
-                    decl = f"{type_kw} *h_{feature_prop.name()}"
-                    module_params += f", {decl}"
-
-            self.print(f"void {module.name}({module_params}) {{")
-
-            if self.debug:
-                self.print.add_indent(4)
-                self.print(f"PAIRS_DEBUG(\"{module.name}\\n\");")
-                self.print.add_indent(-4)
-
-            self.generate_statement(module.block)
-            self.print("}")
+        for var in module.read_only_variables():
+            type_kw = Types.c_keyword(self.sim, var.type())
+            self.print(f"{type_kw} {var.name()} = pobj->{var.name()};")
+
+        for var in module.write_variables():
+            type_kw = Types.c_keyword(self.sim, var.type())
+            # Written variables: device pointer of the rv_ wrapper on device, a by-value copy if force_read, else the host address.
+            if device_cond and var.device_flag:
+                self.print(f"{type_kw} *{var.name()} = pobj->rv_{var.name()}.getDevicePointer();")
+            elif var.force_read:
+                self.print(f"{type_kw} {var.name()} = pobj->{var.name()};")
+            else:
+                self.print(f"{type_kw} *{var.name()} = &(pobj->{var.name()});")
+
+        for array in module.arrays():
+            type_kw = Types.c_keyword(self.sim, array.type())
+            name = array.name() if not device_cond else f"{array.name()}_d"
+            # Equivalent to "not (static and device_cond)": static arrays get no local alias in device modules.
+            if not array.is_static() or (array.is_static() and not device_cond):
+                self.print(f"{type_kw} *{array.name()} = pobj->{name};")
+            if array in module.host_references():
+                self.print(f"{type_kw} *{array.name()}_h = pobj->{array.name()};")
+
+        for prop in module.properties():
+            type_kw = Types.c_keyword(self.sim, prop.type())
+            name = prop.name() if not device_cond else f"{prop.name()}_d"
+            self.print(f"{type_kw} *{prop.name()} = pobj->{name};")
+
+            if prop in module.host_references():
+                self.print(f"{type_kw} *{prop.name()}_h = pobj->{prop.name()};")
+
+        for contact_prop in module.contact_properties():
+            type_kw = Types.c_keyword(self.sim, contact_prop.type())
+            name = contact_prop.name() if not device_cond else f"{contact_prop.name()}_d"
+            self.print(f"{type_kw} *{contact_prop.name()} = pobj->{name};")
+
+            if contact_prop in module.host_references():
+                self.print(f"{type_kw} *{contact_prop.name()}_h = pobj->{contact_prop.name()};")
+
+        for feature_prop in module.feature_properties():
+            type_kw = Types.c_keyword(self.sim, feature_prop.type())
+            name = feature_prop.name() if not device_cond else f"{feature_prop.name()}_d"
+            # Device-flagged feature properties get no local alias in device modules; the continue also skips the _h alias below.
+            if feature_prop.device_flag and device_cond:
+                # self.print(f"{type_kw} *{feature_prop.name()} = {self.generate_object_reference(feature_prop, device=device_cond)};")
+                continue
+            else:
+                self.print(f"{type_kw} *{feature_prop.name()} = pobj->{name};")
+
+            if feature_prop in module.host_references():
+                self.print(f"{type_kw} *{feature_prop.name()}_h = pobj->{feature_prop.name()};")
+
+    def generate_main(self, module):
+        """Print the C++ main() function for the module named 'main', creating and destroying the runtime and pobj."""
+        assert module.name=='main'
+
+        nprops = module.sim.properties.nprops()
+        ncontactprops = module.sim.contact_properties.nprops()
+        narrays = module.sim.arrays.narrays()
+        part = DomainPartitioners.c_keyword(module.sim.partitioner())
+        # Inside main, simulation objects are reached through pobj, so full object names are required.
+        self.generate_full_object_names = True
+        self.print("int main(int argc, char **argv) {")
+        self.print(f"    PairsRuntime *pairs_runtime = new PairsRuntime({nprops}, {ncontactprops}, {narrays}, {part});")
+        self.print("    struct PairsObjects *pobj = new PairsObjects();")
+
+        if module.sim._enable_profiler:
+            self.print("    LIKWID_MARKER_INIT;")
+
+        self.generate_statement(module.block)
+
+        if module.sim._enable_profiler:
+            self.print("    LIKWID_MARKER_CLOSE;")
+
+        self.print("    pairs::print_timers(pairs_runtime);")
+        self.print("    pairs::print_stats(pairs_runtime, pobj->nlocal, pobj->nghost);")
+        self.print("    delete pobj;")
+        self.print("    delete pairs_runtime;")
+        self.print("    return 0;")
+        self.print("}")
+        self.generate_full_object_names = False
+
+    def generate_module(self, module):
+        """Print the full C++ definition of `module`: signature, optional debug print, local declarations and body."""
+        self.generate_module_header(module, definition=True)
+        self.print.add_indent(4)
+        if self.debug:
+            self.print(f"PAIRS_DEBUG(\"\\n{module.name}\\n\");")
+        # Non-interface modules first bind local aliases to the pobj members they use.
+        if not module.interface:
+            self.generate_module_declerations(module)
+        # NOTE(review): indentation is reset before the body; presumably generate_statement indents blocks itself - confirm.
+        self.print.add_indent(-4)
+        self.generate_statement(module.block)
+        self.print("}")
+        self.print("")
 
     def generate_kernel(self, kernel):
         kernel_params = "int range_start"
+        has_resizes = False
+        for param in kernel.parameters():
+            type_kw = Types.c_keyword(self.sim, param.type())
+            decl = f"{type_kw} {param.name()}"
+            kernel_params += f", {decl}"
+
         for var in kernel.read_only_variables():
             type_kw = Types.c_keyword(self.sim, var.type())
             decl = f"{type_kw} {var.name()}"
@@ -211,10 +515,19 @@ class CGen:
             decl = f"{type_kw} *{var.name()}"
             kernel_params += f", {decl}"
 
+        for it in kernel.iters():
+            type_kw = Types.c_keyword(self.sim, it.type())
+            decl = f"{type_kw} {it.name()}"
+            kernel_params += f", {decl}"
+
         for array in kernel.arrays():
+            if array.is_static():
+                continue
             type_kw = Types.c_keyword(self.sim, array.type())
             decl = f"{type_kw} *{array.name()}"
             kernel_params += f", {decl}"
+            if array.name() == "resizes":
+                has_resizes = True
 
         for prop in kernel.properties():
             type_kw = Types.c_keyword(self.sim, prop.type())
@@ -227,6 +540,8 @@ class CGen:
             kernel_params += f", {decl}"
 
         for feature_prop in kernel.feature_properties():
+            if feature_prop.device_flag:
+                continue
             type_kw = Types.c_keyword(self.sim, feature_prop.type())
             decl = f"{type_kw} *{feature_prop.name()}"
             kernel_params += f", {decl}"
@@ -245,7 +560,9 @@ class CGen:
         self.print(f"    const int {kernel.iterator.name()} = blockIdx.x * blockDim.x + threadIdx.x + range_start;")
         self.print.add_indent(4)
         self.kernel_context = True
+
         self.generate_statement(kernel.block)
+
         self.kernel_context = False
         self.print.add_indent(-4)
         self.print("}")
@@ -255,6 +572,7 @@ class CGen:
             t = ast_node.array.type()
             tkw = Types.c_keyword(self.sim, t)
             size = self.generate_expression(ScalarOp.inline(ast_node.array.alloc_size()))
+
             if ast_node.array.init_value is not None:
                 v_str = str(ast_node.array.init_value)
                 if t == Types.Int64:
@@ -262,10 +580,8 @@ class CGen:
                 if t == Types.UInt64:
                     v_str += "ULL"
 
-                init_string = v_str + (f", {v_str}" * (size - 1))
-                self.print(f"{tkw} {ast_node.array.name()}[{size}] = {{{init_string}}};")
-            else:
-                self.print(f"{tkw} {ast_node.array.name()}[{size}];")
+                for i in range(size):
+                    self.print(f"{ast_node.array.name()}[{i}] = {v_str};")
 
         if isinstance(ast_node, Assign):
             if not Types.is_scalar(ast_node._dest.type()):
@@ -287,7 +603,10 @@ class CGen:
             if ast_node.check_for_resize():
                 resize = self.generate_expression(ast_node.resize)
                 capacity = self.generate_expression(ast_node.capacity)
+                # self.print(f"printf (\" %d -- before AtomicInc: nsend = %d -- send_capacity = %d -- resizes[0] = %d\\n\", {Printer.line_id}, {elem}, {capacity}, {resize});")
                 self.print(f"pairs::{prefix}atomic_add_resize_check(&({elem}), {value}, &({resize}), {capacity});")
+                # self.print(f"printf (\" %d -- after AtomicInc: nsend = %d -- send_capacity = %d -- resizes[0] = %d\\n\", {Printer.line_id}, {elem}, {capacity}, {resize});")
+
             else:
                 self.print(f"pairs::{prefix}atomic_add(&({elem}), {value});")
 
@@ -484,10 +803,12 @@ class CGen:
             size = self.generate_expression(ast_node.size())
 
             if size is not None:
-                self.print(f"pairs->copyArrayTo{ctx_suffix}({array_id}, {action}, {size}); // {array_name}")
+                self.print(f"pairs_runtime->copyArrayTo{ctx_suffix}({array_id}, {action}, {size}); // {array_name}")
 
             else:
-                self.print(f"pairs->copyArrayTo{ctx_suffix}({array_id}, {action}); // {array_name}")
+                # self.print(f"std::cout<< \"{Printer.line_id} -- before {array_name} copyArrayTo{ctx_suffix}({action}) === \" <<  pobj->{array_name}[0]  << \" \" << pobj->{array_name}[1]  << \" \" << pobj->{array_name}[2]  << std::endl;")
+                self.print(f"pairs_runtime->copyArrayTo{ctx_suffix}({array_id}, {action}); // {array_name}")
+                # self.print(f"std::cout<< \"{Printer.line_id} -- after {array_name} copyArrayTo{ctx_suffix}({action}) === \" <<  pobj->{array_name}[0]  << \" \" << pobj->{array_name}[1]  << \" \" << pobj->{array_name}[2]  << std::endl;")
 
         if isinstance(ast_node, CopyContactProperty):
             prop_id = ast_node.contact_prop().id()
@@ -495,7 +816,7 @@ class CGen:
             action = Actions.c_keyword(ast_node.action())
             ctx_suffix = "Device" if ast_node.context() == Contexts.Device else "Host"
             size = self.generate_expression(ast_node.contact_prop().copy_size())
-            self.print(f"pairs->copyContactPropertyTo{ctx_suffix}({prop_id}, {action}, {size}); // {prop_name}")
+            self.print(f"pairs_runtime->copyContactPropertyTo{ctx_suffix}({prop_id}, {action}, {size}); // {prop_name}")
 
         if isinstance(ast_node, CopyProperty):
             prop_id = ast_node.prop().id()
@@ -503,12 +824,20 @@ class CGen:
             action = Actions.c_keyword(ast_node.action())
             ctx_suffix = "Device" if ast_node.context() == Contexts.Device else "Host"
             size = self.generate_expression(ast_node.prop().copy_size())
-            self.print(f"pairs->copyPropertyTo{ctx_suffix}({prop_id}, {action}, {size}); // {prop_name}")
+            self.print(f"pairs_runtime->copyPropertyTo{ctx_suffix}({prop_id}, {action}, {size}); // {prop_name}")
+
+        if isinstance(ast_node, CopyFeatureProperty):
+            prop_id = ast_node.prop().id()
+            prop_name = ast_node.prop().name()
+            if ast_node.context() == Contexts.Device:
+                assert ast_node.action()==Actions.ReadOnly, "Feature properties can only be read from device."
+                self.print(f"pairs_runtime->copyFeaturePropertyToDevice({prop_id}); // {prop_name}")
 
         if isinstance(ast_node, CopyVar):
             var_name = ast_node.variable().name()
             ctx_suffix = "Device" if ast_node.context() == Contexts.Device else "Host"
-            self.print(f"rv_{var_name}.copyTo{ctx_suffix}();")
+            ref = self.generate_object_reference(ast_node.variable(), device=True)
+            self.print(f"{ref}.copyTo{ctx_suffix}();")
 
         if isinstance(ast_node, For):
             iterator = self.generate_expression(ast_node.iterator)
@@ -531,24 +860,32 @@ class CGen:
             if ast_node.decl:
                 self.print(f"{tkw} *{array_name} = ({tkw} *) malloc({size});")
                 if self.target.is_gpu() and ast_node.array.device_flag:
-                    self.print(f"{tkw} *d_{array_name} = ({tkw} *) pairs::device_alloc({size});")
+                    self.print(f"{tkw} *{array_name}_d = ({tkw} *) pairs::device_alloc({size});")
             else:
                 self.print(f"{array_name} = ({tkw} *) malloc({size});")
                 if self.target.is_gpu() and ast_node.array.device_flag:
-                    self.print(f"d_{array_name} = ({tkw} *) pairs::device_alloc({size});")
+                    self.print(f"{array_name}_d = ({tkw} *) pairs::device_alloc({size});")
 
         if isinstance(ast_node, KernelLaunch):
             range_start = self.generate_expression(ScalarOp.inline(ast_node.min))
             kernel = ast_node.kernel
             kernel_params = f"{range_start}"
 
+            for param in kernel.parameters():
+                kernel_params += f", {param.name()}"
+
             for var in kernel.read_only_variables():
                 kernel_params += f", {var.name()}"
 
             for var in kernel.write_variables():
                 kernel_params += f", {var.name()}"
 
+            for it in kernel.iters():
+                kernel_params += f", {it.name()}"
+
             for array in kernel.arrays():
+                if array.is_static():
+                    continue
                 kernel_params += f", {array.name()}"
 
             for prop in kernel.properties():
@@ -558,6 +895,8 @@ class CGen:
                 kernel_params += f", {contact_prop.name()}"
 
             for feature_prop in kernel.feature_properties():
+                if feature_prop.device_flag:
+                    continue     
                 kernel_params += f", {feature_prop.name()}"
 
             for array_access in kernel.array_accesses():
@@ -571,177 +910,152 @@ class CGen:
             self.print(f"if({nblocks} > 0 && {threads_per_block} > 0) {{")
             self.print.add_indent(4)
             self.print(f"{kernel.name}<<<{nblocks}, {threads_per_block}>>>({kernel_params});")
-            self.print("pairs->sync();")
+            self.print("pairs_runtime->sync();")
             self.print.add_indent(-4)
             self.print("}")
 
         if isinstance(ast_node, ModuleCall):
-            module = ast_node.module
-            module_params = "pairs"
-            device_cond = module.run_on_device and self.target.is_gpu()
-
-            for var in module.read_only_variables():
-                decl = var.name()
-                module_params += f", {decl}"
-
-            for var in module.write_variables():
-                decl = f"rv_{var.name()}.getDevicePointer()" if device_cond and var.device_flag else f"&{var.name()}"
-                module_params += f", {decl}"
-
-            for array in module.arrays():
-                decl = f"d_{array.name()}" if device_cond else array.name()
-                module_params += decl if len(module_params) <= 0 else f", {decl}"
-                if array in module.host_references():
-                    decl = array.name()
-                    module_params += f", {decl}"
-
-            for prop in module.properties():
-                decl = f"d_{prop.name()}" if device_cond else prop.name()
-                module_params += f", {decl}"
-                if prop in module.host_references():
-                    decl = prop.name()
-                    module_params += f", {decl}"
-
-            for contact_prop in module.contact_properties():
-                decl = f"d_{contact_prop.name()}" if device_cond else contact_prop.name()
-                module_params += f", {decl}"
-                if contact_prop in module.host_references():
-                    decl = contact_prop.name()
-                    module_params += f", {decl}"
-
-            for feature_prop in module.feature_properties():
-                decl = f"d_{feature_prop.name()}" if device_cond else feature_prop.name()
-                module_params += f", {decl}"
-                if feature_prop in module.host_references():
-                    decl = feature_prop.name()
-                    module_params += f", {decl}"
-
-            self.print(f"{module.name}({module_params});")
+            module_params = ["pairs_runtime", "pobj"]
+
+            if ast_node.module.name=="init_domain":
+                module_params += ["argc", "argv"]
+
+            module_params += [f"{param.name()}" for param in ast_node.module.parameters()]
+
+            print_params = ", ".join(module_params)
+            self.print(f"pairs::internal::{ast_node.module.name}({print_params});")
 
         if isinstance(ast_node, Print):
-            self.print(f"PAIRS_DEBUG(\"{ast_node.string}\\n\");")
+            args = ast_node.args
+            exprs = [self.generate_expression(arg) for arg in args]
+            toPrint = "PAIRS_DEBUG(\""
+            for arg in args:
+                if Types.is_real(arg.type()):
+                    format = "%f "
+                elif Types.is_integer(arg.type()):
+                    format = "%d "
+                else:
+                    format = "%s "
+                toPrint += format
+
+            toPrint = toPrint + "\\n\", " + ", ".join(map(str, exprs)) + ");"
+            self.print(toPrint)
+
+        if isinstance(ast_node, PrintCode):
+            toPrint = self.generate_expression(ast_node.arg)
+            self.print(toPrint[1:-1])
 
         if isinstance(ast_node, Realloc):
             tkw = Types.c_keyword(self.sim, ast_node.array.type())
             size = self.generate_expression(ast_node.size)
             array_name = ast_node.array.name()
-            self.print(f"{array_name} = ({tkw} *) realloc({array_name}, {size});")
+            ptr = self.generate_object_reference(ast_node)
+            self.print(f"{ptr} = ({tkw} *) realloc({ptr}, {size});")
+
             if self.target.is_gpu() and ast_node.array.device_flag:
-                self.print(f"d_{array_name} = ({tkw} *) pairs::device_realloc(d_{array_name}, {size});")
+                d_ptr = self.generate_object_reference(ast_node, device=True)
+                self.print(f"{d_ptr} = ({tkw} *) pairs::device_realloc({d_ptr}, {size});")
 
         if isinstance(ast_node, RegisterArray):
             a = ast_node.array()
-            ptr = a.name()
-            d_ptr = f"d_{ptr}" if self.target.is_gpu() and a.device_flag else "nullptr"
             tkw = Types.c_keyword(self.sim, a.type())
             size = self.generate_expression(ast_node.size())
 
             if a.is_static():
-                self.print(f"pairs->addStaticArray({a.id()}, \"{a.name()}\", {ptr}, {d_ptr}, {size});") 
+                ptr_ref = self.generate_object_reference(a)
+                d_ptr_ref = self.generate_object_reference(a, device=True)
+                self.print(f"pairs_runtime->addStaticArray({a.id()}, \"{a.name()}\", {ptr_ref}, {d_ptr_ref}, {size});")
 
             else:
-                if self.target.is_gpu() and a.device_flag:
-                    self.print(f"{tkw} *{ptr}, *{d_ptr};")
-                    d_ptr = f"&{d_ptr}"
-                else:
-                    self.print(f"{tkw} *{ptr};")
-
-                self.print(f"pairs->addArray({a.id()}, \"{a.name()}\", &{ptr}, {d_ptr}, {size});")
+                ptr_addr = self.generate_object_address(a)
+                d_ptr_addr = self.generate_object_address(a, device=True)
+                self.print(f"pairs_runtime->addArray({a.id()}, \"{a.name()}\", {ptr_addr}, {d_ptr_addr}, {size});")
 
         if isinstance(ast_node, RegisterProperty):
             p = ast_node.property()
-            ptr = p.name()
-            d_ptr = f"d_{ptr}" if self.target.is_gpu() and p.device_flag else "nullptr"
+            ptr_addr = self.generate_object_address(p)
+            d_ptr_addr = self.generate_object_address(p, device=True)
             tkw = Types.c_keyword(self.sim, p.type())
             ptype = Types.c_property_keyword(p.type())
             assert ptype != "Prop_Invalid", "Invalid property type!"
 
             playout = Layouts.c_keyword(p.layout())
+            vol = 1 if p.is_volatile() else 0
             sizes = ", ".join([str(self.generate_expression(ScalarOp.inline(size))) for size in ast_node.sizes()])
-
-            if self.target.is_gpu() and p.device_flag:
-                self.print(f"{tkw} *{ptr}, *{d_ptr};")
-                d_ptr = f"&{d_ptr}"
-            else:
-                self.print(f"{tkw} *{ptr};")
-
-            self.print(f"pairs->addProperty({p.id()}, \"{p.name()}\", &{ptr}, {d_ptr}, {ptype}, {playout}, {sizes});")
+            self.print(f"pairs_runtime->addProperty({p.id()}, \"{p.name()}\", {ptr_addr}, {d_ptr_addr}, {ptype}, {playout}, {vol}, {sizes});")
 
         if isinstance(ast_node, RegisterContactProperty):
             p = ast_node.property()
-            ptr = p.name()
-            d_ptr = f"d_{ptr}" if self.target.is_gpu() and p.device_flag else "nullptr"
+            ptr_addr = self.generate_object_address(p)
+            d_ptr_addr = self.generate_object_address(p, device=True)
             tkw = Types.c_keyword(self.sim, p.type())
             ptype = Types.c_property_keyword(p.type())
             assert ptype != "Prop_Invalid", "Invalid property type!"
 
             playout = Layouts.c_keyword(p.layout())
             sizes = ", ".join([str(self.generate_expression(ScalarOp.inline(size))) for size in ast_node.sizes()])
-
-            if self.target.is_gpu() and p.device_flag:
-                self.print(f"{tkw} *{ptr}, *{d_ptr};")
-                d_ptr = f"&{d_ptr}"
-            else:
-                self.print(f"{tkw} *{ptr};")
-
-            self.print(f"pairs->addContactProperty({p.id()}, \"{p.name()}\", &{ptr}, {d_ptr}, {ptype}, {playout}, {sizes});")
+            self.print(f"pairs_runtime->addContactProperty({p.id()}, \"{p.name()}\", {ptr_addr}, {d_ptr_addr}, {ptype}, {playout}, {sizes});")
 
         if isinstance(ast_node, RegisterFeatureProperty):
             fp = ast_node.feature_property()
-            ptr = fp.name()
-            d_ptr = f"&d_{ptr}" if self.target.is_gpu() and fp.device_flag else "nullptr"
+            ptr = self.generate_object_reference(fp)
+            ptr_addr = self.generate_object_address(fp)
+            d_ptr_addr = self.generate_object_address(fp, device=True)
             array_size = fp.array_size()
             nkinds = fp.feature().nkinds()
             tkw = Types.c_keyword(self.sim, fp.type())
             fptype = Types.c_property_keyword(fp.type())
             assert fptype != "Prop_Invalid", "Invalid feature property type!"
 
-            self.print(f"{tkw} {ptr}[{array_size}];")
-            self.print(f"pairs->addFeatureProperty({fp.id()}, \"{fp.name()}\", &{ptr}, {d_ptr}, {fptype}, {nkinds}, {array_size} * sizeof({tkw}));")
+            self.print(f"pairs_runtime->addFeatureProperty({fp.id()}, \"{fp.name()}\", {ptr_addr}, {d_ptr_addr}, {fptype}, {nkinds}, {array_size} * sizeof({tkw}));")
 
             for i in range(array_size):
                 self.print(f"{ptr}[{i}] = {fp.data()[i]};")
 
             if self.target.is_gpu() and fp.device_flag:
-                self.print(f"pairs->copyFeaturePropertyToDevice({fp.id()}); // {fp.name()}")
+                self.print(f"pairs_runtime->copyFeaturePropertyToDevice({fp.id()}); // {fp.name()}")
 
         if isinstance(ast_node, Timestep):
             self.generate_statement(ast_node.block)
 
         if isinstance(ast_node, ReallocProperty):
             p = ast_node.property()
-            ptr = p.name()
-            d_ptr_addr = f"&d_{ptr}" if self.target.is_gpu() and p.device_flag else "nullptr"
+            ptr_addr = self.generate_object_address(p)
+            d_ptr_addr = self.generate_object_address(p, device=True)
             sizes = ", ".join([str(self.generate_expression(ScalarOp.inline(size))) for size in ast_node.sizes()])
-            self.print(f"pairs->reallocProperty({p.id()}, &{ptr}, {d_ptr_addr}, {sizes});")
-            #self.print(f"pairs->reallocProperty({p.id()}, (void **) &{ptr}, (void **) &d_{ptr}, {sizes});")
+            self.print(f"pairs_runtime->reallocProperty({p.id()}, {ptr_addr}, {d_ptr_addr}, {sizes});")
 
         if isinstance(ast_node, ReallocArray):
             a = ast_node.array()
             size = self.generate_expression(ast_node.size())
-            ptr = a.name()
-            d_ptr_addr = f"&d_{ptr}" if self.target.is_gpu() and a.device_flag else "nullptr"
-            self.print(f"pairs->reallocArray({a.id()}, &{ptr}, {d_ptr_addr}, {size});")
-            #self.print(f"pairs->reallocArray({a.id()}, (void **) &{ptr}, (void **) &d_{ptr}, {size});")
+            ptr_addr = self.generate_object_address(a)
+            d_ptr_addr = self.generate_object_address(a, device=True)
+            self.print(f"pairs_runtime->reallocArray({a.id()}, {ptr_addr}, {d_ptr_addr}, {size});")
 
         if isinstance(ast_node, DeclareVariable):
+            var_name = ast_node.var.name()
             tkw = Types.c_keyword(self.sim, ast_node.var.type())
+            prefix_decl = f"{tkw} " if ast_node.var.temporary() else ""
 
             if ast_node.var.is_scalar():
                 var = self.generate_expression(ast_node.var)
+                addr = self.generate_object_address(ast_node.var)
                 init = self.generate_expression(ast_node.var.init_value())
-                self.print(f"{tkw} {var} = {init};")
+                self.print(f"{prefix_decl}{var} = {init};")
+
+                if ast_node.var.runtime_track():
+                    self.print(f"pairs_runtime->trackVariable(\"{var_name}\", {addr});")
 
             else:
                 for i in range(Types.number_of_elements(self.sim, ast_node.var.type())):
                     var = self.generate_expression(ast_node.var, index=i)
                     init = self.generate_expression(ast_node.var.init_value(), index=i)
-                    self.print(f"{tkw} {var} = {init};")
-
+                    self.print(f"{prefix_decl}{var} = {init};")
 
             if not self.kernel_context and self.target.is_gpu() and ast_node.var.device_flag:
-                self.print(f"RuntimeVar<{tkw}> rv_{ast_node.var.name()} = pairs->addDeviceVariable(&({ast_node.var.name()}));")
-                #self.print(f"{tkw} *d_{ast_node.var.name()} = pairs->addDeviceVariable(&({ast_node.var.name()}));")
+                addr = self.generate_object_address(ast_node.var)
+                ref = self.generate_object_reference(ast_node.var, device=True)
+                self.print(f"{prefix_decl}{ref} = pairs_runtime->addDeviceVariable({addr});")
 
         if isinstance(ast_node, While):
             cond = self.generate_expression(ast_node.cond)
@@ -749,9 +1063,13 @@ class CGen:
             self.generate_statement(ast_node.block)
             self.print("}")
 
+        if isinstance(ast_node, Return):
+            expr = self.generate_expression(ast_node.expr)
+            self.print(f"return {expr};")
+
     def generate_expression(self, ast_node, mem=False, index=None):
         if isinstance(ast_node, Array):
-            return ast_node.name()
+            return self.generate_object_reference(ast_node)
 
         if isinstance(ast_node, ArrayAccess):
             if mem or ast_node.inlined is True:
@@ -778,9 +1096,9 @@ class CGen:
             extra_params = []
 
             if ast_node.name().startswith("pairs::"):
-                extra_params += ["pairs"]
+                extra_params += ["pairs_runtime"]
 
-            if ast_node.name() == "pairs->initDomain":
+            if ast_node.name() == "pairs_runtime->initDomain":
                 extra_params += ["&argc", "&argv"]
 
             params = ", ".join(extra_params + [str(self.generate_expression(p)) for p in ast_node.parameters()])
@@ -792,22 +1110,24 @@ class CGen:
             return f"({tkw})({expr})"
 
         if isinstance(ast_node, ContactProperty):
-            return ast_node.name()
+            return self.generate_object_reference(ast_node)
 
         if isinstance(ast_node, Deref):
             var = self.generate_expression(ast_node.var)
-            return f"(*{var})"
+            # Dereferences are ignored for write variables when full objects
+            # are generated since they can be directly written into
+            return var if (self.generate_full_object_names or ast_node.var.force_read) else f"(*{var})"
 
         if isinstance(ast_node, DeviceStaticRef):
             elem = self.generate_expression(ast_node.elem)
-            return f"d_{elem}"
+            return f"{elem}_d"
 
         if isinstance(ast_node, FeatureProperty):
-            return ast_node.name()
+            return self.generate_object_reference(ast_node)
 
         if isinstance(ast_node, HostRef):
             elem = self.generate_expression(ast_node.elem)
-            return f"h_{elem}"
+            return f"{elem}_h"
 
         if isinstance(ast_node, Iter):
             assert mem is False, "Iterator is not lvalue!"
@@ -817,13 +1137,22 @@ class CGen:
             assert mem is False, "Literal is not lvalue!"
             if ast_node.type() == Types.String:
                 return f"\"{ast_node.value}\""
+            
+            if ast_node.type() == Types.Boolean:
+                if ast_node.value == True:
+                    return "true"
+                if ast_node.value == False:
+                    return "false"
 
             if not ast_node.is_scalar():
                 assert index is not None, "Index must be set for non-scalar literals."
                 return ast_node.value[index]
 
             if isinstance(ast_node.value, float) and math.isinf(ast_node.value):
-                return f"std::numeric_limits<{self.real_type()}>::infinity()"
+                if self.kernel_context:
+                    return "CUDART_INF"
+                else:
+                    return f"std::numeric_limits<{self.real_type()}>::infinity()"
 
             return ast_node.value
 
@@ -837,7 +1166,7 @@ class CGen:
             return f"{ast_node.name()}"
 
         if isinstance(ast_node, Property):
-            return ast_node.name()
+            return self.generate_object_reference(ast_node)
 
         if isinstance(ast_node, PropertyAccess):
             assert ast_node.is_scalar() or index is not None, \
@@ -911,8 +1240,11 @@ class CGen:
             return f"{ast_node.name()}"
 
         if isinstance(ast_node, Var):
-            return ast_node.name() if ast_node.is_scalar() else f"{ast_node.name()}_{index}"
-
+            return self.generate_object_reference(ast_node, index=index)
+        
+        if isinstance(ast_node, Parameter):
+            return ast_node.name()
+        
         if isinstance(ast_node, VectorAccess):
             return self.generate_expression(ast_node.expr, mem, self.generate_expression(ast_node.index))
 
diff --git a/src/pairs/code_gen/interface.py b/src/pairs/code_gen/interface.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ed3f7f2b511137000e87f8db64c4a70894efd35
--- /dev/null
+++ b/src/pairs/code_gen/interface.py
@@ -0,0 +1,251 @@
+from pairs.ir.block import Block, pairs_interface_block
+from pairs.ir.functions import Call_Void, Call, Call_Int
+from pairs.ir.parameters import Parameter
+from pairs.ir.ret import Return
+from pairs.ir.scalars import ScalarOp
+from pairs.sim.domain import UpdateDomain
+from pairs.sim.cell_lists import BuildCellListsStencil
+from pairs.sim.comm import Synchronize, Borders, Exchange, ReverseComm
+from pairs.ir.types import Types
+from pairs.ir.branches import Filter, Branch
+from pairs.sim.cell_lists import BuildCellLists, BuildCellListsStencil, PartitionCellLists, BuildCellNeighborLists
+from pairs.sim.neighbor_lists import BuildNeighborLists
+from pairs.sim.variables import DeclareVariables 
+from pairs.sim.arrays import DeclareArrays
+from pairs.sim.properties import AllocateProperties, AllocateContactProperties, ResetVolatileProperties
+from pairs.sim.features import AllocateFeatureProperties
+from pairs.sim.instrumentation import RegisterMarkers, RegisterTimers
+from pairs.sim.grid import MutableGrid
+from pairs.sim.domain_partitioners import DomainPartitioners
+from pairs.ir.print import PrintCode
+from pairs.ir.assign import Assign
+from pairs.sim.contact_history import BuildContactHistory, ClearUnusedContactHistory, ResetContactHistoryUsageStatus
+from pairs.sim.thermo import ComputeThermo
+
+class InterfaceModules:
+    def __init__(self, sim):
+        self.sim = sim
+
+    def create_all(self):
+        """Build the standard interface modules; contact-history and thermo modules are optional."""
+        self.initialize()
+        self.setup_sim()
+        self.update_domain()
+        self.update_cells(self.sim.reneighbor_frequency)
+        self.communicate(self.sim.reneighbor_frequency)
+        self.reverse_comm()
+        self.reset_volatiles()
+        # neighbor_lists is looked up on the simulation: this class only stores `sim` (see __init__).
+        if self.sim._use_contact_history:
+            if self.sim.neighbor_lists:
+                self.build_contact_history(self.sim.reneighbor_frequency)
+            self.reset_contact_history()
+        # compute_thermo() accepts no argument besides self, so the setting is only used as a switch.
+        if self.sim._compute_thermo != 0:
+            self.compute_thermo()
+        self.rank()
+        self.nlocal()
+        self.nghost()
+        self.size()
+        self.end()
+
+    @pairs_interface_block
+    def initialize(self):
+        """Emit runtime/object construction and all declarations; the domain is created here only if requested."""
+        self.sim.module_name('initialize')
+        nprops = self.sim.properties.nprops()
+        ncontactprops = self.sim.contact_properties.nprops()
+        narrays = self.sim.arrays.narrays()
+        part = DomainPartitioners.c_keyword(self.sim.partitioner())
+        PrintCode(self.sim, f"pairs_runtime = new PairsRuntime({nprops}, {ncontactprops}, {narrays}, {part});")
+        PrintCode(self.sim, "pobj = new PairsObjects();")
+
+        inits = Block.from_list(self.sim, [
+            DeclareVariables(self.sim),
+            DeclareArrays(self.sim),
+            AllocateProperties(self.sim),
+            AllocateContactProperties(self.sim),
+            AllocateFeatureProperties(self.sim),
+            RegisterTimers(self.sim),
+            RegisterMarkers(self.sim)
+        ])
+
+        if self.sim.create_domain_at_initialization:
+            self.sim.add_statement(Block.merge_blocks(inits, self.sim.create_domain))
+        else:
+            assert self.sim.grid is None, "A grid already exists"
+            self.sim.grid = MutableGrid(self.sim, self.sim.dims)
+            self.sim.add_statement(inits)
+
+    @pairs_interface_block
+    def setup_sim(self):
+        self.sim.module_name('setup_sim')
+        
+        if self.sim.cell_lists.runtime_spacing:
+            for d in range(self.sim.dims):
+                Assign(self.sim, self.sim.cell_lists.spacing[d], Parameter(self.sim, f'cell_spacing_d{d}', Types.Real))
+
+        if self.sim.cell_lists.runtime_cutoff_radius:
+            Assign(self.sim, self.sim.cell_lists.cutoff_radius, Parameter(self.sim, 'cutoff_radius', Types.Real))
+
+        self.sim.add_statement(self.sim.setup_particles)
+        # This update assumes all particles have been created exactly in the rank that contains them 
+        self.sim.add_statement(UpdateDomain(self.sim))  
+        self.sim.add_statement(BuildCellListsStencil(self.sim, self.sim.cell_lists))
+        
+    @pairs_interface_block
+    def update_domain(self):
+        self.sim.module_name('update_domain')
+        self.sim.add_statement(Exchange(self.sim._comm))    # Local particles must be contained in their owners before the domain update
+        self.sim.add_statement(UpdateDomain(self.sim))
+        # Exchange is not needed after the update since all locals are contained in their owners
+        self.sim.add_statement(Borders(self.sim._comm))     # Ghosts must be recreated after the update
+        self.sim.add_statement(ResetVolatileProperties(self.sim))   # Resetting volatiles includes the new locals
+        self.sim.add_statement(BuildCellListsStencil(self.sim, self.sim.cell_lists))    # Rebuild the stencil since subdomain sizes have changed
+        self.sim.add_statement(self.sim.update_cells_procedures)
+        
+    @pairs_interface_block
+    def reset_volatiles(self):
+        self.sim.module_name('reset_volatiles')
+        self.sim.add_statement(ResetVolatileProperties(self.sim))
+    
+    @pairs_interface_block
+    def update_cells(self, reneighbor_frequency=1):
+        """Filter the cell-update procedures by `(timestep + 1) % reneighbor_frequency` or `timestep` compared with 0."""
+        self.sim.module_name('update_cells')
+        timestep = Parameter(self.sim, 'timestep', Types.Int32)
+        cond = ScalarOp.inline(ScalarOp.or_op(
+            ScalarOp.cmp((timestep + 1) % reneighbor_frequency, 0),
+            ScalarOp.cmp(timestep, 0)
+            ))
+        self.sim.add_statement(Filter(self.sim, cond, self.sim.update_cells_procedures))
+
+    @pairs_interface_block
+    def communicate(self, reneighbor_frequency=1):
+        """Under the timestep condition run Exchange and Borders (otherwise Synchronize), then reset volatiles."""
+        self.sim.module_name('communicate')
+        timestep = Parameter(self.sim, 'timestep', Types.Int32)
+        cond = ScalarOp.inline(ScalarOp.or_op(
+            ScalarOp.cmp((timestep + 1) % reneighbor_frequency, 0),
+            ScalarOp.cmp(timestep, 0)
+            ))
+        exchange = Filter(self.sim, cond, Exchange(self.sim._comm))
+        border_sync = Branch(self.sim, cond,
+                             blk_if=Borders(self.sim._comm),
+                             blk_else=Synchronize(self.sim._comm))
+
+        self.sim.add_statement(exchange)
+        self.sim.add_statement(border_sync)
+
+        # TODO: Maybe remove this from here, but volatiles must always be reset after exchange
+        self.sim.add_statement(Filter(self.sim, cond, Block(self.sim, ResetVolatileProperties(self.sim))))
+
+    @pairs_interface_block
+    def reverse_comm(self):
+        self.sim.module_name('reverse_comm')
+        self.sim.add_statement(ReverseComm(self.sim._comm, reduce=True))
+    
+    @pairs_interface_block
+    def build_contact_history(self, reneighbor_frequency=1):
+        """Filter BuildContactHistory by the same timestep condition used by update_cells/communicate."""
+        self.sim.module_name('build_contact_history')
+        timestep = Parameter(self.sim, 'timestep', Types.Int32)
+        cond = ScalarOp.inline(ScalarOp.or_op(
+            ScalarOp.cmp((timestep + 1) % reneighbor_frequency, 0),
+            ScalarOp.cmp(timestep, 0)
+            ))
+        self.sim.add_statement(
+            Filter(self.sim, cond,
+                   BuildContactHistory(self.sim, self.sim._contact_history, self.sim.cell_lists)))
+
+    @pairs_interface_block
+    def reset_contact_history(self):
+        self.sim.module_name('reset_contact_history')
+        self.sim.add_statement(ResetContactHistoryUsageStatus(self.sim, self.sim._contact_history))
+        self.sim.add_statement(ClearUnusedContactHistory(self.sim, self.sim._contact_history))
+
+    @pairs_interface_block
+    def compute_thermo(self):
+        self.sim.module_name('compute_thermo')
+        self.sim.add_statement(ComputeThermo(self.sim))
+
+    @pairs_interface_block
+    def rank(self):
+        self.sim.module_name('rank')
+        Return(self.sim, self.sim.domain_partitioning().rank)
+
+    @pairs_interface_block
+    def nlocal(self):
+        self.sim.module_name('nlocal')
+        Return(self.sim, self.sim.nlocal)
+
+    @pairs_interface_block
+    def nghost(self):
+        self.sim.module_name('nghost')
+        Return(self.sim, self.sim.nghost)
+
+    @pairs_interface_block
+    def size(self):
+        self.sim.module_name('size')
+        Return(self.sim, ScalarOp.inline(self.sim.nlocal + self.sim.nghost))
+
+    @pairs_interface_block
+    def create_sphere(self):
+        self.sim.module_name('create_sphere')
+        x = Parameter(self.sim, 'x', Types.Real)
+        y = Parameter(self.sim, 'y', Types.Real)
+        z = Parameter(self.sim, 'z', Types.Real)
+        vx = Parameter(self.sim, 'vx', Types.Real)
+        vy = Parameter(self.sim, 'vy', Types.Real)
+        vz = Parameter(self.sim, 'vz', Types.Real)
+        density = Parameter(self.sim, 'density', Types.Real)
+        radius = Parameter(self.sim, 'radius', Types.Real)
+        ptype = Parameter(self.sim, 'type', Types.Real)
+        flag = Parameter(self.sim, 'flag', Types.Real)
+
+        Return(self.sim, Call(self.sim, "pairs::create_sphere", 
+                              [x, y, z, vx, vy, vz, 
+                               density, radius, ptype, flag], Types.UInt64))
+
+    @pairs_interface_block
+    def create_halfspace(self):
+        self.sim.module_name('create_halfspace')
+        x = Parameter(self.sim, 'x', Types.Real)
+        y = Parameter(self.sim, 'y', Types.Real)
+        z = Parameter(self.sim, 'z', Types.Real)
+        nx = Parameter(self.sim, 'nx', Types.Real)
+        ny = Parameter(self.sim, 'ny', Types.Real)
+        nz = Parameter(self.sim, 'nz', Types.Real)
+        ptype = Parameter(self.sim, 'type', Types.Real)
+        flag = Parameter(self.sim, 'flag', Types.Real)
+
+        Return(self.sim, Call(self.sim, "pairs::create_halfspace", 
+                              [x, y, z, nx, ny, nz, ptype, flag], Types.UInt64))
+        
+    @pairs_interface_block
+    def dem_sc_grid(self):
+        self.sim.module_name('dem_sc_grid')
+        xmax = Parameter(self.sim, 'xmax', Types.Real)
+        ymax = Parameter(self.sim, 'ymax', Types.Real)
+        zmax = Parameter(self.sim, 'zmax', Types.Real)
+        spacing = Parameter(self.sim, 'spacing', Types.Real)
+        diameter = Parameter(self.sim, 'diameter', Types.Real)
+        min_diameter = Parameter(self.sim, 'min_diameter', Types.Real)
+        max_diameter = Parameter(self.sim, 'max_diameter', Types.Real)
+        initial_velocity = Parameter(self.sim, 'initial_velocity', Types.Real)
+        particle_density = Parameter(self.sim, 'particle_density', Types.Real)
+        ntypes = Parameter(self.sim, 'ntypes', Types.Int32)
+
+        Assign(self.sim, self.sim.nlocal,
+               Call_Int(self.sim, "pairs::dem_sc_grid",
+                        [xmax, ymax, zmax, spacing, diameter, min_diameter, max_diameter,
+                         initial_velocity, particle_density, ntypes]))
+        Return(self.sim, self.sim.nlocal)
+
+    @pairs_interface_block
+    def end(self):
+        self.sim.module_name('end')
+        Call_Void(self.sim, "pairs::print_timers", [])
+        Call_Void(self.sim, "pairs::print_stats", [self.sim.nlocal, self.sim.nghost])
+        PrintCode(self.sim, "delete pobj;")
+        PrintCode(self.sim, "delete pairs_runtime;")
diff --git a/src/pairs/code_gen/printer.py b/src/pairs/code_gen/printer.py
index 4d73b7679f7f8c52ff1878339dbc0479cb9db215..3eabef6f95c22b6d2f16d168a97437cb1fe00d9f 100644
--- a/src/pairs/code_gen/printer.py
+++ b/src/pairs/code_gen/printer.py
@@ -1,4 +1,6 @@
 class Printer:
+
+    line_id = 0
     def __init__(self, output):
         self.output = output
         self.stream = None
@@ -16,4 +18,5 @@ class Printer:
 
     def __call__(self, text):
         assert self.stream is not None, "Invalid stream!"
+        Printer.line_id += 1
         self.stream.write(self.indent * ' ' + text + '\n')
diff --git a/src/pairs/ir/block.py b/src/pairs/ir/block.py
index 1a0809b811840a6f0d0edbccfd1fb68a9712992f..2a14ea2776dcf3651a8f9d03b397bf9db1bd1fb6 100644
--- a/src/pairs/ir/block.py
+++ b/src/pairs/ir/block.py
@@ -42,6 +42,21 @@ def pairs_device_block(func):
     return inner
 
 
+def pairs_interface_block(func):  # Decorator: calling the wrapped method returns a Module built from sim state
+    def inner(*args, **kwargs):
+        sim = args[0].sim # self.sim: the first positional argument must expose a `sim` attribute
+        sim.init_block()
+        func(*args, **kwargs)  # the return value of func is discarded; only its effects on sim are used
+        return Module(sim,
+            name=sim._module_name,
+            block=Block(sim, sim._block),
+            resizes_to_check=sim._resizes_to_check,
+            check_properties_resize=sim._check_properties_resize,
+            run_on_device=False,
+            interface=True)  # run_on_device is always False here and the module is flagged interface=True
+
+    return inner
+
 class Block(ASTNode):
     def __init__(self, sim, stmts):
         super().__init__(sim)
diff --git a/src/pairs/ir/declaration.py b/src/pairs/ir/declaration.py
index 3e26c57fd82b2e59d24fb1043abe823714a8ae9c..35992d52e2bbebd0c59e0a7502a16939cb3b1788 100644
--- a/src/pairs/ir/declaration.py
+++ b/src/pairs/ir/declaration.py
@@ -7,7 +7,7 @@ class Decl(ASTNode):
         self.elem = elem
 
     def __str__(self):
-        return f"Decl<self.elem>"
+        return f"Decl<{self.elem}>"
 
     def children(self):
         return [self.elem]
diff --git a/src/pairs/ir/device.py b/src/pairs/ir/device.py
index 952ff1429c9eb45dd47bc40edcb2d3b53fb09f38..0dfd6b9ad0ebf802429cc1cacdf9339bd871a816 100644
--- a/src/pairs/ir/device.py
+++ b/src/pairs/ir/device.py
@@ -62,6 +62,9 @@ class CopyProperty(ASTNode):
         self._action = action
         self.sim.add_statement(self)
 
+    def __str__(self):
+        return f"CopyProperty<{self._prop}>"
+    
     def prop(self):
         return self._prop
 
@@ -72,9 +75,31 @@ class CopyProperty(ASTNode):
         return self._action
 
     def children(self):
-        return [self._prop]
+        return [self._prop, self.sim.nghost, self.sim.nlocal]
+
+class CopyFeatureProperty(ASTNode):
+    def __init__(self, sim, prop, ctx, action):
+        super().__init__(sim)
+        self._prop = prop
+        self._ctx = ctx
+        self._action = action
+        self.sim.add_statement(self)
+
+    def __str__(self):
+        return f"CopyFeatureProperty<{self._prop}>"
+    
+    def prop(self):
+        return self._prop
 
+    def context(self):
+        return self._ctx
 
+    def action(self):
+        return self._action
+
+    def children(self):
+        return [self._prop]
+    
 class CopyContactProperty(ASTNode):
     def __init__(self, sim, prop, ctx, action):
         super().__init__(sim)
diff --git a/src/pairs/ir/features.py b/src/pairs/ir/features.py
index d709336b0971a7361e0cb5159d36bc975605cb28..24bb365f3e951d55d479150c8d5ebb60a9f88f0b 100644
--- a/src/pairs/ir/features.py
+++ b/src/pairs/ir/features.py
@@ -131,9 +131,7 @@ class FeatureProperty(ASTNode):
                      self.feature_prop_feature.nkinds()]
 
     def array_size(self):
-        nelems = self.feature_prop_feature.nkinds() * \
-                 Types.number_of_elements(self.sim, self.feature_prop_type)
-        return nelems * nelems
+        return self.feature_prop_feature.nkinds()**2  * Types.number_of_elements(self.sim, self.feature_prop_type)
 
     def __getitem__(self, expr):
         return FeaturePropertyAccess(self.sim, self, expr)
@@ -161,7 +159,7 @@ class FeaturePropertyAccess(ASTTerm):
             sizes = feature_prop.sizes()
             layout = feature_prop.layout()
 
-            for elem in range(Types.number_of_elements(feature_prop.type())):
+            for elem in range(Types.number_of_elements(self.sim, feature_prop.type())):
                 if layout == Layouts.AoS:
                     self.vector_indexes[elem] = self.index * sizes[0] + elem
                 elif layout == Layouts.SoA:
diff --git a/src/pairs/ir/functions.py b/src/pairs/ir/functions.py
index b18406bf77bc5d94c22387864435fc677680ea8e..ffb49d53717b9a15940c6c9039ddd1ee6c7baf69 100644
--- a/src/pairs/ir/functions.py
+++ b/src/pairs/ir/functions.py
@@ -11,6 +11,9 @@ class Call(ASTTerm):
         self.params = [Lit.cvt(sim, p) for p in params]
         self.return_type = return_type
 
+    def __str__(self):
+        return f"Call<{self.func_name}, {self.params}>"
+    
     def name(self):
         return self.func_name
 
@@ -28,8 +31,13 @@ class Call_Int(Call):
     def __init__(self, sim, func_name, parameters):
         super().__init__(sim, func_name, parameters, Types.Int32)
 
+    def __str__(self):
+        return f"Call_Int<{self.func_name}, {self.params}>"
 
 class Call_Void(Call):
     def __init__(self, sim, func_name, parameters):
-        super().__init__(sim, func_name, parameters, Types.Invalid)
+        super().__init__(sim, func_name, parameters, Types.Void)
         sim.add_statement(self)
+    
+    def __str__(self):
+        return f"Call_Void<{self.func_name}, {self.params}>"
diff --git a/src/pairs/ir/kernel.py b/src/pairs/ir/kernel.py
index 04def29cf9153a8ec7f5048f79579bd062186dd0..4f477ca1b50a7cdce7b9a13894cd96f10e26f769 100644
--- a/src/pairs/ir/kernel.py
+++ b/src/pairs/ir/kernel.py
@@ -8,7 +8,9 @@ from pairs.ir.matrices import MatrixOp
 from pairs.ir.properties import Property, ContactProperty
 from pairs.ir.quaternions import QuaternionOp
 from pairs.ir.variables import Var
+from pairs.ir.parameters import Parameter
 from pairs.ir.vectors import VectorOp
+from pairs.ir.loops import Iter
 
 
 class Kernel(ASTNode):
@@ -19,6 +21,8 @@ class Kernel(ASTNode):
         self._id = Kernel.last_kernel
         self._name = name if name is not None else "kernel" + str(Kernel.last_kernel)
         self._variables = {}
+        self._parameters = {}
+        self._iters = {}
         self._arrays = {}
         self._properties = {}
         self._contact_properties = {}
@@ -50,6 +54,12 @@ class Kernel(ASTNode):
     def variables(self):
         return self._variables
 
+    def parameters(self):
+        return self._parameters
+    
+    def iters(self):
+        return self._iters
+    
     def read_only_variables(self):
         return [var for var in self._variables if self._variables[var] == Actions.ReadOnly]
 
@@ -99,6 +109,28 @@ class Kernel(ASTNode):
 
                 action = Actions.NoAction if var not in self._variables else self._variables[var]
                 self._variables[var] = Actions.update_rule(action, new_op)
+    
+    def add_parameter(self, parameter, write=False):  # accepts one Parameter or a list of them
+        parameter_list = parameter if isinstance(parameter, list) else [parameter]
+        new_op = 'w' if write else 'r'  # access kind handed to Actions.update_rule
+
+        for param in parameter_list:
+            assert isinstance(param, Parameter), \
+                "Kernel.add_parameter(): given element is not of type Parameter!"
+
+            action = Actions.NoAction if param not in self._parameters else self._parameters[param]
+            self._parameters[param] = Actions.update_rule(action, new_op)
+
+    def add_iter(self, iter, write=False):
+        iter_list = iter if isinstance(iter, list) else [iter]
+        new_op = 'w' if write else 'r'
+
+        for it in iter_list:
+            assert isinstance(it, Iter), \
+                "Kernel.add_iter(): Element is not of type Iter."
+
+            action = Actions.NoAction if it not in self._iters else self._iters[it]
+            self._iters[it] = Actions.update_rule(action, new_op)
 
     def add_property(self, prop, write=False):
         prop_list = prop if isinstance(prop, list) else [prop]
diff --git a/src/pairs/ir/loops.py b/src/pairs/ir/loops.py
index 997fda5c6fa73938a74c8fb52f453caa1bd60a21..8842818627c7da514ec9a24ae85c84a5f08cd747 100644
--- a/src/pairs/ir/loops.py
+++ b/src/pairs/ir/loops.py
@@ -18,6 +18,7 @@ class Iter(ASTTerm):
         super().__init__(sim, ScalarOp)
         self.loop = loop
         self.iter_id = Iter.new_id()
+        self._ref_candidate = False
 
     def id(self):
         return self.iter_id
@@ -27,7 +28,16 @@ class Iter(ASTTerm):
 
     def type(self):
         return Types.Int32
-
+    
+    def mark_as_ref_candidate(self):
+        self._ref_candidate = True
+
+    def is_ref_candidate(self):
+        return self._ref_candidate
+    
+    def __hash__(self):
+        return hash(self.iter_id)
+    
     def __eq__(self, other):
         return isinstance(other, Iter) and self.iter_id == other.iter_id
 
@@ -39,7 +49,7 @@ class Iter(ASTTerm):
 
 
 class For(ASTNode):
-    def __init__(self, sim, range_min, range_max, block=None):
+    def __init__(self, sim, range_min, range_max, block=None, not_kernel=False):
         super().__init__(sim)
         self.iterator = Iter(sim, self)
         self.min = Lit.cvt(sim, range_min)
@@ -47,6 +57,7 @@ class For(ASTNode):
         self.block = Block(sim, []) if block is None else block
         self.kernel = None
         self._kernel_candidate = False
+        self.not_kernel = not_kernel
 
     def __str__(self):
         return f"For<{self.iterator}, {self.min} ... {self.max}>"
@@ -63,6 +74,9 @@ class For(ASTNode):
     def mark_as_kernel_candidate(self):
         self._kernel_candidate = True
 
+    def mark_iter_as_ref_candidate(self):
+        self.iterator.mark_as_ref_candidate()
+
     def is_kernel_candidate(self):
         return self._kernel_candidate
 
diff --git a/src/pairs/ir/math.py b/src/pairs/ir/math.py
index e85aa0678f7fbfc6372665fdfab78f346eb21e97..a6a156a4986a6a8b122dc70d8ca920ad18d29269 100644
--- a/src/pairs/ir/math.py
+++ b/src/pairs/ir/math.py
@@ -1,6 +1,7 @@
 from pairs.ir.ast_term import ASTTerm
 from pairs.ir.scalars import ScalarOp
 from pairs.ir.types import Types
+from pairs.ir.lit import Lit
 
 
 class MathFunction(ASTTerm):
@@ -115,6 +116,7 @@ class Cos(MathFunction):
 
 class Ceil(MathFunction):
     def __init__(self, sim, expr):
+        expr = Lit.cvt(sim, expr)
         assert Types.is_real(expr.type()), "Expression must be of real type!"
         super().__init__(sim)
         self._params = [expr]
diff --git a/src/pairs/ir/module.py b/src/pairs/ir/module.py
index ab78942b2f43f946a714513fab4ad7d390442197..ded67ac6f4448590c346f51177b17fe364b729d0 100644
--- a/src/pairs/ir/module.py
+++ b/src/pairs/ir/module.py
@@ -4,15 +4,25 @@ from pairs.ir.ast_node import ASTNode
 from pairs.ir.features import FeatureProperty
 from pairs.ir.properties import Property, ContactProperty
 from pairs.ir.variables import Var
+from pairs.ir.parameters import Parameter
+from pairs.ir.types import Types
 
 
 class Module(ASTNode):
     last_module = 0
 
-    def __init__(self, sim, name=None, block=None, resizes_to_check={}, check_properties_resize=False, run_on_device=False):
+    def __init__(self, sim, 
+                 name=None, 
+                 block=None, 
+                 resizes_to_check={}, 
+                 check_properties_resize=False, 
+                 run_on_device=False, 
+                 user_defined=False, 
+                 interface=False):
         super().__init__(sim)
         self._id = Module.last_module
         self._name = name if name is not None else "module" + str(Module.last_module)
+        self._parameters = {}
         self._variables = {}
         self._arrays = {}
         self._properties = {}
@@ -23,8 +33,21 @@ class Module(ASTNode):
         self._resizes_to_check = resizes_to_check
         self._check_properties_resize = check_properties_resize
         self._run_on_device = run_on_device
+        self._user_defined = user_defined
+        self._interface = interface
+        self._return_type = Types.Void
         self._profile = False
-        sim.add_module(self)
+
+        if user_defined:
+            assert not interface, ("User-defined modules can't be part of the interface directly. "
+                                "Wrap them inside separate interface modules.")
+            sim.add_udf_module(self)
+        else:
+            if interface:
+                sim.add_interface_module(self)
+            else:
+                sim.add_module(self)
+                
         Module.last_module += 1
 
     def __str__(self):
@@ -45,6 +68,18 @@ class Module(ASTNode):
     @property
     def run_on_device(self):
         return self._run_on_device
+    
+    @property
+    def user_defined(self):
+        return self._user_defined
+
+    @property
+    def interface(self):
+        return self._interface
+
+    @property
+    def return_type(self):
+        return self._return_type
 
     def profile(self):
         self._profile = True
@@ -53,6 +88,9 @@ class Module(ASTNode):
     def must_profile(self):
         return self._profile
 
+    def parameters(self):
+        return self._parameters
+    
     def variables(self):
         return self._variables
 
@@ -99,6 +137,17 @@ class Module(ASTNode):
             action = Actions.NoAction if var not in self._variables else self._variables[var]
             self._variables[var] = Actions.update_rule(action, new_op)
 
+    def add_parameter(self, parameter, write=False):
+        parameter_list = parameter if isinstance(parameter, list) else [parameter]
+        new_op = 'w' if write else 'r'
+
+        for param in parameter_list:
+            assert isinstance(param, Parameter), \
+                "Module.add_parameter(): given element is not of type Parameter!"
+
+            action = Actions.NoAction if param not in self._parameters else self._parameters[param]
+            self._parameters[param] = Actions.update_rule(action, new_op)
+
     def add_property(self, prop, write=False):
         prop_list = prop if isinstance(prop, list) else [prop]
         new_op = 'w' if write else 'r'
@@ -150,5 +199,8 @@ class ModuleCall(ASTNode):
     def module(self):
         return self._module
 
+    def __str__(self):
+        return f"ModuleCall<{self._module}>"
+    
     def children(self):
         return [self._module]
diff --git a/src/pairs/ir/mutator.py b/src/pairs/ir/mutator.py
index bbe06f9ce371d818111cf597ab36e99f65fe5137..3fb017f80cc517c309075658225a1e6a4d435e03 100644
--- a/src/pairs/ir/mutator.py
+++ b/src/pairs/ir/mutator.py
@@ -54,6 +54,18 @@ class Mutator:
             ast_node._reduction_variable = self.mutate(ast_node._reduction_variable)
 
         return ast_node
+    
+    def mutate_Return(self, ast_node):
+        ast_node.expr = self.mutate(ast_node.expr)
+        return ast_node
+    
+    def mutate_Print(self, ast_node):
+        ast_node.args = [self.mutate(arg) for arg in ast_node.args]
+        return ast_node
+
+    def mutate_PrintCode(self, ast_node):
+        ast_node.arg = self.mutate(ast_node.arg)
+        return ast_node
 
     def mutate_ArrayAccess(self, ast_node):
         ast_node.array = self.mutate(ast_node.array)
diff --git a/src/pairs/ir/parameters.py b/src/pairs/ir/parameters.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6b9ab6457ad53354f07c025fad0b8c71c7cbbbb
--- /dev/null
+++ b/src/pairs/ir/parameters.py
@@ -0,0 +1,18 @@
+from pairs.ir.ast_term import ASTTerm 
+from pairs.ir.operator_class import OperatorClass
+
+
+class Parameter(ASTTerm):
+    def __init__(self, sim, param_name, param_type):
+        super().__init__(sim, OperatorClass.from_type(param_type))
+        self.param_name = param_name
+        self.param_type = param_type
+
+    def __str__(self):
+        return f"Parameter<{self.param_name}>"
+
+    def name(self):
+        return self.param_name
+
+    def type(self):
+        return self.param_type
diff --git a/src/pairs/ir/print.py b/src/pairs/ir/print.py
new file mode 100644
index 0000000000000000000000000000000000000000..58c2a6e1d062342e6e2d6511fccc8ee75a7ce55a
--- /dev/null
+++ b/src/pairs/ir/print.py
@@ -0,0 +1,24 @@
+from pairs.ir.ast_node import ASTNode
+from pairs.ir.lit import Lit
+
+class Print(ASTNode):
+    def __init__(self, sim, *args):
+        super().__init__(sim)
+        self.args = [Lit.cvt(sim, a) for a in args]
+        self.sim.add_statement(self)
+
+    def children(self):
+        return self.args
+    
+    def __str__(self):
+        return "Print<" + ", ".join(str(arg) for arg in self.args) + ">"
+
+class PrintCode(ASTNode):
+    def __init__(self, sim, str):
+        super().__init__(sim)
+        self.arg = Lit.cvt(sim, str)  # the single argument is converted via Lit.cvt
+        self.sim.add_statement(self)  # registers this node as a statement on the simulation
+
+    def children(self):
+        return [self.arg]  # wrap in a list, matching Print.children() in this file
+    
\ No newline at end of file
diff --git a/src/pairs/ir/properties.py b/src/pairs/ir/properties.py
index f10f9424b82f721b3564f6fdc9899f94aca4bc7e..21ed08c918529f509047c0075a2670a5922af5b0 100644
--- a/src/pairs/ir/properties.py
+++ b/src/pairs/ir/properties.py
@@ -16,8 +16,8 @@ class Properties:
         self.props = []
         self.defs = {}
 
-    def add(self, p_name, p_type, p_value, p_volatile, p_layout=Layouts.AoS):
-        p = Property(self.sim, p_name, p_type, p_value, p_volatile, p_layout)
+    def add(self, p_name, p_type, p_value, p_volatile, p_layout=Layouts.AoS, p_reduce=False):
+        p = Property(self.sim, p_name, p_type, p_value, p_volatile, p_layout, p_reduce)
         self.props.append(p)
         self.defs[p_name] = p_value
         return p
@@ -27,6 +27,9 @@ class Properties:
 
     def all(self):
         return self.props
+    
+    def reduction_props(self):
+        return [p for p in self.props if p.reduce is True]
 
     def volatiles(self):
         return [p for p in self.props if p.volatile is True]
@@ -51,7 +54,7 @@ class Properties:
 class Property(ASTNode):
     last_prop_id = 0
 
-    def __init__(self, sim, name, dtype, default, volatile, layout=Layouts.AoS):
+    def __init__(self, sim, name, dtype, default, volatile, layout=Layouts.AoS, reduce=False):
         super().__init__(sim)
         self.prop_id = Property.last_prop_id
         self.prop_name = name
@@ -59,6 +62,7 @@ class Property(ASTNode):
         self.prop_layout = layout
         self.default_value = default
         self.volatile = volatile
+        self.reduce = reduce
         self.device_flag = False
         Property.last_prop_id += 1
 
@@ -83,6 +87,9 @@ class Property(ASTNode):
     def default(self):
         return self.default_value
 
+    def is_volatile(self):
+        return self.volatile
+
     def ndims(self):
         return 1 if Types.is_scalar(self.prop_type) else 2
 
diff --git a/src/pairs/ir/ret.py b/src/pairs/ir/ret.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb235044e87e5ca08f2587b2e406399e796865f8
--- /dev/null
+++ b/src/pairs/ir/ret.py
@@ -0,0 +1,13 @@
+from pairs.ir.ast_node import ASTNode
+
+class Return(ASTNode):
+    def __init__(self, sim, expr):
+        super().__init__(sim)
+        self.expr = expr
+        self.sim.add_statement(self)
+
+    def __str__(self):
+        return f"Return<{self.expr}>"
+
+    def children(self):
+        return [self.expr]
\ No newline at end of file
diff --git a/src/pairs/ir/types.py b/src/pairs/ir/types.py
index ea1d40c784552d9f8bd922210e282f5a5904f400..c2548cdd5beb799839d3d88df92a04e28bb92311 100644
--- a/src/pairs/ir/types.py
+++ b/src/pairs/ir/types.py
@@ -1,5 +1,6 @@
 class Types:
-    Invalid = -1
+    Invalid = -2
+    Void = -1
     Int32 = 0
     Int64 = 1
     UInt64 = 2
@@ -13,6 +14,23 @@ class Types:
     Matrix = 10
     Quaternion = 11
 
+    def c_accessor_keyword(sim, t):
+        real_kw = 'double' if sim.use_double_precision() else 'float'
+        return (
+            real_kw if t==Types.Real
+            else f'pairs::Vector3<{real_kw}>' if t==Types.Vector
+            else f'pairs::Matrix3<{real_kw}>' if t==Types.Matrix
+            else f'pairs::Quaternion<{real_kw}>' if t==Types.Quaternion
+            else 'float' if t == Types.Float
+            else 'double' if t == Types.Double
+            else 'int' if t == Types.Int32
+            else 'int64_t' if t == Types.Int64
+            else 'uint64_t' if t == Types.UInt64
+            else 'bool' if t == Types.Boolean
+            else 'void' if t == Types.Void
+            else '<invalid type>'
+        )
+
     def c_keyword(sim, t):
         real_kw = 'double' if sim.use_double_precision() else 'float'
         return (
@@ -20,14 +38,16 @@ class Types:
             else 'float' if t == Types.Float
             else 'double' if t == Types.Double
             else 'int' if t == Types.Int32
-            else 'long long int' if t == Types.Int64
-            else 'unsigned long long int' if t == Types.UInt64
+            else 'int64_t' if t == Types.Int64
+            else 'uint64_t' if t == Types.UInt64
             else 'bool' if t == Types.Boolean
+            else 'void' if t == Types.Void
             else '<invalid type>'
         )
 
     def c_property_keyword(t):
         return "Prop_Integer"      if t == Types.Int32 else \
+               "Prop_UInt64"       if t == Types.UInt64 else \
                "Prop_Real"         if t == Types.Real else \
                "Prop_Vector"       if t == Types.Vector else \
                "Prop_Matrix"       if t == Types.Matrix else \
diff --git a/src/pairs/ir/utils.py b/src/pairs/ir/utils.py
index 33495668bc00d74021af97b24689513f86357bd5..2a0a707eebd7546745f6dd4017449ead7beb51cb 100644
--- a/src/pairs/ir/utils.py
+++ b/src/pairs/ir/utils.py
@@ -12,11 +12,3 @@ def is_terminal(node):
     terminal_types = (Array, ContactProperty, FeatureProperty, Iter, Neighbor, Property, Symbol, Var)
     return any([isinstance(node, _type) for _type in terminal_types])
 
-
-class Print(ASTNode):
-    def __init__(self, sim, string):
-        super().__init__(sim)
-        self.string = string
-
-    def __str__(self):
-        return f"Print<{self.string}>"
diff --git a/src/pairs/ir/variables.py b/src/pairs/ir/variables.py
index 3d96065c726c838f5de39710a345a8d7ea68a856..00bcc698214a3938b5ae5c3b9b6917114f596c5c 100644
--- a/src/pairs/ir/variables.py
+++ b/src/pairs/ir/variables.py
@@ -18,8 +18,8 @@ class Variables:
         self.vars = []
         self.nvars = 0
 
-    def add(self, v_name, v_type, v_value=0):
-        var = Var(self.sim, v_name, v_type, v_value)
+    def add(self, v_name, v_type, v_value=0, v_runtime_track=False):
+        var = Var(self.sim, v_name, v_type, v_value, v_runtime_track)
         self.vars.append(var)
         return var
 
@@ -39,15 +39,17 @@ class Variables:
 
 
 class Var(ASTTerm):
-    def __init__(self, sim, var_name, var_type, init_value=0, temp=False):
+    def __init__(self, sim, var_name, var_type, init_value=0, runtime_track=False, temp=False):
         super().__init__(sim, OperatorClass.from_type(var_type))
         self.var_name = var_name
         self.var_type = var_type
         self.var_init_value = Lit.cvt(sim, init_value)
+        self.var_runtime_track = runtime_track
         self.var_temporary = temp
         self.mutable = True
         self.var_bonded_arrays = []
         self.device_flag = False
+        self.force_read = False
 
         if temp:
             DeclareVariable(sim, self)
@@ -74,6 +76,9 @@ class Var(ASTTerm):
     def init_value(self):
         return self.var_init_value
 
+    def runtime_track(self):
+        return self.var_runtime_track
+
     def add_bonded_array(self, array):
         self.var_bonded_arrays.append(array)
 
@@ -100,6 +105,10 @@ class Deref(ASTTerm):
     def var(self):
         return self._var
 
+    def copy(self, deep=False):
+        # Terminal copies are just themselves
+        return self
+
     def type(self):
         return self._var.type()
 
diff --git a/src/pairs/mapping/funcs.py b/src/pairs/mapping/funcs.py
index f77410043a102cf72a5719bec54e482ccb336afc..b4ec2f8f5fcb083a92f3eb5a43f6f33cc49093a5 100644
--- a/src/pairs/mapping/funcs.py
+++ b/src/pairs/mapping/funcs.py
@@ -7,6 +7,7 @@ from pairs.ir.loops import For, ParticleFor
 from pairs.ir.operators import Operators
 from pairs.ir.operator_class import OperatorClass
 from pairs.ir.properties import ContactProperty
+from pairs.ir.parameters import Parameter
 from pairs.ir.scalars import ScalarOp
 from pairs.ir.types import Types
 from pairs.mapping.keywords import Keywords
@@ -80,16 +81,16 @@ class BuildParticleIR(ast.NodeVisitor):
 
         raise Exception("Invalid operator: {}".format(ast.dump(op)))
 
-    def __init__(self, sim, ctx_symbols={}):
+    def __init__(self, sim, ctx_symbols={}, func_params={}):
         self.sim = sim
         self.ctx_symbols = ctx_symbols.copy()
+        self.func_params = func_params.copy()
         self.keywords = Keywords(sim)
 
     def add_symbols(self, symbols):
         self.ctx_symbols.update(symbols)
 
     def visit_Assign(self, node):
-        #print(ast.dump(node))
         assert len(node.targets) == 1, "Only one target is allowed on assignments!"
         lhs = self.visit(node.targets[0])
         rhs = self.visit(node.value)
@@ -102,15 +103,16 @@ class BuildParticleIR(ast.NodeVisitor):
 
     def visit_AugAssign(self, node):
         lhs = self.visit(node.target)
+        # We need a copy of the target object so it is properly visited during
+        # compiler analyses and transformations
+        lhs_copy = self.visit(node.target)
         rhs = self.visit(node.value)
         op_class = OperatorClass.from_type_list([lhs.type(), rhs.type()])
-        bin_op = op_class(self.sim, lhs, rhs, BuildParticleIR.get_binary_op(node.op))
+        bin_op = op_class(self.sim, lhs_copy, rhs, BuildParticleIR.get_binary_op(node.op))
 
-        if isinstance(lhs, UndefinedSymbol):
-            self.add_symbols({lhs.symbol_id: bin_op})
-            rhs.set_label(lhs.symbol_id)
-        else:
-            Assign(self.sim, lhs, bin_op)
+        assert not isinstance(lhs, UndefinedSymbol), \
+            f"Invalid AugAssign: symbol {lhs} not defined yet!"
+        Assign(self.sim, lhs, bin_op)
 
     def visit_BinOp(self, node):
         #print(ast.dump(node))
@@ -178,7 +180,7 @@ class BuildParticleIR(ast.NodeVisitor):
 
     def visit_If(self, node):
         condition = self.visit(node.test)
-        one_way = node.orelse is None
+        one_way = node.orelse is None or len(node.orelse) == 0
 
         if one_way:
             for _ in Filter(self.sim, condition):
@@ -210,6 +212,7 @@ class BuildParticleIR(ast.NodeVisitor):
     def visit_Name(self, node):
         symbol_types = [
             self.ctx_symbols.get,
+            self.func_params.get,
             self.sim.array,
             self.sim.property,
             self.sim.feature_property,
@@ -282,7 +285,11 @@ class BuildParticleIR(ast.NodeVisitor):
         return op_class(self.sim, operand, None, BuildParticleIR.get_unary_op(node.op))
 
 
-def compute(sim, func, cutoff_radius=None, symbols={}, pre_step=False, skip_first=False):
+def compute(sim, func, cutoff_radius=None, symbols={}, parameters={}, pre_step=False, skip_first=False):
+    if sim._generate_whole_program:
+        assert not parameters, "Compute functions can't take custom parameters when generating whole program."
+    
+
     src = inspect.getsource(func)
     tree = ast.parse(src, mode='exec')
     #print(ast.dump(ast.parse(src, mode='exec')))
@@ -298,6 +305,7 @@ def compute(sim, func, cutoff_radius=None, symbols={}, pre_step=False, skip_firs
 
     # Convert literal symbols
     symbols = {symbol: Lit.cvt(sim, value) for symbol, value in symbols.items()}
+    parameters = {pname: Parameter(sim, pname, ptype) for pname, ptype in parameters.items()}
 
     sim.init_block()
     sim.module_name(func.__name__)
@@ -305,14 +313,14 @@ def compute(sim, func, cutoff_radius=None, symbols={}, pre_step=False, skip_firs
     if nparams == 1:
         for i in ParticleFor(sim):
             for _ in Filter(sim, ScalarOp.cmp(sim.particle_flags[i] & Flags.Fixed, 0)):
-                ir = BuildParticleIR(sim, symbols)
+                ir = BuildParticleIR(sim, symbols, parameters)
                 ir.add_symbols({params[0]: i})
                 ir.visit(tree)
 
     else:
         for interaction_data in ParticleInteraction(sim, nparams, cutoff_radius):
             # Start building IR
-            ir = BuildParticleIR(sim, symbols)
+            ir = BuildParticleIR(sim, symbols, parameters)
             ir.add_symbols({
                 params[0]: interaction_data.i(),
                 params[1]: interaction_data.j(),
@@ -327,12 +335,13 @@ def compute(sim, func, cutoff_radius=None, symbols={}, pre_step=False, skip_firs
 
             ir.visit(tree)
 
-    if pre_step:
-        sim.build_pre_step_module_with_statements(skip_first=skip_first, profile=True)
-
+    if sim._generate_whole_program:
+        if pre_step:
+            sim.build_pre_step_module_with_statements(skip_first=skip_first, profile=True)
+        else:
+            sim.build_module_with_statements(skip_first=skip_first, profile=True)
     else:
-        sim.build_module_with_statements(skip_first=skip_first, profile=True)
-
+        sim.build_user_defined_function()
 
 def setup(sim, func, symbols={}):
     src = inspect.getsource(func)
@@ -358,4 +367,8 @@ def setup(sim, func, symbols={}):
         ir.add_symbols({params[0]: i})
         ir.visit(tree)
 
-    sim.build_setup_module_with_statements()
+    if sim._generate_whole_program:
+        sim.build_setup_module_with_statements()
+    else:
+        sim.build_user_defined_function()
+    
diff --git a/src/pairs/mapping/keywords.py b/src/pairs/mapping/keywords.py
index ad10c960b8bf87b3f828417f7f8c3049727b23f5..51c255de3767f567182afa8dc5616e5f85151604 100644
--- a/src/pairs/mapping/keywords.py
+++ b/src/pairs/mapping/keywords.py
@@ -10,6 +10,7 @@ from pairs.ir.quaternions import Quaternion
 from pairs.ir.scalars import ScalarOp
 from pairs.ir.select import Select
 from pairs.ir.types import Types
+from pairs.ir.print import Print
 from pairs.ir.vectors import Vector, ZeroVector
 from pairs.sim.shapes import Shapes
 
@@ -30,6 +31,9 @@ class Keywords:
     def exists(self, keyword):
         method = self.get_method(f"keyword_{keyword}")
         return method is not None
+    
+    def keyword_printf(self, args):
+        Print(self.sim, *args)
 
     def keyword_is_point_mass(self, args):
         assert len(args) == 1, "is_point_mass() keyword requires one parameter."
diff --git a/src/pairs/sim/cell_lists.py b/src/pairs/sim/cell_lists.py
index 016ca1a96892e41502d97237a03edafec02d36be..4038f48fb628f6c00c55c5cf535b43e8ce1a9af3 100644
--- a/src/pairs/sim/cell_lists.py
+++ b/src/pairs/sim/cell_lists.py
@@ -11,21 +11,37 @@ from pairs.ir.math import Ceil
 from pairs.ir.scalars import ScalarOp
 from pairs.ir.select import Select
 from pairs.ir.types import Types
-from pairs.ir.utils import Print
+from pairs.ir.print import Print
 from pairs.sim.flags import Flags
 from pairs.sim.lowerable import Lowerable
 
 
 class CellLists:
-    def __init__(self, sim, dom_part, spacing, cutoff_radius):
+    def __init__(self, sim, dom_part, spacing=None, cutoff_radius=None):
         self.sim = sim
         self.dom_part = dom_part
-        self.spacing = spacing if isinstance(spacing, list) else [spacing for d in range(sim.ndims())]
-        self.cutoff_radius = cutoff_radius
-        self.nneighbor_cells = [math.ceil(cutoff_radius / self.spacing[d]) for d in range(sim.ndims())]
-        self.nstencil_max = reduce((lambda x, y: x * y), [self.nneighbor_cells[d] * 2 + 1 for d in range(sim.ndims())])
+
+        # Cell spacing and cutoff radius can be set at runtime 
+        # only if they haven't been pre-set in the input script
+        if spacing:
+            self.spacing = spacing if isinstance(spacing, list) else [spacing for d in range(sim.ndims())]
+            self.runtime_spacing = False
+        else:
+            assert self.sim._generate_whole_program==False, "Cell spacing needs to be defined when generating whole program."
+            self.spacing = self.sim.add_array('spacing', self.sim.ndims(), Types.Real)
+            self.runtime_spacing = True
+
+        if cutoff_radius:
+            self.cutoff_radius = cutoff_radius
+            self.runtime_cutoff_radius = False
+        else:
+            assert self.sim._generate_whole_program==False, "cutoff_radius needs to be defined when generating whole program."
+            self.cutoff_radius = self.sim.add_var('cutoff_radius', Types.Real)
+            self.runtime_cutoff_radius = True
+
         # Data introduced in the simulation
         self.nstencil           =   self.sim.add_var('nstencil', Types.Int32)
+        self.nstencil_capacity  =   self.sim.add_var('nstencil_capacity', Types.Int32, 27)
         self.ncells             =   self.sim.add_var('ncells', Types.Int32, 1)
         self.ncells_capacity    =   self.sim.add_var('ncells_capacity', Types.Int32, 100000)
         self.cell_capacity      =   self.sim.add_var('cell_capacity', Types.Int32, 64)
@@ -34,7 +50,7 @@ class CellLists:
         self.cell_particles     =   self.sim.add_array('cell_particles', [self.ncells_capacity, self.cell_capacity], Types.Int32)
         self.cell_sizes         =   self.sim.add_array('cell_sizes', self.ncells_capacity, Types.Int32)
         self.nshapes            =   self.sim.add_array('nshapes', [self.ncells_capacity, self.sim.max_shapes()], Types.Int32)
-        self.stencil            =   self.sim.add_array('stencil', self.nstencil_max, Types.Int32)
+        self.stencil            =   self.sim.add_array('stencil', self.nstencil_capacity, Types.Int32)
         self.particle_cell      =   self.sim.add_array('particle_cell', self.sim.particle_capacity, Types.Int32)
 
         if sim._store_neighbors_per_cell:
@@ -52,8 +68,9 @@ class BuildCellListsStencil(Lowerable):
     def lower(self):
         stencil = self.cell_lists.stencil
         nstencil = self.cell_lists.nstencil
+        nstencil_capacity = self.cell_lists.nstencil_capacity
         spacing = self.cell_lists.spacing
-        nneighbor_cells = self.cell_lists.nneighbor_cells
+        cutoff_radius = self.cell_lists.cutoff_radius
         dim_ncells = self.cell_lists.dim_ncells
         ncells = self.cell_lists.ncells
         ncells_capacity = self.cell_lists.ncells_capacity
@@ -63,6 +80,7 @@ class BuildCellListsStencil(Lowerable):
 
         self.sim.module_name("build_cell_lists_stencil")
         self.sim.check_resize(ncells_capacity, ncells)
+        self.sim.check_resize(nstencil_capacity, nstencil)
 
         for s in range(self.sim.max_shapes()):
             Assign(self.sim, shapes_buffer[s], self.sim.get_shape_id(s))
@@ -79,7 +97,7 @@ class BuildCellListsStencil(Lowerable):
             Assign(self.sim, nstencil, 0)
 
             for dim in range(self.sim.ndims()):
-                nneigh = nneighbor_cells[dim]
+                nneigh = Ceil(self.sim,(cutoff_radius / spacing[dim]))
                 for dim_offset in For(self.sim, -nneigh, nneigh + 1):
                     index = dim_offset if index is None else index * dim_ncells[dim] + dim_offset
                     if dim == self.sim.ndims() - 1:
diff --git a/src/pairs/sim/comm.py b/src/pairs/sim/comm.py
index 3a439d8048517f2b4c8b0cb5023c1942787d71bd..00622afc06347dcd988bb9ec0cc0ddca5b7f4058 100644
--- a/src/pairs/sim/comm.py
+++ b/src/pairs/sim/comm.py
@@ -9,7 +9,7 @@ from pairs.ir.contexts import Contexts
 from pairs.ir.device import CopyArray
 from pairs.ir.functions import Call_Void
 from pairs.ir.loops import For, ParticleFor, While
-from pairs.ir.utils import Print
+from pairs.ir.print import Print, PrintCode
 from pairs.ir.select import Select
 from pairs.ir.sizeof import Sizeof
 from pairs.ir.types import Types
@@ -23,42 +23,62 @@ class Comm:
         self.nsend_all        = sim.add_var('nsend_all', Types.Int32)
         self.send_capacity    = sim.add_var('send_capacity', Types.Int32, 200000)
         self.recv_capacity    = sim.add_var('recv_capacity', Types.Int32, 200000)
-        self.elem_capacity    = sim.add_var('elem_capacity', Types.Int32, 40)
-        self.neigh_capacity   = sim.add_var('neigh_capacity', Types.Int32, 10)
-        self.nsend            = sim.add_array('nsend', [self.neigh_capacity], Types.Int32)
-        self.send_offsets     = sim.add_array('send_offsets', [self.neigh_capacity], Types.Int32)
+        self.elem_capacity    = sim.add_var('elem_capacity', Types.Int32, 100)
+        self.nsend            = sim.add_array('nsend', [dom_part.nranks_capacity], Types.Int32)
+        self.send_offsets     = sim.add_array('send_offsets', [dom_part.nranks_capacity], Types.Int32)
         self.send_buffer      = sim.add_array('send_buffer', [self.send_capacity, self.elem_capacity], Types.Real, arr_sync=False)
         self.send_map         = sim.add_array('send_map', [self.send_capacity], Types.Int32, arr_sync=False)
         self.exchg_flag       = sim.add_array('exchg_flag', [sim.particle_capacity], Types.Int32, arr_sync=False)
         self.exchg_copy_to    = sim.add_array('exchg_copy_to', [self.send_capacity], Types.Int32, arr_sync=False)
         self.send_mult        = sim.add_array('send_mult', [self.send_capacity, sim.ndims()], Types.Int32)
-        self.nrecv            = sim.add_array('nrecv', [self.neigh_capacity], Types.Int32)
-        self.recv_offsets     = sim.add_array('recv_offsets', [self.neigh_capacity], Types.Int32)
+        self.nrecv            = sim.add_array('nrecv', [dom_part.nranks_capacity], Types.Int32)
+        self.recv_offsets     = sim.add_array('recv_offsets', [dom_part.nranks_capacity], Types.Int32)
         self.recv_buffer      = sim.add_array('recv_buffer', [self.recv_capacity, self.elem_capacity], Types.Real, arr_sync=False)
         self.recv_map         = sim.add_array('recv_map', [self.recv_capacity], Types.Int32)
         self.recv_mult        = sim.add_array('recv_mult', [self.recv_capacity, sim.ndims()], Types.Int32)
-        self.nsend_contact    = sim.add_array('nsend_contact', [self.neigh_capacity], Types.Int32)
-        self.nrecv_contact    = sim.add_array('nrecv_contact', [self.neigh_capacity], Types.Int32)
-        self.contact_soffsets = sim.add_array('contact_soffsets', [self.neigh_capacity], Types.Int32)
-        self.contact_roffsets = sim.add_array('contact_roffsets', [self.neigh_capacity], Types.Int32)
+        self.nsend_contact    = sim.add_array('nsend_contact', [dom_part.nranks_capacity], Types.Int32)
+        self.nrecv_contact    = sim.add_array('nrecv_contact', [dom_part.nranks_capacity], Types.Int32)
+        self.contact_soffsets = sim.add_array('contact_soffsets', [dom_part.nranks_capacity], Types.Int32)
+        self.contact_roffsets = sim.add_array('contact_roffsets', [dom_part.nranks_capacity], Types.Int32)
+        
+        if self.sim.properties.reduction_props():
+            self.nsend_reverse            = sim.add_array('nsend_reverse', [dom_part.nranks_capacity], Types.Int32)
+            self.send_offsets_reverse     = sim.add_array('send_offsets_reverse', [dom_part.nranks_capacity], Types.Int32)
+            self.send_buffer_reverse      = sim.add_array('send_buffer_reverse', [self.send_capacity, self.elem_capacity], Types.Real, arr_sync=False)
+            self.nrecv_reverse            = sim.add_array('nrecv_reverse', [dom_part.nranks_capacity], Types.Int32)
+            self.recv_offsets_reverse     = sim.add_array('recv_offsets_reverse', [dom_part.nranks_capacity], Types.Int32)
+            self.recv_buffer_reverse      = sim.add_array('recv_buffer_reverse', [self.recv_capacity, self.elem_capacity], Types.Real, arr_sync=False)
+
+
+class Synchronize(Lowerable):
+    def __init__(self, comm):
+        self.sim = comm.sim
+        self.comm = comm
 
     @pairs_inline
-    def synchronize(self):
+    def lower(self):
         # Every property that is not constant across timesteps and have neighbor accesses during any
         # interaction kernel (i.e. property[j] in force calculation kernel)
         prop_names = ['position', 'linear_velocity', 'angular_velocity']
         prop_list = [self.sim.property(p) for p in prop_names if self.sim.property(p) is not None]
 
-        PackAllGhostParticles(self, prop_list)
-        CommunicateAllData(self, prop_list)
-        UnpackAllGhostParticles(self, prop_list)
+        PackAllGhostParticles(self.comm, prop_list)
+        CommunicateAllData(self.comm, prop_list)
+        UnpackAllGhostParticles(self.comm, prop_list)
+
+
+class Borders(Lowerable):
+    def __init__(self, comm):
+        self.sim = comm.sim
+        self.comm = comm
 
     @pairs_inline
-    def borders(self):
+    def lower(self):
         # Every property that has neighbor accesses during any interaction kernel (i.e. property[j]
         # exists in any force calculation kernel)
         # We ignore normal because there should be no ghost half-spaces
         prop_names = [
+            'flags',
             'uid',
             'type',
             'mass',
@@ -71,84 +91,128 @@ class Comm:
 
         prop_list = [self.sim.property(p) for p in prop_names if self.sim.property(p) is not None]
 
-        Assign(self.sim, self.nsend_all, 0)
+        Assign(self.sim, self.comm.nsend_all, 0)
         Assign(self.sim, self.sim.nghost, 0)
 
-        for step in range(self.dom_part.number_of_steps()):
+        for step in range(self.comm.dom_part.number_of_steps()):
             if self.sim._target.is_gpu():
-                CopyArray(self.sim, self.nsend, Contexts.Host, Actions.Ignore)
-                CopyArray(self.sim, self.nrecv, Contexts.Host, Actions.Ignore)
+                CopyArray(self.sim, self.comm.nsend, Contexts.Host, Actions.Ignore)
+                CopyArray(self.sim, self.comm.nrecv, Contexts.Host, Actions.Ignore)
 
-            for j in self.dom_part.step_indexes(step):
-                Assign(self.sim, self.nsend[j], 0)
-                Assign(self.sim, self.nrecv[j], 0)
+            for j in self.comm.dom_part.step_indexes(step):
+                Assign(self.sim, self.comm.nsend[j], 0)
+                Assign(self.sim, self.comm.nrecv[j], 0)
 
             if self.sim._target.is_gpu():
-                CopyArray(self.sim, self.nsend, Contexts.Device, Actions.Ignore)
-                CopyArray(self.sim, self.nrecv, Contexts.Device, Actions.Ignore)
+                CopyArray(self.sim, self.comm.nsend, Contexts.Device, Actions.Ignore)
+                CopyArray(self.sim, self.comm.nrecv, Contexts.Device, Actions.Ignore)
 
-            DetermineGhostParticles(self, step, self.sim.cell_spacing())
-            CommunicateSizes(self, step)
-            SetCommunicationOffsets(self, step)
-            PackGhostParticles(self, step, prop_list)
-            CommunicateData(self, step, prop_list)
-            UnpackGhostParticles(self, step, prop_list)
+            DetermineGhostParticles(self.comm, step, self.sim.cell_spacing())
+            CommunicateSizes(self.comm, step)
+            SetCommunicationOffsets(self.comm, step)
+            PackGhostParticles(self.comm, step, prop_list)
+            CommunicateData(self.comm, step, prop_list)
+            UnpackGhostParticles(self.comm, step, prop_list)
 
-            step_nrecv = sum([self.nrecv[j] for j in self.dom_part.step_indexes(step)])
+            step_nrecv = self.comm.dom_part.reduce_sum_step_indexes(step, self.comm.nrecv)
             Assign(self.sim, self.sim.nghost, self.sim.nghost + step_nrecv)
 
+
+class Exchange(Lowerable):
+    def __init__(self, comm):
+        self.sim = comm.sim
+        self.comm = comm
+
     @pairs_inline
-    def exchange(self):
+    def lower(self):
         # Every property except volatiles
         prop_list = self.sim.properties.non_volatiles()
 
-        for step in range(self.dom_part.number_of_steps()):
-            Assign(self.sim, self.nsend_all, 0)
-            Assign(self.sim, self.sim.nghost, 0)
+        for step in range(self.comm.dom_part.number_of_steps()):
+            Assign(self.comm.sim, self.comm.nsend_all, 0)
+            Assign(self.comm.sim, self.sim.nghost, 0)
 
             for s in range(step + 1):
-                for j in self.dom_part.step_indexes(s):
-                    Assign(self.sim, self.nsend[j], 0)
-                    Assign(self.sim, self.nrecv[j], 0)
-                    Assign(self.sim, self.send_offsets[j], 0)
-                    Assign(self.sim, self.recv_offsets[j], 0)
-                    Assign(self.sim, self.nsend_contact[j], 0)
-                    Assign(self.sim, self.nrecv_contact[j], 0)
-                    Assign(self.sim, self.contact_soffsets[j], 0)
-                    Assign(self.sim, self.contact_soffsets[j], 0)
+                for j in self.comm.dom_part.step_indexes(s):
+                    Assign(self.comm.sim, self.comm.nsend[j], 0)
+                    Assign(self.comm.sim, self.comm.nrecv[j], 0)
+                    Assign(self.comm.sim, self.comm.send_offsets[j], 0)
+                    Assign(self.comm.sim, self.comm.recv_offsets[j], 0)
+                    Assign(self.comm.sim, self.comm.nsend_contact[j], 0)
+                    Assign(self.comm.sim, self.comm.nrecv_contact[j], 0)
+                    Assign(self.comm.sim, self.comm.contact_soffsets[j], 0)
+                    Assign(self.comm.sim, self.comm.contact_roffsets[j], 0)
 
             if self.sim._target.is_gpu():
-                CopyArray(self.sim, self.nsend, Contexts.Device, Actions.Ignore)
-                CopyArray(self.sim, self.nrecv, Contexts.Device, Actions.Ignore)
+                CopyArray(self.comm.sim, self.comm.nsend, Contexts.Device, Actions.Ignore)
+                CopyArray(self.comm.sim, self.comm.nrecv, Contexts.Device, Actions.Ignore)
 
-            DetermineGhostParticles(self, step, 0.0)
-            CommunicateSizes(self, step)
-            SetCommunicationOffsets(self, step)
-            PackGhostParticles(self, step, prop_list)
+            DetermineGhostParticles(self.comm, step, 0.0)
+            CommunicateSizes(self.comm, step)
+            SetCommunicationOffsets(self.comm, step)
+            PackGhostParticles(self.comm, step, prop_list)
 
             if self.sim._target.is_gpu():
-                send_map_size = self.nsend_all * Sizeof(self.sim, Types.Int32)
-                exchg_flag_size = self.sim.nlocal * Sizeof(self.sim, Types.Int32)
-                CopyArray(self.sim, self.send_map, Contexts.Host, Actions.ReadOnly, send_map_size)
-                CopyArray(self.sim, self.exchg_flag, Contexts.Host, Actions.ReadOnly, exchg_flag_size)
+                send_map_size = self.comm.nsend_all * Sizeof(self.comm.sim, Types.Int32)
+                exchg_flag_size = self.sim.nlocal * Sizeof(self.comm.sim, Types.Int32)
+                CopyArray(self.comm.sim, self.comm.send_map, Contexts.Host, Actions.ReadOnly, send_map_size)
+                CopyArray(self.comm.sim, self.comm.exchg_flag, Contexts.Host, Actions.ReadOnly, exchg_flag_size)
 
-            RemoveExchangedParticles_part1(self)
+            RemoveExchangedParticles_part1(self.comm)
 
             if self.sim._target.is_gpu():
-                exchg_copy_to_size = self.nsend_all * Sizeof(self.sim, Types.Int32)
+                exchg_copy_to_size = self.comm.nsend_all * Sizeof(self.comm.sim, Types.Int32)
                 CopyArray(
-                    self.sim, self.exchg_copy_to, Contexts.Device, Actions.ReadOnly, exchg_copy_to_size)
+                    self.comm.sim, self.comm.exchg_copy_to, Contexts.Device, Actions.ReadOnly, exchg_copy_to_size)
 
-            RemoveExchangedParticles_part2(self, prop_list)
-            CommunicateData(self, step, prop_list)
-            UnpackGhostParticles(self, step, prop_list)
+            RemoveExchangedParticles_part2(self.comm, prop_list)
+            CommunicateData(self.comm, step, prop_list)
+            UnpackGhostParticles(self.comm, step, prop_list)
 
             if self.sim._use_contact_history:
-                PackContactHistoryData(self, step)
-                CommunicateContactHistoryData(self, step)
-                UnpackContactHistoryData(self, step)
+                PackContactHistoryData(self.comm, step)
+                CommunicateContactHistoryData(self.comm, step)
+                UnpackContactHistoryData(self.comm, step)
+
+            ChangeSizeAfterExchange(self.comm, step)
+
+
+class ReverseComm(Lowerable):
+    def __init__(self, comm, reduce=False):
+        self.sim = comm.sim
+        self.comm = comm
+        self.reduce = reduce
+
+    @pairs_inline
+    def lower(self):
+        prop_list = self.sim.properties.reduction_props()
+
+        if prop_list :
+            for step in range(self.comm.dom_part.number_of_steps() - 1, -1, -1):
+                if self.sim._target.is_gpu():
+                    CopyArray(self.sim, self.comm.nsend, Contexts.Host, Actions.ReadOnly)
+                    CopyArray(self.sim, self.comm.nrecv, Contexts.Host, Actions.ReadOnly)
+                    CopyArray(self.sim, self.comm.send_offsets, Contexts.Host, Actions.ReadOnly)
+                    CopyArray(self.sim, self.comm.recv_offsets, Contexts.Host, Actions.ReadOnly)
+
+                    CopyArray(self.sim, self.comm.nsend_reverse, Contexts.Host, Actions.WriteOnly)
+                    CopyArray(self.sim, self.comm.nrecv_reverse, Contexts.Host, Actions.WriteOnly)
+                    CopyArray(self.sim, self.comm.send_offsets_reverse, Contexts.Host, Actions.WriteOnly)
+                    CopyArray(self.sim, self.comm.recv_offsets_reverse, Contexts.Host, Actions.WriteOnly)
+                
+                for j in self.comm.dom_part.step_indexes(step):
+                    Assign(self.sim, self.comm.nsend_reverse[j], self.comm.nrecv[j])
+                    Assign(self.sim, self.comm.nrecv_reverse[j], self.comm.nsend[j])
+                    Assign(self.sim, self.comm.send_offsets_reverse[j], self.comm.recv_offsets[j])
+                    Assign(self.sim, self.comm.recv_offsets_reverse[j], self.comm.send_offsets[j])
+
+                PackGhostParticlesReverse(self.comm, step, prop_list)
+                CommunicateDataReverse(self.comm, step, prop_list)
+                UnpackGhostParticlesReverse(self.comm, step, prop_list, self.reduce)
+
+
+
 
-            ChangeSizeAfterExchange(self, step)
 
 
 class CommunicateSizes(Lowerable):
@@ -160,7 +224,7 @@ class CommunicateSizes(Lowerable):
 
     @pairs_inline
     def lower(self):
-        Call_Void(self.sim, "pairs->communicateSizes", [self.step, self.comm.nsend, self.comm.nrecv])
+        Call_Void(self.sim, "pairs_runtime->communicateSizes", [self.step, self.comm.nsend, self.comm.nrecv])
 
 
 class CommunicateData(Lowerable):
@@ -176,12 +240,29 @@ class CommunicateData(Lowerable):
         elem_size = sum([Types.number_of_elements(self.sim, p.type()) for p in self.prop_list])
 
         Call_Void(self.sim,
-                  "pairs->communicateData",
+                  "pairs_runtime->communicateData",
                   [self.step, elem_size,
                    self.comm.send_buffer, self.comm.send_offsets, self.comm.nsend,
                    self.comm.recv_buffer, self.comm.recv_offsets, self.comm.nrecv])
 
+class CommunicateDataReverse(Lowerable):
+    def __init__(self, comm, step, prop_list):
+        super().__init__(comm.sim)
+        self.comm = comm
+        self.step = step
+        self.prop_list = prop_list
+        self.sim.add_statement(self)
+
+    @pairs_inline
+    def lower(self):
+        elem_size = sum([Types.number_of_elements(self.sim, p.type()) for p in self.prop_list])
 
+        Call_Void(self.sim,
+                  "pairs_runtime->communicateDataReverse",
+                  [self.step, elem_size,
+                   self.comm.send_buffer_reverse, self.comm.send_offsets_reverse, self.comm.nsend_reverse,
+                   self.comm.recv_buffer_reverse, self.comm.recv_offsets_reverse, self.comm.nrecv_reverse])
+        
 class CommunicateContactHistoryData(Lowerable):
     def __init__(self, comm, step):
         super().__init__(comm.sim)
@@ -195,7 +276,7 @@ class CommunicateContactHistoryData(Lowerable):
                                   for cp in self.sim.contact_properties]) + 1
 
         Call_Void(self.sim,
-                  "pairs->communicateContactHistoryData",
+                  "pairs_runtime->communicateContactHistoryData",
                   [self.step, nelems_per_contact,
                    self.comm.send_buffer, self.comm.contact_soffsets, self.comm.nsend_contact,
                    self.comm.recv_buffer, self.comm.contact_roffsets, self.comm.nrecv_contact])
@@ -214,7 +295,7 @@ class CommunicateAllData(Lowerable):
 
         Call_Void(
             self.sim,
-            "pairs->communicateAllData",
+            "pairs_runtime->communicateAllData",
             [self.comm.dom_part.number_of_steps(), elem_size,
              self.comm.send_buffer, self.comm.send_offsets, self.comm.nsend,
              self.comm.recv_buffer, self.comm.recv_offsets, self.comm.nrecv])
@@ -241,6 +322,7 @@ class DetermineGhostParticles(Lowerable):
         self.sim.check_resize(self.comm.send_capacity, nsend)
         #self.sim.check_resize(self.comm.send_capacity, nsend_all)
 
+        # PrintCode(self.sim, f"std::cout << \"resizes[0] {self.sim._module_name} ========== \" << pobj->resizes[0] << std::endl;")
         if is_exchange:
             for i in ParticleFor(self.sim):
                 Assign(self.sim, exchg_flag[i], 0)
@@ -274,18 +356,18 @@ class SetCommunicationOffsets(Lowerable):
         recv_offsets = self.comm.recv_offsets
         self.sim.module_name(f"set_communication_offsets{self.step}")
 
-        isend = 0
-        irecv = 0
+        isend = self.sim.add_temp_var(0)
+        irecv = self.sim.add_temp_var(0)
         for i in range(self.step):
             for j in self.comm.dom_part.step_indexes(i):
-                isend += nsend[j]
-                irecv += nrecv[j]
+                Assign(self.sim, isend, ScalarOp.inline(isend + nsend[j]))
+                Assign(self.sim, irecv, ScalarOp.inline(irecv + nrecv[j]))
 
         for j in self.comm.dom_part.step_indexes(self.step):
             Assign(self.sim, send_offsets[j], isend)
             Assign(self.sim, recv_offsets[j], irecv)
-            isend += nsend[j]
-            irecv += nrecv[j]
+            Assign(self.sim, isend, ScalarOp.inline(isend + nsend[j]))
+            Assign(self.sim, irecv, ScalarOp.inline(irecv + nrecv[j]))
 
 
 class PackGhostParticles(Lowerable):
@@ -307,9 +389,9 @@ class PackGhostParticles(Lowerable):
         send_mult = self.comm.send_mult
         self.sim.module_name(f"pack_ghost_particles{self.step}_" + "_".join([str(p.id()) for p in self.prop_list]))
 
-        step_indexes = self.comm.dom_part.step_indexes(self.step)
-        start = self.comm.send_offsets[step_indexes[0]]
-        for i in For(self.sim, start, ScalarOp.inline(start + sum([self.comm.nsend[j] for j in step_indexes]))):
+        start = self.comm.send_offsets[self.comm.dom_part.first_step_index(self.step)]
+        end = ScalarOp.inline(start + self.comm.dom_part.reduce_sum_step_indexes(self.step, self.comm.nsend))
+        for i in For(self.sim, start, end):
             p_offset = 0
             m = send_map[i]
             for p in self.prop_list:
@@ -329,6 +411,44 @@ class PackGhostParticles(Lowerable):
                     Assign(self.sim, send_buffer[i][p_offset], cast_fn(p[m]))
                     p_offset += 1
 
+class PackGhostParticlesReverse(Lowerable):
+    def __init__(self, comm, step, prop_list):
+        super().__init__(comm.sim)
+        self.comm = comm
+        self.step = step
+        self.prop_list = prop_list
+        self.sim.add_statement(self)
+
+    def get_elems_per_particle(self):
+        return sum([Types.number_of_elements(self.sim, p.type()) for p in self.prop_list])
+
+    @pairs_device_block
+    def lower(self):
+        nlocal = self.sim.nlocal
+        send_buffer_reverse = self.comm.send_buffer_reverse
+        send_buffer_reverse.set_stride(1, self.get_elems_per_particle())
+
+        self.sim.module_name(f"pack_ghost_particles_reverse{self.step}_" + "_".join([str(p.id()) for p in self.prop_list]))
+
+        start = self.comm.send_offsets_reverse[self.comm.dom_part.first_step_index(self.step)]
+        end = ScalarOp.inline(start + self.comm.dom_part.reduce_sum_step_indexes(self.step, self.comm.nsend_reverse))
+        for i in For(self.sim, start, end):
+            p_offset = 0
+            m = nlocal + i
+            for p in self.prop_list:
+                if not Types.is_scalar(p.type()):
+                    nelems = Types.number_of_elements(self.sim, p.type())
+                    for e in range(nelems):
+                        src = p[m][e]
+                        Assign(self.sim, send_buffer_reverse[i][p_offset + e], src)
+
+                    p_offset += nelems
+
+                else:
+                    cast_fn = lambda x: Cast(self.sim, x, Types.Real) if p.type() != Types.Real else x
+                    Assign(self.sim, send_buffer_reverse[i][p_offset], cast_fn(p[m]))
+                    p_offset += 1
+
             
 class UnpackGhostParticles(Lowerable):
     def __init__(self, comm, step, prop_list):
@@ -348,9 +468,9 @@ class UnpackGhostParticles(Lowerable):
         recv_buffer.set_stride(1, self.get_elems_per_particle())
         self.sim.module_name(f"unpack_ghost_particles{self.step}_" + "_".join([str(p.id()) for p in self.prop_list]))
 
-        step_indexes = self.comm.dom_part.step_indexes(self.step)
-        start = self.comm.recv_offsets[step_indexes[0]]
-        for i in For(self.sim, start, ScalarOp.inline(start + sum([self.comm.nrecv[j] for j in step_indexes]))):
+        start = self.comm.recv_offsets[self.comm.dom_part.first_step_index(self.step)]
+        end = ScalarOp.inline(start + self.comm.dom_part.reduce_sum_step_indexes(self.step, self.comm.nrecv))
+        for i in For(self.sim, start, end):
             p_offset = 0
             for p in self.prop_list:
                 if not Types.is_scalar(p.type()):
@@ -365,6 +485,49 @@ class UnpackGhostParticles(Lowerable):
                     Assign(self.sim, p[nlocal + i], cast_fn(recv_buffer[i][p_offset]))
                     p_offset += 1
 
+class UnpackGhostParticlesReverse(Lowerable):
+    def __init__(self, comm, step, prop_list, reduce=False):
+        super().__init__(comm.sim)
+        self.comm = comm
+        self.step = step
+        self.prop_list = prop_list
+        self.reduce = reduce
+        self.sim.add_statement(self)
+        
+
+    def get_elems_per_particle(self):
+        return sum([Types.number_of_elements(self.sim, p.type()) for p in self.prop_list])
+
+    @pairs_device_block
+    def lower(self):
+        send_map = self.comm.send_map
+        recv_buffer_reverse = self.comm.recv_buffer_reverse
+        recv_buffer_reverse.set_stride(1, self.get_elems_per_particle())
+        self.sim.module_name(f"unpack_ghost_particles_reverse{self.step}_" + "_".join([str(p.id()) for p in self.prop_list]))
+
+        start = self.comm.recv_offsets_reverse[self.comm.dom_part.first_step_index(self.step)]
+        end = ScalarOp.inline(start + self.comm.dom_part.reduce_sum_step_indexes(self.step, self.comm.nrecv_reverse))
+        for i in For(self.sim, start, end):
+            p_offset = 0
+            m = send_map[i]
+            for p in self.prop_list:
+                if not Types.is_scalar(p.type()):
+                    nelems = Types.number_of_elements(self.sim, p.type())
+                    for e in range(nelems):
+                        if self.reduce:
+                            AtomicInc(self.sim, p[m][e], recv_buffer_reverse[i][p_offset + e])
+                        else:
+                            Assign(self.sim, p[m][e], recv_buffer_reverse[i][p_offset + e])
+
+                    p_offset += nelems
+
+                else:
+                    cast_fn = lambda x: Cast(self.sim, x, p.type()) if p.type() != Types.Real else x
+                    if self.reduce:
+                        AtomicInc(self.sim, p[m], cast_fn(recv_buffer_reverse[i][p_offset]))
+                    else:
+                        Assign(self.sim, p[m], cast_fn(recv_buffer_reverse[i][p_offset]))
+                    p_offset += 1
 
 class PackAllGhostParticles(Lowerable):
     def __init__(self, comm, prop_list):
@@ -423,9 +586,7 @@ class UnpackAllGhostParticles(Lowerable):
         recv_buffer.set_stride(1, self.get_elems_per_particle())
         self.sim.module_name(f"unpack_all_ghost_particles" + "_".join([str(p.id()) for p in self.prop_list]))
 
-        nrecv_size = sum([len(dom_part.step_indexes(s)) for s in range(dom_part.number_of_steps())])
-        nrecv_all = sum([self.comm.nrecv[j] for j in range(nrecv_size)])
-
+        nrecv_all = self.comm.dom_part.reduce_sum_all_steps(self.comm.nrecv)
         for i in For(self.sim, 0, nrecv_all):
             p_offset = 0
             for p in self.prop_list:
@@ -517,7 +678,8 @@ class ChangeSizeAfterExchange(Lowerable):
     def lower(self):
         self.sim.module_name(f"change_size_after_exchange{self.step}")
         self.sim.check_resize(self.sim.particle_capacity, self.sim.nlocal)
-        Assign(self.sim, self.sim.nlocal, self.sim.nlocal + sum([self.comm.nrecv[j] for j in self.comm.dom_part.step_indexes(self.step)]))
+        nrecv = self.comm.dom_part.reduce_sum_step_indexes(self.step, self.comm.nrecv)
+        Assign(self.sim, self.sim.nlocal, self.sim.nlocal + nrecv)
 
 
 class PackContactHistoryData(Lowerable):
@@ -612,7 +774,6 @@ class UnpackContactHistoryData(Lowerable):
         contact_used = self.sim._contact_history.contact_used
         self.sim.module_name(f"unpack_contact_history{self.step}")
 
-        step_indexes = self.comm.dom_part.step_indexes(self.step)
         nelems_per_contact = sum([Types.number_of_elements(self.sim, cp.type()) \
                                   for cp in self.sim.contact_properties]) + 1
 
diff --git a/src/pairs/sim/contact_history.py b/src/pairs/sim/contact_history.py
index 9a1a94d76e41ce8bd045ace9df45fcb45e0e8838..51b76f49d5bd9e2458f1af57d241a9b485a80bdc 100644
--- a/src/pairs/sim/contact_history.py
+++ b/src/pairs/sim/contact_history.py
@@ -4,7 +4,7 @@ from pairs.ir.branches import Branch, Filter
 from pairs.ir.loops import ParticleFor, For, While
 from pairs.ir.scalars import ScalarOp
 from pairs.ir.types import Types
-from pairs.ir.utils import Print
+from pairs.ir.print import Print
 from pairs.sim.interaction import NeighborFor
 from pairs.sim.lowerable import Lowerable
 
diff --git a/src/pairs/sim/domain.py b/src/pairs/sim/domain.py
index a30635397da67196f6d72b8c337d42e7a0062779..2560f1a8759727a4d7c0167da12955a88c67aa63 100644
--- a/src/pairs/sim/domain.py
+++ b/src/pairs/sim/domain.py
@@ -1,17 +1,18 @@
 from pairs.ir.block import pairs_inline
-from pairs.ir.functions import Call_Void
-from pairs.ir.types import Types
 from pairs.sim.lowerable import Lowerable
 
-
 class InitializeDomain(Lowerable):
     def __init__(self, sim):
         super().__init__(sim)
 
     @pairs_inline
     def lower(self):
-        dom_part = self.sim.domain_partitioning()
-        grid_array = [(self.sim.grid.min(d), self.sim.grid.max(d)) for d in range(self.sim.ndims())]
-        Call_Void(self.sim, "pairs->initDomain", [param for delim in grid_array for param in delim]),
-        Call_Void(self.sim, "pairs->fillCommunicationArrays", [dom_part.neighbor_ranks, dom_part.pbc, dom_part.subdom])
+        self.sim.domain_partitioning().initialize()
+
+class UpdateDomain(Lowerable):
+    def __init__(self, sim):
+        super().__init__(sim)
 
+    @pairs_inline
+    def lower(self):
+        self.sim.domain_partitioning().update()
diff --git a/src/pairs/sim/domain_partitioners.py b/src/pairs/sim/domain_partitioners.py
index 6e00ad848f8496fbf8c24aa36ac3a06e813dcc07..d99c6cd8aba45f7e344309fe953cd53585053b97 100644
--- a/src/pairs/sim/domain_partitioners.py
+++ b/src/pairs/sim/domain_partitioners.py
@@ -2,10 +2,10 @@ class DomainPartitioners:
     Invalid = -1
     Regular = 0
     RegularXY = 1
-    BoxList = 2
+    BlockForest = 2
 
     def c_keyword(layout):
-        return "Regular"    if layout == DomainPartitioners.Regular else \
-               "RegularXY"  if layout == DomainPartitioners.RegularXY else \
-               "BoxList"    if layout == DomainPartitioners.BoxList else \
+        return "RegularPartitioning"        if layout == DomainPartitioners.Regular else \
+               "RegularXYPartitioning"      if layout == DomainPartitioners.RegularXY else \
+               "BlockForestPartitioning"    if layout == DomainPartitioners.BlockForest else \
                "Invalid"
diff --git a/src/pairs/sim/domain_partitioning.py b/src/pairs/sim/domain_partitioning.py
index 901df44744426921cb896e9c2781caff73c5521d..485e1a6cbb8c8fe54dabdadd7efd0e7368c3d23b 100644
--- a/src/pairs/sim/domain_partitioning.py
+++ b/src/pairs/sim/domain_partitioning.py
@@ -1,20 +1,27 @@
-from pairs.ir.block import pairs_device_block, pairs_host_block
-from pairs.ir.branches import Branch, Filter
-from pairs.ir.loops import For, ParticleFor
-from pairs.ir.utils import Print
+from pairs.ir.assign import Assign
+from pairs.ir.branches import Filter
+from pairs.ir.loops import For
+from pairs.ir.functions import Call_Int, Call_Void, Call
 from pairs.ir.scalars import ScalarOp
 from pairs.ir.select import Select
 from pairs.ir.types import Types
 from pairs.sim.flags import Flags
-from pairs.sim.lowerable import Lowerable
-
-
+from pairs.ir.lit import Lit
+from pairs.sim.grid import MutableGrid
+from pairs.ir.device import CopyArray
+from pairs.ir.contexts import Contexts
+from pairs.ir.actions import Actions
+from pairs.sim.load_balancing_algorithms import LoadBalancingAlgorithms
+from pairs.ir.print import PrintCode
 class DimensionRanges:
     def __init__(self, sim):
-        self.sim            = sim
-        self.neighbor_ranks = sim.add_static_array('neighbor_ranks', [sim.ndims() * 2], Types.Int32)
-        self.pbc            = sim.add_static_array('pbc', [sim.ndims() * 2], Types.Int32)
-        self.subdom         = sim.add_static_array('subdom', [sim.ndims() * 2], Types.Real)
+        self.sim                = sim
+        self.nranks             = 6
+        self.nranks_capacity    = self.nranks
+        self.neighbor_ranks     = sim.add_static_array('neighbor_ranks', [sim.ndims() * 2], Types.Int32)
+        self.pbc                = sim.add_static_array('pbc', [sim.ndims() * 2], Types.Int32)
+        self.subdom             = sim.add_static_array('subdom', [sim.ndims() * 2], Types.Real)
+        self.rank               = sim.add_var('rank', Types.Int32)
 
     def min(self, dim):
         return self.subdom[dim * 2 + 0]
@@ -28,6 +35,33 @@ class DimensionRanges:
     def step_indexes(self, step):
         return [step * 2 + 0, step * 2 + 1]
 
+    def first_step_index(self, step):
+        return self.step_indexes(step)[0]
+
+    def reduce_sum_all_steps(self, array):
+        total_size = sum([len(self.step_indexes(s)) for s in range(self.number_of_steps())])
+        return sum([array[i] for i in range(total_size)])
+
+    def reduce_sum_step_indexes(self, step, array):
+       return sum([array[i] for i in self.step_indexes(step)])
+
+    def initialize(self):
+        grid_array = [self.sim.grid.min(d) for d in range(self.sim.ndims())] + [self.sim.grid.max(d) for d in range(self.sim.ndims())]
+        Call_Void(self.sim, "pairs_runtime->initDomain", grid_array)
+
+    def update(self):
+        Call_Void(self.sim, "pairs_runtime->updateDomain", [])
+        Assign(self.sim, self.rank, Call_Int(self.sim, "pairs_runtime->getDomainPartitioner()->getRank", []))
+
+        Call_Void(self.sim, "pairs_runtime->copyRuntimeArray", ['neighbor_ranks', self.neighbor_ranks, self.sim.ndims() * 2])
+        Call_Void(self.sim, "pairs_runtime->copyRuntimeArray", ['pbc', self.pbc, self.sim.ndims() * 2])
+        Call_Void(self.sim, "pairs_runtime->copyRuntimeArray", ['subdom', self.subdom, self.sim.ndims() * 2])
+
+        if isinstance(self.sim.grid, MutableGrid):
+            for d in range(self.sim.dims):
+                Assign(self.sim, self.sim.grid.min(d), Call(self.sim, "pairs_runtime->getDomainPartitioner()->getMin", [d], Types.Real))
+                Assign(self.sim, self.sim.grid.max(d), Call(self.sim, "pairs_runtime->getDomainPartitioner()->getMax", [d], Types.Real))
+
     def ghost_particles(self, step, position, offset=0.0):
         # Particles with one of the following flags are ignored
         flags_to_exclude = (Flags.Infinite | Flags.Global)
@@ -40,11 +74,8 @@ class DimensionRanges:
                         pbc_shifts = [0 if d != step else self.pbc[j] for d in range(self.sim.ndims())]
                         yield i, j, self.neighbor_ranks[j], pbc_shifts
 
-
-
         def prev_neighbor(self, j, step, position, offset, flags_to_exclude):
             particle_flags = self.sim.particle_flags
-            j = step * 2 + 1
             for i in For(self.sim, 0, self.sim.nlocal + self.sim.nghost):
                 for _ in Filter(self.sim, ScalarOp.cmp(particle_flags[i] & flags_to_exclude, 0)):
                     for _ in Filter(self.sim, position[i][step] > self.subdom[j] - offset):
@@ -63,3 +94,160 @@ class DimensionRanges:
             j = step * 2 + 1
             for _ in Filter(self.sim, ScalarOp.inline(ScalarOp.cmp(self.pbc[j], 0))):
                 yield from prev_neighbor(self, j, step, position, offset, flags_to_exclude)
+
+
+class BlockForest:
+    def __init__(self, sim):
+        self.sim                = sim
+        self.load_balancer      = None
+        self.regrid_min         = None
+        self.regrid_max         = None
+        self.reduce_step        = sim.add_var('reduce_step', Types.Int32)   # this var is treated as a tmp (workaround for gpu)
+        self.reduce_step.force_read = True
+        self.rank               = sim.add_var('rank', Types.Int32)
+        self.nranks             = sim.add_var('nranks', Types.Int32)
+        self.nranks_capacity    = sim.add_var('nranks_capacity', Types.Int32, init_value=27)
+        self.ntotal_aabbs       = sim.add_var('ntotal_aabbs', Types.Int32)
+        self.aabb_capacity      = sim.add_var('aabb_capacity', Types.Int32, init_value=27)
+        self.ranks              = sim.add_array('ranks', [self.nranks_capacity], Types.Int32)
+        self.naabbs             = sim.add_array('naabbs', [self.nranks_capacity], Types.Int32)
+        self.aabb_offsets       = sim.add_array('aabb_offsets', [self.nranks_capacity], Types.Int32)
+        self.aabbs              = sim.add_array('aabbs', [self.aabb_capacity, 6], Types.Real)
+        self.subdom             = sim.add_array('subdom', [sim.ndims() * 2], Types.Real)
+
+    def min(self, dim):
+        return self.subdom[dim * 2 + 0]
+
+    def max(self, dim):
+        return self.subdom[dim * 2 + 1]
+
+    def number_of_steps(self):
+        return 1
+
+    def step_indexes(self, step):
+        yield from For(self.sim, 0, self.nranks, not_kernel=True)
+
+    def first_step_index(self, step):
+        return 0
+
+    def reduce_sum_all_steps(self, array):
+        return self.reduce_sum_step_indexes(0, array)
+
+    def reduce_sum_step_indexes(self, step, array):
+        Assign(self.sim, self.reduce_step, 0)
+        for i in For(self.sim, 0, self.nranks, not_kernel=True):
+            Assign(self.sim, self.reduce_step, ScalarOp.inline( self.reduce_step + array[i]))
+            
+        return self.reduce_step
+
+    def initialize(self):
+        grid_array = [self.sim.grid.min(d) for d in range(self.sim.ndims())] + [self.sim.grid.max(d) for d in range(self.sim.ndims())]
+
+        Call_Void(self.sim, "pairs_runtime->initDomain", 
+                  grid_array + self.sim._pbc + ([True] if self.load_balancer is not None else []))
+        
+        if self.load_balancer is not None:
+            PrintCode(self.sim, "pairs_runtime->getDomainPartitioner()->initWorkloadBalancer"
+                      f"({LoadBalancingAlgorithms.c_keyword(self.load_balancer)}, {self.regrid_min}, {self.regrid_max});")
+
+            # Call_Void(self.sim, "pairs_runtime->getDomainPartitioner()->initWorkloadBalancer", 
+            #           [self.load_balancer, self.regrid_min, self.regrid_max])
+
+    def update(self):
+        Call_Void(self.sim, "pairs_runtime->updateDomain", [])
+        Assign(self.sim, self.rank, Call_Int(self.sim, "pairs_runtime->getDomainPartitioner()->getRank", []))
+        Assign(self.sim, self.nranks, Call_Int(self.sim, "pairs_runtime->getNumberOfNeighborRanks", []))
+
+        for _ in Filter(self.sim, ScalarOp.neq(self.nranks, 0)):
+            Assign(self.sim, self.ntotal_aabbs, Call_Int(self.sim, "pairs_runtime->getNumberOfNeighborAABBs", []))
+
+            for _ in Filter(self.sim, self.nranks_capacity < self.nranks):
+                Assign(self.sim, self.nranks_capacity, self.nranks + 10)
+                self.ranks.realloc()
+                self.naabbs.realloc()
+                self.aabb_offsets.realloc()
+
+            for _ in Filter(self.sim, self.aabb_capacity < self.ntotal_aabbs):
+                Assign(self.sim, self.aabb_capacity, self.ntotal_aabbs + 20)
+                self.aabbs.realloc()
+            
+            CopyArray(self.sim, self.ranks, Contexts.Host, Actions.WriteOnly, self.nranks)
+            CopyArray(self.sim, self.naabbs, Contexts.Host, Actions.WriteOnly, self.nranks)
+            CopyArray(self.sim, self.aabb_offsets, Contexts.Host, Actions.WriteOnly, self.nranks)
+            CopyArray(self.sim, self.aabbs, Contexts.Host, Actions.WriteOnly, self.ntotal_aabbs * 6)
+            CopyArray(self.sim, self.subdom, Contexts.Host, Actions.WriteOnly)
+
+            Call_Void(self.sim, "pairs_runtime->copyRuntimeArray", ['ranks', self.ranks, self.nranks])
+            Call_Void(self.sim, "pairs_runtime->copyRuntimeArray", ['naabbs', self.naabbs, self.nranks])
+            Call_Void(self.sim, "pairs_runtime->copyRuntimeArray", ['aabb_offsets', self.aabb_offsets, self.nranks])
+            Call_Void(self.sim, "pairs_runtime->copyRuntimeArray", ['aabbs', self.aabbs, self.ntotal_aabbs * 6])
+            Call_Void(self.sim, "pairs_runtime->copyRuntimeArray", ['subdom', self.subdom, self.sim.ndims() * 2])
+        
+        if isinstance(self.sim.grid, MutableGrid):
+            for d in range(self.sim.dims):
+                Assign(self.sim, self.sim.grid.min(d), Call(self.sim, "pairs_runtime->getDomainPartitioner()->getMin", [d], Types.Real))
+                Assign(self.sim, self.sim.grid.max(d), Call(self.sim, "pairs_runtime->getDomainPartitioner()->getMax", [d], Types.Real))
+
+    def ghost_particles(self, step, position, offset=0.0):
+        ''' TODO :  If we have pbc, a single particle can be a ghost particle multiple times (at different locations) for the same neighbor block,
+                    so this function should have the capability to yield more than one particle for every neighbor.
+                    But currently it doesn't have that capability, so we need at least 2 blocks in every dimension that has pbc.
+                    (e.g. a particle in a 1x1x1 block config with pbc <true, true, true> can be a ghost at 7 other locations)
+        '''
+        # Particles with one of the following flags are ignored
+        flags_to_exclude = (Flags.Infinite | Flags.Global)
+
+        for r in self.step_indexes(0):     # for every neighbor rank
+            for i in For(self.sim, 0, self.sim.nlocal):     # for every local particle in this rank
+                particle_flags = self.sim.particle_flags
+
+                for _ in Filter(self.sim, ScalarOp.cmp(particle_flags[i] & flags_to_exclude, 0)):
+                    for aabb_id in For(self.sim, self.aabb_offsets[r], self.aabb_offsets[r] + self.naabbs[r]): # for every aabb of this neighbor
+                        for _ in Filter(self.sim, ScalarOp.neq(self.ranks[r] , self.rank)):     # if my neighbor is not my own rank
+                            full_cond = None
+                            pbc_shifts = []
+
+                            for d in range(self.sim.ndims()):
+                                aabb_min = self.aabbs[aabb_id][d * 2 + 0]
+                                aabb_max = self.aabbs[aabb_id][d * 2 + 1]
+                                d_pbc = 0
+                                d_length = self.sim.grid.length(d)
+
+                                if self.sim._pbc[d]:
+                                    center = aabb_min + (aabb_max - aabb_min) * 0.5     # center of neighbor block
+                                    dist = position[i][d] - center                      # distance of our particle from center of neighbor
+                                    cond_pbc_neg = dist >  (d_length * 0.5)
+                                    cond_pbc_pos = dist < -(d_length * 0.5)
+
+                                    d_pbc = Select(self.sim, cond_pbc_neg, -1, Select(self.sim, cond_pbc_pos, 1, 0))
+
+                                adj_pos = position[i][d] + d_pbc * d_length 
+                                d_cond = ScalarOp.and_op(adj_pos > aabb_min - offset, adj_pos < aabb_max + offset)
+                                full_cond = d_cond if full_cond is None else ScalarOp.and_op(full_cond, d_cond)
+                                pbc_shifts.append(d_pbc)
+
+                            for _ in Filter(self.sim, full_cond):
+                                yield i, r, self.ranks[r], pbc_shifts
+
+                        for _ in Filter(self.sim, ScalarOp.cmp(self.ranks[r] , self.rank)):     # if my neighbor is me (because I'm the only rank in a dimension that has pbc)
+                            pbc_shifts = []
+                            isghost = Lit(self.sim, 0)
+
+                            for d in range(self.sim.ndims()):
+                                aabb_min = self.aabbs[aabb_id][d * 2 + 0]
+                                aabb_max = self.aabbs[aabb_id][d * 2 + 1]
+                                center = aabb_min + (aabb_max - aabb_min) * 0.5     # center of neighbor block
+                                dist = position[i][d] - center                      # distance of our particle from center of neighbor
+                                d_pbc = 0
+                                d_length = self.sim.grid.length(d)
+
+                                if self.sim._pbc[d]:
+                                    cond_pbc_neg = dist >  (d_length*0.5 - offset)
+                                    cond_pbc_pos = dist < -(d_length*0.5 - offset)
+                                    d_pbc = Select(self.sim, cond_pbc_neg, -1, Select(self.sim, cond_pbc_pos, 1, 0))
+                                    isghost = ScalarOp.or_op(isghost, d_pbc)
+
+                                pbc_shifts.append(d_pbc)
+                            
+                            for _ in Filter(self.sim, isghost):
+                                yield i, r, self.ranks[r], pbc_shifts
diff --git a/src/pairs/sim/instrumentation.py b/src/pairs/sim/instrumentation.py
index dedc7c18e940796e035aa97ccd3ff527e370438f..7281f13fc9f848038c4df34248d65db7e8e13c8c 100644
--- a/src/pairs/sim/instrumentation.py
+++ b/src/pairs/sim/instrumentation.py
@@ -13,7 +13,7 @@ class RegisterTimers(FinalLowerable):
             Call_Void(self.sim, "pairs::register_timer", [t, Timers.name(t)])
 
         for m in self.sim.module_list:
-            if m.name != 'main':
+            if m.name != 'main' and m.name != 'initialize':
                 Call_Void(self.sim, "pairs::register_timer", [m.module_id + Timers.Offset, m.name])
 
 
@@ -25,5 +25,5 @@ class RegisterMarkers(FinalLowerable):
     def lower(self):
         if self.sim._enable_profiler:
             for m in self.sim.module_list:
-                if m.name != 'main' and m.must_profile():
+                if m.name != 'main' and m.name != 'initialize' and m.must_profile():
                     Call_Void(self.sim, "LIKWID_MARKER_REGISTER", [m.name])
diff --git a/src/pairs/sim/load_balancing_algorithms.py b/src/pairs/sim/load_balancing_algorithms.py
new file mode 100644
index 0000000000000000000000000000000000000000..165d151cf4c936ef2184ab9dabbb98b0e634df2f
--- /dev/null
+++ b/src/pairs/sim/load_balancing_algorithms.py
@@ -0,0 +1,22 @@
+class LoadBalancingAlgorithms:
+    """Integer identifiers for the selectable load-balancing algorithms."""
+
+    # NOTE(review): Diffusive/Metis are declared out of numeric order; values kept as-is — confirm against the runtime.
+    Morton = 0
+    Hilbert = 1
+    Diffusive = 3
+    Metis = 2
+
+    @staticmethod
+    def c_keyword(algorithm):
+        """Return the keyword string for `algorithm`.
+
+        Declared static so the lookup works identically through the class
+        and through an instance. Values matching none of the identifiers
+        above yield "Invalid".
+        """
+        return "Hilbert"        if algorithm == LoadBalancingAlgorithms.Hilbert else \
+               "Morton"         if algorithm == LoadBalancingAlgorithms.Morton else \
+               "Diffusive"      if algorithm == LoadBalancingAlgorithms.Diffusive else \
+               "Metis"          if algorithm == LoadBalancingAlgorithms.Metis else \
+               "Invalid"
diff --git a/src/pairs/sim/neighbor_lists.py b/src/pairs/sim/neighbor_lists.py
index 5662522b2c2d178319b6aea8f0ad12d92d0960f9..bc50e7e796355f3d7163a8b8b9080e83a58810f1 100644
--- a/src/pairs/sim/neighbor_lists.py
+++ b/src/pairs/sim/neighbor_lists.py
@@ -4,7 +4,7 @@ from pairs.ir.branches import Branch, Filter
 from pairs.ir.layouts import Layouts
 from pairs.ir.loops import ParticleFor
 from pairs.ir.types import Types
-from pairs.ir.utils import Print
+from pairs.ir.print import Print
 from pairs.sim.interaction import ParticleInteraction
 from pairs.sim.lowerable import Lowerable
 
diff --git a/src/pairs/sim/properties.py b/src/pairs/sim/properties.py
index 775fe19a69a39c0c8278a3aa4a77228b3073e897..85eb027763f203a9fdfea9b0d31db826944b46c4 100644
--- a/src/pairs/sim/properties.py
+++ b/src/pairs/sim/properties.py
@@ -4,7 +4,7 @@ from pairs.ir.loops import ParticleFor
 from pairs.ir.memory import Malloc, Realloc
 from pairs.ir.properties import RegisterProperty, RegisterContactProperty
 from pairs.ir.types import Types
-from pairs.ir.utils import Print
+from pairs.ir.print import Print
 from pairs.sim.lowerable import Lowerable, FinalLowerable
 from functools import reduce
 import operator
diff --git a/src/pairs/sim/simulation.py b/src/pairs/sim/simulation.py
index 291f085666fdb6cdac4e4f3d3b3a28969ab76c78..f7360b4e4e8d28c6f4a5407f5af9162188c0a890 100644
--- a/src/pairs/sim/simulation.py
+++ b/src/pairs/sim/simulation.py
@@ -4,7 +4,7 @@ from pairs.ir.branches import Filter
 from pairs.ir.features import Features, FeatureProperties
 from pairs.ir.kernel import Kernel
 from pairs.ir.layouts import Layouts
-from pairs.ir.module import Module
+from pairs.ir.module import Module, ModuleCall
 from pairs.ir.properties import Properties, ContactProperties
 from pairs.ir.symbols import Symbol
 from pairs.ir.types import Types
@@ -13,15 +13,16 @@ from pairs.ir.variables import Variables
 from pairs.mapping.funcs import compute, setup
 from pairs.sim.arrays import DeclareArrays
 from pairs.sim.cell_lists import CellLists, BuildCellLists, BuildCellListsStencil, PartitionCellLists, BuildCellNeighborLists
-from pairs.sim.comm import Comm
+from pairs.sim.comm import Comm, Synchronize, Borders, Exchange, ReverseComm
 from pairs.sim.contact_history import ContactHistory, BuildContactHistory, ClearUnusedContactHistory, ResetContactHistoryUsageStatus
 from pairs.sim.copper_fcc_lattice import CopperFCCLattice
 from pairs.sim.dem_sc_grid import DEMSCGrid
-from pairs.sim.domain import InitializeDomain
+from pairs.sim.domain import InitializeDomain, UpdateDomain
 from pairs.sim.domain_partitioners import DomainPartitioners
-from pairs.sim.domain_partitioning import DimensionRanges
+from pairs.sim.domain_partitioning import BlockForest, DimensionRanges
+from pairs.sim.load_balancing_algorithms import LoadBalancingAlgorithms
 from pairs.sim.features import AllocateFeatureProperties
-from pairs.sim.grid import Grid2D, Grid3D
+from pairs.sim.grid import Grid2D, Grid3D, MutableGrid
 from pairs.sim.instrumentation import RegisterMarkers, RegisterTimers
 from pairs.sim.lattice import ParticleLattice
 from pairs.sim.neighbor_lists import NeighborLists, BuildNeighborLists
@@ -32,9 +33,12 @@ from pairs.sim.timestep import Timestep
 from pairs.sim.variables import DeclareVariables 
 from pairs.sim.vtk import VTKWrite
 from pairs.transformations import Transformations
+from pairs.code_gen.interface import InterfaceModules
 
 
 class Simulation:
+    """P4IRS Simulation class: the central object of a kernel simulation, holding all the
+       fundamental data structures needed to generate P4IRS simulation code."""
     def __init__(
         self,
         code_gen,
@@ -44,10 +48,15 @@ class Simulation:
         double_prec=False,
         use_contact_history=False,
         particle_capacity=800000,
-        neighbor_capacity=100):
+        neighbor_capacity=100,
+        generate_whole_program=False):
 
+        # Code generator for the simulation
         self.code_gen = code_gen
         self.code_gen.assign_simulation(self)
+        self._generate_whole_program = generate_whole_program
+
+        # Data structures to be generated
         self.position_prop = None
         self.properties = Properties(self)
         self.vars = Variables(self)
@@ -55,60 +64,104 @@ class Simulation:
         self.features = Features(self)
         self.feature_properties = FeatureProperties(self)
         self.contact_properties = ContactProperties(self)
-        self.particle_capacity = self.add_var('particle_capacity', Types.Int32, particle_capacity)
+
+        # General capacities, sizes and particle properties
+        self.sim_timestep = self.add_var('sim_timestep', Types.Int32, runtime=True)
+        self.particle_capacity = \
+            self.add_var('particle_capacity', Types.Int32, particle_capacity, runtime=True)
         self.neighbor_capacity = self.add_var('neighbor_capacity', Types.Int32, neighbor_capacity)
-        self.nlocal = self.add_var('nlocal', Types.Int32)
-        self.nghost = self.add_var('nghost', Types.Int32)
+        self.nlocal = self.add_var('nlocal', Types.Int32, runtime=True)
+        self.nghost = self.add_var('nghost', Types.Int32, runtime=True)
         self.resizes = self.add_array('resizes', 3, Types.Int32, arr_sync=False)
-        self.particle_uid = self.add_property('uid', Types.Int32, 0)
+        self.particle_uid = self.add_property('uid', Types.UInt64, 0)
         self.particle_shape = self.add_property('shape', Types.Int32, 0)
         self.particle_flags = self.add_property('flags', Types.Int32, 0)
+
+        # Grid for the simulation
         self.grid = None
+
+        # Acceleration structures
         self.cell_lists = None
         self._store_neighbors_per_cell = False
         self.neighbor_lists = None
+        self.update_cells_procedures = Block(self, [])
+
+        # Context information used to partially build the program AST
         self.scope = []
         self.nested_count = 0
         self.nest = False
         self._capture_statements = True
         self._block = Block(self, [])
-        self.setups = Block(self, [])
+
+        # Different segments of particle code/functions
+        self.create_domain = Block(self, [])
+        self.create_domain_at_initialization = False
+
+        self.setup_particles = Block(self, [])
+        self.module_list = []
+        self.kernel_list = []
+
+        # Individual user-defined and interface modules are created only when generate_whole_program is False
+        self.udf_module_list = []
+        self.interface_module_list = []
+
+        # User-defined functions to be called by other subroutines (used only when generate_whole_program is True)
         self.setup_functions = []
         self.pre_step_functions = []
         self.functions = []
-        self.module_list = []
-        self.kernel_list = []
+
+        # Structures to generate resize code for capacities
         self._check_properties_resize = False
         self._resizes_to_check = {}
-        self._module_name = None
-        self._double_prec = double_prec
-        self.dims = dims
-        self.ntimesteps = timesteps
-        self.expr_id = 0
-        self.iter_id = 0
-        self.reneighbor_frequency = 1
+
+        # VTK data
         self.vtk_file = None
         self.vtk_frequency = 0
+
+        # Domain partitioning
         self._dom_part = None
         self._partitioner = None
-        self._target = None
-        self._pbc = [True for _ in range(dims)]
+        self._comm = None
+
+        # Contact history
         self._use_contact_history = use_contact_history
         self._contact_history = ContactHistory(self) if use_contact_history else None
-        self._shapes = shapes
-        self._compute_half = False
-        self._apply_list = None
-        self._enable_profiler = False
-        self._compute_thermo = 0
+
+
+        self._module_name = None                # Current module name
+        self._double_prec = double_prec         # Use double-precision FP arithmetic
+        self.dims = dims                        # Number of dimensions
+        self.ntimesteps = timesteps             # Number of time-steps
+        self.reneighbor_frequency = 1           # Re-neighbor frequency
+        self.rebalance_frequency = 0            # Re-balance frequency for dynamic load balancing
+        self._target = None                     # Hardware target info
+        self._pbc = [True for _ in range(dims)] # PBC flags for each dimension
+        self._shapes = shapes                   # List of shapes used in the simulation
+        self._compute_half = False              # Compute half of interactions (Newton's third law)
+        self._apply_list = None                 # Context elements when using apply() directive
+        self._enable_profiler = False           # Enable/disable profiler
+        self._compute_thermo = 0                # Compute thermo information
 
     def set_domain_partitioner(self, partitioner):
+        """Select the domain partitioner to use and create its object for this simulation instance"""
         self._partitioner = partitioner
 
         if partitioner in (DomainPartitioners.Regular, DomainPartitioners.RegularXY):
             self._dom_part = DimensionRanges(self)
 
+        elif partitioner == DomainPartitioners.BlockForest:
+            self._dom_part = BlockForest(self)
+
         else:
             raise Exception("Invalid domain partitioner.")
+        
+    def set_workload_balancer(self, algorithm=LoadBalancingAlgorithms.Morton, 
+                              regrid_min=100, regrid_max=1000, rebalance_frequency=0):
+        assert self._partitioner == DomainPartitioners.BlockForest, "Load balancing is only supported by BlockForest."
+        self.rebalance_frequency = rebalance_frequency
+        self._dom_part.load_balancer = algorithm
+        self._dom_part.regrid_min = regrid_min
+        self._dom_part.regrid_max = regrid_max
 
     def partitioner(self):
         return self._partitioner
@@ -128,12 +181,33 @@ class Simulation:
     def max_shapes(self):
         return len(self._shapes)
 
+    def add_udf_module(self, module):
+        assert isinstance(module, Module), "add_udf_module(): Given parameter is not of type Module!"
+        assert module.user_defined and not module.interface
+        if module.name not in [m.name for m in self.udf_module_list]:
+            self.udf_module_list.append(module)
+
+    def add_interface_module(self, module):
+        assert isinstance(module, Module), "add_interface_module(): Given parameter is not of type Module!"
+        assert module.interface and not module.user_defined
+        if module.name not in [m.name for m in self.interface_module_list]:
+            self.interface_module_list.append(module)
+
     def add_module(self, module):
         assert isinstance(module, Module), "add_module(): Given parameter is not of type Module!"
+        assert not module.interface and not module.user_defined
         if module.name not in [m.name for m in self.module_list]:
             self.module_list.append(module)
 
+    def interface_modules(self):
+        return self.interface_module_list
+    
+    def udf_modules(self):
+        return self.udf_module_list
+    
     def modules(self):
+        """List simulation modules, with main always in the last position"""
+
         sorted_mods = []
         main_mod = None
         for m in self.module_list:
@@ -142,7 +216,10 @@ class Simulation:
             else:
                 main_mod = m
 
-        return sorted_mods + [main_mod]
+        if main_mod is not None:
+            sorted_mods += [main_mod]
+
+        return sorted_mods
 
     def add_kernel(self, kernel):
         assert isinstance(kernel, Kernel), "add_kernel(): Given parameter is not of type Kernel!"
@@ -163,9 +240,9 @@ class Simulation:
         assert len(pbc_config) == self.dims, "PBC must be specified for each dimension."
         self._pbc = pbc_config
 
-    def add_property(self, prop_name, prop_type, value=0.0, volatile=False):
+    def add_property(self, prop_name, prop_type, value=0.0, volatile=False, reduce=False):
         assert self.property(prop_name) is None, f"Property already defined: {prop_name}"
-        return self.properties.add(prop_name, prop_type, value, volatile)
+        return self.properties.add(prop_name, prop_type, value, volatile, p_reduce=reduce)
 
     def add_position(self, prop_name, value=[0.0, 0.0, 0.0], volatile=False, layout=Layouts.AoS):
         assert self.property(prop_name) is None, f"Property already defined: {prop_name}"
@@ -176,10 +253,18 @@ class Simulation:
         assert self.feature(feature_name) is None, f"Feature already defined: {feature_name}"
         return self.features.add(feature_name, nkinds)
 
-    def add_feature_property(self, feature_name, prop_name, prop_type, prop_data):
+    def add_feature_property(self, feature_name, prop_name, prop_type, prop_data=None):
         feature = self.feature(feature_name)
         assert feature is not None, f"Feature not found: {feature_name}"
         assert self.property(prop_name) is None, f"Property already defined: {prop_name}"
+
+        array_size = feature.nkinds()**2 * Types.number_of_elements(self, prop_type)
+
+        if not prop_data:
+            prop_data = [0 for i in range(array_size)]
+        else:
+            assert len(prop_data) == array_size, f"Incorrect array size for {prop_name}: Expected array size = {array_size}"
+
         return self.feature_properties.add(feature, prop_name, prop_type, prop_data)
 
     def add_contact_property(self, prop_name, prop_type, prop_default, layout=Layouts.AoS):
@@ -212,9 +297,9 @@ class Simulation:
     def array(self, arr_name):
         return self.arrays.find(arr_name)
 
-    def add_var(self, var_name, var_type, init_value=0):
+    def add_var(self, var_name, var_type, init_value=0, runtime=False):
         assert self.var(var_name) is None, f"Variable already defined: {var_name}"
-        return self.vars.add(var_name, var_type, init_value)
+        return self.vars.add(var_name, var_type, init_value, runtime)
 
     def add_temp_var(self, init_value):
         return self.vars.add_temp(init_value)
@@ -226,33 +311,46 @@ class Simulation:
         return self.vars.find(var_name)
 
     def set_domain(self, grid):
+        """Set domain bounds. 
+        If the domain is set through this function, the 'set_domain' module won't be generated in the modular version.
+        Use this function only if you do not need to set domain at runtime.
+        This function is required only for whole-program generation."""
+        self.create_domain_at_initialization = True
         self.grid = Grid3D(self, grid[0], grid[1], grid[2], grid[3], grid[4], grid[5])
-        self.setups.add_statement(InitializeDomain(self))
+        self.create_domain.add_statement(InitializeDomain(self))
 
     def reneighbor_every(self, frequency):
         self.reneighbor_frequency = frequency
 
     def create_particle_lattice(self, grid, spacing, props={}):
-        self.setups.add_statement(ParticleLattice(self, grid, spacing, props, self.position()))
+        self.setup_particles.add_statement(ParticleLattice(self, grid, spacing, props, self.position()))
 
     def read_particle_data(self, filename, prop_names, shape_id):
+        """Generate statement to read particle data from file"""
         props = [self.property(prop_name) for prop_name in prop_names]
-        self.setups.add_statement(ReadParticleData(self, filename, props, shape_id))
+        self.setup_particles.add_statement(ReadParticleData(self, filename, props, shape_id))
 
     def copper_fcc_lattice(self, nx, ny, nz, rho, temperature, ntypes):
-        self.setups.add_statement(CopperFCCLattice(self, nx, ny, nz, rho, temperature, ntypes))
+        """Specific initialization for MD Copper FCC lattice case"""
+        self.setup_particles.add_statement(CopperFCCLattice(self, nx, ny, nz, rho, temperature, ntypes))
 
     def dem_sc_grid(self, xmax, ymax, zmax, spacing, diameter, min_diameter, max_diameter, initial_velocity, particle_density, ntypes):
-        self.setups.add_statement(
+        """Specific initialization for DEM grid"""
+        self.setup_particles.add_statement(
             DEMSCGrid(self, xmax, ymax, zmax, spacing, diameter, min_diameter, max_diameter,
                       initial_velocity, particle_density, ntypes))
 
-    def build_cell_lists(self, spacing, store_neighbors_per_cell=False):
+    def build_cell_lists(self, spacing=None, store_neighbors_per_cell=False):
+        """Add routines to build the linked-cells acceleration structure.
+        Leave spacing as None so it can be set at runtime."""
         self._store_neighbors_per_cell = store_neighbors_per_cell
         self.cell_lists = CellLists(self, self._dom_part, spacing, spacing)
         return self.cell_lists
 
-    def build_neighbor_lists(self, spacing):
+    def build_neighbor_lists(self, spacing=None):
+        """Add routines to build the Verlet Lists acceleration structure.
+        Leave spacing as None so it can be set at runtime."""
+
         assert self._store_neighbors_per_cell is False, \
             "Using neighbor-lists with store_neighbors_per_cell option is invalid."
 
@@ -260,13 +358,14 @@ class Simulation:
         self.neighbor_lists = NeighborLists(self, self.cell_lists)
         return self.neighbor_lists
 
-    def compute(self, func, cutoff_radius=None, symbols={}, pre_step=False, skip_first=False):
-        return compute(self, func, cutoff_radius, symbols, pre_step, skip_first)
+    def compute(self, func, cutoff_radius=None, symbols={}, parameters={}, pre_step=False, skip_first=False):
+        return compute(self, func, cutoff_radius, symbols, parameters, pre_step, skip_first)
 
     def setup(self, func, symbols={}):
         return setup(self, func, symbols)
 
     def init_block(self):
+        """Initialize new block in this simulation instance"""
         self._block = Block(self, [])
         self._check_properties_resize = False
         self._resizes_to_check = {}
@@ -276,24 +375,30 @@ class Simulation:
         self._module_name = name
 
     def check_properties_resize(self):
+        """Enable checking properties for resizing"""
         self._check_properties_resize = True
 
     def check_resize(self, capacity, size):
+        """Determine that capacity must always be checked with respect to size in a block/module"""
+
         if capacity not in self._resizes_to_check:
             self._resizes_to_check[capacity] = size
         else:
             raise Exception("Two sizes assigned to same capacity!")
 
     def build_setup_module_with_statements(self):
+        """Build a Module in the setup part of the program using the last initialized block"""
+
         self.setup_functions.append(
             Module(self,
                 name=self._module_name,
                 block=Block(self, self._block),
                 resizes_to_check=self._resizes_to_check,
                 check_properties_resize=self._check_properties_resize,
-                run_on_device=False))
+                run_on_device=True))
 
     def build_pre_step_module_with_statements(self, run_on_device=True, skip_first=False, profile=False):
+        """Build a Module in the pre-step part of the program using the last initialized block"""
         module = Module(self, name=self._module_name,
                               block=Block(self, self._block),
                               resizes_to_check=self._resizes_to_check,
@@ -310,6 +415,7 @@ class Simulation:
             self.pre_step_functions.append(module)
 
     def build_module_with_statements(self, run_on_device=True, skip_first=False, profile=False):
+        """Build a Module in the compute part of the program using the last initialized block"""
         module = Module(self, name=self._module_name,
                               block=Block(self, self._block),
                               resizes_to_check=self._resizes_to_check,
@@ -324,10 +430,22 @@ class Simulation:
         else:
             self.functions.append(module)
 
+    def build_user_defined_function(self, run_on_device=True):
+        """Build a user-defined Module that will be callable separately as part of the interface"""
+        Module(self, name=self._module_name,
+                block=Block(self, self._block),
+                resizes_to_check=self._resizes_to_check,
+                check_properties_resize=self._check_properties_resize,
+                run_on_device=run_on_device,
+                user_defined=True)
+        # NOTE(review): the Module is not stored here; presumably it registers itself with the simulation - confirm
+
     def capture_statements(self, capture=True):
+        """When toggled, all constructed statements are captured and automatically added to the last initialized block"""
         self._capture_statements = capture
 
     def add_statement(self, stmt):
+        """Add captured statements to the last block when _capture_statements is toggled"""
         if self._capture_statements:
             if not self.scope:
                 self._block.add_statement(stmt)
@@ -337,6 +455,7 @@ class Simulation:
         return stmt
 
     def nest_mode(self):
+        """When explicitly constructing loops in P4IRS, make them nested"""
         self.nested_count = 0
         self.nest = True
         yield
@@ -345,9 +464,11 @@ class Simulation:
             self.scope.pop()
 
     def enter(self, scope):
+        """Enter a new scope, used for tracking scopes when building P4IRS AST elements"""
         self.scope.append(scope)
 
     def leave(self):
+        """Leave last scope, used for tracking scopes when building P4IRS AST elements"""
         if not self.nest:
             self.scope.pop()
         else:
@@ -379,26 +500,86 @@ class Simulation:
     def compute_thermo(self, every=0):
         self._compute_thermo = every
 
+    def create_update_cells_block(self):
+        """Collect the cell-list (and optional neighbor-list) build routines into update_cells_procedures"""
+        subroutines = [
+            BuildCellLists(self, self.cell_lists),
+            PartitionCellLists(self, self.cell_lists)
+        ]
+        # Add routine to build neighbor-lists per cell
+        if self._store_neighbors_per_cell:
+            subroutines.append(BuildCellNeighborLists(self, self.cell_lists))
+
+        # Add routine to build neighbor-lists per particle (standard Verlet Lists)
+        if self.neighbor_lists is not None:
+            subroutines.append(BuildNeighborLists(self, self.neighbor_lists))
+
+        self.update_cells_procedures.add_statement(subroutines)
+
     def generate(self):
+        """Generate the code for the simulation"""
         assert self._target is not None, "Target not specified!"
-        comm = Comm(self, self._dom_part)
+
+        # Initialize communication instance with the specified domain-partitioner
+        self._comm = Comm(self, self._dom_part)
+        self.create_update_cells_block()
+
+        if self._generate_whole_program:
+            self.generate_program()
+        else:
+            self.generate_library()
+
+    def generate_library(self):
+        """Generate the library variant: interface modules, wrapped user-defined functions, then code and getters"""
+        InterfaceModules(self).create_all()
+        # User defined functions are wrapped inside separate interface modules here.
+        # The udf's have the same name as their interface module but they get implemented in the pairs::internal scope.
+        for m in self.udf_module_list:
+            module = Module(self, name=m.name, block=Block(self, m), interface=True)
+            module._id = m._id
+
+        Transformations(self.interface_modules(), self._target).apply_all()
+
+        # Generate library
+        self.code_gen.generate_library()
+
+        # Generate getters for the runtime functions
+        self.code_gen.generate_interfaces()
+
+    def generate_program(self):
+        assert self.grid, "No domain is created. Set domain bounds with 'set_domain'."
+
+        reverse_comm_module = ReverseComm(self._comm, reduce=True)
+
+        # Params that determine when a method must be called only when reneighboring
         every_reneighbor_params = {'every': self.reneighbor_frequency}
 
-        timestep_procedures = self.pre_step_functions + [
-            (comm.exchange(), every_reneighbor_params),
-            (comm.borders(), comm.synchronize(), every_reneighbor_params),
-            (BuildCellLists(self, self.cell_lists), every_reneighbor_params),
-            (PartitionCellLists(self, self.cell_lists), every_reneighbor_params)
-        ]
+        timestep_procedures = []
 
-        if self._store_neighbors_per_cell:
-            timestep_procedures.append(
-                (BuildCellNeighborLists(self, self.cell_lists), every_reneighbor_params))
+        # First steps executed during each time-step in the simulation
+        timestep_procedures += self.pre_step_functions 
 
-        if self.neighbor_lists is not None:
-            timestep_procedures.append(
-                (BuildNeighborLists(self, self.neighbor_lists), every_reneighbor_params))
+        # Rebalancing routines
+        if self.rebalance_frequency:
+            update_domain_procedures = Block.from_list(self, [
+                Exchange(self._comm),
+                UpdateDomain(self),
+                Borders(self._comm),
+                ResetVolatileProperties(self),
+                BuildCellListsStencil(self, self.cell_lists),
+                self.update_cells_procedures
+                ])
+
+            timestep_procedures.append((update_domain_procedures, {'every': self.rebalance_frequency}))
 
+        # Communication routines
+        timestep_procedures += [(Exchange(self._comm), every_reneighbor_params),
+                                (Borders(self._comm), Synchronize(self._comm), every_reneighbor_params)]
+
+        # Update acceleration data structures
+        timestep_procedures += [(self.update_cells_procedures, every_reneighbor_params)]
+
+        # Add routines for contact history management
         if self._use_contact_history:
             if self.neighbor_lists is not None:
                 timestep_procedures.append(
@@ -407,46 +588,66 @@ class Simulation:
 
             timestep_procedures.append(ResetContactHistoryUsageStatus(self, self._contact_history))
 
-        timestep_procedures += [ResetVolatileProperties(self)] + self.functions
+        # Reset volatile properties
+        timestep_procedures += [ResetVolatileProperties(self)]
+
+        # Add computational kernels
+        timestep_procedures += self.functions
 
+        # For whole-program-generation, add reverse_comm wherever needed in the timestep loop (eg: after computational kernels) like this:
+        timestep_procedures += [reverse_comm_module]
+
+        # Clear unused contact history
         if self._use_contact_history:
             timestep_procedures.append(ClearUnusedContactHistory(self, self._contact_history))
 
+        # Add routine to calculate thermal data
         if self._compute_thermo != 0:
             timestep_procedures.append(
                 (ComputeThermo(self), {'every': self._compute_thermo}))
 
+
+        # Data structures and timer/markers initialization
+        inits = Block.from_list(self, [
+            DeclareVariables(self),
+            DeclareArrays(self),
+            AllocateProperties(self),
+            AllocateContactProperties(self),
+            AllocateFeatureProperties(self),
+            RegisterTimers(self),
+            RegisterMarkers(self)
+        ])
+
+        # Construct the time-step loop
         timestep = Timestep(self, self.ntimesteps, timestep_procedures)
         self.enter(timestep.block)
 
+        # Add routine to write VTK data when set
         if self.vtk_file is not None:
             timestep.add(VTKWrite(self, self.vtk_file, timestep.timestep(), self.vtk_frequency))
 
         self.leave()
 
+        # Combine everything into a whole program
+        # Initialization and setup functions, together with time-step loop
+        # UpdateDomain is added after setup_particles because particles must be already present in the simulation
         body = Block.from_list(self, [
-            self.setups,
+            self.create_domain,
+            self.setup_particles,
+            UpdateDomain(self),        
             self.setup_functions,
             BuildCellListsStencil(self, self.cell_lists),
             timestep.as_block()
         ])
 
-        inits = Block.from_list(self, [
-            DeclareVariables(self),
-            DeclareArrays(self),
-            AllocateProperties(self),
-            AllocateContactProperties(self),
-            AllocateFeatureProperties(self),
-            RegisterTimers(self),
-            RegisterMarkers(self)
-        ])
-
         program = Module(self, name='main', block=Block.merge_blocks(inits, body))
 
         # Apply transformations
         transformations = Transformations(program, self._target)
         transformations.apply_all()
 
-        # Generate program
-        #ASTGraph(self.functions, "functions.dot").render()
+        # Generate whole program
         self.code_gen.generate_program(program)
+
+        # Generate getters for the runtime functions
+        self.code_gen.generate_interfaces()
diff --git a/src/pairs/sim/timestep.py b/src/pairs/sim/timestep.py
index 1281a4d60e4a75272f34b3087cf3ad5c4e772661..abef09a055507f431af554bf7163603e35f8e3cc 100644
--- a/src/pairs/sim/timestep.py
+++ b/src/pairs/sim/timestep.py
@@ -10,7 +10,7 @@ class Timestep:
     def __init__(self, sim, nsteps, item_list=None):
         self.sim = sim
         self.block = Block(sim, [])
-        self.timestep_loop = For(sim, 0, nsteps + 1, self.block)
+        self.timestep_loop = For(sim, 0, nsteps + 1, self.block) if self.sim._generate_whole_program else None
 
         if item_list is not None:
             for item in item_list:
@@ -31,13 +31,13 @@ class Timestep:
                     self.add(item)
 
     def timestep(self):
-        return self.timestep_loop.iter()
+        return self.timestep_loop.iter() if self.sim._generate_whole_program else self.sim.sim_timestep
 
     def add(self, item, exec_every=0, item_else=None, skip_first=False):
         assert exec_every >= 0, "exec_every parameter must be higher or equal than zero!"
         stmts = item if not isinstance(item, Block) else item.statements()
         stmts_else = None
-        ts = self.timestep_loop.iter()
+        ts = self.timestep() 
         self.sim.enter(self.block)
 
         if item_else is not None:
@@ -65,7 +65,7 @@ class Timestep:
         self.sim.capture_statements(False)
 
         block = Block(self.sim, [Call_Void(self.sim, "pairs::start_timer", [Timers.All]),
-                                 self.timestep_loop,
+                                 self.timestep_loop if self.sim._generate_whole_program else self.block,
                                  Call_Void(self.sim, "pairs::stop_timer", [Timers.All])])
 
         self.sim.capture_statements(_capture)
diff --git a/src/pairs/transformations/__init__.py b/src/pairs/transformations/__init__.py
index 7d5cab5efcdede5a7d5dea3926014d2d9571360b..733d5c10fbaec621d37db6a7009e64493f752719 100644
--- a/src/pairs/transformations/__init__.py
+++ b/src/pairs/transformations/__init__.py
@@ -2,7 +2,7 @@ import time
 from pairs.analysis import Analysis
 from pairs.transformations.blocks import LiftDeclarations, MergeAdjacentBlocks
 from pairs.transformations.devices import AddDeviceCopies, AddDeviceKernels, AddHostReferencesToModules, AddDeviceReferencesToModules
-from pairs.transformations.expressions import ReplaceSymbols, LowerNeighborIndexes, SimplifyExpressions, PruneUnusedVectorIndexes, AddExpressionDeclarations
+from pairs.transformations.expressions import ReplaceSymbols, LowerNeighborIndexes, ConstantPropagation, SimplifyExpressions, PruneUnusedVectorIndexes, AddExpressionDeclarations
 from pairs.transformations.instrumentation import AddModulesInstrumentation
 from pairs.transformations.loops import LICM
 from pairs.transformations.lower import Lower
@@ -10,24 +10,29 @@ from pairs.transformations.modules import DereferenceWriteVariables, AddResizeLo
 
 
 class Transformations:
-    def __init__(self, ast, target):
-        self._ast = ast
+    def __init__(self, ast_list, target):
+        self._ast_list = ast_list if isinstance(ast_list, list) else [ast_list]
         self._target = target
         self._module_resizes = None
 
     def apply(self, transformation, data=None):
         print(f"Applying transformation: {type(transformation).__name__}... ", end="")
         start = time.time()
-        transformation.set_ast(self._ast)
-        if data is not None:
-            transformation.set_data(data)
 
-        self._ast = transformation.mutate()
+        new_ast_list = []
+        for ast in self._ast_list:
+            transformation.set_ast(ast)
+            if data is not None:
+                transformation.set_data(data)
+
+            new_ast_list.append(transformation.mutate())
+
+        self._ast_list = new_ast_list
         elapsed = time.time() - start
         print(f"{elapsed:.2f}s elapsed.")
 
     def analysis(self):
-        return Analysis(self._ast)
+        return Analysis(self._ast_list)
 
     def lower(self, lower_finals=False):
         nlowered = 1
@@ -42,6 +47,7 @@ class Transformations:
         self.apply(PruneUnusedVectorIndexes())
         self.apply(LowerNeighborIndexes())
         self.apply(ReplaceSymbols())
+        self.apply(ConstantPropagation())
         self.apply(SimplifyExpressions())
 
     def lift_declarations_to_owner_blocks(self):
@@ -61,12 +67,14 @@ class Transformations:
         self._module_resizes = add_resize_logic.module_resizes
         self.analysis().fetch_modules_references()
         self.apply(DereferenceWriteVariables())
+        self.analysis().infer_modules_return_types()
         self.apply(ReplaceModulesByCalls(), [self._module_resizes])
         self.apply(MergeAdjacentBlocks())
 
     def add_device_copies(self):
         if self._target.is_gpu():
             self.apply(AddDeviceCopies(), [self._module_resizes])
+            self.analysis().fetch_modules_references()
 
     def add_device_kernels(self):
         if self._target.is_gpu():
@@ -97,8 +105,13 @@ class Transformations:
         self.modularize()
         self.add_device_kernels()
         self.add_device_copies()
-        self.add_instrumentation()
         self.lower(True)
         self.add_expression_declarations()
         self.add_host_references_to_modules()
         self.add_device_references_to_modules()
+        
+        # TODO: Place stop timers before the function returns
+        # or simply don't instrument modules that have a non-void return type
+        # to avoid having to deal with returns within conditional blocks 
+        # self.add_instrumentation()
+
diff --git a/src/pairs/transformations/devices.py b/src/pairs/transformations/devices.py
index e050f9ed52c82b253970ec35c17097b0f710fc93..d33f30ef174ba06b07d34908aff49926ea982fe2 100644
--- a/src/pairs/transformations/devices.py
+++ b/src/pairs/transformations/devices.py
@@ -4,7 +4,7 @@ from pairs.ir.block import Block
 from pairs.ir.branches import Branch, Filter
 from pairs.ir.cast import Cast
 from pairs.ir.contexts import Contexts
-from pairs.ir.device import CopyArray, CopyContactProperty, CopyProperty, CopyVar, DeviceStaticRef, HostRef
+from pairs.ir.device import CopyArray, CopyContactProperty, CopyProperty, CopyFeatureProperty, CopyVar, DeviceStaticRef, HostRef
 from pairs.ir.functions import Call_Void
 from pairs.ir.kernel import Kernel, KernelLaunch
 from pairs.ir.lit import Lit
@@ -46,6 +46,9 @@ class AddDeviceCopies(Mutator):
                     for prop, action in s.module.properties().items():
                         new_stmts += [CopyProperty(s.sim, prop, copy_context, action)]
 
+                    for fp, action in s.module.feature_properties().items():
+                        new_stmts += [CopyFeatureProperty(s.sim, fp, copy_context, action)]
+
                     for contact_prop, action in s.module.contact_properties().items():
                         new_stmts += [CopyContactProperty(s.sim, contact_prop, copy_context, action)]
 
@@ -88,6 +91,7 @@ class AddDeviceKernels(Mutator):
         super().__init__(ast)
         self._module_name = None
         self._kernel_id = 0
+        self._device_module = False
 
     def create_kernel(self, sim, iterator, rmax, block):
         kernel_name = f"{self._module_name}_kernel{self._kernel_id}"
@@ -99,62 +103,28 @@ class AddDeviceKernels(Mutator):
             self._kernel_id += 1
 
         return kernel
+    
+    def mutate_For(self, ast_node):
+        """Turn a kernel-candidate loop inside a device module into a KernelLaunch; otherwise mutate the loop body"""
+        if ast_node.is_kernel_candidate() and self._device_module:
+            kernel = self.create_kernel(ast_node.sim, ast_node.iterator, ast_node.max, ast_node.block)
+            ast_node = KernelLaunch(ast_node.sim, kernel, ast_node.iterator, ast_node.min, ast_node.max)
+
+        else:
+            ast_node.block = self.mutate(ast_node.block)
+        return ast_node
 
     def mutate_Module(self, ast_node):
+        parent_runs_on_device = self._device_module
         if ast_node.run_on_device:
+            self._device_module = True
             self._module_name = ast_node.name
             self._kernel_id = 0
 
-            new_stmts = []
-            for stmt in ast_node._block.stmts:
-                if stmt is not None:
-                    if isinstance(stmt, For) and stmt.is_kernel_candidate():
-                        kernel = self.create_kernel(ast_node.sim, stmt.iterator, stmt.max, stmt.block)
-                        new_stmts.append(
-                            KernelLaunch(ast_node.sim, kernel, stmt.iterator, stmt.min, stmt.max))
-
-                    else:
-                        if isinstance(stmt, Branch):
-                            stmt = self.check_and_mutate_branch(stmt)
-
-                        new_stmts.append(stmt)
-
-            ast_node._block.stmts = new_stmts
-
         ast_node._block = self.mutate(ast_node._block)
+        self._device_module = parent_runs_on_device
         return ast_node
 
-    def check_and_mutate_branch(self, ast_node):
-        new_stmts = []
-        for stmt in ast_node.block_if.stmts:
-            if stmt is not None:
-                if isinstance(stmt, For) and stmt.is_kernel_candidate():
-                    kernel = self.create_kernel(ast_node.sim, stmt.iterator, stmt.max, stmt.block)
-                    new_stmts.append(
-                        KernelLaunch(ast_node.sim, kernel, stmt.iterator, stmt.min, stmt.max))
-
-                else:
-                    new_stmts.append(stmt)
-
-        ast_node.block_if.stmts = new_stmts
-
-        if ast_node.block_else is not None:
-            new_stmts = []
-            for stmt in ast_node.block_else.stmts:
-                if stmt is not None:
-                    if isinstance(stmt, For) and stmt.is_kernel_candidate():
-                        kernel = self.create_kernel(ast_node.sim, stmt.iterator, stmt.max, stmt.block)
-                        new_stmts.append(
-                            KernelLaunch(ast_node.sim, kernel, stmt.iterator, stmt.min, stmt.max))
-
-                    else:
-                        new_stmts.append(stmt)
-
-            ast_node.block_else.stmts = new_stmts
-
-        return ast_node
-
-
 class AddHostReferencesToModules(Mutator):
     def __init__(self, ast=None):
         super().__init__(ast)
diff --git a/src/pairs/transformations/expressions.py b/src/pairs/transformations/expressions.py
index bd85bf81c65608dd40bb5ced66a360fdad42720e..750bce6b0d8cb579c59ea542aac297c8f1dc9c76 100644
--- a/src/pairs/transformations/expressions.py
+++ b/src/pairs/transformations/expressions.py
@@ -5,6 +5,51 @@ from pairs.ir.operators import Operators
 from pairs.ir.types import Types
 
 
+class ConstantPropagation(Mutator):
+    """Fold binary scalar operations on two literals into a single literal."""
+
+    def __init__(self, ast=None):
+        super().__init__(ast)
+
+    def mutate_ScalarOp(self, ast_node):
+        """Mutate the operands, then fold `Lit <op> Lit` into a single Lit.
+
+        Comparisons fold to Lit 1/0; anything else is returned unfolded.
+        """
+        sim = ast_node.lhs.sim
+        ast_node.lhs = self.mutate(ast_node.lhs)
+        if ast_node.operator().is_unary():
+            return ast_node
+        ast_node.rhs = self.mutate(ast_node.rhs)
+        if not (isinstance(ast_node.lhs, Lit) and isinstance(ast_node.rhs, Lit)):
+            return ast_node
+
+        lhs, rhs = ast_node.lhs.value, ast_node.rhs.value
+        if ast_node.op == Operators.Add:
+            return Lit(sim, lhs + rhs)
+        if ast_node.op == Operators.Sub:
+            return Lit(sim, lhs - rhs)
+        if ast_node.op == Operators.Mul:
+            return Lit(sim, lhs * rhs)
+        if ast_node.op == Operators.Div:
+            # NOTE(review): "/" yields a float even for two int literals - confirm intended
+            return Lit(sim, lhs / rhs)
+        # Comparisons are evaluated on the raw Python values: wrapping the result
+        # in a Lit first would make the condition test the Lit object itself.
+        if ast_node.op == Operators.Gt:
+            return Lit(sim, 1 if lhs > rhs else 0)
+        if ast_node.op == Operators.Lt:
+            return Lit(sim, 1 if lhs < rhs else 0)
+        if ast_node.op == Operators.Geq:
+            return Lit(sim, 1 if lhs >= rhs else 0)
+        if ast_node.op == Operators.Leq:
+            return Lit(sim, 1 if lhs <= rhs else 0)
+        if ast_node.op == Operators.Eq:
+            return Lit(sim, 1 if lhs == rhs else 0)
+        if ast_node.op == Operators.Neq:
+            return Lit(sim, 1 if lhs != rhs else 0)
+        return ast_node
+    
 class ReplaceSymbols(Mutator):
     def __init__(self, ast=None):
         super().__init__(ast)
@@ -148,7 +193,7 @@ class AddExpressionDeclarations(Mutator):
             self.declared_exprs.append(atomic_add_id)
 
         return ast_node
-
+    
     def mutate_Block(self, ast_node):
         block_id = id(ast_node)
         self.decls[block_id] = []
diff --git a/src/pairs/transformations/instrumentation.py b/src/pairs/transformations/instrumentation.py
index 1e70bdb7ebe2753d3031cf3a1ed87dd047b69650..88b73c0d8406b97392d267ace3b1e1bbbb3ca068 100644
--- a/src/pairs/transformations/instrumentation.py
+++ b/src/pairs/transformations/instrumentation.py
@@ -12,16 +12,17 @@ class AddModulesInstrumentation(Mutator):
     def mutate_ModuleCall(self, ast_node):
         ast_node._module = self.mutate(ast_node._module)
         module = ast_node._module
-        if module.name == 'main':
+        if module.name == 'main' or module.name == 'initialize':
             return ast_node
 
-        timer_id = module.module_id + Timers.Offset
-        start_timer = Call_Void(ast_node.sim, "pairs::start_timer", [timer_id])
-        stop_timer = Call_Void(ast_node.sim, "pairs::stop_timer", [timer_id])
-
         if module.must_profile():
             start_marker = Call_Void(ast_node.sim, "LIKWID_MARKER_START", [module.name])
             stop_marker = Call_Void(ast_node.sim, "LIKWID_MARKER_STOP", [module.name])
-            return Block(ast_node.sim, [start_timer, start_marker, ast_node, stop_marker, stop_timer])
+            module._block =  Block.from_list(ast_node.sim, [start_marker, module._block, stop_marker])
+        
+        timer_id = module.module_id + Timers.Offset
+        start_timer = Call_Void(ast_node.sim, "pairs::start_timer", [timer_id])
+        stop_timer = Call_Void(ast_node.sim, "pairs::stop_timer", [timer_id])
+        module._block = Block.from_list(ast_node.sim, [start_timer, module._block, stop_timer])
 
-        return Block(ast_node.sim, [start_timer, ast_node, stop_timer])
+        return ast_node
diff --git a/src/pairs/transformations/modules.py b/src/pairs/transformations/modules.py
index 1ee2c9b15f09caa2e51dafc02cdf3405f7722e5c..4b538305087bed716f778b9095d7ff24619f5ea1 100644
--- a/src/pairs/transformations/modules.py
+++ b/src/pairs/transformations/modules.py
@@ -9,7 +9,7 @@ from pairs.ir.module import Module, ModuleCall
 from pairs.ir.mutator import Mutator
 from pairs.ir.properties import ReallocProperty
 from pairs.ir.types import Types
-from pairs.ir.utils import Print
+from pairs.ir.print import Print
 from pairs.ir.variables import Var, Deref
 from functools import reduce
 import operator
@@ -195,7 +195,7 @@ class ReplaceModulesByCalls(Mutator):
 
                 resize_stmts.append(
                     Filter(sim, sim.resizes[resize_id] > 0, Block(sim,
-                        [Print(sim, f"resizes[{resize_id}] -> {capacity.name()}")] +
+                        # [Print(sim, f"resizes[{resize_id}] = " , sim.resizes[resize_id], f" {capacity.name()} = ", capacity)] +
                         [Assign(sim, capacity, self.grow_fn(sim.resizes[resize_id]))] +
                         [a.realloc() for a in capacity.bonded_arrays()] +
                         props_realloc)))