diff --git a/CMakeLists.txt b/CMakeLists.txt
index 27c9afc4b893be98110b0073e151c2b70e523a32..0ecd4d90006027630096f1c57c268bb5d494e7da 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,5 @@
 cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
 project(pairs CXX)
-
 # Set default build type if none is specified
 if(NOT CMAKE_BUILD_TYPE)
     set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build (Debug, Release, etc.)" FORCE)
@@ -9,30 +8,27 @@ endif()
 set(CMAKE_CXX_FLAGS_RELEASE "-O3")
 set(CMAKE_CXX_FLAGS_DEBUG "-g -O0 -DDEBUG")
 
-option(USE_WALBERLA "Enable waLBerla support for using BlockForest partitioning" ON)
-option(USE_MPI "USE_MPI" ON)
-option(COMPILE_CUDA "COMPILE_CUDA" ON)
-option(ENABLE_GPU_DIRECT "ENABLE_GPU_DIRECT" ON)
-option(GENERATE_WHOLE_PROGRAM "Generate the whole program (i.e. including the 'main' function). No additional source files are needed." OFF)
-option(BUILD_APP "Build a stand-alone app which uses the P4IRS modular interface. Provide your source files with -DUSER_SOURCE_FILES" OFF)
+option(USE_MPI                      "USE_MPI" ON)
+option(COMPILE_CUDA                 "COMPILE_CUDA" OFF)
+option(GENERATE_WHOLE_PROGRAM       "Generate the whole program (i.e. including the 'main' function). No additional source files are needed." OFF)
+option(BUILD_APP                    "Build a stand-alone app which uses the P4IRS modular interface. Provide your source files with -DUSER_SOURCE_FILES" OFF)
 
 if(GENERATE_WHOLE_PROGRAM AND BUILD_APP)
     message(FATAL_ERROR "You must choose either GENERATE_WHOLE_PROGRAM or BUILD_APP or neither.\n
-        Choose neither if you only want to use the generated libraries in your project (in a seperate build system).")
+        Choose neither if you only want to use the P4IRS library in your project (in a seperate build system).")
 endif()
 
-set(PYTHON_SCRIPT ${PYTHON_SCRIPT} CACHE PATH "The python script triggering code generation")
-if(NOT EXISTS ${PYTHON_SCRIPT})
-    message(FATAL_ERROR "PYTHON_SCRIPT doesn't exist! Specify it with -DPYTHON_SCRIPT=/path/to/script.py")
+set(INPUT_SCRIPT ${INPUT_SCRIPT} CACHE PATH "The input python script triggering code generation")
+if(NOT EXISTS ${INPUT_SCRIPT})
+    message(FATAL_ERROR "INPUT_SCRIPT doesn't exist! Specify it with -DINPUT_SCRIPT=/path/to/script.py")
 endif()
-get_filename_component(PYTHON_SCRIPT_NAME ${PYTHON_SCRIPT} NAME_WE)
+get_filename_component(INPUT_SCRIPT_NAME ${INPUT_SCRIPT} NAME_WE)
 
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
 
 #================================================================================
 # Setup directories =============================================================
 #================================================================================
-file(COPY runtime DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
 file(COPY data DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
 
 set(OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/output)
@@ -44,8 +40,10 @@ file(MAKE_DIRECTORY ${OUTPUT_DIR})
 #================================================================================
 # Generated header (internally used by runtime files) ===========================
 #================================================================================
-set(GEN_INTERFACE_HEADER ${CMAKE_CURRENT_BINARY_DIR}/runtime/interfaces/last_generated.hpp)
-set(GEN_INTERFACE_DIR ${CMAKE_CURRENT_BINARY_DIR}/runtime/interfaces)
+# TODO: Unify all interfaces
+set(GEN_INTERFACE_DIR ${CMAKE_CURRENT_BINARY_DIR}/internal_interfaces)
+set(GEN_INTERFACE_HEADER ${CMAKE_CURRENT_BINARY_DIR}/last_generated.hpp)
+file(MAKE_DIRECTORY ${GEN_INTERFACE_DIR})
 
 #================================================================================
 # RUNTIME_COMMON_FILES ==========================================================
@@ -80,6 +78,10 @@ else()
     list(APPEND PAIRS_LINK_LIBRARIES ${PAIRS_TARGET})
 endif()
 
+# Include P4IRS 'runtime' dir
+target_include_directories(${PAIRS_TARGET} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/runtime)
+list(APPEND PAIRS_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/runtime)
+
 set_target_properties(${PAIRS_TARGET} PROPERTIES 
     CXX_STANDARD_REQUIRED ON
     CXX_STANDARD 17
@@ -106,7 +108,15 @@ endif()
 #================================================================================
 # waLBerla ======================================================================
 #================================================================================
-if(USE_WALBERLA)
+set(WALBERLA_DIR ${WALBERLA_DIR} CACHE PATH "Path to waLBerla source directory (required only when using BlockForest partitioning).")
+
+if(WALBERLA_DIR)
+    if(EXISTS "${WALBERLA_DIR}")
+        target_compile_definitions(${PAIRS_TARGET} PUBLIC USE_WALBERLA)
+    else()
+        message(FATAL_ERROR "Invalid WALBERLA_DIR: '${WALBERLA_DIR}' does not exist.")
+    endif()
+
     set(RUNTIME_WALBERLA_FILES
         runtime/domain/block_forest.cpp
     )
@@ -119,7 +129,6 @@ if(USE_WALBERLA)
     endif()
 
     target_sources(${PAIRS_TARGET} PRIVATE ${RUNTIME_WALBERLA_FILES})
-    target_compile_definitions(${PAIRS_TARGET} PUBLIC USE_WALBERLA)
 
     ## Linking walberla modules
     set(PAIRS_WALBERLA_DEPENDENCIES blockforest core pe)
@@ -157,7 +166,7 @@ execute_process(
 if(COMPILE_CUDA)
     find_package(CUDA REQUIRED)
     enable_language(CUDA)
-    set(GEN_SOURCES "${CMAKE_CURRENT_BINARY_DIR}/${PYTHON_SCRIPT_NAME}.cu")
+    set(GEN_SOURCES "${CMAKE_CURRENT_BINARY_DIR}/${INPUT_SCRIPT_NAME}.cu")
     set(CUDA_ARCH ${CUDA_ARCH} CACHE STRING "CUDA_ARCH environment variable must be set.")
     set(TARGET_ARG "gpu")
 
@@ -174,6 +183,13 @@ if(COMPILE_CUDA)
         set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3")
     endif()
 
+    if(NOT DEFINED ENABLE_GPU_DIRECT)
+        set(ENABLE_GPU_DIRECT ON "Enable GPU Direct (default: ON when COMPILE_CUDA is ON)" FORCE)
+    else()
+         # User choice is respected here if they opt to COMPILE_CUDA without GPU Direct
+        set(ENABLE_GPU_DIRECT ${ENABLE_GPU_DIRECT} "Enable GPU Direct (user-defined)" FORCE)
+    endif()
+
     if(ENABLE_GPU_DIRECT)
         target_compile_definitions(${PAIRS_TARGET} PRIVATE ENABLE_CUDA_AWARE_MPI)
     endif()
@@ -192,7 +208,11 @@ if(COMPILE_CUDA)
     target_link_libraries(${PAIRS_TARGET} PUBLIC ${CUDA_LIBRARIES})
     list(APPEND PAIRS_LINK_LIBRARIES ${CUDA_LIBRARIES})
 else()
-    set(GEN_SOURCES "${CMAKE_CURRENT_BINARY_DIR}/${PYTHON_SCRIPT_NAME}.cpp")
+    if(ENABLE_GPU_DIRECT)
+        message(FATAL_ERROR "Invalid combination: ENABLE_GPU_DIRECT requires COMPILE_CUDA to be ON")
+    endif()
+
+    set(GEN_SOURCES "${CMAKE_CURRENT_BINARY_DIR}/${INPUT_SCRIPT_NAME}.cpp")
     set(TARGET_ARG "cpu")
     target_sources(${PAIRS_TARGET} PRIVATE runtime/devices/dummy.cpp)
 endif()
@@ -202,9 +222,9 @@ endif()
 #================================================================================
 add_custom_command(
     OUTPUT ${GEN_SOURCES} ${GEN_INTERFACE_HEADER}
-    COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_SCRIPT} ${TARGET_ARG}
+    COMMAND ${PYTHON_EXECUTABLE} ${INPUT_SCRIPT} ${TARGET_ARG}
     COMMENT "Generating code with P4IRS"
-    DEPENDS ${PYTHON_SCRIPT})
+    DEPENDS ${INPUT_SCRIPT})
 
 add_custom_target(generated_code DEPENDS ${GEN_SOURCES} ${GEN_INTERFACE_HEADER})
 add_dependencies(${PAIRS_TARGET} generated_code)
@@ -230,7 +250,11 @@ endif()
 # LIKWID ========================================================================
 #================================================================================
 if(LIKWID_DIR)
-    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLIKWID_PERFMON -pthread -L${LIKWID_DIR}/lib -llikwid ")
+    target_compile_options(${PAIRS_TARGET} PRIVATE -DLIKWID_PERFMON -pthread)
+
+    target_link_libraries(${PAIRS_TARGET} PRIVATE ${LIKWID_DIR}/lib/liblikwid.a)
+    list(APPEND PAIRS_LINK_LIBRARIES ${LIKWID_DIR}/lib/liblikwid.a)
+
     include_directories(${LIKWID_DIR}/include)
     list(APPEND PAIRS_INCLUDE_DIRS "${LIKWID_DIR}/include")
 endif()
diff --git a/Makefile b/Makefile
deleted file mode 100644
index 967a1a9f117d174c758b82a6af3c41c296ded89c..0000000000000000000000000000000000000000
--- a/Makefile
+++ /dev/null
@@ -1,87 +0,0 @@
-.PHONY: all build clean
-
-# General settings
-TESTCASE=md
-PYCMD=python3
-
-# C/C++ compiler settings
-CC=mpic++
-CFLAGS=-O3 -mavx2 -mfma -fopenmp ${MPI_FLAGS} ${LIKWID_FLAGS}
-#CFLAGS=-Ofast -xHost -qopt-zmm-usage=high ${MPI_FLAGS} ${LIKWID_FLAGS}
-#CFLAGS=-Ofast -xCORE-AVX512 -qopt-zmm-usage=high ${MPI_FLAGS} ${LIKWID_FLAGS}
-DEBUG_FLAGS=
-#DEBUG_FLAGS=-DDEBUG
-
-# CUDA settings
-NVCC=nvcc
-#NVCC_FLAGS=-O3 -mavx2 -mfma
-NVCC_FLAGS=-O3 -arch=sm_80 -mavx2 -mfma -ffast-math -funroll-loops --forward-unknown-to-host-compiler
-#NVCC_FLAGS=-O3 -arch=sm_80 -march=native -ffast-math -funroll-loops --forward-unknown-to-host-compiler
-NVCC_PATH:="$(shell which ${NVCC})"
-CUDA_FLAGS=-DENABLE_CUDA_AWARE_MPI
-CUDART_FLAGS=-lcudart -L /apps/SPACK/0.19.1/opt/linux-almalinux8-zen/gcc-8.5.0/nvhpc-23.7-bzxcokzjvx4stynglo4u2ffpljajzlam/Linux_x86_64/23.7/cuda/12.2/targets/x86_64-linux/lib
-
-# MPI settings
-MPI_PATH=/apps/SPACK/0.19.1/opt/linux-almalinux8-zen/intel-2021.10.0/openmpi-4.1.6-ijsnjhq77rjc256wlrp52m37rsq6miff
-MPI_FLAGS=-I${MPI_PATH}/include
-
-# Likwid settings
-LIKWID_INC ?= -I/usr/local/include
-LIKWID_DEFINES ?= -DLIKWID_PERFMON
-LIKWID_LIB ?= -L/usr/local/lib
-LIKWID_FLAGS = -llikwid ${LIKWID_INC} ${LIKWID_DEFINES} ${LIKWID_LIB}
-
-# Other
-CPU_OBJ_PATH=obj_cpu
-CPU_SRC="$(TESTCASE).cpp"
-CPU_BIN="$(TESTCASE)_cpu"
-GPU_OBJ_PATH=obj_gpu
-GPU_SRC="$(TESTCASE).cu"
-GPU_BIN="$(TESTCASE)_gpu"
-
-all: clean build $(CPU_BIN) $(GPU_BIN)
-	@echo "Everything was done!"
-
-build:
-	@echo "Building pairs package..."
-	$(PYCMD) setup.py build && $(PYCMD) setup.py install --user
-
-$(CPU_SRC):
-	@echo "Generating and compiling $(TESTCASE) example for CPU..."
-	@mkdir -p $(CPU_OBJ_PATH)
-	$(PYCMD) examples/$(TESTCASE).py cpu
-
-$(GPU_SRC):
-	@echo "Generating and compiling $(TESTCASE) example for GPU..."
-	@mkdir -p $(GPU_OBJ_PATH)
-	$(PYCMD) examples/$(TESTCASE).py gpu
-
-$(CPU_OBJ_PATH)/pairs.o: runtime/pairs.cpp
-	$(CC) -c -o $@ $< $(DEBUG_FLAGS) $(CFLAGS)
-
-$(CPU_OBJ_PATH)/regular_6d_stencil.o: runtime/domain/regular_6d_stencil.cpp
-	$(CC) -c -o $@ $< $(DEBUG_FLAGS) $(CFLAGS)
-
-$(CPU_OBJ_PATH)/dummy.o: runtime/devices/dummy.cpp
-	$(CC) -c -o $@ $< $(DEBUG_FLAGS) $(CFLAGS)
-
-$(GPU_OBJ_PATH)/pairs.o: runtime/pairs.cpp
-	$(CC) -c -o $@ $< $(DEBUG_FLAGS) $(MPI_FLAGS) $(CFLAGS) $(CUDA_FLAGS)
-
-$(GPU_OBJ_PATH)/regular_6d_stencil.o: runtime/domain/regular_6d_stencil.cpp
-	$(CC) -c -o $@ $< $(DEBUG_FLAGS) $(MPI_FLAGS) $(CFLAGS) $(CUDA_FLAGS)
-
-$(GPU_OBJ_PATH)/cuda_runtime.o: runtime/devices/cuda.cu
-	$(NVCC) $(NVCC_FLAGS) -c -o $@ $< $(DEBUG_FLAGS) $(MPI_FLAGS) $(CUDA_FLAGS)
-
-# Targets
-$(CPU_BIN): $(CPU_SRC) $(CPU_OBJ_PATH)/pairs.o $(CPU_OBJ_PATH)/regular_6d_stencil.o $(CPU_OBJ_PATH)/dummy.o
-	$(CC) $(CFLAGS) -o $(CPU_BIN) $(CPU_SRC) $(CPU_OBJ_PATH)/pairs.o $(CPU_OBJ_PATH)/regular_6d_stencil.o $(CPU_OBJ_PATH)/dummy.o $(DEBUG_FLAGS)
-
-$(GPU_BIN): $(GPU_SRC) $(GPU_OBJ_PATH)/pairs.o $(GPU_OBJ_PATH)/regular_6d_stencil.o $(GPU_OBJ_PATH)/cuda_runtime.o
-	$(NVCC) $(NVCC_FLAGS) -c -o $(GPU_OBJ_PATH)/$(GPU_BIN).o $(GPU_SRC) $(DEBUG_FLAGS) $(MPI_FLAGS) $(CUDA_FLAGS)
-	$(CC) -o $(GPU_BIN) $(GPU_OBJ_PATH)/$(GPU_BIN).o $(GPU_OBJ_PATH)/cuda_runtime.o $(GPU_OBJ_PATH)/pairs.o $(GPU_OBJ_PATH)/regular_6d_stencil.o $(CUDART_FLAGS) $(CUDA_FLAGS) $(CFLAGS)
-
-clean:
-	@echo "Cleaning..."
-	rm -rf build $(CPU_BIN) $(GPU_BIN) $(CPU_SRC) $(GPU_SRC) dist pairs.egg-info functions functions.pdf $(CPU_OBJ_PATH) $(GPU_OBJ_PATH)
diff --git a/README.md b/README.md
index bcef6a27a6d2ebb070f8098c59934bcc9fc73a43..882644c488aa6495080216615ba203927e6d0dd6 100644
--- a/README.md
+++ b/README.md
@@ -3,10 +3,6 @@
 P4IRS is an open-source, stand-alone compiler and domain-specific language for particle simulations which aims at generating optimized code for different target hardwares.
 It is released as a Python package and allows users to define kernels, integrators and other particle routines in a high-level and straightforward fashion without the need to implement any backend code.
 
-## Build instructions
-
-There is a Makefile which contains configurable environment variables such as `TESTCASE` compiler parameters evaluate P4IRS performance on different scenarios.
-`TESTCASE` refers to any of the files within the `examples` directory, such as `md` and `dem`.
 
 ## Usage
 
@@ -104,6 +100,64 @@ else:
 psim.generate()
 ```
 
+## Build instructions
+
+P4IRS can be built in 3 different modes using the CMake build system. Before we demostrate each mode, ensure you have CMake, MPI and CUDA (if targeting GPU execution) available in your environment.
+
+In the following, we assume we have created and navigated to a build directory: `mkdir build; cd build` 
+
+**General CMake flags (applicable to all 3 modes):**  
+* Pass your input script to CMake using `-DINPUT_SCRIPT=path/to/script.py`  
+* Enable CUDA with `-DCOMPILE_CUDA=ON`
+* Enable support for BlockForest domain partitioning and dynamic load balancing by providing the path to waLBerla source directory `-DWALBERLA_DIR=path/to/walberla` (TODO: waLBerla as a submodule)
+
+
+### 1. Whole-program generation:
+---------------------
+In this mode, everything including the `main` function is generated by P4IRS.
+
+1. Set `generate_whole_program=True` in the input script
+2. Set the CMake flag `-DGENERATE_WHOLE_PROGRAM=ON`
+
+Example: Build [md.py](examples/whole-program-generation/md.py)
+```
+cmake -DINPUT_SCRIPT=../examples/whole-program-generation/md.py -DGENERATE_WHOLE_PROGRAM=ON ..
+``` 
+Now call `make` and an **executable** is built.
+
+
+### 2. Modular stand-alone app
+---------------------
+You can build a stand-alone C++ app which uses the P4IRS modular interface.
+
+1. Set `generate_whole_program=False` in the input script
+2. Set the CMake flag `-DBUILD_APP=ON`
+3. Provide the list of your source files to CMake (semicolon-seperated):`-DUSER_SOURCE_FILES=path/to/main.cpp;path/to/helper.cpp`
+
+Example: Build the application [sd_1.cpp](examples/modular/sd_1.cpp) with [spring_dashpot.py](examples/modular/spring_dashpot.py)  
+Note: In this example we assume waLBerla has been already cloned next to the P4IRS directory.
+
+```
+cmake -DINPUT_SCRIPT=../examples/modular/spring_dashpot.py -DBUILD_APP=ON -DUSER_SOURCE_FILES=../examples/modular/sd_1.cpp -DWALBERLA_DIR=../../walberla ..
+```
+Now call `make` and an **executable** is built.
+
+
+### 3. P4IRS as a library
+---------------------
+In this mode, P4IRS is compiled as a library that can be integrated into other projects.
+
+1. Set `generate_whole_program=False` in the input script
+2. Ensure both `BUILD_APP` and `GENERATE_WHOLE_PROGRAM` are `OFF` (they are OFF by default)
+
+Configure CMake and call `make` as usual, and a **static library** is built. You can then include P4IRS and its dependencies in your build system as follows:
+```cmake
+find_package(pairs REQUIRED HINTS "path/to/pairs/build" NO_DEFAULT_PATH)
+target_include_directories(my_app PUBLIC ${PAIRS_INCLUDE_DIRS})
+target_link_directories(my_app PUBLIC ${PAIRS_LINK_DIRS})
+target_link_libraries(my_app PUBLIC ${PAIRS_LINK_LIBRARIES})
+```
+
 ## Citations
 
 TBD
diff --git a/cmake/FindwaLBerla.cmake b/cmake/FindwaLBerla.cmake
index 3eb44c5c971d010f86fef97182d3f499413b1cbf..8f87e88a03902f1c1af3900a3d4d38b921996682 100644
--- a/cmake/FindwaLBerla.cmake
+++ b/cmake/FindwaLBerla.cmake
@@ -1,5 +1,3 @@
-set( WALBERLA_DIR    WALBERLA_DIR-NOTFOUND   CACHE  PATH  "waLBerla path"  )
-
 if ( WALBERLA_DIR )
     # WALBERLA_DIR has to point to the waLBerla source directory
     # this command builds waLBerla (again) in the current build directory in the subfolder "walberla" (second argument)
diff --git a/data/planes.input b/data/planes.input
index 5de074bf8a14746a8f30ad3e3f612ff34a0c70c9..af3744e15f94a6ec863856ed5913c4fc18d344a3 100644
--- a/data/planes.input
+++ b/data/planes.input
@@ -1,2 +1,2 @@
-0,1,0.0,0.0,0.0,0.0,0.0,1.0,13
-0,1,0.8,0.015,0.2,0.0,0.0,-1.0,13
+0,  1,  0.0,    0.0,    0.0,    0.0,    0.0,    1.0,    13
+0,  1,  0.8,    0.015,  0.2,    0.0,    0.0,    -1.0,   13
diff --git a/data/sd_planes.input b/data/sd_planes.input
index e47832e4ae0413a207b8a5e29446f78b5da2c2c7..42c7724c8b621dfca891624bc9c3d58aea6ca606 100644
--- a/data/sd_planes.input
+++ b/data/sd_planes.input
@@ -1,6 +1,6 @@
 0, 1, 0, 0, 0, 1, 0, 0, 13
 0, 1, 0, 0, 0, 0, 1, 0, 13
 0, 1, 0, 0, 0, 0, 0, 1, 13
-0, 1, 1, 1, 1, -1, 0, 0, 13
-0, 1, 1, 1, 1, 0, -1, 0, 13
-0, 1, 1, 1, 1, 0, 0, -1, 13
+0, 1, 10, 10, 10, -1, 0, 0, 13
+0, 1, 10, 10, 10, 0, -1, 0, 13
+0, 1, 10, 10, 10, 0, 0, -1, 13
diff --git a/examples/dem2.py b/examples/dem2.py
deleted file mode 100644
index 7ebb47b1a335c4e0728a62500ecb0e2adcd81402..0000000000000000000000000000000000000000
--- a/examples/dem2.py
+++ /dev/null
@@ -1,111 +0,0 @@
-import pairs
-import sys
-
-
-def linear_spring_dashpot(i, j):
-    penetration_depth = squared_distance(i, j) - radius[i] - radius[j]
-    skip_when(penetration_depth >= 0.0)
-
-    delta_ij = delta(i, j)
-    contact_normal = delta_ij / length(delta_ij)
-    k = radius[j] + 0.5 * penetration_depth
-    contact_point = position[j] + contact_normal * k
-
-    rel_vel = -velocity_wf[i] - velocity_wf[j]
-    rel_vel_n = dot(rel_vel, contact_normal) * contact_normal
-    rel_vel_t = rel_vel - rel_vel_n
-
-    fN = stiffness_norm[i, j] * (-penetration_depth) * contact_normal + damping_norm[i, j] * rel_vel_n;
-
-    tan_spring_disp = tangential_spring_displacement[i, j]
-    impact_vel_magnitude = impact_velocity_magnitude[i, j]
-    impact_magnitude = select(impact_vel_magnitude > 0.0, impact_vel_magnitude, length(rel_vel))
-    sticking = is_sticking[i, j]
-
-    rotated_tan_disp = tan_spring_disp - contact_normal * (contact_normal * tan_spring_disp)
-    new_tan_spring_disp = dt * rel_vel_t + \
-                          select(squared_length(rotated_tan_disp) <= 0.0,
-                                 zero_vector(),
-                                 rotated_tan_disp * length(tan_spring_disp) / length(rotated_tan_disp))
-
-    fTLS = stiffness_tan[i, j] * new_tan_spring_disp + damping_tan[i, j] * rel_vel_t
-    fTLS_len = length(fTLS)
-    fTLS_inv_len = 1.0 / fTLS_len
-    t = select(fTLS_len > 0, fTLS / fTLS_inv_len, zero_vector())
-
-    f_friction_abs_static = friction_static[i, j] * length(fN)
-    f_friction_abs_dynamic = friction_dynamic[i, j] * length(fN)
-    tan_vel_threshold = 1e-8
-
-    cond1 = sticking == 1 and length(rel_vel_t) < tan_vel_threshold and fTLS_len < f_friction_abs_static
-    cond2 = sticking == 1 and fTLS_len < f_friction_abs_dynamic
-    f_friction_abs = select(cond1, f_friction_abs_static, f_friction_abs_dynamic)
-    n_sticking = select(cond1 or cond2 or fTLS_len < f_friction_abs_dynamic, 1, 0)
-
-    if not cond1 and not cond2 and stiffness_tan[i, j] > 0.0:
-        tangential_spring_displacement[i, j] = \
-            (f_friction_abs * t - damping_tan[i, j] * rel_vel_t) / stiffness_tan[i, j]
-
-    else:
-        tangential_spring_displacement[i, j] = new_tan_spring_disp
-
-    impact_velocity_magnitude[i, j] = impact_magnitude
-    is_sticking[i, j] = n_sticking
-
-    fTabs = min(fTLS_len, f_friction_abs)
-    fT = fTabs * t
-    force[i] += fN + fT
-
-
-def euler(i):
-    velocity[i] += dt * force[i] / mass[i]
-    position[i] += dt * velocity[i]
-
-
-cmd = sys.argv[0]
-target = sys.argv[1] if len(sys.argv[1]) > 1 else "none"
-if target != 'cpu' and target != 'gpu':
-    print(f"Invalid target, use {cmd} <cpu/gpu>")
-
-
-dt = 0.005
-cutoff_radius = 2.5
-skin = 0.3
-ntypes = 4
-stiffness_norm = 1.0
-stiffness_tan = 1.0
-damping_norm = 1.0
-damping_tan = 1.0
-friction_static = 1.0
-friction_dynamic = 1.0
-
-psim = pairs.simulation("dem", debug=True)
-psim.add_position('position')
-psim.add_property('mass', pairs.double(), 1.0)
-psim.add_property('velocity', pairs.vector())
-psim.add_property('velocity_wf', pairs.vector())
-psim.add_property('force', pairs.vector(), vol=True)
-psim.add_property('radius', pairs.double(), 1.0)
-psim.add_feature('type', ntypes)
-psim.add_feature_property('type', 'stiffness_norm', pairs.double(), [stiffness_norm for i in range(ntypes * ntypes)])
-psim.add_feature_property('type', 'stiffness_tan', pairs.double(), [stiffness_tan for i in range(ntypes * ntypes)])
-psim.add_feature_property('type', 'damping_norm', pairs.double(), [damping_norm for i in range(ntypes * ntypes)])
-psim.add_feature_property('type', 'damping_tan', pairs.double(), [damping_tan for i in range(ntypes * ntypes)])
-psim.add_feature_property('type', 'friction_static', pairs.double(), [friction_static for i in range(ntypes * ntypes)])
-psim.add_feature_property('type', 'friction_dynamic', pairs.double(), [friction_dynamic for i in range(ntypes * ntypes)])
-psim.add_contact_property('is_sticking', pairs.int32(), False)
-psim.add_contact_property('tangential_spring_displacement', pairs.vector(), [0.0, 0.0, 0.0])
-psim.add_contact_property('impact_velocity_magnitude', pairs.double(), 0.0)
-
-psim.read_particle_data("data/fluidized_bed.input", ['mass', 'position', 'velocity'])
-psim.build_neighbor_lists(cutoff_radius + skin)
-psim.vtk_output(f"output/test_{target}")
-psim.compute(linear_spring_dashpot, cutoff_radius, symbols={'dt': dt})
-psim.compute(euler, symbols={'dt': dt})
-
-if target == 'gpu':
-    psim.target(pairs.target_gpu())
-else:
-    psim.target(pairs.target_cpu())
-
-psim.generate()
diff --git a/examples/dem_sd.py b/examples/dem_sd.py
deleted file mode 100644
index da81aad4002cfb84d78f0cabed2ce40ab524198f..0000000000000000000000000000000000000000
--- a/examples/dem_sd.py
+++ /dev/null
@@ -1,184 +0,0 @@
-import math
-import pairs
-import sys
-import os
-
-def update_mass_and_inertia(i):
-    rotation_matrix[i] = diagonal_matrix(1.0)
-    rotation[i] = default_quaternion()
-
-    if is_sphere(i):
-        inv_inertia[i] = inversed(diagonal_matrix(0.4 * mass[i] * radius[i] * radius[i]))
-
-    else:
-        mass[i] = infinity
-        inv_inertia[i] = 0.0
-
-
-def linear_spring_dashpot(i, j):
-    delta_ij = -penetration_depth(i, j)
-    skip_when(delta_ij < 0.0)
-
-    meff = 1.0 / ((1.0 / mass[i]) + (1.0 / mass[j]))
-    stiffness_norm = meff * (pi * pi + lnDryResCoeff * lnDryResCoeff) / \
-                     (collisionTime_SI * collisionTime_SI)
-    damping_norm = -2.0 * meff * lnDryResCoeff / collisionTime_SI
-    damping_tan = sqrt(kappa) * damping_norm
-
-    velocity_wf_i = linear_velocity[i] + cross(angular_velocity[i], contact_point(i, j) - position[i])
-    velocity_wf_j = linear_velocity[j] + cross(angular_velocity[j], contact_point(i, j) - position[j])
-
-    rel_vel = -(velocity_wf_i - velocity_wf_j)
-    rel_vel_n = dot(rel_vel, contact_normal(i, j))
-    rel_vel_t = rel_vel - rel_vel_n * contact_normal(i, j)
-    fNabs = stiffness_norm * delta_ij + damping_norm * rel_vel_n
-    fN = fNabs * contact_normal(i, j)
-
-    fTabs = min(damping_tan * length(rel_vel_t), friction_dynamic[i, j] * fNabs) # static or dynamic?
-    fT = fTabs * normalized(rel_vel_t)
-
-    partial_force = fN + fT
-
-    apply(force, partial_force)
-    apply(torque, cross(contact_point(i, j) - position, partial_force))
-
-
-def euler(i):
-    inv_mass = 1.0 / mass[i]
-    position[i] += 0.5 * inv_mass * force[i] * dt * dt + linear_velocity[i] * dt
-    linear_velocity[i] += inv_mass * force[i] * dt
-    wdot = rotation_matrix[i] * (inv_inertia[i] * torque[i]) * transposed(rotation_matrix[i])
-    phi = angular_velocity[i] * dt + 0.5 * wdot * dt * dt
-    rotation[i] = quaternion(phi, length(phi)) * rotation[i]
-    rotation_matrix[i] = quaternion_to_rotation_matrix(rotation[i])
-    angular_velocity[i] += wdot * dt
-
-
-def gravity(i):
-    volume = (4.0 / 3.0) * pi * radius[i] * radius[i] * radius[i]
-    force[i][2] = force[i][2] - (densityParticle_SI - densityFluid_SI) * volume * gravity_SI
-    # if type[i]==0:
-    #     printf("gravity", i, volume, force[i][2], volume/2, 2>1)
-
-
-cmd = sys.argv[0]
-target = sys.argv[1] if len(sys.argv[1]) > 1 else "none"
-if target != 'cpu' and target != 'gpu':
-    print(f"Invalid target, use {cmd} <cpu/gpu>")
-
-# Config file parameters
-domainSize_SI=[0.1, 0.1, 0.1]
-diameter_SI = 0.009
-gravity_SI = 9.81
-densityFluid_SI = 1000
-densityParticle_SI = 2550
-generationSpacing_SI = 0.01
-initialVelocity_SI = 1
-dt_SI = 5e-5
-frictionCoefficient = 0.5
-restitutionCoefficient = 0.1
-collisionTime_SI = 5e-4
-poissonsRatio = 0.22
-timeSteps = 1
-visSpacing = 200
-denseBottomLayer = False
-bottomLayerOffsetFactor = 1.0
-kappa = 2.0 * (1.0 - poissonsRatio) / (2.0 - poissonsRatio) # from Thornton et al
-
-minDiameter_SI = diameter_SI #* 0.9
-maxDiameter_SI = diameter_SI #* 1.1
-linkedCellWidth = 1.01 * maxDiameter_SI
-
-ntypes = 1
-lnDryResCoeff = math.log(restitutionCoefficient)
-frictionStatic = 0.0
-frictionDynamic = frictionCoefficient
-
-file_name = os.path.basename(__file__)
-file_name_without_extension = os.path.splitext(file_name)[0]
-
-psim = pairs.simulation(
-    file_name_without_extension,
-    [pairs.sphere(), pairs.halfspace()],
-    timesteps=timeSteps,
-    double_prec=True,
-    use_contact_history=False,
-    particle_capacity=1000000,
-    neighbor_capacity=20,
-    debug=True, generate_whole_program=False)
-
-if target == 'gpu':
-    psim.target(pairs.target_gpu())
-else:
-    psim.target(pairs.target_cpu())
-    #psim.target(pairs.target_cpu(parallel=True))
-
-psim.add_position('position')
-psim.add_property('mass', pairs.real(), 1.0)
-psim.add_property('linear_velocity', pairs.vector())
-psim.add_property('angular_velocity', pairs.vector())
-psim.add_property('force', pairs.vector(), volatile=True)
-psim.add_property('torque', pairs.vector(), volatile=True)
-psim.add_property('hydrodynamic_force', pairs.vector(), reduce=True)
-psim.add_property('hydrodynamic_torque', pairs.vector(), reduce=True)
-# psim.add_property('old_hydrodynamic_force', pairs.vector())
-# psim.add_property('old_hydrodynamic_torque', pairs.vector())
-psim.add_property('radius', pairs.real(), 1.0)
-psim.add_property('normal', pairs.vector())
-psim.add_property('inv_inertia', pairs.matrix())
-psim.add_property('rotation_matrix', pairs.matrix())
-psim.add_property('rotation', pairs.quaternion())
-psim.add_feature('type', ntypes)
-psim.add_feature_property('type', 'friction_static', pairs.real(), [frictionStatic for i in range(ntypes * ntypes)])
-psim.add_feature_property('type', 'friction_dynamic', pairs.real(), [frictionDynamic for i in range(ntypes * ntypes)])
-
-# psim.set_domain([0.0, 0.0, 0.0, domainSize_SI[0], domainSize_SI[1], domainSize_SI[2]])
-psim.set_domain_partitioner(pairs.block_forest())
-# psim.set_domain_partitioner(pairs.regular_domain_partitioner())
-psim.pbc([False, False, False])
-psim.dem_sc_grid(
-    domainSize_SI[0], domainSize_SI[1], domainSize_SI[2], generationSpacing_SI,
-    diameter_SI, minDiameter_SI, maxDiameter_SI, initialVelocity_SI, densityParticle_SI, ntypes)
-
-#psim.read_particle_data(
-#    "data/spheres.input",
-#    "data/spheres_4x4x2.input",
-#    "data/spheres_6x6x2.input",
-#    "data/spheres_8x8x2.input",
-#    ['uid', 'type', 'mass', 'radius', 'position', 'linear_velocity', 'flags'],
-#    pairs.sphere())
-
-#psim.read_particle_data(
-#    "data/spheres_bottom.input",
-#    ['type', 'mass', 'radius', 'position', 'linear_velocity', 'flags'],
-#    pairs.sphere())
-
-# psim.read_particle_data(
-#     "data/planes.input",
-#     ['type', 'mass', 'position', 'normal', 'flags'],
-#     pairs.halfspace())
-
-psim.setup(update_mass_and_inertia, {'densityParticle_SI': densityParticle_SI,
-                                     'pi': math.pi,
-                                     'infinity': math.inf })
-
-#psim.compute_half()
-psim.build_cell_lists(linkedCellWidth)
-psim.vtk_output(f"output/dem_{target}", frequency=visSpacing)
-
-psim.compute(gravity,
-             symbols={'densityParticle_SI': densityParticle_SI,
-                      'densityFluid_SI': densityFluid_SI,
-                      'gravity_SI': gravity_SI,
-                      'pi': math.pi })
-
-psim.compute(linear_spring_dashpot,
-             linkedCellWidth,
-             symbols={'pi': math.pi,
-                      'kappa': kappa,
-                      'lnDryResCoeff': lnDryResCoeff,
-                      'collisionTime_SI': collisionTime_SI})
-
-psim.compute(euler, parameters={'dt' : pairs.real()})
-# psim.compute(euler, symbols={'dt' : dt_SI})
-psim.generate()
diff --git a/examples/lift.py b/examples/lift.py
deleted file mode 100644
index 2b48fdf02a0afdcc7075843cdab8a1852cee9e45..0000000000000000000000000000000000000000
--- a/examples/lift.py
+++ /dev/null
@@ -1,14 +0,0 @@
-from coupling.parse_cpp import parse_walberla_file
-from coupling.parse_cpp import get_class_method, print_tree
-
-filename = "mesa_pd/kernel/SpringDashpot.hpp"
-translation_unit = parse_walberla_file(filename)
-
-# subtree = get_subtree(tu.cursor, "walberla::mesa_pd::kernel")
-# print_tree(subtree)
-
-kernel = get_class_method(
-        translation_unit.cursor,
-        "walberla::mesa_pd::kernel::SpringDashpot",
-        "operator()")
-print_tree(kernel)
diff --git a/examples/lj_embedded.py b/examples/lj_embedded.py
deleted file mode 100644
index 160c6081bacc9e0a3cd6e8561ebff9ea2eb7bf15..0000000000000000000000000000000000000000
--- a/examples/lj_embedded.py
+++ /dev/null
@@ -1,32 +0,0 @@
-import pairs
-
-
-dt = 0.005
-cutoff_radius = 2.5
-skin = 0.3
-sigma = 1.0
-epsilon = 1.0
-sigma6 = sigma ** 6
-
-psim = pairs.simulation("lj")
-mass = psim.add_real_property('mass', 1.0)
-position = psim.add_vector_property('position')
-velocity = psim.add_vector_property('velocity')
-force = psim.add_vector_property('force', vol=True)
-psim.from_file("data/minimd_setup_4x4x4.input", ['mass', 'position', 'velocity'])
-psim.create_cell_lists(2.8, 2.8)
-psim.create_neighbor_lists()
-psim.periodic(2.8)
-psim.vtk_output("output/test")
-
-for i, j, delta, rsq in psim.particle_pairs(cutoff_radius, position):
-    sr2 = 1.0 / rsq
-    sr6 = sr2 * sr2 * sr2 * sigma6
-    f = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon
-    force[i].add(delta * f)
-
-for i in psim.particles():
-    velocity[i].add(dt * force[i] / mass[i])
-    position[i].add(dt * velocity[i])
-
-psim.generate()
diff --git a/examples/lj_onetype.py b/examples/lj_onetype.py
deleted file mode 100644
index e703122d5469e96458dd831aedbfa4cf9c456090..0000000000000000000000000000000000000000
--- a/examples/lj_onetype.py
+++ /dev/null
@@ -1,45 +0,0 @@
-import pairs
-import sys
-
-
-def lj(i, j):
-    sr2 = 1.0 / rsq
-    sr6 = sr2 * sr2 * sr2 * sigma6
-    force[i] += delta * 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon
-
-
-def euler(i):
-    velocity[i] += dt * force[i] / mass[i]
-    position[i] += dt * velocity[i]
-
-
-cmd = sys.argv[0]
-target = sys.argv[1] if len(sys.argv[1]) > 1 else "none"
-if target != 'cpu' and target != 'gpu':
-    print(f"Invalid target, use {cmd} <cpu/gpu>")
-
-
-dt = 0.005
-cutoff_radius = 2.5
-skin = 0.3
-sigma = 1.0
-epsilon = 1.0
-sigma6 = sigma ** 6
-
-psim = pairs.simulation("lj", debug=True)
-psim.add_real_property('mass', 1.0)
-psim.add_position('position')
-psim.add_vector_property('velocity')
-psim.add_vector_property('force', vol=True)
-psim.from_file("data/minimd_setup_32x32x32.input", ['mass', 'position', 'velocity'])
-psim.build_neighbor_lists(cutoff_radius + skin)
-psim.vtk_output(f"output/test_{target}")
-psim.compute(lj, cutoff_radius, {'sigma6': sigma6, 'epsilon': epsilon})
-psim.compute(euler, symbols={'dt': dt})
-
-if target == 'gpu':
-    psim.target(pairs.target_gpu())
-else:
-    psim.target(pairs.target_cpu())
-
-psim.generate()
diff --git a/examples/main.cpp b/examples/main.cpp
deleted file mode 100644
index 42a9575e000df84e33af7aa51adb683f3b94c5d6..0000000000000000000000000000000000000000
--- a/examples/main.cpp
+++ /dev/null
@@ -1,175 +0,0 @@
-#include <iostream>
-//---
-#include "dem_sd.hpp"
-#include <blockforest/BlockForest.h>
-#include <blockforest/Initialization.h>
-
-int main(int argc, char **argv) {
-    double dt = 5e-5;
-    auto pairs_sim = std::make_shared<PairsSimulation>();
-    auto pairs_acc = std::make_shared<PairsAccessor>(pairs_sim);
-
-    // Create forest (make sure to use_domain(forest)) ----------------------------------------------
-    // walberla::math::AABB domain(0, 0, 0, 0.1, 0.1, 0.1);
-    // std::shared_ptr<walberla::mpi::MPIManager> mpiManager = walberla::mpi::MPIManager::instance();
-    // mpiManager->initializeMPI(&argc, &argv);
-    // mpiManager->useWorldComm();
-    // auto procs = mpiManager->numProcesses();
-    // auto block_config = walberla::Vector3<int>(2, 2, 2);
-    // auto ref_level = 0;
-    // std::shared_ptr<walberla::BlockForest> forest = walberla::blockforest::createBlockForest(
-    //         domain, block_config, walberla::Vector3<bool>(false, false, false), procs, ref_level);
-    //-----------------------------------------------------------------------------------------------
-
-    // initialize pairs data structures ----------------------------------------------
-    pairs_sim->initialize();
-
-    // either create new domain or use an existing one ----------------------------------------
-    pairs_sim->set_domain(argc, argv, 0, 0, 0, 0.1, 0.1, 0.1);
-
-    // pairs_sim->use_domain(forest);
-
-    // create planes and particles ------------------------------------------------------------
-    pairs_sim->create_halfspace(0,     0,      0,      1, 0, 0,        0, 13);
-    pairs_sim->create_halfspace(0,     0,      0,      0, 1, 0,        0, 13);
-    pairs_sim->create_halfspace(0,     0,      0,      0, 0, 1,        0, 13);
-    pairs_sim->create_halfspace(0.1,   0.1,    0.1,    -1, 0, 0,       0, 13);
-    pairs_sim->create_halfspace(0.1,   0.1,    0.1,    0, -1, 0,       0, 13);
-    pairs_sim->create_halfspace(0.1,   0.1,    0.1,    0, 0, -1,       0, 13);
-
-    // pairs::id_t pUid = pairs_sim->create_sphere(0.0499,   0.0499,   0.07,   0.5, 0.5, 0 ,   1000, 0.0045, 0, 0);
-    // pairs::id_t pUid2 = pairs_sim->create_sphere(0.0499,   0.0499,   0.0499,   0.5, 0.5, 0 ,   1000, 0.0045, 0, 0);
-
-    // Tracking a particle ------------------------------------------------------------------------
-    // if (pUid != pairs_acc->getInvalidUid()){
-    //     std::cout<< "Particle " << pUid << " is created in rank " << rank << std::endl;
-    // }
-    
-    // walberla::mpi::allReduceInplace(pUid, walberla::mpi::SUM);
-    // walberla::mpi::allReduceInplace(pUid2, walberla::mpi::SUM);
-    // MPI_Allreduce(MPI_IN_PLACE, &pUid, 1, MPI_LONG_LONG_INT, MPI_SUM, MPI_COMM_WORLD);
-    // MPI_Allreduce(MPI_IN_PLACE, &pUid2, 1, MPI_LONG_LONG_INT, MPI_SUM, MPI_COMM_WORLD);
-
-
-    // if (pUid != pairs_acc->getInvalidUid()){
-    //     std::cout<< "Particle " << pUid << " will be tracked by rank " << rank << std::endl;
-    // }
-
-    // auto pIsLocalInMyRank = [&](pairs::id_t uid){return pairs_acc->uidToIdx(uid) != pairs_acc->getInvalidIdx();};
-    // auto pIsGhostInMyRank = [&](pairs::id_t uid){return pairs_acc->uidToIdxGhost(uid) != pairs_acc->getInvalidIdx();};
-
-
-    // TODO: make sure linkedCellWidth is larger than max diameter in the system
-    // setup particles, setup functions, and the cell list stencil-------------------------------
-    pairs_sim->setup_sim();
-
-    // for(int i=0; i<pairs_acc->size(); ++i){
-    //     if(pairs_acc->getShape(i) == 0){
-    //         std::cout<< "rank: " << rank << " sphere " <<  pairs_acc->getUid(i) << "  " << pairs_acc->getPosition(i) << std::endl; 
-    //     }
-    //     else if(pairs_acc->getShape(i) == 1){
-    //         std::cout<< "rank: " << rank << " halfspace " <<  pairs_acc->getUid(i) << "  " << pairs_acc->getPosition(i) << pairs_acc->getNormal(i) << std::endl; 
-    //     }
-    // }
-
-    // if (pIsLocalInMyRank(pUid)){
-    //     int idx = pairs_acc->uidToIdx(pUid);
-    //     pairs_acc->setPosition(idx, walberla::Vector3<double>(0.01, 0.01, 0.08));
-    //     pairs_acc->setLinearVelocity(idx, walberla::Vector3<double>(0.5, 0.5, 0.5));
-    // }
-
-    for (int t=0; t<5000; ++t){
-        // if ((t%200==0) && (pIsLocalInMyRank(pUid))){
-        //     int idx = pairs_acc->uidToIdx(pUid);
-        //     std::cout<< "Tracked particle is now in rank " << rank << " --- " << pairs_acc->getPosition(idx)<< std::endl;
-        // }
-
-        pairs_sim->communicate(t);
-
-        // if(pairs_sim->rank() == 0){
-        //     pairs_sim->pairs_runtime->copyPropertyToHost(0, ReadOnly, (((pairs_sim->pobj->nlocal + pairs_sim->pobj->nghost) * 1) * sizeof(unsigned long long int))); // uid
-        //     for(int i=0; i<pairs_sim->pobj->nlocal; ++i){
-        //         std::cout << "rank ##0## local uid[" << i << "] = " << pairs_sim->pobj->uid[i] << std::endl;
-        //     }
-        // }
-        // if(pairs_sim->rank() == 2){
-        //     pairs_sim->pairs_runtime->copyPropertyToHost(0, ReadOnly, (((pairs_sim->pobj->nlocal + pairs_sim->pobj->nghost) * 1) * sizeof(unsigned long long int))); // uid
-        //     for(int i=0; i<pairs_sim->pobj->nlocal; ++i){
-        //         std::cout << "rank 2 local uid[" << i << "] = " << pairs_sim->pobj->uid[i] << std::endl;
-        //     }
-        //     for(int i=pairs_sim->pobj->nlocal; i<pairs_sim->pobj->nlocal + pairs_sim->pobj->nghost; ++i){
-        //         std::cout << "rank 2 ghost uid[" << i << "] = " << pairs_sim->pobj->uid[i] << std::endl;
-        //     }
-        // }
-
-        // std::cout << pairs_sim->rank() << " --------- nlocal = " << pairs_acc->nlocal() << " nghost = " << pairs_acc->nghost() << std::endl;
-
-        // if (pairs_sim->rank() == 2) {
-        //     for(int i=0; i<pairs_acc->nlocal(); ++i){
-        //         std::cout << pairs_sim->rank() << " Local  " << i << " -- uid= " << pairs_acc->getUid(i) << std::endl; 
-        //     }
-        //     for(int i=pairs_acc->nlocal(); i<pairs_acc->size(); ++i){
-        //         std::cout << pairs_sim->rank() << " Ghost -- " << i << " -- uid= " << pairs_acc->getUid(i) << std::endl; 
-        //     }
-        // }
-
-
-
-        // if (pIsLocalInMyRank(pUid)){
-        //     int idx = pairs_acc->uidToIdx(pUid);
-        //     pairs_acc->setHydrodynamicForce(idx, walberla::Vector3<double>(1, 1, 1));
-        //     pairs_acc->setHydrodynamicTorque(idx, walberla::Vector3<double>(2, 2, 2));
-        // }
-        // if (pIsLocalInMyRank(pUid2)){
-        //     int idx = pairs_acc->uidToIdx(pUid2);
-        //     pairs_acc->setHydrodynamicForce(idx, walberla::Vector3<double>(10, 10, 10));
-        //     pairs_acc->setHydrodynamicTorque(idx, walberla::Vector3<double>(20, 20, 20));
-        // }
-        
-        pairs_sim->update_cells(); 
-        pairs_sim->gravity(); 
-        pairs_sim->linear_spring_dashpot(); 
-        pairs_sim->euler(dt); 
-
-        // std::cout << "reset_volatiles" << std::endl;
-        pairs_sim->reset_volatiles(); 
-
-        pairs_sim->vtk_write("output/dem_sd_local", 0, pairs_acc->nlocal(), t, 200);
-        pairs_sim->vtk_write("output/dem_sd_ghost", pairs_acc->nlocal(), pairs_acc->size(), t, 200);
-
-        // if (pIsGhostInMyRank(pUid)){
-        //     std::cout << pairs_sim->rank() << " - ghost 1: " << pUid << std::endl;
-        //     int idx = pairs_acc->uidToIdxGhost(pUid);
-        //     pairs_acc->setHydrodynamicForce(idx, walberla::Vector3<double>(1, 1, 1));
-        //     pairs_acc->setHydrodynamicTorque(idx, walberla::Vector3<double>(3, 3, 3));
-        // }
-        // if (pIsGhostInMyRank(pUid2)){
-        //     std::cout << pairs_sim->rank() << " - ghost 2: " << pUid2 << std::endl;
-        //     int idx = pairs_acc->uidToIdxGhost(pUid2);
-        //     pairs_acc->setHydrodynamicForce(idx, walberla::Vector3<double>(2, 2, 2));
-        //     pairs_acc->setHydrodynamicTorque(idx, walberla::Vector3<double>(4, 4, 4));
-        // }
-
-        // std::cout << "reverse comm and reduce" << std::endl;
-        // pairs_sim->reverse_comm();
-
-        // if (pIsLocalInMyRank(pUid)){
-        //     int idx = pairs_acc->uidToIdx(pUid);
-        //     auto forceSum = pairs_acc->getHydrodynamicForce(idx);
-        //     auto torqueSum = pairs_acc->getHydrodynamicTorque(idx);
-        //     std::cout << pUid << " @@@@ reduced force = " << forceSum << std::endl;
-        //     std::cout << pUid << " @@@@ reduced torque = " << torqueSum << std::endl;
-        // }
-        // if (pIsLocalInMyRank(pUid2)){
-        //     int idx = pairs_acc->uidToIdx(pUid2);
-        //     auto forceSum = pairs_acc->getHydrodynamicForce(idx);
-        //     auto torqueSum = pairs_acc->getHydrodynamicTorque(idx);
-        //     std::cout << pUid2 << " @@@@ reduced force = " << forceSum << std::endl;
-        //     std::cout << pUid2 << " @@@@ reduced torque = " << torqueSum << std::endl;
-        // }
-    }
-
-    pairs_sim->end();
-
-    return 0;
-}
diff --git a/examples/main.cu b/examples/main.cu
deleted file mode 100644
index 94224e0c635a0d218c9c35d3c9479b3166d3bb61..0000000000000000000000000000000000000000
--- a/examples/main.cu
+++ /dev/null
@@ -1,143 +0,0 @@
-#include <iostream>
-#include <cuda_runtime.h>
-
-void checkCudaError(cudaError_t err, const char* func) {
-    if (err != cudaSuccess) {
-        fprintf(stderr, "CUDA error in %s: %s\n", func, cudaGetErrorString(err));
-        exit(err);
-    }
-}
-
-template< typename Type >
-class PairsVector3 {
-public:
-    PairsVector3() = default;
-
-    // If the constructor is called from device, v_ is automatically allocated on 
-    // device because it's a static array embeded in the object itself 
-    __host__ __device__ PairsVector3( Type x, Type y, Type z ) {
-        v_[0] = x;
-        v_[1] = y;
-        v_[2] = z;
-    }
-
-    __host__ __device__ Type& operator[]( int index ) { 
-        return v_[index]; 
-    }
-    __host__ __device__ const Type& operator[] ( int index ) const { 
-        return v_[index]; 
-    }
-
-private:
-    Type v_[3] = {Type(), Type(), Type()};
-};
-
-
-struct PairsObjects {
-    double *position_h;
-    double *position_d;
-};
-
-
-class PairsAccessor{
-    private:
-    PairsObjects *pobj_h;
-    PairsObjects *pobj_d;
-
-    public:
-    // PairsAccessor is only constructable from host, but its getters and setters can be called from both host and device
-    __host__ PairsAccessor(PairsObjects *pobj_): pobj_h(pobj_) {
-
-        // NOTE: Here we copy pobj_h to device, but we will use pobj_d to ONLY work with device pointers 
-        // So for example, we only try to access pobj_d->position_d, NOT pobj_d->position_h 
-        // NOTE: What's copied to device here is the PairsObject, which is a bunch of pointers (including valid device pointers like position_d)
-        // TODO (maybe): Split PairsObjects into two structs, one holding host pointers, the other holding device pointers.
-        cudaMalloc(&pobj_d, sizeof(PairsObjects));
-        cudaMemcpy(pobj_d, pobj_h, sizeof(PairsObjects), cudaMemcpyHostToDevice);
-    }
-
-    __host__ __device__ int getTest(){
-        #ifdef __CUDA_ARCH__ 
-            return 12;
-        #else
-            return 34;
-        #endif
-    }
-
-    // If this function is called from device, it returns a PairsVector3 that's constructed on device
-    // If this function is called from host, it returns a PairsVector3 that's constructed on host
-    __host__ __device__ PairsVector3<double> getPosition(const size_t i) const {
-        #ifdef __CUDA_ARCH__ 
-            // Assume postion_d points to uptodate data (we can't do copyPropertyToDevice from __device__ )
-            return PairsVector3<double>(pobj_d->position_d[i*3 + 0], pobj_d->position_d[i*3 + 1], pobj_d->position_d[i*3 + 2]);
-        #else
-            // Here we can do copyPropertyToHost (ReadOnly) if needed, to make sure position_h has uptodate data
-            return PairsVector3<double>(pobj_h->position_h[i*3 + 0], pobj_h->position_h[i*3 + 1], pobj_h->position_h[i*3 + 2]);
-        #endif 
-            
-    }
-
-    __host__ __device__ void setPosition(const size_t i, PairsVector3<double> const &vec) {
-        #ifdef __CUDA_ARCH__ 
-            // Assume vec is on device
-            pobj_d->position_d[i*3 + 0] = vec[0]; 
-            pobj_d->position_d[i*3 + 1] = vec[1]; 
-            pobj_d->position_d[i*3 + 2] = vec[2];
-            // Assume we don't need position_h data back on host (we can't do copyPropertyToHost from __device__)
-        #else  
-            // Assume vec is on host
-            pobj_h->position_h[i*3 + 0] = vec[0]; 
-            pobj_h->position_h[i*3 + 1] = vec[1]; 
-            pobj_h->position_h[i*3 + 2] = vec[2];
-            // Here we can do copyPropertyToDevice (WriteOnly) if needed (just so host and device data match)
-        #endif 
-    
-    }
-    
-};
-
-
-__global__ void mykernel(PairsAccessor ac){
-    printf("getTest from device = %d\n", ac.getTest());
-
-    PairsVector3<double> pos(7,8,9); 
-    ac.setPosition(0, pos);
-    printf("getPosition(0) from device = (%f, %f, %f) \n", ac.getPosition(0)[0], ac.getPosition(0)[1], ac.getPosition(0)[2]);
-}
-
-
-int main(int argc, char **argv) {
-
-    PairsObjects *pobj = new PairsObjects;
-
-    // User doesn't bother with the stuff below, they are done when PairsSimulation is initialized
-    //----------------------------------------------------------------------------------------------------
-    int numParticles = 1;
-    int numElements = numParticles * 3;
-    pobj->position_h = new double[numElements];
-    cudaMalloc(&pobj->position_d, numElements * sizeof(double));
-    cudaMemcpy(pobj->position_d, pobj->position_h, numElements * sizeof(double), cudaMemcpyHostToDevice);
-    //----------------------------------------------------------------------------------------------------
-
-    PairsAccessor ac(pobj);
-    printf("getTest from host = %d\n", ac.getTest());
-
-    PairsVector3<double> pos(1.2, 3.4, 5.6); 
-    ac.setPosition(0, pos);
-    printf("getPosition(0) from host = (%f, %f, %f) \n", ac.getPosition(0)[0], ac.getPosition(0)[1], ac.getPosition(0)[2]);
-    
-
-    mykernel<<<1,1>>>(ac);
-    checkCudaError(cudaDeviceSynchronize(), "mykernel");
-
-    // In mykernel, we modify the position of the particle ON DEVICE
-    // TODO: To reflect this modification on the host, we need a 'sync' function along with getters and setters (eg: 'syncPosition')
-    // to make sure both host and device data are uptodate and in sync with eachother.
-    // But unlike getters and setters, sync functions are only callable form host
-    // The 'sync' function copies the property to device if its host flag is set, or copies property to host if its device flag is set
-    // If setter is called from host, set host flag and unset device flag for that property
-    // If setter is called from device, set device flag and unset host flag for that property
-
-}
-
-
diff --git a/examples/modular/force_reduction.cpp b/examples/modular/force_reduction.cpp
index e5d2394d108a3068ec86ec8b8026d3ae95c8daca..e884b3d0acab91189ffd4465364d9e24319e8992 100644
--- a/examples/modular/force_reduction.cpp
+++ b/examples/modular/force_reduction.cpp
@@ -9,10 +9,11 @@ int main(int argc, char **argv) {
     auto ac = std::make_shared<PairsAccessor>(pairs_sim.get());
     
     // Set domain
-    pairs_sim->set_domain(argc, argv, 0, 0, 0, 0.1, 0.1, 0.1);
+    auto pairs_runtime = pairs_sim->getPairsRuntime();
+    pairs_runtime->initDomain(&argc, &argv, 0, 0, 0, 0.1, 0.1, 0.1);
 
     // Create bodies
-    pairs::id_t pUid = pairs_sim->create_sphere(0.0499,   0.0499,   0.07,   0.5, 0.5, 0 ,   1000, 0.0045, 0, 0);
+    pairs::id_t pUid = pairs::create_sphere(pairs_runtime, 0.0499,   0.0499,   0.07,   0.5, 0.5, 0 ,   1000, 0.0045, 0, 0);
     
     // setup_sim after creating all bodies
     pairs_sim->setup_sim();
@@ -93,8 +94,7 @@ int main(int argc, char **argv) {
         pairs_sim->update_cells(t); 
         pairs_sim->gravity(); 
         pairs_sim->spring_dashpot();
-        pairs_sim->euler(5e-5); 
-        pairs_sim->reset_volatiles();         
+        pairs_sim->euler(5e-5);        
         //-------------------------------------------------------------------------------------------
 
         std::cout << "---- reverse_comm and reduce ----" << std::endl;
diff --git a/examples/modular/sd_1.cpp b/examples/modular/sd_1.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e56cfc5943c7626d97d6b872524655b9dd0dca0b
--- /dev/null
+++ b/examples/modular/sd_1.cpp
@@ -0,0 +1,47 @@
+#include <iostream>
+#include <memory>
+
+#include "spring_dashpot.hpp"
+
+int main(int argc, char **argv) {
+
+    auto pairs_sim = std::make_shared<PairsSimulation>();
+    pairs_sim->initialize();
+
+    auto pairs_runtime = pairs_sim->getPairsRuntime();
+
+    pairs_runtime->initDomain(&argc, &argv, 0, 0, 0, 1, 1, 1); 
+
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  1, 0, 0,     0, 13);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 1, 0,     0, 13);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 0, 1,     0, 13);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  -1, 0, 0,    0, 13);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  0, -1, 0,    0, 13);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  0, 0, -1,    0, 13);
+    pairs::create_sphere(pairs_runtime, 0.6, 0.6, 0.7,      -2, -2, 0,  1000, 0.05, 0, 0);
+    pairs::create_sphere(pairs_runtime, 0.4, 0.4, 0.68,    2, 2, 0,    1000, 0.05, 0, 0);
+
+    pairs_sim->setup_sim(0.1, 0.1, 0.1, 0.1);
+    pairs_sim->update_mass_and_inertia();
+
+    int num_timesteps = 2000;
+    int vtk_freq = 20;
+    double dt = 1e-3;
+    
+    for (int t=0; t<num_timesteps; ++t){
+        if ((t%500==0) && pairs_sim->rank()==0) std::cout << "Timestep: " << t << std::endl;
+
+        pairs_sim->communicate(t);
+        
+        pairs_sim->update_cells(t); 
+
+        pairs_sim->gravity(); 
+        pairs_sim->spring_dashpot(); 
+        pairs_sim->euler(dt); 
+
+        pairs::vtk_write_data(pairs_runtime, "output/sd_1_local", 0, pairs_sim->nlocal(), t, vtk_freq);
+        pairs::vtk_write_data(pairs_runtime, "output/sd_1_ghost", pairs_sim->nlocal(), pairs_sim->size(), t, vtk_freq);
+    }
+
+    pairs_sim->end();
+}
diff --git a/examples/modular/sd_1_CPU_GPU.cpp b/examples/modular/sd_1_CPU_GPU.cpp
deleted file mode 100644
index 898d3e9826493af37b4c29da9a382322e219173e..0000000000000000000000000000000000000000
--- a/examples/modular/sd_1_CPU_GPU.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-#include <iostream>
-#include <memory>
-
-#include "spring_dashpot.hpp"
-
-int main(int argc, char **argv) {
-
-    auto pairs_sim = std::make_shared<PairsSimulation>();
-    pairs_sim->initialize();
-
-    pairs_sim->set_domain(argc, argv, 0, 0, 0, 1, 1, 1);
-
-    pairs_sim->create_halfspace(0,0,0,  1, 0, 0,     0, 13);
-    pairs_sim->create_halfspace(0,0,0,  0, 1, 0,     0, 13);
-    pairs_sim->create_halfspace(0,0,0,  0, 0, 1,     0, 13);
-    pairs_sim->create_halfspace(1,1,1,  -1, 0, 0,    0, 13);
-    pairs_sim->create_halfspace(1,1,1,  0, -1, 0,    0, 13);
-    pairs_sim->create_halfspace(1,1,1,  0, 0, -1,    0, 13);
-    pairs_sim->create_sphere(0.6, 0.6, 0.7,      -2, -2, 0,  1000, 0.05, 0, 0);
-    pairs_sim->create_sphere(0.4, 0.4, 0.68,    2, 2, 0,    1000, 0.05, 0, 0);
-
-    pairs_sim->setup_sim(0.1, 0.1, 0.1, 0.1);
-    pairs_sim->update_mass_and_inertia();
-
-    int num_timesteps = 2000;
-    int vtk_freq = 20;
-    double dt = 1e-3;
-    
-    for (int t=0; t<num_timesteps; ++t){
-        if ((t%500==0) && pairs_sim->rank()==0) std::cout << "Timestep: " << t << std::endl;
-
-        pairs_sim->communicate(t);
-        
-        pairs_sim->update_cells(t); 
-
-        pairs_sim->gravity(); 
-        pairs_sim->spring_dashpot(); 
-        pairs_sim->euler(dt); 
-
-        pairs_sim->reset_volatiles(); 
-
-        pairs_sim->vtk_write("output/dem_sd_local", 0, pairs_sim->nlocal(), t, vtk_freq);
-        pairs_sim->vtk_write("output/dem_sd_ghost", pairs_sim->nlocal(), pairs_sim->size(), t, vtk_freq);
-    }
-
-    pairs_sim->end();
-}
diff --git a/examples/modular/sd_2_CPU_GPU.cpp b/examples/modular/sd_2.cpp
similarity index 65%
rename from examples/modular/sd_2_CPU_GPU.cpp
rename to examples/modular/sd_2.cpp
index 73b86385cccd5063fd068b8f2df1cbb2c69bd04e..4bdb4c85a37efef0391ccecc9d2a1e1df6e3313d 100644
--- a/examples/modular/sd_2_CPU_GPU.cpp
+++ b/examples/modular/sd_2.cpp
@@ -30,16 +30,17 @@ int main(int argc, char **argv) {
 
     // Pass forest to P4IRS
     // -------------------------------------------------------------------------------
-    pairs_sim->use_domain(forest);
-
-    pairs_sim->create_halfspace(0,0,0,  1, 0, 0,     0, 13);
-    pairs_sim->create_halfspace(0,0,0,  0, 1, 0,     0, 13);
-    pairs_sim->create_halfspace(0,0,0,  0, 0, 1,     0, 13);
-    pairs_sim->create_halfspace(1,1,1,  -1, 0, 0,    0, 13);
-    pairs_sim->create_halfspace(1,1,1,  0, -1, 0,    0, 13);
-    pairs_sim->create_halfspace(1,1,1,  0, 0, -1,    0, 13);
-    pairs_sim->create_sphere(0.6, 0.6, 0.7,      -2, -2, 0,  1000, 0.05, 0, 0);
-    pairs_sim->create_sphere(0.4, 0.4, 0.68,    2, 2, 0,    1000, 0.05, 0, 0);
+    auto pairs_runtime = pairs_sim->getPairsRuntime();
+    pairs_runtime->useDomain(forest);
+
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  1, 0, 0,     0, 13);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 1, 0,     0, 13);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 0, 1,     0, 13);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  -1, 0, 0,    0, 13);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  0, -1, 0,    0, 13);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  0, 0, -1,    0, 13);
+    pairs::create_sphere(pairs_runtime, 0.6, 0.6, 0.7,      -2, -2, 0,  1000, 0.05, 0, 0);
+    pairs::create_sphere(pairs_runtime, 0.4, 0.4, 0.68,    2, 2, 0,    1000, 0.05, 0, 0);
 
     pairs_sim->setup_sim(0.1, 0.1, 0.1, 0.1);
     pairs_sim->update_mass_and_inertia();
@@ -59,10 +60,8 @@ int main(int argc, char **argv) {
         pairs_sim->spring_dashpot(); 
         pairs_sim->euler(dt); 
 
-        pairs_sim->reset_volatiles(); 
-
-        pairs_sim->vtk_write("output/dem_sd_local", 0, pairs_sim->nlocal(), t, vtk_freq);
-        pairs_sim->vtk_write("output/dem_sd_ghost", pairs_sim->nlocal(), pairs_sim->size(), t, vtk_freq);
+        pairs::vtk_write_data(pairs_runtime, "output/sd_2_local", 0, pairs_sim->nlocal(), t, vtk_freq);
+        pairs::vtk_write_data(pairs_runtime, "output/sd_2_ghost", pairs_sim->nlocal(), pairs_sim->size(), t, vtk_freq);
     }
 
     pairs_sim->end();
diff --git a/examples/modular/sd_3_CPU.cpp b/examples/modular/sd_3_CPU.cpp
index 7dc10829d64ea658b76ffc5e95f251a8d43550c5..a8e4a93bc5e816eb8eee900b7ef1d6769e7ec206 100644
--- a/examples/modular/sd_3_CPU.cpp
+++ b/examples/modular/sd_3_CPU.cpp
@@ -14,18 +14,19 @@ int main(int argc, char **argv) {
     pairs_sim->initialize();
 
     auto ac = std::make_shared<PairsAccessor>(pairs_sim.get());
+    
+    auto pairs_runtime = pairs_sim->getPairsRuntime();
+    pairs_runtime->initDomain(&argc, &argv, 0, 0, 0, 1, 1, 1);
 
-    pairs_sim->set_domain(argc, argv, 0, 0, 0, 1, 1, 1);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  1, 0, 0,     0, 13);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 1, 0,     0, 13);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 0, 1,     0, 13);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  -1, 0, 0,    0, 13);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  0, -1, 0,    0, 13);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  0, 0, -1,    0, 13);
 
-    pairs_sim->create_halfspace(0,0,0,  1, 0, 0,     0, 13);
-    pairs_sim->create_halfspace(0,0,0,  0, 1, 0,     0, 13);
-    pairs_sim->create_halfspace(0,0,0,  0, 0, 1,     0, 13);
-    pairs_sim->create_halfspace(1,1,1,  -1, 0, 0,    0, 13);
-    pairs_sim->create_halfspace(1,1,1,  0, -1, 0,    0, 13);
-    pairs_sim->create_halfspace(1,1,1,  0, 0, -1,    0, 13);
-
-    pairs::id_t pUid = pairs_sim->create_sphere(0.6, 0.6, 0.7,      0, 0, 0,  1000, 0.05, 0, 0);
-    pairs_sim->create_sphere(0.4, 0.4, 0.76,    2, 2, 0,    1000, 0.05, 0, 0);
+    pairs::id_t pUid = pairs::create_sphere(pairs_runtime ,0.6, 0.6, 0.7,      0, 0, 0,  1000, 0.05, 0, 0);
+    pairs::create_sphere(pairs_runtime, 0.4, 0.4, 0.76,    2, 2, 0,    1000, 0.05, 0, 0);
 
     MPI_Allreduce(MPI_IN_PLACE, &pUid, 1, MPI_LONG_LONG_INT, MPI_SUM, MPI_COMM_WORLD);
 
@@ -81,14 +82,13 @@ int main(int argc, char **argv) {
         // Euler
         //-------------------------------------------------------------------------------------------
         pairs_sim->euler(dt);
-        pairs_sim->reset_volatiles(); 
 
         // Communicate
         //-------------------------------------------------------------------------------------------
         pairs_sim->communicate(t);
 
-        pairs_sim->vtk_write("output/dem_sd_local", 0, ac->nlocal(), t, vtk_freq);
-        pairs_sim->vtk_write("output/dem_sd_ghost", ac->nlocal(), ac->size(), t, vtk_freq);
+        pairs::vtk_write_data(pairs_runtime, "output/sd_3_CPU_local", 0, ac->nlocal(), t, vtk_freq);
+        pairs::vtk_write_data(pairs_runtime, "output/sd_3_CPU_ghost", ac->nlocal(), ac->size(), t, vtk_freq);
     }
 
     pairs_sim->end();
diff --git a/examples/modular/sd_3_GPU.cu b/examples/modular/sd_3_GPU.cu
index 4abdeedcdc844a6f94975bca0aedf1586df6c5e8..b44af846643ed9cf0a730b07c5f60543560e29b8 100644
--- a/examples/modular/sd_3_GPU.cu
+++ b/examples/modular/sd_3_GPU.cu
@@ -46,17 +46,18 @@ int main(int argc, char **argv) {
     // Create PairsAccessor after PairsSimulation is initialized
     auto ac = std::make_shared<PairsAccessor>(pairs_sim.get());
 
-    pairs_sim->set_domain(argc, argv, 0, 0, 0, 1, 1, 1);
+    auto pairs_runtime = pairs_sim->getPairsRuntime();
+    pairs_runtime->initDomain(&argc, &argv, 0, 0, 0, 1, 1, 1);
 
-    pairs_sim->create_halfspace(0,0,0,  1, 0, 0,     0, 13);
-    pairs_sim->create_halfspace(0,0,0,  0, 1, 0,     0, 13);
-    pairs_sim->create_halfspace(0,0,0,  0, 0, 1,     0, 13);
-    pairs_sim->create_halfspace(1,1,1,  -1, 0, 0,    0, 13);
-    pairs_sim->create_halfspace(1,1,1,  0, -1, 0,    0, 13);
-    pairs_sim->create_halfspace(1,1,1,  0, 0, -1,    0, 13);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  1, 0, 0,     0, 13);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 1, 0,     0, 13);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 0, 1,     0, 13);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  -1, 0, 0,    0, 13);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  0, -1, 0,    0, 13);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  0, 0, -1,    0, 13);
 
-    pairs::id_t pUid = pairs_sim->create_sphere(0.6, 0.6, 0.7,      0, 0, 0,  1000, 0.05, 1, 0);
-    pairs_sim->create_sphere(0.4, 0.4, 0.76,    2, 2, 0,    1000, 0.05, 1, 0);
+    pairs::id_t pUid = pairs::create_sphere(pairs_runtime, 0.6, 0.6, 0.7,      0, 0, 0,  1000, 0.05, 1, 0);
+    pairs::create_sphere(pairs_runtime, 0.4, 0.4, 0.76,    2, 2, 0,    1000, 0.05, 1, 0);
 
     set_feature_properties(ac);
 
@@ -74,7 +75,6 @@ int main(int argc, char **argv) {
     int num_timesteps = 2000;
     int vtk_freq = 20;
     double dt = 1e-3;
-    pairs_sim->vtk_write_subdom("output/subdom", 0, 1);
 
     for (int t=0; t<num_timesteps; ++t){
         // Up-to-date uids might be on host or device. So sync uid in Host before accessing them from host
@@ -137,7 +137,6 @@ int main(int argc, char **argv) {
         // Euler
         //-------------------------------------------------------------------------------------------
         pairs_sim->euler(dt);
-        pairs_sim->reset_volatiles(); 
 
         // Communicate
         //-------------------------------------------------------------------------------------------
@@ -145,8 +144,8 @@ int main(int argc, char **argv) {
         // PairsAccessor requires an update when particles are communicated
         ac->update();
 
-        pairs_sim->vtk_write("output/dem_sd_local", 0, ac->nlocal(), t, vtk_freq);
-        pairs_sim->vtk_write("output/dem_sd_ghost", ac->nlocal(), ac->size(), t, vtk_freq);
+        pairs::vtk_write_data(pairs_runtime, "output/dem_sd_local", 0, ac->nlocal(), t, vtk_freq);
+        pairs::vtk_write_data(pairs_runtime, "output/dem_sd_ghost", ac->nlocal(), ac->size(), t, vtk_freq);
     }
 
     pairs_sim->end();
diff --git a/examples/modular/sd_4.cpp b/examples/modular/sd_4.cpp
index be064d969e400b61940d3980786d73ed21909a39..80ec40313e13692d1b41fcee01f6b5ce7d4ef91b 100644
--- a/examples/modular/sd_4.cpp
+++ b/examples/modular/sd_4.cpp
@@ -40,7 +40,7 @@ int main(int argc, char **argv) {
     auto pairs_runtime = pairs_sim->getPairsRuntime();
 
     pairs_runtime->initDomain(&argc, &argv, 
-                    0, 20, 0, 20, 0, 20,    // Domain bounds
+                    0, 0, 0, 20, 20, 20,    // Domain bounds
                     false, false, false,    // PBCs --------------> TODO: runtime pbc
                     true                    // Enable dynamic load balancing (does initial refinement on a <1,1,1> blockforest)
                 ); 
diff --git a/examples/modular/sd_sample_1_CPU_GPU.cpp b/examples/modular/sd_sample_1_CPU_GPU.cpp
deleted file mode 100644
index 960ffbaf7dec899ea3c054b0ea8d3191569a440e..0000000000000000000000000000000000000000
--- a/examples/modular/sd_sample_1_CPU_GPU.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-#include <iostream>
-#include <memory>
-
-#include "spring_dashpot.hpp"
-
-int main(int argc, char **argv) {
-
-    auto pairs_sim = std::make_shared<PairsSimulation>();
-    pairs_sim->initialize();
-
-    pairs_sim->set_domain(argc, argv, 0, 0, 0, 1, 1, 1);
-
-    pairs_sim->create_halfspace(0,0,0,  1, 0, 0,     0, 13);
-    pairs_sim->create_halfspace(0,0,0,  0, 1, 0,     0, 13);
-    pairs_sim->create_halfspace(0,0,0,  0, 0, 1,     0, 13);
-    pairs_sim->create_halfspace(1,1,1,  -1, 0, 0,    0, 13);
-    pairs_sim->create_halfspace(1,1,1,  0, -1, 0,    0, 13);
-    pairs_sim->create_halfspace(1,1,1,  0, 0, -1,    0, 13);
-    pairs_sim->create_sphere(0.6, 0.6, 0.7,      -2, -2, 0,  1000, 0.05, 0, 0);
-    pairs_sim->create_sphere(0.4, 0.4, 0.68,    2, 2, 0,    1000, 0.05, 0, 0);
-
-    pairs_sim->setup_sim();
-    pairs_sim->update_mass_and_inertia();
-
-    int num_timesteps = 2000;
-    int vtk_freq = 20;
-    double dt = 1e-3;
-
-    for (int t=0; t<num_timesteps; ++t){
-        if ((t%500==0) && pairs_sim->rank()==0) std::cout << "Timestep: " << t << std::endl;
-
-        pairs_sim->communicate(t);
-        
-        pairs_sim->update_cells(t); 
-
-        pairs_sim->gravity(); 
-        pairs_sim->spring_dashpot(); 
-        pairs_sim->euler(dt); 
-
-        pairs_sim->reset_volatiles(); 
-
-        pairs_sim->vtk_write("output/dem_sd_local", 0, pairs_sim->nlocal(), t, vtk_freq);
-        pairs_sim->vtk_write("output/dem_sd_ghost", pairs_sim->nlocal(), pairs_sim->size(), t, vtk_freq);
-    }
-
-    pairs_sim->end();
-}
diff --git a/examples/modular/sd_sample_2_CPU_GPU.cpp b/examples/modular/sd_sample_2_CPU_GPU.cpp
deleted file mode 100644
index e18ce39c3d01a5f7156d727d26a17f9080e7ca22..0000000000000000000000000000000000000000
--- a/examples/modular/sd_sample_2_CPU_GPU.cpp
+++ /dev/null
@@ -1,69 +0,0 @@
-#include <iostream>
-#include <memory>
-
-#include <blockforest/BlockForest.h>
-#include <blockforest/Initialization.h>
-
-#include "spring_dashpot.hpp"
-
-int main(int argc, char **argv) {
-
-    auto pairs_sim = std::make_shared<PairsSimulation>();
-    pairs_sim->initialize();
-
-    // Create forest
-    // -------------------------------------------------------------------------------
-    walberla::math::AABB domain(0, 0, 0, 1, 1, 1);
-    std::shared_ptr<walberla::mpi::MPIManager> mpiManager = walberla::mpi::MPIManager::instance();
-    mpiManager->initializeMPI(&argc, &argv);
-    mpiManager->useWorldComm();
-    auto procs = mpiManager->numProcesses();
-
-    walberla::Vector3<int> block_config;
-    if (procs==1)        block_config = walberla::Vector3<int>(1, 1, 1);
-    else if (procs==4)   block_config = walberla::Vector3<int>(2, 2, 1);
-    else { std::cout << "Error: Check block_config" << std::endl; exit(-1);} 
-
-    auto ref_level = 0;
-    std::shared_ptr<walberla::BlockForest> forest = walberla::blockforest::createBlockForest(
-            domain, block_config, walberla::Vector3<bool>(false, false, false), procs, ref_level);
-
-    // Pass forest to P4IRS
-    // -------------------------------------------------------------------------------
-    pairs_sim->use_domain(forest);
-
-    pairs_sim->create_halfspace(0,0,0,  1, 0, 0,     0, 13);
-    pairs_sim->create_halfspace(0,0,0,  0, 1, 0,     0, 13);
-    pairs_sim->create_halfspace(0,0,0,  0, 0, 1,     0, 13);
-    pairs_sim->create_halfspace(1,1,1,  -1, 0, 0,    0, 13);
-    pairs_sim->create_halfspace(1,1,1,  0, -1, 0,    0, 13);
-    pairs_sim->create_halfspace(1,1,1,  0, 0, -1,    0, 13);
-    pairs_sim->create_sphere(0.6, 0.6, 0.7,      -2, -2, 0,  1000, 0.05, 0, 0);
-    pairs_sim->create_sphere(0.4, 0.4, 0.68,    2, 2, 0,    1000, 0.05, 0, 0);
-
-    pairs_sim->setup_sim();
-    pairs_sim->update_mass_and_inertia();
-
-    int num_timesteps = 2000;
-    int vtk_freq = 20;
-    double dt = 1e-3;
-
-    for (int t=0; t<num_timesteps; ++t){
-        if ((t%500==0) && pairs_sim->rank()==0) std::cout << "Timestep: " << t << std::endl;
-
-        pairs_sim->communicate(t);
-        
-        pairs_sim->update_cells(t); 
-
-        pairs_sim->gravity(); 
-        pairs_sim->spring_dashpot(); 
-        pairs_sim->euler(dt); 
-
-        pairs_sim->reset_volatiles(); 
-
-        pairs_sim->vtk_write("output/dem_sd_local", 0, pairs_sim->nlocal(), t, vtk_freq);
-        pairs_sim->vtk_write("output/dem_sd_ghost", pairs_sim->nlocal(), pairs_sim->size(), t, vtk_freq);
-    }
-
-    pairs_sim->end();
-}
diff --git a/examples/modular/sd_sample_3_CPU.cpp b/examples/modular/sd_sample_3_CPU.cpp
deleted file mode 100644
index 002e4f907b4c77b403dc0b640dcfde93511514bf..0000000000000000000000000000000000000000
--- a/examples/modular/sd_sample_3_CPU.cpp
+++ /dev/null
@@ -1,95 +0,0 @@
-#include <iostream>
-#include <memory>
-
-#include "spring_dashpot.hpp"
-
-void change_gravitational_force(std::shared_ptr<PairsAccessor> &ac, int idx){
-    pairs::Vector3<double> upward_gravity(0.0, 0.0, 2 * ac->getMass(idx) * 9.81); 
-    ac->setForce(idx, ac->getForce(idx) + upward_gravity);
-}
-
-int main(int argc, char **argv) {
-
-    auto pairs_sim = std::make_shared<PairsSimulation>();
-    pairs_sim->initialize();
-
-    auto ac = std::make_shared<PairsAccessor>(pairs_sim.get());
-
-    pairs_sim->set_domain(argc, argv, 0, 0, 0, 1, 1, 1);
-
-    pairs_sim->create_halfspace(0,0,0,  1, 0, 0,     0, 13);
-    pairs_sim->create_halfspace(0,0,0,  0, 1, 0,     0, 13);
-    pairs_sim->create_halfspace(0,0,0,  0, 0, 1,     0, 13);
-    pairs_sim->create_halfspace(1,1,1,  -1, 0, 0,    0, 13);
-    pairs_sim->create_halfspace(1,1,1,  0, -1, 0,    0, 13);
-    pairs_sim->create_halfspace(1,1,1,  0, 0, -1,    0, 13);
-
-    pairs::id_t pUid = pairs_sim->create_sphere(0.6, 0.6, 0.7,      0, 0, 0,  1000, 0.05, 0, 0);
-    pairs_sim->create_sphere(0.4, 0.4, 0.76,    2, 2, 0,    1000, 0.05, 0, 0);
-
-    MPI_Allreduce(MPI_IN_PLACE, &pUid, 1, MPI_LONG_LONG_INT, MPI_SUM, MPI_COMM_WORLD);
-
-    auto pIsLocalInMyRank = [&](pairs::id_t uid){return ac->uidToIdxLocal(uid) != ac->getInvalidIdx();};
-
-    pairs_sim->setup_sim();
-    pairs_sim->update_mass_and_inertia();
-
-    pairs_sim->communicate(0);
-
-    int num_timesteps = 2000;
-    int vtk_freq = 20;
-    double dt = 1e-3;
-
-    for (int t=0; t<num_timesteps; ++t){
-
-        // Print position of particle pUid
-        //-------------------------------------------------------------------------------------------
-        if(pIsLocalInMyRank(pUid)){
-            std::cout << "Timestep (" << t << "): Particle " << pUid << " is in rank " << pairs_sim->rank() << std::endl;
-            int idx = ac->uidToIdxLocal(pUid);
-            std::cout << "Position = (" 
-                    << ac->getPosition(idx)[0] << ", "
-                    << ac->getPosition(idx)[1] << ", " 
-                    << ac->getPosition(idx)[2] << ")" << std::endl;
-
-        }
-
-        // Calculate forces
-        //-------------------------------------------------------------------------------------------
-        pairs_sim->update_cells(t);
-        pairs_sim->gravity(); 
-        pairs_sim->spring_dashpot(); 
-
-        // Change gravitational force on particle pUid
-        //-------------------------------------------------------------------------------------------
-        if(pIsLocalInMyRank(pUid)){
-            int idx = ac->uidToIdxLocal(pUid);
-
-            std::cout << "Force before changing = (" 
-                    << ac->getForce(idx)[0] << ", "
-                    << ac->getForce(idx)[1] << ", " 
-                    << ac->getForce(idx)[2] << ")" << std::endl;
-
-            change_gravitational_force(ac, idx);
-
-            std::cout << "Force after changing = (" 
-                    << ac->getForce(idx)[0] << ", "
-                    << ac->getForce(idx)[1] << ", " 
-                    << ac->getForce(idx)[2] << ")" << std::endl;
-        }
-
-        // Euler
-        //-------------------------------------------------------------------------------------------
-        pairs_sim->euler(dt);
-        pairs_sim->reset_volatiles(); 
-
-        // Communicate
-        //-------------------------------------------------------------------------------------------
-        pairs_sim->communicate(t);
-
-        pairs_sim->vtk_write("output/dem_sd_local", 0, ac->nlocal(), t, vtk_freq);
-        pairs_sim->vtk_write("output/dem_sd_ghost", ac->nlocal(), ac->size(), t, vtk_freq);
-    }
-
-    pairs_sim->end();
-}
\ No newline at end of file
diff --git a/examples/modular/sd_sample_3_GPU.cu b/examples/modular/sd_sample_3_GPU.cu
deleted file mode 100644
index e18bbf0705542f05bac1ab28be06e16fa2527f5b..0000000000000000000000000000000000000000
--- a/examples/modular/sd_sample_3_GPU.cu
+++ /dev/null
@@ -1,152 +0,0 @@
-#include <iostream>
-#include <memory>
-#include <cuda_runtime.h>
-
-#include "spring_dashpot.hpp"
-
-void checkCudaError(cudaError_t err, const char* func) {
-    if (err != cudaSuccess) {
-        fprintf(stderr, "CUDA error in %s: %s\n", func, cudaGetErrorString(err));
-        exit(err);
-    }
-}
-
-__global__ void print_position(PairsAccessor ac, int idx){
-    printf("Position [from device] = (%f, %f, %f) \n", ac.getPosition(idx)[0], ac.getPosition(idx)[1], ac.getPosition(idx)[2]);
-}
-
-__global__ void change_gravitational_force(PairsAccessor ac, int idx){
-    printf("Force [from device] before setting = (%f, %f, %f) \n", ac.getForce(idx)[0], ac.getForce(idx)[1], ac.getForce(idx)[2]);
-
-    pairs::Vector3<double> upward_gravity(0.0, 0.0, 2 * ac.getMass(idx) * 9.81); 
-    ac.setForce(idx, ac.getForce(idx) + upward_gravity);
-
-    printf("Force [from device] after setting = (%f, %f, %f) \n", ac.getForce(idx)[0], ac.getForce(idx)[1], ac.getForce(idx)[2]);
-}
-
-void set_feature_properties(std::shared_ptr<PairsAccessor> &ac){
-    ac->setTypeStiffness(0,0, 0);
-    ac->setTypeStiffness(0,1, 1000);
-    ac->setTypeStiffness(1,0, 1000);
-    ac->setTypeStiffness(1,1, 3000);
-    ac->syncTypeStiffness();
-
-    ac->setTypeDampingNorm(0,0, 0);
-    ac->setTypeDampingNorm(0,1, 20);
-    ac->setTypeDampingNorm(1,0, 20);
-    ac->setTypeDampingNorm(1,1, 10);
-    ac->syncTypeDampingNorm();
-}
-
-int main(int argc, char **argv) {
-
-    auto pairs_sim = std::make_shared<PairsSimulation>();
-    pairs_sim->initialize();
-
-    // Create PairsAccessor after PairsSimulation is initialized
-    auto ac = std::make_shared<PairsAccessor>(pairs_sim.get());
-
-    pairs_sim->set_domain(argc, argv, 0, 0, 0, 1, 1, 1);
-
-    pairs_sim->create_halfspace(0,0,0,  1, 0, 0,     0, 13);
-    pairs_sim->create_halfspace(0,0,0,  0, 1, 0,     0, 13);
-    pairs_sim->create_halfspace(0,0,0,  0, 0, 1,     0, 13);
-    pairs_sim->create_halfspace(1,1,1,  -1, 0, 0,    0, 13);
-    pairs_sim->create_halfspace(1,1,1,  0, -1, 0,    0, 13);
-    pairs_sim->create_halfspace(1,1,1,  0, 0, -1,    0, 13);
-
-    pairs::id_t pUid = pairs_sim->create_sphere(0.6, 0.6, 0.7,      0, 0, 0,  1000, 0.05, 1, 0);
-    pairs_sim->create_sphere(0.4, 0.4, 0.76,    2, 2, 0,    1000, 0.05, 1, 0);
-
-    set_feature_properties(ac);
-
-    MPI_Allreduce(MPI_IN_PLACE, &pUid, 1, MPI_LONG_LONG_INT, MPI_SUM, MPI_COMM_WORLD);
-
-    auto pIsLocalInMyRank = [&](pairs::id_t uid){return ac->uidToIdxLocal(uid) != ac->getInvalidIdx();};
-
-    pairs_sim->setup_sim();
-    pairs_sim->update_mass_and_inertia();
-
-    pairs_sim->communicate(0);
-    // PairsAccessor requires an update when particles are communicated 
-    ac->update();
-
-    int num_timesteps = 2000;
-    int vtk_freq = 20;
-    double dt = 1e-3;
-
-    for (int t=0; t<num_timesteps; ++t){
-        // Up-to-date uids might be on host or device. So sync uid in Host before accessing them from host
-        ac->syncUid(PairsAccessor::Host);
-
-        // Print position of particle pUid
-        //-------------------------------------------------------------------------------------------
-        if(pIsLocalInMyRank(pUid)){
-            std::cout << "Timestep (" << t << "): Particle " << pUid << " is in rank " << pairs_sim->rank() << std::endl;
-            int idx = ac->uidToIdxLocal(pUid);
-
-            // Up-to-date position might be on host or device. 
-            // Sync position on Host before reading it from host:
-            ac->syncPosition(PairsAccessor::Host); 
-            std::cout << "Position [from host] = (" 
-                    << ac->getPosition(idx)[0] << ", "
-                    << ac->getPosition(idx)[1] << ", " 
-                    << ac->getPosition(idx)[2] << ")" << std::endl;
-            
-            // Sync position on Device before reading it from device:
-            ac->syncPosition(PairsAccessor::Device); 
-            print_position<<<1,1>>>(*ac, idx);
-            checkCudaError(cudaDeviceSynchronize(), "print_position");
-            
-            // There's no need to sync position here to continue the simulation, since position wasn't modified.
-        }
-
-        // Calculate forces
-        //-------------------------------------------------------------------------------------------
-        pairs_sim->update_cells(t);
-        pairs_sim->gravity(); 
-        pairs_sim->spring_dashpot(); 
-
-        // Change gravitational force on particle pUid
-        //-------------------------------------------------------------------------------------------
-        ac->syncUid(PairsAccessor::Host);
-
-        if(pIsLocalInMyRank(pUid)){
-            std::cout << "Force Timestep (" << t << "): Particle " << pUid << " is in rank " << pairs_sim->rank() << std::endl;
-            int idx = ac->uidToIdxLocal(pUid);
-
-            // Up-to-date force and mass might be on host or device. 
-            // So sync them in Device before accessing them on device. (No data will be transfered if they are already on device)
-            ac->syncForce(PairsAccessor::Device);
-            ac->syncMass(PairsAccessor::Device);
-
-            // Modify force from device:
-            change_gravitational_force<<<1,1>>>(*ac, idx);
-            checkCudaError(cudaDeviceSynchronize(), "change_gravitational_force");
-
-            // Force on device was modified.
-            // So sync force before continuing the simulation.
-            ac->syncForce(PairsAccessor::Host);
-            std::cout << "Force [from host] after changing = (" 
-                    << ac->getForce(idx)[0] << ", "
-                    << ac->getForce(idx)[1] << ", " 
-                    << ac->getForce(idx)[2] << ")" << std::endl;
-        }
-
-        // Euler
-        //-------------------------------------------------------------------------------------------
-        pairs_sim->euler(dt);
-        pairs_sim->reset_volatiles(); 
-
-        // Communicate
-        //-------------------------------------------------------------------------------------------
-        pairs_sim->communicate(t);
-        // PairsAccessor requires an update when particles are communicated
-        ac->update();
-
-        pairs_sim->vtk_write("output/dem_sd_local", 0, ac->nlocal(), t, vtk_freq);
-        pairs_sim->vtk_write("output/dem_sd_ghost", ac->nlocal(), ac->size(), t, vtk_freq);
-    }
-
-    pairs_sim->end();
-}
\ No newline at end of file
diff --git a/examples/dem.py b/examples/whole-program-generation/linear_spring_dashpot.py
similarity index 91%
rename from examples/dem.py
rename to examples/whole-program-generation/linear_spring_dashpot.py
index 7f3d4d508dad66b372e4000b93a7476116697bae..9a13b2bd363d29face05936ade47bcf47f0f2735 100644
--- a/examples/dem.py
+++ b/examples/whole-program-generation/linear_spring_dashpot.py
@@ -97,9 +97,6 @@ if target != 'cpu' and target != 'gpu':
 
 # Config file parameters
 domainSize_SI = [0.8, 0.015, 0.2]
-#domainSize_SI = [0.4, 0.4, 0.2] # node base
-#domainSize_SI = [0.6, 0.6, 0.2] # node base
-#domainSize_SI = [0.8, 0.8, 0.2] # node base
 diameter_SI = 0.0029
 gravity_SI = 9.81
 densityFluid_SI = 1000
@@ -112,7 +109,6 @@ restitutionCoefficient = 0.1
 collisionTime_SI = 5e-4
 poissonsRatio = 0.22
 timeSteps = 10000
-#timeSteps = 1000
 visSpacing = 100
 denseBottomLayer = False
 bottomLayerOffsetFactor = 1.0
@@ -128,13 +124,14 @@ frictionStatic = 0.0
 frictionDynamic = frictionCoefficient
 
 psim = pairs.simulation(
-    "dem",
+    "linear_spring_dashpot",
     [pairs.sphere(), pairs.halfspace()],
     timesteps=timeSteps,
     double_prec=True,
     use_contact_history=True,
     particle_capacity=1000000,
-    neighbor_capacity=20)
+    neighbor_capacity=20,
+    generate_whole_program=True)
 
 if target == 'gpu':
     psim.target(pairs.target_gpu())
@@ -167,29 +164,16 @@ psim.dem_sc_grid(
     domainSize_SI[0], domainSize_SI[1], domainSize_SI[2], generationSpacing_SI,
     diameter_SI, minDiameter_SI, maxDiameter_SI, initialVelocity_SI, densityParticle_SI, ntypes)
 
-#psim.read_particle_data(
-#    "data/spheres.input",
-#    "data/spheres_4x4x2.input",
-#    "data/spheres_6x6x2.input",
-#    "data/spheres_8x8x2.input",
-#    ['uid', 'type', 'mass', 'radius', 'position', 'linear_velocity', 'flags'],
-#    pairs.sphere())
-
-#psim.read_particle_data(
-#    "data/spheres_bottom.input",
-#    ['type', 'mass', 'radius', 'position', 'linear_velocity', 'flags'],
-#    pairs.sphere())
 
 psim.read_particle_data(
     "data/planes.input",
-    ['uid', 'type', 'mass', 'position', 'normal', 'flags'],
+    ['type', 'mass', 'position', 'normal', 'flags'],
     pairs.halfspace())
 
 psim.setup(update_mass_and_inertia, {'densityParticle_SI': densityParticle_SI,
                                      'pi': math.pi,
                                      'infinity': math.inf })
 
-#psim.compute_half()
 psim.build_cell_lists(linkedCellWidth)
 psim.vtk_output(f"output/dem_{target}", frequency=visSpacing)
 
diff --git a/examples/md.py b/examples/whole-program-generation/md.py
similarity index 83%
rename from examples/md.py
rename to examples/whole-program-generation/md.py
index 32bc1da44d44e6f1e3027979090ce511aa052f02..22eb3c1f58d0fb7d8892232b04a9aa8b146cccf4 100644
--- a/examples/md.py
+++ b/examples/whole-program-generation/md.py
@@ -35,8 +35,12 @@ nz = 32
 rho = 0.8442
 temp = 1.44
 
-psim = pairs.simulation("md", [pairs.point_mass()], timesteps=200, double_prec=True, debug=True)
-#psim = pairs.simulation("md", [pairs.point_mass()], timesteps=200, double_prec=True, debug=True, generate_whole_program=True)
+psim = pairs.simulation("md", 
+                        [pairs.point_mass()],
+                        timesteps=200, 
+                        double_prec=True, 
+                        debug=True,
+                        generate_whole_program=True)
 
 if target == 'gpu':
     psim.target(pairs.target_gpu())
@@ -52,14 +56,12 @@ psim.add_feature_property('type', 'epsilon', pairs.real(), [epsilon for i in ran
 psim.add_feature_property('type', 'sigma6', pairs.real(), [sigma6 for i in range(ntypes * ntypes)])
 
 psim.copper_fcc_lattice(nx, ny, nz, rho, temp, ntypes)
-#psim.set_domain_partitioner(pairs.block_forest())
 psim.set_domain_partitioner(pairs.regular_domain_partitioner())
 psim.compute_thermo(100)
 
 psim.reneighbor_every(20)
-#psim.compute_half()
 psim.build_neighbor_lists(cutoff_radius + skin)
-psim.vtk_output(f"output/md_{target}")
+# psim.vtk_output(f"output/md_{target}")
 
 psim.compute(initial_integrate, symbols={'dt': dt}, pre_step=True, skip_first=True)
 psim.compute(lennard_jones, cutoff_radius)
diff --git a/examples/whole-program-generation/spring_dashpot.py b/examples/whole-program-generation/spring_dashpot.py
index c12ec34af1934466d5c05f5d26e90561bd995f36..6212b0aa7fd41d3d78ad4655f3d9de8f8afa1750 100644
--- a/examples/whole-program-generation/spring_dashpot.py
+++ b/examples/whole-program-generation/spring_dashpot.py
@@ -49,13 +49,6 @@ def gravity(i):
     force[i][2] -= mass[i] * gravity_SI
 
 
-# Number of 'type' features and their pair-wise properties
-ntypes = 2
-stiffness_SI = [100000 for i in range(ntypes * ntypes)]
-dampingNorm_SI = [300 for i in range(ntypes * ntypes)]
-dampingTan_SI = [0.0 for i in range(ntypes * ntypes)]
-friction_SI = [0.0 for i in range(ntypes * ntypes)]
-
 # Domain size
 domainSize_SI=[10, 10, 10]
 
@@ -81,6 +74,8 @@ visSpacing = 20
 
 timeSteps = 2000
 
+# file_name_without_extension is the simulation identifer (in this case "spring_dashpot")
+# TODO: Integration with cmake
 file_name = os.path.basename(__file__)
 file_name_without_extension = os.path.splitext(file_name)[0]
 
@@ -103,6 +98,7 @@ else:
     print(f"Invalid target, use {sys.argv[0]} <cpu/gpu>")
 
 
+# Register properties
 psim.add_position('position')
 psim.add_property('mass', pairs.real())
 psim.add_property('linear_velocity', pairs.vector())
@@ -115,19 +111,28 @@ psim.add_property('inv_inertia', pairs.matrix())
 psim.add_property('rotation_matrix', pairs.matrix())
 psim.add_property('rotation', pairs.quaternion())
 
+# Define the number of 'type' features and their pair-wise properties
+ntypes = 2
+stiffness_SI = [100000 for i in range(ntypes * ntypes)]
+dampingNorm_SI = [300 for i in range(ntypes * ntypes)]
+dampingTan_SI = [0.5 for i in range(ntypes * ntypes)]
+friction_SI = [20.0 for i in range(ntypes * ntypes)]
+
+# Register 'type' as a feature
 psim.add_feature('type', ntypes)
+
+# Register properties for the 'type' feature
 psim.add_feature_property('type', 'stiffness', pairs.real(), stiffness_SI)
 psim.add_feature_property('type', 'damping_norm', pairs.real(), dampingNorm_SI)
 psim.add_feature_property('type', 'damping_tan', pairs.real(), dampingTan_SI)
 psim.add_feature_property('type', 'friction', pairs.real(), friction_SI)
 
-psim.set_domain_partitioner(pairs.block_forest())
-psim.pbc([False, False, False])
-psim.build_cell_lists(linkedCellWidth)
-
+# Define the domain and optimization strategies
 psim.set_domain([0.0, 0.0, 0.0, domainSize_SI[0], domainSize_SI[1], domainSize_SI[2]])
-
+psim.pbc([False, False, False])
+psim.set_domain_partitioner(pairs.block_forest())
 psim.set_workload_balancer(pairs.morton(), regrid_min=100, regrid_max=1000, rebalance_frequency=200)
+psim.build_cell_lists(linkedCellWidth)
 
 # Generate particles
 psim.dem_sc_grid(domainSize_SI[0], domainSize_SI[1], domainSize_SI[2], 
@@ -144,13 +149,14 @@ psim.read_particle_data( "data/sd_planes.input", ['type', 'mass', 'position', 'n
 
 psim.vtk_output(f"output/dem_{target}", frequency=visSpacing)
 
-# The user-defined setup functions are executed only once before the timestep loop
+# The user-defined 'setup' functions are executed only once before the timestep loop
 psim.setup(update_mass_and_inertia, symbols={'infinity': math.inf })
 
-# The user-defined compute functions are added to the timestep loop in the order they are given to 'compute'
+# The user-defined 'compute' functions are added to the timestep loop in the order they are given to 'compute'
 psim.compute(spring_dashpot, linkedCellWidth)
 psim.compute(gravity, symbols={'gravity_SI': gravity_SI })
 psim.compute(euler, symbols={'dt': dt_SI})
 
+# Triger code generation
 psim.generate()
 
diff --git a/runtime/domain/ParticleDataHandling.hpp b/runtime/domain/ParticleDataHandling.hpp
index cf467d99548884612fd1e4181664e79ebde34a49..c54bae6404434af71f43c3d6a98d32d0e0537574 100644
--- a/runtime/domain/ParticleDataHandling.hpp
+++ b/runtime/domain/ParticleDataHandling.hpp
@@ -127,7 +127,7 @@ public:
     }
 
     void serialize(IBlock *const block, const BlockDataID& id, mpi::SendBuffer& buffer) override {
-        serializeImpl(static_cast<Block *const>(block), id, buffer, 0, false);
+        serializeImpl(static_cast<Block*>(block), id, buffer, 0, false);
     }
 
     internal::ParticleDeleter* deserialize(IBlock *const block) override {
diff --git a/runtime/interfaces/dem.hpp b/runtime/interfaces/dem.hpp
deleted file mode 100644
index b593c2707d053e0c6ac547b8ec23783f5d15e5f6..0000000000000000000000000000000000000000
--- a/runtime/interfaces/dem.hpp
+++ /dev/null
@@ -1,41 +0,0 @@
-#include "../pairs.hpp"
-
-namespace pairs_host_interface {
-
-int get_uid(int *uid, int i) { return uid[i]; }
-int get_shape(int *shape, int i) { return shape[i]; }
-int get_flags(int *flags, int i) { return flags[i]; }
-double get_position(double *position, int i, int j, int capacity) { return position[i * 3 + j]; }
-double get_mass(double *mass, int i) { return mass[i]; }
-double get_linear_velocity(double *linear_velocity, int i, int j, int capacity) { return linear_velocity[i * 3 + j]; }
-double get_angular_velocity(double *angular_velocity, int i, int j, int capacity) { return angular_velocity[i * 3 + j]; }
-double get_force(double *force, int i, int j, int capacity) { return force[i * 3 + j]; }
-double get_torque(double *torque, int i, int j, int capacity) { return torque[i * 3 + j]; }
-double get_radius(double *radius, int i) { return radius[i]; }
-double get_normal(double *normal, int i, int j, int capacity) { return normal[i * 3 + j]; }
-double get_inv_inertia(double *inv_inertia, int i, int j, int capacity) { return inv_inertia[i * 9 + j]; }
-double get_rotation_matrix(double *rotation_matrix, int i, int j, int capacity) { return rotation_matrix[i * 9 + j]; }
-double get_rotation_quat(double *rotation_quat, int i, int j, int capacity) { return rotation_quat[i * 4 + j]; }
-int get_type(int *type, int i) { return type[i]; }
-
-}
-
-namespace pairs_cuda_interface {
-
-__inline__ __device__ int get_uid(int *uid, int i) { return uid[i]; }
-__inline__ __device__ int get_shape(int *shape, int i) { return shape[i]; }
-__inline__ __device__ int get_flags(int *flags, int i) { return flags[i]; }
-__inline__ __device__ double get_position(double *position, int i, int j, int capacity) { return position[i * 3 + j]; }
-__inline__ __device__ double get_mass(double *mass, int i) { return mass[i]; }
-__inline__ __device__ double get_linear_velocity(double *linear_velocity, int i, int j, int capacity) { return linear_velocity[i * 3 + j]; }
-__inline__ __device__ double get_angular_velocity(double *angular_velocity, int i, int j, int capacity) { return angular_velocity[i * 3 + j]; }
-__inline__ __device__ double get_force(double *force, int i, int j, int capacity) { return force[i * 3 + j]; }
-__inline__ __device__ double get_torque(double *torque, int i, int j, int capacity) { return torque[i * 3 + j]; }
-__inline__ __device__ double get_radius(double *radius, int i) { return radius[i]; }
-__inline__ __device__ double get_normal(double *normal, int i, int j, int capacity) { return normal[i * 3 + j]; }
-__inline__ __device__ double get_inv_inertia(double *inv_inertia, int i, int j, int capacity) { return inv_inertia[i * 9 + j]; }
-__inline__ __device__ double get_rotation_matrix(double *rotation_matrix, int i, int j, int capacity) { return rotation_matrix[i * 9 + j]; }
-__inline__ __device__ double get_rotation_quat(double *rotation_quat, int i, int j, int capacity) { return rotation_quat[i * 4 + j]; }
-__inline__ __device__ int get_type(int *type, int i) { return type[i]; }
-
-}
diff --git a/runtime/interfaces/md.hpp b/runtime/interfaces/md.hpp
deleted file mode 100644
index c8bb6b29da34073cb58e82a8b54dc5d0f832f3ce..0000000000000000000000000000000000000000
--- a/runtime/interfaces/md.hpp
+++ /dev/null
@@ -1,27 +0,0 @@
-#include "../pairs.hpp"
-
-namespace pairs_host_interface {
-
-int get_uid(int *uid, int i) { return uid[i]; }
-int get_shape(int *shape, int i) { return shape[i]; }
-int get_flags(int *flags, int i) { return flags[i]; }
-double get_position(double *position, int i, int j, int capacity) { return position[i * 3 + j]; }
-double get_mass(double *mass, int i) { return mass[i]; }
-double get_linear_velocity(double *linear_velocity, int i, int j, int capacity) { return linear_velocity[i * 3 + j]; }
-double get_force(double *force, int i, int j, int capacity) { return force[i * 3 + j]; }
-int get_type(int *type, int i) { return type[i]; }
-
-}
-
-namespace pairs_cuda_interface {
-
-__inline__ __device__ int get_uid(int *uid, int i) { return uid[i]; }
-__inline__ __device__ int get_shape(int *shape, int i) { return shape[i]; }
-__inline__ __device__ int get_flags(int *flags, int i) { return flags[i]; }
-__inline__ __device__ double get_position(double *position, int i, int j, int capacity) { return position[i * 3 + j]; }
-__inline__ __device__ double get_mass(double *mass, int i) { return mass[i]; }
-__inline__ __device__ double get_linear_velocity(double *linear_velocity, int i, int j, int capacity) { return linear_velocity[i * 3 + j]; }
-__inline__ __device__ double get_force(double *force, int i, int j, int capacity) { return force[i * 3 + j]; }
-__inline__ __device__ int get_type(int *type, int i) { return type[i]; }
-
-}
diff --git a/runtime/pairs.cpp b/runtime/pairs.cpp
index 4897a35ebf1d8614b2b81b72075faaed6491e683..6efead8d8cb598c1fcd481beb35c12464c235830 100644
--- a/runtime/pairs.cpp
+++ b/runtime/pairs.cpp
@@ -14,7 +14,7 @@ namespace pairs {
 
 void PairsRuntime::initDomain(
     int *argc, char ***argv,
-    real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax, 
+    real_t xmin, real_t ymin, real_t zmin, real_t xmax, real_t ymax, real_t zmax, 
     bool pbcx, bool pbcy, bool pbcz, 
     bool balance_workload) {
 
diff --git a/runtime/pairs.hpp b/runtime/pairs.hpp
index 54abd6d4546453714638800844c6afc73d24db1f..e87dec06224d830f8f15fdc2e593278c00d100e6 100644
--- a/runtime/pairs.hpp
+++ b/runtime/pairs.hpp
@@ -320,7 +320,7 @@ public:
     // Communication
     void initDomain(
         int *argc, char ***argv,
-        real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax,
+        real_t xmin, real_t ymin, real_t zmin, real_t xmax, real_t ymax, real_t zmax, 
         bool pbcx = 0, bool pbcy = 0, bool pbcz = 0, bool balance_workload = 0);
 
     template<typename Domain_T>
diff --git a/runtime/vtk.cpp b/runtime/vtk.cpp
index db805a3b63a3f6c0ed52638edb69c45e1a2cd308..b6235725c9d7d10cb38e4033a1e456b6b50ab32a 100644
--- a/runtime/vtk.cpp
+++ b/runtime/vtk.cpp
@@ -23,12 +23,6 @@ void vtk_write_aabb(PairsRuntime *ps, const char *filename, int num,
     filename_oss <<".vtk";
     std::ofstream out_file(filename_oss.str());
 
-    double aabb[3][3];
-    for (int d=0; d<3; ++d){
-        aabb[d][0] = ps->getDomainPartitioner()->getSubdomMin(d);
-        aabb[d][1] = ps->getDomainPartitioner()->getSubdomMax(d);
-    }
-
     out_file << std::fixed << std::setprecision(prec);
     if(out_file.is_open()) {
         out_file << "# vtk DataFile Version 2.0\n";
diff --git a/src/pairs/code_gen/accessor.py b/src/pairs/code_gen/accessor.py
index eb2a73dde25ec1e47b2a2ded20b3068de98ade09..34421cb64cde53f7a75a9079b39ee90c25b7b235 100644
--- a/src/pairs/code_gen/accessor.py
+++ b/src/pairs/code_gen/accessor.py
@@ -26,9 +26,9 @@ class PairsAcessor:
         if self.target.is_gpu():
             self.host_device_attr = "__host__ __device__ "
             self.host_attr = "__host__ "
-        self.print("#include \"runtime/math/Vector3.hpp\"")
-        # self.print("#include \"runtime/math/Quaternion.hpp\"")
-        # self.print("#include \"runtime/math/Matrix3.hpp\"")
+        self.print("#include \"math/Vector3.hpp\"")
+        # self.print("#include \"math/Quaternion.hpp\"")
+        # self.print("#include \"math/Matrix3.hpp\"")
         self.print("")
 
         self.print("class PairsAccessor {")
diff --git a/src/pairs/code_gen/cgen.py b/src/pairs/code_gen/cgen.py
index d4e09851ca747ec81bc0817da7580ea8f081bd21..76e22283330f531eed2f449a30eee798bf62c00e 100644
--- a/src/pairs/code_gen/cgen.py
+++ b/src/pairs/code_gen/cgen.py
@@ -57,6 +57,15 @@ class CGen:
 
     def real_type(self):
         return Types.c_keyword(self.sim, Types.Real)
+    
+    # def generate_cmake_config_file(self):
+    #     self.print = Printer("pairs_cmake_params.txt")
+    #     self.print.start()
+    #     self.print(f"PAIRS_TARGET={self.ref}")
+    #     self.print(f"GENERATE_WHOLE_PROGRAM={'ON' if self.sim._generate_whole_program else 'OFF'}")
+    #     self.print(f"USE_WALBERLA={'ON' if self.sim._partitioner == DomainPartitioners.BlockForest else 'OFF'}")
+    #     # self.print(f"COMPILE_CUDA={'ON' if self.target.is_gpu() else 'OFF'}")
+    #     self.print.end()
 
     def generate_object_reference(self, obj, device=False, index=None):
         if device and (not self.target.is_gpu() or not obj.device_flag):
@@ -99,8 +108,8 @@ class CGen:
         return f"&({ref})"
 
     def generate_interfaces(self):
-        #self.print = Printer(f"runtime/interfaces/{self.ref}.hpp")
-        self.print = Printer("runtime/interfaces/last_generated.hpp")
+        #self.print = Printer(f"interfaces/{self.ref}.hpp")
+        self.print = Printer("internal_interfaces/last_generated.hpp")
         self.print.start()
         self.print("#pragma once")
         self.generate_interface_namespace('pairs_host_interface')
@@ -141,14 +150,9 @@ class CGen:
         self.print("}")
 
     def generate_preamble(self):
-        self.print(f"#define APPLICATION_REFERENCE \"{self.ref}\"")
-
-        # TODO: Either do this, or add USE_WALBERLA to you compile definitions
-        if self.sim.partitioner()==DomainPartitioners.BlockForest:
-            self.print("#define USE_WALBERLA")
+        # self.print(f"#define APPLICATION_REFERENCE \"{self.ref}\"")
 
         if self.target.is_gpu():
-            self.print("#define PAIRS_TARGET_CUDA")
             self.print("#include <math_constants.h>")
              
         if self.target.is_openmp():
@@ -161,16 +165,16 @@ class CGen:
         self.print("#include <stdio.h>")
         self.print("#include <stdlib.h>")
         self.print("//---")
-        self.print("#include \"runtime/likwid-marker.h\"")
-        self.print("#include \"runtime/copper_fcc_lattice.hpp\"")
-        self.print("#include \"runtime/create_body.hpp\"")
-        self.print("#include \"runtime/dem_sc_grid.hpp\"")
-        self.print("#include \"runtime/pairs.hpp\"")
-        self.print("#include \"runtime/read_from_file.hpp\"")
-        self.print("#include \"runtime/stats.hpp\"")
-        self.print("#include \"runtime/timing.hpp\"")
-        self.print("#include \"runtime/thermo.hpp\"")
-        self.print("#include \"runtime/vtk.hpp\"")
+        self.print("#include \"likwid-marker.h\"")
+        self.print("#include \"copper_fcc_lattice.hpp\"")
+        self.print("#include \"create_body.hpp\"")
+        self.print("#include \"dem_sc_grid.hpp\"")
+        self.print("#include \"pairs.hpp\"")
+        self.print("#include \"read_from_file.hpp\"")
+        self.print("#include \"stats.hpp\"")
+        self.print("#include \"timing.hpp\"")
+        self.print("#include \"thermo.hpp\"")
+        self.print("#include \"vtk.hpp\"")
         self.print("")
         self.print("using namespace pairs;")
         self.print("")
diff --git a/src/pairs/sim/domain_partitioning.py b/src/pairs/sim/domain_partitioning.py
index ef0502dd2c0ccd7e6588e89f041a42e89040d3d2..485e1a6cbb8c8fe54dabdadd7efd0e7368c3d23b 100644
--- a/src/pairs/sim/domain_partitioning.py
+++ b/src/pairs/sim/domain_partitioning.py
@@ -46,8 +46,8 @@ class DimensionRanges:
        return sum([array[i] for i in self.step_indexes(step)])
 
     def initialize(self):
-        grid_array = [(self.sim.grid.min(d), self.sim.grid.max(d)) for d in range(self.sim.ndims())]
-        Call_Void(self.sim, "pairs_runtime->initDomain", [param for delim in grid_array for param in delim])
+        grid_array = [self.sim.grid.min(d) for d in range(self.sim.ndims())] + [self.sim.grid.max(d) for d in range(self.sim.ndims())]
+        Call_Void(self.sim, "pairs_runtime->initDomain", grid_array)
 
     def update(self):
         Call_Void(self.sim, "pairs_runtime->updateDomain", [])
@@ -141,11 +141,10 @@ class BlockForest:
         return self.reduce_step
 
     def initialize(self):
-        grid_array = [(self.sim.grid.min(d), self.sim.grid.max(d)) for d in range(self.sim.ndims())]
+        grid_array = [self.sim.grid.min(d) for d in range(self.sim.ndims())] + [self.sim.grid.max(d) for d in range(self.sim.ndims())]
 
         Call_Void(self.sim, "pairs_runtime->initDomain", 
-                  [param for delim in grid_array for param in delim] + 
-                  self.sim._pbc + ([True] if self.load_balancer is not None else []))
+                  grid_array + self.sim._pbc + ([True] if self.load_balancer is not None else []))
         
         if self.load_balancer is not None:
             PrintCode(self.sim, "pairs_runtime->getDomainPartitioner()->initWorkloadBalancer"