Improve CMake, update README and examples, remove outdated files

0a0a2dd6 · Behzad Safaei · 842850ea · 0a0a2dd6 · 842850ea · 0a0a2dd6
Commit 0a0a2dd6 authored 5 months ago by Behzad Safaei
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
 cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
 project(pairs CXX)
-
 # Set default build type if none is specified
 if(NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build (Debug, Release, etc.)" FORCE)
@@ -9,30 +8,27 @@ endif()
 set(CMAKE_CXX_FLAGS_RELEASE "-O3")
 set(CMAKE_CXX_FLAGS_DEBUG "-g -O0 -DDEBUG")

-option(USE_WALBERLA "Enable waLBerla support for using BlockForest partitioning" ON)
-option(USE_MPI "USE_MPI" ON)
-option(COMPILE_CUDA "COMPILE_CUDA" ON)
-option(ENABLE_GPU_DIRECT "ENABLE_GPU_DIRECT" ON)
-option(GENERATE_WHOLE_PROGRAM "Generate the whole program (i.e. including the 'main' function). No additional source files are needed." OFF)
-option(BUILD_APP "Build a stand-alone app which uses the P4IRS modular interface. Provide your source files with -DUSER_SOURCE_FILES" OFF)
+option(USE_MPI                      "USE_MPI" ON)
+option(COMPILE_CUDA                 "COMPILE_CUDA" OFF)
+option(GENERATE_WHOLE_PROGRAM       "Generate the whole program (i.e. including the 'main' function). No additional source files are needed." OFF)
+option(BUILD_APP                    "Build a stand-alone app which uses the P4IRS modular interface. Provide your source files with -DUSER_SOURCE_FILES" OFF)

 if(GENERATE_WHOLE_PROGRAM AND BUILD_APP)
    message(FATAL_ERROR "You must choose either GENERATE_WHOLE_PROGRAM or BUILD_APP or neither.\n
-        Choose neither if you only want to use the generated libraries in your project (in a seperate build system).")
+        Choose neither if you only want to use the P4IRS library in your project (in a seperate build system).")
 endif()

-set(PYTHON_SCRIPT ${PYTHON_SCRIPT} CACHE PATH "The python script triggering code generation")
-if(NOT EXISTS ${PYTHON_SCRIPT})
-    message(FATAL_ERROR "PYTHON_SCRIPT doesn't exist! Specify it with -DPYTHON_SCRIPT=/path/to/script.py")
+set(INPUT_SCRIPT ${INPUT_SCRIPT} CACHE PATH "The input python script triggering code generation")
+if(NOT EXISTS ${INPUT_SCRIPT})
+    message(FATAL_ERROR "INPUT_SCRIPT doesn't exist! Specify it with -DINPUT_SCRIPT=/path/to/script.py")
 endif()
-get_filename_component(PYTHON_SCRIPT_NAME ${PYTHON_SCRIPT} NAME_WE)
+get_filename_component(INPUT_SCRIPT_NAME ${INPUT_SCRIPT} NAME_WE)

 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")

 #================================================================================
 # Setup directories =============================================================
 #================================================================================
-file(COPY runtime DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
 file(COPY data DESTINATION ${CMAKE_CURRENT_BINARY_DIR})

 set(OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/output)
@@ -44,8 +40,10 @@ file(MAKE_DIRECTORY ${OUTPUT_DIR})
 #================================================================================
 # Generated header (internally used by runtime files) ===========================
 #================================================================================
-set(GEN_INTERFACE_HEADER ${CMAKE_CURRENT_BINARY_DIR}/runtime/interfaces/last_generated.hpp)
-set(GEN_INTERFACE_DIR ${CMAKE_CURRENT_BINARY_DIR}/runtime/interfaces)
+# TODO: Unify all interfaces
+set(GEN_INTERFACE_DIR ${CMAKE_CURRENT_BINARY_DIR}/internal_interfaces)
+set(GEN_INTERFACE_HEADER ${CMAKE_CURRENT_BINARY_DIR}/last_generated.hpp)
+file(MAKE_DIRECTORY ${GEN_INTERFACE_DIR})

 #================================================================================
 # RUNTIME_COMMON_FILES ==========================================================
@@ -80,6 +78,10 @@ else()
    list(APPEND PAIRS_LINK_LIBRARIES ${PAIRS_TARGET})
 endif()

+# Include P4IRS 'runtime' dir
+target_include_directories(${PAIRS_TARGET} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/runtime)
+list(APPEND PAIRS_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/runtime)
+
 set_target_properties(${PAIRS_TARGET} PROPERTIES 
    CXX_STANDARD_REQUIRED ON
    CXX_STANDARD 17
@@ -106,7 +108,15 @@ endif()
 #================================================================================
 # waLBerla ======================================================================
 #================================================================================
-if(USE_WALBERLA)
+set(WALBERLA_DIR ${WALBERLA_DIR} CACHE PATH "Path to waLBerla source directory (required only when using BlockForest partitioning).")
+
+if(WALBERLA_DIR)
+    if(EXISTS "${WALBERLA_DIR}")
+        target_compile_definitions(${PAIRS_TARGET} PUBLIC USE_WALBERLA)
+    else()
+        message(FATAL_ERROR "Invalid WALBERLA_DIR: '${WALBERLA_DIR}' does not exist.")
+    endif()
+
    set(RUNTIME_WALBERLA_FILES
        runtime/domain/block_forest.cpp
    )
@@ -119,7 +129,6 @@ if(USE_WALBERLA)
    endif()

    target_sources(${PAIRS_TARGET} PRIVATE ${RUNTIME_WALBERLA_FILES})
-    target_compile_definitions(${PAIRS_TARGET} PUBLIC USE_WALBERLA)

    ## Linking walberla modules
    set(PAIRS_WALBERLA_DEPENDENCIES blockforest core pe)
@@ -157,7 +166,7 @@ execute_process(
 if(COMPILE_CUDA)
    find_package(CUDA REQUIRED)
    enable_language(CUDA)
-    set(GEN_SOURCES "${CMAKE_CURRENT_BINARY_DIR}/${PYTHON_SCRIPT_NAME}.cu")
+    set(GEN_SOURCES "${CMAKE_CURRENT_BINARY_DIR}/${INPUT_SCRIPT_NAME}.cu")
    set(CUDA_ARCH ${CUDA_ARCH} CACHE STRING "CUDA_ARCH environment variable must be set.")
    set(TARGET_ARG "gpu")

@@ -174,6 +183,13 @@ if(COMPILE_CUDA)
        set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3")
    endif()

+    if(NOT DEFINED ENABLE_GPU_DIRECT)
+        set(ENABLE_GPU_DIRECT ON "Enable GPU Direct (default: ON when COMPILE_CUDA is ON)" FORCE)
+    else()
+         # User choice is respected here if they opt to COMPILE_CUDA without GPU Direct
+        set(ENABLE_GPU_DIRECT ${ENABLE_GPU_DIRECT} "Enable GPU Direct (user-defined)" FORCE)
+    endif()
+
    if(ENABLE_GPU_DIRECT)
        target_compile_definitions(${PAIRS_TARGET} PRIVATE ENABLE_CUDA_AWARE_MPI)
    endif()
@@ -192,7 +208,11 @@ if(COMPILE_CUDA)
    target_link_libraries(${PAIRS_TARGET} PUBLIC ${CUDA_LIBRARIES})
    list(APPEND PAIRS_LINK_LIBRARIES ${CUDA_LIBRARIES})
 else()
-    set(GEN_SOURCES "${CMAKE_CURRENT_BINARY_DIR}/${PYTHON_SCRIPT_NAME}.cpp")
+    if(ENABLE_GPU_DIRECT)
+        message(FATAL_ERROR "Invalid combination: ENABLE_GPU_DIRECT requires COMPILE_CUDA to be ON")
+    endif()
+
+    set(GEN_SOURCES "${CMAKE_CURRENT_BINARY_DIR}/${INPUT_SCRIPT_NAME}.cpp")
    set(TARGET_ARG "cpu")
    target_sources(${PAIRS_TARGET} PRIVATE runtime/devices/dummy.cpp)
 endif()
@@ -202,9 +222,9 @@ endif()
 #================================================================================
 add_custom_command(
    OUTPUT ${GEN_SOURCES} ${GEN_INTERFACE_HEADER}
-    COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_SCRIPT} ${TARGET_ARG}
+    COMMAND ${PYTHON_EXECUTABLE} ${INPUT_SCRIPT} ${TARGET_ARG}
    COMMENT "Generating code with P4IRS"
-    DEPENDS ${PYTHON_SCRIPT})
+    DEPENDS ${INPUT_SCRIPT})

 add_custom_target(generated_code DEPENDS ${GEN_SOURCES} ${GEN_INTERFACE_HEADER})
 add_dependencies(${PAIRS_TARGET} generated_code)
@@ -230,7 +250,11 @@ endif()
 # LIKWID ========================================================================
 #================================================================================
 if(LIKWID_DIR)
-    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLIKWID_PERFMON -pthread -L${LIKWID_DIR}/lib -llikwid ")
+    target_compile_options(${PAIRS_TARGET} PRIVATE -DLIKWID_PERFMON -pthread)
+
+    target_link_libraries(${PAIRS_TARGET} PRIVATE ${LIKWID_DIR}/lib/liblikwid.a)
+    list(APPEND PAIRS_LINK_LIBRARIES ${LIKWID_DIR}/lib/liblikwid.a)
+
    include_directories(${LIKWID_DIR}/include)
    list(APPEND PAIRS_INCLUDE_DIRS "${LIKWID_DIR}/include")
 endif()

--- a/Makefile
+++ b/Makefile
-.PHONY: all build clean
-
-# General settings
-TESTCASE=md
-PYCMD=python3
-
-# C/C++ compiler settings
-CC=mpic++
-CFLAGS=-O3 -mavx2 -mfma -fopenmp ${MPI_FLAGS} ${LIKWID_FLAGS}
-#CFLAGS=-Ofast -xHost -qopt-zmm-usage=high ${MPI_FLAGS} ${LIKWID_FLAGS}
-#CFLAGS=-Ofast -xCORE-AVX512 -qopt-zmm-usage=high ${MPI_FLAGS} ${LIKWID_FLAGS}
-DEBUG_FLAGS=
-#DEBUG_FLAGS=-DDEBUG
-
-# CUDA settings
-NVCC=nvcc
-#NVCC_FLAGS=-O3 -mavx2 -mfma
-NVCC_FLAGS=-O3 -arch=sm_80 -mavx2 -mfma -ffast-math -funroll-loops --forward-unknown-to-host-compiler
-#NVCC_FLAGS=-O3 -arch=sm_80 -march=native -ffast-math -funroll-loops --forward-unknown-to-host-compiler
-NVCC_PATH:="$(shell which ${NVCC})"
-CUDA_FLAGS=-DENABLE_CUDA_AWARE_MPI
-CUDART_FLAGS=-lcudart -L /apps/SPACK/0.19.1/opt/linux-almalinux8-zen/gcc-8.5.0/nvhpc-23.7-bzxcokzjvx4stynglo4u2ffpljajzlam/Linux_x86_64/23.7/cuda/12.2/targets/x86_64-linux/lib
-
-# MPI settings
-MPI_PATH=/apps/SPACK/0.19.1/opt/linux-almalinux8-zen/intel-2021.10.0/openmpi-4.1.6-ijsnjhq77rjc256wlrp52m37rsq6miff
-MPI_FLAGS=-I${MPI_PATH}/include
-
-# Likwid settings
-LIKWID_INC ?= -I/usr/local/include
-LIKWID_DEFINES ?= -DLIKWID_PERFMON
-LIKWID_LIB ?= -L/usr/local/lib
-LIKWID_FLAGS = -llikwid ${LIKWID_INC} ${LIKWID_DEFINES} ${LIKWID_LIB}
-
-# Other
-CPU_OBJ_PATH=obj_cpu
-CPU_SRC="$(TESTCASE).cpp"
-CPU_BIN="$(TESTCASE)_cpu"
-GPU_OBJ_PATH=obj_gpu
-GPU_SRC="$(TESTCASE).cu"
-GPU_BIN="$(TESTCASE)_gpu"
-
-all: clean build $(CPU_BIN) $(GPU_BIN)
-	@echo "Everything was done!"
-
-build:
-	@echo "Building pairs package..."
-	$(PYCMD) setup.py build && $(PYCMD) setup.py install --user
-
-$(CPU_SRC):
-	@echo "Generating and compiling $(TESTCASE) example for CPU..."
-	@mkdir -p $(CPU_OBJ_PATH)
-	$(PYCMD) examples/$(TESTCASE).py cpu
-
-$(GPU_SRC):
-	@echo "Generating and compiling $(TESTCASE) example for GPU..."
-	@mkdir -p $(GPU_OBJ_PATH)
-	$(PYCMD) examples/$(TESTCASE).py gpu
-
-$(CPU_OBJ_PATH)/pairs.o: runtime/pairs.cpp
-	$(CC) -c -o $@ $< $(DEBUG_FLAGS) $(CFLAGS)
-
-$(CPU_OBJ_PATH)/regular_6d_stencil.o: runtime/domain/regular_6d_stencil.cpp
-	$(CC) -c -o $@ $< $(DEBUG_FLAGS) $(CFLAGS)
-
-$(CPU_OBJ_PATH)/dummy.o: runtime/devices/dummy.cpp
-	$(CC) -c -o $@ $< $(DEBUG_FLAGS) $(CFLAGS)
-
-$(GPU_OBJ_PATH)/pairs.o: runtime/pairs.cpp
-	$(CC) -c -o $@ $< $(DEBUG_FLAGS) $(MPI_FLAGS) $(CFLAGS) $(CUDA_FLAGS)
-
-$(GPU_OBJ_PATH)/regular_6d_stencil.o: runtime/domain/regular_6d_stencil.cpp
-	$(CC) -c -o $@ $< $(DEBUG_FLAGS) $(MPI_FLAGS) $(CFLAGS) $(CUDA_FLAGS)
-
-$(GPU_OBJ_PATH)/cuda_runtime.o: runtime/devices/cuda.cu
-	$(NVCC) $(NVCC_FLAGS) -c -o $@ $< $(DEBUG_FLAGS) $(MPI_FLAGS) $(CUDA_FLAGS)
-
-# Targets
-$(CPU_BIN): $(CPU_SRC) $(CPU_OBJ_PATH)/pairs.o $(CPU_OBJ_PATH)/regular_6d_stencil.o $(CPU_OBJ_PATH)/dummy.o
-	$(CC) $(CFLAGS) -o $(CPU_BIN) $(CPU_SRC) $(CPU_OBJ_PATH)/pairs.o $(CPU_OBJ_PATH)/regular_6d_stencil.o $(CPU_OBJ_PATH)/dummy.o $(DEBUG_FLAGS)
-
-$(GPU_BIN): $(GPU_SRC) $(GPU_OBJ_PATH)/pairs.o $(GPU_OBJ_PATH)/regular_6d_stencil.o $(GPU_OBJ_PATH)/cuda_runtime.o
-	$(NVCC) $(NVCC_FLAGS) -c -o $(GPU_OBJ_PATH)/$(GPU_BIN).o $(GPU_SRC) $(DEBUG_FLAGS) $(MPI_FLAGS) $(CUDA_FLAGS)
-	$(CC) -o $(GPU_BIN) $(GPU_OBJ_PATH)/$(GPU_BIN).o $(GPU_OBJ_PATH)/cuda_runtime.o $(GPU_OBJ_PATH)/pairs.o $(GPU_OBJ_PATH)/regular_6d_stencil.o $(CUDART_FLAGS) $(CUDA_FLAGS) $(CFLAGS)
-
-clean:
-	@echo "Cleaning..."
-	rm -rf build $(CPU_BIN) $(GPU_BIN) $(CPU_SRC) $(GPU_SRC) dist pairs.egg-info functions functions.pdf $(CPU_OBJ_PATH) $(GPU_OBJ_PATH)
--- a/README.md
+++ b/README.md
@@ -3,10 +3,6 @@
 P4IRS is an open-source, stand-alone compiler and domain-specific language for particle simulations which aims at generating optimized code for different target hardwares.
 It is released as a Python package and allows users to define kernels, integrators and other particle routines in a high-level and straightforward fashion without the need to implement any backend code.

-## Build instructions
-
-There is a Makefile which contains configurable environment variables such as `TESTCASE` compiler parameters evaluate P4IRS performance on different scenarios.
-`TESTCASE` refers to any of the files within the `examples` directory, such as `md` and `dem`.

 ## Usage

@@ -104,6 +100,64 @@ else:
 psim.generate()
 ```

+## Build instructions
+
+P4IRS can be built in 3 different modes using the CMake build system. Before we demostrate each mode, ensure you have CMake, MPI and CUDA (if targeting GPU execution) available in your environment.
+
+In the following, we assume we have created and navigated to a build directory: `mkdir build; cd build` 
+
+**General CMake flags (applicable to all 3 modes):**  
+* Pass your input script to CMake using `-DINPUT_SCRIPT=path/to/script.py`  
+* Enable CUDA with `-DCOMPILE_CUDA=ON`
+* Enable support for BlockForest domain partitioning and dynamic load balancing by providing the path to waLBerla source directory `-DWALBERLA_DIR=path/to/walberla` (TODO: waLBerla as a submodule)
+
+
+### 1. Whole-program generation:
+---------------------
+In this mode, everything including the `main` function is generated by P4IRS.
+
+1. Set `generate_whole_program=True` in the input script
+2. Set the CMake flag `-DGENERATE_WHOLE_PROGRAM=ON`
+
+Example: Build [md.py](examples/whole-program-generation/md.py)
+```
+cmake -DINPUT_SCRIPT=../examples/whole-program-generation/md.py -DGENERATE_WHOLE_PROGRAM=ON ..
+``` 
+Now call `make` and an **executable** is built.
+
+
+### 2. Modular stand-alone app
+---------------------
+You can build a stand-alone C++ app which uses the P4IRS modular interface.
+
+1. Set `generate_whole_program=False` in the input script
+2. Set the CMake flag `-DBUILD_APP=ON`
+3. Provide the list of your source files to CMake (semicolon-seperated):`-DUSER_SOURCE_FILES=path/to/main.cpp;path/to/helper.cpp`
+
+Example: Build the application [sd_1.cpp](examples/modular/sd_1.cpp) with [spring_dashpot.py](examples/modular/spring_dashpot.py)  
+Note: In this example we assume waLBerla has been already cloned next to the P4IRS directory.
+
+```
+cmake -DINPUT_SCRIPT=../examples/modular/spring_dashpot.py -DBUILD_APP=ON -DUSER_SOURCE_FILES=../examples/modular/sd_1.cpp -DWALBERLA_DIR=../../walberla ..
+```
+Now call `make` and an **executable** is built.
+
+
+### 3. P4IRS as a library
+---------------------
+In this mode, P4IRS is compiled as a library that can be integrated into other projects.
+
+1. Set `generate_whole_program=False` in the input script
+2. Ensure both `BUILD_APP` and `GENERATE_WHOLE_PROGRAM` are `OFF` (they are OFF by default)
+
+Configure CMake and call `make` as usual, and a **static library** is built. You can then include P4IRS and its dependencies in your build system as follows:
+```cmake
+find_package(pairs REQUIRED HINTS "path/to/pairs/build" NO_DEFAULT_PATH)
+target_include_directories(my_app PUBLIC ${PAIRS_INCLUDE_DIRS})
+target_link_directories(my_app PUBLIC ${PAIRS_LINK_DIRS})
+target_link_libraries(my_app PUBLIC ${PAIRS_LINK_LIBRARIES})
+```
+
 ## Citations

 TBD

--- a/cmake/FindwaLBerla.cmake
+++ b/cmake/FindwaLBerla.cmake
-set( WALBERLA_DIR    WALBERLA_DIR-NOTFOUND   CACHE  PATH  "waLBerla path"  )
-
 if ( WALBERLA_DIR )
    # WALBERLA_DIR has to point to the waLBerla source directory
    # this command builds waLBerla (again) in the current build directory in the subfolder "walberla" (second argument)

--- a/data/planes.input
+++ b/data/planes.input
-0,1,0.0,0.0,0.0,0.0,0.0,1.0,13
-0,1,0.8,0.015,0.2,0.0,0.0,-1.0,13
+0,  1,  0.0,    0.0,    0.0,    0.0,    0.0,    1.0,    13
+0,  1,  0.8,    0.015,  0.2,    0.0,    0.0,    -1.0,   13
--- a/data/sd_planes.input
+++ b/data/sd_planes.input
 0, 1, 0, 0, 0, 1, 0, 0, 13
 0, 1, 0, 0, 0, 0, 1, 0, 13
 0, 1, 0, 0, 0, 0, 0, 1, 13
-0, 1, 1, 1, 1, -1, 0, 0, 13
-0, 1, 1, 1, 1, 0, -1, 0, 13
-0, 1, 1, 1, 1, 0, 0, -1, 13
+0, 1, 10, 10, 10, -1, 0, 0, 13
+0, 1, 10, 10, 10, 0, -1, 0, 13
+0, 1, 10, 10, 10, 0, 0, -1, 13
--- a/examples/dem2.py
+++ b/examples/dem2.py
-import pairs
-import sys
-
-
-def linear_spring_dashpot(i, j):
-    penetration_depth = squared_distance(i, j) - radius[i] - radius[j]
-    skip_when(penetration_depth >= 0.0)
-
-    delta_ij = delta(i, j)
-    contact_normal = delta_ij / length(delta_ij)
-    k = radius[j] + 0.5 * penetration_depth
-    contact_point = position[j] + contact_normal * k
-
-    rel_vel = -velocity_wf[i] - velocity_wf[j]
-    rel_vel_n = dot(rel_vel, contact_normal) * contact_normal
-    rel_vel_t = rel_vel - rel_vel_n
-
-    fN = stiffness_norm[i, j] * (-penetration_depth) * contact_normal + damping_norm[i, j] * rel_vel_n;
-
-    tan_spring_disp = tangential_spring_displacement[i, j]
-    impact_vel_magnitude = impact_velocity_magnitude[i, j]
-    impact_magnitude = select(impact_vel_magnitude > 0.0, impact_vel_magnitude, length(rel_vel))
-    sticking = is_sticking[i, j]
-
-    rotated_tan_disp = tan_spring_disp - contact_normal * (contact_normal * tan_spring_disp)
-    new_tan_spring_disp = dt * rel_vel_t + \
-                          select(squared_length(rotated_tan_disp) <= 0.0,
-                                 zero_vector(),
-                                 rotated_tan_disp * length(tan_spring_disp) / length(rotated_tan_disp))
-
-    fTLS = stiffness_tan[i, j] * new_tan_spring_disp + damping_tan[i, j] * rel_vel_t
-    fTLS_len = length(fTLS)
-    fTLS_inv_len = 1.0 / fTLS_len
-    t = select(fTLS_len > 0, fTLS / fTLS_inv_len, zero_vector())
-
-    f_friction_abs_static = friction_static[i, j] * length(fN)
-    f_friction_abs_dynamic = friction_dynamic[i, j] * length(fN)
-    tan_vel_threshold = 1e-8
-
-    cond1 = sticking == 1 and length(rel_vel_t) < tan_vel_threshold and fTLS_len < f_friction_abs_static
-    cond2 = sticking == 1 and fTLS_len < f_friction_abs_dynamic
-    f_friction_abs = select(cond1, f_friction_abs_static, f_friction_abs_dynamic)
-    n_sticking = select(cond1 or cond2 or fTLS_len < f_friction_abs_dynamic, 1, 0)
-
-    if not cond1 and not cond2 and stiffness_tan[i, j] > 0.0:
-        tangential_spring_displacement[i, j] = \
-            (f_friction_abs * t - damping_tan[i, j] * rel_vel_t) / stiffness_tan[i, j]
-
-    else:
-        tangential_spring_displacement[i, j] = new_tan_spring_disp
-
-    impact_velocity_magnitude[i, j] = impact_magnitude
-    is_sticking[i, j] = n_sticking
-
-    fTabs = min(fTLS_len, f_friction_abs)
-    fT = fTabs * t
-    force[i] += fN + fT
-
-
-def euler(i):
-    velocity[i] += dt * force[i] / mass[i]
-    position[i] += dt * velocity[i]
-
-
-cmd = sys.argv[0]
-target = sys.argv[1] if len(sys.argv[1]) > 1 else "none"
-if target != 'cpu' and target != 'gpu':
-    print(f"Invalid target, use {cmd} <cpu/gpu>")
-
-
-dt = 0.005
-cutoff_radius = 2.5
-skin = 0.3
-ntypes = 4
-stiffness_norm = 1.0
-stiffness_tan = 1.0
-damping_norm = 1.0
-damping_tan = 1.0
-friction_static = 1.0
-friction_dynamic = 1.0
-
-psim = pairs.simulation("dem", debug=True)
-psim.add_position('position')
-psim.add_property('mass', pairs.double(), 1.0)
-psim.add_property('velocity', pairs.vector())
-psim.add_property('velocity_wf', pairs.vector())
-psim.add_property('force', pairs.vector(), vol=True)
-psim.add_property('radius', pairs.double(), 1.0)
-psim.add_feature('type', ntypes)
-psim.add_feature_property('type', 'stiffness_norm', pairs.double(), [stiffness_norm for i in range(ntypes * ntypes)])
-psim.add_feature_property('type', 'stiffness_tan', pairs.double(), [stiffness_tan for i in range(ntypes * ntypes)])
-psim.add_feature_property('type', 'damping_norm', pairs.double(), [damping_norm for i in range(ntypes * ntypes)])
-psim.add_feature_property('type', 'damping_tan', pairs.double(), [damping_tan for i in range(ntypes * ntypes)])
-psim.add_feature_property('type', 'friction_static', pairs.double(), [friction_static for i in range(ntypes * ntypes)])
-psim.add_feature_property('type', 'friction_dynamic', pairs.double(), [friction_dynamic for i in range(ntypes * ntypes)])
-psim.add_contact_property('is_sticking', pairs.int32(), False)
-psim.add_contact_property('tangential_spring_displacement', pairs.vector(), [0.0, 0.0, 0.0])
-psim.add_contact_property('impact_velocity_magnitude', pairs.double(), 0.0)
-
-psim.read_particle_data("data/fluidized_bed.input", ['mass', 'position', 'velocity'])
-psim.build_neighbor_lists(cutoff_radius + skin)
-psim.vtk_output(f"output/test_{target}")
-psim.compute(linear_spring_dashpot, cutoff_radius, symbols={'dt': dt})
-psim.compute(euler, symbols={'dt': dt})
-
-if target == 'gpu':
-    psim.target(pairs.target_gpu())
-else:
-    psim.target(pairs.target_cpu())
-
-psim.generate()
--- a/examples/dem_sd.py
+++ b/examples/dem_sd.py
-import math
-import pairs
-import sys
-import os
-
-def update_mass_and_inertia(i):
-    rotation_matrix[i] = diagonal_matrix(1.0)
-    rotation[i] = default_quaternion()
-
-    if is_sphere(i):
-        inv_inertia[i] = inversed(diagonal_matrix(0.4 * mass[i] * radius[i] * radius[i]))
-
-    else:
-        mass[i] = infinity
-        inv_inertia[i] = 0.0
-
-
-def linear_spring_dashpot(i, j):
-    delta_ij = -penetration_depth(i, j)
-    skip_when(delta_ij < 0.0)
-
-    meff = 1.0 / ((1.0 / mass[i]) + (1.0 / mass[j]))
-    stiffness_norm = meff * (pi * pi + lnDryResCoeff * lnDryResCoeff) / \
-                     (collisionTime_SI * collisionTime_SI)
-    damping_norm = -2.0 * meff * lnDryResCoeff / collisionTime_SI
-    damping_tan = sqrt(kappa) * damping_norm
-
-    velocity_wf_i = linear_velocity[i] + cross(angular_velocity[i], contact_point(i, j) - position[i])
-    velocity_wf_j = linear_velocity[j] + cross(angular_velocity[j], contact_point(i, j) - position[j])
-
-    rel_vel = -(velocity_wf_i - velocity_wf_j)
-    rel_vel_n = dot(rel_vel, contact_normal(i, j))
-    rel_vel_t = rel_vel - rel_vel_n * contact_normal(i, j)
-    fNabs = stiffness_norm * delta_ij + damping_norm * rel_vel_n
-    fN = fNabs * contact_normal(i, j)
-
-    fTabs = min(damping_tan * length(rel_vel_t), friction_dynamic[i, j] * fNabs) # static or dynamic?
-    fT = fTabs * normalized(rel_vel_t)
-
-    partial_force = fN + fT
-
-    apply(force, partial_force)
-    apply(torque, cross(contact_point(i, j) - position, partial_force))
-
-
-def euler(i):
-    inv_mass = 1.0 / mass[i]
-    position[i] += 0.5 * inv_mass * force[i] * dt * dt + linear_velocity[i] * dt
-    linear_velocity[i] += inv_mass * force[i] * dt
-    wdot = rotation_matrix[i] * (inv_inertia[i] * torque[i]) * transposed(rotation_matrix[i])
-    phi = angular_velocity[i] * dt + 0.5 * wdot * dt * dt
-    rotation[i] = quaternion(phi, length(phi)) * rotation[i]
-    rotation_matrix[i] = quaternion_to_rotation_matrix(rotation[i])
-    angular_velocity[i] += wdot * dt
-
-
-def gravity(i):
-    volume = (4.0 / 3.0) * pi * radius[i] * radius[i] * radius[i]
-    force[i][2] = force[i][2] - (densityParticle_SI - densityFluid_SI) * volume * gravity_SI
-    # if type[i]==0:
-    #     printf("gravity", i, volume, force[i][2], volume/2, 2>1)
-
-
-cmd = sys.argv[0]
-target = sys.argv[1] if len(sys.argv[1]) > 1 else "none"
-if target != 'cpu' and target != 'gpu':
-    print(f"Invalid target, use {cmd} <cpu/gpu>")
-
-# Config file parameters
-domainSize_SI=[0.1, 0.1, 0.1]
-diameter_SI = 0.009
-gravity_SI = 9.81
-densityFluid_SI = 1000
-densityParticle_SI = 2550
-generationSpacing_SI = 0.01
-initialVelocity_SI = 1
-dt_SI = 5e-5
-frictionCoefficient = 0.5
-restitutionCoefficient = 0.1
-collisionTime_SI = 5e-4
-poissonsRatio = 0.22
-timeSteps = 1
-visSpacing = 200
-denseBottomLayer = False
-bottomLayerOffsetFactor = 1.0
-kappa = 2.0 * (1.0 - poissonsRatio) / (2.0 - poissonsRatio) # from Thornton et al
-
-minDiameter_SI = diameter_SI #* 0.9
-maxDiameter_SI = diameter_SI #* 1.1
-linkedCellWidth = 1.01 * maxDiameter_SI
-
-ntypes = 1
-lnDryResCoeff = math.log(restitutionCoefficient)
-frictionStatic = 0.0
-frictionDynamic = frictionCoefficient
-
-file_name = os.path.basename(__file__)
-file_name_without_extension = os.path.splitext(file_name)[0]
-
-psim = pairs.simulation(
-    file_name_without_extension,
-    [pairs.sphere(), pairs.halfspace()],
-    timesteps=timeSteps,
-    double_prec=True,
-    use_contact_history=False,
-    particle_capacity=1000000,
-    neighbor_capacity=20,
-    debug=True, generate_whole_program=False)
-
-if target == 'gpu':
-    psim.target(pairs.target_gpu())
-else:
-    psim.target(pairs.target_cpu())
-    #psim.target(pairs.target_cpu(parallel=True))
-
-psim.add_position('position')
-psim.add_property('mass', pairs.real(), 1.0)
-psim.add_property('linear_velocity', pairs.vector())
-psim.add_property('angular_velocity', pairs.vector())
-psim.add_property('force', pairs.vector(), volatile=True)
-psim.add_property('torque', pairs.vector(), volatile=True)
-psim.add_property('hydrodynamic_force', pairs.vector(), reduce=True)
-psim.add_property('hydrodynamic_torque', pairs.vector(), reduce=True)
-# psim.add_property('old_hydrodynamic_force', pairs.vector())
-# psim.add_property('old_hydrodynamic_torque', pairs.vector())
-psim.add_property('radius', pairs.real(), 1.0)
-psim.add_property('normal', pairs.vector())
-psim.add_property('inv_inertia', pairs.matrix())
-psim.add_property('rotation_matrix', pairs.matrix())
-psim.add_property('rotation', pairs.quaternion())
-psim.add_feature('type', ntypes)
-psim.add_feature_property('type', 'friction_static', pairs.real(), [frictionStatic for i in range(ntypes * ntypes)])
-psim.add_feature_property('type', 'friction_dynamic', pairs.real(), [frictionDynamic for i in range(ntypes * ntypes)])
-
-# psim.set_domain([0.0, 0.0, 0.0, domainSize_SI[0], domainSize_SI[1], domainSize_SI[2]])
-psim.set_domain_partitioner(pairs.block_forest())
-# psim.set_domain_partitioner(pairs.regular_domain_partitioner())
-psim.pbc([False, False, False])
-psim.dem_sc_grid(
-    domainSize_SI[0], domainSize_SI[1], domainSize_SI[2], generationSpacing_SI,
-    diameter_SI, minDiameter_SI, maxDiameter_SI, initialVelocity_SI, densityParticle_SI, ntypes)
-
-#psim.read_particle_data(
-#    "data/spheres.input",
-#    "data/spheres_4x4x2.input",
-#    "data/spheres_6x6x2.input",
-#    "data/spheres_8x8x2.input",
-#    ['uid', 'type', 'mass', 'radius', 'position', 'linear_velocity', 'flags'],
-#    pairs.sphere())
-
-#psim.read_particle_data(
-#    "data/spheres_bottom.input",
-#    ['type', 'mass', 'radius', 'position', 'linear_velocity', 'flags'],
-#    pairs.sphere())
-
-# psim.read_particle_data(
-#     "data/planes.input",
-#     ['type', 'mass', 'position', 'normal', 'flags'],
-#     pairs.halfspace())
-
-psim.setup(update_mass_and_inertia, {'densityParticle_SI': densityParticle_SI,
-                                     'pi': math.pi,
-                                     'infinity': math.inf })
-
-#psim.compute_half()
-psim.build_cell_lists(linkedCellWidth)
-psim.vtk_output(f"output/dem_{target}", frequency=visSpacing)
-
-psim.compute(gravity,
-             symbols={'densityParticle_SI': densityParticle_SI,
-                      'densityFluid_SI': densityFluid_SI,
-                      'gravity_SI': gravity_SI,
-                      'pi': math.pi })
-
-psim.compute(linear_spring_dashpot,
-             linkedCellWidth,
-             symbols={'pi': math.pi,
-                      'kappa': kappa,
-                      'lnDryResCoeff': lnDryResCoeff,
-                      'collisionTime_SI': collisionTime_SI})
-
-psim.compute(euler, parameters={'dt' : pairs.real()})
-# psim.compute(euler, symbols={'dt' : dt_SI})
-psim.generate()
--- a/examples/lift.py
+++ b/examples/lift.py
-from coupling.parse_cpp import parse_walberla_file
-from coupling.parse_cpp import get_class_method, print_tree
-
-filename = "mesa_pd/kernel/SpringDashpot.hpp"
-translation_unit = parse_walberla_file(filename)
-
-# subtree = get_subtree(tu.cursor, "walberla::mesa_pd::kernel")
-# print_tree(subtree)
-
-kernel = get_class_method(
-        translation_unit.cursor,
-        "walberla::mesa_pd::kernel::SpringDashpot",
-        "operator()")
-print_tree(kernel)
--- a/examples/lj_embedded.py
+++ b/examples/lj_embedded.py
-import pairs
-
-
-dt = 0.005
-cutoff_radius = 2.5
-skin = 0.3
-sigma = 1.0
-epsilon = 1.0
-sigma6 = sigma ** 6
-
-psim = pairs.simulation("lj")
-mass = psim.add_real_property('mass', 1.0)
-position = psim.add_vector_property('position')
-velocity = psim.add_vector_property('velocity')
-force = psim.add_vector_property('force', vol=True)
-psim.from_file("data/minimd_setup_4x4x4.input", ['mass', 'position', 'velocity'])
-psim.create_cell_lists(2.8, 2.8)
-psim.create_neighbor_lists()
-psim.periodic(2.8)
-psim.vtk_output("output/test")
-
-for i, j, delta, rsq in psim.particle_pairs(cutoff_radius, position):
-    sr2 = 1.0 / rsq
-    sr6 = sr2 * sr2 * sr2 * sigma6
-    f = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon
-    force[i].add(delta * f)
-
-for i in psim.particles():
-    velocity[i].add(dt * force[i] / mass[i])
-    position[i].add(dt * velocity[i])
-
-psim.generate()
--- a/examples/lj_onetype.py
+++ b/examples/lj_onetype.py
-import pairs
-import sys
-
-
-def lj(i, j):
-    sr2 = 1.0 / rsq
-    sr6 = sr2 * sr2 * sr2 * sigma6
-    force[i] += delta * 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon
-
-
-def euler(i):
-    velocity[i] += dt * force[i] / mass[i]
-    position[i] += dt * velocity[i]
-
-
-cmd = sys.argv[0]
-target = sys.argv[1] if len(sys.argv[1]) > 1 else "none"
-if target != 'cpu' and target != 'gpu':
-    print(f"Invalid target, use {cmd} <cpu/gpu>")
-
-
-dt = 0.005
-cutoff_radius = 2.5
-skin = 0.3
-sigma = 1.0
-epsilon = 1.0
-sigma6 = sigma ** 6
-
-psim = pairs.simulation("lj", debug=True)
-psim.add_real_property('mass', 1.0)
-psim.add_position('position')
-psim.add_vector_property('velocity')
-psim.add_vector_property('force', vol=True)
-psim.from_file("data/minimd_setup_32x32x32.input", ['mass', 'position', 'velocity'])
-psim.build_neighbor_lists(cutoff_radius + skin)
-psim.vtk_output(f"output/test_{target}")
-psim.compute(lj, cutoff_radius, {'sigma6': sigma6, 'epsilon': epsilon})
-psim.compute(euler, symbols={'dt': dt})
-
-if target == 'gpu':
-    psim.target(pairs.target_gpu())
-else:
-    psim.target(pairs.target_cpu())
-
-psim.generate()
--- a/examples/main.cpp
+++ b/examples/main.cpp
-#include <iostream>
-//---
-#include "dem_sd.hpp"
-#include <blockforest/BlockForest.h>
-#include <blockforest/Initialization.h>
-
-int main(int argc, char **argv) {
-    double dt = 5e-5;
-    auto pairs_sim = std::make_shared<PairsSimulation>();
-    auto pairs_acc = std::make_shared<PairsAccessor>(pairs_sim);
-
-    // Create forest (make sure to use_domain(forest)) ----------------------------------------------
-    // walberla::math::AABB domain(0, 0, 0, 0.1, 0.1, 0.1);
-    // std::shared_ptr<walberla::mpi::MPIManager> mpiManager = walberla::mpi::MPIManager::instance();
-    // mpiManager->initializeMPI(&argc, &argv);
-    // mpiManager->useWorldComm();
-    // auto procs = mpiManager->numProcesses();
-    // auto block_config = walberla::Vector3<int>(2, 2, 2);
-    // auto ref_level = 0;
-    // std::shared_ptr<walberla::BlockForest> forest = walberla::blockforest::createBlockForest(
-    //         domain, block_config, walberla::Vector3<bool>(false, false, false), procs, ref_level);
-    //-----------------------------------------------------------------------------------------------
-
-    // initialize pairs data structures ----------------------------------------------
-    pairs_sim->initialize();
-
-    // either create new domain or use an existing one ----------------------------------------
-    pairs_sim->set_domain(argc, argv, 0, 0, 0, 0.1, 0.1, 0.1);
-
-    // pairs_sim->use_domain(forest);
-
-    // create planes and particles ------------------------------------------------------------
-    pairs_sim->create_halfspace(0,     0,      0,      1, 0, 0,        0, 13);
-    pairs_sim->create_halfspace(0,     0,      0,      0, 1, 0,        0, 13);
-    pairs_sim->create_halfspace(0,     0,      0,      0, 0, 1,        0, 13);
-    pairs_sim->create_halfspace(0.1,   0.1,    0.1,    -1, 0, 0,       0, 13);
-    pairs_sim->create_halfspace(0.1,   0.1,    0.1,    0, -1, 0,       0, 13);
-    pairs_sim->create_halfspace(0.1,   0.1,    0.1,    0, 0, -1,       0, 13);
-
-    // pairs::id_t pUid = pairs_sim->create_sphere(0.0499,   0.0499,   0.07,   0.5, 0.5, 0 ,   1000, 0.0045, 0, 0);
-    // pairs::id_t pUid2 = pairs_sim->create_sphere(0.0499,   0.0499,   0.0499,   0.5, 0.5, 0 ,   1000, 0.0045, 0, 0);
-
-    // Tracking a particle ------------------------------------------------------------------------
-    // if (pUid != pairs_acc->getInvalidUid()){
-    //     std::cout<< "Particle " << pUid << " is created in rank " << rank << std::endl;
-    // }
-    
-    // walberla::mpi::allReduceInplace(pUid, walberla::mpi::SUM);
-    // walberla::mpi::allReduceInplace(pUid2, walberla::mpi::SUM);
-    // MPI_Allreduce(MPI_IN_PLACE, &pUid, 1, MPI_LONG_LONG_INT, MPI_SUM, MPI_COMM_WORLD);
-    // MPI_Allreduce(MPI_IN_PLACE, &pUid2, 1, MPI_LONG_LONG_INT, MPI_SUM, MPI_COMM_WORLD);
-
-
-    // if (pUid != pairs_acc->getInvalidUid()){
-    //     std::cout<< "Particle " << pUid << " will be tracked by rank " << rank << std::endl;
-    // }
-
-    // auto pIsLocalInMyRank = [&](pairs::id_t uid){return pairs_acc->uidToIdx(uid) != pairs_acc->getInvalidIdx();};
-    // auto pIsGhostInMyRank = [&](pairs::id_t uid){return pairs_acc->uidToIdxGhost(uid) != pairs_acc->getInvalidIdx();};
-
-
-    // TODO: make sure linkedCellWidth is larger than max diameter in the system
-    // setup particles, setup functions, and the cell list stencil-------------------------------
-    pairs_sim->setup_sim();
-
-    // for(int i=0; i<pairs_acc->size(); ++i){
-    //     if(pairs_acc->getShape(i) == 0){
-    //         std::cout<< "rank: " << rank << " sphere " <<  pairs_acc->getUid(i) << "  " << pairs_acc->getPosition(i) << std::endl; 
-    //     }
-    //     else if(pairs_acc->getShape(i) == 1){
-    //         std::cout<< "rank: " << rank << " halfspace " <<  pairs_acc->getUid(i) << "  " << pairs_acc->getPosition(i) << pairs_acc->getNormal(i) << std::endl; 
-    //     }
-    // }
-
-    // if (pIsLocalInMyRank(pUid)){
-    //     int idx = pairs_acc->uidToIdx(pUid);
-    //     pairs_acc->setPosition(idx, walberla::Vector3<double>(0.01, 0.01, 0.08));
-    //     pairs_acc->setLinearVelocity(idx, walberla::Vector3<double>(0.5, 0.5, 0.5));
-    // }
-
-    for (int t=0; t<5000; ++t){
-        // if ((t%200==0) && (pIsLocalInMyRank(pUid))){
-        //     int idx = pairs_acc->uidToIdx(pUid);
-        //     std::cout<< "Tracked particle is now in rank " << rank << " --- " << pairs_acc->getPosition(idx)<< std::endl;
-        // }
-
-        pairs_sim->communicate(t);
-
-        // if(pairs_sim->rank() == 0){
-        //     pairs_sim->pairs_runtime->copyPropertyToHost(0, ReadOnly, (((pairs_sim->pobj->nlocal + pairs_sim->pobj->nghost) * 1) * sizeof(unsigned long long int))); // uid
-        //     for(int i=0; i<pairs_sim->pobj->nlocal; ++i){
-        //         std::cout << "rank ##0## local uid[" << i << "] = " << pairs_sim->pobj->uid[i] << std::endl;
-        //     }
-        // }
-        // if(pairs_sim->rank() == 2){
-        //     pairs_sim->pairs_runtime->copyPropertyToHost(0, ReadOnly, (((pairs_sim->pobj->nlocal + pairs_sim->pobj->nghost) * 1) * sizeof(unsigned long long int))); // uid
-        //     for(int i=0; i<pairs_sim->pobj->nlocal; ++i){
-        //         std::cout << "rank 2 local uid[" << i << "] = " << pairs_sim->pobj->uid[i] << std::endl;
-        //     }
-        //     for(int i=pairs_sim->pobj->nlocal; i<pairs_sim->pobj->nlocal + pairs_sim->pobj->nghost; ++i){
-        //         std::cout << "rank 2 ghost uid[" << i << "] = " << pairs_sim->pobj->uid[i] << std::endl;
-        //     }
-        // }
-
-        // std::cout << pairs_sim->rank() << " --------- nlocal = " << pairs_acc->nlocal() << " nghost = " << pairs_acc->nghost() << std::endl;
-
-        // if (pairs_sim->rank() == 2) {
-        //     for(int i=0; i<pairs_acc->nlocal(); ++i){
-        //         std::cout << pairs_sim->rank() << " Local  " << i << " -- uid= " << pairs_acc->getUid(i) << std::endl; 
-        //     }
-        //     for(int i=pairs_acc->nlocal(); i<pairs_acc->size(); ++i){
-        //         std::cout << pairs_sim->rank() << " Ghost -- " << i << " -- uid= " << pairs_acc->getUid(i) << std::endl; 
-        //     }
-        // }
-
-
-
-        // if (pIsLocalInMyRank(pUid)){
-        //     int idx = pairs_acc->uidToIdx(pUid);
-        //     pairs_acc->setHydrodynamicForce(idx, walberla::Vector3<double>(1, 1, 1));
-        //     pairs_acc->setHydrodynamicTorque(idx, walberla::Vector3<double>(2, 2, 2));
-        // }
-        // if (pIsLocalInMyRank(pUid2)){
-        //     int idx = pairs_acc->uidToIdx(pUid2);
-        //     pairs_acc->setHydrodynamicForce(idx, walberla::Vector3<double>(10, 10, 10));
-        //     pairs_acc->setHydrodynamicTorque(idx, walberla::Vector3<double>(20, 20, 20));
-        // }
-        
-        pairs_sim->update_cells(); 
-        pairs_sim->gravity(); 
-        pairs_sim->linear_spring_dashpot(); 
-        pairs_sim->euler(dt); 
-
-        // std::cout << "reset_volatiles" << std::endl;
-        pairs_sim->reset_volatiles(); 
-
-        pairs_sim->vtk_write("output/dem_sd_local", 0, pairs_acc->nlocal(), t, 200);
-        pairs_sim->vtk_write("output/dem_sd_ghost", pairs_acc->nlocal(), pairs_acc->size(), t, 200);
-
-        // if (pIsGhostInMyRank(pUid)){
-        //     std::cout << pairs_sim->rank() << " - ghost 1: " << pUid << std::endl;
-        //     int idx = pairs_acc->uidToIdxGhost(pUid);
-        //     pairs_acc->setHydrodynamicForce(idx, walberla::Vector3<double>(1, 1, 1));
-        //     pairs_acc->setHydrodynamicTorque(idx, walberla::Vector3<double>(3, 3, 3));
-        // }
-        // if (pIsGhostInMyRank(pUid2)){
-        //     std::cout << pairs_sim->rank() << " - ghost 2: " << pUid2 << std::endl;
-        //     int idx = pairs_acc->uidToIdxGhost(pUid2);
-        //     pairs_acc->setHydrodynamicForce(idx, walberla::Vector3<double>(2, 2, 2));
-        //     pairs_acc->setHydrodynamicTorque(idx, walberla::Vector3<double>(4, 4, 4));
-        // }
-
-        // std::cout << "reverse comm and reduce" << std::endl;
-        // pairs_sim->reverse_comm();
-
-        // if (pIsLocalInMyRank(pUid)){
-        //     int idx = pairs_acc->uidToIdx(pUid);
-        //     auto forceSum = pairs_acc->getHydrodynamicForce(idx);
-        //     auto torqueSum = pairs_acc->getHydrodynamicTorque(idx);
-        //     std::cout << pUid << " @@@@ reduced force = " << forceSum << std::endl;
-        //     std::cout << pUid << " @@@@ reduced torque = " << torqueSum << std::endl;
-        // }
-        // if (pIsLocalInMyRank(pUid2)){
-        //     int idx = pairs_acc->uidToIdx(pUid2);
-        //     auto forceSum = pairs_acc->getHydrodynamicForce(idx);
-        //     auto torqueSum = pairs_acc->getHydrodynamicTorque(idx);
-        //     std::cout << pUid2 << " @@@@ reduced force = " << forceSum << std::endl;
-        //     std::cout << pUid2 << " @@@@ reduced torque = " << torqueSum << std::endl;
-        // }
-    }
-
-    pairs_sim->end();
-
-    return 0;
-}
--- a/examples/main.cu
+++ b/examples/main.cu
-#include <iostream>
-#include <cuda_runtime.h>
-
-void checkCudaError(cudaError_t err, const char* func) {
-    if (err != cudaSuccess) {
-        fprintf(stderr, "CUDA error in %s: %s\n", func, cudaGetErrorString(err));
-        exit(err);
-    }
-}
-
-template< typename Type >
-class PairsVector3 {
-public:
-    PairsVector3() = default;
-
-    // If the constructor is called from device, v_ is automatically allocated on 
-    // device because it's a static array embeded in the object itself 
-    __host__ __device__ PairsVector3( Type x, Type y, Type z ) {
-        v_[0] = x;
-        v_[1] = y;
-        v_[2] = z;
-    }
-
-    __host__ __device__ Type& operator[]( int index ) { 
-        return v_[index]; 
-    }
-    __host__ __device__ const Type& operator[] ( int index ) const { 
-        return v_[index]; 
-    }
-
-private:
-    Type v_[3] = {Type(), Type(), Type()};
-};
-
-
-struct PairsObjects {
-    double *position_h;
-    double *position_d;
-};
-
-
-class PairsAccessor{
-    private:
-    PairsObjects *pobj_h;
-    PairsObjects *pobj_d;
-
-    public:
-    // PairsAccessor is only constructable from host, but its getters and setters can be called from both host and device
-    __host__ PairsAccessor(PairsObjects *pobj_): pobj_h(pobj_) {
-
-        // NOTE: Here we copy pobj_h to device, but we will use pobj_d to ONLY work with device pointers 
-        // So for example, we only try to access pobj_d->position_d, NOT pobj_d->position_h 
-        // NOTE: What's copied to device here is the PairsObject, which is a bunch of pointers (including valid device pointers like position_d)
-        // TODO (maybe): Split PairsObjects into two structs, one holding host pointers, the other holding device pointers.
-        cudaMalloc(&pobj_d, sizeof(PairsObjects));
-        cudaMemcpy(pobj_d, pobj_h, sizeof(PairsObjects), cudaMemcpyHostToDevice);
-    }
-
-    __host__ __device__ int getTest(){
-        #ifdef __CUDA_ARCH__ 
-            return 12;
-        #else
-            return 34;
-        #endif
-    }
-
-    // If this function is called from device, it returns a PairsVector3 that's constructed on device
-    // If this function is called from host, it returns a PairsVector3 that's constructed on host
-    __host__ __device__ PairsVector3<double> getPosition(const size_t i) const {
-        #ifdef __CUDA_ARCH__ 
-            // Assume postion_d points to uptodate data (we can't do copyPropertyToDevice from __device__ )
-            return PairsVector3<double>(pobj_d->position_d[i*3 + 0], pobj_d->position_d[i*3 + 1], pobj_d->position_d[i*3 + 2]);
-        #else
-            // Here we can do copyPropertyToHost (ReadOnly) if needed, to make sure position_h has uptodate data
-            return PairsVector3<double>(pobj_h->position_h[i*3 + 0], pobj_h->position_h[i*3 + 1], pobj_h->position_h[i*3 + 2]);
-        #endif 
-            
-    }
-
-    __host__ __device__ void setPosition(const size_t i, PairsVector3<double> const &vec) {
-        #ifdef __CUDA_ARCH__ 
-            // Assume vec is on device
-            pobj_d->position_d[i*3 + 0] = vec[0]; 
-            pobj_d->position_d[i*3 + 1] = vec[1]; 
-            pobj_d->position_d[i*3 + 2] = vec[2];
-            // Assume we don't need position_h data back on host (we can't do copyPropertyToHost from __device__)
-        #else  
-            // Assume vec is on host
-            pobj_h->position_h[i*3 + 0] = vec[0]; 
-            pobj_h->position_h[i*3 + 1] = vec[1]; 
-            pobj_h->position_h[i*3 + 2] = vec[2];
-            // Here we can do copyPropertyToDevice (WriteOnly) if needed (just so host and device data match)
-        #endif 
-    
-    }
-    
-};
-
-
-__global__ void mykernel(PairsAccessor ac){
-    printf("getTest from device = %d\n", ac.getTest());
-
-    PairsVector3<double> pos(7,8,9); 
-    ac.setPosition(0, pos);
-    printf("getPosition(0) from device = (%f, %f, %f) \n", ac.getPosition(0)[0], ac.getPosition(0)[1], ac.getPosition(0)[2]);
-}
-
-
-int main(int argc, char **argv) {
-
-    PairsObjects *pobj = new PairsObjects;
-
-    // User doesn't bother with the stuff below, they are done when PairsSimulation is initialized
-    //----------------------------------------------------------------------------------------------------
-    int numParticles = 1;
-    int numElements = numParticles * 3;
-    pobj->position_h = new double[numElements];
-    cudaMalloc(&pobj->position_d, numElements * sizeof(double));
-    cudaMemcpy(pobj->position_d, pobj->position_h, numElements * sizeof(double), cudaMemcpyHostToDevice);
-    //----------------------------------------------------------------------------------------------------
-
-    PairsAccessor ac(pobj);
-    printf("getTest from host = %d\n", ac.getTest());
-
-    PairsVector3<double> pos(1.2, 3.4, 5.6); 
-    ac.setPosition(0, pos);
-    printf("getPosition(0) from host = (%f, %f, %f) \n", ac.getPosition(0)[0], ac.getPosition(0)[1], ac.getPosition(0)[2]);
-    
-
-    mykernel<<<1,1>>>(ac);
-    checkCudaError(cudaDeviceSynchronize(), "mykernel");
-
-    // In mykernel, we modify the position of the particle ON DEVICE
-    // TODO: To reflect this modification on the host, we need a 'sync' function along with getters and setters (eg: 'syncPosition')
-    // to make sure both host and device data are uptodate and in sync with eachother.
-    // But unlike getters and setters, sync functions are only callable form host
-    // The 'sync' function copies the property to device if its host flag is set, or copies property to host if its device flag is set
-    // If setter is called from host, set host flag and unset device flag for that property
-    // If setter is called from device, set device flag and unset host flag for that property
-
-}
-
-
--- a/examples/modular/force_reduction.cpp
+++ b/examples/modular/force_reduction.cpp
@@ -9,10 +9,11 @@ int main(int argc, char **argv) {
    auto ac = std::make_shared<PairsAccessor>(pairs_sim.get());
    
    // Set domain
-    pairs_sim->set_domain(argc, argv, 0, 0, 0, 0.1, 0.1, 0.1);
+    auto pairs_runtime = pairs_sim->getPairsRuntime();
+    pairs_runtime->initDomain(&argc, &argv, 0, 0, 0, 0.1, 0.1, 0.1);

    // Create bodies
-    pairs::id_t pUid = pairs_sim->create_sphere(0.0499,   0.0499,   0.07,   0.5, 0.5, 0 ,   1000, 0.0045, 0, 0);
+    pairs::id_t pUid = pairs::create_sphere(pairs_runtime, 0.0499,   0.0499,   0.07,   0.5, 0.5, 0 ,   1000, 0.0045, 0, 0);
    
    // setup_sim after creating all bodies
    pairs_sim->setup_sim();
@@ -93,8 +94,7 @@ int main(int argc, char **argv) {
        pairs_sim->update_cells(t); 
        pairs_sim->gravity(); 
        pairs_sim->spring_dashpot();
-        pairs_sim->euler(5e-5); 
-        pairs_sim->reset_volatiles();         
+        pairs_sim->euler(5e-5);        
        //-------------------------------------------------------------------------------------------

        std::cout << "---- reverse_comm and reduce ----" << std::endl;

--- a/examples/modular/sd_1_CPU_GPU.cpp
+++ b/examples/modular/sd_1_CPU_GPU.cpp
@@ -8,16 +8,18 @@ int main(int argc, char **argv) {
    auto pairs_sim = std::make_shared<PairsSimulation>();
    pairs_sim->initialize();

-    pairs_sim->set_domain(argc, argv, 0, 0, 0, 1, 1, 1);
+    auto pairs_runtime = pairs_sim->getPairsRuntime();

-    pairs_sim->create_halfspace(0,0,0,  1, 0, 0,     0, 13);
-    pairs_sim->create_halfspace(0,0,0,  0, 1, 0,     0, 13);
-    pairs_sim->create_halfspace(0,0,0,  0, 0, 1,     0, 13);
-    pairs_sim->create_halfspace(1,1,1,  -1, 0, 0,    0, 13);
-    pairs_sim->create_halfspace(1,1,1,  0, -1, 0,    0, 13);
-    pairs_sim->create_halfspace(1,1,1,  0, 0, -1,    0, 13);
-    pairs_sim->create_sphere(0.6, 0.6, 0.7,      -2, -2, 0,  1000, 0.05, 0, 0);
-    pairs_sim->create_sphere(0.4, 0.4, 0.68,    2, 2, 0,    1000, 0.05, 0, 0);
+    pairs_runtime->initDomain(&argc, &argv, 0, 0, 0, 1, 1, 1); 
+
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  1, 0, 0,     0, 13);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 1, 0,     0, 13);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 0, 1,     0, 13);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  -1, 0, 0,    0, 13);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  0, -1, 0,    0, 13);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  0, 0, -1,    0, 13);
+    pairs::create_sphere(pairs_runtime, 0.6, 0.6, 0.7,      -2, -2, 0,  1000, 0.05, 0, 0);
+    pairs::create_sphere(pairs_runtime, 0.4, 0.4, 0.68,    2, 2, 0,    1000, 0.05, 0, 0);

    pairs_sim->setup_sim(0.1, 0.1, 0.1, 0.1);
    pairs_sim->update_mass_and_inertia();
@@ -37,10 +39,8 @@ int main(int argc, char **argv) {
        pairs_sim->spring_dashpot(); 
        pairs_sim->euler(dt); 

-        pairs_sim->reset_volatiles(); 
-
-        pairs_sim->vtk_write("output/dem_sd_local", 0, pairs_sim->nlocal(), t, vtk_freq);
-        pairs_sim->vtk_write("output/dem_sd_ghost", pairs_sim->nlocal(), pairs_sim->size(), t, vtk_freq);
+        pairs::vtk_write_data(pairs_runtime, "output/sd_1_local", 0, pairs_sim->nlocal(), t, vtk_freq);
+        pairs::vtk_write_data(pairs_runtime, "output/sd_1_ghost", pairs_sim->nlocal(), pairs_sim->size(), t, vtk_freq);
    }

    pairs_sim->end();

--- a/examples/modular/sd_2_CPU_GPU.cpp
+++ b/examples/modular/sd_2_CPU_GPU.cpp
@@ -30,16 +30,17 @@ int main(int argc, char **argv) {

    // Pass forest to P4IRS
    // -------------------------------------------------------------------------------
-    pairs_sim->use_domain(forest);
-
-    pairs_sim->create_halfspace(0,0,0,  1, 0, 0,     0, 13);
-    pairs_sim->create_halfspace(0,0,0,  0, 1, 0,     0, 13);
-    pairs_sim->create_halfspace(0,0,0,  0, 0, 1,     0, 13);
-    pairs_sim->create_halfspace(1,1,1,  -1, 0, 0,    0, 13);
-    pairs_sim->create_halfspace(1,1,1,  0, -1, 0,    0, 13);
-    pairs_sim->create_halfspace(1,1,1,  0, 0, -1,    0, 13);
-    pairs_sim->create_sphere(0.6, 0.6, 0.7,      -2, -2, 0,  1000, 0.05, 0, 0);
-    pairs_sim->create_sphere(0.4, 0.4, 0.68,    2, 2, 0,    1000, 0.05, 0, 0);
+    auto pairs_runtime = pairs_sim->getPairsRuntime();
+    pairs_runtime->useDomain(forest);
+
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  1, 0, 0,     0, 13);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 1, 0,     0, 13);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 0, 1,     0, 13);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  -1, 0, 0,    0, 13);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  0, -1, 0,    0, 13);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  0, 0, -1,    0, 13);
+    pairs::create_sphere(pairs_runtime, 0.6, 0.6, 0.7,      -2, -2, 0,  1000, 0.05, 0, 0);
+    pairs::create_sphere(pairs_runtime, 0.4, 0.4, 0.68,    2, 2, 0,    1000, 0.05, 0, 0);

    pairs_sim->setup_sim(0.1, 0.1, 0.1, 0.1);
    pairs_sim->update_mass_and_inertia();
@@ -59,10 +60,8 @@ int main(int argc, char **argv) {
        pairs_sim->spring_dashpot(); 
        pairs_sim->euler(dt); 

-        pairs_sim->reset_volatiles(); 
-
-        pairs_sim->vtk_write("output/dem_sd_local", 0, pairs_sim->nlocal(), t, vtk_freq);
-        pairs_sim->vtk_write("output/dem_sd_ghost", pairs_sim->nlocal(), pairs_sim->size(), t, vtk_freq);
+        pairs::vtk_write_data(pairs_runtime, "output/sd_2_local", 0, pairs_sim->nlocal(), t, vtk_freq);
+        pairs::vtk_write_data(pairs_runtime, "output/sd_2_ghost", pairs_sim->nlocal(), pairs_sim->size(), t, vtk_freq);
    }

    pairs_sim->end();

--- a/examples/modular/sd_3_CPU.cpp
+++ b/examples/modular/sd_3_CPU.cpp
@@ -14,18 +14,19 @@ int main(int argc, char **argv) {
    pairs_sim->initialize();

    auto ac = std::make_shared<PairsAccessor>(pairs_sim.get());
+    
+    auto pairs_runtime = pairs_sim->getPairsRuntime();
+    pairs_runtime->initDomain(&argc, &argv, 0, 0, 0, 1, 1, 1);

-    pairs_sim->set_domain(argc, argv, 0, 0, 0, 1, 1, 1);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  1, 0, 0,     0, 13);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 1, 0,     0, 13);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 0, 1,     0, 13);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  -1, 0, 0,    0, 13);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  0, -1, 0,    0, 13);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  0, 0, -1,    0, 13);

-    pairs_sim->create_halfspace(0,0,0,  1, 0, 0,     0, 13);
-    pairs_sim->create_halfspace(0,0,0,  0, 1, 0,     0, 13);
-    pairs_sim->create_halfspace(0,0,0,  0, 0, 1,     0, 13);
-    pairs_sim->create_halfspace(1,1,1,  -1, 0, 0,    0, 13);
-    pairs_sim->create_halfspace(1,1,1,  0, -1, 0,    0, 13);
-    pairs_sim->create_halfspace(1,1,1,  0, 0, -1,    0, 13);
-
-    pairs::id_t pUid = pairs_sim->create_sphere(0.6, 0.6, 0.7,      0, 0, 0,  1000, 0.05, 0, 0);
-    pairs_sim->create_sphere(0.4, 0.4, 0.76,    2, 2, 0,    1000, 0.05, 0, 0);
+    pairs::id_t pUid = pairs::create_sphere(pairs_runtime ,0.6, 0.6, 0.7,      0, 0, 0,  1000, 0.05, 0, 0);
+    pairs::create_sphere(pairs_runtime, 0.4, 0.4, 0.76,    2, 2, 0,    1000, 0.05, 0, 0);

    MPI_Allreduce(MPI_IN_PLACE, &pUid, 1, MPI_LONG_LONG_INT, MPI_SUM, MPI_COMM_WORLD);

@@ -81,14 +82,13 @@ int main(int argc, char **argv) {
        // Euler
        //-------------------------------------------------------------------------------------------
        pairs_sim->euler(dt);
-        pairs_sim->reset_volatiles(); 

        // Communicate
        //-------------------------------------------------------------------------------------------
        pairs_sim->communicate(t);

-        pairs_sim->vtk_write("output/dem_sd_local", 0, ac->nlocal(), t, vtk_freq);
-        pairs_sim->vtk_write("output/dem_sd_ghost", ac->nlocal(), ac->size(), t, vtk_freq);
+        pairs::vtk_write_data(pairs_runtime, "output/sd_3_CPU_local", 0, ac->nlocal(), t, vtk_freq);
+        pairs::vtk_write_data(pairs_runtime, "output/sd_3_CPU_ghost", ac->nlocal(), ac->size(), t, vtk_freq);
    }

    pairs_sim->end();

--- a/examples/modular/sd_3_GPU.cu
+++ b/examples/modular/sd_3_GPU.cu
@@ -46,17 +46,18 @@ int main(int argc, char **argv) {
    // Create PairsAccessor after PairsSimulation is initialized
    auto ac = std::make_shared<PairsAccessor>(pairs_sim.get());

-    pairs_sim->set_domain(argc, argv, 0, 0, 0, 1, 1, 1);
+    auto pairs_runtime = pairs_sim->getPairsRuntime();
+    pairs_runtime->initDomain(&argc, &argv, 0, 0, 0, 1, 1, 1);

-    pairs_sim->create_halfspace(0,0,0,  1, 0, 0,     0, 13);
-    pairs_sim->create_halfspace(0,0,0,  0, 1, 0,     0, 13);
-    pairs_sim->create_halfspace(0,0,0,  0, 0, 1,     0, 13);
-    pairs_sim->create_halfspace(1,1,1,  -1, 0, 0,    0, 13);
-    pairs_sim->create_halfspace(1,1,1,  0, -1, 0,    0, 13);
-    pairs_sim->create_halfspace(1,1,1,  0, 0, -1,    0, 13);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  1, 0, 0,     0, 13);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 1, 0,     0, 13);
+    pairs::create_halfspace(pairs_runtime, 0,0,0,  0, 0, 1,     0, 13);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  -1, 0, 0,    0, 13);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  0, -1, 0,    0, 13);
+    pairs::create_halfspace(pairs_runtime, 1,1,1,  0, 0, -1,    0, 13);

-    pairs::id_t pUid = pairs_sim->create_sphere(0.6, 0.6, 0.7,      0, 0, 0,  1000, 0.05, 1, 0);
-    pairs_sim->create_sphere(0.4, 0.4, 0.76,    2, 2, 0,    1000, 0.05, 1, 0);
+    pairs::id_t pUid = pairs::create_sphere(pairs_runtime, 0.6, 0.6, 0.7,      0, 0, 0,  1000, 0.05, 1, 0);
+    pairs::create_sphere(pairs_runtime, 0.4, 0.4, 0.76,    2, 2, 0,    1000, 0.05, 1, 0);

    set_feature_properties(ac);

@@ -74,7 +75,6 @@ int main(int argc, char **argv) {
    int num_timesteps = 2000;
    int vtk_freq = 20;
    double dt = 1e-3;
-    pairs_sim->vtk_write_subdom("output/subdom", 0, 1);

    for (int t=0; t<num_timesteps; ++t){
        // Up-to-date uids might be on host or device. So sync uid in Host before accessing them from host
@@ -137,7 +137,6 @@ int main(int argc, char **argv) {
        // Euler
        //-------------------------------------------------------------------------------------------
        pairs_sim->euler(dt);
-        pairs_sim->reset_volatiles(); 

        // Communicate
        //-------------------------------------------------------------------------------------------
@@ -145,8 +144,8 @@ int main(int argc, char **argv) {
        // PairsAccessor requires an update when particles are communicated
        ac->update();

-        pairs_sim->vtk_write("output/dem_sd_local", 0, ac->nlocal(), t, vtk_freq);
-        pairs_sim->vtk_write("output/dem_sd_ghost", ac->nlocal(), ac->size(), t, vtk_freq);
+        pairs::vtk_write_data(pairs_runtime, "output/dem_sd_local", 0, ac->nlocal(), t, vtk_freq);
+        pairs::vtk_write_data(pairs_runtime, "output/dem_sd_ghost", ac->nlocal(), ac->size(), t, vtk_freq);
    }

    pairs_sim->end();

--- a/examples/modular/sd_4.cpp
+++ b/examples/modular/sd_4.cpp
@@ -40,7 +40,7 @@ int main(int argc, char **argv) {
    auto pairs_runtime = pairs_sim->getPairsRuntime();

    pairs_runtime->initDomain(&argc, &argv, 
-                    0, 20, 0, 20, 0, 20,    // Domain bounds
+                    0, 0, 0, 20, 20, 20,    // Domain bounds
                    false, false, false,    // PBCs --------------> TODO: runtime pbc
                    true                    // Enable dynamic load balancing (does initial refinement on a <1,1,1> blockforest)
                ); 

--- a/examples/modular/sd_sample_1_CPU_GPU.cpp
+++ b/examples/modular/sd_sample_1_CPU_GPU.cpp
-#include <iostream>
-#include <memory>
-
-#include "spring_dashpot.hpp"
-
-int main(int argc, char **argv) {
-
-    auto pairs_sim = std::make_shared<PairsSimulation>();
-    pairs_sim->initialize();
-
-    pairs_sim->set_domain(argc, argv, 0, 0, 0, 1, 1, 1);
-
-    pairs_sim->create_halfspace(0,0,0,  1, 0, 0,     0, 13);
-    pairs_sim->create_halfspace(0,0,0,  0, 1, 0,     0, 13);
-    pairs_sim->create_halfspace(0,0,0,  0, 0, 1,     0, 13);
-    pairs_sim->create_halfspace(1,1,1,  -1, 0, 0,    0, 13);
-    pairs_sim->create_halfspace(1,1,1,  0, -1, 0,    0, 13);
-    pairs_sim->create_halfspace(1,1,1,  0, 0, -1,    0, 13);
-    pairs_sim->create_sphere(0.6, 0.6, 0.7,      -2, -2, 0,  1000, 0.05, 0, 0);
-    pairs_sim->create_sphere(0.4, 0.4, 0.68,    2, 2, 0,    1000, 0.05, 0, 0);
-
-    pairs_sim->setup_sim();
-    pairs_sim->update_mass_and_inertia();
-
-    int num_timesteps = 2000;
-    int vtk_freq = 20;
-    double dt = 1e-3;
-
-    for (int t=0; t<num_timesteps; ++t){
-        if ((t%500==0) && pairs_sim->rank()==0) std::cout << "Timestep: " << t << std::endl;
-
-        pairs_sim->communicate(t);
-        
-        pairs_sim->update_cells(t); 
-
-        pairs_sim->gravity(); 
-        pairs_sim->spring_dashpot(); 
-        pairs_sim->euler(dt); 
-
-        pairs_sim->reset_volatiles(); 
-
-        pairs_sim->vtk_write("output/dem_sd_local", 0, pairs_sim->nlocal(), t, vtk_freq);
-        pairs_sim->vtk_write("output/dem_sd_ghost", pairs_sim->nlocal(), pairs_sim->size(), t, vtk_freq);
-    }
-
-    pairs_sim->end();
-}