diff --git a/.gitignore b/.gitignore
index fcbb18e667efb2eb74151572917015f302a33eff..a089e434420f04421987efc8324734d14708ea90 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,3 +21,7 @@ dem.cpp
 debug
 results
 scripts
+jobscripts
+fritz_settling_spheres
+/examples/dem_bzd_velocity_verlet_old.py
+/examples/dem_vel_ver.py
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ed0f398a55db732b9496c6c77f4da2a3ca4d2eb7..2743e5f54a29c95905d4feb841b8018df17091d3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,11 +6,14 @@ if(NOT CMAKE_BUILD_TYPE)
     set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build (Debug, Release, etc.)" FORCE)
 endif()
 
+# set(CMAKE_CXX_FLAGS "-O3 -mavx2 -mfma -fopenmp -DLIKWID_PERFMON")
 set(CMAKE_CXX_FLAGS_RELEASE "-O3")
 set(CMAKE_CXX_FLAGS_DEBUG "-g -O0 -DDEBUG")
 
 option(USE_WALBERLA "Enable waLBerla support for using BlockForest partitioning" ON)
 option(USE_MPI "USE_MPI" ON)
+option(USE_LIKWID "USE_LIKWID" OFF)
+option(BUILD_WITH_LIKWID_MARKERS "BUILD_WITH_LIKWID_MARKERS" OFF)
 option(COMPILE_CUDA "COMPILE_CUDA" ON)
 option(ENABLE_GPU_DIRECT "ENABLE_GPU_DIRECT" ON)
 option(GENERATE_WHOLE_PROGRAM "Generate the whole program (i.e. including the 'main' function). No additional source files are needed." OFF)
@@ -60,7 +63,8 @@ set(RUNTIME_COMMON_FILES
     runtime/thermo.cpp
     runtime/timing.cpp
     runtime/vtk.cpp
-    runtime/domain/regular_6d_stencil.cpp)
+    runtime/domain/regular_6d_stencil.cpp
+    runtime/likwid-marker.h)
 
 #================================================================================
 # PAIRS_TARGET ==================================================================
@@ -75,6 +79,8 @@ set(PAIRS_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR})
 # The target can either be an executable or a static library
 if(GENERATE_WHOLE_PROGRAM OR BUILD_APP)
     add_executable(${PAIRS_TARGET} ${RUNTIME_COMMON_FILES})
+    # target_compile_options(${PAIRS_TARGET} PRIVATE -DLIKWID_PERFMON -L${LIKWID_DIR}/lib -llikwid)
+
 else()
     add_library(${PAIRS_TARGET} STATIC ${RUNTIME_COMMON_FILES})
     list(APPEND PAIRS_LINK_LIBRARIES ${PAIRS_TARGET})
@@ -222,12 +228,53 @@ endif()
 #================================================================================
 # LIKWID ========================================================================
 #================================================================================
-if(LIKWID_DIR)
-    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLIKWID_PERFMON -pthread -L${LIKWID_DIR}/lib -llikwid ")
-    include_directories(${LIKWID_DIR}/include)
-    list(APPEND PAIRS_INCLUDE_DIRS "${LIKWID_DIR}/include")
+if(USE_LIKWID)
+    find_package(Likwid REQUIRED)
+    if (BUILD_WITH_LIKWID_MARKERS)
+        message(STATUS "Building with LIKWID markers")
+        target_compile_definitions(${PAIRS_TARGET} PRIVATE -DLIKWID_PERFMON)
+        target_include_directories(${PAIRS_TARGET} PRIVATE ${LIKWID_INCLUDE_DIR})
+        target_link_libraries(${PAIRS_TARGET} PRIVATE ${LIKWID_LIBRARY})
+    endif()
+    if(NOT LIKWID_INCLUDE_DIR OR NOT LIKWID_LIBRARY)
+        message(FATAL_ERROR "Likwid include directory or library not found.")
+    endif()
 endif()
 
+
+
+# if(USE_LIKWID)
+#     message(STATUS "USE_LIKWID is enabled")
+#     # find_package(likwid REQUIRED)
+#     if (BUILD_WITH_LIKWID_MARKERS)
+#         message(STATUS "Building with LIKWID markers")
+
+#         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLIKWID_PERFMON -I${LIKWID_DIR}/include -L${LIKWID_DIR}/lib -llikwid")
+
+#         # add_definitions("-DLIKWID_PERFMON")
+    
+#         # include_directories(${LIKWID_INCLUDE_DIR})
+        
+#         # # message(STATUS "LIKWID LIBRARIES: ${LIKWID_LIBRARIES}")
+
+#         # target_link_libraries(${PAIRS_TARGET} PUBLIC ${LIKWID_LIBRARIES})
+#         # list(APPEND PAIRS_INCLUDE_DIRS "${LIKWID_INCLUDE_DIR}")
+#         # list(APPEND PAIRS_LINK_LIBRARIES "${LIKWID_LIBRARIES}")
+
+#         list(APPEND PAIRS_INCLUDE_DIRS "${LIKWID_DIR}/include")
+#         list(APPEND PAIRS_LINK_LIBRARIES "${LIKWID_DIR}/lib")
+
+#         # message(STATUS "PAIRS INCLUDE DIRECTORIES: ${PAIRS_INCLUDE_DIRS}")
+#         # message(STATUS "PAIRS LINK LIBRARIES: ${PAIRS_LINK_LIBRARIES}")
+
+#         # message(STATUS "CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
+
+#     endif()
+# endif()
+
+message(STATUS "CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
+
+
 #================================================================================
 # config file ===================================================================
 #================================================================================
diff --git a/data/planes2.input b/data/planes2.input
new file mode 100644
index 0000000000000000000000000000000000000000..0ce1c7f28af335b4bc8d92f46a50c1cb4b4ab3d9
--- /dev/null
+++ b/data/planes2.input
@@ -0,0 +1,2 @@
+1,0.0,0.0,0.0,0.0,0.0,1.0,13
+1,0.8,0.015,0.2,0.0,0.0,-1.0,13
diff --git a/examples/dem_bzd_euler.py b/examples/dem_bzd_euler.py
new file mode 100644
index 0000000000000000000000000000000000000000..23d56ac7f945488befc7831ffd58baead0c9e338
--- /dev/null
+++ b/examples/dem_bzd_euler.py
@@ -0,0 +1,251 @@
+import math
+import pairs
+import sys
+import os
+
+
+def update_mass_and_inertia(i):
+    rotation_matrix[i] = diagonal_matrix(1.0)
+    rotation_quat[i] = default_quaternion()
+
+    if is_sphere(i):
+        inv_inertia[i] = inversed(diagonal_matrix(0.4 * mass[i] * radius[i] * radius[i]))
+        inertia[i] = diagonal_matrix(0.4 * mass[i] * radius[i] * radius[i])
+
+    else:
+        mass[i] = infinity
+        inv_inertia[i] = 0.0
+
+
+def linear_spring_dashpot(i, j):
+    delta_ij = -penetration_depth(i, j)
+    skip_when(delta_ij < 0.0)
+
+    meff = 1.0 / ((1.0 / mass[i]) + (1.0 / mass[j]))
+    stiffness_norm = meff * (pi * pi + lnDryResCoeff * lnDryResCoeff) / \
+                     (collisionTime_SI * collisionTime_SI)
+    damping_norm = -2.0 * meff * lnDryResCoeff / collisionTime_SI
+    damping_tan = sqrt(kappa) * damping_norm
+
+    velocity_wf_i = linear_velocity[i] + cross(angular_velocity[i], contact_point(i, j) - position[i])
+    velocity_wf_j = linear_velocity[j] + cross(angular_velocity[j], contact_point(i, j) - position[j])
+
+    rel_vel = -(velocity_wf_i - velocity_wf_j)
+    rel_vel_n = dot(rel_vel, contact_normal(i, j))
+    rel_vel_t = rel_vel - rel_vel_n * contact_normal(i, j)
+    fNabs = stiffness_norm * delta_ij + damping_norm * rel_vel_n
+    fN = fNabs * contact_normal(i, j)
+
+    fTabs = min(damping_tan * length(rel_vel_t), friction_dynamic[i, j] * fNabs)
+    fT = fTabs * normalized(rel_vel_t)
+
+    partial_force = fN + fT
+
+    apply(force, partial_force)
+    apply(torque, cross(contact_point(i, j) - position, partial_force))
+
+
+def euler(i):
+    inv_mass = 1.0 / mass[i]
+    position[i] += 0.5 * inv_mass * force[i] * dt * dt + linear_velocity[i] * dt
+    linear_velocity[i] += inv_mass * force[i] * dt
+    wdot = rotation_matrix[i] * (inv_inertia[i] * torque[i]) * transposed(rotation_matrix[i])
+    phi = angular_velocity[i] * dt + 0.5 * wdot * dt * dt
+    rotation_quat[i] = quaternion(phi, length(phi)) * rotation_quat[i]
+    rotation_matrix[i] = quaternion_to_rotation_matrix(rotation_quat[i])
+    angular_velocity[i] += wdot * dt
+    
+def velocity_verlet_pre_force_update(i):
+    inv_mass = 1.0 / mass[i]
+    position[i] += linear_velocity[i] * dt + 0.5 * inv_mass * old_force[i] * dt * dt
+    wdot = rotation_matrix[i] * (inv_inertia[i] * old_torque[i]) * transposed(rotation_matrix[i])
+    phi = angular_velocity[i] * dt + 0.5 * wdot * dt * dt
+    rotation_quat[i] = quaternion(phi, length(phi)) * rotation_quat[i]
+    rotation_matrix[i] = quaternion_to_rotation_matrix(rotation_quat[i])
+    
+
+def velocity_post_force_update(i):
+    inv_mass = 1.0 / mass[i]
+    linear_velocity[i] += inv_mass * 0.5 * (old_force[i] + force[i]) * dt
+    torque_avg = 0.5 * (old_torque[i] + torque[i])
+    wdot = rotation_matrix[i] * (inv_inertia[i] * torque_avg) * transposed(rotation_matrix[i])
+    angular_velocity[i] += wdot * dt
+    
+    old_force[i] = force[i]
+    old_torque[i] = torque[i]
+    
+    
+
+def gravity(i):
+    volume = (4.0 / 3.0) * pi * radius[i] * radius[i] * radius[i]
+    force[i][2] = force[i][2] - (densityParticle_SI - densityFluid_SI) * volume * gravity_SI
+
+# def energy_cal(i):
+#     kinetic_energy[i] += 0.5 * mass[i] * dot(linear_velocity[i], linear_velocity[i])
+#     potential_energy[i] += mass[i] * position[i][2] * gravity_SI
+#     rotational_energy[i] += 0.5 * dot( angular_velocity[i], inertia[i] * angular_velocity[i])
+#     #* rotational_matrix[i] * (inv_inertia[i] * angular_velocity[i] * transposed(rotation_matrix[i]))
+#     # rotational_energy[i] = 0.5 * (angular_velocity[i][0]    
+#     total_energy[i] += kinetic_energy[i] + potential_energy[i] + rotational_energy[i]
+
+    
+
+# def energy_cal(i):
+#     kinetic_energy[i] = 0.5 * mass[i] * (linear_velocity[i] * linear_velocity[i])s
+#     kinetic_energy[i] += 0.5 * angular_velocity[i] * (rotation_matrix[i] * (inv_inertia[i] * transposed(rotation_matrix[i])) * angular_velocity[i])
+#     potential_energy[i] = mass[i] * position[i] * gravity_SI
+
+    # energy[i] = kinetic_energy[i] + potential_energy[i]
+
+
+cmd = sys.argv[0]
+target = sys.argv[1] if len(sys.argv[1]) > 1 else "none"
+if target != 'cpu' and target != 'gpu':
+    print(f"Invalid target, use {cmd} <cpu/gpu>")
+
+# Config file parameters
+domainSize_SI=[0.1, 0.1, 0.1]
+#domainSize_SI = [0.4, 0.4, 0.2] # node base
+#domainSize_SI = [0.6, 0.6, 0.2] # node base
+#domainSize_SI = [0.8, 0.8, 0.2] # node base
+diameter_SI = 0.0029
+gravity_SI = 9.81
+densityFluid_SI = 1000
+densityParticle_SI = 2550
+generationSpacing_SI = 0.005
+initialVelocity_SI = 1
+dt_SI = 5e-5
+frictionCoefficient = 0.5
+restitutionCoefficient = 0.1
+collisionTime_SI = 5e-4
+poissonsRatio = 0.22
+timeSteps = 10000
+#timeSteps = 1000
+visSpacing = 2000
+denseBottomLayer = False
+bottomLayerOffsetFactor = 1.0
+kappa = 2.0 * (1.0 - poissonsRatio) / (2.0 - poissonsRatio) # from Thornton et al
+
+minDiameter_SI = diameter_SI * 0.9
+maxDiameter_SI = diameter_SI * 1.1
+linkedCellWidth = 1.01 * maxDiameter_SI
+
+ntypes = 1
+lnDryResCoeff = math.log(restitutionCoefficient)
+frictionStatic = 0.0
+frictionDynamic = frictionCoefficient
+
+file_name = os.path.basename(__file__)
+file_name_without_extension = os.path.splitext(file_name)[0]
+
+psim = pairs.simulation(
+    file_name_without_extension,
+    [pairs.sphere(), pairs.halfspace()],
+    timesteps=timeSteps,
+    double_prec=True,
+    use_contact_history=False,
+    particle_capacity=1000000,
+    neighbor_capacity=20,
+    debug=True, generate_whole_program=True)
+
+if target == 'gpu':
+    psim.target(pairs.target_gpu())
+else:
+    psim.target(pairs.target_cpu())
+    #psim.target(pairs.target_cpu(parallel=True))
+
+psim.add_position('position')
+psim.add_property('mass', pairs.real(), 1.0)
+psim.add_property('linear_velocity', pairs.vector())
+psim.add_property('angular_velocity', pairs.vector())
+psim.add_property('force', pairs.vector(), volatile=True)
+psim.add_property('torque', pairs.vector(), volatile=True)
+psim.add_property('old_force', pairs.vector(), volatile=True) #old force added
+psim.add_property('old_torque', pairs.vector(), volatile=True)
+psim.add_property('radius', pairs.real(), 1.0)
+psim.add_property('normal', pairs.vector())
+psim.add_property('inv_inertia', pairs.matrix())
+psim.add_property('inertia', pairs.matrix())
+psim.add_property('rotation_matrix', pairs.matrix())
+psim.add_property('rotation_quat', pairs.quaternion())
+psim.add_feature('type', ntypes)
+psim.add_feature_property('type', 'friction_static', pairs.real(), [frictionStatic for i in range(ntypes * ntypes)])
+psim.add_feature_property('type', 'friction_dynamic', pairs.real(), [frictionDynamic for i in range(ntypes * ntypes)])
+
+# psim.add_property('kinetic_energy', pairs.real(), volatile=True)
+# psim.add_property('rotational_energy', pairs.real(), volatile=True)
+# psim.add_property('potential_energy', pairs.real(), volatile=True)
+# psim.add_property('total_energy', pairs.real(), volatile=True)
+# psim.add_contact_property('is_sticking', pairs.int32(), 0)
+# psim.add_contact_property('tangential_spring_displacement', pairs.vector(), [0.0, 0.0, 0.0])
+# psim.add_contact_property('impact_velocity_magnitude', pairs.real(), 0.0)
+
+psim.set_domain([0.0, 0.0, 0.0, domainSize_SI[0], domainSize_SI[1], domainSize_SI[2]])
+# psim.set_domain_partitioner(pairs.block_forest())
+psim.set_domain_partitioner(pairs.regular_domain_partitioner())
+psim.pbc([False, False, False])
+psim.dem_sc_grid(
+    domainSize_SI[0], domainSize_SI[1], domainSize_SI[2], generationSpacing_SI,
+    diameter_SI, minDiameter_SI, maxDiameter_SI, initialVelocity_SI, densityParticle_SI, ntypes)
+
+#psim.read_particle_data(
+#    "data/spheres.input",
+#    "data/spheres_4x4x2.input",
+#    "data/spheres_6x6x2.input",
+#    "data/spheres_8x8x2.input",
+#    ['uid', 'type', 'mass', 'radius', 'position', 'linear_velocity', 'flags'],
+#    pairs.sphere())
+
+#psim.read_particle_data(
+#    "data/spheres_bottom.input",
+#    ['type', 'mass', 'radius', 'position', 'linear_velocity', 'flags'],
+#    pairs.sphere())
+
+psim.read_particle_data(
+    "data/planes2.input",
+    # ['uid', 'type', 'mass', 'position', 'normal', 'flags'],
+     ['type', 'mass', 'position', 'normal', 'flags'],
+    pairs.halfspace())
+
+psim.setup(update_mass_and_inertia, {'densityParticle_SI': densityParticle_SI,
+                                     'pi': math.pi,
+                                     'infinity': math.inf })
+
+# psim.compute(velocity_verlet_pre_force_update, symbols={'dt': dt_SI})
+#psim.compute_half()
+psim.build_cell_lists(linkedCellWidth)
+psim.vtk_output(f"output/dem_{target}", frequency=visSpacing)
+
+psim.compute(gravity,
+             symbols={'densityParticle_SI': densityParticle_SI,
+                      'densityFluid_SI': densityFluid_SI,
+                      'gravity_SI': gravity_SI,
+                      'pi': math.pi })
+
+psim.compute(linear_spring_dashpot,
+             linkedCellWidth,
+             symbols={'dt': dt_SI,
+                      'pi': math.pi,
+                      'kappa': kappa,
+                      'lnDryResCoeff': lnDryResCoeff,
+                      'collisionTime_SI': collisionTime_SI})
+
+psim.compute(euler, symbols={'dt': dt_SI})
+# psim.compute(velocity_post_force_update, symbols={'dt': dt_SI})
+# psim.compute(kinetic_energy_cal, symbols={'dt': dt_SI})
+# psim.compute(potential_energy_cal, symbols={'dt': dt_SI, 'gravity_SI': gravity_SI,})
+# psim.compute(energy_cal, symbols={'gravity_SI': gravity_SI})
+# kinetic_energies = []
+# potential_energies = []
+# total_energies = []
+
+# ke = kinetic_energy()
+# pe = potential_energy()
+# te = ke + pe
+
+# kinetic_energies.append(ke)
+# potential_energies.append(pe)   
+# total_energies.append(te)
+
+psim.generate()
+
diff --git a/examples/dem_bzd_velocity_verlet.py b/examples/dem_bzd_velocity_verlet.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1dd3a7c7fa9c1c14fc2e845444d45374a9eb80d
--- /dev/null
+++ b/examples/dem_bzd_velocity_verlet.py
@@ -0,0 +1,253 @@
+import math
+import pairs
+import sys
+import os
+
+def update_mass_and_inertia(i):
+    rotation_matrix[i] = diagonal_matrix(1.0)
+    rotation_quat[i] = default_quaternion()
+
+    if is_sphere(i):
+        inv_inertia[i] = inversed(diagonal_matrix(0.4 * mass[i] * radius[i] * radius[i]))
+        inertia[i] = diagonal_matrix(0.4 * mass[i] * radius[i] * radius[i])
+
+    else:
+        mass[i] = infinity
+        inv_inertia[i] = 0.0
+
+
+def linear_spring_dashpot(i, j):
+    delta_ij = -penetration_depth(i, j)
+    skip_when(delta_ij < 0.0)
+
+    meff = 1.0 / ((1.0 / mass[i]) + (1.0 / mass[j]))
+    stiffness_norm = meff * (pi * pi + lnDryResCoeff * lnDryResCoeff) / \
+                     (collisionTime_SI * collisionTime_SI)
+    damping_norm = -2.0 * meff * lnDryResCoeff / collisionTime_SI
+    damping_tan = sqrt(kappa) * damping_norm
+
+    velocity_wf_i = linear_velocity[i] + cross(angular_velocity[i], contact_point(i, j) - position[i])
+    velocity_wf_j = linear_velocity[j] + cross(angular_velocity[j], contact_point(i, j) - position[j])
+
+    rel_vel = -(velocity_wf_i - velocity_wf_j)
+    rel_vel_n = dot(rel_vel, contact_normal(i, j))
+    rel_vel_t = rel_vel - rel_vel_n * contact_normal(i, j)
+    fNabs = stiffness_norm * delta_ij + damping_norm * rel_vel_n
+    fN = fNabs * contact_normal(i, j)
+
+    fTabs = min(damping_tan * length(rel_vel_t), friction_dynamic[i, j] * fNabs)
+    fT = fTabs * normalized(rel_vel_t)
+
+    partial_force = fN + fT
+
+    apply(force, partial_force)
+    apply(torque, cross(contact_point(i, j) - position, partial_force))
+
+
+# def euler(i):
+#     inv_mass = 1.0 / mass[i]
+#     position[i] += 0.5 * inv_mass * force[i] * dt * dt + linear_velocity[i] * dt
+#     linear_velocity[i] += inv_mass * force[i] * dt
+#     wdot = rotation_matrix[i] * (inv_inertia[i] * torque[i]) * transposed(rotation_matrix[i])
+#     phi = angular_velocity[i] * dt + 0.5 * wdot * dt * dt
+#     rotation_quat[i] = quaternion(phi, length(phi)) * rotation_quat[i]
+#     rotation_matrix[i] = quaternion_to_rotation_matrix(rotation_quat[i])
+#     angular_velocity[i] += wdot * dt
+    
+def velocity_verlet_pre_force_update(i):
+    inv_mass = 1.0 / mass[i]
+    position[i] += linear_velocity[i] * dt + 0.5 * inv_mass * old_force[i] * dt * dt
+    wdot = rotation_matrix[i] * (inv_inertia[i] * old_torque[i]) * transposed(rotation_matrix[i])
+    phi = angular_velocity[i] * dt + 0.5 * wdot * dt * dt
+    rotation_quat[i] = quaternion(phi, length(phi)) * rotation_quat[i]
+    rotation_matrix[i] = quaternion_to_rotation_matrix(rotation_quat[i])
+    
+
+def velocity_post_force_update(i):
+    inv_mass = 1.0 / mass[i]
+    linear_velocity[i] += inv_mass * 0.5 * (old_force[i] + force[i]) * dt
+    torque_avg = 0.5 * (old_torque[i] + torque[i])
+    wdot = rotation_matrix[i] * (inv_inertia[i] * torque_avg) * transposed(rotation_matrix[i])
+    angular_velocity[i] += wdot * dt
+    
+    old_force[i] = force[i]
+    old_torque[i] = torque[i]
+    
+    
+
+def gravity(i):
+    volume = (4.0 / 3.0) * pi * radius[i] * radius[i] * radius[i]
+    force[i][2] = force[i][2] - (densityParticle_SI - densityFluid_SI) * volume * gravity_SI
+
+# def energy_cal(i):
+#     kinetic_energy[i] += 0.5 * mass[i] * dot(linear_velocity[i], linear_velocity[i])
+#     potential_energy[i] += mass[i] * position[i][2] * gravity_SI
+#     rotational_energy[i] += 0.5 * dot( angular_velocity[i], inertia[i] * angular_velocity[i])
+#     #* rotational_matrix[i] * (inv_inertia[i] * angular_velocity[i] * transposed(rotation_matrix[i]))
+
+    
+#     # rotational_energy[i] = 0.5 * (angular_velocity[i][0]    
+#     total_energy[i] += kinetic_energy[i] + potential_energy[i] + rotational_energy[i]
+
+    
+
+# def energy_cal(i):
+#     kinetic_energy[i] = 0.5 * mass[i] * (linear_velocity[i] * linear_velocity[i])s
+#     kinetic_energy[i] += 0.5 * angular_velocity[i] * (rotation_matrix[i] * (inv_inertia[i] * transposed(rotation_matrix[i])) * angular_velocity[i])
+#     potential_energy[i] = mass[i] * position[i] * gravity_SI
+
+    # energy[i] = kinetic_energy[i] + potential_energy[i]
+
+
+cmd = sys.argv[0]
+target = sys.argv[1] if len(sys.argv[1]) > 1 else "none"
+if target != 'cpu' and target != 'gpu':
+    print(f"Invalid target, use {cmd} <cpu/gpu>")
+
+# Config file parameters
+# domainSize_SI=[0.1, 0.1, 0.1]
+domainSize_SI = [0.4, 0.4, 0.2] # node base
+#domainSize_SI = [0.6, 0.6, 0.2] # node base
+#domainSize_SI = [0.8, 0.8, 0.2] # node base
+diameter_SI = 0.0029
+gravity_SI = 9.81
+densityFluid_SI = 1000
+densityParticle_SI = 2550
+generationSpacing_SI = 0.005
+initialVelocity_SI = 1
+dt_SI = 5e-5
+frictionCoefficient = 0.5
+restitutionCoefficient = 0.1
+collisionTime_SI = 5e-4
+poissonsRatio = 0.22
+timeSteps = 10000
+#timeSteps = 1000
+visSpacing = 1000
+denseBottomLayer = False
+bottomLayerOffsetFactor = 1.0
+kappa = 2.0 * (1.0 - poissonsRatio) / (2.0 - poissonsRatio) # from Thornton et al
+
+minDiameter_SI = diameter_SI * 0.9
+maxDiameter_SI = diameter_SI * 1.1
+linkedCellWidth = 1.01 * maxDiameter_SI
+
+ntypes = 1
+lnDryResCoeff = math.log(restitutionCoefficient);
+frictionStatic = 0.0
+frictionDynamic = frictionCoefficient
+
+file_name = os.path.basename(__file__)
+file_name_without_extension = os.path.splitext(file_name)[0]
+
+
+psim = pairs.simulation(
+    # "dem_bzd_velocity_verlet",
+    file_name_without_extension,
+    [pairs.sphere(), pairs.halfspace()],
+    timesteps=timeSteps,
+    double_prec=True,
+    use_contact_history=False,
+    particle_capacity=1000000,
+    neighbor_capacity=20,
+     debug=True, generate_whole_program=True)
+
+if target == 'gpu':
+    psim.target(pairs.target_gpu())
+else:
+    psim.target(pairs.target_cpu())
+    #psim.target(pairs.target_cpu(parallel=True))
+
+psim.add_position('position')
+psim.add_property('mass', pairs.real(), 1.0)
+psim.add_property('linear_velocity', pairs.vector())
+psim.add_property('angular_velocity', pairs.vector())
+psim.add_property('force', pairs.vector(), volatile=True)
+psim.add_property('torque', pairs.vector(), volatile=True)
+psim.add_property('old_force', pairs.vector(), volatile=True) #old force added
+psim.add_property('old_torque', pairs.vector(), volatile=True)
+psim.add_property('radius', pairs.real(), 1.0)
+psim.add_property('normal', pairs.vector())
+psim.add_property('inv_inertia', pairs.matrix())
+psim.add_property('inertia', pairs.matrix())
+psim.add_property('rotation_matrix', pairs.matrix())
+psim.add_property('rotation_quat', pairs.quaternion())
+psim.add_feature('type', ntypes)
+psim.add_feature_property('type', 'friction_static', pairs.real(), [frictionStatic for i in range(ntypes * ntypes)])
+psim.add_feature_property('type', 'friction_dynamic', pairs.real(), [frictionDynamic for i in range(ntypes * ntypes)])
+
+# psim.add_property('kinetic_energy', pairs.real(), volatile=True)
+# psim.add_property('rotational_energy', pairs.real(), volatile=True)
+# psim.add_property('potential_energy', pairs.real(), volatile=True)
+# psim.add_property('total_energy', pairs.real(), volatile=True)
+# # psim.add_contact_property('is_sticking', pairs.int32(), 0)
+# psim.add_contact_property('tangential_spring_displacement', pairs.vector(), [0.0, 0.0, 0.0])
+# psim.add_contact_property('impact_velocity_magnitude', pairs.real(), 0.0)
+
+psim.set_domain([0.0, 0.0, 0.0, domainSize_SI[0], domainSize_SI[1], domainSize_SI[2]])
+psim.set_domain_partitioner(pairs.block_forest())
+# psim.set_domain_partitioner(pairs.regular_domain_partitioner())
+psim.pbc([True, True, False])
+psim.dem_sc_grid(
+    domainSize_SI[0], domainSize_SI[1], domainSize_SI[2], generationSpacing_SI,
+    diameter_SI, minDiameter_SI, maxDiameter_SI, initialVelocity_SI, densityParticle_SI, ntypes)
+
+#psim.read_particle_data(
+#    "data/spheres.input",
+#    "data/spheres_4x4x2.input",
+#    "data/spheres_6x6x2.input",
+#    "data/spheres_8x8x2.input",
+#    ['uid', 'type', 'mass', 'radius', 'position', 'linear_velocity', 'flags'],
+#    pairs.sphere())
+
+#psim.read_particle_data(
+#    "data/spheres_bottom.input",
+#    ['type', 'mass', 'radius', 'position', 'linear_velocity', 'flags'],
+#    pairs.sphere())
+
+# psim.read_particle_data(
+#     "data/planes.input",
+#     ['uid', 'type', 'mass', 'position', 'normal', 'flags'],
+#     pairs.halfspace())
+
+psim.setup(update_mass_and_inertia, {'densityParticle_SI': densityParticle_SI,
+                                     'pi': math.pi,
+                                     'infinity': math.inf })
+
+psim.compute(velocity_verlet_pre_force_update, symbols={'dt': dt_SI})
+#psim.compute_half()
+psim.build_cell_lists(linkedCellWidth)
+psim.vtk_output(f"output/dem_{target}", frequency=visSpacing)
+
+psim.compute(gravity,
+             symbols={'densityParticle_SI': densityParticle_SI,
+                      'densityFluid_SI': densityFluid_SI,
+                      'gravity_SI': gravity_SI,
+                      'pi': math.pi })
+
+psim.compute(linear_spring_dashpot,
+             linkedCellWidth,
+             symbols={'dt': dt_SI,
+                      'pi': math.pi,
+                      'kappa': kappa,
+                      'lnDryResCoeff': lnDryResCoeff,
+                      'collisionTime_SI': collisionTime_SI})
+
+# psim.compute(euler, symbols={'dt': dt_SI})
+psim.compute(velocity_post_force_update, symbols={'dt': dt_SI})
+# psim.compute(kinetic_energy_cal, symbols={'dt': dt_SI})
+# psim.compute(potential_energy_cal, symbols={'dt': dt_SI, 'gravity_SI': gravity_SI,})
+# psim.compute(energy_cal, symbols={'gravity_SI': gravity_SI})
+# kinetic_energies = []
+# potential_energies = []
+# total_energies = []
+
+# ke = kinetic_energy()
+# pe = potential_energy()
+# te = ke + pe
+
+# kinetic_energies.append(ke)
+# potential_energies.append(pe)   
+# total_energies.append(te)
+
+psim.generate()
+
diff --git a/examples/dem_bzd_velocity_verlet_old.py b/examples/dem_bzd_velocity_verlet_old.py
new file mode 100644
index 0000000000000000000000000000000000000000..eccb7ea6aed2f2efc711a5a8dc69356130b5514c
--- /dev/null
+++ b/examples/dem_bzd_velocity_verlet_old.py
@@ -0,0 +1,251 @@
+import math
+import pairs
+import sys
+import os
+
+
+def update_mass_and_inertia(i):
+    rotation_matrix[i] = diagonal_matrix(1.0)
+    rotation_quat[i] = default_quaternion()
+
+    if is_sphere(i):
+        inv_inertia[i] = inversed(diagonal_matrix(0.4 * mass[i] * radius[i] * radius[i]))
+        inertia[i] = diagonal_matrix(0.4 * mass[i] * radius[i] * radius[i])
+
+    else:
+        mass[i] = infinity
+        inv_inertia[i] = 0.0
+
+
+def linear_spring_dashpot(i, j):
+    delta_ij = -penetration_depth(i, j)
+    skip_when(delta_ij < 0.0)
+
+    meff = 1.0 / ((1.0 / mass[i]) + (1.0 / mass[j]))
+    stiffness_norm = meff * (pi * pi + lnDryResCoeff * lnDryResCoeff) / \
+                     (collisionTime_SI * collisionTime_SI)
+    damping_norm = -2.0 * meff * lnDryResCoeff / collisionTime_SI
+    damping_tan = sqrt(kappa) * damping_norm
+
+    velocity_wf_i = linear_velocity[i] + cross(angular_velocity[i], contact_point(i, j) - position[i])
+    velocity_wf_j = linear_velocity[j] + cross(angular_velocity[j], contact_point(i, j) - position[j])
+
+    rel_vel = -(velocity_wf_i - velocity_wf_j)
+    rel_vel_n = dot(rel_vel, contact_normal(i, j))
+    rel_vel_t = rel_vel - rel_vel_n * contact_normal(i, j)
+    fNabs = stiffness_norm * delta_ij + damping_norm * rel_vel_n
+    fN = fNabs * contact_normal(i, j)
+
+    fTabs = min(damping_tan * length(rel_vel_t), friction_dynamic[i, j] * fNabs)
+    fT = fTabs * normalized(rel_vel_t)
+
+    partial_force = fN + fT
+
+    apply(force, partial_force)
+    apply(torque, cross(contact_point(i, j) - position, partial_force))
+
+
+# def euler(i):
+#     inv_mass = 1.0 / mass[i]
+#     position[i] += 0.5 * inv_mass * force[i] * dt * dt + linear_velocity[i] * dt
+#     linear_velocity[i] += inv_mass * force[i] * dt
+#     wdot = rotation_matrix[i] * (inv_inertia[i] * torque[i]) * transposed(rotation_matrix[i])
+#     phi = angular_velocity[i] * dt + 0.5 * wdot * dt * dt
+#     rotation_quat[i] = quaternion(phi, length(phi)) * rotation_quat[i]
+#     rotation_matrix[i] = quaternion_to_rotation_matrix(rotation_quat[i])
+#     angular_velocity[i] += wdot * dt
+    
+def velocity_verlet_pre_force_update(i):
+    inv_mass = 1.0 / mass[i]
+    position[i] += linear_velocity[i] * dt + 0.5 * inv_mass * old_force[i] * dt * dt
+    wdot = rotation_matrix[i] * (inv_inertia[i] * old_torque[i]) * transposed(rotation_matrix[i])
+    phi = angular_velocity[i] * dt + 0.5 * wdot * dt * dt
+    rotation_quat[i] = quaternion(phi, length(phi)) * rotation_quat[i]
+    rotation_matrix[i] = quaternion_to_rotation_matrix(rotation_quat[i])
+    
+
+def velocity_post_force_update(i):
+    inv_mass = 1.0 / mass[i]
+    linear_velocity[i] += inv_mass * 0.5 * (old_force[i] + force[i]) * dt
+    torque_avg = 0.5 * (old_torque[i] + torque[i])
+    wdot = rotation_matrix[i] * (inv_inertia[i] * torque_avg) * transposed(rotation_matrix[i])
+    angular_velocity[i] += wdot * dt
+    
+    old_force[i] = force[i]
+    old_torque[i] = torque[i]
+    
+    
+
+def gravity(i):
+    volume = (4.0 / 3.0) * pi * radius[i] * radius[i] * radius[i]
+    force[i][2] = force[i][2] - (densityParticle_SI - densityFluid_SI) * volume * gravity_SI
+
+# def energy_cal(i):
+#     kinetic_energy[i] += 0.5 * mass[i] * dot(linear_velocity[i], linear_velocity[i])
+#     potential_energy[i] += mass[i] * position[i][2] * gravity_SI
+#     rotational_energy[i] += 0.5 * dot( angular_velocity[i], inertia[i] * angular_velocity[i])
+#     #* rotational_matrix[i] * (inv_inertia[i] * angular_velocity[i] * transposed(rotation_matrix[i]))
+#     # rotational_energy[i] = 0.5 * (angular_velocity[i][0]    
+#     total_energy[i] += kinetic_energy[i] + potential_energy[i] + rotational_energy[i]
+
+    
+
+# def energy_cal(i):
+#     kinetic_energy[i] = 0.5 * mass[i] * (linear_velocity[i] * linear_velocity[i])s
+#     kinetic_energy[i] += 0.5 * angular_velocity[i] * (rotation_matrix[i] * (inv_inertia[i] * transposed(rotation_matrix[i])) * angular_velocity[i])
+#     potential_energy[i] = mass[i] * position[i] * gravity_SI
+
+    # energy[i] = kinetic_energy[i] + potential_energy[i]
+
+
+cmd = sys.argv[0]
+target = sys.argv[1] if len(sys.argv[1]) > 1 else "none"
+if target != 'cpu' and target != 'gpu':
+    print(f"Invalid target, use {cmd} <cpu/gpu>")
+
+# Config file parameters
+domainSize_SI=[0.1, 0.1, 0.1]
+#domainSize_SI = [0.4, 0.4, 0.2] # node base
+#domainSize_SI = [0.6, 0.6, 0.2] # node base
+#domainSize_SI = [0.8, 0.8, 0.2] # node base
+diameter_SI = 0.0029
+gravity_SI = 9.81
+densityFluid_SI = 1000
+densityParticle_SI = 2550
+generationSpacing_SI = 0.005
+initialVelocity_SI = 1
+dt_SI = 5e-5
+frictionCoefficient = 0.5
+restitutionCoefficient = 0.1
+collisionTime_SI = 5e-4
+poissonsRatio = 0.22
+timeSteps = 10000
+#timeSteps = 1000
+visSpacing = 2000
+denseBottomLayer = False
+bottomLayerOffsetFactor = 1.0
+kappa = 2.0 * (1.0 - poissonsRatio) / (2.0 - poissonsRatio) # from Thornton et al
+
+minDiameter_SI = diameter_SI * 0.9
+maxDiameter_SI = diameter_SI * 1.1
+linkedCellWidth = 1.01 * maxDiameter_SI
+
+ntypes = 1
+lnDryResCoeff = math.log(restitutionCoefficient)
+frictionStatic = 0.0
+frictionDynamic = frictionCoefficient
+
+file_name = os.path.basename(__file__)
+file_name_without_extension = os.path.splitext(file_name)[0]
+
+psim = pairs.simulation(
+    file_name_without_extension,
+    [pairs.sphere(), pairs.halfspace()],
+    timesteps=timeSteps,
+    double_prec=True,
+    use_contact_history=False,
+    particle_capacity=1000000,
+    neighbor_capacity=20,
+    debug=True, generate_whole_program=True)
+
+if target == 'gpu':
+    psim.target(pairs.target_gpu())
+else:
+    psim.target(pairs.target_cpu())
+    #psim.target(pairs.target_cpu(parallel=True))
+
+psim.add_position('position')
+psim.add_property('mass', pairs.real(), 1.0)
+psim.add_property('linear_velocity', pairs.vector())
+psim.add_property('angular_velocity', pairs.vector())
+psim.add_property('force', pairs.vector(), volatile=True)
+psim.add_property('torque', pairs.vector(), volatile=True)
+psim.add_property('old_force', pairs.vector(), volatile=True) #old force added
+psim.add_property('old_torque', pairs.vector(), volatile=True)
+psim.add_property('radius', pairs.real(), 1.0)
+psim.add_property('normal', pairs.vector())
+psim.add_property('inv_inertia', pairs.matrix())
+psim.add_property('inertia', pairs.matrix())
+psim.add_property('rotation_matrix', pairs.matrix())
+psim.add_property('rotation_quat', pairs.quaternion())
+psim.add_feature('type', ntypes)
+psim.add_feature_property('type', 'friction_static', pairs.real(), [frictionStatic for i in range(ntypes * ntypes)])
+psim.add_feature_property('type', 'friction_dynamic', pairs.real(), [frictionDynamic for i in range(ntypes * ntypes)])
+
+# psim.add_property('kinetic_energy', pairs.real(), volatile=True)
+# psim.add_property('rotational_energy', pairs.real(), volatile=True)
+# psim.add_property('potential_energy', pairs.real(), volatile=True)
+# psim.add_property('total_energy', pairs.real(), volatile=True)
+# psim.add_contact_property('is_sticking', pairs.int32(), 0)
+# psim.add_contact_property('tangential_spring_displacement', pairs.vector(), [0.0, 0.0, 0.0])
+# psim.add_contact_property('impact_velocity_magnitude', pairs.real(), 0.0)
+
+psim.set_domain([0.0, 0.0, 0.0, domainSize_SI[0], domainSize_SI[1], domainSize_SI[2]])
+psim.set_domain_partitioner(pairs.block_forest())
+# psim.set_domain_partitioner(pairs.regular_domain_partitioner())
+psim.pbc([False, False, False])
+psim.dem_sc_grid(
+    domainSize_SI[0], domainSize_SI[1], domainSize_SI[2], generationSpacing_SI,
+    diameter_SI, minDiameter_SI, maxDiameter_SI, initialVelocity_SI, densityParticle_SI, ntypes)
+
+#psim.read_particle_data(
+#    "data/spheres.input",
+#    "data/spheres_4x4x2.input",
+#    "data/spheres_6x6x2.input",
+#    "data/spheres_8x8x2.input",
+#    ['uid', 'type', 'mass', 'radius', 'position', 'linear_velocity', 'flags'],
+#    pairs.sphere())
+
+#psim.read_particle_data(
+#    "data/spheres_bottom.input",
+#    ['type', 'mass', 'radius', 'position', 'linear_velocity', 'flags'],
+#    pairs.sphere())
+
+# psim.read_particle_data(
+#     "data/planes2.input",
+#     # ['uid', 'type', 'mass', 'position', 'normal', 'flags'],
+#      ['type', 'mass', 'position', 'normal', 'flags'],
+#     pairs.halfspace())
+
+psim.setup(update_mass_and_inertia, {'densityParticle_SI': densityParticle_SI,
+                                     'pi': math.pi,
+                                     'infinity': math.inf })
+
+psim.compute(velocity_verlet_pre_force_update, symbols={'dt': dt_SI})
+#psim.compute_half()
+psim.build_cell_lists(linkedCellWidth)
+psim.vtk_output(f"output/dem_{target}", frequency=visSpacing)
+
+psim.compute(gravity,
+             symbols={'densityParticle_SI': densityParticle_SI,
+                      'densityFluid_SI': densityFluid_SI,
+                      'gravity_SI': gravity_SI,
+                      'pi': math.pi })
+
+psim.compute(linear_spring_dashpot,
+             linkedCellWidth,
+             symbols={'dt': dt_SI,
+                      'pi': math.pi,
+                      'kappa': kappa,
+                      'lnDryResCoeff': lnDryResCoeff,
+                      'collisionTime_SI': collisionTime_SI})
+
+# psim.compute(euler, symbols={'dt': dt_SI})
+psim.compute(velocity_post_force_update, symbols={'dt': dt_SI})
+# psim.compute(kinetic_energy_cal, symbols={'dt': dt_SI})
+# psim.compute(potential_energy_cal, symbols={'dt': dt_SI, 'gravity_SI': gravity_SI,})
+# psim.compute(energy_cal, symbols={'gravity_SI': gravity_SI})
+# kinetic_energies = []
+# potential_energies = []
+# total_energies = []
+
+# ke = kinetic_energy()
+# pe = potential_energy()
+# te = ke + pe
+
+# kinetic_energies.append(ke)
+# potential_energies.append(pe)   
+# total_energies.append(te)
+
+psim.generate()
+
diff --git a/examples/dem_euler_blockforest.py b/examples/dem_euler_blockforest.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b4e3f0c0a2e2f6e397d65f3fea9839385143ee6
--- /dev/null
+++ b/examples/dem_euler_blockforest.py
@@ -0,0 +1,216 @@
+import math
+import pairs
+import sys
+import os
+
+def update_mass_and_inertia(i):
+    rotation_matrix[i] = diagonal_matrix(1.0)
+    rotation_quat[i] = default_quaternion()
+
+    if is_sphere(i):
+        inv_inertia[i] = inversed(diagonal_matrix(0.4 * mass[i] * radius[i] * radius[i]))
+
+    else:
+        mass[i] = infinity
+        inv_inertia[i] = 0.0
+
+
+def linear_spring_dashpot(i, j):
+    delta_ij = -penetration_depth(i, j)
+    skip_when(delta_ij < 0.0)
+
+    meff = 1.0 / ((1.0 / mass[i]) + (1.0 / mass[j]))
+    stiffness_norm = meff * (pi * pi + lnDryResCoeff * lnDryResCoeff) / \
+                     (collisionTime_SI * collisionTime_SI)
+    damping_norm = -2.0 * meff * lnDryResCoeff / collisionTime_SI
+    damping_tan = sqrt(kappa) * damping_norm
+
+    velocity_wf_i = linear_velocity[i] + cross(angular_velocity[i], contact_point(i, j) - position[i])
+    velocity_wf_j = linear_velocity[j] + cross(angular_velocity[j], contact_point(i, j) - position[j])
+
+    rel_vel = -(velocity_wf_i - velocity_wf_j)
+    rel_vel_n = dot(rel_vel, contact_normal(i, j))
+    rel_vel_t = rel_vel - rel_vel_n * contact_normal(i, j)
+    fNabs = stiffness_norm * delta_ij + damping_norm * rel_vel_n
+    fN = fNabs * contact_normal(i, j)
+
+    fTabs = min(damping_tan * length(rel_vel_t), friction_dynamic[i, j] * fNabs)
+    fT = fTabs * normalized(rel_vel_t)
+
+    partial_force = fN + fT
+
+    apply(force, partial_force)
+    apply(torque, cross(contact_point(i, j) - position, partial_force))
+    
+def euler(i):
+    inv_mass = 1.0 / mass[i]
+    position[i] += 0.5 * inv_mass * force[i] * dt * dt + linear_velocity[i] * dt
+    linear_velocity[i] += inv_mass * force[i] * dt
+    wdot = rotation_matrix[i] * (inv_inertia[i] * torque[i]) * transposed(rotation_matrix[i])
+    phi = angular_velocity[i] * dt + 0.5 * wdot * dt * dt
+    rotation_quat[i] = quaternion(phi, length(phi)) * rotation_quat[i]
+    rotation_matrix[i] = quaternion_to_rotation_matrix(rotation_quat[i])
+    angular_velocity[i] += wdot * dt
+
+# velocity verlet pre force update
+# def velocityVerletPreForce(i):
+#     inv_mass = 1.0 / mass[i]
+#     position[i] += linear_velocity[i] * dt + 0.5 * inv_mass * old_force[i] * dt * dt
+#     wdot = rotation_matrix[i] * (inv_inertia[i] * old_torque[i]) * transposed(rotation_matrix[i])
+#     phi = angular_velocity[i] * dt + 0.5 * wdot * dt * dt
+#     rotation_quat[i] = quaternion(phi, length(phi)) * rotation_quat[i]
+#     rotation_matrix[i] = quaternion_to_rotation_matrix(rotation_quat[i])
+
+
+# def velocityVerletPostForce(i):
+#     inv_mass = 1.0 / mass[i]
+#     linear_velocity[i] += inv_mass * 0.5 * (old_force[i] + force[i]) * dt
+#     torque_avg = 0.5 * (old_torque[i] + torque[i])
+#     wdot = rotation_matrix[i] * (inv_inertia[i] * torque_avg) * transposed(rotation_matrix[i])
+#     angular_velocity[i] += wdot * dt
+    
+#     old_force[i] = force[i]
+#     old_torque[i] = torque[i]
+
+
+def gravity(i):
+    volume = (4.0 / 3.0) * pi * radius[i] * radius[i] * radius[i]
+    force[i][2] = force[i][2] - (densityParticle_SI - densityFluid_SI) * volume * gravity_SI
+
+
+cmd = sys.argv[0]
+target = sys.argv[1] if len(sys.argv[1]) > 1 else "none"
+if target != 'cpu' and target != 'gpu':
+    print(f"Invalid target, use {cmd} <cpu/gpu>")
+
+# Config file parameters
+# domainSize_SI = [0.8, 0.2, 0.2]
+# domainSize_SI = [0.8, 0.015, 0.2] //finel
+# domainSize_SI = [0.8, 0.03, 0.2]
+# domainSize_SI = [0.64, 0.05, 0.2]
+#domainSize_SI = [0.4, 0.4, 0.2] # node base
+#domainSize_SI = [0.6, 0.6, 0.2] # node base
+#domainSize_SI = [0.8, 0.8, 0.2] # node base
+domainSize_SI = [0.1, 0.1, 0.1]
+diameter_SI = 2.5e-3 #0.0029
+gravity_SI = 9.81
+densityFluid_SI = 1179
+densityParticle_SI = 2650
+generationSpacing_SI = 2.75e-3
+initialVelocity_SI = 0.05
+dt_SI = 5e-5
+frictionCoefficient = 0.6
+restitutionCoefficient = 0.1
+collisionTime_SI = 5e-4
+poissonsRatio = 0.22
+timeSteps = 16000
+#timeSteps = 1000
+# visSpacing = 2000
+denseBottomLayer = False
+bottomLayerOffsetFactor = 1.0
+kappa = 2.0 * (1.0 - poissonsRatio) / (2.0 - poissonsRatio) # from Thornton et al
+
+minDiameter_SI = diameter_SI * 0.9
+maxDiameter_SI = diameter_SI * 1.1
+linkedCellWidth = 1.01 * maxDiameter_SI
+
+ntypes = 1
+lnDryResCoeff = math.log(restitutionCoefficient)
+frictionStatic = 0.0
+frictionDynamic = frictionCoefficient
+
+
+file_name = os.path.basename(__file__)
+file_name_without_extension = os.path.splitext(file_name)[0]
+
+psim = pairs.simulation(
+    # "dem_vel_ver",
+    file_name_without_extension,
+    [pairs.sphere(), pairs.halfspace()],
+    timesteps=timeSteps,
+    double_prec=True,
+    use_contact_history=True,
+    particle_capacity=1000000,
+    neighbor_capacity=20,
+    debug=True, generate_whole_program=True)
+
+if target == 'gpu':
+    psim.target(pairs.target_gpu())
+else:
+    psim.target(pairs.target_cpu())
+    #psim.target(pairs.target_cpu(parallel=True))
+
+psim.add_position('position')
+psim.add_property('mass', pairs.real(), 1.0)
+psim.add_property('linear_velocity', pairs.vector())
+psim.add_property('angular_velocity', pairs.vector())
+psim.add_property('force', pairs.vector(), volatile=True)
+psim.add_property('torque', pairs.vector(), volatile=True)
+psim.add_property('old_force', pairs.vector(), volatile=True) #old force added
+psim.add_property('old_torque', pairs.vector(), volatile=True)
+psim.add_property('radius', pairs.real(), 1.0)
+psim.add_property('normal', pairs.vector())
+psim.add_property('inv_inertia', pairs.matrix())
+psim.add_property('rotation_matrix', pairs.matrix())
+psim.add_property('rotation_quat', pairs.quaternion())
+psim.add_feature('type', ntypes)
+psim.add_feature_property('type', 'friction_static', pairs.real(), [frictionStatic for i in range(ntypes * ntypes)])
+psim.add_feature_property('type', 'friction_dynamic', pairs.real(), [frictionDynamic for i in range(ntypes * ntypes)])
+psim.add_contact_property('is_sticking', pairs.int32(), 0)
+psim.add_contact_property('tangential_spring_displacement', pairs.vector(), [0.0, 0.0, 0.0])
+psim.add_contact_property('impact_velocity_magnitude', pairs.real(), 0.0)
+
+psim.set_domain([0.0, 0.0, 0.0, domainSize_SI[0], domainSize_SI[1], domainSize_SI[2]])
+psim.set_domain_partitioner(pairs.block_forest())
+# psim.set_domain_partitioner(pairs.regular_domain_partitioner_xy())
+psim.pbc([True, True, False])
+psim.dem_sc_grid(
+    domainSize_SI[0], domainSize_SI[1], domainSize_SI[2], generationSpacing_SI,
+    diameter_SI, minDiameter_SI, maxDiameter_SI, initialVelocity_SI, densityParticle_SI, ntypes)
+
+#psim.read_particle_data(
+#    "data/spheres.input",
+#    "data/spheres_4x4x2.input",
+#    "data/spheres_6x6x2.input",
+#    "data/spheres_8x8x2.input",
+#    ['uid', 'type', 'mass', 'radius', 'position', 'linear_velocity', 'flags'],
+#    pairs.sphere())
+
+#psim.read_particle_data(
+#    "data/spheres_bottom.input",
+#    ['type', 'mass', 'radius', 'position', 'linear_velocity', 'flags'],
+#    pairs.sphere())
+
+psim.read_particle_data(
+    "data/planes.input",
+    ['type', 'mass', 'position', 'normal', 'flags'],
+    pairs.halfspace())
+
+psim.setup(update_mass_and_inertia, {'densityParticle_SI': densityParticle_SI,
+                                     'pi': math.pi,
+                                     'infinity': math.inf })
+
+#psim.compute_half()
+# psim.compute(velocityVerletPreForce, symbols={'dt': dt_SI})
+
+psim.build_cell_lists(linkedCellWidth)
+# psim.vtk_output(f"output/dem_{target}", frequency=visSpacing)
+
+psim.compute(gravity,
+             symbols={'densityParticle_SI': densityParticle_SI,
+                      'densityFluid_SI': densityFluid_SI,
+                      'gravity_SI': gravity_SI,
+                      'pi': math.pi })
+
+psim.compute(linear_spring_dashpot,
+             linkedCellWidth,
+             symbols={'dt': dt_SI,
+                      'pi': math.pi,
+                      'kappa': kappa,
+                      'lnDryResCoeff': lnDryResCoeff,
+                      'collisionTime_SI': collisionTime_SI})
+
+psim.compute(euler, symbols={'dt': dt_SI})
+# psim.compute(velocityVerletPostForce, symbols={'dt': dt_SI})
+
+psim.generate()
diff --git a/examples/dem_sd.py b/examples/dem_sd.py
index c0e021294231fc900e67a1736f81f55f6ac9db60..ba4b6a05ca5c476ffd99e3119e88a3e6dcaf1596 100644
--- a/examples/dem_sd.py
+++ b/examples/dem_sd.py
@@ -3,6 +3,7 @@ import pairs
 import sys
 import os
 
+
 def update_mass_and_inertia(i):
     rotation_matrix[i] = diagonal_matrix(1.0)
     rotation[i] = default_quaternion()
@@ -67,26 +68,52 @@ if target != 'cpu' and target != 'gpu':
     print(f"Invalid target, use {cmd} <cpu/gpu>")
 
 # Config file parameters
+# domainSize_SI=[0.1, 0.1, 0.1]
+# diameter_SI = 0.009
+# gravity_SI = 9.81
+# densityFluid_SI = 1000
+# densityParticle_SI = 2550
+# generationSpacing_SI = 0.01
+# initialVelocity_SI = 1
+# dt_SI = 5e-5
+# frictionCoefficient = 0.5
+# restitutionCoefficient = 0.1
+# collisionTime_SI = 5e-4
+# poissonsRatio = 0.22
+# timeSteps = 1
+# visSpacing = 200
+# denseBottomLayer = False
+# bottomLayerOffsetFactor = 1.0
+# kappa = 2.0 * (1.0 - poissonsRatio) / (2.0 - poissonsRatio) # from Thornton et al
+
+# minDiameter_SI = diameter_SI #* 0.9
+# maxDiameter_SI = diameter_SI #* 1.1
+# linkedCellWidth = 1.01 * maxDiameter_SI
+
 domainSize_SI=[0.1, 0.1, 0.1]
-diameter_SI = 0.009
+#domainSize_SI = [0.4, 0.4, 0.2] # node base
+#domainSize_SI = [0.6, 0.6, 0.2] # node base
+#domainSize_SI = [0.8, 0.8, 0.2] # node base
+diameter_SI = 0.0029
 gravity_SI = 9.81
 densityFluid_SI = 1000
 densityParticle_SI = 2550
-generationSpacing_SI = 0.01
+generationSpacing_SI = 0.005
 initialVelocity_SI = 1
 dt_SI = 5e-5
 frictionCoefficient = 0.5
 restitutionCoefficient = 0.1
 collisionTime_SI = 5e-4
 poissonsRatio = 0.22
-timeSteps = 1
-visSpacing = 200
+timeSteps = 100
+#timeSteps = 1000
+visSpacing = 50
 denseBottomLayer = False
 bottomLayerOffsetFactor = 1.0
 kappa = 2.0 * (1.0 - poissonsRatio) / (2.0 - poissonsRatio) # from Thornton et al
 
-minDiameter_SI = diameter_SI #* 0.9
-maxDiameter_SI = diameter_SI #* 1.1
+minDiameter_SI = diameter_SI * 0.9
+maxDiameter_SI = diameter_SI * 1.1
 linkedCellWidth = 1.01 * maxDiameter_SI
 
 ntypes = 1
@@ -105,7 +132,7 @@ psim = pairs.simulation(
     use_contact_history=False,
     particle_capacity=1000000,
     neighbor_capacity=20,
-    debug=True, generate_whole_program=False)
+    debug=True, generate_whole_program=True)
 
 if target == 'gpu':
     psim.target(pairs.target_gpu())
@@ -165,7 +192,7 @@ psim.setup(update_mass_and_inertia, {'densityParticle_SI': densityParticle_SI,
 
 #psim.compute_half()
 psim.build_cell_lists(linkedCellWidth)
-# psim.vtk_output(f"output/dem_{target}", frequency=visSpacing)
+psim.vtk_output(f"output/dem_{target}", frequency=visSpacing)
 
 psim.compute(gravity,
              symbols={'densityParticle_SI': densityParticle_SI,
diff --git a/examples/dem_vel_ver.py b/examples/dem_vel_ver.py
new file mode 100644
index 0000000000000000000000000000000000000000..bbf86893f010d3c55000758186f052c6a2ce9ac2
--- /dev/null
+++ b/examples/dem_vel_ver.py
@@ -0,0 +1,272 @@
+import math
+import pairs
+import sys
+import os
+
+def update_mass_and_inertia(i):
+    rotation_matrix[i] = diagonal_matrix(1.0)
+    rotation_quat[i] = default_quaternion()
+
+    if is_sphere(i):
+        inv_inertia[i] = inversed(diagonal_matrix(0.4 * mass[i] * radius[i] * radius[i]))
+
+    else:
+        mass[i] = infinity
+        inv_inertia[i] = 0.0
+
+
+def linear_spring_dashpot(i, j):
+    delta_ij = -penetration_depth(i, j)
+    skip_when(delta_ij < 0.0)
+
+    meff = 1.0 / ((1.0 / mass[i]) + (1.0 / mass[j]))
+    stiffness_norm = meff * (pi * pi + lnDryResCoeff * lnDryResCoeff) / \
+                     (collisionTime_SI * collisionTime_SI)
+    damping_norm = -2.0 * meff * lnDryResCoeff / collisionTime_SI
+    damping_tan = sqrt(kappa) * damping_norm
+
+    velocity_wf_i = linear_velocity[i] + cross(angular_velocity[i], contact_point(i, j) - position[i])
+    velocity_wf_j = linear_velocity[j] + cross(angular_velocity[j], contact_point(i, j) - position[j])
+
+    rel_vel = -(velocity_wf_i - velocity_wf_j)
+    rel_vel_n = dot(rel_vel, contact_normal(i, j))
+    rel_vel_t = rel_vel - rel_vel_n * contact_normal(i, j)
+    fNabs = stiffness_norm * delta_ij + damping_norm * rel_vel_n
+    fN = fNabs * contact_normal(i, j)
+
+    fTabs = min(damping_tan * length(rel_vel_t), friction_dynamic[i, j] * fNabs)
+    fT = fTabs * normalized(rel_vel_t)
+
+    partial_force = fN + fT
+
+    apply(force, partial_force)
+    apply(torque, cross(contact_point(i, j) - position, partial_force))
+
+# def linear_spring_dashpot(i, j):
+#     delta_ij = -penetration_depth(i, j)
+#     skip_when(delta_ij < 0.0)
+
+#     meff = 1.0 / ((1.0 / mass[i]) + (1.0 / mass[j]))
+#     stiffness_norm = meff * (pi * pi + lnDryResCoeff * lnDryResCoeff) / \
+#                      (collisionTime_SI * collisionTime_SI)
+#     stiffness_tan = kappa * stiffness_norm
+#     damping_norm = -2.0 * meff * lnDryResCoeff / collisionTime_SI
+#     damping_tan = sqrt(kappa) * damping_norm
+
+#     velocity_wf_i = linear_velocity[i] + cross(angular_velocity[i], contact_point(i, j) - position[i])
+#     velocity_wf_j = linear_velocity[j] + cross(angular_velocity[j], contact_point(i, j) - position[j])
+
+#     rel_vel = -(velocity_wf_i - velocity_wf_j)
+#     rel_vel_n = dot(rel_vel, contact_normal(i, j)) * contact_normal(i, j)
+#     rel_vel_t = rel_vel - rel_vel_n
+#     fN = stiffness_norm * delta_ij * contact_normal(i, j) + damping_norm * rel_vel_n;
+
+#     tan_spring_disp = tangential_spring_displacement[i, j]
+#     impact_vel_magnitude = impact_velocity_magnitude[i, j]
+#     impact_magnitude = select(impact_vel_magnitude > 0.0, impact_vel_magnitude, length(rel_vel))
+#     sticking = is_sticking[i, j]
+
+#     rot_tan_disp = tan_spring_disp - contact_normal(i, j) * dot(tan_spring_disp, contact_normal(i, j))
+#     rot_tan_disp_len2 = squared_length(rot_tan_disp)
+#     new_tan_spring_disp = dt * rel_vel_t + \
+#                           select(rot_tan_disp_len2 <= 0.0,
+#                                  zero_vector(),
+#                                  rot_tan_disp * sqrt(squared_length(tan_spring_disp) / rot_tan_disp_len2))
+
+#     fTLS = stiffness_tan * new_tan_spring_disp + damping_tan * rel_vel_t
+#     fTLS_len = length(fTLS)
+#     t = normalized(fTLS)
+
+#     f_friction_abs_static = friction_static[i, j] * length(fN)
+#     f_friction_abs_dynamic = friction_dynamic[i, j] * length(fN)
+#     tan_vel_threshold = 1e-8
+
+#     cond1 = sticking == 1 and length(rel_vel_t) < tan_vel_threshold and fTLS_len < f_friction_abs_static
+#     cond2 = sticking == 1 and fTLS_len < f_friction_abs_dynamic
+#     f_friction_abs = select(cond1, f_friction_abs_static, f_friction_abs_dynamic)
+#     n_sticking = select(cond1 or cond2 or fTLS_len < f_friction_abs_dynamic, 1, 0)
+#     tangential_spring_displacement[i, j] = \
+#         select(not cond1 and not cond2 and stiffness_tan > 0.0,
+#                (f_friction_abs * t - damping_tan * rel_vel_t) / stiffness_tan,
+#                new_tan_spring_disp)
+
+#     impact_velocity_magnitude[i, j] = impact_magnitude
+#     is_sticking[i, j] = n_sticking
+
+#     fTabs = min(fTLS_len, f_friction_abs)
+#     fT = fTabs * t
+#     partial_force = fN + fT
+
+#     apply(force, partial_force)
+#     apply(torque, cross(contact_point(i, j) - position, partial_force))
+
+
+# def euler(i):
+#     inv_mass = 1.0 / mass[i]
+#     position[i] += 0.5 * inv_mass * force[i] * dt * dt + linear_velocity[i] * dt
+#     linear_velocity[i] += inv_mass * force[i] * dt
+#     wdot = rotation_matrix[i] * (inv_inertia[i] * torque[i]) * transposed(rotation_matrix[i])
+#     phi = angular_velocity[i] * dt + 0.5 * wdot * dt * dt
+#     rotation_quat[i] = quaternion(phi, length(phi)) * rotation_quat[i]
+#     rotation_matrix[i] = quaternion_to_rotation_matrix(rotation_quat[i])
+#     angular_velocity[i] += wdot * dt
+
+def velocity_verlet_pre_force_update(i):
+    inv_mass = 1.0 / mass[i]
+    position[i] += linear_velocity[i] * dt + 0.5 * inv_mass * old_force[i] * dt * dt
+    wdot = rotation_matrix[i] * (inv_inertia[i] * old_torque[i]) * transposed(rotation_matrix[i])
+    phi = angular_velocity[i] * dt + 0.5 * wdot * dt * dt
+    rotation_quat[i] = quaternion(phi, length(phi)) * rotation_quat[i]
+    rotation_matrix[i] = quaternion_to_rotation_matrix(rotation_quat[i])
+
+
+def velocity_post_force_update(i):
+    inv_mass = 1.0 / mass[i]
+    linear_velocity[i] += inv_mass * 0.5 * (old_force[i] + force[i]) * dt
+    torque_avg = 0.5 * (old_torque[i] + torque[i])
+    wdot = rotation_matrix[i] * (inv_inertia[i] * torque_avg) * transposed(rotation_matrix[i])
+    angular_velocity[i] += wdot * dt
+    
+    old_force[i] = force[i]
+    old_torque[i] = torque[i]
+
+
+def gravity(i):
+    volume = (4.0 / 3.0) * pi * radius[i] * radius[i] * radius[i]
+    force[i][2] = force[i][2] - (densityParticle_SI - densityFluid_SI) * volume * gravity_SI
+
+
+cmd = sys.argv[0]
+target = sys.argv[1] if len(sys.argv[1]) > 1 else "none"
+if target != 'cpu' and target != 'gpu':
+    print(f"Invalid target, use {cmd} <cpu/gpu>")
+
+# Config file parameters
+# domainSize_SI = [0.8, 0.015, 0.2]
+domainSize_SI = [0.1, 0.1, 0.1]
+# domainSize_SI = [12.66, 0.015, 0.2]
+#domainSize_SI = [0.4, 0.4, 0.2] # node base
+#domainSize_SI = [0.6, 0.6, 0.2] # node base
+#domainSize_SI = [0.8, 0.8, 0.2] # node base
+diameter_SI = 0.0029
+gravity_SI = 9.81
+densityFluid_SI = 1000
+densityParticle_SI = 2550
+generationSpacing_SI = 0.005
+initialVelocity_SI = 1
+dt_SI = 5e-5
+frictionCoefficient = 0.5
+restitutionCoefficient = 0.1
+collisionTime_SI = 5e-4
+poissonsRatio = 0.22
+timeSteps = 10000
+#timeSteps = 1000
+visSpacing = 1000
+denseBottomLayer = False
+bottomLayerOffsetFactor = 1.0
+kappa = 2.0 * (1.0 - poissonsRatio) / (2.0 - poissonsRatio) # from Thornton et al
+
+minDiameter_SI = diameter_SI * 0.9
+maxDiameter_SI = diameter_SI * 1.1
+linkedCellWidth = 1.01 * maxDiameter_SI
+
+ntypes = 1
+lnDryResCoeff = math.log(restitutionCoefficient)
+frictionStatic = 0.0
+frictionDynamic = frictionCoefficient
+
+
+file_name = os.path.basename(__file__)
+file_name_without_extension = os.path.splitext(file_name)[0]
+
+psim = pairs.simulation(
+    # "dem_vel_ver",
+    file_name_without_extension,
+    [pairs.sphere(), pairs.halfspace()],
+    timesteps=timeSteps,
+    double_prec=True,
+    use_contact_history=True,
+    particle_capacity=1000000,
+    neighbor_capacity=20,
+    debug=True, generate_whole_program=True)
+
+if target == 'gpu':
+    psim.target(pairs.target_gpu())
+else:
+    psim.target(pairs.target_cpu())
+    #psim.target(pairs.target_cpu(parallel=True))
+
+psim.add_position('position')
+psim.add_property('mass', pairs.real(), 1.0)
+psim.add_property('linear_velocity', pairs.vector())
+psim.add_property('angular_velocity', pairs.vector())
+psim.add_property('force', pairs.vector(), volatile=True)
+psim.add_property('torque', pairs.vector(), volatile=True)
+psim.add_property('old_force', pairs.vector(), volatile=True) #old force added
+psim.add_property('old_torque', pairs.vector(), volatile=True)
+psim.add_property('radius', pairs.real(), 1.0)
+psim.add_property('normal', pairs.vector())
+psim.add_property('inv_inertia', pairs.matrix())
+psim.add_property('rotation_matrix', pairs.matrix())
+psim.add_property('rotation_quat', pairs.quaternion())
+psim.add_feature('type', ntypes)
+psim.add_feature_property('type', 'friction_static', pairs.real(), [frictionStatic for i in range(ntypes * ntypes)])
+psim.add_feature_property('type', 'friction_dynamic', pairs.real(), [frictionDynamic for i in range(ntypes * ntypes)])
+psim.add_contact_property('is_sticking', pairs.int32(), 0)
+psim.add_contact_property('tangential_spring_displacement', pairs.vector(), [0.0, 0.0, 0.0])
+psim.add_contact_property('impact_velocity_magnitude', pairs.real(), 0.0)
+
+psim.set_domain([0.0, 0.0, 0.0, domainSize_SI[0], domainSize_SI[1], domainSize_SI[2]])
+# psim.set_domain_partitioner(pairs.block_forest())
+psim.set_domain_partitioner(pairs.regular_domain_partitioner_xy())
+psim.pbc([True, True, False])
+psim.dem_sc_grid(
+    domainSize_SI[0], domainSize_SI[1], domainSize_SI[2], generationSpacing_SI,
+    diameter_SI, minDiameter_SI, maxDiameter_SI, initialVelocity_SI, densityParticle_SI, ntypes)
+
+#psim.read_particle_data(
+#    "data/spheres.input",
+#    "data/spheres_4x4x2.input",
+#    "data/spheres_6x6x2.input",
+#    "data/spheres_8x8x2.input",
+#    ['uid', 'type', 'mass', 'radius', 'position', 'linear_velocity', 'flags'],
+#    pairs.sphere())
+
+#psim.read_particle_data(
+#    "data/spheres_bottom.input",
+#    ['type', 'mass', 'radius', 'position', 'linear_velocity', 'flags'],
+#    pairs.sphere())
+
+psim.read_particle_data(
+    "data/planes.input",
+    ['type', 'mass', 'position', 'normal', 'flags'],
+    pairs.halfspace())
+
+psim.setup(update_mass_and_inertia, {'densityParticle_SI': densityParticle_SI,
+                                     'pi': math.pi,
+                                     'infinity': math.inf })
+
+#psim.compute_half()
+psim.compute(velocity_verlet_pre_force_update, symbols={'dt': dt_SI})
+
+psim.build_cell_lists(linkedCellWidth)
+psim.vtk_output(f"output/dem_{target}", frequency=visSpacing)
+
+psim.compute(gravity,
+             symbols={'densityParticle_SI': densityParticle_SI,
+                      'densityFluid_SI': densityFluid_SI,
+                      'gravity_SI': gravity_SI,
+                      'pi': math.pi })
+
+psim.compute(linear_spring_dashpot,
+             linkedCellWidth,
+             symbols={'dt': dt_SI,
+                      'pi': math.pi,
+                      'kappa': kappa,
+                      'lnDryResCoeff': lnDryResCoeff,
+                      'collisionTime_SI': collisionTime_SI})
+
+# psim.compute(euler, symbols={'dt': dt_SI})
+psim.compute(velocity_post_force_update, symbols={'dt': dt_SI})
+
+psim.generate()
diff --git a/examples/dem_velverlet_blockforest.py b/examples/dem_velverlet_blockforest.py
new file mode 100644
index 0000000000000000000000000000000000000000..eedac4f37ecefdc4c697c4559c434eb944257612
--- /dev/null
+++ b/examples/dem_velverlet_blockforest.py
@@ -0,0 +1,206 @@
+import math
+import pairs
+import sys
+import os
+
+def update_mass_and_inertia(i):
+    rotation_matrix[i] = diagonal_matrix(1.0)
+    rotation_quat[i] = default_quaternion()
+
+    if is_sphere(i):
+        inv_inertia[i] = inversed(diagonal_matrix(0.4 * mass[i] * radius[i] * radius[i]))
+
+    else:
+        mass[i] = infinity
+        inv_inertia[i] = 0.0
+
+
+def linear_spring_dashpot(i, j):
+    delta_ij = -penetration_depth(i, j)
+    skip_when(delta_ij < 0.0)
+
+    meff = 1.0 / ((1.0 / mass[i]) + (1.0 / mass[j]))
+    stiffness_norm = meff * (pi * pi + lnDryResCoeff * lnDryResCoeff) / \
+                     (collisionTime_SI * collisionTime_SI)
+    damping_norm = -2.0 * meff * lnDryResCoeff / collisionTime_SI
+    damping_tan = sqrt(kappa) * damping_norm
+
+    velocity_wf_i = linear_velocity[i] + cross(angular_velocity[i], contact_point(i, j) - position[i])
+    velocity_wf_j = linear_velocity[j] + cross(angular_velocity[j], contact_point(i, j) - position[j])
+
+    rel_vel = -(velocity_wf_i - velocity_wf_j)
+    rel_vel_n = dot(rel_vel, contact_normal(i, j))
+    rel_vel_t = rel_vel - rel_vel_n * contact_normal(i, j)
+    fNabs = stiffness_norm * delta_ij + damping_norm * rel_vel_n
+    fN = fNabs * contact_normal(i, j)
+
+    fTabs = min(damping_tan * length(rel_vel_t), friction_dynamic[i, j] * fNabs)
+    fT = fTabs * normalized(rel_vel_t)
+
+    partial_force = fN + fT
+
+    apply(force, partial_force)
+    apply(torque, cross(contact_point(i, j) - position, partial_force))
+
+# velocity verlet pre force update
+def velocityVerletPreForce(i):
+    inv_mass = 1.0 / mass[i]
+    position[i] += linear_velocity[i] * dt + 0.5 * inv_mass * old_force[i] * dt * dt
+    wdot = rotation_matrix[i] * (inv_inertia[i] * old_torque[i]) * transposed(rotation_matrix[i])
+    phi = angular_velocity[i] * dt + 0.5 * wdot * dt * dt
+    rotation_quat[i] = quaternion(phi, length(phi)) * rotation_quat[i]
+    rotation_matrix[i] = quaternion_to_rotation_matrix(rotation_quat[i])
+
+
+def velocityVerletPostForce(i):
+    inv_mass = 1.0 / mass[i]
+    linear_velocity[i] += inv_mass * 0.5 * (old_force[i] + force[i]) * dt
+    torque_avg = 0.5 * (old_torque[i] + torque[i])
+    wdot = rotation_matrix[i] * (inv_inertia[i] * torque_avg) * transposed(rotation_matrix[i])
+    angular_velocity[i] += wdot * dt
+    
+    old_force[i] = force[i]
+    old_torque[i] = torque[i]
+
+
+def gravity(i):
+    volume = (4.0 / 3.0) * pi * radius[i] * radius[i] * radius[i]
+    force[i][2] = force[i][2] - (densityParticle_SI - densityFluid_SI) * volume * gravity_SI
+
+
+cmd = sys.argv[0]
+target = sys.argv[1] if len(sys.argv[1]) > 1 else "none"
+if target != 'cpu' and target != 'gpu':
+    print(f"Invalid target, use {cmd} <cpu/gpu>")
+
+# Config file parameters
+# domainSize_SI = [0.8, 0.2, 0.2]
+# domainSize_SI = [0.8, 0.015, 0.2] #finel
+# domainSize_SI = [0.8, 0.03, 0.2]
+# domainSize_SI = [0.64, 0.05, 0.2]
+#domainSize_SI = [0.4, 0.4, 0.2] # node base
+#domainSize_SI = [0.6, 0.6, 0.2] # node base
+#domainSize_SI = [0.8, 0.8, 0.2] # node base
+domainSize_SI = [0.05, 0.05, 0.05]
+diameter_SI = 2.5e-3 #0.0029
+gravity_SI = 9.81
+densityFluid_SI = 1179
+densityParticle_SI = 2650
+generationSpacing_SI = 2.75e-3
+initialVelocity_SI = 0.05
+dt_SI = 5e-5
+frictionCoefficient = 0.6
+restitutionCoefficient = 0.1
+collisionTime_SI = 5e-4
+poissonsRatio = 0.22
+timeSteps = 16000
+#timeSteps = 1000
+visSpacing = 4000
+denseBottomLayer = False
+bottomLayerOffsetFactor = 1.0
+kappa = 2.0 * (1.0 - poissonsRatio) / (2.0 - poissonsRatio) # from Thornton et al
+
+minDiameter_SI = diameter_SI * 0.9
+maxDiameter_SI = diameter_SI * 1.1
+linkedCellWidth = 1.01 * maxDiameter_SI
+
+ntypes = 1
+lnDryResCoeff = math.log(restitutionCoefficient)
+frictionStatic = 0.0
+frictionDynamic = frictionCoefficient
+
+
+file_name = os.path.basename(__file__)
+file_name_without_extension = os.path.splitext(file_name)[0]
+
+psim = pairs.simulation(
+    # "dem_vel_ver",
+    file_name_without_extension,
+    [pairs.sphere(), pairs.halfspace()],
+    timesteps=timeSteps,
+    double_prec=True,
+    use_contact_history=True,
+    particle_capacity=1000000,
+    neighbor_capacity=20,
+    debug=True, generate_whole_program=True)
+
+if target == 'gpu':
+    psim.target(pairs.target_gpu())
+else:
+    psim.target(pairs.target_cpu())
+    #psim.target(pairs.target_cpu(parallel=True))
+
+psim.add_position('position')
+psim.add_property('mass', pairs.real(), 1.0)
+psim.add_property('linear_velocity', pairs.vector())
+psim.add_property('angular_velocity', pairs.vector())
+psim.add_property('force', pairs.vector(), volatile=True)
+psim.add_property('torque', pairs.vector(), volatile=True)
+psim.add_property('old_force', pairs.vector(), volatile=True) #old force added
+psim.add_property('old_torque', pairs.vector(), volatile=True)
+psim.add_property('radius', pairs.real(), 1.0)
+psim.add_property('normal', pairs.vector())
+psim.add_property('inv_inertia', pairs.matrix())
+psim.add_property('rotation_matrix', pairs.matrix())
+psim.add_property('rotation_quat', pairs.quaternion())
+psim.add_feature('type', ntypes)
+psim.add_feature_property('type', 'friction_static', pairs.real(), [frictionStatic for i in range(ntypes * ntypes)])
+psim.add_feature_property('type', 'friction_dynamic', pairs.real(), [frictionDynamic for i in range(ntypes * ntypes)])
+psim.add_contact_property('is_sticking', pairs.int32(), 0)
+psim.add_contact_property('tangential_spring_displacement', pairs.vector(), [0.0, 0.0, 0.0])
+psim.add_contact_property('impact_velocity_magnitude', pairs.real(), 0.0)
+
+psim.set_domain([0.0, 0.0, 0.0, domainSize_SI[0], domainSize_SI[1], domainSize_SI[2]])
+psim.set_domain_partitioner(pairs.block_forest())
+# psim.set_domain_partitioner(pairs.regular_domain_partitioner_xy())
+psim.pbc([True, True, False])
+psim.dem_sc_grid(
+    domainSize_SI[0], domainSize_SI[1], domainSize_SI[2], generationSpacing_SI,
+    diameter_SI, minDiameter_SI, maxDiameter_SI, initialVelocity_SI, densityParticle_SI, ntypes)
+
+#psim.read_particle_data(
+#    "data/spheres.input",
+#    "data/spheres_4x4x2.input",
+#    "data/spheres_6x6x2.input",
+#    "data/spheres_8x8x2.input",
+#    ['uid', 'type', 'mass', 'radius', 'position', 'linear_velocity', 'flags'],
+#    pairs.sphere())
+
+#psim.read_particle_data(
+#    "data/spheres_bottom.input",
+#    ['type', 'mass', 'radius', 'position', 'linear_velocity', 'flags'],
+#    pairs.sphere())
+
+psim.read_particle_data(
+    "data/planes.input",
+    ['type', 'mass', 'position', 'normal', 'flags'],
+    pairs.halfspace())
+
+psim.setup(update_mass_and_inertia, {'densityParticle_SI': densityParticle_SI,
+                                     'pi': math.pi,
+                                     'infinity': math.inf })
+
+#psim.compute_half()
+psim.compute(velocityVerletPreForce, symbols={'dt': dt_SI})
+
+psim.build_cell_lists(linkedCellWidth)
+psim.vtk_output(f"output/dem_{target}", frequency=visSpacing)
+
+psim.compute(gravity,
+             symbols={'densityParticle_SI': densityParticle_SI,
+                      'densityFluid_SI': densityFluid_SI,
+                      'gravity_SI': gravity_SI,
+                      'pi': math.pi })
+
+psim.compute(linear_spring_dashpot,
+             linkedCellWidth,
+             symbols={'dt': dt_SI,
+                      'pi': math.pi,
+                      'kappa': kappa,
+                      'lnDryResCoeff': lnDryResCoeff,
+                      'collisionTime_SI': collisionTime_SI})
+
+# psim.compute(euler, symbols={'dt': dt_SI})
+psim.compute(velocityVerletPostForce, symbols={'dt': dt_SI})
+
+psim.generate()
diff --git a/examples/main.cpp b/examples/main2.txt
similarity index 100%
rename from examples/main.cpp
rename to examples/main2.txt
diff --git a/runtime/dem_sc_grid.cpp b/runtime/dem_sc_grid.cpp
index 863f82628d39861f893db4240159c9d4d2298540..393f4b02a75de82bcdb292ee9464588ffba3df7c 100644
--- a/runtime/dem_sc_grid.cpp
+++ b/runtime/dem_sc_grid.cpp
@@ -124,6 +124,7 @@ int dem_sc_grid(PairsRuntime *ps, double xmax, double ymax, double zmax, double
         std::cout << "Particle density: " << particle_density << std::endl;
         std::cout << "Number of types: " << ntypes << std::endl;
         std::cout << "Number of particles: " << global_nparticles << std::endl;
+        std::cout << "Number of particles per rank: " << nparticles << std::endl;
     }
 
     return nparticles;
diff --git a/runtime/domain/block_forest.cpp b/runtime/domain/block_forest.cpp
index fcc532917744dbaf474de43b9b65acd018b4c762..957b5aeb9f0235055f6cc29d5b2270e630646bd0 100644
--- a/runtime/domain/block_forest.cpp
+++ b/runtime/domain/block_forest.cpp
@@ -269,8 +269,12 @@ void BlockForest::initialize(int *argc, char ***argv) {
 
     int gridsize[3] = {32, 32, 32};
     auto procs = mpiManager->numProcesses();
-    auto block_config = balance_workload ? walberla::Vector3<int>(1, 1, 1) :
-                                           getBlockConfig(procs, gridsize[0], gridsize[1], gridsize[2]);
+    // std::cout<<"procs: "<<procs<<std::endl;
+    // std::cout<<"gridsize: "<<gridsize[0]<<" "<<gridsize[1]<<" "<<gridsize[2]<<std::endl;
+
+    // auto block_config = balance_workload ? walberla::Vector3<int>(1, 1, 1) :
+    //                                        getBlockConfig(procs, gridsize[0], gridsize[1], gridsize[2]);
+    auto block_config = walberla::Vector3<int>(12,6,1);
 
     if(rank==0) std::cout << "block_config = " << block_config << std::endl;
 
diff --git a/runtime/read_from_file.cpp b/runtime/read_from_file.cpp
index 39e9bef6c279ef7ce168cc2f1a7374c564b721cb..6aec00217626b468f5abdb38ff725642e15671e4 100644
--- a/runtime/read_from_file.cpp
+++ b/runtime/read_from_file.cpp
@@ -33,7 +33,7 @@ size_t read_particle_data(
     std::ifstream in_file(filename, std::ifstream::in);
     std::string line;
     auto shape_ptr = ps->getAsIntegerProperty(ps->getPropertyByName("shape"));
-    auto uid_ptr = ps->getAsUInt64Property(ps->getPropertyByName("uid"));
+    // auto uid_ptr = ps->getAsUInt64Property(ps->getPropertyByName("uid"));
     int n = start;
 
     if(!in_file.is_open()) {
@@ -101,6 +101,8 @@ size_t read_particle_data(
 
                 if(prop.getName() == "uid") {
                     std::cerr << "Can't read uid from file." << std::endl;
+                    std::cerr << "Property name: " << prop.getName() << std::endl;
+                     std::cerr << "Value: " << in0 << std::endl;
                     exit(-1);
                 }
             } else if(prop_type == Prop_Real) {
@@ -115,7 +117,7 @@ size_t read_particle_data(
         }
 
         if(within_domain || flags & (FLAGS_INFINITE | FLAGS_FIXED | FLAGS_GLOBAL)) {
-            uid_ptr(n) = (flags & FLAGS_GLOBAL) ? UniqueID::createGlobal(ps) : UniqueID::create(ps);
+            // uid_ptr(n) = (flags & FLAGS_GLOBAL) ? UniqueID::createGlobal(ps) : UniqueID::create(ps);
             shape_ptr(n++) = shape_id;
         }
     }
diff --git a/runtime/timers.hpp b/runtime/timers.hpp
index c4cdc943aa5faeed57b4971684277e0844d3e7da..469f534d5e898a87597e06c46807f989c253760a 100644
--- a/runtime/timers.hpp
+++ b/runtime/timers.hpp
@@ -34,6 +34,7 @@ public:
         std::unordered_map<std::string, TimeType> categorySums;
 
         std::cout << "all: " << counters[0] << std::endl;
+        
         for (size_t i = 1; i < counters.size(); ++i) {
             const std::string& counterName = counter_names[i];
             TimeType counterValue = counters[i];
@@ -63,6 +64,8 @@ public:
             }
         }
 
+        // std::cout<<"Particle update per second: " <<
+
         // Print the accumulated sums for each category
         for(const auto& category: categorySums) {
             std::cout << category.first << ": " << category.second << std::endl;
diff --git a/src/pairs/code_gen/cgen.py b/src/pairs/code_gen/cgen.py
index 18d4771e891f05b69b8d1814e2402e0f92d20229..bc6ec97fc5a5025ae08148109bbcd0aec89f6817 100644
--- a/src/pairs/code_gen/cgen.py
+++ b/src/pairs/code_gen/cgen.py
@@ -370,7 +370,8 @@ class CGen:
 
         self.print.end()
 
-    def generate_library(self, initialize_module, create_domain_module, setup_sim_module,  do_timestep_module, reverse_comm_module, communicate_module):
+    # def generate_library(self, initialize_module, create_domain_module, setup_sim_module,  do_timestep_module, reverse_comm_module, communicate_module):
+    def generate_library(self, initialize_module, create_domain_module, setup_sim_module,  do_timestep_module, communicate_module):
         self.generate_interfaces()
         # Generate CUDA/CPP file with modules
         ext = ".cu" if self.target.is_gpu() else ".cpp"
@@ -398,7 +399,8 @@ class CGen:
             self.generate_kernel(kernel)
 
         for module in self.sim.modules():
-            if module.name not in ['initialize', 'create_domain', 'setup_sim', 'do_timestep', 'reverse_comm', 'communicate']:
+            # if module.name not in ['initialize', 'create_domain', 'setup_sim', 'do_timestep', 'reverse_comm', 'communicate']:
+            if module.name not in ['initialize', 'create_domain', 'setup_sim', 'do_timestep', 'communicate']:
                 self.generate_module(module)
 
         self.print.end()
@@ -480,9 +482,9 @@ class CGen:
         self.print("}")
         self.print("")
 
-        self.print("void reverse_comm() {")
+        # self.print("void reverse_comm() {")
         self.print.add_indent(4)
-        self.generate_statement(reverse_comm_module.block)
+        # self.generate_statement(reverse_comm_module.block)
         self.print.add_indent(-4)
         self.print("}")
         self.print("")
diff --git a/src/pairs/sim/simulation.py b/src/pairs/sim/simulation.py
index 870231bd9f83e09c911f49c330e71b35bf53a817..0c5c3c9c6913e47164ed14e8f56d3d31f31b859f 100644
--- a/src/pairs/sim/simulation.py
+++ b/src/pairs/sim/simulation.py
@@ -442,7 +442,7 @@ class Simulation:
 
         # Initialize communication instance with specified domain-partitioner
         comm = Comm(self, self._dom_part)
-        reverse_comm_module = comm.reverse_comm(reduce=True)
+        # reverse_comm_module = comm.reverse_comm(reduce=False)
 
         # Params that determine when a method must be called only when reneighboring
         every_reneighbor_params = {'every': self.reneighbor_frequency}
@@ -489,8 +489,8 @@ class Simulation:
         timestep_procedures += [ResetVolatileProperties(self)]
 
         # For whole-program-generation, add reverse_comm wherever needed in the timestep loop (eg: after computational kernels) like this:
-        if self._generate_whole_program:
-            timestep_procedures += [reverse_comm_module]
+        # if self._generate_whole_program:
+        #     timestep_procedures += [reverse_comm_module]
 
         # Clear unused contact history
         if self._use_contact_history:
@@ -561,12 +561,15 @@ class Simulation:
             do_timestep_module = Module(self, name='do_timestep', block=timestep.as_block())
             communicate_module = Module(self, name='communicate', block=Timestep(self, 0, comm_routine).as_block())
 
-            modules_list = [initialize_module, create_domain_module, setup_sim_module, do_timestep_module, reverse_comm_module, communicate_module]
+            # modules_list = [initialize_module, create_domain_module, setup_sim_module, do_timestep_module, reverse_comm_module, communicate_module]
+            modules_list = [initialize_module, create_domain_module, setup_sim_module, do_timestep_module, communicate_module]
+
 
             transformations = Transformations(modules_list, self._target)
             transformations.apply_all()
 
             # Generate library
-            self.code_gen.generate_library(initialize_module, create_domain_module, setup_sim_module, do_timestep_module, reverse_comm_module, communicate_module)
+            # self.code_gen.generate_library(initialize_module, create_domain_module, setup_sim_module, do_timestep_module, reverse_comm_module, communicate_module)
+            self.code_gen.generate_library(initialize_module, create_domain_module, setup_sim_module, do_timestep_module, communicate_module)
 
         self.code_gen.generate_interfaces()