diff --git a/FindwaLBerla.cmake b/FindwaLBerla.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..1bbb2d2c6d9f6c7b20cc3cc98d02e627a686f90f
--- /dev/null
+++ b/FindwaLBerla.cmake
@@ -0,0 +1,21 @@
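+# Example (sketch): a typical application CMakeLists.txt consuming this module
+# ("myApp" and "main.cpp" are placeholders) would contain:
+#   list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR} )
+#   find_package( waLBerla )
+#   waLBerla_add_executable( NAME myApp FILES main.cpp DEPENDS core )
+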
+set( WALBERLA_DIR    WALBERLA_DIR-NOTFOUND   CACHE  PATH  "waLBerla path"  )
+
+if ( WALBERLA_DIR )
+    # WALBERLA_DIR has to point to the waLBerla source directory.
+    # This builds waLBerla again inside the current build directory, in the subfolder "walberla" (the second argument).
+    add_subdirectory( ${WALBERLA_DIR} walberla  )
+    
+    waLBerla_import()
+    # Adds the 'src' and 'tests' directories of the current app
+    list( APPEND WALBERLA_MODULE_DIRS "${CMAKE_SOURCE_DIR}/src" "${CMAKE_SOURCE_DIR}/tests" )
+    list( REMOVE_DUPLICATES  WALBERLA_MODULE_DIRS )
+    set ( WALBERLA_MODULE_DIRS  ${WALBERLA_MODULE_DIRS} CACHE INTERNAL "All folders that contain modules or tests" )
+else()
+    message( FATAL_ERROR "waLBerla not found - Use 'cmake -DWALBERLA_DIR=path_to_waLBerla_sources  pathToApplicationSources' "  )
+endif()
diff --git a/examples/md.py b/examples/md.py
index 574eb2e562517415639493061d3ee0e64a6ae979..30a669a839e26869071d56b74c282f9124619667 100644
--- a/examples/md.py
+++ b/examples/md.py
@@ -51,7 +51,8 @@ psim.add_feature_property('type', 'epsilon', pairs.real(), [sigma for i in range
 psim.add_feature_property('type', 'sigma6', pairs.real(), [epsilon for i in range(ntypes * ntypes)])
 
 psim.copper_fcc_lattice(nx, ny, nz, rho, temp, ntypes)
-psim.set_domain_partitioner(pairs.regular_domain_partitioner())
+psim.set_domain_partitioner(pairs.block_forest())
+#psim.set_domain_partitioner(pairs.regular_domain_partitioner())
 psim.compute_thermo(100)
 
 psim.reneighbor_every(20)
diff --git a/runtime/domain/block_forest.cpp b/runtime/domain/block_forest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ed6f45da8b8b80164a77646462d90e6830a26f55
--- /dev/null
+++ b/runtime/domain/block_forest.cpp
@@ -0,0 +1,498 @@
+#include <mpi.h>
+#include <algorithm>
+#include <vector>
+//---
+#include <blockforest/BlockForest.h>
+#include <blockforest/Initialization.h>
+#include <blockforest/loadbalancing/DynamicCurve.h>
+#include <blockforest/loadbalancing/DynamicDiffusive.h>
+#include <blockforest/loadbalancing/DynamicParMetis.h>
+#include <blockforest/loadbalancing/InfoCollection.h>
+#include <blockforest/loadbalancing/PODPhantomData.h>
+#include <pe/amr/level_determination/MinMaxLevelDetermination.h>
+#include <pe/amr/weight_assignment/MetisAssignmentFunctor.h>
+#include <pe/amr/weight_assignment/WeightAssignmentFunctor.h>
+//---
+#include "../pairs_common.hpp"
+#include "../devices/device.hpp"
+#include "regular_6d_stencil.hpp"
+#include "MDDataHandling.h"
+
+namespace pairs {
+
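+// Collects, for every neighboring process, the AABBs of its blocks that are adjacent
+// to locally owned blocks, and flattens them into the ranks/naabbs/aabbs arrays that
+// the communication routines consume.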
+void BlockForest::updateNeighborhood(
+    std::shared_ptr<blockforest::BlockForest> forest,
+    blockforest::InfoCollection& info,
+    int *ranks,
+    int *naabbs,
+    real_t *aabbs) {
+
+    auto me = mpi::MPIManager::instance()->rank();
+    for(auto& iblock: *forest) {
+        auto block = static_cast<blockforest::Block *>(&iblock);
+        auto& block_info = info[block->getId()];
+
+        if(block_info.computationalWeight > 0) {
+            for(uint neigh = 0; neigh < block->getNeighborhoodSize(); ++neigh) {
+                auto neighbor_rank = int_c(block->getNeighborProcess(neigh));
+
+                if(neighbor_rank != me) {
+                    const BlockID& neighbor_block = block->getNeighborId(neigh);
+                    math::AABB neighbor_aabb = block->getNeighborAABB(neigh);
+                    auto neighbor_info = info[neighbor_block];
+                    auto begin = blocks_pushed[neighbor_rank].begin();
+                    auto end = blocks_pushed[neighbor_rank].end();
+
+                    if(neighbor_info.computationalWeight > 0 &&
+                       std::find_if(begin, end, [&neighbor_block](const auto &nbh) { return nbh == neighbor_block; }) == end) {
+
+                        neighborhood[neighbor_rank].push_back(neighbor_aabb);
+                        blocks_pushed[neighbor_rank].push_back(neighbor_block);
+                    }
+                }
+            }
+        }
+    }
+
+    vec_ranks.clear();
+    vec_naabbs.clear();
+    vec_aabbs.clear();
+    total_aabbs = 0;
+
+    for(auto& nbh: neighborhood) {
+        auto rank = nbh.first;
+        auto aabb_list = nbh.second;
+        vec_ranks.push_back((int) rank);
+        vec_naabbs.push_back((int) aabb_list.size());
+
+        for(auto &aabb: aabb_list) {
+            vec_aabbs.push_back(aabb.xMin());
+            vec_aabbs.push_back(aabb.xMax());
+            vec_aabbs.push_back(aabb.yMin());
+            vec_aabbs.push_back(aabb.yMax());
+            vec_aabbs.push_back(aabb.zMin());
+            vec_aabbs.push_back(aabb.zMax());
+            total_aabbs++;
+        }
+    }
+
+    const int nranks = (int) vec_ranks.size();   // number of distinct neighbor ranks
+    if(nranks > *rank_capacity) {
+        // reallocateArray?
+        const int new_capacity = nranks + 10;
+        delete[] ranks;
+        delete[] naabbs;
+        delete[] offsets;
+        ranks = new int[new_capacity];
+        naabbs = new int[new_capacity];
+        offsets = new int[new_capacity];
+        *rank_capacity = new_capacity;
+    }    
+
+    if(total_aabbs > *aabb_capacity) {
+        const int new_capacity = total_aabbs + 10;
+        aabbs = new real_t[new_capacity * 6];
+        *aabb_capacity = new_capacity;
+    }
+
+    int offset = 0;
+    for(int i = 0; i < nranks; i++) {
+        ranks[i] = vec_ranks[i];
+        naabbs[i] = vec_naabbs[i];
+        offsets[i] = offset;
+        offset += naabbs[i];
+    }
+
+    for(int i = 0; i < total_aabbs * 6; i++) {
+        aabbs[i] = vec_aabbs[i];
+    }
+
+    ps->copyToDevice(aabbs);
+}
+
+/*
+  extern fn md_compute_boundary_weights(
+    xmin: real_t, xmax: real_t, ymin: real_t, ymax: real_t, zmin: real_t, zmax: real_t,
+    computational_weight: &mut u32, communication_weight: &mut u32) -> () {
+
+    let grid = grid_;
+    let particle = make_particle(grid, array_dev, ParticleDataLayout(), null_layout());
+    let sum = @|a: i32, b: i32| { a + b };
+    let aabb = AABB {
+        xmin: xmin,
+        xmax: xmax,
+        ymin: ymin,
+        ymax: ymax,
+        zmin: zmin,
+        zmax: zmax
+    };
+
+    *computational_weight = reduce_i32(grid.nparticles, 0, sum, |i| {
+        select(is_within_domain(particle.get_position(i), aabb), 1, 0)
+    }) as u32;
+
+    *communication_weight = reduce_i32(grid.nghost, 0, sum, |i| {
+        select(is_within_domain(particle.get_position(grid.nparticles + i), aabb), 1, 0)
+ }) as u32;
+ */
+
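+// Computes computational and communication weights for every local block (and its
+// potential children) through the generated "computeBoundaryWeights" module and
+// exchanges the resulting block information with all neighboring processes.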
+void BlockForest::updateWeights(
+    std::shared_ptr<blockforest::BlockForest> forest,
+    blockforest::InfoCollection& info,
+    real_t *position,
+    int nparticles) {
+
+    mpi::BufferSystem bs(mpi::MPIManager::instance()->comm(), 756);
+
+    info.clear();
+    for(auto& iblock: *forest) {
+        auto block = static_cast<blockforest::Block *>(&iblock);
+        auto aabb = block->getAABB();
+        auto& block_info = info[block->getId()];
+
+        pairs->callModule(
+            "computeBoundaryWeights",
+            aabb.xMin(), aabb.xMax(), aabb.yMin(), aabb.yMax(), aabb.zMin(), aabb.zMax(),
+            &(block_info.computationalWeight), &(block_info.communicationWeight));
+
+        for(uint_t branch = 0; branch < 8; ++branch) {
+            const auto b_id = BlockID(block->getId(), branch);
+            const auto b_aabb = forest->getAABBFromBlockId(b_id);
+            auto& b_info = info[b_id];
+
+            pairs->callModule(
+                "computeBoundaryWeights",
+                b_aabb.xMin(), b_aabb.xMax(), b_aabb.yMin(), b_aabb.yMax(), b_aabb.zMin(), b_aabb.zMax(),
+                &(b_info.computationalWeight), &(b_info.communicationWeight));
+        }
+    }
+
+    for(auto& iblock: *forest) {
+        auto block = static_cast<blockforest::Block *>(&iblock);
+        auto& block_info = info[block->getId()];
+
+        for(uint_t neigh = 0; neigh < block->getNeighborhoodSize(); ++neigh) {
+            bs.sendBuffer(block->getNeighborProcess(neigh)) <<
+                blockforest::InfoCollection::value_type(block->getId(), block_info);
+        }
+
+        for(uint_t branch = 0; branch < 8; ++branch) {
+            const auto b_id = BlockID(block->getId(), branch);
+            auto& b_info = info[b_id];
+
+            for(uint_t neigh = 0; neigh < block->getNeighborhoodSize(); ++neigh) {
+                bs.sendBuffer(block->getNeighborProcess(neigh)) <<
+                    blockforest::InfoCollection::value_type(b_id, b_info);
+            }
+        }
+    }
+
+    bs.setReceiverInfoFromSendBufferState(false, true);
+    bs.sendAll();
+
+    for(auto recv = bs.begin(); recv != bs.end(); ++recv) {
+        while(!recv.buffer().isEmpty()) {
+            blockforest::InfoCollectionPair val;
+            recv.buffer() >> val;
+            info.insert(val);
+        }
+    }
+}
+
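+// Factorizes the number of processes into an x*y*z process grid that minimizes the
+// total communication surface for the given grid resolution.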
+Vector3<uint_t> BlockForest::getBlockConfig(uint_t num_processes, uint_t nx, uint_t ny, uint_t nz) {
+    const uint_t bx_factor = 1;
+    const uint_t by_factor = 1;
+    const uint_t bz_factor = 1;
+    const uint_t ax = nx * ny;
+    const uint_t ay = nx * nz;
+    const uint_t az = ny * nz;
+
+    uint_t bestsurf = 2 * (ax + ay + az);
+    uint_t x = 1;
+    uint_t y = 1;
+    uint_t z = 1;
+
+    for(uint_t i = 1; i < num_processes; ++i) {
+        if(num_processes % i == 0) {
+            const uint_t rem_yz = num_processes / i;
+
+            for(uint_t j = 1; j < rem_yz; ++j) {
+                if(rem_yz % j == 0) {
+                    const uint_t k = rem_yz / j;
+                    const uint_t surf = (ax / i / j) + (ay / i / k) + (az / j / k);
+
+                    if(surf < bestsurf) {
+                        x = i, y = j, z = k;
+                        bestsurf = surf;
+                    }
+                }
+            }
+        }
+    }
+
+    return Vector3<uint_t>(x * bx_factor, y * by_factor, z * bz_factor);
+}
+
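+// Returns the smallest refinement level for which splitting each block into 8 children
+// per level yields at least as many blocks as there are processes.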
+uint_t BlockForest::getInitialRefinementLevel(uint_t num_processes) {
+    uint_t splitFactor = 8;
+    uint_t blocks = splitFactor;
+    uint_t refinementLevel = 1;
+
+    while(blocks < num_processes) {
+        refinementLevel++;
+        blocks *= splitFactor;
+    }
+
+    return refinementLevel;
+}
+
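+// Computes the union of the AABBs of all locally owned blocks, i.e. the bounding box
+// of this rank's subdomain.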
+void BlockForest::getBlockForestAABB(double (&rank_aabb)[6]) {
+    auto aabb_union = forest->begin()->getAABB();
+
+    for(auto& iblock: *forest) {
+        auto block = static_cast<blockforest::Block *>(&iblock);
+        aabb_union.merge(block->getAABB());
+    }
+
+    rank_aabb[0] = aabb_union.xMin();
+    rank_aabb[1] = aabb_union.xMax();
+    rank_aabb[2] = aabb_union.yMin();
+    rank_aabb[3] = aabb_union.yMax();
+    rank_aabb[4] = aabb_union.zMin();
+    rank_aabb[5] = aabb_union.zMax();
+}
+
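+// Initializes the waLBerla MPI manager and creates the block forest, either on a
+// regular process grid or with refined blocks when load balancing is used.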
+void BlockForest::setConfig(int *argc, char ***argv) {
+    auto mpiManager = mpi::MPIManager::instance();
+    mpiManager->initializeMPI(argc, argv);
+    mpiManager->useWorldComm();
+
+    math::AABB domain(xmin, ymin, zmin, xmax, ymax, zmax);
+    int gridsize[3] = {32, 32, 32};
+    auto procs = mpiManager->numProcesses();
+    auto block_config = use_load_balancing ? Vector3<uint_t>(1, 1, 1) : getBlockConfig(procs, gridsize[0], gridsize[1], gridsize[2]);
+    auto ref_level = use_load_balancing ? getInitialRefinementLevel(procs) : 0;
+
+    forest = blockforest::createBlockForest(
+        domain, block_config, Vector3<bool>(true, true, true), procs, ref_level);
+
+    auto is_within_domain = bind(isWithinBlockForest, _1, _2, _3, forest);
+    auto info = make_shared<blockforest::InfoCollection>();
+    double rank_aabb[6];
+    getBlockForestAABB(rank_aabb);
+}
+
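+// Builds a Cartesian communicator to determine neighbor ranks, periodic-boundary
+// flags and the regular subdomain bounds of this rank.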
+void BlockForest::setBoundingBox() {
+    MPI_Comm cartesian;
+    int *myloc = new int[ndims];
+    int *periods = new int[ndims];
+    real_t *rank_length = new real_t[ndims];
+    int reorder = 0;
+
+    for(int d = 0; d < ndims; d++) {
+        periods[d] = 1;
+        rank_length[d] = (this->grid_max[d] - this->grid_min[d]) / (real_t) nranks[d];
+    }
+
+    MPI_Cart_create(MPI_COMM_WORLD, ndims, nranks, periods, reorder, &cartesian);
+    MPI_Cart_get(cartesian, ndims, nranks, periods, myloc);
+    for(int d = 0; d < ndims; d++) {
+        MPI_Cart_shift(cartesian, d, 1, &(prev[d]), &(next[d]));
+        pbc_prev[d] = (myloc[d] == 0) ? 1 : 0;
+        pbc_next[d] = (myloc[d] == nranks[d] - 1) ? -1 : 0;
+        subdom_min[d] = this->grid_min[d] + rank_length[d] * (real_t)myloc[d];
+        subdom_max[d] = subdom_min[d] + rank_length[d];
+    }
+
+    delete[] myloc;
+    delete[] periods;
+    delete[] rank_length;
+    MPI_Comm_free(&cartesian);
+}
+
+void BlockForest::initialize(int *argc, char ***argv) {
+    MPI_Init(argc, argv);
+    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    this->setConfig(argc, argv);
+    this->setBoundingBox();
+}
+
+void BlockForest::finalize() {
+    MPI_Finalize();
+}
+
+int BlockForest::isWithinSubdomain(real_t x, real_t y, real_t z) {
+    for(auto& iblock: *forest) {
+        auto block = static_cast<blockforest::Block *>(&iblock);
+
+        if(block->getAABB().contains(x, y, z)) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+void BlockForest::fillArrays(int *neighbor_ranks, int *pbc, real_t *subdom) {
+    for(int d = 0; d < ndims; d++) {
+        neighbor_ranks[d * 2 + 0] = prev[d];
+        neighbor_ranks[d * 2 + 1] = next[d];
+        pbc[d * 2 + 0] = pbc_prev[d];
+        pbc[d * 2 + 1] = pbc_next[d];
+        subdom[d * 2 + 0] = subdom_min[d];
+        subdom[d * 2 + 1] = subdom_max[d];
+    }
+}
+
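+// Exchanges the number of elements to send/receive with the previous and next
+// neighbor in the given dimension; when a neighbor is this rank itself, the sizes
+// are copied locally.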
+void BlockForest::communicateSizes(int dim, const int *send_sizes, int *recv_sizes) {
+    if(prev[dim] != rank) {
+        MPI_Send(&send_sizes[dim * 2 + 0], 1, MPI_INT, prev[dim], 0, MPI_COMM_WORLD);
+        MPI_Recv(&recv_sizes[dim * 2 + 0], 1, MPI_INT, next[dim], 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+    } else {
+        recv_sizes[dim * 2 + 0] = send_sizes[dim * 2 + 0];
+    }
+
+    if(next[dim] != rank) {
+        MPI_Send(&send_sizes[dim * 2 + 1], 1, MPI_INT, next[dim], 0, MPI_COMM_WORLD);
+        MPI_Recv(&recv_sizes[dim * 2 + 1], 1, MPI_INT, prev[dim], 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+    } else {
+        recv_sizes[dim * 2 + 1] = send_sizes[dim * 2 + 1];
+    }
+}
+
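+// Exchanges particle data with the previous and next neighbor in one dimension via
+// MPI_Sendrecv; self-communication falls back to a device-side copy.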
+void BlockForest::communicateData(
+    int dim, int elem_size,
+    const real_t *send_buf, const int *send_offsets, const int *nsend,
+    real_t *recv_buf, const int *recv_offsets, const int *nrecv) {
+
+    //MPI_Request recv_requests[2];
+    //MPI_Request send_requests[2];
+    const real_t *send_prev = &send_buf[send_offsets[dim * 2 + 0] * elem_size];
+    const real_t *send_next = &send_buf[send_offsets[dim * 2 + 1] * elem_size];
+    real_t *recv_prev = &recv_buf[recv_offsets[dim * 2 + 0] * elem_size];
+    real_t *recv_next = &recv_buf[recv_offsets[dim * 2 + 1] * elem_size];
+
+    if(prev[dim] != rank) {
+        MPI_Sendrecv(
+            send_prev, nsend[dim * 2 + 0] * elem_size, MPI_DOUBLE, prev[dim], 0,
+            recv_prev, nrecv[dim * 2 + 0] * elem_size, MPI_DOUBLE, next[dim], 0,
+            MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+
+        /*
+        MPI_Irecv(
+            recv_prev, nrecv[dim * 2 + 0] * elem_size, MPI_DOUBLE, prev[dim], 0,
+            MPI_COMM_WORLD, &recv_requests[0]);
+
+        MPI_Isend(
+            send_prev, nsend[dim * 2 + 0] * elem_size, MPI_DOUBLE, prev[dim], 0,
+            MPI_COMM_WORLD, &send_requests[0]);
+        */
+    } else {
+        pairs::copy_in_device(recv_prev, send_prev, nsend[dim * 2 + 0] * elem_size * sizeof(real_t));
+    }
+
+    if(next[dim] != rank) {
+        MPI_Sendrecv(
+            send_next, nsend[dim * 2 + 1] * elem_size, MPI_DOUBLE, next[dim], 0,
+            recv_next, nrecv[dim * 2 + 1] * elem_size, MPI_DOUBLE, prev[dim], 0,
+            MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+
+        /*
+        MPI_Irecv(
+            recv_next, nrecv[dim * 2 + 1] * elem_size, MPI_DOUBLE, next[dim], 0,
+            MPI_COMM_WORLD, &recv_requests[1]);
+
+        MPI_Isend(
+            send_next, nsend[dim * 2 + 1] * elem_size, MPI_DOUBLE, next[dim], 0,
+            MPI_COMM_WORLD, &send_requests[1]);
+        */
+    } else {
+        pairs::copy_in_device(recv_next, send_next, nsend[dim * 2 + 1] * elem_size * sizeof(real_t));
+    }
+
+    //MPI_Waitall(2, recv_requests, MPI_STATUSES_IGNORE);
+    //MPI_Waitall(2, send_requests, MPI_STATUSES_IGNORE);
+}
+
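+// Same as communicateData(), but exchanges the data of all dimensions in a single
+// call instead of one dimension at a time.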
+void BlockForest::communicateAllData(
+    int ndims, int elem_size,
+    const real_t *send_buf, const int *send_offsets, const int *nsend,
+    real_t *recv_buf, const int *recv_offsets, const int *nrecv) {
+
+    //std::vector<MPI_Request> send_requests(ndims * 2, MPI_REQUEST_NULL);
+    //std::vector<MPI_Request> recv_requests(ndims * 2, MPI_REQUEST_NULL);
+
+    for (int d = 0; d < ndims; d++) {
+        const real_t *send_prev = &send_buf[send_offsets[d * 2 + 0] * elem_size];
+        const real_t *send_next = &send_buf[send_offsets[d * 2 + 1] * elem_size];
+        real_t *recv_prev = &recv_buf[recv_offsets[d * 2 + 0] * elem_size];
+        real_t *recv_next = &recv_buf[recv_offsets[d * 2 + 1] * elem_size];
+
+        if (prev[d] != rank) {
+            MPI_Sendrecv(
+                send_prev, nsend[d * 2 + 0] * elem_size, MPI_DOUBLE, prev[d], 0,
+                recv_prev, nrecv[d * 2 + 0] * elem_size, MPI_DOUBLE, next[d], 0,
+                MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+
+            /*
+            MPI_Isend(
+                send_prev, nsend[d * 2 + 0] * elem_size, MPI_DOUBLE, prev[d], 0,
+                MPI_COMM_WORLD, &send_requests[d * 2 + 0]);
+
+            MPI_Irecv(
+                recv_prev, nrecv[d * 2 + 0] * elem_size, MPI_DOUBLE, prev[d], 0,
+                MPI_COMM_WORLD, &recv_requests[d * 2 + 0]);
+            */
+        } else {
+            pairs::copy_in_device(recv_prev, send_prev, nsend[d * 2 + 0] * elem_size * sizeof(real_t));
+        }
+
+        if (next[d] != rank) {
+            MPI_Sendrecv(
+                send_next, nsend[d * 2 + 1] * elem_size, MPI_DOUBLE, next[d], 0,
+                recv_next, nrecv[d * 2 + 1] * elem_size, MPI_DOUBLE, prev[d], 0,
+                MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+
+            /*
+            MPI_Isend(
+                send_next, nsend[d * 2 + 1] * elem_size, MPI_DOUBLE, next[d], 0,
+                MPI_COMM_WORLD, &send_requests[d * 2 + 1]);
+
+            MPI_Irecv(
+                recv_next, nrecv[d * 2 + 1] * elem_size, MPI_DOUBLE, next[d], 0,
+                MPI_COMM_WORLD, &recv_requests[d * 2 + 1]);
+            */
+        } else {
+            pairs::copy_in_device(recv_next, send_next, nsend[d * 2 + 1] * elem_size * sizeof(real_t));
+        }
+    }
+
+    //MPI_Waitall(ndims * 2, send_requests.data(), MPI_STATUSES_IGNORE);
+    //MPI_Waitall(ndims * 2, recv_requests.data(), MPI_STATUSES_IGNORE);
+}
+
+}
diff --git a/runtime/domain/block_forest.hpp b/runtime/domain/block_forest.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e5f4a68745798b2d8ec0a55909b53ac47f85401e
--- /dev/null
+++ b/runtime/domain/block_forest.hpp
@@ -0,0 +1,76 @@
+#pragma once
+
+#include <map>
+#include <memory>
+#include <vector>
+//---
+#include <blockforest/BlockForest.h>
+#include <blockforest/loadbalancing/InfoCollection.h>
+//---
+#include "../pairs_common.hpp"
+#include "domain_partitioning.hpp"
+
+#define SMALL 0.00001
+
+using namespace walberla;
+
+namespace pairs {
+
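+// Domain partitioner backed by a waLBerla block forest; block AABBs of neighboring
+// processes are gathered at runtime and exposed to the generated communication code.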
+class BlockForest : public DomainPartitioner {
+private:
+    std::shared_ptr<blockforest::BlockForest> forest;
+    std::map<uint_t, std::vector<math::AABB>> neighborhood;
+    std::map<uint_t, std::vector<BlockID>> blocks_pushed;
+    std::vector<int> vec_ranks;
+    std::vector<int> vec_naabbs;
+    std::vector<double> vec_aabbs;
+    real_t *subdom_min;
+    real_t *subdom_max;
+    int world_size, rank, total_aabbs;
+
+public:
+    BlockForest(
+        real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax) :
+        DomainPartitioner(xmin, xmax, ymin, ymax, zmin, zmax) {
+
+        subdom_min = new real_t[ndims];
+        subdom_max = new real_t[ndims];
+    }
+
+    ~BlockForest() {
+        delete[] subdom_min;
+        delete[] subdom_max;
+    }
+
+    void updateNeighborhood(
+        std::shared_ptr<blockforest::BlockForest> forest, blockforest::InfoCollection& info,
+        int *ranks, int *naabbs, real_t *aabbs);
+    void updateWeights(
+        std::shared_ptr<blockforest::BlockForest> forest, blockforest::InfoCollection& info,
+        real_t *position, int nparticles);
+    Vector3<uint_t> getBlockConfig(uint_t num_processes, uint_t nx, uint_t ny, uint_t nz);
+    uint_t getInitialRefinementLevel(uint_t num_processes);
+    void getBlockForestAABB(double (&rank_aabb)[6]);
+    void setConfig(int *argc, char ***argv);
+    void setBoundingBox();
+    void initialize(int *argc, char ***argv);
+    void finalize();
+    int getWorldSize() const { return world_size; }
+    int getRank() const { return rank; }
+    int isWithinSubdomain(real_t x, real_t y, real_t z);
+    void fillArrays(int *neighbor_ranks, int *pbc, real_t *subdom);
+    void communicateSizes(int dim, const int *send_sizes, int *recv_sizes);
+    void communicateData(
+        int dim, int elem_size,
+        const real_t *send_buf, const int *send_offsets, const int *nsend,
+        real_t *recv_buf, const int *recv_offsets, const int *nrecv);
+
+    void communicateAllData(
+        int ndims, int elem_size,
+        const real_t *send_buf, const int *send_offsets, const int *nsend,
+        real_t *recv_buf, const int *recv_offsets, const int *nrecv);
+};
+
+}
diff --git a/runtime/pairs.hpp b/runtime/pairs.hpp
index 8944dfda738602867d7a9fb768cf81fd7e083d90..8eeb50f40d6ad13226b713598bfdffe8f8ac1228 100644
--- a/runtime/pairs.hpp
+++ b/runtime/pairs.hpp
@@ -13,6 +13,7 @@
 #include "runtime_var.hpp"
 #include "timers.hpp"
 #include "devices/device.hpp"
+#include "domain/block_forest.hpp"
 #include "domain/regular_6d_stencil.hpp"
 
 #pragma once
diff --git a/src/pairs/__init__.py b/src/pairs/__init__.py
index e89e0a79cbb8f3e39143dc3461a9734a6dcf8d7b..3099350a0320f56b9b5a6401565415c09f32cb75 100644
--- a/src/pairs/__init__.py
+++ b/src/pairs/__init__.py
@@ -65,3 +65,6 @@ def regular_domain_partitioner():
 
 def regular_domain_partitioner_xy():
     return DomainPartitioners.RegularXY
+
+def block_forest():
+    return DomainPartitioners.BlockForest
diff --git a/src/pairs/sim/domain.py b/src/pairs/sim/domain.py
index a30635397da67196f6d72b8c337d42e7a0062779..8126b58fea395785fcf5e486ca9573afa0d8049d 100644
--- a/src/pairs/sim/domain.py
+++ b/src/pairs/sim/domain.py
@@ -1,6 +1,4 @@
 from pairs.ir.block import pairs_inline
-from pairs.ir.functions import Call_Void
-from pairs.ir.types import Types
 from pairs.sim.lowerable import Lowerable
 
 
@@ -10,8 +8,4 @@ class InitializeDomain(Lowerable):
 
     @pairs_inline
     def lower(self):
-        dom_part = self.sim.domain_partitioning()
-        grid_array = [(self.sim.grid.min(d), self.sim.grid.max(d)) for d in range(self.sim.ndims())]
-        Call_Void(self.sim, "pairs->initDomain", [param for delim in grid_array for param in delim]),
-        Call_Void(self.sim, "pairs->fillCommunicationArrays", [dom_part.neighbor_ranks, dom_part.pbc, dom_part.subdom])
-
+        self.sim.domain_partitioning().initialize()
diff --git a/src/pairs/sim/domain_partitioners.py b/src/pairs/sim/domain_partitioners.py
index 6e00ad848f8496fbf8c24aa36ac3a06e813dcc07..b2e6fe28f5ac69539af78ade8800d3395915d095 100644
--- a/src/pairs/sim/domain_partitioners.py
+++ b/src/pairs/sim/domain_partitioners.py
@@ -2,10 +2,10 @@ class DomainPartitioners:
     Invalid = -1
     Regular = 0
     RegularXY = 1
-    BoxList = 2
+    BlockForest = 2
 
     def c_keyword(layout):
-        return "Regular"    if layout == DomainPartitioners.Regular else \
-               "RegularXY"  if layout == DomainPartitioners.RegularXY else \
-               "BoxList"    if layout == DomainPartitioners.BoxList else \
+        return "Regular"        if layout == DomainPartitioners.Regular else \
+               "RegularXY"      if layout == DomainPartitioners.RegularXY else \
+               "BlockForest"    if layout == DomainPartitioners.BlockForest else \
                "Invalid"
diff --git a/src/pairs/sim/domain_partitioning.py b/src/pairs/sim/domain_partitioning.py
index 901df44744426921cb896e9c2781caff73c5521d..b4198e02c7084d1f9fcfaf30c4b74b38ee838bd4 100644
--- a/src/pairs/sim/domain_partitioning.py
+++ b/src/pairs/sim/domain_partitioning.py
@@ -1,5 +1,6 @@
 from pairs.ir.block import pairs_device_block, pairs_host_block
 from pairs.ir.branches import Branch, Filter
+from pairs.ir.functions import Call_Void
 from pairs.ir.loops import For, ParticleFor
 from pairs.ir.utils import Print
 from pairs.ir.scalars import ScalarOp
@@ -28,6 +29,11 @@ class DimensionRanges:
     def step_indexes(self, step):
         return [step * 2 + 0, step * 2 + 1]
 
+    def initialize(self):
+        grid_array = [(self.sim.grid.min(d), self.sim.grid.max(d)) for d in range(self.sim.ndims())]
+        Call_Void(self.sim, "pairs->initDomain", [param for delim in grid_array for param in delim]),
+        Call_Void(self.sim, "pairs->fillCommunicationArrays", [self.neighbor_ranks, self.pbc, self.subdom])
+
     def ghost_particles(self, step, position, offset=0.0):
         # Particles with one of the following flags are ignored
         flags_to_exclude = (Flags.Infinite | Flags.Global)
@@ -44,7 +50,6 @@ class DimensionRanges:
 
         def prev_neighbor(self, j, step, position, offset, flags_to_exclude):
             particle_flags = self.sim.particle_flags
-            j = step * 2 + 1
             for i in For(self.sim, 0, self.sim.nlocal + self.sim.nghost):
                 for _ in Filter(self.sim, ScalarOp.cmp(particle_flags[i] & flags_to_exclude, 0)):
                     for _ in Filter(self.sim, position[i][step] > self.subdom[j] - offset):
@@ -63,3 +68,62 @@
             j = step * 2 + 1
             for _ in Filter(self.sim, ScalarOp.inline(ScalarOp.cmp(self.pbc[j], 0))):
                 yield from prev_neighbor(self, j, step, position, offset, flags_to_exclude)
+
+
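+# Domain partitioning backed by waLBerla's block forest (runtime/domain/block_forest.cpp);
+# the runtime fills the rank, offset and AABB arrays with the neighboring blocks of this rank.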
+class BlockForest:
+    def __init__(self, sim):
+        self.sim                = sim
+        self.nranks             = sim.add_var('nranks', Types.Int32)
+        self.nranks_capacity    = sim.add_var('nranks_capacity', Types.Int32)
+        self.aabb_capacity      = sim.add_var('aabb_capacity', Types.Int32)
+        self.ranks              = sim.add_static_array('ranks', [self.nranks_capacity], Types.Int32)
+        self.naabbs             = sim.add_static_array('naabbs', [self.nranks_capacity], Types.Int32)
+        self.offsets            = sim.add_static_array('rank_offsets', [self.nranks_capacity], Types.Int32)
+        self.pbc                = sim.add_static_array('pbc', [self.aabb_capacity, 3], Types.Int32)
+        self.aabbs              = sim.add_static_array('aabbs', [self.aabb_capacity, 6], Types.Real)
+        self.subdom             = sim.add_static_array('subdom', [sim.ndims() * 2], Types.Real)
+
+    def min(self, dim):
+        return self.subdom[dim * 2 + 0]
+
+    def max(self, dim):
+        return self.subdom[dim * 2 + 1]
+
+    def number_of_steps(self):
+        return 1
+
+    def step_indexes(self, step):
+        return [step]
+
+    def initialize(self):
+        grid_array = [(self.sim.grid.min(d), self.sim.grid.max(d)) for d in range(self.sim.ndims())]
+        Call_Void(self.sim, "pairs->initDomain", [param for delim in grid_array for param in delim]),
+        Call_Void(self.sim, "pairs->fillCommunicationArrays", [self.ranks, self.pbc, self.subdom])
+
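+    # Yields, for every local particle that lies within "offset" of a neighbor block's
+    # AABB, the particle index, the neighbor index, its rank and the PBC shifts to apply.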
+    def ghost_particles(self, step, position, offset=0.0):
+        # Particles with one of the following flags are ignored
+        flags_to_exclude = (Flags.Infinite | Flags.Global)
+
+        for i in For(self.sim, 0, self.sim.nlocal):
+            particle_flags = self.sim.particle_flags
+
+            for _ in Filter(self.sim, ScalarOp.cmp(particle_flags[i] & flags_to_exclude, 0)):
+                for r in For(self.sim, 0, self.nranks):
+                    for aabb_id in For(self.sim, self.offsets[r], self.offsets[r] + self.naabbs[r]):
+                        full_cond = None
+
+                        for d in range(self.sim.ndims()):
+                            d_cond = ScalarOp.and_op(
+                                position[i][d] > self.aabbs[aabb_id][d * 2 + 0] - offset,
+                                position[i][d] < self.aabbs[aabb_id][d * 2 + 1] + offset)
+
+                            full_cond = d_cond if full_cond is None else \
+                                        ScalarOp.and_op(full_cond, d_cond)
+
+                        for _ in Filter(self.sim, full_cond):
+                            pbc_shifts = [self.pbc[aabb_id][d] for d in range(self.sim.ndims())]
+                            yield i, r, self.ranks[r], pbc_shifts
diff --git a/src/pairs/sim/simulation.py b/src/pairs/sim/simulation.py
index 291f085666fdb6cdac4e4f3d3b3a28969ab76c78..defaf998a2a04c12d244c3e6f6e1b47fce0e9ac4 100644
--- a/src/pairs/sim/simulation.py
+++ b/src/pairs/sim/simulation.py
@@ -19,7 +19,7 @@ from pairs.sim.copper_fcc_lattice import CopperFCCLattice
 from pairs.sim.dem_sc_grid import DEMSCGrid
 from pairs.sim.domain import InitializeDomain
 from pairs.sim.domain_partitioners import DomainPartitioners
-from pairs.sim.domain_partitioning import DimensionRanges
+from pairs.sim.domain_partitioning import BlockForest, DimensionRanges
 from pairs.sim.features import AllocateFeatureProperties
 from pairs.sim.grid import Grid2D, Grid3D
 from pairs.sim.instrumentation import RegisterMarkers, RegisterTimers
@@ -107,6 +107,9 @@ class Simulation:
         if partitioner in (DomainPartitioners.Regular, DomainPartitioners.RegularXY):
             self._dom_part = DimensionRanges(self)
 
+        elif partitioner == DomainPartitioners.BlockForest:
+            self._dom_part = BlockForest(self)
+
         else:
             raise Exception("Invalid domain partitioner.")