From 191df2bc3fd40daaecc255fd387b32fa2f0ade02 Mon Sep 17 00:00:00 2001
From: Behzad Safaei <iwia103h@alex1.nhr.fau.de>
Date: Wed, 16 Oct 2024 11:01:28 +0200
Subject: [PATCH] Made walberla seperable, resolved cuda compilation issue
 Fixed more bugs with variables, but GPU code segfaults

---
 CMakeLists.txt                  |  36 +++++------
 examples/dem_sd.py              |   4 +-
 examples/main.cpp               |  90 +++++++++++++++-------------
 runtime/boundary_weights.hpp    |   5 +-
 runtime/devices/cuda.cu         |   2 +-
 runtime/devices/device.hpp      |   2 +-
 runtime/domain/block_forest.cpp |  31 ++++++++++
 runtime/domain/block_forest.hpp |  63 ++++++++-----------
 runtime/pairs.cpp               |  10 +++-
 runtime/pairs.hpp               |   9 ++-
 src/pairs/code_gen/cgen.py      | 103 +++++++++++++++++++++++---------
 11 files changed, 223 insertions(+), 132 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a23e845..8b82269 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -15,8 +15,8 @@ if(NOT CMAKE_BUILD_TYPE)
     set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build (Debug, Release, etc.)" FORCE)
 endif()
 
-set(CMAKE_CXX_FLAGS_RELEASE "-g -O0")
-set(CMAKE_CXX_FLAGS_DEBUG "-O3 -DDEBUG")
+set(CMAKE_CXX_FLAGS_RELEASE "-O3")
+set(CMAKE_CXX_FLAGS_DEBUG "-g -O0 -DDEBUG")
 
 string(TOLOWER "${TESTCASE}" TESTCASE)
 message(STATUS "Selected testcase: ${TESTCASE}")
@@ -28,7 +28,6 @@ option(ENABLE_GPU_DIRECT "ENABLE_GPU_DIRECT" ON)
 option(GENERATE_WHOLE_PROGRAM "GENERATE_WHOLE_PROGRAM" OFF)
 
 if(USE_WALBERLA)
-    # SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_WALBERLA_LOAD_BALANCING -DWALBERLA_BUILD_WITH_CUDA ")
     SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_WALBERLA_LOAD_BALANCING ")
     find_package(waLBerla REQUIRED)
     add_subdirectory(${walberla_SOURCE_DIR} ${walberla_BINARY_DIR} EXCLUDE_FROM_ALL)
@@ -40,7 +39,7 @@ set(GEN_HEADER_DIR ${CMAKE_CURRENT_BINARY_DIR}/runtime/interfaces)
 
 set(RUNTIME_COMMON_FILES
     runtime/pairs.cpp
-    runtime/boundary_weights.cpp
+    # runtime/boundary_weights.cpp  # avoid compiling this for now. TODO: generate the host and device functions
     runtime/copper_fcc_lattice.cpp
     runtime/create_body.cpp
     runtime/dem_sc_grid.cpp
@@ -49,9 +48,11 @@ set(RUNTIME_COMMON_FILES
     runtime/thermo.cpp
     runtime/timing.cpp
     runtime/vtk.cpp
-    runtime/domain/block_forest.cpp
     runtime/domain/regular_6d_stencil.cpp)
 
+set(RUNTIME_WALBERLA_FILES
+    runtime/domain/block_forest.cpp)
+
 set(PYTHON_EXECUTABLE ${PYTHON_EXECUTABLE} CACHE STRING "Python executable not set.")
 
 if(NOT GENERATE_WHOLE_PROGRAM)
@@ -64,6 +65,10 @@ else()
     set(EXEC_FILES ${RUNTIME_COMMON_FILES})
 endif()
 
+if(USE_WALBERLA)
+    set(EXEC_FILES ${EXEC_FILES} ${RUNTIME_WALBERLA_FILES})
+endif()
+
 if(NOT PYTHON_EXECUTABLE)
     set(PYTHON_EXECUTABLE python3)
 endif()
@@ -88,6 +93,7 @@ execute_process(
 
 # CUDA compilation
 if(COMPILE_CUDA)
+    cmake_policy(SET CMP0074 NEW)
     find_package(CUDA REQUIRED)
     enable_language(CUDA)
     set(GEN_SOURCES "${CMAKE_BINARY_DIR}/${TESTCASE}.cu")
@@ -100,33 +106,29 @@ if(COMPILE_CUDA)
     endif()
 
     set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -rdc=true")
 
     if(USE_WALBERLA)
-        set(WALBERLA_BUILD_WITH_CUDA ON)
-        set(WALBERLA_BUILD_WITH_GPU_SUPPORT ON)
-
         waLBerla_add_executable(
             NAME ${TARGET_BIN}
             FILES ${EXEC_FILES} ${GEN_SOURCES}
-            DEPENDS blockforest core pe gpu)
+            DEPENDS blockforest core pe)
     else()
-        cuda_add_executable(${TARGET_BIN} ${GEN_SOURCES} ${EXEC_FILES})
+        add_executable(${TARGET_BIN} ${GEN_SOURCES} ${EXEC_FILES})
     endif()
 
     if(ENABLE_GPU_DIRECT)
-        target_compile_options(${TARGET_BIN} PRIVATE -DENABLE_CUDA_AWARE_MPI)
+        target_compile_definitions(${TARGET_BIN} PRIVATE ENABLE_CUDA_AWARE_MPI)
     endif()
 
-    cuda_add_library(runtime STATIC runtime/devices/cuda.cu)
+    add_library(runtime STATIC runtime/devices/cuda.cu)
     target_compile_features(runtime PUBLIC cxx_std_11)
     set_target_properties(runtime PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
     set_target_properties(runtime PROPERTIES CUDA_ARCHITECTURES ${CUDA_ARCH})
-    target_compile_options(runtime PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-arch=sm_${CUDA_ARCH}>)
     target_compile_definitions(runtime PRIVATE PAIRS_TARGET_CUDA)
-    target_include_directories(runtime PRIVATE ${CUDA_INCLUDE_DIRS} ${GEN_HEADER_DIR})
+    target_include_directories(runtime PRIVATE ${CUDA_INCLUDE_DIRS})
 
     set_target_properties(${TARGET_BIN} PROPERTIES CUDA_ARCHITECTURES ${CUDA_ARCH})
-    target_compile_options(${TARGET_BIN} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-arch=sm_${CUDA_ARCH}>)
     target_compile_definitions(${TARGET_BIN} PRIVATE PAIRS_TARGET_CUDA)
     target_include_directories(${TARGET_BIN} PRIVATE ${CUDA_INCLUDE_DIRS} ${GEN_HEADER_DIR})
 
@@ -145,6 +147,7 @@ else()
     endif()
 
     add_library(runtime STATIC runtime/devices/dummy.cpp)
+    target_include_directories(${TARGET_BIN} PRIVATE ${GEN_HEADER_DIR})
 endif()
 
 target_link_libraries(${TARGET_BIN} runtime)
@@ -155,8 +158,7 @@ add_custom_command(
     COMMENT "Generating code with P4IRS"
     DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/examples/${TESTCASE}.py)
 
-target_include_directories(${TARGET_BIN} PRIVATE ${GEN_HEADER_DIR})
-add_custom_target(generated_code DEPENDS ${CMAKE_BINARY_DIR}/${CPU_SRC} ${GEN_HEADER})
+add_custom_target(generated_code DEPENDS ${GEN_SOURCES} ${GEN_HEADER})
 add_dependencies(${TARGET_BIN} generated_code)
 
 target_link_libraries(${TARGET_BIN} ${CMAKE_EXE_LINKER_FLAGS})
diff --git a/examples/dem_sd.py b/examples/dem_sd.py
index 2fe5ed0..3a8bdfa 100644
--- a/examples/dem_sd.py
+++ b/examples/dem_sd.py
@@ -130,8 +130,8 @@ psim.add_feature_property('type', 'friction_dynamic', pairs.real(), [frictionDyn
 
 psim.set_domain([0.0, 0.0, 0.0, domainSize_SI[0], domainSize_SI[1], domainSize_SI[2]])
 # psim.set_domain_partitioner(pairs.block_forest(), initDomainFromWalberla=True)
-psim.set_domain_partitioner(pairs.block_forest())
-# psim.set_domain_partitioner(pairs.regular_domain_partitioner_xy())
+# psim.set_domain_partitioner(pairs.block_forest())
+psim.set_domain_partitioner(pairs.regular_domain_partitioner_xy())
 psim.pbc([False, False, False])
 psim.dem_sc_grid(
     domainSize_SI[0], domainSize_SI[1], domainSize_SI[2]/2, generationSpacing_SI,
diff --git a/examples/main.cpp b/examples/main.cpp
index 5e2e609..a663ba8 100644
--- a/examples/main.cpp
+++ b/examples/main.cpp
@@ -1,31 +1,41 @@
 #include <iostream>
 //---
 #include "dem_sd.hpp"
-#include "core/mpi/Broadcast.h"
+// #include "core/mpi/Broadcast.h"
+// #include <blockforest/BlockForest.h>
+// #include <blockforest/Initialization.h>
+// #include <blockforest/loadbalancing/DynamicCurve.h>
+// #include <blockforest/loadbalancing/DynamicDiffusive.h>
+// #include <blockforest/loadbalancing/DynamicParMetis.h>
+// #include <blockforest/loadbalancing/InfoCollection.h>
+// #include <blockforest/loadbalancing/PODPhantomData.h>
+// #include <blockforest/loadbalancing/level_determination/MinMaxLevelDetermination.h>
+// #include <blockforest/loadbalancing/weight_assignment/MetisAssignmentFunctor.h>
+// #include <blockforest/loadbalancing/weight_assignment/WeightAssignmentFunctor.h>
 
 int main(int argc, char **argv) {
     auto pairs_sim = std::make_shared<PairsSimulation>();
-    auto pairs_acc = std::make_shared<PairsAccessor>(pairs_sim);
+    // auto pairs_acc = std::make_shared<PairsAccessor>(pairs_sim);
 
     // Create forest (make sure to use_domain(forest)) ----------------------------------------------
-    walberla::math::AABB domain(0, 0, 0, 0.1, 0.1, 0.1);
-    std::shared_ptr<walberla::mpi::MPIManager> mpiManager = walberla::mpi::MPIManager::instance();
-    mpiManager->initializeMPI(&argc, &argv);
-    mpiManager->useWorldComm();
-    auto rank = mpiManager->rank();
-    auto procs = mpiManager->numProcesses();
-    auto block_config = walberla::Vector3<int>(2, 2, 1);
-    auto ref_level = 0;
-    std::shared_ptr<walberla::BlockForest> forest = walberla::blockforest::createBlockForest(
-            domain, block_config, walberla::Vector3<bool>(false, false, false), procs, ref_level);
+    // walberla::math::AABB domain(0, 0, 0, 0.1, 0.1, 0.1);
+    // std::shared_ptr<walberla::mpi::MPIManager> mpiManager = walberla::mpi::MPIManager::instance();
+    // mpiManager->initializeMPI(&argc, &argv);
+    // mpiManager->useWorldComm();
+    // auto rank = mpiManager->rank();
+    // auto procs = mpiManager->numProcesses();
+    // auto block_config = walberla::Vector3<int>(2, 2, 1);
+    // auto ref_level = 0;
+    // std::shared_ptr<walberla::BlockForest> forest = walberla::blockforest::createBlockForest(
+    //         domain, block_config, walberla::Vector3<bool>(false, false, false), procs, ref_level);
     //-----------------------------------------------------------------------------------------------
 
     // initialize pairs data structures ----------------------------------------------
     pairs_sim->initialize();
 
     // either create new domain or use an existing one ----------------------------------------
-    // pairs_sim->create_domain(argc, argv);
-    pairs_sim->use_domain(forest);
+    pairs_sim->create_domain(argc, argv);
+    // pairs_sim->use_domain(forest);
 
     // create planes and particles ------------------------------------------------------------
     pairs_sim->create_halfspace(0,     0,      0,      1, 0, 0,        0, 13);
@@ -39,43 +49,43 @@ int main(int argc, char **argv) {
     pairs_sim->create_sphere(0.07,   0.07,   0.08,   -0.5, -0.5, 0 , 1000, 0.0045, 0, 0);
 
     // Tracking a particle ------------------------------------------------------------------------
-    if (pUid != pairs_acc->getInvalidUid()){
-        std::cout<< "Particle " << pUid << " is created in rank " << rank << std::endl;
-    }
+    // if (pUid != pairs_acc->getInvalidUid()){
+    //     std::cout<< "Particle " << pUid << " is created in rank " << rank << std::endl;
+    // }
     
-    walberla::mpi::allReduceInplace(pUid, walberla::mpi::SUM);
+    // walberla::mpi::allReduceInplace(pUid, walberla::mpi::SUM);
 
-    if (pUid != pairs_acc->getInvalidUid()){
-        std::cout<< "Particle " << pUid << " will be tracked by rank " << rank << std::endl;
-    }
+    // if (pUid != pairs_acc->getInvalidUid()){
+    //     std::cout<< "Particle " << pUid << " will be tracked by rank " << rank << std::endl;
+    // }
 
-    auto pIsInMyRank = [&](pairs::id_t uid){return pairs_acc->uidToIdx(uid) != pairs_acc->getInvalidIdx();};
+    // auto pIsInMyRank = [&](pairs::id_t uid){return pairs_acc->uidToIdx(uid) != pairs_acc->getInvalidIdx();};
 
 
     // TODO: make sure linkedCellWidth is larger than max diameter in the system
     // setup particles, setup functions, and the cell list stencil-------------------------------
     pairs_sim->setup_sim();
 
-    for(int i=0; i<pairs_acc->size(); ++i){
-        if(pairs_acc->getShape(i) == 0){
-            std::cout<< "rank: " << rank << " sphere " <<  pairs_acc->getUid(i) << "  " << pairs_acc->getPosition(i) << std::endl; 
-        }
-        else if(pairs_acc->getShape(i) == 1){
-            std::cout<< "rank: " << rank << " halfspace " <<  pairs_acc->getUid(i) << "  " << pairs_acc->getPosition(i) << pairs_acc->getNormal(i) << std::endl; 
-        }
-    }
-
-    if (pIsInMyRank(pUid)){
-        int idx = pairs_acc->uidToIdx(pUid);
-        pairs_acc->setPosition(idx, walberla::Vector3<double>(0.01, 0.01, 0.08));
-        pairs_acc->setLinearVelocity(idx, walberla::Vector3<double>(0.5, 0.5, 0.5));
-    }
+    // for(int i=0; i<pairs_acc->size(); ++i){
+    //     if(pairs_acc->getShape(i) == 0){
+    //         std::cout<< "rank: " << rank << " sphere " <<  pairs_acc->getUid(i) << "  " << pairs_acc->getPosition(i) << std::endl; 
+    //     }
+    //     else if(pairs_acc->getShape(i) == 1){
+    //         std::cout<< "rank: " << rank << " halfspace " <<  pairs_acc->getUid(i) << "  " << pairs_acc->getPosition(i) << pairs_acc->getNormal(i) << std::endl; 
+    //     }
+    // }
+
+    // if (pIsInMyRank(pUid)){
+    //     int idx = pairs_acc->uidToIdx(pUid);
+    //     pairs_acc->setPosition(idx, walberla::Vector3<double>(0.01, 0.01, 0.08));
+    //     pairs_acc->setLinearVelocity(idx, walberla::Vector3<double>(0.5, 0.5, 0.5));
+    // }
 
     for (int i=0; i<10000; ++i){
-        if ((i%200==0) && (pIsInMyRank(pUid))){
-            int idx = pairs_acc->uidToIdx(pUid);
-            std::cout<< "Tracked particle is now in rank " << rank << " --- " << pairs_acc->getPosition(idx)<< std::endl;
-        }
+        // if ((i%200==0) && (pIsInMyRank(pUid))){
+        //     int idx = pairs_acc->uidToIdx(pUid);
+        //     std::cout<< "Tracked particle is now in rank " << rank << " --- " << pairs_acc->getPosition(idx)<< std::endl;
+        // }
 
         pairs_sim->do_timestep(i);
     }
diff --git a/runtime/boundary_weights.hpp b/runtime/boundary_weights.hpp
index c5d0e69..cd079d1 100644
--- a/runtime/boundary_weights.hpp
+++ b/runtime/boundary_weights.hpp
@@ -15,6 +15,9 @@ namespace pairs {
 void compute_boundary_weights(
     PairsRuntime *ps,
     real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax,
-    long unsigned int *comp_weight, long unsigned int *comm_weight);
+    long unsigned int *comp_weight, long unsigned int *comm_weight){
+        std::cerr<< "TODO: boundary weights should be generated" << std::endl;
+        exit(-1);
+    };
 
 }
diff --git a/runtime/devices/cuda.cu b/runtime/devices/cuda.cu
index adc79cb..38848b2 100644
--- a/runtime/devices/cuda.cu
+++ b/runtime/devices/cuda.cu
@@ -2,6 +2,7 @@
 #include <iostream>
 #include <cstring>
 #include "../pairs_common.hpp"
+// #include "device.hpp"
 
 #define CUDA_ASSERT(a) { pairs::cuda_assert((a), __FILE__, __LINE__); }
 
@@ -73,7 +74,6 @@ __host__ void copy_static_symbol_to_host(void *d_ptr, const void *h_ptr, size_t
 }
 
 #if __CUDA_ARCH__ < 600
-//#error "CUDA architecture is less than 600"
 __device__ double atomicAdd_double(double* address, double val) {
     unsigned long long int * ull_addr = (unsigned long long int*) address;
     unsigned long long int old = *ull_addr, assumed;
diff --git a/runtime/devices/device.hpp b/runtime/devices/device.hpp
index 6151ed0..60656d4 100644
--- a/runtime/devices/device.hpp
+++ b/runtime/devices/device.hpp
@@ -72,7 +72,7 @@ inline __host__ int host_atomic_add_resize_check(int *addr, int val, int *resize
     return host_atomic_add(addr, val);
 }
 
-#if defined(PAIRS_TARGET_CUDA) && defined(__CUDA_ARCH__)
+#ifdef PAIRS_TARGET_CUDA
 __device__ double atomicAdd_double(double* address, double val);
 __device__ int atomic_add(int *addr, int val);
 __device__ real_t atomic_add(real_t *addr, real_t val);
diff --git a/runtime/domain/block_forest.cpp b/runtime/domain/block_forest.cpp
index 553d76b..fcb1988 100644
--- a/runtime/domain/block_forest.cpp
+++ b/runtime/domain/block_forest.cpp
@@ -21,6 +21,37 @@
 
 namespace pairs {
 
+BlockForest::BlockForest(
+        PairsRuntime *ps_,
+        real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax, bool pbcx, bool pbcy, bool pbcz) :
+        DomainPartitioner(xmin, xmax, ymin, ymax, zmin, zmax), ps(ps_), globalPBC{pbcx, pbcy, pbcz} {
+
+        subdom = new real_t[ndims * 2];
+    }
+
+BlockForest::BlockForest(PairsRuntime *ps_, const std::shared_ptr<walberla::blockforest::BlockForest> &bf) :
+        forest(bf),
+        DomainPartitioner(bf->getDomain().xMin(), bf->getDomain().xMax(),
+                        bf->getDomain().yMin(), bf->getDomain().yMax(),
+                        bf->getDomain().zMin(), bf->getDomain().zMax()), 
+        ps(ps_), 
+        globalPBC{bf->isXPeriodic(), bf->isYPeriodic(), bf->isZPeriodic()} 
+        {
+            subdom = new real_t[ndims * 2];
+            balance_workload = 0;
+
+            mpiManager = walberla::mpi::MPIManager::instance();
+            world_size = mpiManager->numProcesses();
+            rank = mpiManager->rank();
+            this->info = make_shared<walberla::blockforest::InfoCollection>();
+
+            if(balance_workload) {
+                this->initializeWorkloadBalancer();
+            }
+
+        }
+
+
 void BlockForest::updateNeighborhood() {
     std::map<int, std::vector<walberla::math::AABB>> neighborhood;
     std::map<int, std::vector<walberla::BlockID>> blocks_pushed;
diff --git a/runtime/domain/block_forest.hpp b/runtime/domain/block_forest.hpp
index ee4188a..d607616 100644
--- a/runtime/domain/block_forest.hpp
+++ b/runtime/domain/block_forest.hpp
@@ -1,14 +1,6 @@
-#include <blockforest/BlockForest.h>
-#include <blockforest/Initialization.h>
-#include <blockforest/loadbalancing/DynamicCurve.h>
-#include <blockforest/loadbalancing/DynamicDiffusive.h>
-#include <blockforest/loadbalancing/DynamicParMetis.h>
-#include <blockforest/loadbalancing/InfoCollection.h>
-#include <blockforest/loadbalancing/PODPhantomData.h>
-#include <blockforest/loadbalancing/level_determination/MinMaxLevelDetermination.h>
-#include <blockforest/loadbalancing/weight_assignment/MetisAssignmentFunctor.h>
-#include <blockforest/loadbalancing/weight_assignment/WeightAssignmentFunctor.h>
-//---
+#include <memory>
+#include <map>
+
 #include "../pairs_common.hpp"
 #include "domain_partitioning.hpp"
 
@@ -16,6 +8,23 @@
 
 #define SMALL 0.00001
 
+namespace walberla {
+    namespace blockforest{
+        class BlockForest;
+        class BlockID;
+        class BlockInfo;
+        using InfoCollection = std::map<BlockID, BlockInfo>;
+    }
+
+    namespace mpi {
+        class MPIManager;
+    }
+
+    namespace math{
+        template<typename T> 
+        class Vector3;
+    }
+}
 namespace pairs {
 
 class PairsRuntime;
@@ -23,7 +32,7 @@ class PairsRuntime;
 class BlockForest : public DomainPartitioner {
 private:
     std::shared_ptr<walberla::mpi::MPIManager> mpiManager;
-    std::shared_ptr<walberla::BlockForest> forest;
+    std::shared_ptr<walberla::blockforest::BlockForest> forest;
     std::shared_ptr<walberla::blockforest::InfoCollection> info;
     std::vector<int> ranks;
     std::vector<int> naabbs;
@@ -38,33 +47,9 @@ private:
 public:
     BlockForest(
         PairsRuntime *ps_,
-        real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax, bool pbcx, bool pbcy, bool pbcz) :
-        DomainPartitioner(xmin, xmax, ymin, ymax, zmin, zmax), ps(ps_), globalPBC{pbcx, pbcy, pbcz} {
-
-        subdom = new real_t[ndims * 2];
-    }
-
-    BlockForest(PairsRuntime *ps_, const std::shared_ptr<walberla::BlockForest> &bf) :
-        forest(bf),
-        DomainPartitioner(bf->getDomain().xMin(), bf->getDomain().xMax(),
-                        bf->getDomain().yMin(), bf->getDomain().yMax(),
-                        bf->getDomain().zMin(), bf->getDomain().zMax()), 
-        ps(ps_), 
-        globalPBC{bf->isXPeriodic(), bf->isYPeriodic(), bf->isZPeriodic()} 
-        {
-            subdom = new real_t[ndims * 2];
-            balance_workload = 0;
-
-            mpiManager = walberla::mpi::MPIManager::instance();
-            world_size = mpiManager->numProcesses();
-            rank = mpiManager->rank();
-            this->info = make_shared<walberla::blockforest::InfoCollection>();
-
-            if(balance_workload) {
-                this->initializeWorkloadBalancer();
-            }
+        real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax, bool pbcx, bool pbcy, bool pbcz);
 
-        }
+    BlockForest(PairsRuntime *ps_, const std::shared_ptr<walberla::blockforest::BlockForest> &bf);
 
     ~BlockForest() {
         delete[] subdom;
@@ -81,7 +66,7 @@ public:
     void initializeWorkloadBalancer();
     void updateNeighborhood();
     void updateWeights();
-    walberla::Vector3<int> getBlockConfig(int num_processes, int nx, int ny, int nz);
+    walberla::math::Vector3<int> getBlockConfig(int num_processes, int nx, int ny, int nz);
     int getInitialRefinementLevel(int num_processes);
     void setBoundingBox();
     void rebalance();
diff --git a/runtime/pairs.cpp b/runtime/pairs.cpp
index e89fff1..a0b767c 100644
--- a/runtime/pairs.cpp
+++ b/runtime/pairs.cpp
@@ -34,9 +34,15 @@ void PairsRuntime::initDomain(
     } else if(dom_part_type == RegularXYPartitioning) {
         const int flags[] = {1, 1, 0};
         dom_part = new Regular6DStencil(xmin, xmax, ymin, ymax, zmin, zmax, flags);
-    } else if(dom_part_type == BlockForestPartitioning) {
+    } 
+    
+#ifdef USE_WALBERLA
+    else if(dom_part_type == BlockForestPartitioning) {
         dom_part = new BlockForest(this, xmin, xmax, ymin, ymax, zmin, zmax, pbcx, pbcy, pbcz);
-    } else {
+    } 
+#endif
+
+    else {
         PAIRS_ERROR("Domain partitioning type not implemented!\n");
         exit(-1);
     }
diff --git a/runtime/pairs.hpp b/runtime/pairs.hpp
index 1de253c..68fb2d6 100644
--- a/runtime/pairs.hpp
+++ b/runtime/pairs.hpp
@@ -364,10 +364,15 @@ void PairsRuntime::useDomain(const std::shared_ptr<Domain_T> &domain_ptr){
         PAIRS_ERROR("useDomain not implemented for Regular6DStencil!\n");
         exit(-1);
 
-    } else if(dom_part_type == BlockForestPartitioning) {
+    } 
+    
+#ifdef USE_WALBERLA
+    else if(dom_part_type == BlockForestPartitioning) {
         dom_part = new BlockForest(this, domain_ptr);
+    } 
+#endif
 
-    } else {
+    else {
         PAIRS_ERROR("Domain partitioning type not implemented!\n");
         exit(-1);
     }
diff --git a/src/pairs/code_gen/cgen.py b/src/pairs/code_gen/cgen.py
index d88e1cb..bfdd48c 100644
--- a/src/pairs/code_gen/cgen.py
+++ b/src/pairs/code_gen/cgen.py
@@ -76,6 +76,11 @@ class CGen:
 
         if isinstance(obj, FeatureProperty) and device and obj.device_flag:
             return name
+        
+        if isinstance(obj, Array) and device and obj.device_flag:
+            if obj.is_static():
+                return name
+        
 
         if self.generate_full_object_names:
             return f"pobj->{name}"
@@ -96,7 +101,10 @@ class CGen:
         self.print.start()
         self.print("#pragma once")
         self.generate_interface_namespace('pairs_host_interface')
-        self.generate_interface_namespace('pairs_cuda_interface', "__inline__ __device__")
+
+        if self.target.is_gpu():
+            self.generate_interface_namespace('pairs_cuda_interface', "__inline__ __device__")
+            
         self.print.end()
 
     def generate_interface_namespace(self, namespace, prefix=None):
@@ -159,29 +167,12 @@ class CGen:
         self.print("using namespace pairs;")
         self.print("")
 
-        if self.target.is_gpu():
-            for array in self.sim.arrays.statics():
-                if array.device_flag:
-                    t = array.type()
-                    tkw = Types.c_keyword(self.sim, t)
-                    size = self.generate_expression(ScalarOp.inline(array.alloc_size()))
-                    self.print(f"__constant__ {tkw} d_{array.name()}[{size}];")
-
-            for feature_prop in self.sim.feature_properties:
-                if feature_prop.device_flag:
-                    t = feature_prop.type()
-                    tkw = Types.c_keyword(self.sim, t)
-                    size = feature_prop.array_size()
-                    self.print(f"__constant__ {tkw} d_{feature_prop.name()}[{size}];")
-
-        self.print("")
-
     def generate_module_headers(self):
         self.print("")
 
         self.print("// Module headers")
         for module in self.sim.modules():
-            self.print(f"void {module.name}(struct PairsObjects *pobj);")
+            self.print(f"void {module.name}(PairsRuntime *pairs_runtime, struct PairsObjects *pobj);")
 
         self.print("")
 
@@ -261,6 +252,24 @@ class CGen:
         self.print("")
 
     def generate_pairs_object_structure(self):
+
+        self.print("")
+
+        if self.target.is_gpu():
+            for array in self.sim.arrays.statics():
+                if array.device_flag:
+                    t = array.type()
+                    tkw = Types.c_keyword(self.sim, t)
+                    size = self.generate_expression(ScalarOp.inline(array.alloc_size()))
+                    self.print(f"extern __constant__ {tkw} d_{array.name()}[{size}];")
+
+            for feature_prop in self.sim.feature_properties:
+                if feature_prop.device_flag:
+                    t = feature_prop.type()
+                    tkw = Types.c_keyword(self.sim, t)
+                    size = feature_prop.array_size()
+                    self.print(f"extern __constant__ {tkw} d_{feature_prop.name()}[{size}];")
+
         self.print("")
         self.print("struct PairsObjects {")
         self.print.add_indent(4)
@@ -278,7 +287,10 @@ class CGen:
                 self.print(f"{tkw} *{ptr};")
 
             if self.target.is_gpu() and a.device_flag:
-                self.print(f"{tkw} *d_{ptr};")
+                if a.is_static():
+                    continue
+                else:
+                    self.print(f"{tkw} *d_{ptr};")
 
         self.print("// Properties")
         for p in self.sim.properties:
@@ -343,6 +355,21 @@ class CGen:
         self.generate_preamble()
         self.print(f"#include \"{self.ref}.hpp\"")
 
+        if self.target.is_gpu():
+            for array in self.sim.arrays.statics():
+                if array.device_flag:
+                    t = array.type()
+                    tkw = Types.c_keyword(self.sim, t)
+                    size = self.generate_expression(ScalarOp.inline(array.alloc_size()))
+                    self.print(f"__constant__ {tkw} d_{array.name()}[{size}];")
+
+            for feature_prop in self.sim.feature_properties:
+                if feature_prop.device_flag:
+                    t = feature_prop.type()
+                    tkw = Types.c_keyword(self.sim, t)
+                    size = feature_prop.array_size()
+                    self.print(f"__constant__ {tkw} d_{feature_prop.name()}[{size}];")
+                    
         for kernel in self.sim.kernels():
             self.generate_kernel(kernel)
 
@@ -355,6 +382,7 @@ class CGen:
         # Generate library header
         self.print = Printer(self.ref + ".hpp")
         self.print.start()
+        self.print("#pragma once")
 
         self.generate_preamble()
         self.generate_pairs_object_structure()
@@ -433,7 +461,7 @@ class CGen:
         self.print.add_indent(-4)
         self.print("};")
 
-        self.generate_host_pairs_accessor_class()
+        # self.generate_host_pairs_accessor_class()
         
         self.print.end()
         self.generate_full_object_names = False
@@ -468,7 +496,7 @@ class CGen:
             self.generate_full_object_names = False
 
         else:
-            self.print(f"void {module.name}(struct PairsObjects *pobj) {{")
+            self.print(f"void {module.name}(PairsRuntime *pairs_runtime, struct PairsObjects *pobj) {{")
             self.print.add_indent(4)
             device_cond = module.run_on_device and self.target.is_gpu()
 
@@ -481,17 +509,25 @@ class CGen:
 
             for var in module.write_variables():
                 type_kw = Types.c_keyword(self.sim, var.type())
-                name = f"rv_{var.name()}.getDevicePointer()" if device_cond and var.device_flag else var.name()
-                self.print(f"{type_kw} *{var.name()} = &(pobj->{name});")
+
+                if device_cond and var.device_flag:
+                    self.print(f"{type_kw} *{var.name()} = pobj->rv_{var.name()}.getDevicePointer();")
+                else:
+                    self.print(f"{type_kw} *{var.name()} = &(pobj->{var.name()});")
 
             for array in module.arrays():
                 type_kw = Types.c_keyword(self.sim, array.type())
                 name = array.name() if not device_cond else f"d_{array.name()}"
-                self.print(f"{type_kw} *{array.name()} = pobj->{name};")
+                # self.generate_full_object_names = True
+                if not array.is_static() or (array.is_static() and not module.run_on_device):
+                    self.print(f"{type_kw} *{array.name()} = pobj->{name};")
+                    # self.print(f"{type_kw} *{array.name()} = {self.generate_object_reference(array, device=device_cond)};")
+                # self.generate_full_object_names = False
 
                 if array in module.host_references():
                     self.print(f"{type_kw} *h_{array.name()} = pobj->{array.name()};")
 
+
             for prop in module.properties():
                 type_kw = Types.c_keyword(self.sim, prop.type())
                 name = prop.name() if not device_cond else f"d_{prop.name()}"
@@ -511,7 +547,12 @@ class CGen:
             for feature_prop in module.feature_properties():
                 type_kw = Types.c_keyword(self.sim, feature_prop.type())
                 name = feature_prop.name() if not device_cond else f"d_{feature_prop.name()}"
-                self.print(f"{type_kw} *{feature_prop.name()} = pobj->{name};")
+
+                if feature_prop.device_flag and device_cond:
+                    # self.print(f"{type_kw} *{feature_prop.name()} = {self.generate_object_reference(feature_prop, device=device_cond)};")
+                    continue
+                else:
+                    self.print(f"{type_kw} *{feature_prop.name()} = pobj->{name};")
 
                 if feature_prop in module.host_references():
                     self.print(f"{type_kw} *h_{feature_prop.name()} = pobj->{feature_prop.name()};")
@@ -533,6 +574,8 @@ class CGen:
             kernel_params += f", {decl}"
 
         for array in kernel.arrays():
+            if array.is_static():
+                continue
             type_kw = Types.c_keyword(self.sim, array.type())
             decl = f"{type_kw} *{array.name()}"
             kernel_params += f", {decl}"
@@ -548,6 +591,8 @@ class CGen:
             kernel_params += f", {decl}"
 
         for feature_prop in kernel.feature_properties():
+            if feature_prop.device_flag:
+                continue
             type_kw = Types.c_keyword(self.sim, feature_prop.type())
             decl = f"{type_kw} *{feature_prop.name()}"
             kernel_params += f", {decl}"
@@ -870,6 +915,8 @@ class CGen:
                 kernel_params += f", {var.name()}"
 
             for array in kernel.arrays():
+                if array.is_static():
+                    continue
                 kernel_params += f", {array.name()}"
 
             for prop in kernel.properties():
@@ -879,6 +926,8 @@ class CGen:
                 kernel_params += f", {contact_prop.name()}"
 
             for feature_prop in kernel.feature_properties():
+                if feature_prop.device_flag:
+                    continue     
                 kernel_params += f", {feature_prop.name()}"
 
             for array_access in kernel.array_accesses():
@@ -897,7 +946,7 @@ class CGen:
             self.print("}")
 
         if isinstance(ast_node, ModuleCall):
-            self.print(f"{ast_node.module.name}(pobj);")
+            self.print(f"{ast_node.module.name}(pairs_runtime, pobj);")
 
         if isinstance(ast_node, Print):
             args = ast_node.args
-- 
GitLab