From 191df2bc3fd40daaecc255fd387b32fa2f0ade02 Mon Sep 17 00:00:00 2001 From: Behzad Safaei <iwia103h@alex1.nhr.fau.de> Date: Wed, 16 Oct 2024 11:01:28 +0200 Subject: [PATCH] Made walberla seperable, resolved cuda compilation issue Fixed more bugs with variables, but GPU code segfaults --- CMakeLists.txt | 36 +++++------ examples/dem_sd.py | 4 +- examples/main.cpp | 90 +++++++++++++++------------- runtime/boundary_weights.hpp | 5 +- runtime/devices/cuda.cu | 2 +- runtime/devices/device.hpp | 2 +- runtime/domain/block_forest.cpp | 31 ++++++++++ runtime/domain/block_forest.hpp | 63 ++++++++----------- runtime/pairs.cpp | 10 +++- runtime/pairs.hpp | 9 ++- src/pairs/code_gen/cgen.py | 103 +++++++++++++++++++++++--------- 11 files changed, 223 insertions(+), 132 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a23e845..8b82269 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,8 +15,8 @@ if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build (Debug, Release, etc.)" FORCE) endif() -set(CMAKE_CXX_FLAGS_RELEASE "-g -O0") -set(CMAKE_CXX_FLAGS_DEBUG "-O3 -DDEBUG") +set(CMAKE_CXX_FLAGS_RELEASE "-O3") +set(CMAKE_CXX_FLAGS_DEBUG "-g -O0 -DDEBUG") string(TOLOWER "${TESTCASE}" TESTCASE) message(STATUS "Selected testcase: ${TESTCASE}") @@ -28,7 +28,6 @@ option(ENABLE_GPU_DIRECT "ENABLE_GPU_DIRECT" ON) option(GENERATE_WHOLE_PROGRAM "GENERATE_WHOLE_PROGRAM" OFF) if(USE_WALBERLA) - # SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_WALBERLA_LOAD_BALANCING -DWALBERLA_BUILD_WITH_CUDA ") SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_WALBERLA_LOAD_BALANCING ") find_package(waLBerla REQUIRED) add_subdirectory(${walberla_SOURCE_DIR} ${walberla_BINARY_DIR} EXCLUDE_FROM_ALL) @@ -40,7 +39,7 @@ set(GEN_HEADER_DIR ${CMAKE_CURRENT_BINARY_DIR}/runtime/interfaces) set(RUNTIME_COMMON_FILES runtime/pairs.cpp - runtime/boundary_weights.cpp + # runtime/boundary_weights.cpp # avoid compiling this for now. TODO: generate the host and device functions runtime/copper_fcc_lattice.cpp runtime/create_body.cpp runtime/dem_sc_grid.cpp @@ -49,9 +48,11 @@ set(RUNTIME_COMMON_FILES runtime/thermo.cpp runtime/timing.cpp runtime/vtk.cpp - runtime/domain/block_forest.cpp runtime/domain/regular_6d_stencil.cpp) +set(RUNTIME_WALBERLA_FILES + runtime/domain/block_forest.cpp) + set(PYTHON_EXECUTABLE ${PYTHON_EXECUTABLE} CACHE STRING "Python executable not set.") if(NOT GENERATE_WHOLE_PROGRAM) @@ -64,6 +65,10 @@ else() set(EXEC_FILES ${RUNTIME_COMMON_FILES}) endif() +if(USE_WALBERLA) + set(EXEC_FILES ${EXEC_FILES} ${RUNTIME_WALBERLA_FILES}) +endif() + if(NOT PYTHON_EXECUTABLE) set(PYTHON_EXECUTABLE python3) endif() @@ -88,6 +93,7 @@ execute_process( # CUDA compilation if(COMPILE_CUDA) + cmake_policy(SET CMP0074 NEW) find_package(CUDA REQUIRED) enable_language(CUDA) set(GEN_SOURCES "${CMAKE_BINARY_DIR}/${TESTCASE}.cu") @@ -100,33 +106,29 @@ if(COMPILE_CUDA) endif() set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH}) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -rdc=true") if(USE_WALBERLA) - set(WALBERLA_BUILD_WITH_CUDA ON) - set(WALBERLA_BUILD_WITH_GPU_SUPPORT ON) - waLBerla_add_executable( NAME ${TARGET_BIN} FILES ${EXEC_FILES} ${GEN_SOURCES} - DEPENDS blockforest core pe gpu) + DEPENDS blockforest core pe) else() - cuda_add_executable(${TARGET_BIN} ${GEN_SOURCES} ${EXEC_FILES}) + add_executable(${TARGET_BIN} ${GEN_SOURCES} ${EXEC_FILES}) endif() if(ENABLE_GPU_DIRECT) - target_compile_options(${TARGET_BIN} PRIVATE -DENABLE_CUDA_AWARE_MPI) + target_compile_definitions(${TARGET_BIN} PRIVATE ENABLE_CUDA_AWARE_MPI) endif() - cuda_add_library(runtime STATIC runtime/devices/cuda.cu) + add_library(runtime STATIC runtime/devices/cuda.cu) target_compile_features(runtime PUBLIC cxx_std_11) set_target_properties(runtime PROPERTIES CUDA_SEPARABLE_COMPILATION ON) set_target_properties(runtime PROPERTIES CUDA_ARCHITECTURES ${CUDA_ARCH}) - target_compile_options(runtime PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-arch=sm_${CUDA_ARCH}>) target_compile_definitions(runtime PRIVATE PAIRS_TARGET_CUDA) - target_include_directories(runtime PRIVATE ${CUDA_INCLUDE_DIRS} ${GEN_HEADER_DIR}) + target_include_directories(runtime PRIVATE ${CUDA_INCLUDE_DIRS}) set_target_properties(${TARGET_BIN} PROPERTIES CUDA_ARCHITECTURES ${CUDA_ARCH}) - target_compile_options(${TARGET_BIN} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-arch=sm_${CUDA_ARCH}>) target_compile_definitions(${TARGET_BIN} PRIVATE PAIRS_TARGET_CUDA) target_include_directories(${TARGET_BIN} PRIVATE ${CUDA_INCLUDE_DIRS} ${GEN_HEADER_DIR}) @@ -145,6 +147,7 @@ else() endif() add_library(runtime STATIC runtime/devices/dummy.cpp) + target_include_directories(${TARGET_BIN} PRIVATE ${GEN_HEADER_DIR}) endif() target_link_libraries(${TARGET_BIN} runtime) @@ -155,8 +158,7 @@ add_custom_command( COMMENT "Generating code with P4IRS" DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/examples/${TESTCASE}.py) -target_include_directories(${TARGET_BIN} PRIVATE ${GEN_HEADER_DIR}) -add_custom_target(generated_code DEPENDS ${CMAKE_BINARY_DIR}/${CPU_SRC} ${GEN_HEADER}) +add_custom_target(generated_code DEPENDS ${GEN_SOURCES} ${GEN_HEADER}) add_dependencies(${TARGET_BIN} generated_code) target_link_libraries(${TARGET_BIN} ${CMAKE_EXE_LINKER_FLAGS}) diff --git a/examples/dem_sd.py b/examples/dem_sd.py index 2fe5ed0..3a8bdfa 100644 --- a/examples/dem_sd.py +++ b/examples/dem_sd.py @@ -130,8 +130,8 @@ psim.add_feature_property('type', 'friction_dynamic', pairs.real(), [frictionDyn psim.set_domain([0.0, 0.0, 0.0, domainSize_SI[0], domainSize_SI[1], domainSize_SI[2]]) # psim.set_domain_partitioner(pairs.block_forest(), initDomainFromWalberla=True) -psim.set_domain_partitioner(pairs.block_forest()) -# psim.set_domain_partitioner(pairs.regular_domain_partitioner_xy()) +# psim.set_domain_partitioner(pairs.block_forest()) +psim.set_domain_partitioner(pairs.regular_domain_partitioner_xy()) psim.pbc([False, False, False]) psim.dem_sc_grid( domainSize_SI[0], domainSize_SI[1], domainSize_SI[2]/2, generationSpacing_SI, diff --git a/examples/main.cpp b/examples/main.cpp index 5e2e609..a663ba8 100644 --- a/examples/main.cpp +++ b/examples/main.cpp @@ -1,31 +1,41 @@ #include <iostream> //--- #include "dem_sd.hpp" -#include "core/mpi/Broadcast.h" +// #include "core/mpi/Broadcast.h" +// #include <blockforest/BlockForest.h> +// #include <blockforest/Initialization.h> +// #include <blockforest/loadbalancing/DynamicCurve.h> +// #include <blockforest/loadbalancing/DynamicDiffusive.h> +// #include <blockforest/loadbalancing/DynamicParMetis.h> +// #include <blockforest/loadbalancing/InfoCollection.h> +// #include <blockforest/loadbalancing/PODPhantomData.h> +// #include <blockforest/loadbalancing/level_determination/MinMaxLevelDetermination.h> +// #include <blockforest/loadbalancing/weight_assignment/MetisAssignmentFunctor.h> +// #include <blockforest/loadbalancing/weight_assignment/WeightAssignmentFunctor.h> int main(int argc, char **argv) { auto pairs_sim = std::make_shared<PairsSimulation>(); - auto pairs_acc = std::make_shared<PairsAccessor>(pairs_sim); + // auto pairs_acc = std::make_shared<PairsAccessor>(pairs_sim); // Create forest (make sure to use_domain(forest)) ---------------------------------------------- - walberla::math::AABB domain(0, 0, 0, 0.1, 0.1, 0.1); - std::shared_ptr<walberla::mpi::MPIManager> mpiManager = walberla::mpi::MPIManager::instance(); - mpiManager->initializeMPI(&argc, &argv); - mpiManager->useWorldComm(); - auto rank = mpiManager->rank(); - auto procs = mpiManager->numProcesses(); - auto block_config = walberla::Vector3<int>(2, 2, 1); - auto ref_level = 0; - std::shared_ptr<walberla::BlockForest> forest = walberla::blockforest::createBlockForest( - domain, block_config, walberla::Vector3<bool>(false, false, false), procs, ref_level); + // walberla::math::AABB domain(0, 0, 0, 0.1, 0.1, 0.1); + // std::shared_ptr<walberla::mpi::MPIManager> mpiManager = walberla::mpi::MPIManager::instance(); + // mpiManager->initializeMPI(&argc, &argv); + // mpiManager->useWorldComm(); + // auto rank = mpiManager->rank(); + // auto procs = mpiManager->numProcesses(); + // auto block_config = walberla::Vector3<int>(2, 2, 1); + // auto ref_level = 0; + // std::shared_ptr<walberla::BlockForest> forest = walberla::blockforest::createBlockForest( + // domain, block_config, walberla::Vector3<bool>(false, false, false), procs, ref_level); //----------------------------------------------------------------------------------------------- // initialize pairs data structures ---------------------------------------------- pairs_sim->initialize(); // either create new domain or use an existing one ---------------------------------------- - // pairs_sim->create_domain(argc, argv); - pairs_sim->use_domain(forest); + pairs_sim->create_domain(argc, argv); + // pairs_sim->use_domain(forest); // create planes and particles ------------------------------------------------------------ pairs_sim->create_halfspace(0, 0, 0, 1, 0, 0, 0, 13); @@ -39,43 +49,43 @@ int main(int argc, char **argv) { pairs_sim->create_sphere(0.07, 0.07, 0.08, -0.5, -0.5, 0 , 1000, 0.0045, 0, 0); // Tracking a particle ------------------------------------------------------------------------ - if (pUid != pairs_acc->getInvalidUid()){ - std::cout<< "Particle " << pUid << " is created in rank " << rank << std::endl; - } + // if (pUid != pairs_acc->getInvalidUid()){ + // std::cout<< "Particle " << pUid << " is created in rank " << rank << std::endl; + // } - walberla::mpi::allReduceInplace(pUid, walberla::mpi::SUM); + // walberla::mpi::allReduceInplace(pUid, walberla::mpi::SUM); - if (pUid != pairs_acc->getInvalidUid()){ - std::cout<< "Particle " << pUid << " will be tracked by rank " << rank << std::endl; - } + // if (pUid != pairs_acc->getInvalidUid()){ + // std::cout<< "Particle " << pUid << " will be tracked by rank " << rank << std::endl; + // } - auto pIsInMyRank = [&](pairs::id_t uid){return pairs_acc->uidToIdx(uid) != pairs_acc->getInvalidIdx();}; + // auto pIsInMyRank = [&](pairs::id_t uid){return pairs_acc->uidToIdx(uid) != pairs_acc->getInvalidIdx();}; // TODO: make sure linkedCellWidth is larger than max diameter in the system // setup particles, setup functions, and the cell list stencil------------------------------- pairs_sim->setup_sim(); - for(int i=0; i<pairs_acc->size(); ++i){ - if(pairs_acc->getShape(i) == 0){ - std::cout<< "rank: " << rank << " sphere " << pairs_acc->getUid(i) << " " << pairs_acc->getPosition(i) << std::endl; - } - else if(pairs_acc->getShape(i) == 1){ - std::cout<< "rank: " << rank << " halfspace " << pairs_acc->getUid(i) << " " << pairs_acc->getPosition(i) << pairs_acc->getNormal(i) << std::endl; - } - } - - if (pIsInMyRank(pUid)){ - int idx = pairs_acc->uidToIdx(pUid); - pairs_acc->setPosition(idx, walberla::Vector3<double>(0.01, 0.01, 0.08)); - pairs_acc->setLinearVelocity(idx, walberla::Vector3<double>(0.5, 0.5, 0.5)); - } + // for(int i=0; i<pairs_acc->size(); ++i){ + // if(pairs_acc->getShape(i) == 0){ + // std::cout<< "rank: " << rank << " sphere " << pairs_acc->getUid(i) << " " << pairs_acc->getPosition(i) << std::endl; + // } + // else if(pairs_acc->getShape(i) == 1){ + // std::cout<< "rank: " << rank << " halfspace " << pairs_acc->getUid(i) << " " << pairs_acc->getPosition(i) << pairs_acc->getNormal(i) << std::endl; + // } + // } + + // if (pIsInMyRank(pUid)){ + // int idx = pairs_acc->uidToIdx(pUid); + // pairs_acc->setPosition(idx, walberla::Vector3<double>(0.01, 0.01, 0.08)); + // pairs_acc->setLinearVelocity(idx, walberla::Vector3<double>(0.5, 0.5, 0.5)); + // } for (int i=0; i<10000; ++i){ - if ((i%200==0) && (pIsInMyRank(pUid))){ - int idx = pairs_acc->uidToIdx(pUid); - std::cout<< "Tracked particle is now in rank " << rank << " --- " << pairs_acc->getPosition(idx)<< std::endl; - } + // if ((i%200==0) && (pIsInMyRank(pUid))){ + // int idx = pairs_acc->uidToIdx(pUid); + // std::cout<< "Tracked particle is now in rank " << rank << " --- " << pairs_acc->getPosition(idx)<< std::endl; + // } pairs_sim->do_timestep(i); } diff --git a/runtime/boundary_weights.hpp b/runtime/boundary_weights.hpp index c5d0e69..cd079d1 100644 --- a/runtime/boundary_weights.hpp +++ b/runtime/boundary_weights.hpp @@ -15,6 +15,9 @@ namespace pairs { void compute_boundary_weights( PairsRuntime *ps, real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax, - long unsigned int *comp_weight, long unsigned int *comm_weight); + long unsigned int *comp_weight, long unsigned int *comm_weight){ + std::cerr<< "TODO: boundary weights should be generated" << std::endl; + exit(-1); + }; } diff --git a/runtime/devices/cuda.cu b/runtime/devices/cuda.cu index adc79cb..38848b2 100644 --- a/runtime/devices/cuda.cu +++ b/runtime/devices/cuda.cu @@ -2,6 +2,7 @@ #include <iostream> #include <cstring> #include "../pairs_common.hpp" +// #include "device.hpp" #define CUDA_ASSERT(a) { pairs::cuda_assert((a), __FILE__, __LINE__); } @@ -73,7 +74,6 @@ __host__ void copy_static_symbol_to_host(void *d_ptr, const void *h_ptr, size_t } #if __CUDA_ARCH__ < 600 -//#error "CUDA architecture is less than 600" __device__ double atomicAdd_double(double* address, double val) { unsigned long long int * ull_addr = (unsigned long long int*) address; unsigned long long int old = *ull_addr, assumed; diff --git a/runtime/devices/device.hpp b/runtime/devices/device.hpp index 6151ed0..60656d4 100644 --- a/runtime/devices/device.hpp +++ b/runtime/devices/device.hpp @@ -72,7 +72,7 @@ inline __host__ int host_atomic_add_resize_check(int *addr, int val, int *resize return host_atomic_add(addr, val); } -#if defined(PAIRS_TARGET_CUDA) && defined(__CUDA_ARCH__) +#ifdef PAIRS_TARGET_CUDA __device__ double atomicAdd_double(double* address, double val); __device__ int atomic_add(int *addr, int val); __device__ real_t atomic_add(real_t *addr, real_t val); diff --git a/runtime/domain/block_forest.cpp b/runtime/domain/block_forest.cpp index 553d76b..fcb1988 100644 --- a/runtime/domain/block_forest.cpp +++ b/runtime/domain/block_forest.cpp @@ -21,6 +21,37 @@ namespace pairs { +BlockForest::BlockForest( + PairsRuntime *ps_, + real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax, bool pbcx, bool pbcy, bool pbcz) : + DomainPartitioner(xmin, xmax, ymin, ymax, zmin, zmax), ps(ps_), globalPBC{pbcx, pbcy, pbcz} { + + subdom = new real_t[ndims * 2]; + } + +BlockForest::BlockForest(PairsRuntime *ps_, const std::shared_ptr<walberla::blockforest::BlockForest> &bf) : + forest(bf), + DomainPartitioner(bf->getDomain().xMin(), bf->getDomain().xMax(), + bf->getDomain().yMin(), bf->getDomain().yMax(), + bf->getDomain().zMin(), bf->getDomain().zMax()), + ps(ps_), + globalPBC{bf->isXPeriodic(), bf->isYPeriodic(), bf->isZPeriodic()} + { + subdom = new real_t[ndims * 2]; + balance_workload = 0; + + mpiManager = walberla::mpi::MPIManager::instance(); + world_size = mpiManager->numProcesses(); + rank = mpiManager->rank(); + this->info = make_shared<walberla::blockforest::InfoCollection>(); + + if(balance_workload) { + this->initializeWorkloadBalancer(); + } + + } + + void BlockForest::updateNeighborhood() { std::map<int, std::vector<walberla::math::AABB>> neighborhood; std::map<int, std::vector<walberla::BlockID>> blocks_pushed; diff --git a/runtime/domain/block_forest.hpp b/runtime/domain/block_forest.hpp index ee4188a..d607616 100644 --- a/runtime/domain/block_forest.hpp +++ b/runtime/domain/block_forest.hpp @@ -1,14 +1,6 @@ -#include <blockforest/BlockForest.h> -#include <blockforest/Initialization.h> -#include <blockforest/loadbalancing/DynamicCurve.h> -#include <blockforest/loadbalancing/DynamicDiffusive.h> -#include <blockforest/loadbalancing/DynamicParMetis.h> -#include <blockforest/loadbalancing/InfoCollection.h> -#include <blockforest/loadbalancing/PODPhantomData.h> -#include <blockforest/loadbalancing/level_determination/MinMaxLevelDetermination.h> -#include <blockforest/loadbalancing/weight_assignment/MetisAssignmentFunctor.h> -#include <blockforest/loadbalancing/weight_assignment/WeightAssignmentFunctor.h> -//--- +#include <memory> +#include <map> + #include "../pairs_common.hpp" #include "domain_partitioning.hpp" @@ -16,6 +8,23 @@ #define SMALL 0.00001 +namespace walberla { + namespace blockforest{ + class BlockForest; + class BlockID; + class BlockInfo; + using InfoCollection = std::map<BlockID, BlockInfo>; + } + + namespace mpi { + class MPIManager; + } + + namespace math{ + template<typename T> + class Vector3; + } +} namespace pairs { class PairsRuntime; @@ -23,7 +32,7 @@ class PairsRuntime; class BlockForest : public DomainPartitioner { private: std::shared_ptr<walberla::mpi::MPIManager> mpiManager; - std::shared_ptr<walberla::BlockForest> forest; + std::shared_ptr<walberla::blockforest::BlockForest> forest; std::shared_ptr<walberla::blockforest::InfoCollection> info; std::vector<int> ranks; std::vector<int> naabbs; @@ -38,33 +47,9 @@ private: public: BlockForest( PairsRuntime *ps_, - real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax, bool pbcx, bool pbcy, bool pbcz) : - DomainPartitioner(xmin, xmax, ymin, ymax, zmin, zmax), ps(ps_), globalPBC{pbcx, pbcy, pbcz} { - - subdom = new real_t[ndims * 2]; - } - - BlockForest(PairsRuntime *ps_, const std::shared_ptr<walberla::BlockForest> &bf) : - forest(bf), - DomainPartitioner(bf->getDomain().xMin(), bf->getDomain().xMax(), - bf->getDomain().yMin(), bf->getDomain().yMax(), - bf->getDomain().zMin(), bf->getDomain().zMax()), - ps(ps_), - globalPBC{bf->isXPeriodic(), bf->isYPeriodic(), bf->isZPeriodic()} - { - subdom = new real_t[ndims * 2]; - balance_workload = 0; - - mpiManager = walberla::mpi::MPIManager::instance(); - world_size = mpiManager->numProcesses(); - rank = mpiManager->rank(); - this->info = make_shared<walberla::blockforest::InfoCollection>(); - - if(balance_workload) { - this->initializeWorkloadBalancer(); - } + real_t xmin, real_t xmax, real_t ymin, real_t ymax, real_t zmin, real_t zmax, bool pbcx, bool pbcy, bool pbcz); - } + BlockForest(PairsRuntime *ps_, const std::shared_ptr<walberla::blockforest::BlockForest> &bf); ~BlockForest() { delete[] subdom; @@ -81,7 +66,7 @@ public: void initializeWorkloadBalancer(); void updateNeighborhood(); void updateWeights(); - walberla::Vector3<int> getBlockConfig(int num_processes, int nx, int ny, int nz); + walberla::math::Vector3<int> getBlockConfig(int num_processes, int nx, int ny, int nz); int getInitialRefinementLevel(int num_processes); void setBoundingBox(); void rebalance(); diff --git a/runtime/pairs.cpp b/runtime/pairs.cpp index e89fff1..a0b767c 100644 --- a/runtime/pairs.cpp +++ b/runtime/pairs.cpp @@ -34,9 +34,15 @@ void PairsRuntime::initDomain( } else if(dom_part_type == RegularXYPartitioning) { const int flags[] = {1, 1, 0}; dom_part = new Regular6DStencil(xmin, xmax, ymin, ymax, zmin, zmax, flags); - } else if(dom_part_type == BlockForestPartitioning) { + } + +#ifdef USE_WALBERLA + else if(dom_part_type == BlockForestPartitioning) { dom_part = new BlockForest(this, xmin, xmax, ymin, ymax, zmin, zmax, pbcx, pbcy, pbcz); - } else { + } +#endif + + else { PAIRS_ERROR("Domain partitioning type not implemented!\n"); exit(-1); } diff --git a/runtime/pairs.hpp b/runtime/pairs.hpp index 1de253c..68fb2d6 100644 --- a/runtime/pairs.hpp +++ b/runtime/pairs.hpp @@ -364,10 +364,15 @@ void PairsRuntime::useDomain(const std::shared_ptr<Domain_T> &domain_ptr){ PAIRS_ERROR("useDomain not implemented for Regular6DStencil!\n"); exit(-1); - } else if(dom_part_type == BlockForestPartitioning) { + } + +#ifdef USE_WALBERLA + else if(dom_part_type == BlockForestPartitioning) { dom_part = new BlockForest(this, domain_ptr); + } +#endif - } else { + else { PAIRS_ERROR("Domain partitioning type not implemented!\n"); exit(-1); } diff --git a/src/pairs/code_gen/cgen.py b/src/pairs/code_gen/cgen.py index d88e1cb..bfdd48c 100644 --- a/src/pairs/code_gen/cgen.py +++ b/src/pairs/code_gen/cgen.py @@ -76,6 +76,11 @@ class CGen: if isinstance(obj, FeatureProperty) and device and obj.device_flag: return name + + if isinstance(obj, Array) and device and obj.device_flag: + if obj.is_static(): + return name + if self.generate_full_object_names: return f"pobj->{name}" @@ -96,7 +101,10 @@ class CGen: self.print.start() self.print("#pragma once") self.generate_interface_namespace('pairs_host_interface') - self.generate_interface_namespace('pairs_cuda_interface', "__inline__ __device__") + + if self.target.is_gpu(): + self.generate_interface_namespace('pairs_cuda_interface', "__inline__ __device__") + self.print.end() def generate_interface_namespace(self, namespace, prefix=None): @@ -159,29 +167,12 @@ class CGen: self.print("using namespace pairs;") self.print("") - if self.target.is_gpu(): - for array in self.sim.arrays.statics(): - if array.device_flag: - t = array.type() - tkw = Types.c_keyword(self.sim, t) - size = self.generate_expression(ScalarOp.inline(array.alloc_size())) - self.print(f"__constant__ {tkw} d_{array.name()}[{size}];") - - for feature_prop in self.sim.feature_properties: - if feature_prop.device_flag: - t = feature_prop.type() - tkw = Types.c_keyword(self.sim, t) - size = feature_prop.array_size() - self.print(f"__constant__ {tkw} d_{feature_prop.name()}[{size}];") - - self.print("") - def generate_module_headers(self): self.print("") self.print("// Module headers") for module in self.sim.modules(): - self.print(f"void {module.name}(struct PairsObjects *pobj);") + self.print(f"void {module.name}(PairsRuntime *pairs_runtime, struct PairsObjects *pobj);") self.print("") @@ -261,6 +252,24 @@ class CGen: self.print("") def generate_pairs_object_structure(self): + + self.print("") + + if self.target.is_gpu(): + for array in self.sim.arrays.statics(): + if array.device_flag: + t = array.type() + tkw = Types.c_keyword(self.sim, t) + size = self.generate_expression(ScalarOp.inline(array.alloc_size())) + self.print(f"extern __constant__ {tkw} d_{array.name()}[{size}];") + + for feature_prop in self.sim.feature_properties: + if feature_prop.device_flag: + t = feature_prop.type() + tkw = Types.c_keyword(self.sim, t) + size = feature_prop.array_size() + self.print(f"extern __constant__ {tkw} d_{feature_prop.name()}[{size}];") + self.print("") self.print("struct PairsObjects {") self.print.add_indent(4) @@ -278,7 +287,10 @@ class CGen: self.print(f"{tkw} *{ptr};") if self.target.is_gpu() and a.device_flag: - self.print(f"{tkw} *d_{ptr};") + if a.is_static(): + continue + else: + self.print(f"{tkw} *d_{ptr};") self.print("// Properties") for p in self.sim.properties: @@ -343,6 +355,21 @@ class CGen: self.generate_preamble() self.print(f"#include \"{self.ref}.hpp\"") + if self.target.is_gpu(): + for array in self.sim.arrays.statics(): + if array.device_flag: + t = array.type() + tkw = Types.c_keyword(self.sim, t) + size = self.generate_expression(ScalarOp.inline(array.alloc_size())) + self.print(f"__constant__ {tkw} d_{array.name()}[{size}];") + + for feature_prop in self.sim.feature_properties: + if feature_prop.device_flag: + t = feature_prop.type() + tkw = Types.c_keyword(self.sim, t) + size = feature_prop.array_size() + self.print(f"__constant__ {tkw} d_{feature_prop.name()}[{size}];") + for kernel in self.sim.kernels(): self.generate_kernel(kernel) @@ -355,6 +382,7 @@ class CGen: # Generate library header self.print = Printer(self.ref + ".hpp") self.print.start() + self.print("#pragma once") self.generate_preamble() self.generate_pairs_object_structure() @@ -433,7 +461,7 @@ class CGen: self.print.add_indent(-4) self.print("};") - self.generate_host_pairs_accessor_class() + # self.generate_host_pairs_accessor_class() self.print.end() self.generate_full_object_names = False @@ -468,7 +496,7 @@ class CGen: self.generate_full_object_names = False else: - self.print(f"void {module.name}(struct PairsObjects *pobj) {{") + self.print(f"void {module.name}(PairsRuntime *pairs_runtime, struct PairsObjects *pobj) {{") self.print.add_indent(4) device_cond = module.run_on_device and self.target.is_gpu() @@ -481,17 +509,25 @@ class CGen: for var in module.write_variables(): type_kw = Types.c_keyword(self.sim, var.type()) - name = f"rv_{var.name()}.getDevicePointer()" if device_cond and var.device_flag else var.name() - self.print(f"{type_kw} *{var.name()} = &(pobj->{name});") + + if device_cond and var.device_flag: + self.print(f"{type_kw} *{var.name()} = pobj->rv_{var.name()}.getDevicePointer();") + else: + self.print(f"{type_kw} *{var.name()} = &(pobj->{var.name()});") for array in module.arrays(): type_kw = Types.c_keyword(self.sim, array.type()) name = array.name() if not device_cond else f"d_{array.name()}" - self.print(f"{type_kw} *{array.name()} = pobj->{name};") + # self.generate_full_object_names = True + if not array.is_static() or (array.is_static() and not module.run_on_device): + self.print(f"{type_kw} *{array.name()} = pobj->{name};") + # self.print(f"{type_kw} *{array.name()} = {self.generate_object_reference(array, device=device_cond)};") + # self.generate_full_object_names = False if array in module.host_references(): self.print(f"{type_kw} *h_{array.name()} = pobj->{array.name()};") + for prop in module.properties(): type_kw = Types.c_keyword(self.sim, prop.type()) name = prop.name() if not device_cond else f"d_{prop.name()}" @@ -511,7 +547,12 @@ class CGen: for feature_prop in module.feature_properties(): type_kw = Types.c_keyword(self.sim, feature_prop.type()) name = feature_prop.name() if not device_cond else f"d_{feature_prop.name()}" - self.print(f"{type_kw} *{feature_prop.name()} = pobj->{name};") + + if feature_prop.device_flag and device_cond: + # self.print(f"{type_kw} *{feature_prop.name()} = {self.generate_object_reference(feature_prop, device=device_cond)};") + continue + else: + self.print(f"{type_kw} *{feature_prop.name()} = pobj->{name};") if feature_prop in module.host_references(): self.print(f"{type_kw} *h_{feature_prop.name()} = pobj->{feature_prop.name()};") @@ -533,6 +574,8 @@ class CGen: kernel_params += f", {decl}" for array in kernel.arrays(): + if array.is_static(): + continue type_kw = Types.c_keyword(self.sim, array.type()) decl = f"{type_kw} *{array.name()}" kernel_params += f", {decl}" @@ -548,6 +591,8 @@ class CGen: kernel_params += f", {decl}" for feature_prop in kernel.feature_properties(): + if feature_prop.device_flag: + continue type_kw = Types.c_keyword(self.sim, feature_prop.type()) decl = f"{type_kw} *{feature_prop.name()}" kernel_params += f", {decl}" @@ -870,6 +915,8 @@ class CGen: kernel_params += f", {var.name()}" for array in kernel.arrays(): + if array.is_static(): + continue kernel_params += f", {array.name()}" for prop in kernel.properties(): @@ -879,6 +926,8 @@ class CGen: kernel_params += f", {contact_prop.name()}" for feature_prop in kernel.feature_properties(): + if feature_prop.device_flag: + continue kernel_params += f", {feature_prop.name()}" for array_access in kernel.array_accesses(): @@ -897,7 +946,7 @@ class CGen: self.print("}") if isinstance(ast_node, ModuleCall): - self.print(f"{ast_node.module.name}(pobj);") + self.print(f"{ast_node.module.name}(pairs_runtime, pobj);") if isinstance(ast_node, Print): args = ast_node.args -- GitLab