diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8b3f534af635f30e697b795b47d06f61e3d15f63..a23e8457f1dd21258e0e5922e73a654c3f0515a9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,28 +8,25 @@ if(NOT TESTCASE)
     set(TESTCASE md CACHE STRING "Select the testcase from the following: md, dem" FORCE)
 endif()
 
-set(CUDA_ARCH ${CUDA_ARCH} CACHE STRING "CUDA_ARCH environment variable must be set.")
+set(TARGET_BIN ${TESTCASE})
 
-set(CMAKE_CUDA_ARCHITECTURES 80)
-set(CUDA_ARCH 80)
-# if(NOT CUDA_ARCH)
-#     set(CUDA_ARCH 80)
-# endif()
+# Set default build type if none is specified
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build (Debug, Release, etc.)" FORCE)
+endif()
+
+set(CMAKE_CXX_FLAGS_RELEASE "-O3")
+set(CMAKE_CXX_FLAGS_DEBUG "-g -O0 -DDEBUG")
 
 string(TOLOWER "${TESTCASE}" TESTCASE)
 message(STATUS "Selected testcase: ${TESTCASE}")
 
-option(DEBUG "DEBUG" ON)
 option(USE_WALBERLA "USE_WALBERLA" ON)
 option(USE_MPI "USE_MPI" ON)
 option(COMPILE_CUDA "COMPILE_CUDA" ON)
 option(ENABLE_GPU_DIRECT "ENABLE_GPU_DIRECT" ON)
 option(GENERATE_WHOLE_PROGRAM "GENERATE_WHOLE_PROGRAM" OFF)
 
-if(DEBUG)
-    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDEBUG ")
-endif()
-
 if(USE_WALBERLA)
     # SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_WALBERLA_LOAD_BALANCING -DWALBERLA_BUILD_WITH_CUDA ")
     SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_WALBERLA_LOAD_BALANCING ")
@@ -38,10 +35,6 @@ if(USE_WALBERLA)
     waLBerla_import()
 endif()
 
-set(CPU_SRC "${TESTCASE}.cpp")
-set(GPU_SRC "${TESTCASE}.cu")
-set(CPU_BIN "${TESTCASE}_cpu")
-set(GPU_BIN "${TESTCASE}_gpu")
 
 set(GEN_HEADER ${CMAKE_CURRENT_BINARY_DIR}/runtime/interfaces/last_generated.hpp)
 set(GEN_HEADER_DIR ${CMAKE_CURRENT_BINARY_DIR}/runtime/interfaces)
@@ -82,6 +75,7 @@ set(OUTPUT_DIR ${CMAKE_BINARY_DIR}/output)
 if(EXISTS ${OUTPUT_DIR})
     file(REMOVE_RECURSE ${OUTPUT_DIR})
 endif()
+
 file(MAKE_DIRECTORY ${OUTPUT_DIR})
 
 execute_process(
@@ -92,89 +86,87 @@ execute_process(
     COMMAND ${PYTHON_EXECUTABLE} setup.py install --user
     WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
 
-if(USE_WALBERLA)
-    waLBerla_add_executable(
-        NAME ${CPU_BIN}
-        FILES ${EXEC_FILES} ${RUNTIME_CPU_FILES} ${CMAKE_BINARY_DIR}/${CPU_SRC}
-        DEPENDS blockforest core pe)
-else()
-    add_executable(${CPU_BIN} ${CPU_SRC} ${EXEC_FILES} ${RUNTIME_CPU_FILES})
-endif()
-
-add_library(runtime_cpu STATIC runtime/devices/dummy.cpp)
-target_link_libraries(${CPU_BIN} runtime_cpu)
-
-add_custom_command(
-    OUTPUT ${CMAKE_BINARY_DIR}/${CPU_SRC} ${GEN_HEADER}
-    COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/examples/${TESTCASE}.py cpu
-    COMMENT "Generate CPU code"
-    DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/examples/${TESTCASE}.py)
-
-target_include_directories(${CPU_BIN} PRIVATE ${GEN_HEADER_DIR})
-add_custom_target(gen_cpu DEPENDS ${CMAKE_BINARY_DIR}/${CPU_SRC} ${GEN_HEADER})
-add_dependencies(${CPU_BIN} gen_cpu)
-
+# CUDA compilation
 if(COMPILE_CUDA)
     find_package(CUDA REQUIRED)
     enable_language(CUDA)
+    set(GEN_SOURCES "${CMAKE_BINARY_DIR}/${TESTCASE}.cu")
+    set(CUDA_ARCH ${CUDA_ARCH} CACHE STRING "CUDA_ARCH environment variable must be set.")
+    set(TARGET_ARG "gpu")
+
+    # Default arch is 80
+    if(NOT CUDA_ARCH)
+        set(CUDA_ARCH 80)
+    endif()
+
+    set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
 
     if(USE_WALBERLA)
-        # set(WALBERLA_BUILD_WITH_CUDA ON CACHE BOOL "Enable CUDA in waLBerla")
+        set(WALBERLA_BUILD_WITH_CUDA ON)
+        set(WALBERLA_BUILD_WITH_GPU_SUPPORT ON)
+
         waLBerla_add_executable(
-            NAME ${GPU_BIN}
-            FILES ${EXEC_FILES}
-            DEPENDS blockforest core pe)
-            # DEPENDS blockforest core pe gpu)
+            NAME ${TARGET_BIN}
+            FILES ${EXEC_FILES} ${GEN_SOURCES}
+            DEPENDS blockforest core pe gpu)
     else()
-        add_executable(${GPU_BIN} ${GPU_SRC} ${EXEC_FILES} ${RUNTIME_GPU_FILES})
+        cuda_add_executable(${TARGET_BIN} ${GEN_SOURCES} ${EXEC_FILES})
     endif()
 
     if(ENABLE_GPU_DIRECT)
-        target_compile_options(${GPU_BIN} PRIVATE -DENABLE_CUDA_AWARE_MPI)
+        target_compile_options(${TARGET_BIN} PRIVATE -DENABLE_CUDA_AWARE_MPI)
    endif()
 
-    add_library(runtime_gpu STATIC runtime/devices/cuda.cu)
-    target_compile_features(runtime_gpu PUBLIC cxx_std_11)
-    set_target_properties(runtime_gpu PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-
-    target_link_libraries(${GPU_BIN} runtime_gpu)
-    target_compile_options(${GPU_BIN} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-arch=sm_${CUDA_ARCH}>)
-    target_include_directories(${GPU_BIN} PRIVATE ${CUDA_INCLUDE_DIRS} ${GEN_HEADER_DIR})
-    set_target_properties(${GPU_BIN} PROPERTIES CUDA_ARCHITECTURES ${CUDA_ARCH})
-    target_compile_definitions(${GPU_BIN} PRIVATE PAIRS_TARGET_CUDA)
-
-    target_compile_options(runtime_gpu PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-arch=sm_${CUDA_ARCH}>)
-    target_include_directories(runtime_gpu PRIVATE ${CUDA_INCLUDE_DIRS} ${GEN_HEADER_DIR})
-    set_target_properties(runtime_gpu PROPERTIES CUDA_ARCHITECTURES ${CUDA_ARCH})
-    target_compile_definitions(runtime_gpu PRIVATE PAIRS_TARGET_CUDA)
-
-    add_custom_command(
-        OUTPUT ${GPU_SRC} ${GEN_HEADER}
-        COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/examples/${TESTCASE}.py gpu
-        COMMENT "Generate GPU code"
-        DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/examples/${TESTCASE}.py)
-
-    add_custom_target(gen_gpu DEPENDS ${GPU_SRC} ${GEN_HEADER})
-    add_dependencies(${GPU_BIN} gen_gpu)
-endif()
+    cuda_add_library(runtime STATIC runtime/devices/cuda.cu)
+    target_compile_features(runtime PUBLIC cxx_std_11)
+    set_target_properties(runtime PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+    set_target_properties(runtime PROPERTIES CUDA_ARCHITECTURES ${CUDA_ARCH})
+    target_compile_options(runtime PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-arch=sm_${CUDA_ARCH}>)
+    target_compile_definitions(runtime PRIVATE PAIRS_TARGET_CUDA)
+    target_include_directories(runtime PRIVATE ${CUDA_INCLUDE_DIRS} ${GEN_HEADER_DIR})
 
-target_link_libraries(${CPU_BIN} ${CMAKE_EXE_LINKER_FLAGS})
-set_target_properties(${CPU_BIN} PROPERTIES CXX_STANDARD_REQUIRED ON)
-set_target_properties(${CPU_BIN} PROPERTIES CXX_STANDARD 17)
+    set_target_properties(${TARGET_BIN} PROPERTIES CUDA_ARCHITECTURES ${CUDA_ARCH})
+    target_compile_options(${TARGET_BIN} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-arch=sm_${CUDA_ARCH}>)
+    target_compile_definitions(${TARGET_BIN} PRIVATE PAIRS_TARGET_CUDA)
+    target_include_directories(${TARGET_BIN} PRIVATE ${CUDA_INCLUDE_DIRS} ${GEN_HEADER_DIR})
 
-if(COMPILE_CUDA)
-    target_link_libraries(${GPU_BIN} ${CMAKE_EXE_LINKER_FLAGS})
-    set_target_properties(${GPU_BIN} PROPERTIES CXX_STANDARD_REQUIRED ON)
-    set_target_properties(${GPU_BIN} PROPERTIES CXX_STANDARD 17)
+# CPU compilation
+else()
+    set(GEN_SOURCES "${CMAKE_BINARY_DIR}/${TESTCASE}.cpp")
+    set(TARGET_ARG "cpu")
+
+    if(USE_WALBERLA)
+        waLBerla_add_executable(
+            NAME ${TARGET_BIN}
+            FILES ${EXEC_FILES} ${GEN_SOURCES}
+            DEPENDS blockforest core pe)
+    else()
+        add_executable(${TARGET_BIN} ${GEN_SOURCES} ${EXEC_FILES})
+    endif()
+
+    add_library(runtime STATIC runtime/devices/dummy.cpp)
 endif()
 
+target_link_libraries(${TARGET_BIN} runtime)
+
+add_custom_command(
+    OUTPUT ${GEN_SOURCES} ${GEN_HEADER}
+    COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/examples/${TESTCASE}.py ${TARGET_ARG}
+    COMMENT "Generating code with P4IRS"
+    DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/examples/${TESTCASE}.py)
+
+target_include_directories(${TARGET_BIN} PRIVATE ${GEN_HEADER_DIR})
+add_custom_target(generated_code DEPENDS ${GEN_SOURCES} ${GEN_HEADER})
+add_dependencies(${TARGET_BIN} generated_code)
+
+target_link_libraries(${TARGET_BIN} ${CMAKE_EXE_LINKER_FLAGS})
+set_target_properties(${TARGET_BIN} PROPERTIES CXX_STANDARD_REQUIRED ON)
+set_target_properties(${TARGET_BIN} PROPERTIES CXX_STANDARD 17)
+
 if(USE_MPI)
     find_package(MPI REQUIRED)
     include_directories(SYSTEM ${MPI_INCLUDE_PATH})
-    target_link_libraries(${CPU_BIN} ${MPI_LIBRARIES})
-    if(COMPILE_CUDA)
-        target_link_libraries(${GPU_BIN} ${MPI_LIBRARIES})
-    endif()
+    target_link_libraries(${TARGET_BIN} ${MPI_LIBRARIES})
 endif()
 
 if(LIKWID_DIR)
diff --git a/runtime/devices/cuda.cu b/runtime/devices/cuda.cu
index db5119e494a0f897f3d2540a0c623985ddc74550..adc79cb6bc69b749e1abdcf91ae52b6190ddd3e9 100644
--- a/runtime/devices/cuda.cu
+++ b/runtime/devices/cuda.cu
@@ -72,5 +72,36 @@ __host__ void copy_static_symbol_to_host(void *d_ptr, const void *h_ptr, size_t
     //CUDA_ASSERT(cudaMemcpyFromSymbol(h_ptr, d_ptr, count));
 }
 
+#if __CUDA_ARCH__ < 600
+//#error "CUDA architecture is less than 600"
+__device__ double atomicAdd_double(double* address, double val) {
+    unsigned long long int * ull_addr = (unsigned long long int*) address;
+    unsigned long long int old = *ull_addr, assumed;
+
+    do {
+        assumed = old;
+        old = atomicCAS(ull_addr, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
+        // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
+    } while (assumed != old);
+
+    return __longlong_as_double(old);
+}
+#else
+__device__ double atomicAdd_double(double* address, double val) {
+    return atomicAdd(address, val);
+}
+#endif
+
+__device__ int atomic_add(int *addr, int val) { return atomicAdd(addr, val); }
+__device__ real_t atomic_add(real_t *addr, real_t val) { return atomicAdd_double(addr, val); }
+__device__ int atomic_add_resize_check(int *addr, int val, int *resize, int capacity) {
+    const int add_res = *addr + val;
+    if(add_res >= capacity) {
+        *resize = add_res;
+        return *addr;
+    }
+
+    return atomic_add(addr, val);
+}
 
 }
diff --git a/runtime/devices/device.hpp b/runtime/devices/device.hpp
index e2740b0c47631ac6f4f78fa9d144bbaa79fabd4f..6151ed0fbbe1e8e0abb974c3519cab925e5e4928 100644
--- a/runtime/devices/device.hpp
+++ b/runtime/devices/device.hpp
@@ -72,44 +72,14 @@ inline __host__ int host_atomic_add_resize_check(int *addr, int val, int *resize
     return host_atomic_add(addr, val);
 }
 
-#ifdef PAIRS_TARGET_CUDA
-// #if __CUDA_ARCH__ < 600
-// #error "CUDA architecture is less than 600"
-// __device__ double atomicAdd_double(double* address, double val) {
-//     unsigned long long int * ull_addr = (unsigned long long int*) address;
-//     unsigned long long int old = *ull_addr, assumed;
-
-//     do {
-//         assumed = old;
-//         old = atomicCAS(ull_addr, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
-//         // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
-//     } while (assumed != old);
-
-//     return __longlong_as_double(old);
-// }
-// #else
-__device__ double atomicAdd_double(double* address, double val) {
-    return atomicAdd(address, val);
-}
-// #endif
-
-__device__ int atomic_add(int *addr, int val) { return atomicAdd(addr, val); }
-__device__ real_t atomic_add(real_t *addr, real_t val) { return atomicAdd_double(addr, val); }
-__device__ int atomic_add_resize_check(int *addr, int val, int *resize, int capacity) {
-    const int add_res = *addr + val;
-    if(add_res >= capacity) {
-        *resize = add_res;
-        return *addr;
-    }
-
-    return atomic_add(addr, val);
-}
+#if defined(PAIRS_TARGET_CUDA) && defined(__CUDA_ARCH__)
+__device__ double atomicAdd_double(double* address, double val);
+__device__ int atomic_add(int *addr, int val);
+__device__ real_t atomic_add(real_t *addr, real_t val);
+__device__ int atomic_add_resize_check(int *addr, int val, int *resize, int capacity);
 #else
-inline int atomic_add(int *addr, int val) { return host_atomic_add(addr, val); }
-inline real_t atomic_add(real_t *addr, real_t val) { return host_atomic_add(addr, val); }
-inline int atomic_add_resize_check(int *addr, int val, int *resize, int capacity) {
-    return host_atomic_add_resize_check(addr, val, resize, capacity);
-}
+int atomic_add(int *addr, int val);
+real_t atomic_add(real_t *addr, real_t val);
+int atomic_add_resize_check(int *addr, int val, int *resize, int capacity);
 #endif
-
 }
diff --git a/runtime/devices/dummy.cpp b/runtime/devices/dummy.cpp
index a0151fc0aecd0322f2cd55feb9699ace713ae52e..9b06d0b267e45fa0ece7b492c27c86c47a1525b5 100644
--- a/runtime/devices/dummy.cpp
+++ b/runtime/devices/dummy.cpp
@@ -19,4 +19,16 @@ void copy_in_device(void *d_ptr1, const void *d_ptr2, size_t count) {
     std::memcpy(d_ptr1, d_ptr2, count);
 }
 
+int atomic_add(int *addr, int val) {
+    return host_atomic_add(addr, val);
+}
+
+real_t atomic_add(real_t *addr, real_t val) {
+    return host_atomic_add(addr, val);
+}
+
+int atomic_add_resize_check(int *addr, int val, int *resize, int capacity) {
+    return host_atomic_add_resize_check(addr, val, resize, capacity);
+}
+
 }
diff --git a/runtime/runtime_var.hpp b/runtime/runtime_var.hpp
index c33bef33bbc2725af07b2b8426ed94a7adad7255..7599b9771fab009be05df1a13ea6764e2c0d78ba 100644
--- a/runtime/runtime_var.hpp
+++ b/runtime/runtime_var.hpp
@@ -5,7 +5,7 @@ namespace pairs {
 
 template<typename T>
-class RuntimeVar{
+class RuntimeVar {
 protected:
     T *h_ptr, *d_ptr;