Commit c73b9908 authored by Rafael Ravedutti

Adjust the build to target one device type and move the CUDA atomics into cuda.cu
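
Build one executable per configuration instead of the former ${TESTCASE}_cpu and ${TESTCASE}_gpu pair: the device type is now selected at configure time through COMPILE_CUDA, the generated source (${TESTCASE}.cu or ${TESTCASE}.cpp) and the matching runtime backend (runtime/devices/cuda.cu or runtime/devices/dummy.cpp) follow the same switch, and CUDA_ARCH defaults to 80 when unset. The CUDA atomic implementations move out of the shared header into runtime/devices/cuda.cu, leaving only declarations in the header; the host fallbacks live in runtime/devices/dummy.cpp. An illustrative configure step (option names as defined in CMakeLists.txt below; adjust to your environment): cmake -DTESTCASE=md -DCOMPILE_CUDA=ON -DCUDA_ARCH=80 ..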

parent 31bf2f83
CMakeLists.txt

@@ -8,28 +8,25 @@ if(NOT TESTCASE)
     set(TESTCASE md CACHE STRING "Select the testcase from the following: md, dem" FORCE)
 endif()
 
-set(CUDA_ARCH ${CUDA_ARCH} CACHE STRING "CUDA_ARCH environment variable must be set.")
-set(CMAKE_CUDA_ARCHITECTURES 80)
-set(CUDA_ARCH 80)
-# if(NOT CUDA_ARCH)
-#     set(CUDA_ARCH 80)
-# endif()
-set(CMAKE_CXX_FLAGS_RELEASE "-g -O0")
-set(CMAKE_CXX_FLAGS_DEBUG "-O3 -DDEBUG")
+set(TARGET_BIN ${TESTCASE})
+
+# Set default build type if none is specified
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build (Debug, Release, etc.)" FORCE)
+endif()
 
 string(TOLOWER "${TESTCASE}" TESTCASE)
 message(STATUS "Selected testcase: ${TESTCASE}")
 
-option(DEBUG "DEBUG" ON)
 option(USE_WALBERLA "USE_WALBERLA" ON)
 option(USE_MPI "USE_MPI" ON)
 option(COMPILE_CUDA "COMPILE_CUDA" ON)
 option(ENABLE_GPU_DIRECT "ENABLE_GPU_DIRECT" ON)
 option(GENERATE_WHOLE_PROGRAM "GENERATE_WHOLE_PROGRAM" OFF)
 
-if(DEBUG)
-    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDEBUG ")
-endif()
-
 if(USE_WALBERLA)
     # SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_WALBERLA_LOAD_BALANCING -DWALBERLA_BUILD_WITH_CUDA ")
     SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_WALBERLA_LOAD_BALANCING ")
@@ -38,10 +35,6 @@ if(USE_WALBERLA)
     waLBerla_import()
 endif()
 
-set(CPU_SRC "${TESTCASE}.cpp")
-set(GPU_SRC "${TESTCASE}.cu")
-set(CPU_BIN "${TESTCASE}_cpu")
-set(GPU_BIN "${TESTCASE}_gpu")
 set(GEN_HEADER ${CMAKE_CURRENT_BINARY_DIR}/runtime/interfaces/last_generated.hpp)
 set(GEN_HEADER_DIR ${CMAKE_CURRENT_BINARY_DIR}/runtime/interfaces)
@@ -82,6 +75,7 @@ set(OUTPUT_DIR ${CMAKE_BINARY_DIR}/output)
 if(EXISTS ${OUTPUT_DIR})
     file(REMOVE_RECURSE ${OUTPUT_DIR})
 endif()
+
 file(MAKE_DIRECTORY ${OUTPUT_DIR})
 
 execute_process(
@@ -92,89 +86,87 @@ execute_process(
     COMMAND ${PYTHON_EXECUTABLE} setup.py install --user
     WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
 
-if(USE_WALBERLA)
-    waLBerla_add_executable(
-        NAME ${CPU_BIN}
-        FILES ${EXEC_FILES} ${RUNTIME_CPU_FILES} ${CMAKE_BINARY_DIR}/${CPU_SRC}
-        DEPENDS blockforest core pe)
-else()
-    add_executable(${CPU_BIN} ${CPU_SRC} ${EXEC_FILES} ${RUNTIME_CPU_FILES})
-endif()
-
-add_library(runtime_cpu STATIC runtime/devices/dummy.cpp)
-target_link_libraries(${CPU_BIN} runtime_cpu)
-
-add_custom_command(
-    OUTPUT ${CMAKE_BINARY_DIR}/${CPU_SRC} ${GEN_HEADER}
-    COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/examples/${TESTCASE}.py cpu
-    COMMENT "Generate CPU code"
-    DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/examples/${TESTCASE}.py)
-
-target_include_directories(${CPU_BIN} PRIVATE ${GEN_HEADER_DIR})
-add_custom_target(gen_cpu DEPENDS ${CMAKE_BINARY_DIR}/${CPU_SRC} ${GEN_HEADER})
-add_dependencies(${CPU_BIN} gen_cpu)
-
+# CUDA compilation
 if(COMPILE_CUDA)
     find_package(CUDA REQUIRED)
     enable_language(CUDA)
+    set(GEN_SOURCES "${CMAKE_BINARY_DIR}/${TESTCASE}.cu")
+    set(CUDA_ARCH ${CUDA_ARCH} CACHE STRING "CUDA_ARCH environment variable must be set.")
+    set(TARGET_ARG "gpu")
+
+    # Default arch is 80
+    if(NOT CUDA_ARCH)
+        set(CUDA_ARCH 80)
+    endif()
+
+    set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
 
     if(USE_WALBERLA)
-        # set(WALBERLA_BUILD_WITH_CUDA ON CACHE BOOL "Enable CUDA in waLBerla")
+        set(WALBERLA_BUILD_WITH_CUDA ON)
+        set(WALBERLA_BUILD_WITH_GPU_SUPPORT ON)
         waLBerla_add_executable(
-            NAME ${GPU_BIN}
-            FILES ${EXEC_FILES}
-            DEPENDS blockforest core pe)
-            # DEPENDS blockforest core pe gpu)
+            NAME ${TARGET_BIN}
+            FILES ${EXEC_FILES} ${GEN_SOURCES}
+            DEPENDS blockforest core pe gpu)
     else()
-        add_executable(${GPU_BIN} ${GPU_SRC} ${EXEC_FILES} ${RUNTIME_GPU_FILES})
+        cuda_add_executable(${TARGET_BIN} ${GEN_SOURCES} ${EXEC_FILES})
     endif()
 
     if(ENABLE_GPU_DIRECT)
-        target_compile_options(${GPU_BIN} PRIVATE -DENABLE_CUDA_AWARE_MPI)
+        target_compile_options(${TARGET_BIN} PRIVATE -DENABLE_CUDA_AWARE_MPI)
     endif()
 
-    add_library(runtime_gpu STATIC runtime/devices/cuda.cu)
-    target_compile_features(runtime_gpu PUBLIC cxx_std_11)
-    set_target_properties(runtime_gpu PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-
-    target_link_libraries(${GPU_BIN} runtime_gpu)
-    target_compile_options(${GPU_BIN} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-arch=sm_${CUDA_ARCH}>)
-    target_include_directories(${GPU_BIN} PRIVATE ${CUDA_INCLUDE_DIRS} ${GEN_HEADER_DIR})
-    set_target_properties(${GPU_BIN} PROPERTIES CUDA_ARCHITECTURES ${CUDA_ARCH})
-    target_compile_definitions(${GPU_BIN} PRIVATE PAIRS_TARGET_CUDA)
-
-    target_compile_options(runtime_gpu PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-arch=sm_${CUDA_ARCH}>)
-    target_include_directories(runtime_gpu PRIVATE ${CUDA_INCLUDE_DIRS} ${GEN_HEADER_DIR})
-    set_target_properties(runtime_gpu PROPERTIES CUDA_ARCHITECTURES ${CUDA_ARCH})
-    target_compile_definitions(runtime_gpu PRIVATE PAIRS_TARGET_CUDA)
-
-    add_custom_command(
-        OUTPUT ${GPU_SRC} ${GEN_HEADER}
-        COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/examples/${TESTCASE}.py gpu
-        COMMENT "Generate GPU code"
-        DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/examples/${TESTCASE}.py)
-
-    add_custom_target(gen_gpu DEPENDS ${GPU_SRC} ${GEN_HEADER})
-    add_dependencies(${GPU_BIN} gen_gpu)
-endif()
-
-target_link_libraries(${CPU_BIN} ${CMAKE_EXE_LINKER_FLAGS})
-set_target_properties(${CPU_BIN} PROPERTIES CXX_STANDARD_REQUIRED ON)
-set_target_properties(${CPU_BIN} PROPERTIES CXX_STANDARD 17)
-
-if(COMPILE_CUDA)
-    target_link_libraries(${GPU_BIN} ${CMAKE_EXE_LINKER_FLAGS})
-    set_target_properties(${GPU_BIN} PROPERTIES CXX_STANDARD_REQUIRED ON)
-    set_target_properties(${GPU_BIN} PROPERTIES CXX_STANDARD 17)
+    cuda_add_library(runtime STATIC runtime/devices/cuda.cu)
+    target_compile_features(runtime PUBLIC cxx_std_11)
+    set_target_properties(runtime PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+    set_target_properties(runtime PROPERTIES CUDA_ARCHITECTURES ${CUDA_ARCH})
+    target_compile_options(runtime PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-arch=sm_${CUDA_ARCH}>)
+    target_compile_definitions(runtime PRIVATE PAIRS_TARGET_CUDA)
+    target_include_directories(runtime PRIVATE ${CUDA_INCLUDE_DIRS} ${GEN_HEADER_DIR})
+
+    set_target_properties(${TARGET_BIN} PROPERTIES CUDA_ARCHITECTURES ${CUDA_ARCH})
+    target_compile_options(${TARGET_BIN} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-arch=sm_${CUDA_ARCH}>)
+    target_compile_definitions(${TARGET_BIN} PRIVATE PAIRS_TARGET_CUDA)
+    target_include_directories(${TARGET_BIN} PRIVATE ${CUDA_INCLUDE_DIRS} ${GEN_HEADER_DIR})
+
+# CPU compilation
+else()
+    set(GEN_SOURCES "${CMAKE_BINARY_DIR}/${TESTCASE}.cpp")
+    set(TARGET_ARG "cpu")
+
+    if(USE_WALBERLA)
+        waLBerla_add_executable(
+            NAME ${TARGET_BIN}
+            FILES ${EXEC_FILES} ${GEN_SOURCES}
+            DEPENDS blockforest core pe)
+    else()
+        add_executable(${TARGET_BIN} ${GEN_SOURCES} ${EXEC_FILES})
+    endif()
+
+    add_library(runtime STATIC runtime/devices/dummy.cpp)
 endif()
 
+target_link_libraries(${TARGET_BIN} runtime)
+
+add_custom_command(
+    OUTPUT ${GEN_SOURCES} ${GEN_HEADER}
+    COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/examples/${TESTCASE}.py ${TARGET_ARG}
+    COMMENT "Generating code with P4IRS"
+    DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/examples/${TESTCASE}.py)
+
+target_include_directories(${TARGET_BIN} PRIVATE ${GEN_HEADER_DIR})
+add_custom_target(generated_code DEPENDS ${GEN_SOURCES} ${GEN_HEADER})
+add_dependencies(${TARGET_BIN} generated_code)
+
+target_link_libraries(${TARGET_BIN} ${CMAKE_EXE_LINKER_FLAGS})
+set_target_properties(${TARGET_BIN} PROPERTIES CXX_STANDARD_REQUIRED ON)
+set_target_properties(${TARGET_BIN} PROPERTIES CXX_STANDARD 17)
 
 if(USE_MPI)
     find_package(MPI REQUIRED)
     include_directories(SYSTEM ${MPI_INCLUDE_PATH})
-    target_link_libraries(${CPU_BIN} ${MPI_LIBRARIES})
-
-    if(COMPILE_CUDA)
-        target_link_libraries(${GPU_BIN} ${MPI_LIBRARIES})
-    endif()
+    target_link_libraries(${TARGET_BIN} ${MPI_LIBRARIES})
 endif()
 
 if(LIKWID_DIR)
...
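
The net effect of the CMake changes: one configure decision drives everything. COMPILE_CUDA=ON selects the generated ${TESTCASE}.cu, the gpu argument for the code generator, and the CUDA backend; COMPILE_CUDA=OFF selects ${TESTCASE}.cpp, cpu, and the dummy backend. In both cases the executable is named after the testcase and linked against a single library called runtime. Note that cuda_add_executable and cuda_add_library come from the legacy FindCUDA module pulled in by find_package(CUDA REQUIRED).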
runtime/devices/cuda.cu

@@ -72,5 +72,36 @@ __host__ void copy_static_symbol_to_host(void *d_ptr, const void *h_ptr, size_t
     //CUDA_ASSERT(cudaMemcpyFromSymbol(h_ptr, d_ptr, count));
 }
 
+#if __CUDA_ARCH__ < 600
+//#error "CUDA architecture is less than 600"
+__device__ double atomicAdd_double(double* address, double val) {
+    unsigned long long int *ull_addr = (unsigned long long int *) address;
+    unsigned long long int old = *ull_addr, assumed;
+
+    do {
+        assumed = old;
+        old = atomicCAS(ull_addr, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
+        // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
+    } while(assumed != old);
+
+    return __longlong_as_double(old);
+}
+#else
+__device__ double atomicAdd_double(double* address, double val) {
+    return atomicAdd(address, val);
+}
+#endif
+
+__device__ int atomic_add(int *addr, int val) { return atomicAdd(addr, val); }
+__device__ real_t atomic_add(real_t *addr, real_t val) { return atomicAdd_double(addr, val); }
+
+__device__ int atomic_add_resize_check(int *addr, int val, int *resize, int capacity) {
+    const int add_res = *addr + val;
+    if(add_res >= capacity) {
+        *resize = add_res;
+        return *addr;
+    }
+
+    return atomic_add(addr, val);
+}
+
 }
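
For architectures below sm_60, which lack a native double-precision atomicAdd, the CAS loop above emulates it; on sm_60 and newer the hardware instruction is used. The resize-check helper implements a bounded append: it hands back the old counter value as a write slot, or, when the capacity would be exceeded, records the required size in *resize without incrementing the counter. A minimal usage sketch follows, assuming a host-side retry loop; the kernel and buffer names here are hypothetical and not part of this commit:

__global__ void filter_positive(const double *in, int n, double *out,
                                int *count, int *resize, int capacity) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i < n && in[i] > 0.0) {
        // Reserve one write slot. If capacity would be exceeded, the helper
        // stores the required size in *resize and returns without incrementing.
        const int slot = atomic_add_resize_check(count, 1, resize, capacity);

        // Guard the write; when a resize was requested, the host is expected
        // to enlarge the buffer and re-run the kernel.
        if(slot < capacity) {
            out[slot] = in[i];
        }
    }
}

After the launch, a non-zero value copied back from *resize signals that out must be reallocated to at least that size before the kernel is run again.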
@@ -72,44 +72,14 @@ inline __host__ int host_atomic_add_resize_check(int *addr, int val, int *resize
     return host_atomic_add(addr, val);
 }
 
-#ifdef PAIRS_TARGET_CUDA
-// #if __CUDA_ARCH__ < 600
-// #error "CUDA architecture is less than 600"
-// __device__ double atomicAdd_double(double* address, double val) {
-//     unsigned long long int * ull_addr = (unsigned long long int*) address;
-//     unsigned long long int old = *ull_addr, assumed;
-//     do {
-//         assumed = old;
-//         old = atomicCAS(ull_addr, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
-//         // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
-//     } while (assumed != old);
-//     return __longlong_as_double(old);
-// }
-// #else
-__device__ double atomicAdd_double(double* address, double val) {
-    return atomicAdd(address, val);
-}
-// #endif
-
-__device__ int atomic_add(int *addr, int val) { return atomicAdd(addr, val); }
-__device__ real_t atomic_add(real_t *addr, real_t val) { return atomicAdd_double(addr, val); }
-
-__device__ int atomic_add_resize_check(int *addr, int val, int *resize, int capacity) {
-    const int add_res = *addr + val;
-    if(add_res >= capacity) {
-        *resize = add_res;
-        return *addr;
-    }
-    return atomic_add(addr, val);
-}
+#if defined(PAIRS_TARGET_CUDA) && defined(__CUDA_ARCH__)
+__device__ double atomicAdd_double(double* address, double val);
+__device__ int atomic_add(int *addr, int val);
+__device__ real_t atomic_add(real_t *addr, real_t val);
+__device__ int atomic_add_resize_check(int *addr, int val, int *resize, int capacity);
 #else
-inline int atomic_add(int *addr, int val) { return host_atomic_add(addr, val); }
-inline real_t atomic_add(real_t *addr, real_t val) { return host_atomic_add(addr, val); }
-inline int atomic_add_resize_check(int *addr, int val, int *resize, int capacity) {
-    return host_atomic_add_resize_check(addr, val, resize, capacity);
-}
+int atomic_add(int *addr, int val);
+real_t atomic_add(real_t *addr, real_t val);
+int atomic_add_resize_check(int *addr, int val, int *resize, int capacity);
 #endif
 
 }
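
With the bodies gone from this header, each implementation now exists in exactly one translation unit: the __device__ overloads are compiled once into the runtime library from cuda.cu (resolved at device link time via CUDA_SEPARABLE_COMPILATION), and the host fallbacks once into dummy.cpp, instead of being redefined in every file that includes the header. The added defined(__CUDA_ARCH__) check also keeps the __device__ declarations out of host-side compilation passes, which previously saw them whenever PAIRS_TARGET_CUDA was set.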
runtime/devices/dummy.cpp

@@ -19,4 +19,16 @@ void copy_in_device(void *d_ptr1, const void *d_ptr2, size_t count) {
     std::memcpy(d_ptr1, d_ptr2, count);
 }
 
+int atomic_add(int *addr, int val) {
+    return host_atomic_add(addr, val);
+}
+
+real_t atomic_add(real_t *addr, real_t val) {
+    return host_atomic_add(addr, val);
+}
+
+int atomic_add_resize_check(int *addr, int val, int *resize, int capacity) {
+    return host_atomic_add_resize_check(addr, val, resize, capacity);
+}
+
 }
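
These host definitions pair with the #else declarations in the header above: a CPU-only build links the runtime library built from dummy.cpp and gets the same atomic_add API that a CUDA build gets from cuda.cu.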
@@ -5,7 +5,7 @@
 namespace pairs {
 
 template<typename T>
-class RuntimeVar{
+class RuntimeVar {
 protected:
     T *h_ptr, *d_ptr;
...