Skip to content
Snippets Groups Projects
Commit d37f96fe authored by Rafael Ravedutti's avatar Rafael Ravedutti
Browse files

Fix CUDA-aware MPI

parent 0b9a797f
No related branches found
No related tags found
No related merge requests found
...@@ -13,7 +13,7 @@ LIKWID_LIB ?= -L/usr/local/lib ...@@ -13,7 +13,7 @@ LIKWID_LIB ?= -L/usr/local/lib
LIKWID_FLAGS = -llikwid ${LIKWID_INC} ${LIKWID_DEFINES} ${LIKWID_LIB} LIKWID_FLAGS = -llikwid ${LIKWID_INC} ${LIKWID_DEFINES} ${LIKWID_LIB}
#CUDA_FLAGS= #CUDA_FLAGS=
CUDA_FLAGS=-DENABLE_CUDA_AWARE_MPI CUDA_FLAGS=-DENABLE_CUDA_AWARE_MPI
CFLAGS=-Ofast -march=core-avx2 ${CUDA_FLAGS} ${LIKWID_FLAGS} CFLAGS=-Ofast -march=core-avx2 ${LIKWID_FLAGS}
#CFLAGS=-Ofast -xHost -qopt-zmm-usage=high ${LIKWID_FLAGS} #CFLAGS=-Ofast -xHost -qopt-zmm-usage=high ${LIKWID_FLAGS}
#CFLAGS=-Ofast -xCORE-AVX512 -qopt-zmm-usage=high ${LIKWID_FLAGS} #CFLAGS=-Ofast -xCORE-AVX512 -qopt-zmm-usage=high ${LIKWID_FLAGS}
CUDA_BIN_PATH:="$(shell dirname ${NVCC_PATH})" CUDA_BIN_PATH:="$(shell dirname ${NVCC_PATH})"
...@@ -44,10 +44,10 @@ $(GPU_SRC): ...@@ -44,10 +44,10 @@ $(GPU_SRC):
$(PYCMD) examples/$(TESTCASE).py gpu $(PYCMD) examples/$(TESTCASE).py gpu
$(OBJ_PATH)/pairs.o: runtime/pairs.cpp $(OBJ_PATH)/pairs.o: runtime/pairs.cpp
$(CC) $(CFLAGS) -c -o $@ $< $(DEBUG_FLAGS) $(CC) -c -o $@ $< $(DEBUG_FLAGS) $(CUDA_FLAGS) $(CFLAGS)
$(OBJ_PATH)/regular_6d_stencil.o: runtime/domain/regular_6d_stencil.cpp $(OBJ_PATH)/regular_6d_stencil.o: runtime/domain/regular_6d_stencil.cpp
$(CC) $(CFLAGS) -c -o $@ $< $(DEBUG_FLAGS) $(CC) -c -o $@ $< $(DEBUG_FLAGS) $(CUDA_FLAGS) $(CFLAGS)
$(OBJ_PATH)/dummy.o: runtime/devices/dummy.cpp $(OBJ_PATH)/dummy.o: runtime/devices/dummy.cpp
$(CC) -c -o $@ $< $(DEBUG_FLAGS) $(CC) -c -o $@ $< $(DEBUG_FLAGS)
...@@ -57,9 +57,9 @@ $(CPU_BIN): $(CPU_SRC) $(OBJ_PATH)/pairs.o $(OBJ_PATH)/regular_6d_stencil.o $(OB ...@@ -57,9 +57,9 @@ $(CPU_BIN): $(CPU_SRC) $(OBJ_PATH)/pairs.o $(OBJ_PATH)/regular_6d_stencil.o $(OB
$(CC) $(CFLAGS) -o $(CPU_BIN) $(CPU_SRC) $(OBJ_PATH)/pairs.o $(OBJ_PATH)/regular_6d_stencil.o $(OBJ_PATH)/dummy.o $(DEBUG_FLAGS) $(CC) $(CFLAGS) -o $(CPU_BIN) $(CPU_SRC) $(OBJ_PATH)/pairs.o $(OBJ_PATH)/regular_6d_stencil.o $(OBJ_PATH)/dummy.o $(DEBUG_FLAGS)
$(GPU_BIN): $(GPU_SRC) $(OBJ_PATH)/pairs.o $(OBJ_PATH)/regular_6d_stencil.o $(GPU_BIN): $(GPU_SRC) $(OBJ_PATH)/pairs.o $(OBJ_PATH)/regular_6d_stencil.o
$(NVCC) $(CUDA_FLAGS) -c -o $(OBJ_PATH)/cuda_runtime.o runtime/devices/cuda.cu ${DEBUG_FLAGS} $(NVCC) -c -o $(OBJ_PATH)/cuda_runtime.o runtime/devices/cuda.cu $(DEBUG_FLAGS) $(CUDA_FLAGS)
$(NVCC) $(CUDA_FLAGS) -c -o $(OBJ_PATH)/$(GPU_BIN).o $(GPU_SRC) -DDEBUG $(NVCC) -c -o $(OBJ_PATH)/$(GPU_BIN).o $(GPU_SRC) $(DEBUG_FLAGS) $(CUDA_FLAGS)
$(CC) -o $(GPU_BIN) $(CFLAGS) $(OBJ_PATH)/$(GPU_BIN).o $(OBJ_PATH)/cuda_runtime.o $(OBJ_PATH)/pairs.o $(OBJ_PATH)/regular_6d_stencil.o -lcudart -L$(CUDA_PATH)/lib64 $(CC) -o $(GPU_BIN) $(OBJ_PATH)/$(GPU_BIN).o $(OBJ_PATH)/cuda_runtime.o $(OBJ_PATH)/pairs.o $(OBJ_PATH)/regular_6d_stencil.o -lcudart -L$(CUDA_PATH)/lib64 $(CUDA_FLAGS) $(CFLAGS)
clean: clean:
@echo "Cleaning..." @echo "Cleaning..."
......
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <iostream> #include <iostream>
#include <cstring>
#define CUDA_ASSERT(a) { pairs::cuda_assert((a), __FILE__, __LINE__); } #define CUDA_ASSERT(a) { pairs::cuda_assert((a), __FILE__, __LINE__); }
...@@ -42,6 +43,14 @@ __host__ void copy_to_host(const void *d_ptr, void *h_ptr, size_t count) { ...@@ -42,6 +43,14 @@ __host__ void copy_to_host(const void *d_ptr, void *h_ptr, size_t count) {
CUDA_ASSERT(cudaMemcpy(h_ptr, d_ptr, count, cudaMemcpyDeviceToHost)); CUDA_ASSERT(cudaMemcpy(h_ptr, d_ptr, count, cudaMemcpyDeviceToHost));
} }
__host__ void copy_in_device(void *d_ptr1, const void *d_ptr2, size_t count) {
#ifdef ENABLE_CUDA_AWARE_MPI
CUDA_ASSERT(cudaMemcpy(d_ptr1, d_ptr2, count, cudaMemcpyDeviceToDevice));
#else
std::memcpy(d_ptr1, d_ptr2, count);
#endif
}
__host__ void copy_slice_to_device(const void *h_ptr, void *d_ptr, size_t offset, size_t count) { __host__ void copy_slice_to_device(const void *h_ptr, void *d_ptr, size_t offset, size_t count) {
void *d_ptr_start = ((char *) d_ptr) + offset; void *d_ptr_start = ((char *) d_ptr) + offset;
void *h_ptr_start = ((char *) h_ptr) + offset; void *h_ptr_start = ((char *) h_ptr) + offset;
......
...@@ -19,6 +19,7 @@ __host__ void device_free(void *ptr); ...@@ -19,6 +19,7 @@ __host__ void device_free(void *ptr);
__host__ void device_synchronize(); __host__ void device_synchronize();
__host__ void copy_to_device(const void *h_ptr, void *d_ptr, size_t count); __host__ void copy_to_device(const void *h_ptr, void *d_ptr, size_t count);
__host__ void copy_to_host(const void *d_ptr, void *h_ptr, size_t count); __host__ void copy_to_host(const void *d_ptr, void *h_ptr, size_t count);
__host__ void copy_in_device(void *d_ptr1, const void *d_ptr2, size_t count);
__host__ void copy_slice_to_device(const void *h_ptr, void *d_ptr, size_t offset, size_t count); __host__ void copy_slice_to_device(const void *h_ptr, void *d_ptr, size_t offset, size_t count);
__host__ void copy_slice_to_host(const void *d_ptr, void *h_ptr, size_t offset, size_t count); __host__ void copy_slice_to_host(const void *d_ptr, void *h_ptr, size_t offset, size_t count);
__host__ void copy_static_symbol_to_device(void *h_ptr, const void *d_ptr, size_t count); __host__ void copy_static_symbol_to_device(void *h_ptr, const void *d_ptr, size_t count);
......
#include <cstring>
//---
#include "device.hpp" #include "device.hpp"
namespace pairs { namespace pairs {
...@@ -13,4 +15,8 @@ void copy_slice_to_host(void const *d_ptr, void *h_ptr, size_t offset, size_t co ...@@ -13,4 +15,8 @@ void copy_slice_to_host(void const *d_ptr, void *h_ptr, size_t offset, size_t co
void copy_static_symbol_to_device(void *h_ptr, const void *d_ptr, size_t count) {} void copy_static_symbol_to_device(void *h_ptr, const void *d_ptr, size_t count) {}
void copy_static_symbol_to_host(void *d_ptr, const void *h_ptr, size_t count) {} void copy_static_symbol_to_host(void *d_ptr, const void *h_ptr, size_t count) {}
void copy_in_device(void *d_ptr1, const void *d_ptr2, size_t count) {
std::memcpy(d_ptr1, d_ptr2, count);
}
} }
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
#include <vector> #include <vector>
//--- //---
#include "../pairs_common.hpp" #include "../pairs_common.hpp"
#include "../devices/device.hpp"
#include "regular_6d_stencil.hpp" #include "regular_6d_stencil.hpp"
namespace pairs { namespace pairs {
...@@ -155,17 +156,7 @@ void Regular6DStencil::communicateData( ...@@ -155,17 +156,7 @@ void Regular6DStencil::communicateData(
MPI_COMM_WORLD, &send_requests[0]); MPI_COMM_WORLD, &send_requests[0]);
*/ */
} else { } else {
#ifdef ENABLE_CUDA_AWARE_MPI pairs::copy_in_device(recv_prev, send_prev, nsend[dim * 2 + 0] * elem_size * sizeof(real_t));
cudaMemcpy(
recv_prev,
send_prev,
nsend[dim * 2 + 0] * elem_size * sizeof(real_t),
cudaMemcpyDeviceToDevice);
#else
for(int i = 0; i < nsend[dim * 2 + 0] * elem_size; i++) {
recv_prev[i] = send_prev[i];
}
#endif
} }
if(next[dim] != rank) { if(next[dim] != rank) {
...@@ -184,17 +175,7 @@ void Regular6DStencil::communicateData( ...@@ -184,17 +175,7 @@ void Regular6DStencil::communicateData(
MPI_COMM_WORLD, &send_requests[1]); MPI_COMM_WORLD, &send_requests[1]);
*/ */
} else { } else {
#ifdef ENABLE_CUDA_AWARE_MPI pairs::copy_in_device(recv_next, send_next, nsend[dim * 2 + 1] * elem_size * sizeof(real_t));
cudaMemcpy(
recv_next,
send_next,
nsend[dim * 2 + 1] * elem_size * sizeof(real_t),
cudaMemcpyDeviceToDevice);
#else
for(int i = 0; i < nsend[dim * 2 + 1] * elem_size; i++) {
recv_next[i] = send_next[i];
}
#endif
} }
//MPI_Waitall(2, recv_requests, MPI_STATUSES_IGNORE); //MPI_Waitall(2, recv_requests, MPI_STATUSES_IGNORE);
...@@ -231,17 +212,7 @@ void Regular6DStencil::communicateAllData( ...@@ -231,17 +212,7 @@ void Regular6DStencil::communicateAllData(
MPI_COMM_WORLD, &recv_requests[d * 2 + 0]); MPI_COMM_WORLD, &recv_requests[d * 2 + 0]);
*/ */
} else { } else {
#ifdef ENABLE_CUDA_AWARE_MPI pairs::copy_in_device(recv_prev, send_prev, nsend[d * 2 + 0] * elem_size * sizeof(real_t));
cudaMemcpy(
recv_prev,
send_prev,
nsend[d * 2 + 0] * elem_size * sizeof(real_t),
cudaMemcpyDeviceToDevice);
#else
for (int i = 0; i < nsend[d * 2 + 0] * elem_size; i++) {
recv_prev[i] = send_prev[i];
}
#endif
} }
if (next[d] != rank) { if (next[d] != rank) {
...@@ -260,17 +231,7 @@ void Regular6DStencil::communicateAllData( ...@@ -260,17 +231,7 @@ void Regular6DStencil::communicateAllData(
MPI_COMM_WORLD, &recv_requests[d * 2 + 1]); MPI_COMM_WORLD, &recv_requests[d * 2 + 1]);
*/ */
} else { } else {
#ifdef ENABLE_CUDA_AWARE_MPI pairs::copy_in_device(recv_next, send_next, nsend[d * 2 + 1] * elem_size * sizeof(real_t));
cudaMemcpy(
recv_next,
send_next,
nsend[d * 2 + 1] * elem_size * sizeof(real_t),
cudaMemcpyDeviceToDevice);
#else
for (int i = 0; i < nsend[d * 2 + 1] * elem_size; i++) {
recv_next[i] = send_next[i];
}
#endif
} }
} }
......
...@@ -364,8 +364,8 @@ void PairsSimulation::communicateData( ...@@ -364,8 +364,8 @@ void PairsSimulation::communicateData(
copyArrayToHost(nrecv_id, ReadOnly); copyArrayToHost(nrecv_id, ReadOnly);
#ifdef ENABLE_CUDA_AWARE_MPI #ifdef ENABLE_CUDA_AWARE_MPI
send_buf_ptr = send_buf_array.getDevicePointer(); send_buf_ptr = (real_t *) send_buf_array.getDevicePointer();
recv_buf_ptr = recv_buf_array.getDevicePointer(); recv_buf_ptr = (real_t *) recv_buf_array.getDevicePointer();
#else #else
int nsend_all = 0; int nsend_all = 0;
int nrecv_all = 0; int nrecv_all = 0;
...@@ -429,8 +429,8 @@ void PairsSimulation::communicateAllData( ...@@ -429,8 +429,8 @@ void PairsSimulation::communicateAllData(
copyArrayToHost(nrecv_id, ReadOnly); copyArrayToHost(nrecv_id, ReadOnly);
#ifdef ENABLE_CUDA_AWARE_MPI #ifdef ENABLE_CUDA_AWARE_MPI
send_buf_ptr = send_buf_array.getDevicePointer(); send_buf_ptr = (real_t *) send_buf_array.getDevicePointer();
recv_buf_ptr = recv_buf_array.getDevicePointer(); recv_buf_ptr = (real_t *) recv_buf_array.getDevicePointer();
#else #else
int nsend_all = 0; int nsend_all = 0;
int nrecv_all = 0; int nrecv_all = 0;
...@@ -489,8 +489,8 @@ void PairsSimulation::communicateContactHistoryData( ...@@ -489,8 +489,8 @@ void PairsSimulation::communicateContactHistoryData(
} }
#ifdef ENABLE_CUDA_AWARE_MPI #ifdef ENABLE_CUDA_AWARE_MPI
send_buf_ptr = send_buf_array.getDevicePointer(); send_buf_ptr = (real_t *) send_buf_array.getDevicePointer();
recv_buf_ptr = recv_buf_array.getDevicePointer(); recv_buf_ptr = (real_t *) recv_buf_array.getDevicePointer();
#else #else
copyArrayToHost(send_buf_id, Ignore, nsend_all * sizeof(real_t)); copyArrayToHost(send_buf_id, Ignore, nsend_all * sizeof(real_t));
array_flags->setHostFlag(recv_buf_id); array_flags->setHostFlag(recv_buf_id);
......
...@@ -9,12 +9,6 @@ typedef double real_t; ...@@ -9,12 +9,6 @@ typedef double real_t;
//typedef float real_t; //typedef float real_t;
//#endif //#endif
#ifndef PAIRS_TARGET_CUDA
# ifdef ENABLE_CUDA_AWARE_MPI
# undef ENABLE_CUDA_AWARE_MPI
# endif
#endif
typedef int array_t; typedef int array_t;
typedef int property_t; typedef int property_t;
typedef int layout_t; typedef int layout_t;
...@@ -60,10 +54,16 @@ enum DomainPartitioners { ...@@ -60,10 +54,16 @@ enum DomainPartitioners {
#ifdef DEBUG #ifdef DEBUG
# include <assert.h> # include <assert.h>
# define PAIRS_DEBUG(...) { \ # define PAIRS_DEBUG(...) { \
int __init_flag; \
int __rank; \ int __rank; \
MPI_Comm_rank(MPI_COMM_WORLD, &__rank); \ MPI_Initialized(&__init_flag); \
if(__rank == 0) { \ if(__init_flag == 0) { \
fprintf(stderr, __VA_ARGS__); \ fprintf(stderr, __VA_ARGS__); \
} else { \
MPI_Comm_rank(MPI_COMM_WORLD, &__rank); \
if(__rank == 0) { \
fprintf(stderr, __VA_ARGS__); \
} \
} \ } \
} }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment