From d37f96fe00a43cc90c018bbd8816d7a99f0d5583 Mon Sep 17 00:00:00 2001 From: Rafael Ravedutti <rafaelravedutti@gmail.com> Date: Wed, 13 Dec 2023 21:13:19 +0100 Subject: [PATCH] Fix CUDA-aware MPI Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com> --- Makefile | 12 +++---- runtime/devices/cuda.cu | 9 +++++ runtime/devices/device.hpp | 1 + runtime/devices/dummy.cpp | 6 ++++ runtime/domain/regular_6d_stencil.cpp | 49 +++------------------------ runtime/pairs.cpp | 12 +++---- runtime/pairs_common.hpp | 16 ++++----- 7 files changed, 41 insertions(+), 64 deletions(-) diff --git a/Makefile b/Makefile index f9ea605..9085543 100644 --- a/Makefile +++ b/Makefile @@ -13,7 +13,7 @@ LIKWID_LIB ?= -L/usr/local/lib LIKWID_FLAGS = -llikwid ${LIKWID_INC} ${LIKWID_DEFINES} ${LIKWID_LIB} #CUDA_FLAGS= CUDA_FLAGS=-DENABLE_CUDA_AWARE_MPI -CFLAGS=-Ofast -march=core-avx2 ${CUDA_FLAGS} ${LIKWID_FLAGS} +CFLAGS=-Ofast -march=core-avx2 ${LIKWID_FLAGS} #CFLAGS=-Ofast -xHost -qopt-zmm-usage=high ${LIKWID_FLAGS} #CFLAGS=-Ofast -xCORE-AVX512 -qopt-zmm-usage=high ${LIKWID_FLAGS} CUDA_BIN_PATH:="$(shell dirname ${NVCC_PATH})" @@ -44,10 +44,10 @@ $(GPU_SRC): $(PYCMD) examples/$(TESTCASE).py gpu $(OBJ_PATH)/pairs.o: runtime/pairs.cpp - $(CC) $(CFLAGS) -c -o $@ $< $(DEBUG_FLAGS) + $(CC) -c -o $@ $< $(DEBUG_FLAGS) $(CUDA_FLAGS) $(CFLAGS) $(OBJ_PATH)/regular_6d_stencil.o: runtime/domain/regular_6d_stencil.cpp - $(CC) $(CFLAGS) -c -o $@ $< $(DEBUG_FLAGS) + $(CC) -c -o $@ $< $(DEBUG_FLAGS) $(CUDA_FLAGS) $(CFLAGS) $(OBJ_PATH)/dummy.o: runtime/devices/dummy.cpp $(CC) -c -o $@ $< $(DEBUG_FLAGS) @@ -57,9 +57,9 @@ $(CPU_BIN): $(CPU_SRC) $(OBJ_PATH)/pairs.o $(OBJ_PATH)/regular_6d_stencil.o $(OB $(CC) $(CFLAGS) -o $(CPU_BIN) $(CPU_SRC) $(OBJ_PATH)/pairs.o $(OBJ_PATH)/regular_6d_stencil.o $(OBJ_PATH)/dummy.o $(DEBUG_FLAGS) $(GPU_BIN): $(GPU_SRC) $(OBJ_PATH)/pairs.o $(OBJ_PATH)/regular_6d_stencil.o - $(NVCC) $(CUDA_FLAGS) -c -o $(OBJ_PATH)/cuda_runtime.o runtime/devices/cuda.cu ${DEBUG_FLAGS} - $(NVCC) $(CUDA_FLAGS) -c -o $(OBJ_PATH)/$(GPU_BIN).o $(GPU_SRC) -DDEBUG - $(CC) -o $(GPU_BIN) $(CFLAGS) $(OBJ_PATH)/$(GPU_BIN).o $(OBJ_PATH)/cuda_runtime.o $(OBJ_PATH)/pairs.o $(OBJ_PATH)/regular_6d_stencil.o -lcudart -L$(CUDA_PATH)/lib64 + $(NVCC) -c -o $(OBJ_PATH)/cuda_runtime.o runtime/devices/cuda.cu $(DEBUG_FLAGS) $(CUDA_FLAGS) + $(NVCC) -c -o $(OBJ_PATH)/$(GPU_BIN).o $(GPU_SRC) $(DEBUG_FLAGS) $(CUDA_FLAGS) + $(CC) -o $(GPU_BIN) $(OBJ_PATH)/$(GPU_BIN).o $(OBJ_PATH)/cuda_runtime.o $(OBJ_PATH)/pairs.o $(OBJ_PATH)/regular_6d_stencil.o -lcudart -L$(CUDA_PATH)/lib64 $(CUDA_FLAGS) $(CFLAGS) clean: @echo "Cleaning..." diff --git a/runtime/devices/cuda.cu b/runtime/devices/cuda.cu index 3e4f067..8bb7c59 100644 --- a/runtime/devices/cuda.cu +++ b/runtime/devices/cuda.cu @@ -1,5 +1,6 @@ #include <cuda_runtime.h> #include <iostream> +#include <cstring> #define CUDA_ASSERT(a) { pairs::cuda_assert((a), __FILE__, __LINE__); } @@ -42,6 +43,14 @@ __host__ void copy_to_host(const void *d_ptr, void *h_ptr, size_t count) { CUDA_ASSERT(cudaMemcpy(h_ptr, d_ptr, count, cudaMemcpyDeviceToHost)); } +__host__ void copy_in_device(void *d_ptr1, const void *d_ptr2, size_t count) { + #ifdef ENABLE_CUDA_AWARE_MPI + CUDA_ASSERT(cudaMemcpy(d_ptr1, d_ptr2, count, cudaMemcpyDeviceToDevice)); + #else + std::memcpy(d_ptr1, d_ptr2, count); + #endif +} + __host__ void copy_slice_to_device(const void *h_ptr, void *d_ptr, size_t offset, size_t count) { void *d_ptr_start = ((char *) d_ptr) + offset; void *h_ptr_start = ((char *) h_ptr) + offset; diff --git a/runtime/devices/device.hpp b/runtime/devices/device.hpp index 384891a..ade0405 100644 --- a/runtime/devices/device.hpp +++ b/runtime/devices/device.hpp @@ -19,6 +19,7 @@ __host__ void device_free(void *ptr); __host__ void device_synchronize(); __host__ void copy_to_device(const void *h_ptr, void *d_ptr, size_t count); __host__ void copy_to_host(const void *d_ptr, void *h_ptr, size_t count); +__host__ void copy_in_device(void *d_ptr1, const void *d_ptr2, size_t count); __host__ void copy_slice_to_device(const void *h_ptr, void *d_ptr, size_t offset, size_t count); __host__ void copy_slice_to_host(const void *d_ptr, void *h_ptr, size_t offset, size_t count); __host__ void copy_static_symbol_to_device(void *h_ptr, const void *d_ptr, size_t count); diff --git a/runtime/devices/dummy.cpp b/runtime/devices/dummy.cpp index 3b21e68..a0151fc 100644 --- a/runtime/devices/dummy.cpp +++ b/runtime/devices/dummy.cpp @@ -1,3 +1,5 @@ +#include <cstring> +//--- #include "device.hpp" namespace pairs { @@ -13,4 +15,8 @@ void copy_slice_to_host(void const *d_ptr, void *h_ptr, size_t offset, size_t co void copy_static_symbol_to_device(void *h_ptr, const void *d_ptr, size_t count) {} void copy_static_symbol_to_host(void *d_ptr, const void *h_ptr, size_t count) {} +void copy_in_device(void *d_ptr1, const void *d_ptr2, size_t count) { + std::memcpy(d_ptr1, d_ptr2, count); +} + } diff --git a/runtime/domain/regular_6d_stencil.cpp b/runtime/domain/regular_6d_stencil.cpp index 84bbf2c..d1cec87 100644 --- a/runtime/domain/regular_6d_stencil.cpp +++ b/runtime/domain/regular_6d_stencil.cpp @@ -2,6 +2,7 @@ #include <vector> //--- #include "../pairs_common.hpp" +#include "../devices/device.hpp" #include "regular_6d_stencil.hpp" namespace pairs { @@ -155,17 +156,7 @@ void Regular6DStencil::communicateData( MPI_COMM_WORLD, &send_requests[0]); */ } else { - #ifdef ENABLE_CUDA_AWARE_MPI - cudaMemcpy( - recv_prev, - send_prev, - nsend[dim * 2 + 0] * elem_size * sizeof(real_t), - cudaMemcpyDeviceToDevice); - #else - for(int i = 0; i < nsend[dim * 2 + 0] * elem_size; i++) { - recv_prev[i] = send_prev[i]; - } - #endif + pairs::copy_in_device(recv_prev, send_prev, nsend[dim * 2 + 0] * elem_size * sizeof(real_t)); } if(next[dim] != rank) { @@ -184,17 +175,7 @@ void Regular6DStencil::communicateData( MPI_COMM_WORLD, &send_requests[1]); */ } else { - #ifdef ENABLE_CUDA_AWARE_MPI - cudaMemcpy( - recv_next, - send_next, - nsend[dim * 2 + 1] * elem_size * sizeof(real_t), - cudaMemcpyDeviceToDevice); - #else - for(int i = 0; i < nsend[dim * 2 + 1] * elem_size; i++) { - recv_next[i] = send_next[i]; - } - #endif + pairs::copy_in_device(recv_next, send_next, nsend[dim * 2 + 1] * elem_size * sizeof(real_t)); } //MPI_Waitall(2, recv_requests, MPI_STATUSES_IGNORE); @@ -231,17 +212,7 @@ void Regular6DStencil::communicateAllData( MPI_COMM_WORLD, &recv_requests[d * 2 + 0]); */ } else { - #ifdef ENABLE_CUDA_AWARE_MPI - cudaMemcpy( - recv_prev, - send_prev, - nsend[d * 2 + 0] * elem_size * sizeof(real_t), - cudaMemcpyDeviceToDevice); - #else - for (int i = 0; i < nsend[d * 2 + 0] * elem_size; i++) { - recv_prev[i] = send_prev[i]; - } - #endif + pairs::copy_in_device(recv_prev, send_prev, nsend[d * 2 + 0] * elem_size * sizeof(real_t)); } if (next[d] != rank) { @@ -260,17 +231,7 @@ void Regular6DStencil::communicateAllData( MPI_COMM_WORLD, &recv_requests[d * 2 + 1]); */ } else { - #ifdef ENABLE_CUDA_AWARE_MPI - cudaMemcpy( - recv_next, - send_next, - nsend[d * 2 + 1] * elem_size * sizeof(real_t), - cudaMemcpyDeviceToDevice); - #else - for (int i = 0; i < nsend[d * 2 + 1] * elem_size; i++) { - recv_next[i] = send_next[i]; - } - #endif + pairs::copy_in_device(recv_next, send_next, nsend[d * 2 + 1] * elem_size * sizeof(real_t)); } } diff --git a/runtime/pairs.cpp b/runtime/pairs.cpp index 67c73f8..2639377 100644 --- a/runtime/pairs.cpp +++ b/runtime/pairs.cpp @@ -364,8 +364,8 @@ void PairsSimulation::communicateData( copyArrayToHost(nrecv_id, ReadOnly); #ifdef ENABLE_CUDA_AWARE_MPI - send_buf_ptr = send_buf_array.getDevicePointer(); - recv_buf_ptr = recv_buf_array.getDevicePointer(); + send_buf_ptr = (real_t *) send_buf_array.getDevicePointer(); + recv_buf_ptr = (real_t *) recv_buf_array.getDevicePointer(); #else int nsend_all = 0; int nrecv_all = 0; @@ -429,8 +429,8 @@ void PairsSimulation::communicateAllData( copyArrayToHost(nrecv_id, ReadOnly); #ifdef ENABLE_CUDA_AWARE_MPI - send_buf_ptr = send_buf_array.getDevicePointer(); - recv_buf_ptr = recv_buf_array.getDevicePointer(); + send_buf_ptr = (real_t *) send_buf_array.getDevicePointer(); + recv_buf_ptr = (real_t *) recv_buf_array.getDevicePointer(); #else int nsend_all = 0; int nrecv_all = 0; @@ -489,8 +489,8 @@ void PairsSimulation::communicateContactHistoryData( } #ifdef ENABLE_CUDA_AWARE_MPI - send_buf_ptr = send_buf_array.getDevicePointer(); - recv_buf_ptr = recv_buf_array.getDevicePointer(); + send_buf_ptr = (real_t *) send_buf_array.getDevicePointer(); + recv_buf_ptr = (real_t *) recv_buf_array.getDevicePointer(); #else copyArrayToHost(send_buf_id, Ignore, nsend_all * sizeof(real_t)); array_flags->setHostFlag(recv_buf_id); diff --git a/runtime/pairs_common.hpp b/runtime/pairs_common.hpp index 4272bc4..c3cf60d 100644 --- a/runtime/pairs_common.hpp +++ b/runtime/pairs_common.hpp @@ -9,12 +9,6 @@ typedef double real_t; //typedef float real_t; //#endif -#ifndef PAIRS_TARGET_CUDA -# ifdef ENABLE_CUDA_AWARE_MPI -# undef ENABLE_CUDA_AWARE_MPI -# endif -#endif - typedef int array_t; typedef int property_t; typedef int layout_t; @@ -60,10 +54,16 @@ enum DomainPartitioners { #ifdef DEBUG # include <assert.h> # define PAIRS_DEBUG(...) { \ + int __init_flag; \ int __rank; \ - MPI_Comm_rank(MPI_COMM_WORLD, &__rank); \ - if(__rank == 0) { \ + MPI_Initialized(&__init_flag); \ + if(__init_flag == 0) { \ fprintf(stderr, __VA_ARGS__); \ + } else { \ + MPI_Comm_rank(MPI_COMM_WORLD, &__rank); \ + if(__rank == 0) { \ + fprintf(stderr, __VA_ARGS__); \ + } \ } \ } -- GitLab