Commit d637b5ba authored by Nils Kohl

Initial commit

CMakeLists.txt 0 → 100644

cmake_minimum_required(VERSION 3.5)

# set the project name
project(mpi-sor-benchmark)

find_package(MPI REQUIRED)
include_directories(${MPI_INCLUDE_PATH})

# add the executable
add_executable(mpi-sor-benchmark main.cpp dummy.cpp kernel.cpp)
target_link_libraries(mpi-sor-benchmark ${MPI_LIBRARIES})

function( link_files_to_builddir globExpression )
    # don't need links for in-source builds
    if( CMAKE_CURRENT_SOURCE_DIR STREQUAL "${CMAKE_CURRENT_BINARY_DIR}" )
        return()
    endif()

    file( GLOB filesToLink RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} ${globExpression} )
    foreach( f ${filesToLink} )
        if( CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows" )
            configure_file( ${f} ${f} COPYONLY )
        else()
            execute_process( COMMAND ${CMAKE_COMMAND} -E create_symlink
                             ${CMAKE_CURRENT_SOURCE_DIR}/${f}
                             ${CMAKE_CURRENT_BINARY_DIR}/${f} )
        endif()
    endforeach()
endfunction( link_files_to_builddir )

link_files_to_builddir(*.py)
link_files_to_builddir(*.sh)

set( CMAKE_BUILD_TYPE "Release" )
set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -ffast-math" )
README.md 0 → 100644

# Mini benchmark to trigger SuperMUC-NG performance fluctuations (ticket 130502)

## Build instructions

Out-of-source build with cmake:

```
$ git clone https://i10git.cs.fau.de/kohl/mpi-sor-benchmark.git
$ mkdir mpi-sor-benchmark-build
$ cd mpi-sor-benchmark-build
$ cmake ../mpi-sor-benchmark
$ make
```
Create job scripts with `python create_run_script.py`.
This creates a new directory with a bunch of job files that can be submitted directly, e.g.:

```
$ python create_run_script.py
$ # creates directory '2020-01-28_13-56-08'
$ cd 2020-01-28_13-56-08
$ sbatch snoop_n96_ppn48_level6_cellspp10_inner10_outer2_2020-01-28_13-56-08.job
$ # runs the benchmark on refinement level 6, 10 cells per process, 10 inner iterations,
$ # 2 outer iterations on 96 nodes with 48 ppn
```
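
The job files are filled from a Slurm template in `create_run_script.py`. For a one-off configuration you can call the template helper directly; a minimal sketch, assuming it is run from the source directory so the module is importable (all parameter values here are illustrative):

```python
# Sketch: emit a single custom job file using the template helper from
# create_run_script.py. Parameter values are illustrative, not a recipe.
from create_run_script import supermuc_job_file_string

# prm_string is "<level> <cellsPerProcess> <outerIterations> <innerIterations> <outputFile.csv>"
job = supermuc_job_file_string(job_name="snoop_custom", wall_clock_limit="0:10:00",
                               prm_string="6 10 2 10 snoop_custom.csv",
                               num_nodes=4, ppn=48)

with open("snoop_custom.job", "w") as f:
    f.write(job)
```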
To best reproduce the results, try both levels 6 and 7 as well as 10 or 100 inner iterations.
If the number of cells per process is greater than 1, the kernel is simply executed on multiple
(different) arrays, and each execution is measured independently.
The outer-iteration parameter sets the number of times the measurement is performed.

Results are written to `.csv` files. Use `plot_csv.py output.csv 0 1` to visualize the timings per rank
for cell 0 and outer iteration 1.
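
For reference, this is the `.csv` layout produced by the benchmark (see `main.cpp`): after the header, the first row holds the global run data, and every following row holds one timing for a (rank, outer iteration, cell) triple. The values below are illustrative:

```
data_point,num_processes,outer_iterations,inner_iterations,cell_per_process,level,dofs_per_cell,rank,outer_iteration,cell_id,time
0,96,2,10,10,6,47905,,,,
1,,,,,,,0,0,0,0.0123
1,,,,,,,1,0,0,0.0125
...
```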
Example output:

![image](snoop_0_1.png)
create_run_script.py 0 → 100644

import time
import datetime
import os


def supermuc_job_file_string(job_name="job", wall_clock_limit="1:00:00", prm_string="parameter_file.prm", num_nodes=1, ppn=48):
    # SuperMUC-NG partition by node count; node counts above 3072 are not
    # used by supermuc_scaling() below.
    def partition(num_nodes):
        if num_nodes <= 16:
            return "micro"
        elif num_nodes <= 768:
            return "general"
        elif num_nodes <= 3072:
            return "large"

    constraint = ""
    if num_nodes <= 792:
        constraint = "#SBATCH --constraint=[i01|i02|i03|i04|i05|i06|i07|i08]"

    base_config = """#!/bin/bash
# Job Name and Files (also --job-name)
#SBATCH -J {job_name}
#Output and error (also --output, --error):
#SBATCH -o ./%x.%j.out
#SBATCH -e ./%x.%j.err
#Initial working directory (also --chdir):
#SBATCH -D ./
#Notification and type
#SBATCH --mail-type=END
#SBATCH --mail-user=nils.kohl@fau.de
# Wall clock limit:
#SBATCH --time={wall_clock_limit}
#SBATCH --no-requeue
#Setup of execution environment
#SBATCH --export=NONE
#SBATCH --get-user-env
#SBATCH --account=pr86ma
#SBATCH --ear=off
#SBATCH --partition={partition}
#Number of nodes and MPI tasks per node:
#SBATCH --nodes={num_nodes}
#SBATCH --ntasks-per-node={ppn}
{constraint}
module load slurm_setup
cd ..
pwd
ls -lha
source load_modules.sh
module list
#Run the program:
mpiexec -n $SLURM_NTASKS ./mpi-sor-benchmark {prm_string}
""".format(job_name=job_name, wall_clock_limit=wall_clock_limit, num_nodes=num_nodes, prm_string=prm_string,
           partition=partition(num_nodes), constraint=constraint, ppn=ppn)
    return base_config


def supermuc_scaling():
    # one directory per invocation, named by timestamp; all job and result
    # files carry the same id
    some_id = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d_%H-%M-%S')
    os.mkdir(some_id)

    ppn = 48
    outer_iterations = 2
    inner_iterations_list = [1, 10, 100]
    cells_pp_list = [2, 10]
    level_list = [6, 7]

    for num_nodes in [1, 2, 6, 12, 24, 48, 96, 192, 384, 768, 1536, 3072]:
        for inner_iterations in inner_iterations_list:
            for cells_pp in cells_pp_list:
                for level in level_list:
                    job_name = "snoop_n{}_ppn{}_level{}_cellspp{}_inner{}_outer{}_{}".format(num_nodes, ppn, level, cells_pp, inner_iterations, outer_iterations, some_id)
                    job_file_name = job_name + ".job"
                    db_file = job_name + ".csv"
                    # benchmark arguments: <level> <cellsPerProcess> <outerIterations> <innerIterations> <outputFile.csv>
                    prm_string = "{} {} {} {} {}".format(level, cells_pp, outer_iterations, inner_iterations, db_file)
                    job_string = supermuc_job_file_string(job_name=job_name, wall_clock_limit="0:10:00",
                                                          num_nodes=num_nodes, prm_string=prm_string, ppn=ppn)
                    with open(os.path.join(some_id, job_file_name), "w") as f:
                        f.write(job_string)


if __name__ == "__main__":
    supermuc_scaling()
#include "dummy.h"
void dummy(double *a) {}
\ No newline at end of file
dummy.h 0 → 100644

#pragma once

void dummy(double *a);
kernel.cpp 0 → 100644 (diff collapsed)
kernel.h 0 → 100644
#pragma once

#ifdef __GNUC__
#define RESTRICT __restrict__
#elif defined(_MSC_VER)
#define RESTRICT __restrict
#else
#define RESTRICT
#endif

void kernel(double const *RESTRICT const _data_edgeCellDst_X,
            double const *RESTRICT const _data_edgeCellDst_XY,
            double const *RESTRICT const _data_edgeCellDst_XYZ,
            double const *RESTRICT const _data_edgeCellDst_XZ,
            double const *RESTRICT const _data_edgeCellDst_Y,
            double const *RESTRICT const _data_edgeCellDst_YZ,
            double const *RESTRICT const _data_edgeCellDst_Z,
            double *RESTRICT _data_vertexCellDst,
            double const *RESTRICT const _data_vertexCellRhs,
            double const *RESTRICT const _data_xi, unsigned level,
            double relax);
load_modules.sh 0 → 100644

#!/usr/bin/env bash
module unload devEnv
module load devEnv/GCC
module load boost
module load petsc
export CC=gcc
export CXX=g++
main.cpp 0 → 100644
#include "dummy.h"
#include "kernel.h"
#include <chrono>
#include <fstream>
#include <iostream>
#include <mpi.h>
#include <sstream>
#include <vector>
double get_wc_time() {
return static_cast<double>(
std::chrono::duration_cast<std::chrono::nanoseconds>(
std::chrono::high_resolution_clock::now().time_since_epoch())
.count()) *
1e-9;
}
void runBenchmark(unsigned level, unsigned numCellsPerProcess,
                  unsigned numOuterSORIterations,
                  unsigned numInnerSORIterations, const std::string &dbFile) {
  int numProcesses;
  MPI_Comm_size(MPI_COMM_WORLD, &numProcesses);
  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  if (rank == 0) {
    std::cout << "Parameters:" << std::endl;
    std::cout << " - num processes: " << numProcesses << std::endl;
    std::cout << " - level: " << level << std::endl;
    std::cout << " - cells / process: " << numCellsPerProcess << std::endl;
    std::cout << " - outer iterations: " << numOuterSORIterations << std::endl;
    std::cout << " - inner iterations: " << numInnerSORIterations << std::endl;
    std::cout << "" << std::endl;
  }

  // Array sizes: tetrahedral numbers over the edge widths of the refined cell.
  const unsigned v_width = (1u << level) + 1;
  const unsigned v_size = ((v_width + 2) * (v_width + 1) * v_width) / 6;
  const unsigned e_width = (1u << level);
  const unsigned e_offset = ((e_width + 2) * (e_width + 1) * e_width) / 6;
  const unsigned e_width_xyz = (1u << level) - 1;
  const unsigned e_offset_xyz =
      ((e_width_xyz + 2) * (e_width_xyz + 1) * e_width_xyz) / 6;
  const unsigned size_arrays_per_cell_in_byte =
      (2 * v_size + 6 * e_offset + e_offset_xyz) * 8;

  if (rank == 0) {
    std::cout << "size of destination array (slightly larger than number of "
                 "updated points per cell and iteration): "
              << v_size << std::endl;
    std::cout << "size of arrays that are allocated per cell and loaded per "
                 "kernel call: "
              << v_size << " + " << v_size << " + 6 * " << e_offset << " + "
              << e_offset_xyz << " (~ "
              << size_arrays_per_cell_in_byte / 1000000. << " MB)" << std::endl;
    std::cout << "" << std::endl;
  }

  std::vector<std::vector<double>> v_dst(numCellsPerProcess);
  std::vector<std::vector<double>> v_rhs(numCellsPerProcess);
  std::vector<std::vector<double>> e_src(numCellsPerProcess);

  std::vector<double> xi(100);
  for (unsigned i = 0; i < 100; i++) {
    xi[i] = 0.1 * (double)i;
  }

  for (unsigned cellID = 0; cellID < numCellsPerProcess; cellID++) {
    v_dst[cellID] = std::vector<double>(v_size, 42.);
    v_rhs[cellID] = std::vector<double>(v_size, 42.);
    e_src[cellID] = std::vector<double>(6 * e_offset + e_offset_xyz, 42.);
  }

  // sorTimings[ cellID ][ outerIter ] = timing;
  std::vector<std::vector<double>> sorTimings(numCellsPerProcess);
  for (unsigned id = 0; id < numCellsPerProcess; id++) {
    sorTimings[id] = std::vector<double>(numOuterSORIterations);
  }

  if (rank == 0) {
    std::cout << "Running benchmark ..." << std::endl;
    std::cout << "" << std::endl;
  }

  for (unsigned cellID = 0; cellID < numCellsPerProcess; cellID++) {
    const double relax = 1.1;
    for (unsigned outer = 0; outer < numOuterSORIterations; outer++) {
      // All ranks start each measurement together.
      MPI_Barrier(MPI_COMM_WORLD);
      double start = get_wc_time();
      for (unsigned inner = 0; inner < numInnerSORIterations; inner++) {
        kernel(&e_src[cellID][0], &e_src[cellID][3 * e_offset],
               &e_src[cellID][6 * e_offset], &e_src[cellID][4 * e_offset],
               &e_src[cellID][1 * e_offset], &e_src[cellID][5 * e_offset],
               &e_src[cellID][2 * e_offset], v_dst[cellID].data(),
               v_rhs[cellID].data(), xi.data(), level, relax);
      }
      sorTimings[cellID][outer] = get_wc_time() - start;
    }
    // Opaque calls into another translation unit so the arrays (and hence
    // the kernel loop) cannot be optimized away.
    dummy(v_dst[cellID].data());
    dummy(v_rhs[cellID].data());
    dummy(e_src[cellID].data());
  }
  // allSorTimings[ cellID ][ outerIter ][ rank ] = timing;
  std::vector<std::vector<std::vector<double>>> allSorTimings(
      numCellsPerProcess);
  for (unsigned id = 0; id < numCellsPerProcess; id++) {
    allSorTimings[id] = std::vector<std::vector<double>>(numOuterSORIterations);
    for (unsigned it = 0; it < numOuterSORIterations; it++) {
      allSorTimings[id][it] = std::vector<double>(numProcesses);
      void *recv_buffer = nullptr;
      if (rank == 0) {
        recv_buffer = allSorTimings[id][it].data();
      }
      MPI_Gather(&sorTimings[id][it], 1, MPI_DOUBLE, recv_buffer, 1, MPI_DOUBLE,
                 0, MPI_COMM_WORLD);
    }
  }
  if (rank == 0) {
    std::stringstream myfile;

    // header
    myfile << "data_point,";
    myfile << "num_processes,";
    myfile << "outer_iterations,";
    myfile << "inner_iterations,";
    myfile << "cell_per_process,";
    myfile << "level,";
    myfile << "dofs_per_cell,";
    myfile << "rank,";
    myfile << "outer_iteration,";
    myfile << "cell_id,";
    myfile << "time\n";

    // global data
    myfile << 0 << ",";
    myfile << numProcesses << ",";
    myfile << numOuterSORIterations << ",";
    myfile << numInnerSORIterations << ",";
    myfile << numCellsPerProcess << ",";
    myfile << level << ",";
    myfile << v_size << ",";
    myfile << ",";
    myfile << ",";
    myfile << ",";
    myfile << "\n";

    // one row per (rank, outer iteration, cell)
    for (unsigned id = 0; id < numCellsPerProcess; id++) {
      for (unsigned it = 0; it < numOuterSORIterations; it++) {
        for (int r = 0; r < numProcesses; r++) {
          myfile << 1 << ",";
          myfile << ",";
          myfile << ",";
          myfile << ",";
          myfile << ",";
          myfile << ",";
          myfile << ",";
          myfile << r << ",";
          myfile << it << ",";
          myfile << id << ",";
          myfile << allSorTimings[id][it][r] << "\n";
        }
      }
    }

    double min = std::numeric_limits<double>::max();
    double max = 0;
    double avg = 0;
    double maxDiff = 0;
    unsigned maxDiffCellID = 0;
    unsigned maxDiffIteration = 0;

    // Find the measurement with the largest gap between the slowest rank and
    // the average over all ranks.
    for (unsigned id = 0; id < numCellsPerProcess; id++) {
      for (unsigned it = 0; it < numOuterSORIterations; it++) {
        double itMin = std::numeric_limits<double>::max();
        double itMax = 0;
        double itSum = 0;
        for (int r = 0; r < numProcesses; r++) {
          const double time = allSorTimings[id][it][r];
          if (itMin > time)
            itMin = time;
          if (itMax < time)
            itMax = time;
          itSum += time;
        }
        double itAvg = itSum / static_cast<double>(numProcesses);
        double itDiff = itMax - itAvg;
        if (maxDiff <= itDiff) {
          maxDiff = itDiff;
          avg = itAvg;
          max = itMax;
          min = itMin;
          maxDiffCellID = id;
          maxDiffIteration = it;
        }
      }
    }

    std::cout << "Measurement with largest difference (max(rank) - average(all "
                 "ranks)):"
              << std::endl;
    std::cout << " - min: " << min << std::endl;
    std::cout << " - max: " << max << std::endl;
    std::cout << " - avg: " << avg << std::endl;
    std::cout << " - max - avg: " << max - avg << " ("
              << (maxDiff / avg) * 100. << "% of average)" << std::endl;
    std::cout << " - cell ID: " << maxDiffCellID << std::endl;
    std::cout << " - outer iteration: " << maxDiffIteration << std::endl;
    std::cout << "" << std::endl;

    std::cout << "Writing root data (global run data) ..." << std::endl;
    std::ofstream outputFile;
    outputFile.open(dbFile);
    outputFile << myfile.str();
    outputFile.close();
  }
}
int main(int argc, char **argv) {
  MPI_Init(nullptr, nullptr);

  int num_processes;
  MPI_Comm_size(MPI_COMM_WORLD, &num_processes);
  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  int level, numCellsPerProcess, numOuterSORIterations, numInnerSORIterations;
  std::string outputFile;

  if (rank == 0) {
    std::cout << "Parallel SOR benchmark." << std::endl;
    std::cout << "Performs stencil-based SOR iterations on structured refined "
                 "tetrahedral domains.\n"
              << std::endl;
  }

  if (argc != 6) {
    if (rank == 0) {
      std::cout << "Usage:" << std::endl;
      std::cout << " ./mpi-sor-benchmark <level> <numCellsPerProcess> "
                   "<numOuterIterations> <numInnerIterations> <outputFile.csv>"
                << std::endl;
      std::cout << "" << std::endl;
      std::cout << "Parameters:" << std::endl;
      std::cout << " - level: refinement level of the tet (6 "
                   "or 7 should be used for total memory allocation of ~3MB "
                   "and ~25MB per cell)"
                << std::endl;
      std::cout << " - numCellsPerProcess: number of different tetrahedrons "
                   "to process (to vary the memory addresses, each tetrahedron "
                   "is measured individually)"
                << std::endl;
      std::cout << " - numOuterSORIterations: number of measured iterations "
                   "per tetrahedron"
                << std::endl;
      std::cout << " - numInnerSORIterations: number of relaxation iterations "
                   "performed per measurement/outer iteration"
                << std::endl;
      std::cout << " - outputFile.csv: file to output the detailed "
                   "measurements per rank, cell and outer iteration"
                << std::endl;
    }
    MPI_Finalize();
    return EXIT_SUCCESS;
  } else {
    level = std::atoi(argv[1]);
    numCellsPerProcess = std::atoi(argv[2]);
    numOuterSORIterations = std::atoi(argv[3]);
    numInnerSORIterations = std::atoi(argv[4]);
    outputFile = std::string(argv[5]);
  }

  runBenchmark(level, numCellsPerProcess, numOuterSORIterations,
               numInnerSORIterations, outputFile);

  MPI_Finalize();
}
plot_csv.py 0 → 100644

import csv
import matplotlib.pyplot as plt
import sys


def read_csv(csv_file):
    data = []
    with open(csv_file, mode='r') as infile:
        reader = csv.DictReader(infile)
        for i, r in enumerate(reader):
            if i == 0:
                # the first row after the header carries the global run data
                info = r
            else:
                data.append(r)
    return info, data


def plot_comparison(csv_file, cell_id, outer_iteration):
    info, data = read_csv(csv_file)
    num_processes = info['num_processes']
    plot_data = [float(v["time"]) for v in data
                 if v["outer_iteration"] == str(outer_iteration) and v["cell_id"] == str(cell_id)]
    # plt.subplot(1, len(db_files), i+1)
    # plt.plot(plot_data, label="timings");
    # plt.plot(sorted(plot_data), label="timings sorted");
    plt.bar(list(range(len(plot_data))), plot_data, width=1.0, align='center')
    plt.title("{} processes, cell {}, outer iteration {}".format(int(num_processes), cell_id, outer_iteration))
    plt.xlabel("ranks")
    plt.ylabel("time in seconds")
    plt.show()


if __name__ == "__main__":
    usage = "Plot benchmark csv data.\nUsage: plot_csv.py <data.csv> <cell_id> <outer_iteration>"
    if len(sys.argv) != 4:
        print(usage)
    else:
        plot_comparison(sys.argv[1], int(sys.argv[2]), int(sys.argv[3]))
snoop_0_1.png 0 → 100644 (34.3 KiB)