Commit d637b5ba authored by Nils Kohl

Initial commit

CMakeLists.txt 0 → 100644

cmake_minimum_required(VERSION 3.5)

# set the project name
project(mpi-sor-benchmark)

find_package(MPI REQUIRED)
include_directories(${MPI_INCLUDE_PATH})

# add the executable
add_executable(mpi-sor-benchmark main.cpp dummy.cpp kernel.cpp)
target_link_libraries(mpi-sor-benchmark ${MPI_LIBRARIES})

function( link_files_to_builddir globExpression )
    # don't need links for in-source builds
    if( CMAKE_CURRENT_SOURCE_DIR STREQUAL "${CMAKE_CURRENT_BINARY_DIR}" )
        return()
    endif()

    file( GLOB filesToLink RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} ${globExpression} )
    foreach( f ${filesToLink} )
        if( CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows" )
            configure_file( ${f} ${f} COPYONLY )
        else()
            execute_process( COMMAND ${CMAKE_COMMAND} -E create_symlink
                             ${CMAKE_CURRENT_SOURCE_DIR}/${f}
                             ${CMAKE_CURRENT_BINARY_DIR}/${f} )
        endif()
    endforeach()
endfunction( link_files_to_builddir )

link_files_to_builddir(*.py)
link_files_to_builddir(*.sh)

set( CMAKE_BUILD_TYPE "Release" )
set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -ffast-math" )
README.md 0 → 100644

# Mini benchmark to trigger SuperMUC-NG performance fluctuations (ticket 130502)

## Build instructions

Out-of-source build with cmake:

```
$ git clone https://i10git.cs.fau.de/kohl/mpi-sor-benchmark.git
$ mkdir mpi-sor-benchmark-build
$ cd mpi-sor-benchmark-build
$ cmake ../mpi-sor-benchmark
$ make
```
Create job scripts with `python create_run_script.py`.
This creates a new directory with a bunch of job files that can be submitted directly, e.g.:

```
$ python create_run_script.py
$ # creates directory '2020-01-28_13-56-08'
$ cd 2020-01-28_13-56-08
$ sbatch snoop_n96_ppn48_level6_cellspp10_inner10_outer2_2020-01-28_13-56-08.job
$ # runs the benchmark on refinement level 6, 10 cells per process, 10 inner iterations,
$ # 2 outer iterations on 96 nodes with 48 ppn
```
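
The job files are filled from a Slurm template in `create_run_script.py`. For a one-off configuration you can call the template helper directly; a minimal sketch, assuming it is run from the source directory so the module is importable (all parameter values here are illustrative):

```python
# Sketch: emit a single custom job file using the template helper from
# create_run_script.py. Parameter values are illustrative, not a recipe.
from create_run_script import supermuc_job_file_string

# prm_string is "<level> <cellsPerProcess> <outerIterations> <innerIterations> <outputFile.csv>"
job = supermuc_job_file_string(job_name="snoop_custom", wall_clock_limit="0:10:00",
                               prm_string="6 10 2 10 snoop_custom.csv",
                               num_nodes=4, ppn=48)

with open("snoop_custom.job", "w") as f:
    f.write(job)
```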
To best reproduce the results, try both levels 6 and 7 as well as 10 or 100 inner iterations.
If the number of cells per process is greater than 1, the kernel is simply executed on multiple
(different) arrays, and each execution is measured independently.
The outer-iteration parameter sets the number of times the measurement is performed.

Results are written to `.csv` files. Use `plot_csv.py output.csv 0 1` to visualize the timings per rank
for cell 0 and outer iteration 1.
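
For reference, this is the `.csv` layout produced by the benchmark (see `main.cpp`): after the header, the first row holds the global run data, and every following row holds one timing for a (rank, outer iteration, cell) triple. The values below are illustrative:

```
data_point,num_processes,outer_iterations,inner_iterations,cell_per_process,level,dofs_per_cell,rank,outer_iteration,cell_id,time
0,96,2,10,10,6,47905,,,,
1,,,,,,,0,0,0,0.0123
1,,,,,,,1,0,0,0.0125
...
```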
Example output:

![image](snoop_0_1.png)
create_run_script.py 0 → 100644

import time
import datetime
import os


def supermuc_job_file_string(job_name="job", wall_clock_limit="1:00:00", prm_string="parameter_file.prm", num_nodes=1, ppn=48):
    # SuperMUC-NG partition by node count; node counts above 3072 are not
    # used by supermuc_scaling() below.
    def partition(num_nodes):
        if num_nodes <= 16:
            return "micro"
        elif num_nodes <= 768:
            return "general"
        elif num_nodes <= 3072:
            return "large"

    constraint = ""
    if num_nodes <= 792:
        constraint = "#SBATCH --constraint=[i01|i02|i03|i04|i05|i06|i07|i08]"

    base_config = """#!/bin/bash
# Job Name and Files (also --job-name)
#SBATCH -J {job_name}
#Output and error (also --output, --error):
#SBATCH -o ./%x.%j.out
#SBATCH -e ./%x.%j.err
#Initial working directory (also --chdir):
#SBATCH -D ./
#Notification and type
#SBATCH --mail-type=END
#SBATCH --mail-user=nils.kohl@fau.de
# Wall clock limit:
#SBATCH --time={wall_clock_limit}
#SBATCH --no-requeue
#Setup of execution environment
#SBATCH --export=NONE
#SBATCH --get-user-env
#SBATCH --account=pr86ma
#SBATCH --ear=off
#SBATCH --partition={partition}
#Number of nodes and MPI tasks per node:
#SBATCH --nodes={num_nodes}
#SBATCH --ntasks-per-node={ppn}
{constraint}
module load slurm_setup
cd ..
pwd
ls -lha
source load_modules.sh
module list
#Run the program:
mpiexec -n $SLURM_NTASKS ./mpi-sor-benchmark {prm_string}
""".format(job_name=job_name, wall_clock_limit=wall_clock_limit, num_nodes=num_nodes, prm_string=prm_string,
           partition=partition(num_nodes), constraint=constraint, ppn=ppn)
    return base_config


def supermuc_scaling():
    # one directory per invocation, named by timestamp; all job and result
    # files carry the same id
    some_id = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d_%H-%M-%S')
    os.mkdir(some_id)

    ppn = 48
    outer_iterations = 2
    inner_iterations_list = [1, 10, 100]
    cells_pp_list = [2, 10]
    level_list = [6, 7]

    for num_nodes in [1, 2, 6, 12, 24, 48, 96, 192, 384, 768, 1536, 3072]:
        for inner_iterations in inner_iterations_list:
            for cells_pp in cells_pp_list:
                for level in level_list:
                    job_name = "snoop_n{}_ppn{}_level{}_cellspp{}_inner{}_outer{}_{}".format(num_nodes, ppn, level, cells_pp, inner_iterations, outer_iterations, some_id)
                    job_file_name = job_name + ".job"
                    db_file = job_name + ".csv"
                    # benchmark arguments: <level> <cellsPerProcess> <outerIterations> <innerIterations> <outputFile.csv>
                    prm_string = "{} {} {} {} {}".format(level, cells_pp, outer_iterations, inner_iterations, db_file)
                    job_string = supermuc_job_file_string(job_name=job_name, wall_clock_limit="0:10:00",
                                                          num_nodes=num_nodes, prm_string=prm_string, ppn=ppn)
                    with open(os.path.join(some_id, job_file_name), "w") as f:
                        f.write(job_string)


if __name__ == "__main__":
    supermuc_scaling()
#include "dummy.h"
void dummy(double *a) {}
\ No newline at end of file
dummy.h 0 → 100644

#pragma once

void dummy(double *a);
kernel.cpp 0 → 100644 (diff collapsed)
kernel.h 0 → 100644
#pragma once

#ifdef __GNUC__
#define RESTRICT __restrict__
#elif defined(_MSC_VER)
#define RESTRICT __restrict
#else
#define RESTRICT
#endif

void kernel(double const *RESTRICT const _data_edgeCellDst_X,
            double const *RESTRICT const _data_edgeCellDst_XY,
            double const *RESTRICT const _data_edgeCellDst_XYZ,
            double const *RESTRICT const _data_edgeCellDst_XZ,
            double const *RESTRICT const _data_edgeCellDst_Y,
            double const *RESTRICT const _data_edgeCellDst_YZ,
            double const *RESTRICT const _data_edgeCellDst_Z,
            double *RESTRICT _data_vertexCellDst,
            double const *RESTRICT const _data_vertexCellRhs,
            double const *RESTRICT const _data_xi, unsigned level,
            double relax);
load_modules.sh 0 → 100644

#!/usr/bin/env bash
module unload devEnv
module load devEnv/GCC
module load boost
module load petsc
export CC=gcc
export CXX=g++
main.cpp 0 → 100644
#include "dummy.h"
#include "kernel.h"
#include <chrono>
#include <fstream>
#include <iostream>
#include <mpi.h>
#include <sstream>
#include <vector>
double get_wc_time() {
return static_cast<double>(
std::chrono::duration_cast<std::chrono::nanoseconds>(
std::chrono::high_resolution_clock::now().time_since_epoch())
.count()) *
1e-9;
}
void runBenchmark(unsigned level, unsigned numCellsPerProcess,
                  unsigned numOuterSORIterations,
                  unsigned numInnerSORIterations, const std::string &dbFile) {
  int numProcesses;
  MPI_Comm_size(MPI_COMM_WORLD, &numProcesses);
  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  if (rank == 0) {
    std::cout << "Parameters:" << std::endl;
    std::cout << " - num processes: " << numProcesses << std::endl;
    std::cout << " - level: " << level << std::endl;
    std::cout << " - cells / process: " << numCellsPerProcess << std::endl;
    std::cout << " - outer iterations: " << numOuterSORIterations << std::endl;
    std::cout << " - inner iterations: " << numInnerSORIterations << std::endl;
    std::cout << "" << std::endl;
  }

  // Array sizes: tetrahedral numbers over the edge widths of the refined cell.
  const unsigned v_width = (1u << level) + 1;
  const unsigned v_size = ((v_width + 2) * (v_width + 1) * v_width) / 6;
  const unsigned e_width = (1u << level);
  const unsigned e_offset = ((e_width + 2) * (e_width + 1) * e_width) / 6;
  const unsigned e_width_xyz = (1u << level) - 1;
  const unsigned e_offset_xyz =
      ((e_width_xyz + 2) * (e_width_xyz + 1) * e_width_xyz) / 6;
  const unsigned size_arrays_per_cell_in_byte =
      (2 * v_size + 6 * e_offset + e_offset_xyz) * 8;

  if (rank == 0) {
    std::cout << "size of destination array (slightly larger than number of "
                 "updated points per cell and iteration): "
              << v_size << std::endl;
    std::cout << "size of arrays that are allocated per cell and loaded per "
                 "kernel call: "
              << v_size << " + " << v_size << " + 6 * " << e_offset << " + "
              << e_offset_xyz << " (~ "
              << size_arrays_per_cell_in_byte / 1000000. << " MB)" << std::endl;
    std::cout << "" << std::endl;
  }

  std::vector<std::vector<double>> v_dst(numCellsPerProcess);
  std::vector<std::vector<double>> v_rhs(numCellsPerProcess);
  std::vector<std::vector<double>> e_src(numCellsPerProcess);

  std::vector<double> xi(100);
  for (unsigned i = 0; i < 100; i++) {
    xi[i] = 0.1 * (double)i;
  }

  for (unsigned cellID = 0; cellID < numCellsPerProcess; cellID++) {
    v_dst[cellID] = std::vector<double>(v_size, 42.);
    v_rhs[cellID] = std::vector<double>(v_size, 42.);
    e_src[cellID] = std::vector<double>(6 * e_offset + e_offset_xyz, 42.);
  }

  // sorTimings[ cellID ][ outerIter ] = timing;
  std::vector<std::vector<double>> sorTimings(numCellsPerProcess);
  for (unsigned id = 0; id < numCellsPerProcess; id++) {
    sorTimings[id] = std::vector<double>(numOuterSORIterations);
  }

  if (rank == 0) {
    std::cout << "Running benchmark ..." << std::endl;
    std::cout << "" << std::endl;
  }

  for (unsigned cellID = 0; cellID < numCellsPerProcess; cellID++) {
    const double relax = 1.1;
    for (unsigned outer = 0; outer < numOuterSORIterations; outer++) {
      // All ranks start each measurement together.
      MPI_Barrier(MPI_COMM_WORLD);
      double start = get_wc_time();
      for (unsigned inner = 0; inner < numInnerSORIterations; inner++) {
        kernel(&e_src[cellID][0], &e_src[cellID][3 * e_offset],
               &e_src[cellID][6 * e_offset], &e_src[cellID][4 * e_offset],
               &e_src[cellID][1 * e_offset], &e_src[cellID][5 * e_offset],
               &e_src[cellID][2 * e_offset], v_dst[cellID].data(),
               v_rhs[cellID].data(), xi.data(), level, relax);
      }
      sorTimings[cellID][outer] = get_wc_time() - start;
    }
    // Opaque calls into another translation unit so the arrays (and hence
    // the kernel loop) cannot be optimized away.
    dummy(v_dst[cellID].data());
    dummy(v_rhs[cellID].data());
    dummy(e_src[cellID].data());
  }
  // allSorTimings[ cellID ][ outerIter ][ rank ] = timing;
  std::vector<std::vector<std::vector<double>>> allSorTimings(
      numCellsPerProcess);
  for (unsigned id = 0; id < numCellsPerProcess; id++) {
    allSorTimings[id] = std::vector<std::vector<double>>(numOuterSORIterations);
    for (unsigned it = 0; it < numOuterSORIterations; it++) {
      allSorTimings[id][it] = std::vector<double>(numProcesses);
      void *recv_buffer = nullptr;
      if (rank == 0) {
        recv_buffer = allSorTimings[id][it].data();
      }
      MPI_Gather(&sorTimings[id][it], 1, MPI_DOUBLE, recv_buffer, 1, MPI_DOUBLE,
                 0, MPI_COMM_WORLD);
    }
  }
  if (rank == 0) {
    std::stringstream myfile;

    // header
    myfile << "data_point,";
    myfile << "num_processes,";
    myfile << "outer_iterations,";
    myfile << "inner_iterations,";
    myfile << "cell_per_process,";
    myfile << "level,";
    myfile << "dofs_per_cell,";
    myfile << "rank,";
    myfile << "outer_iteration,";
    myfile << "cell_id,";
    myfile << "time\n";

    // global data
    myfile << 0 << ",";
    myfile << numProcesses << ",";
    myfile << numOuterSORIterations << ",";
    myfile << numInnerSORIterations << ",";
    myfile << numCellsPerProcess << ",";
    myfile << level << ",";
    myfile << v_size << ",";
    myfile << ",";
    myfile << ",";
    myfile << ",";
    myfile << "\n";

    // one row per (rank, outer iteration, cell)
    for (unsigned id = 0; id < numCellsPerProcess; id++) {
      for (unsigned it = 0; it < numOuterSORIterations; it++) {
        for (int r = 0; r < numProcesses; r++) {
          myfile << 1 << ",";
          myfile << ",";
          myfile << ",";
          myfile << ",";
          myfile << ",";
          myfile << ",";
          myfile << ",";
          myfile << r << ",";
          myfile << it << ",";
          myfile << id << ",";
          myfile << allSorTimings[id][it][r] << "\n";
        }
      }
    }

    double min = std::numeric_limits<double>::max();
    double max = 0;
    double avg = 0;
    double maxDiff = 0;
    unsigned maxDiffCellID = 0;
    unsigned maxDiffIteration = 0;

    // Find the measurement with the largest gap between the slowest rank and
    // the average over all ranks.
    for (unsigned id = 0; id < numCellsPerProcess; id++) {
      for (unsigned it = 0; it < numOuterSORIterations; it++) {
        double itMin = std::numeric_limits<double>::max();
        double itMax = 0;
        double itSum = 0;
        for (int r = 0; r < numProcesses; r++) {
          const double time = allSorTimings[id][it][r];
          if (itMin > time)
            itMin = time;
          if (itMax < time)
            itMax = time;
          itSum += time;
        }
        double itAvg = itSum / static_cast<double>(numProcesses);
        double itDiff = itMax - itAvg;
        if (maxDiff <= itDiff) {
          maxDiff = itDiff;
          avg = itAvg;
          max = itMax;
          min = itMin;
          maxDiffCellID = id;
          maxDiffIteration = it;
        }
      }
    }

    std::cout << "Measurement with largest difference (max(rank) - average(all "
                 "ranks)):"
              << std::endl;
    std::cout << " - min: " << min << std::endl;
    std::cout << " - max: " << max << std::endl;
    std::cout << " - avg: " << avg << std::endl;
    std::cout << " - max - avg: " << max - avg << " ("
              << (maxDiff / avg) * 100. << "% of average)" << std::endl;
    std::cout << " - cell ID: " << maxDiffCellID << std::endl;
    std::cout << " - outer iteration: " << maxDiffIteration << std::endl;
    std::cout << "" << std::endl;

    std::cout << "Writing root data (global run data) ..." << std::endl;
    std::ofstream outputFile;
    outputFile.open(dbFile);
    outputFile << myfile.str();
    outputFile.close();
  }
}
int main(int argc, char **argv) {
  MPI_Init(nullptr, nullptr);

  int num_processes;
  MPI_Comm_size(MPI_COMM_WORLD, &num_processes);
  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  int level, numCellsPerProcess, numOuterSORIterations, numInnerSORIterations;
  std::string outputFile;

  if (rank == 0) {
    std::cout << "Parallel SOR benchmark." << std::endl;
    std::cout << "Performs stencil-based SOR iterations on structured refined "
                 "tetrahedral domains.\n"
              << std::endl;
  }

  if (argc != 6) {
    if (rank == 0) {
      std::cout << "Usage:" << std::endl;
      std::cout << " ./mpi-sor-benchmark <level> <numCellsPerProcess> "
                   "<numOuterIterations> <numInnerIterations> <outputFile.csv>"
                << std::endl;
      std::cout << "" << std::endl;
      std::cout << "Parameters:" << std::endl;
      std::cout << " - level: refinement level of the tet (6 "
                   "or 7 should be used for total memory allocation of ~3MB "
                   "and ~25MB per cell)"
                << std::endl;
      std::cout << " - numCellsPerProcess: number of different tetrahedrons "
                   "to process (to vary the memory addresses, each tetrahedron "
                   "is measured individually)"
                << std::endl;
      std::cout << " - numOuterSORIterations: number of measured iterations "
                   "per tetrahedron"
                << std::endl;
      std::cout << " - numInnerSORIterations: number of relaxation iterations "
                   "performed per measurement/outer iteration"
                << std::endl;
      std::cout << " - outputFile.csv: file to output the detailed "
                   "measurements per rank, cell and outer iteration"
                << std::endl;
    }
    MPI_Finalize();
    return EXIT_SUCCESS;
  } else {
    level = std::atoi(argv[1]);
    numCellsPerProcess = std::atoi(argv[2]);
    numOuterSORIterations = std::atoi(argv[3]);
    numInnerSORIterations = std::atoi(argv[4]);
    outputFile = std::string(argv[5]);
  }

  runBenchmark(level, numCellsPerProcess, numOuterSORIterations,
               numInnerSORIterations, outputFile);

  MPI_Finalize();
}
plot_csv.py 0 → 100644

import csv
import matplotlib.pyplot as plt
import sys


def read_csv(csv_file):
    data = []
    with open(csv_file, mode='r') as infile:
        reader = csv.DictReader(infile)
        for i, r in enumerate(reader):
            if i == 0:
                # the first row after the header carries the global run data
                info = r
            else:
                data.append(r)
    return info, data


def plot_comparison(csv_file, cell_id, outer_iteration):
    info, data = read_csv(csv_file)
    num_processes = info['num_processes']
    plot_data = [float(v["time"]) for v in data
                 if v["outer_iteration"] == str(outer_iteration) and v["cell_id"] == str(cell_id)]
    # plt.subplot(1, len(db_files), i+1)
    # plt.plot(plot_data, label="timings");
    # plt.plot(sorted(plot_data), label="timings sorted");
    plt.bar(list(range(len(plot_data))), plot_data, width=1.0, align='center')
    plt.title("{} processes, cell {}, outer iteration {}".format(int(num_processes), cell_id, outer_iteration))
    plt.xlabel("ranks")
    plt.ylabel("time in seconds")
    plt.show()


if __name__ == "__main__":
    usage = "Plot benchmark csv data.\nUsage: plot_csv.py <data.csv> <cell_id> <outer_iteration>"
    if len(sys.argv) != 4:
        print(usage)
    else:
        plot_comparison(sys.argv[1], int(sys.argv[2]), int(sys.argv[3]))
snoop_0_1.png 0 → 100644 (34.3 KiB)