diff --git a/program/main.cu b/program/main.cu index 87ba87ced73ce658028464478235ecab33dc74ec..068893526e27032b9196b33d1d0547c01180cf23 100644 --- a/program/main.cu +++ b/program/main.cu @@ -6,6 +6,11 @@ #include "source/ugblock.h" #include <math.h> #include <chrono> +#include <QMessageBox> +#include <QApplication> +#include <QScopedPointer> +#include <QProgressDialog> + using namespace ::_COLSAMM_; @@ -26,6 +31,10 @@ double modeProfile(double rr) { void profilingHelper(int n, int iteration, bool Cuda) { + QProgressDialog progress("Executing Iteration.", "Abort Calculation", 0, iteration); + progress.setWindowModality(Qt::WindowModal); + progress.setValue(0); + std::ofstream DATEI; //physical constsnts @@ -117,6 +126,13 @@ void profilingHelper(int n, int iteration, bool Cuda) //Measure execution time std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now(); for (int i = 0; i < iteration; ++i) { + progress.setValue(i + 1); + if (progress.wasCanceled()) + { + numberOfPoints = -1; + break; + } + // population inversion Npopinv = (Npopinv + tau * pumping) / (1.0 + tau * (1.0 / tau_f + sigma_by_roundTrip * Phi * mode + pumping / NpopinvMax)); //Npopinv = NpopinvExpression; @@ -133,10 +149,12 @@ void profilingHelper(int n, int iteration, bool Cuda) else Phi = (Phi + tau * S) / (1.0 - tau * lambda); DATEI << i * tau << " " << factorOutpoutPower * Phi << std::endl; - } - DATEI.close(); + } std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); - std::cout << "(" << numberOfPoints << "," << (std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count()) / 1000000.0 << ")" << std::endl; + DATEI.close(); + double time = (std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count()) / 1000000.0; + double gflops = 15.0 * (double)numberOfPoints * (double)iteration / (1000000000.0 * time); + std::cout << "(" << numberOfPoints << "," << gflops << ")" << std::endl; delete &Npopinv; delete &NpopinvMax; @@ -155,64 +173,25 @@ int main(int argc, char** argv) std::cout.precision(10); std::cout.setf(std::ios::fixed, std::ios::floatfield); - std::cout << "Cuda(10000 Iterations)\nGridpoints;Time;TimePerPoint" << std::endl; - - /* - profilingHelper(6, 10000, true); - profilingHelper(8, 10000, true); - profilingHelper(10, 10000, true); - profilingHelper(12, 10000, true); - profilingHelper(14, 10000, true); - profilingHelper(16, 10000, true); - profilingHelper(18, 10000, true); - profilingHelper(20, 10000, true); - profilingHelper(22, 10000, true); - profilingHelper(24, 10000, true); - profilingHelper(26, 10000, true); - profilingHelper(28, 10000, true); - profilingHelper(30, 10000, true); - profilingHelper(32, 10000, true); - profilingHelper(38, 10000, true); - profilingHelper(50, 10000, true); - profilingHelper(60, 10000, true); - profilingHelper(70, 10000, true); - profilingHelper(80, 10000, true); - profilingHelper(90, 10000, true); - profilingHelper(100, 10000, true); - profilingHelper(110, 10000, true); - profilingHelper(120, 10000, true); - profilingHelper(140, 10000, true); - profilingHelper(160, 10000, true); - */ - profilingHelper(60, 10000, true); - - std::cout << "NonCuda(10000 Iterations)\nGridpoints;Time;TimePerPoint" << std::endl; - /* - profilingHelper(6, 10000, false); - profilingHelper(8, 10000, false); - profilingHelper(10, 10000, false); - profilingHelper(12, 10000, false); - profilingHelper(14, 10000, false); - profilingHelper(16, 10000, false); - profilingHelper(18, 10000, false); - profilingHelper(20, 10000, false); - profilingHelper(22, 10000, false); - profilingHelper(24, 10000, false); - profilingHelper(26, 10000, false); - profilingHelper(28, 10000, false); - profilingHelper(30, 10000, false); - profilingHelper(32, 10000, false); - profilingHelper(38, 10000, false); - profilingHelper(50, 10000, false); - profilingHelper(60, 10000, false); - profilingHelper(70, 10000, false); - profilingHelper(80, 10000, false); - profilingHelper(90, 10000, false); - profilingHelper(100, 10000, false); - profilingHelper(110, 10000, false); - profilingHelper(120, 10000, false); - profilingHelper(140, 10000, false); - profilingHelper(160, 10000, false); - */ + int n, iteration; + std::ifstream PARAMETER; + PARAMETER.open("para.dat", std::ios::in); + PARAMETER >> n >> iteration; + + QScopedPointer<QCoreApplication> app(new QApplication(argc, argv)); + + if (n == 0 || iteration == 0) + { + for (int n = 6; n <= 220; n += n / 3) + { + profilingHelper(n, iteration, true); + } + } + else + { + profilingHelper(n, iteration, true); + } + + exit(EXIT_SUCCESS); } diff --git a/program/source/extemp/cuda_helper.cu b/program/source/extemp/cuda_helper.cu index 4f7170e52ce31853597a0f98a26254c3501042d6..71c463001368fc88b8406f6439fc41402689d10e 100644 --- a/program/source/extemp/cuda_helper.cu +++ b/program/source/extemp/cuda_helper.cu @@ -21,36 +21,57 @@ static inline __device__ double atomicAdd(double* address, double val) { } #endif +/* PER WARP REDUCTION +__device__ double reduce_sum(cg::thread_group warp, double sum) +{ + //Sum all values of the warp + //Threads where warp.thread_rank() == 0 will return the correct summed result + sum += __shfl_down_sync(0xFFFFFFFF,sum, 1); + sum += __shfl_down_sync(0xFFFFFFFF, sum, 2); + sum += __shfl_down_sync(0xFFFFFFFF, sum, 4); + sum += __shfl_down_sync(0xFFFFFFFF, sum, 8); + sum += __shfl_down_sync(0xFFFFFFFF, sum, 16); + return sum; +} -__device__ double reduce_sum(cg::thread_group g, double *temp, double val) +__global__ void EfficientDotproduct(double* Result, double* InputA, double* InputB, int n) { - int lane = g.thread_rank(); - //Sum all values of the thread group - //Thread 0 will return the correct summed result - for (int i = g.size() / 2; i > 0; i /= 2) + double myValue = 0.0; + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < n; index += (gridDim.x * blockDim.x)) { - temp[lane] = val; - g.sync(); - if (lane < i) val += temp[lane + i]; - g.sync(); + myValue += InputA[index] * InputB[index]; } - return val; -} + auto warp = cg::tiled_partition<32>(cg::this_thread_block()); + double block_sum = reduce_sum(warp, myValue); + + //Only thread 0 of the thread group returned the correct sum + if (warp.thread_rank() == 0) atomicAdd(Result, block_sum); +} +*/ __global__ void EfficientDotproduct(double* Result, double* InputA, double* InputB, int n) { + auto block = cg::this_thread_block(); + auto warp = cg::tiled_partition<32>(block); + auto warpLane = warp.thread_rank(); + double myValue = 0.0; for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < n; index += (gridDim.x * blockDim.x)) { myValue += InputA[index] * InputB[index]; } - extern __shared__ double temp[]; - auto g = cg::this_thread_block(); - double block_sum = reduce_sum(g, temp, myValue); - //Only thread 0 of the thread group returned the correct sum - if (g.thread_rank() == 0) atomicAdd(Result, block_sum); + myValue += __shfl_down_sync(0xFFFFFFFF, myValue, 1); + myValue += __shfl_down_sync(0xFFFFFFFF, myValue, 2); + myValue += __shfl_down_sync(0xFFFFFFFF, myValue, 4); + myValue += __shfl_down_sync(0xFFFFFFFF, myValue, 8); + myValue += __shfl_down_sync(0xFFFFFFFF, myValue, 16); + if (warpLane == 0) + { + atomicAdd(Result, myValue); + } } + double* tempBufferDevice = nullptr; double* tempBufferHost = nullptr; double CudaDotproduct(double* InputA, double* InputB, int InputSize) @@ -65,7 +86,9 @@ double CudaDotproduct(double* InputA, double* InputB, int InputSize) int blocks = InputSize / threads; //Third parameter is the size of the shared memory - EfficientDotproduct<<<blocks, threads, threads * sizeof(double)>>>(tempBufferDevice, InputA, InputB, InputSize); + EfficientDotproduct << <blocks, threads >> > (tempBufferDevice, InputA, InputB, InputSize); + cudaDeviceSynchronize(); + cudaMemcpy(tempBufferHost, tempBufferDevice, sizeof(double), cudaMemcpyDeviceToHost); return *tempBufferHost; } \ No newline at end of file diff --git a/program/source/extemp/cuda_helper.h b/program/source/extemp/cuda_helper.h index 78d82c64995b10fea668e260baaedb42f93b115b..bde22f8e5bc57b6404a4b8f3a27acdcb983944a9 100644 --- a/program/source/extemp/cuda_helper.h +++ b/program/source/extemp/cuda_helper.h @@ -62,15 +62,6 @@ public: } int GetNumberOfBlocks() { return 128; } int GetCudaDeviceID(){ return 0; } - - template <class T> - std::pair<int, int> CalculateOptimalNumberBlocksThreads(T func, int usedSharedMemory) - { - int blocks = 0; - int threads = 0; - cudaOccupancyMaxPotentialBlockSize(&blocks, &threads, func, usedSharedMemory, 0); - return std::make_pair(blocks, threads); - } };