Skip to content
Snippets Groups Projects
Commit b309913e authored by Peter Herbst's avatar Peter Herbst
Browse files

implementation of QProgressDialog and per warp reduction for scalarproduct

parent a9ad75fc
No related merge requests found
......@@ -6,6 +6,11 @@
#include "source/ugblock.h"
#include <math.h>
#include <chrono>
#include <QMessageBox>
#include <QApplication>
#include <QScopedPointer>
#include <QProgressDialog>
using namespace ::_COLSAMM_;
......@@ -26,6 +31,10 @@ double modeProfile(double rr) {
void profilingHelper(int n, int iteration, bool Cuda)
{
QProgressDialog progress("Executing Iteration.", "Abort Calculation", 0, iteration);
progress.setWindowModality(Qt::WindowModal);
progress.setValue(0);
std::ofstream DATEI;
//physical constants
......@@ -117,6 +126,13 @@ void profilingHelper(int n, int iteration, bool Cuda)
//Measure execution time
std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();
for (int i = 0; i < iteration; ++i) {
progress.setValue(i + 1);
if (progress.wasCanceled())
{
numberOfPoints = -1;
break;
}
// population inversion
Npopinv = (Npopinv + tau * pumping) / (1.0 + tau * (1.0 / tau_f + sigma_by_roundTrip * Phi * mode + pumping / NpopinvMax));
//Npopinv = NpopinvExpression;
......@@ -133,10 +149,12 @@ void profilingHelper(int n, int iteration, bool Cuda)
else Phi = (Phi + tau * S) / (1.0 - tau * lambda);
DATEI << i * tau << " " << factorOutpoutPower * Phi << std::endl;
}
DATEI.close();
}
std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
std::cout << "(" << numberOfPoints << "," << (std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count()) / 1000000.0 << ")" << std::endl;
DATEI.close();
double time = (std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count()) / 1000000.0;
double gflops = 15.0 * (double)numberOfPoints * (double)iteration / (1000000000.0 * time);
std::cout << "(" << numberOfPoints << "," << gflops << ")" << std::endl;
delete &Npopinv;
delete &NpopinvMax;
......@@ -155,64 +173,25 @@ int main(int argc, char** argv)
std::cout.precision(10);
std::cout.setf(std::ios::fixed, std::ios::floatfield);
std::cout << "Cuda(10000 Iterations)\nGridpoints;Time;TimePerPoint" << std::endl;
/*
profilingHelper(6, 10000, true);
profilingHelper(8, 10000, true);
profilingHelper(10, 10000, true);
profilingHelper(12, 10000, true);
profilingHelper(14, 10000, true);
profilingHelper(16, 10000, true);
profilingHelper(18, 10000, true);
profilingHelper(20, 10000, true);
profilingHelper(22, 10000, true);
profilingHelper(24, 10000, true);
profilingHelper(26, 10000, true);
profilingHelper(28, 10000, true);
profilingHelper(30, 10000, true);
profilingHelper(32, 10000, true);
profilingHelper(38, 10000, true);
profilingHelper(50, 10000, true);
profilingHelper(60, 10000, true);
profilingHelper(70, 10000, true);
profilingHelper(80, 10000, true);
profilingHelper(90, 10000, true);
profilingHelper(100, 10000, true);
profilingHelper(110, 10000, true);
profilingHelper(120, 10000, true);
profilingHelper(140, 10000, true);
profilingHelper(160, 10000, true);
*/
profilingHelper(60, 10000, true);
std::cout << "NonCuda(10000 Iterations)\nGridpoints;Time;TimePerPoint" << std::endl;
/*
profilingHelper(6, 10000, false);
profilingHelper(8, 10000, false);
profilingHelper(10, 10000, false);
profilingHelper(12, 10000, false);
profilingHelper(14, 10000, false);
profilingHelper(16, 10000, false);
profilingHelper(18, 10000, false);
profilingHelper(20, 10000, false);
profilingHelper(22, 10000, false);
profilingHelper(24, 10000, false);
profilingHelper(26, 10000, false);
profilingHelper(28, 10000, false);
profilingHelper(30, 10000, false);
profilingHelper(32, 10000, false);
profilingHelper(38, 10000, false);
profilingHelper(50, 10000, false);
profilingHelper(60, 10000, false);
profilingHelper(70, 10000, false);
profilingHelper(80, 10000, false);
profilingHelper(90, 10000, false);
profilingHelper(100, 10000, false);
profilingHelper(110, 10000, false);
profilingHelper(120, 10000, false);
profilingHelper(140, 10000, false);
profilingHelper(160, 10000, false);
*/
int n, iteration;
std::ifstream PARAMETER;
PARAMETER.open("para.dat", std::ios::in);
PARAMETER >> n >> iteration;
QScopedPointer<QCoreApplication> app(new QApplication(argc, argv));
if (n == 0 || iteration == 0)
{
for (int n = 6; n <= 220; n += n / 3)
{
profilingHelper(n, iteration, true);
}
}
else
{
profilingHelper(n, iteration, true);
}
exit(EXIT_SUCCESS);
}
......@@ -21,36 +21,57 @@ static inline __device__ double atomicAdd(double* address, double val) {
}
#endif
/* PER WARP REDUCTION
__device__ double reduce_sum(cg::thread_group warp, double sum)
{
//Sum all values of the warp
//Threads where warp.thread_rank() == 0 will return the correct summed result
sum += __shfl_down_sync(0xFFFFFFFF,sum, 1);
sum += __shfl_down_sync(0xFFFFFFFF, sum, 2);
sum += __shfl_down_sync(0xFFFFFFFF, sum, 4);
sum += __shfl_down_sync(0xFFFFFFFF, sum, 8);
sum += __shfl_down_sync(0xFFFFFFFF, sum, 16);
return sum;
}
__device__ double reduce_sum(cg::thread_group g, double *temp, double val)
__global__ void EfficientDotproduct(double* Result, double* InputA, double* InputB, int n)
{
int lane = g.thread_rank();
//Sum all values of the thread group
//Thread 0 will return the correct summed result
for (int i = g.size() / 2; i > 0; i /= 2)
double myValue = 0.0;
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < n; index += (gridDim.x * blockDim.x))
{
temp[lane] = val;
g.sync();
if (lane < i) val += temp[lane + i];
g.sync();
myValue += InputA[index] * InputB[index];
}
return val;
}
auto warp = cg::tiled_partition<32>(cg::this_thread_block());
double block_sum = reduce_sum(warp, myValue);
//Only thread 0 of the thread group returned the correct sum
if (warp.thread_rank() == 0) atomicAdd(Result, block_sum);
}
*/
// Dot product kernel: accumulates sum(InputA[i] * InputB[i]) for i in [0, n)
// into *Result.
//
// Result  - device pointer to a single double. The kernel only ever adds to it
//           (atomicAdd), so the caller must initialise it before the launch
//           (presumably to 0.0 - confirm against the host wrapper).
// InputA  - device pointer to the first vector, at least n elements.
// InputB  - device pointer to the second vector, at least n elements.
// n       - number of elements to process.
//
// Launch requirements: 1D grid / 1D block. No dynamic shared memory is needed.
// The shuffles below use the full mask 0xFFFFFFFF, i.e. they name all 32 lanes
// of a warp, so blockDim.x is assumed to be a multiple of 32 - TODO confirm
// at the launch site.
// atomicAdd on double needs native support or the fallback overload defined
// earlier in this file.
__global__ void EfficientDotproduct(double* Result, double* InputA, double* InputB, int n)
{
    auto block = cg::this_thread_block();
    auto warp = cg::tiled_partition<32>(block);
    auto warpLane = warp.thread_rank();

    // Grid-stride loop: every thread builds a private partial sum, so the
    // kernel is correct for any grid size relative to n.
    double myValue = 0.0;
    for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < n; index += (gridDim.x * blockDim.x))
    {
        myValue += InputA[index] * InputB[index];
    }

    // Per-warp reduction via register shuffles (offsets 1, 2, 4, 8, 16).
    // After the last step lane 0 holds the sum of all 32 lanes; the values in
    // the other lanes are not meaningful and are discarded.
    for (int offset = 1; offset < 32; offset *= 2)
    {
        myValue += __shfl_down_sync(0xFFFFFFFF, myValue, offset);
    }

    // Exactly one atomic per warp publishes the warp's partial sum.
    if (warpLane == 0)
    {
        atomicAdd(Result, myValue);
    }
}
double* tempBufferDevice = nullptr;
double* tempBufferHost = nullptr;
double CudaDotproduct(double* InputA, double* InputB, int InputSize)
......@@ -65,7 +86,9 @@ double CudaDotproduct(double* InputA, double* InputB, int InputSize)
int blocks = InputSize / threads;
//Third parameter is the size of the shared memory
EfficientDotproduct<<<blocks, threads, threads * sizeof(double)>>>(tempBufferDevice, InputA, InputB, InputSize);
EfficientDotproduct << <blocks, threads >> > (tempBufferDevice, InputA, InputB, InputSize);
cudaDeviceSynchronize();
cudaMemcpy(tempBufferHost, tempBufferDevice, sizeof(double), cudaMemcpyDeviceToHost);
return *tempBufferHost;
}
\ No newline at end of file
......@@ -62,15 +62,6 @@ public:
}
// Returns the number of blocks; currently a fixed value of 128.
int GetNumberOfBlocks()
{
    const int numberOfBlocks = 128;
    return numberOfBlocks;
}
// Returns the CUDA device ID; currently always device 0.
int GetCudaDeviceID()
{
    const int deviceId = 0;
    return deviceId;
}
// Asks the CUDA occupancy API for a launch configuration for the given kernel.
//
// func             - the kernel (function pointer) to query.
// usedSharedMemory - dynamic shared memory per block, in bytes, that the
//                    kernel will be launched with.
//
// Returns a pair (first = grid size, second = block size) as written by
// cudaOccupancyMaxPotentialBlockSize; the trailing 0 means no upper limit is
// imposed on the block size.
// NOTE(review): the API's return status is not checked. If the call fails the
// pair presumably keeps its initial value (0, 0) - callers should verify.
template <class T>
std::pair<int, int> CalculateOptimalNumberBlocksThreads(T func, int usedSharedMemory)
{
    std::pair<int, int> launchConfig(0, 0);
    cudaOccupancyMaxPotentialBlockSize(&launchConfig.first, &launchConfig.second, func, usedSharedMemory, 0);
    return launchConfig;
}
};
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment