Implementation of QProgressDialog and per-warp reduction for scalar product

b309913e · Peter Herbst · a9ad75fc · b309913e · b309913e · b309913e
Commit b309913e authored 4 years ago by Peter Herbst
--- a/program/main.cu
+++ b/program/main.cu
@@ -6,6 +6,11 @@
 #include "source/ugblock.h"
 #include <math.h>
 #include <chrono>
+#include <QMessageBox>
+#include <QApplication>
+#include <QScopedPointer>
+#include <QProgressDialog>
+

 using namespace ::_COLSAMM_;

@@ -26,6 +31,10 @@ double modeProfile(double rr) {

 void profilingHelper(int n, int iteration, bool Cuda)
 {
+	QProgressDialog progress("Executing Iteration.", "Abort Calculation", 0, iteration);
+	progress.setWindowModality(Qt::WindowModal);
+	progress.setValue(0);
+
 	std::ofstream DATEI;

 	//physical constsnts
@@ -117,6 +126,13 @@ void profilingHelper(int n, int iteration, bool Cuda)
 	//Measure execution time
 	std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();
 	for (int i = 0; i < iteration; ++i) {
+		progress.setValue(i + 1);
+		if (progress.wasCanceled())
+		{
+			numberOfPoints = -1;
+			break;
+		}
+
 		// population inversion
 		Npopinv = (Npopinv + tau * pumping) / (1.0 + tau * (1.0 / tau_f + sigma_by_roundTrip * Phi * mode + pumping / NpopinvMax));
 		//Npopinv = NpopinvExpression;
@@ -133,10 +149,12 @@ void profilingHelper(int n, int iteration, bool Cuda)
 		else             Phi = (Phi + tau * S) / (1.0 - tau * lambda);

 		DATEI << i * tau << "   " << factorOutpoutPower * Phi << std::endl;
-	}
-	DATEI.close();
+	}	
 	std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
-	std::cout << "(" << numberOfPoints << "," << (std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count()) / 1000000.0 << ")" << std::endl;
+	DATEI.close();
+	double time = (std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count()) / 1000000.0;
+	double gflops = 15.0 * (double)numberOfPoints * (double)iteration / (1000000000.0 * time);
+	std::cout << "(" << numberOfPoints << "," << gflops << ")" << std::endl;

 	delete &Npopinv;
 	delete &NpopinvMax;
@@ -155,64 +173,25 @@ int main(int argc, char** argv)
 	std::cout.precision(10);
 	std::cout.setf(std::ios::fixed, std::ios::floatfield);

-	std::cout << "Cuda(10000 Iterations)\nGridpoints;Time;TimePerPoint" << std::endl;
-	
-	/*
-	profilingHelper(6, 10000, true);
-	profilingHelper(8, 10000, true);
-	profilingHelper(10, 10000, true);
-	profilingHelper(12, 10000, true);
-	profilingHelper(14, 10000, true);
-	profilingHelper(16, 10000, true);
-	profilingHelper(18, 10000, true);
-	profilingHelper(20, 10000, true);
-	profilingHelper(22, 10000, true);
-	profilingHelper(24, 10000, true);
-	profilingHelper(26, 10000, true);
-	profilingHelper(28, 10000, true);
-	profilingHelper(30, 10000, true);
-	profilingHelper(32, 10000, true);
-	profilingHelper(38, 10000, true);
-	profilingHelper(50, 10000, true);
-	profilingHelper(60, 10000, true);
-	profilingHelper(70, 10000, true);
-	profilingHelper(80, 10000, true);
-	profilingHelper(90, 10000, true);
-	profilingHelper(100, 10000, true);
-	profilingHelper(110, 10000, true);
-	profilingHelper(120, 10000, true);
-	profilingHelper(140, 10000, true);
-	profilingHelper(160, 10000, true);
-	*/
-	profilingHelper(60, 10000, true);
-
-	std::cout << "NonCuda(10000 Iterations)\nGridpoints;Time;TimePerPoint" << std::endl;
-	/*
-	profilingHelper(6, 10000, false);
-	profilingHelper(8, 10000, false);
-	profilingHelper(10, 10000, false);
-	profilingHelper(12, 10000, false);
-	profilingHelper(14, 10000, false);
-	profilingHelper(16, 10000, false);
-	profilingHelper(18, 10000, false);
-	profilingHelper(20, 10000, false);
-	profilingHelper(22, 10000, false);
-	profilingHelper(24, 10000, false);
-	profilingHelper(26, 10000, false);
-	profilingHelper(28, 10000, false);
-	profilingHelper(30, 10000, false);
-	profilingHelper(32, 10000, false);
-	profilingHelper(38, 10000, false);
-	profilingHelper(50, 10000, false);
-	profilingHelper(60, 10000, false);
-	profilingHelper(70, 10000, false);
-	profilingHelper(80, 10000, false);
-	profilingHelper(90, 10000, false);
-	profilingHelper(100, 10000, false);
-	profilingHelper(110, 10000, false);
-	profilingHelper(120, 10000, false);
-	profilingHelper(140, 10000, false);
-	profilingHelper(160, 10000, false);
-	*/
+	int n, iteration;
+	std::ifstream PARAMETER;
+	PARAMETER.open("para.dat", std::ios::in);
+	PARAMETER >> n >> iteration;
+
+	QScopedPointer<QCoreApplication> app(new QApplication(argc, argv));
+
+	if (n == 0 || iteration == 0)
+	{
+		for (int n = 6; n <= 220; n += n / 3)
+		{
+			profilingHelper(n, iteration, true);
+		}
+	}
+	else
+	{
+		profilingHelper(n, iteration, true);
+	}
+
+	exit(EXIT_SUCCESS);
 }

--- a/program/source/extemp/cuda_helper.cu
+++ b/program/source/extemp/cuda_helper.cu
@@ -21,36 +21,57 @@ static inline __device__ double atomicAdd(double* address, double val) {
 }
 #endif

+/*  PER WARP REDUCTION
+__device__ double reduce_sum(cg::thread_group warp, double sum)
+{
+	//Sum all values of the warp
+	//Threads where warp.thread_rank() == 0 will return the correct summed result
+	sum += __shfl_down_sync(0xFFFFFFFF,sum, 1);
+	sum += __shfl_down_sync(0xFFFFFFFF, sum, 2);
+	sum += __shfl_down_sync(0xFFFFFFFF, sum, 4);
+	sum += __shfl_down_sync(0xFFFFFFFF, sum, 8);
+	sum += __shfl_down_sync(0xFFFFFFFF, sum, 16);
+	return sum;
+}

-__device__ double reduce_sum(cg::thread_group g, double *temp, double val)
+__global__ void EfficientDotproduct(double* Result, double* InputA, double* InputB, int n)
 {
-	int lane = g.thread_rank();
-	//Sum all values of the thread group
-	//Thread 0 will return the correct summed result
-	for (int i = g.size() / 2; i > 0; i /= 2)
+	double myValue = 0.0;
+	for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < n; index += (gridDim.x * blockDim.x))
 	{
-		temp[lane] = val;
-		g.sync();
-		if (lane < i) val += temp[lane + i];
-		g.sync();
+		myValue += InputA[index] * InputB[index];
 	}
-	return val;
-}

+	auto warp = cg::tiled_partition<32>(cg::this_thread_block());
+	double block_sum = reduce_sum(warp, myValue);
+
+	//Only thread 0 of the thread group returned the correct sum
+	if (warp.thread_rank() == 0) atomicAdd(Result, block_sum);
+}
+*/
 __global__ void EfficientDotproduct(double* Result, double* InputA, double* InputB, int n)
 {
+	auto block = cg::this_thread_block();
+	auto warp = cg::tiled_partition<32>(block); // split the block into tiles of 32 threads
+	auto warpLane = warp.thread_rank(); // this thread's rank inside its 32-thread tile
+	// Each thread sums InputA[i] * InputB[i] over its indices; the partial sums are then combined per 32-thread tile and added to *Result.
 	double myValue = 0.0;
 	for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < n; index += (gridDim.x * blockDim.x))
 	{
 		myValue += InputA[index] * InputB[index];
 	}
-	extern __shared__ double temp[];
-	auto g = cg::this_thread_block();
-	double block_sum = reduce_sum(g, temp, myValue);

-	//Only thread 0 of the thread group returned the correct sum
-	if (g.thread_rank() == 0) atomicAdd(Result, block_sum);
+	myValue += __shfl_down_sync(0xFFFFFFFF, myValue, 1); // mask names all 32 lanes as participants; NOTE(review): assumes blockDim.x is a multiple of 32 - launch configuration is not fully visible here, confirm
+	myValue += __shfl_down_sync(0xFFFFFFFF, myValue, 2); // each step adds the value currently held by the lane 'offset' positions higher
+	myValue += __shfl_down_sync(0xFFFFFFFF, myValue, 4);
+	myValue += __shfl_down_sync(0xFFFFFFFF, myValue, 8);
+	myValue += __shfl_down_sync(0xFFFFFFFF, myValue, 16); // after offsets 1,2,4,8,16 the thread with rank 0 holds the sum over all 32 lanes
+	if (warpLane == 0)
+	{
+		atomicAdd(Result, myValue); // one atomic per 32-thread tile; accumulates into *Result - presumably the caller zeroes it before the launch, verify in CudaDotproduct
+	}
 }
+
 double* tempBufferDevice = nullptr;
 double* tempBufferHost = nullptr;
 double CudaDotproduct(double* InputA, double* InputB, int InputSize)
@@ -65,7 +86,9 @@ double CudaDotproduct(double* InputA, double* InputB, int InputSize)
 	int blocks = InputSize / threads;

 	//Third parameter is the size of the shared memory
-	EfficientDotproduct<<<blocks, threads, threads * sizeof(double)>>>(tempBufferDevice, InputA, InputB, InputSize);
+	EfficientDotproduct << <blocks, threads >> > (tempBufferDevice, InputA, InputB, InputSize);
+	cudaDeviceSynchronize();
+
 	cudaMemcpy(tempBufferHost, tempBufferDevice, sizeof(double), cudaMemcpyDeviceToHost);
 	return *tempBufferHost;
 }
\ No newline at end of file
--- a/program/source/extemp/cuda_helper.h
+++ b/program/source/extemp/cuda_helper.h
@@ -62,15 +62,6 @@ public:
 	}
 	int GetNumberOfBlocks() { return 128; }
 	int GetCudaDeviceID(){ return 0; }
-
-	template <class T>
-	std::pair<int, int> CalculateOptimalNumberBlocksThreads(T func, int usedSharedMemory)
-	{
-		int blocks = 0;
-		int threads = 0;
-		cudaOccupancyMaxPotentialBlockSize(&blocks, &threads, func, usedSharedMemory, 0);
-		return std::make_pair(blocks, threads);
-	}
 };