From 8ceebeadc4a4014a082ae922317c22a173bbab45 Mon Sep 17 00:00:00 2001 From: Sebastian Kuckuk <sebastian.kuckuk@fau.de> Date: Mon, 20 Feb 2023 11:37:23 +0100 Subject: [PATCH] add source code for reduction benchmarks --- src/calc-res-gsl.h | 33 ++++ src/calc-res.h | 29 +++ src/fused-atomic-gsl.h | 37 ++++ src/fused-atomic.h | 31 +++ src/fused-cub-atomic.h | 47 +++++ src/fused-cub-gsl-atomic.h | 52 +++++ src/fused-shared-atomic.h | 58 ++++++ src/fused-shared-gsl-atomic.h | 62 ++++++ src/fused-shfl-atomic.h | 35 ++++ src/fused-shfl-gsl-atomic.h | 39 ++++ src/fused-shfl2-atomic.h | 51 +++++ src/fused-shfl2-gsl-atomic.h | 55 ++++++ src/main.cpp | 353 ++++++++++++++++++++++++++++++++++ src/red-atomic-gsl.h | 27 +++ src/red-atomic.h | 29 +++ src/red-base.h | 27 +++ src/red-cub-atomic.h | 40 ++++ src/red-cub-gsl-atomic.h | 41 ++++ src/red-cub-gsl-cascade.h | 38 ++++ src/red-shared-gsl-atomic.h | 47 +++++ src/red-shared-gsl.h | 52 +++++ src/red-shfl-gsl-atomic.h | 26 +++ src/red-shfl2-gsl-atomic.h | 45 +++++ src/red-shfl2-gsl.h | 47 +++++ src/red-step-2.h | 24 +++ src/red-step-n-gsl.h | 32 +++ src/red-step-n.h | 33 ++++ src/red-thrust.h | 15 ++ src/util.h | 79 ++++++++ 29 files changed, 1484 insertions(+) create mode 100644 src/calc-res-gsl.h create mode 100644 src/calc-res.h create mode 100644 src/fused-atomic-gsl.h create mode 100644 src/fused-atomic.h create mode 100644 src/fused-cub-atomic.h create mode 100644 src/fused-cub-gsl-atomic.h create mode 100644 src/fused-shared-atomic.h create mode 100644 src/fused-shared-gsl-atomic.h create mode 100644 src/fused-shfl-atomic.h create mode 100644 src/fused-shfl-gsl-atomic.h create mode 100644 src/fused-shfl2-atomic.h create mode 100644 src/fused-shfl2-gsl-atomic.h create mode 100644 src/main.cpp create mode 100644 src/red-atomic-gsl.h create mode 100644 src/red-atomic.h create mode 100644 src/red-base.h create mode 100644 src/red-cub-atomic.h create mode 100644 src/red-cub-gsl-atomic.h create mode 100644 src/red-cub-gsl-cascade.h create mode 100644 src/red-shared-gsl-atomic.h create mode 100644 src/red-shared-gsl.h create mode 100644 src/red-shfl-gsl-atomic.h create mode 100644 src/red-shfl2-gsl-atomic.h create mode 100644 src/red-shfl2-gsl.h create mode 100644 src/red-step-2.h create mode 100644 src/red-step-n-gsl.h create mode 100644 src/red-step-n.h create mode 100644 src/red-thrust.h create mode 100644 src/util.h diff --git a/src/calc-res-gsl.h b/src/calc-res-gsl.h new file mode 100644 index 0000000..3008105 --- /dev/null +++ b/src/calc-res-gsl.h @@ -0,0 +1,33 @@ +#pragma once + +#include "util.h" + +__global__ void calc_res_gsl_kernel(DATA_TYPE *data, DATA_TYPE *norm, dim3 numElements) { + size_t i0Start = blockIdx.x * blockDim.x + threadIdx.x; + size_t i1Start = blockIdx.y * blockDim.y + threadIdx.y; + + for (size_t i1 = i1Start; i1 < numElements.y; i1 += gridDim.y * blockDim.y) { + for (size_t i0 = i0Start; i0 < numElements.x; i0 += gridDim.x * blockDim.x) { + DATA_TYPE val = -4 * data[i1 * numElements.x + i0]; + + if (i0 > 0) + val += data[i1 * numElements.x + i0 - 1]; + if (i0 < numElements.x - 1) + val += data[i1 * numElements.x + i0 + 1]; + if (i1 > 0) + val += data[(i1 - 1) * numElements.x + i0]; + if (i1 < numElements.x - 1) + val += data[(i1 + 1) * numElements.x + i0]; + + norm[i1 * numElements.x + i0] = val; + } + } +} + +template<unsigned int blockSize_x, unsigned int blockSize_y> +void calc_res_gsl(DATA_TYPE *data, DATA_TYPE *norm, const dim3 &numElements, dim3 blocks) { + blocks = dim3(std::min((size_t) blocks.x, sdiv(numElements.x, 
blockSize_x)), + std::min((size_t) blocks.y, sdiv(numElements.y, blockSize_y))); + + calc_res_gsl_kernel<<<blocks, dim3(blockSize_x, blockSize_y)>>>(data, norm, numElements); +} diff --git a/src/calc-res.h b/src/calc-res.h new file mode 100644 index 0000000..c747e08 --- /dev/null +++ b/src/calc-res.h @@ -0,0 +1,29 @@ +#pragma once + +#include "util.h" + +__global__ void calc_res_kernel(DATA_TYPE *data, DATA_TYPE *norm, dim3 numElements) { + size_t i0 = blockIdx.x * blockDim.x + threadIdx.x; + size_t i1 = blockIdx.y * blockDim.y + threadIdx.y; + + if (i0 < numElements.x && i1 < numElements.y) { + DATA_TYPE val = -4 * data[i1 * numElements.x + i0]; + + if (i0 > 0) + val += data[i1 * numElements.x + i0 - 1]; + if (i0 < numElements.x - 1) + val += data[i1 * numElements.x + i0 + 1]; + if (i1 > 0) + val += data[(i1 - 1) * numElements.x + i0]; + if (i1 < numElements.x - 1) + val += data[(i1 + 1) * numElements.x + i0]; + + norm[i1 * numElements.x + i0] = val; + } +} + +template<unsigned int blockSize_x, unsigned int blockSize_y> +void calc_res(DATA_TYPE *data, DATA_TYPE *norm, const dim3 &numElements) { + dim3 blocks = sdiv(numElements, dim3(blockSize_x, blockSize_y)); + calc_res_kernel<<<blocks, dim3(blockSize_x, blockSize_y)>>>(data, norm, numElements); +} diff --git a/src/fused-atomic-gsl.h b/src/fused-atomic-gsl.h new file mode 100644 index 0000000..348bee5 --- /dev/null +++ b/src/fused-atomic-gsl.h @@ -0,0 +1,37 @@ +#pragma once + +#include "util.h" + +__global__ void fused_atomic_gsl_kernel(DATA_TYPE *data, DATA_TYPE *norm, dim3 numElements) { + size_t i0Start = blockIdx.x * blockDim.x + threadIdx.x; + size_t i1Start = blockIdx.y * blockDim.y + threadIdx.y; + DATA_TYPE acc = 0; + + for (size_t i1 = i1Start; i1 < numElements.y; i1 += gridDim.y * blockDim.y) { + for (size_t i0 = i0Start; i0 < numElements.x; i0 += gridDim.x * blockDim.x) { + acc -= 4 * data[i1 * numElements.x + i0]; + + if (i0 > 0) + acc += data[i1 * numElements.x + i0 - 1]; + if (i0 < numElements.x - 1) + acc += data[i1 * numElements.x + i0 + 1]; + if (i1 > 0) + acc += data[(i1 - 1) * numElements.x + i0]; + if (i1 < numElements.y - 1) + acc += data[(i1 + 1) * numElements.x + i0]; + } + } + + if (i0Start < numElements.x && i1Start < numElements.y) + atomicAdd(norm, acc); +} + +template<unsigned int blockSize_x, unsigned int blockSize_y> +void fused_atomic_gsl(DATA_TYPE *data, DATA_TYPE *norm, const dim3 &numElements, dim3 blocks) { + cudaMemset(norm, 0, sizeof(DATA_TYPE)); + + blocks = dim3(std::min((size_t) blocks.x, sdiv(numElements.x, blockSize_x)), + std::min((size_t) blocks.y, sdiv(numElements.y, blockSize_y))); + + fused_atomic_gsl_kernel<<<blocks, dim3(blockSize_x, blockSize_y)>>>(data, norm, numElements); +} diff --git a/src/fused-atomic.h b/src/fused-atomic.h new file mode 100644 index 0000000..6593033 --- /dev/null +++ b/src/fused-atomic.h @@ -0,0 +1,31 @@ +#pragma once + +#include "util.h" + +__global__ void fused_atomic_kernel(DATA_TYPE *data, DATA_TYPE *norm, dim3 numElements) { + size_t i0 = blockIdx.x * blockDim.x + threadIdx.x; + size_t i1 = blockIdx.y * blockDim.y + threadIdx.y; + + if (i0 < numElements.x && i1 < numElements.y) { + DATA_TYPE val = -4 * data[i1 * numElements.x + i0]; + + if (i0 > 0) + val += data[i1 * numElements.x + i0 - 1]; + if (i0 < numElements.x - 1) + val += data[i1 * numElements.x + i0 + 1]; + if (i1 > 0) + val += data[(i1 - 1) * numElements.x + i0]; + if (i1 < numElements.y - 1) + val += data[(i1 + 1) * numElements.x + i0]; + + atomicAdd(norm, val); + } +} + +template<unsigned int 
blockSize_x, unsigned int blockSize_y> +void fused_atomic(DATA_TYPE *data, DATA_TYPE *norm, const dim3 &numElements) { + cudaMemset(norm, 0, sizeof(DATA_TYPE)); + + dim3 blocks = sdiv(numElements, dim3(blockSize_x, blockSize_y)); + fused_atomic_kernel<<<blocks, dim3(blockSize_x, blockSize_y)>>>(data, norm, numElements); +} diff --git a/src/fused-cub-atomic.h b/src/fused-cub-atomic.h new file mode 100644 index 0000000..d4dff91 --- /dev/null +++ b/src/fused-cub-atomic.h @@ -0,0 +1,47 @@ +#pragma once + +#include <cub/cub.cuh> + +#include "util.h" + +template<unsigned int blockSize_x, unsigned int blockSize_y> +__global__ void fused_cub_atomic_kernel(DATA_TYPE *data, DATA_TYPE *norm, dim3 numElements) { + size_t i0 = blockIdx.x * blockDim.x + threadIdx.x; + size_t i1 = blockIdx.y * blockDim.y + threadIdx.y; + + // Define BlockReduce type with as many threads per block as we use + typedef cub::BlockReduce <DATA_TYPE, blockSize_x, cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY, blockSize_y> BlockReduce; + + // Allocate shared memory for block reduction + __shared__ + typename BlockReduce::TempStorage temp_storage; + + DATA_TYPE val = 0; + if (i0 < numElements.x && i1 < numElements.y) { + val = -4 * data[i1 * numElements.x + i0]; + + if (i0 > 0) + val += data[i1 * numElements.x + i0 - 1]; + if (i0 < numElements.x - 1) + val += data[i1 * numElements.x + i0 + 1]; + if (i1 > 0) + val += data[(i1 - 1) * numElements.x + i0]; + if (i1 < numElements.y - 1) + val += data[(i1 + 1) * numElements.x + i0]; + } + + // Reduce over block (all threads must participate) + DATA_TYPE block_norm = BlockReduce(temp_storage).Sum(val); + + // Only first thread in the block performs the atomic + if (0 == threadIdx.x && 0 == threadIdx.y && i0 < numElements.x && i1 < numElements.y) + atomicAdd(norm, block_norm); +} + +template<unsigned int blockSize_x, unsigned int blockSize_y> +void fused_cub_atomic(DATA_TYPE *data, DATA_TYPE *norm, const dim3 &numElements) { + cudaMemset(norm, 0, sizeof(DATA_TYPE)); + + dim3 blocks = sdiv(numElements, dim3(blockSize_x, blockSize_y)); + fused_cub_atomic_kernel<blockSize_x, blockSize_y><<<blocks, dim3(blockSize_x, blockSize_y)>>>(data, norm, numElements); +} diff --git a/src/fused-cub-gsl-atomic.h b/src/fused-cub-gsl-atomic.h new file mode 100644 index 0000000..cca74bf --- /dev/null +++ b/src/fused-cub-gsl-atomic.h @@ -0,0 +1,52 @@ +#pragma once + +#include <cub/cub.cuh> + +#include "util.h" + +template<unsigned int blockSize_x, unsigned int blockSize_y> +__global__ void fused_cub_gsl_atomic_kernel(DATA_TYPE *data, DATA_TYPE *norm, dim3 numElements) { + size_t i0Start = blockIdx.x * blockDim.x + threadIdx.x; + size_t i1Start = blockIdx.y * blockDim.y + threadIdx.y; + + // Define BlockReduce type with as many threads per block as we use + typedef cub::BlockReduce <DATA_TYPE, blockSize_x, cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY, blockSize_y> BlockReduce; + + // Allocate shared memory for block reduction + __shared__ + typename BlockReduce::TempStorage temp_storage; + + DATA_TYPE acc = 0; + + for (size_t i1 = i1Start; i1 < numElements.y; i1 += gridDim.y * blockDim.y) { + for (size_t i0 = i0Start; i0 < numElements.x; i0 += gridDim.x * blockDim.x) { + acc -= 4 * data[i1 * numElements.x + i0]; + + if (i0 > 0) + acc += data[i1 * numElements.x + i0 - 1]; + if (i0 < numElements.x - 1) + acc += data[i1 * numElements.x + i0 + 1]; + if (i1 > 0) + acc += data[(i1 - 1) * numElements.x + i0]; + if (i1 < numElements.y - 1) + acc += data[(i1 + 1) * numElements.x + i0]; + } + } + + // Reduce over block (all 
threads must participate) + DATA_TYPE block_norm = BlockReduce(temp_storage).Sum(acc); + + // Only first thread in the block performs the atomic + if (0 == threadIdx.x && 0 == threadIdx.y && i0Start < numElements.x && i1Start < numElements.y) + atomicAdd(norm, block_norm); +} + +template<unsigned int blockSize_x, unsigned int blockSize_y> +void fused_cub_gsl_atomic(DATA_TYPE *data, DATA_TYPE *norm, const dim3 &numElements, dim3 blocks) { + cudaMemset(norm, 0, sizeof(DATA_TYPE)); + + blocks = dim3(std::min((size_t) blocks.x, sdiv(numElements.x, blockSize_x)), + std::min((size_t) blocks.y, sdiv(numElements.y, blockSize_y))); + + fused_cub_gsl_atomic_kernel<blockSize_x, blockSize_y><<<blocks, dim3(blockSize_x, blockSize_y)>>>(data, norm, numElements); +} diff --git a/src/fused-shared-atomic.h b/src/fused-shared-atomic.h new file mode 100644 index 0000000..81e0f7a --- /dev/null +++ b/src/fused-shared-atomic.h @@ -0,0 +1,58 @@ +#pragma once + +#include "util.h" + +template<unsigned int blockSize_x, unsigned int blockSize_y> +__global__ void fused_shared_atomic_kernel(DATA_TYPE *data, DATA_TYPE *norm, dim3 numElements) { + __shared__ + DATA_TYPE sharedData[blockSize_x * blockSize_y]; + + unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x; + size_t i0 = blockIdx.x * blockDim.x + threadIdx.x; + size_t i1 = blockIdx.y * blockDim.y + threadIdx.y; + sharedData[tid] = 0; + + if (i0 < numElements.x && i1 < numElements.y) { + sharedData[tid] -= 4 * data[i1 * numElements.x + i0]; + + if (i0 > 0) + sharedData[tid] += data[i1 * numElements.x + i0 - 1]; + if (i0 < numElements.x - 1) + sharedData[tid] += data[i1 * numElements.x + i0 + 1]; + if (i1 > 0) + sharedData[tid] += data[(i1 - 1) * numElements.x + i0]; + if (i1 < numElements.y - 1) + sharedData[tid] += data[(i1 + 1) * numElements.x + i0]; + } + + __syncthreads(); + +#pragma unroll + for (int stride = 512; stride > 32; stride /= 2) { + if (blockSize_x * blockSize_y >= 2 * stride) { + if (tid < stride) + sharedData[tid] += sharedData[tid + stride]; + __syncthreads(); + } + } + +#pragma unroll + for (int stride = 32; stride > 0; stride /= 2) { + if (blockSize_x * blockSize_y >= 2 * stride) { + if (tid < stride) + sharedData[tid] += sharedData[tid + stride]; + __syncwarp(); + } + } + + if (0 == tid && i0 < numElements.x && i1 < numElements.y) + atomicAdd(norm, sharedData[0]); +} + +template<unsigned int blockSize_x, unsigned int blockSize_y> +void fused_shared_atomic(DATA_TYPE *data, DATA_TYPE *norm, const dim3 &numElements) { + cudaMemset(norm, 0, sizeof(DATA_TYPE)); + + dim3 blocks = sdiv(numElements, dim3(blockSize_x, blockSize_y)); + fused_shared_atomic_kernel<blockSize_x, blockSize_y><<<blocks, dim3(blockSize_x, blockSize_y)>>>(data, norm, numElements); +} diff --git a/src/fused-shared-gsl-atomic.h b/src/fused-shared-gsl-atomic.h new file mode 100644 index 0000000..ec27cb8 --- /dev/null +++ b/src/fused-shared-gsl-atomic.h @@ -0,0 +1,62 @@ +#pragma once + +#include "util.h" + +template<unsigned int blockSize_x, unsigned int blockSize_y> +__global__ void fused_shared_gsl_atomic_kernel(DATA_TYPE *data, DATA_TYPE *norm, dim3 numElements) { + __shared__ + DATA_TYPE sharedData[blockSize_x * blockSize_y]; + + unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x; + size_t i0Start = blockIdx.x * blockDim.x + threadIdx.x; + size_t i1Start = blockIdx.y * blockDim.y + threadIdx.y; + sharedData[tid] = 0; + + for (size_t i1 = i1Start; i1 < numElements.y; i1 += gridDim.y * blockDim.y) { + for (size_t i0 = i0Start; i0 < numElements.x; i0 += gridDim.x * 
blockDim.x) { + sharedData[tid] -= 4 * data[i1 * numElements.x + i0]; + + if (i0 > 0) + sharedData[tid] += data[i1 * numElements.x + i0 - 1]; + if (i0 < numElements.x - 1) + sharedData[tid] += data[i1 * numElements.x + i0 + 1]; + if (i1 > 0) + sharedData[tid] += data[(i1 - 1) * numElements.x + i0]; + if (i1 < numElements.y - 1) + sharedData[tid] += data[(i1 + 1) * numElements.x + i0]; + } + } + + __syncthreads(); + +#pragma unroll + for (int stride = 512; stride > 32; stride /= 2) { + if (blockSize_x * blockSize_y >= 2 * stride) { + if (tid < stride) + sharedData[tid] += sharedData[tid + stride]; + __syncthreads(); + } + } + +#pragma unroll + for (int stride = 32; stride > 0; stride /= 2) { + if (blockSize_x * blockSize_y >= 2 * stride) { + if (tid < stride) + sharedData[tid] += sharedData[tid + stride]; + __syncwarp(); + } + } + + if (0 == tid && i0Start < numElements.x && i1Start < numElements.y) + atomicAdd(norm, sharedData[0]); +} + +template<unsigned int blockSize_x, unsigned int blockSize_y> +void fused_shared_gsl_atomic(DATA_TYPE *data, DATA_TYPE *norm, const dim3 &numElements, dim3 blocks) { + cudaMemset(norm, 0, sizeof(DATA_TYPE)); + + blocks = dim3(std::min((size_t) blocks.x, sdiv(numElements.x, blockSize_x)), + std::min((size_t) blocks.y, sdiv(numElements.y, blockSize_y))); + + fused_shared_gsl_atomic_kernel<blockSize_x, blockSize_y><<<blocks, dim3(blockSize_x, blockSize_y)>>>(data, norm, numElements); +} diff --git a/src/fused-shfl-atomic.h b/src/fused-shfl-atomic.h new file mode 100644 index 0000000..9482b27 --- /dev/null +++ b/src/fused-shfl-atomic.h @@ -0,0 +1,35 @@ +#pragma once + +#include "util.h" + +__global__ void fused_shfl_atomic_kernel(DATA_TYPE *data, DATA_TYPE *norm, dim3 numElements) { + unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x; + size_t i0 = blockIdx.x * blockDim.x + threadIdx.x; + size_t i1 = blockIdx.y * blockDim.y + threadIdx.y; + DATA_TYPE acc = 0; + + if (i0 < numElements.x && i1 < numElements.y) { + acc -= 4 * data[i1 * numElements.x + i0]; + + if (i0 > 0) + acc += data[i1 * numElements.x + i0 - 1]; + if (i0 < numElements.x - 1) + acc += data[i1 * numElements.x + i0 + 1]; + if (i1 > 0) + acc += data[(i1 - 1) * numElements.x + i0]; + if (i1 < numElements.y - 1) + acc += data[(i1 + 1) * numElements.x + i0]; + } + + acc = warp_reduce(acc); + if (0 == tid % warpSize && i0 < numElements.x && i1 < numElements.y) + atomicAdd(norm, acc); +} + +template<unsigned int blockSize_x, unsigned int blockSize_y> +void fused_shfl_atomic(DATA_TYPE *data, DATA_TYPE *norm, const dim3 &numElements) { + cudaMemset(norm, 0, sizeof(DATA_TYPE)); + + dim3 blocks = sdiv(numElements, dim3(blockSize_x, blockSize_y)); + fused_shfl_atomic_kernel<<<blocks, dim3(blockSize_x, blockSize_y)>>>(data, norm, numElements); +} diff --git a/src/fused-shfl-gsl-atomic.h b/src/fused-shfl-gsl-atomic.h new file mode 100644 index 0000000..8eb5a7f --- /dev/null +++ b/src/fused-shfl-gsl-atomic.h @@ -0,0 +1,39 @@ +#pragma once + +#include "util.h" + +__global__ void fused_shfl_gsl_atomic_kernel(DATA_TYPE *data, DATA_TYPE *norm, dim3 numElements) { + unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x; + size_t i0Start = blockIdx.x * blockDim.x + threadIdx.x; + size_t i1Start = blockIdx.y * blockDim.y + threadIdx.y; + DATA_TYPE acc = 0; + + for (size_t i1 = i1Start; i1 < numElements.y; i1 += gridDim.y * blockDim.y) { + for (size_t i0 = i0Start; i0 < numElements.x; i0 += gridDim.x * blockDim.x) { + acc -= 4 * data[i1 * numElements.x + i0]; + + if (i0 > 0) + acc += data[i1 * numElements.x 
+ i0 - 1]; + if (i0 < numElements.x - 1) + acc += data[i1 * numElements.x + i0 + 1]; + if (i1 > 0) + acc += data[(i1 - 1) * numElements.x + i0]; + if (i1 < numElements.y - 1) + acc += data[(i1 + 1) * numElements.x + i0]; + } + } + + acc = warp_reduce(acc); + if (0 == tid % warpSize && i0Start < numElements.x && i1Start < numElements.y) + atomicAdd(norm, acc); +} + +template<unsigned int blockSize_x, unsigned int blockSize_y> +void fused_shfl_gsl_atomic(DATA_TYPE *data, DATA_TYPE *norm, const dim3 &numElements, dim3 blocks) { + cudaMemset(norm, 0, sizeof(DATA_TYPE)); + + blocks = dim3(std::min((size_t) blocks.x, sdiv(numElements.x, blockSize_x)), + std::min((size_t) blocks.y, sdiv(numElements.y, blockSize_y))); + + fused_shfl_gsl_atomic_kernel<<<blocks, dim3(blockSize_x, blockSize_y)>>>(data, norm, numElements); +} diff --git a/src/fused-shfl2-atomic.h b/src/fused-shfl2-atomic.h new file mode 100644 index 0000000..e0ad50b --- /dev/null +++ b/src/fused-shfl2-atomic.h @@ -0,0 +1,51 @@ +#pragma once + +#include "util.h" + +__global__ void fused_shfl2_atomic_kernel(DATA_TYPE *data, DATA_TYPE *norm, dim3 numElements) { + unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x; + size_t i0 = blockIdx.x * blockDim.x + threadIdx.x; + size_t i1 = blockIdx.y * blockDim.y + threadIdx.y; + DATA_TYPE acc = 0; + + if (i0 < numElements.x && i1 < numElements.y) { + acc -= 4 * data[i1 * numElements.x + i0]; + + if (i0 > 0) + acc += data[i1 * numElements.x + i0 - 1]; + if (i0 < numElements.x - 1) + acc += data[i1 * numElements.x + i0 + 1]; + if (i1 > 0) + acc += data[(i1 - 1) * numElements.x + i0]; + if (i1 < numElements.y - 1) + acc += data[(i1 + 1) * numElements.x + i0]; + } + + acc = warp_reduce(acc); + + __shared__ + DATA_TYPE shared[32]; // TODO: adapt to blockSize + unsigned int lane = tid % warpSize; + unsigned int wid = tid / warpSize; + + if (lane == 0) + shared[wid] = acc; + + // wait for all partial reductions + __syncthreads(); + + acc = (tid < blockDim.x * blockDim.y / warpSize) ? 
shared[lane] : 0; + if (wid == 0) + acc = warp_reduce(acc); + + if (0 == tid && i0 < numElements.x && i1 < numElements.y) + atomicAdd(norm, acc); +} + +template<unsigned int blockSize_x, unsigned int blockSize_y> +void fused_shfl2_atomic(DATA_TYPE *data, DATA_TYPE *norm, const dim3 &numElements) { + cudaMemset(norm, 0, sizeof(DATA_TYPE)); + + dim3 blocks = sdiv(numElements, dim3(blockSize_x, blockSize_y)); + fused_shfl2_atomic_kernel<<<blocks, dim3(blockSize_x, blockSize_y)>>>(data, norm, numElements); +} diff --git a/src/fused-shfl2-gsl-atomic.h b/src/fused-shfl2-gsl-atomic.h new file mode 100644 index 0000000..6857600 --- /dev/null +++ b/src/fused-shfl2-gsl-atomic.h @@ -0,0 +1,55 @@ +#pragma once + +#include "util.h" + +__global__ void fused_shfl2_gsl_atomic_kernel(DATA_TYPE *data, DATA_TYPE *norm, dim3 numElements) { + unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x; + size_t i0Start = blockIdx.x * blockDim.x + threadIdx.x; + size_t i1Start = blockIdx.y * blockDim.y + threadIdx.y; + DATA_TYPE acc = 0; + + for (size_t i1 = i1Start; i1 < numElements.y; i1 += gridDim.y * blockDim.y) { + for (size_t i0 = i0Start; i0 < numElements.x; i0 += gridDim.x * blockDim.x) { + acc -= 4 * data[i1 * numElements.x + i0]; + + if (i0 > 0) + acc += data[i1 * numElements.x + i0 - 1]; + if (i0 < numElements.x - 1) + acc += data[i1 * numElements.x + i0 + 1]; + if (i1 > 0) + acc += data[(i1 - 1) * numElements.x + i0]; + if (i1 < numElements.y - 1) + acc += data[(i1 + 1) * numElements.x + i0]; + } + } + + acc = warp_reduce(acc); + + __shared__ + DATA_TYPE shared[32]; // TODO: adapt to blockSize + unsigned int lane = tid % warpSize; + unsigned int wid = tid / warpSize; + + if (lane == 0) + shared[wid] = acc; + + // wait for all partial reductions + __syncthreads(); + + acc = (tid < blockDim.x * blockDim.y / warpSize) ? 
shared[lane] : 0; + if (wid == 0) + acc = warp_reduce(acc); + + if (0 == tid && i0Start < numElements.x && i1Start < numElements.y) + atomicAdd(norm, acc); +} + +template<unsigned int blockSize_x, unsigned int blockSize_y> +void fused_shfl2_gsl_atomic(DATA_TYPE *data, DATA_TYPE *norm, const dim3 &numElements, dim3 blocks) { + cudaMemset(norm, 0, sizeof(DATA_TYPE)); + + blocks = dim3(std::min((size_t) blocks.x, sdiv(numElements.x, blockSize_x)), + std::min((size_t) blocks.y, sdiv(numElements.y, blockSize_y))); + + fused_shfl2_gsl_atomic_kernel<<<blocks, dim3(blockSize_x, blockSize_y)>>>(data, norm, numElements); +} diff --git a/src/main.cpp b/src/main.cpp new file mode 100644 index 0000000..baeedff --- /dev/null +++ b/src/main.cpp @@ -0,0 +1,353 @@ +// nvcc -x cu -O3 -arch=sm_80 -lcuda -o main main.cpp + +#include <cstdio> +#include <limits> + +#include <iostream> +#include <iomanip> +#include <sstream> + +#include <vector> + +#include "util.h" + +#include "red-base.h" +#include "red-atomic.h" +#include "red-atomic-gsl.h" +#include "red-step-2.h" +#include "red-step-n.h" +//#include "step-n-gsl.h" +#include "red-cub-atomic.h" +#include "red-cub-gsl-atomic.h" +#include "red-cub-gsl-cascade.h" +#include "red-thrust.h" +#include "red-shared-gsl.h" +#include "red-shared-gsl-atomic.h" +#include "red-shfl2-gsl.h" +#include "red-shfl2-gsl-atomic.h" +#include "red-shfl-gsl-atomic.h" + +#include "calc-res.h" +#include "calc-res-gsl.h" + +#include "fused-atomic.h" +#include "fused-atomic-gsl.h" +#include "fused-cub-atomic.h" +#include "fused-cub-gsl-atomic.h" +#include "fused-shared-atomic.h" +#include "fused-shared-gsl-atomic.h" +#include "fused-shfl-atomic.h" +#include "fused-shfl-gsl-atomic.h" +#include "fused-shfl2-atomic.h" +#include "fused-shfl2-gsl-atomic.h" + +__global__ void init(DATA_TYPE *data, int N) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + + if (idx < N) + data[idx] = 1.f; +} + +DATA_TYPE *h_norm, *d_norm; + +template<typename T> +void measure(std::string label, const T &toMeasure, size_t numElements = 0, int numIt = 10) { + Timer timer; + +// std::cout << "Measuring " << label << std::endl; +// CUDA_CHECK(cudaDeviceSynchronize()); + + timer.start(); + toMeasure(); + CUDA_CHECK(cudaMemcpy(h_norm, d_norm, sizeof(DATA_TYPE), cudaMemcpyDeviceToHost)); + timer.stop(); + + if (0 == numIt) + numIt = max(1, (int) (1e3 / timer.time)); + + CUDA_CHECK(cudaDeviceSynchronize()); + timer.start(); + + for (int i = 0; i < numIt; ++i) { + toMeasure(); + CUDA_CHECK(cudaMemcpy(h_norm, d_norm, sizeof(DATA_TYPE), cudaMemcpyDeviceToHost)); + } + + timer.stop(); + CUDA_CHECK(cudaDeviceSynchronize()); + + if (benchmark) { + label.erase(label.find_last_not_of(" \n\r\t") + 1); + std::cout << label << ',' << numElements << ',' << numIt << ',' << timer.time << ',' << timer.time / numIt << std::endl; + } else { + std::cout << std::setw(30) << label << " (" << std::setw(4) << numIt << " times): " << std::setw(12) << std::right << timer.time / numIt << " ms / it"; + if (numElements > 0) + std::cout << ", " << std::setw(12) << std::right << 1e-9 * numElements * numIt / (1e-3 * timer.time) << " GLUP/s"; + std::cout << std::endl; + } +} + +int main() { + size_t numPoints; + numPoints = GIGA; + + constexpr int blockSize = 1024; + + //constexpr int blocks = 256; + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, 0); + int blocks = 8 * deviceProp.multiProcessorCount; + + constexpr int blockSize_x = 32; + constexpr int blockSize_y = 32; + //dim3 blockSize3 = dim3(blockSize_x, blockSize_y); + 
dim3 blocks3 = dim3(sdiv(deviceProp.multiProcessorCount, 4), 8 * 4); // TODO: adapt? + + DATA_TYPE *d_data; + CUDA_CHECK(cudaMalloc(&d_data, numPoints * sizeof(DATA_TYPE))); + + CUDA_CHECK(cudaMallocHost(&h_norm, sizeof(DATA_TYPE))); + CUDA_CHECK(cudaMalloc(&d_norm, numPoints * sizeof(DATA_TYPE))); + + DATA_TYPE *d_tmp; + CUDA_CHECK(cudaMalloc(&d_tmp, numPoints * sizeof(DATA_TYPE))); + + init<<<sdiv(numPoints, blockSize), blockSize>>>(d_data, numPoints); + + Timer timer; + + std::vector<size_t> numPointsList; + if (benchmark) + for (size_t i = 1; i <= GIGA; i *= 2) + numPointsList.push_back(i); + else + numPointsList = {MEGA, 16 * MEGA, 256 * MEGA, GIGA}; + + if (benchmark) + std::cout << "benchmarkName,numElements,numIt,time,timePerIt" << std::endl; + + for (size_t numPoints: numPointsList) { + auto validate = [&] { + CUDA_CHECK(cudaMemcpy(h_norm, d_norm, sizeof(DATA_TYPE), cudaMemcpyDeviceToHost)); + + if (*h_norm != numPoints) + std::cout << "Norm = " << *h_norm << ", should be " << numPoints << std::endl; + }; + + if (!benchmark) + std::cout << std::endl << "Running benchmark for " << numPoints << " elements / " << numPoints * sizeof(DATA_TYPE) << " bytes" << std::endl; + + measure("red-base", [&] { return reduce_base<blockSize>(d_data, d_norm, numPoints); }, numPoints); + validate(); + + measure("red-atomic", [&] { return reduce_atomic<1, 1, false, blockSize>(d_data, d_norm, numPoints); }, numPoints, 1); + validate(); +// measure("atomic<8, 1> ", [&] { return reduce_atomic<8, 1, false>(d_data, d_norm, numPoints / 8, threadsPerBlock); }, numPoints, 1); +// validate(); +// measure("atomic<1, 8> ", [&] { return reduce_atomic<1, 8, false>(d_data, d_norm, numPoints, threadsPerBlock); }, numPoints, 1); +// validate(); +// measure("atomic<8, 2> ", [&] { return reduce_atomic<8, 1, false>(d_data, d_norm, numPoints / 8, threadsPerBlock); }, numPoints, 1); +// validate(); + + measure("red-atomic-gsl", [&] { return reduce_atomic_gsl<blockSize>(d_data, d_norm, numPoints, blocks); }, numPoints); + validate(); + + measure("red-step-2", [&] { return reduce_step2<blockSize>(d_data, d_norm, numPoints); }, numPoints); + validate(); + + for (int n: {2, 3, 4, 8, 16, 32, 64, 128, 256}) { + std::stringstream labelStream; + labelStream << "red-step-n(" << n << ")"; + measure(labelStream.str(), [&] { + return reduce_stepN<blockSize>(d_data, d_norm, numPoints, n); + }, numPoints); + validate(); + } + + measure("red-cub-atomic", [&] { return reduce_cub_atomic<blockSize>(d_data, d_norm, numPoints); }, numPoints); + validate(); + + measure("red-cub-gsl-atomic", [&] { return reduce_cub_gsl_atomic<blockSize>(d_data, d_norm, numPoints, blocks); }, numPoints); + validate(); + + measure("red-cub-gsl-cascade", [&] { return reduce_cub_gsl_cascade<blockSize>(d_data, d_norm, numPoints, blocks); }, numPoints); + validate(); + + measure("red-thrust", [&] { return reduce_thrust<blockSize>(d_data, h_norm, numPoints); }, numPoints); + CUDA_CHECK(cudaMemcpy(d_norm, h_norm, sizeof(DATA_TYPE), cudaMemcpyHostToDevice)); + validate(); + + measure("red-shared-gsl", [&] { return reduce_shared_gsl<blockSize>(d_data, d_norm, numPoints, blocks); }, numPoints); + validate(); + + measure("red-shared-gsl-atomic", [&] { return reduce_shared_gsl_atomic<blockSize>(d_data, d_norm, numPoints, blocks); }, numPoints); + validate(); + + measure("red-shfl2-gsl", [&] { return reduce_shfl2_gsl<blockSize>(d_data, d_norm, numPoints, blocks); }, numPoints); + validate(); + + measure("red-shfl2-gsl-atomic", [&] { return 
reduce_shfl2_gsl_atomic<blockSize>(d_data, d_norm, numPoints, blocks); }, numPoints); + validate(); + + measure("red-shfl-gsl-atomic", [&] { return reduce_shfl_gsl_atomic<blockSize>(d_data, d_norm, numPoints, blocks); }, numPoints); + validate(); + } + + std::vector<size_t> numPointsPerDimList; + if (benchmark) + for (size_t i = 1; i <= 32 * KILO; i *= 2) + numPointsPerDimList.push_back(i); + else + numPointsPerDimList = {KILO, 4 * KILO, 16 * KILO}; + + for (size_t numPointsPerDim: numPointsPerDimList) { + dim3 numPoints3 = dim3(numPointsPerDim, numPointsPerDim); + numPoints = numPoints3.x * numPoints3.y * numPoints3.z; + + auto validate = [&] { + CUDA_CHECK(cudaMemcpy(h_norm, d_norm, sizeof(DATA_TYPE), cudaMemcpyDeviceToHost)); + + if (*h_norm != -2.f * numPoints3.x - 2.f * numPoints3.y) + std::cout << "Norm = " << *h_norm << ", should be " << -2.f * numPoints3.x - 2.f * numPoints3.y << std::endl; + }; + + if (!benchmark) + std::cout << std::endl << "Running benchmark for " << numPoints << " elements / " << numPoints * sizeof(DATA_TYPE) << " bytes" << std::endl; + + measure("only-res", [&] { + calc_res<blockSize_x, blockSize_y>(d_data, d_norm, numPoints3); + }, numPoints); + + measure("only-res-gsl", [&] { + calc_res_gsl<blockSize_x, blockSize_y>(d_data, d_norm, numPoints3, blocks3); + }, numPoints); + + measure("split-base", [&] { + calc_res<blockSize_x, blockSize_y>(d_data, d_norm, numPoints3); + return reduce_base<blockSize>(d_data, d_norm, numPoints, false); + }, numPoints); + validate(); + + measure("split-atomic", [&] { + calc_res<blockSize_x, blockSize_y>(d_data, d_norm, numPoints3); + return reduce_atomic<1, 1, true, blockSize>(d_data, d_norm, numPoints); + }, numPoints, 1); + validate(); + + measure("split-atomic-gsl", [&] { + calc_res<blockSize_x, blockSize_y>(d_data, d_norm, numPoints3); + return reduce_atomic_gsl<blockSize>(d_data, d_norm, numPoints, blocks, true); + }, numPoints); + validate(); + + measure("split-step-2", [&] { + calc_res<blockSize_x, blockSize_y>(d_data, d_norm, numPoints3); + return reduce_step2<blockSize>(d_data, d_norm, numPoints, false); + }, numPoints); + validate(); + + for (int n: {2, 3, 4, 8, 16, 32, 64, 128, 256}) { + std::stringstream labelStream; + labelStream << "split-step-n(" << n << ")"; + measure(labelStream.str(), [&] { + calc_res<blockSize_x, blockSize_y>(d_data, d_norm, numPoints3); + return reduce_stepN<blockSize>(d_data, d_norm, numPoints, n, false); + }, numPoints); + validate(); + } + + measure("split-cub-atomic", [&] { + calc_res<blockSize_x, blockSize_y>(d_data, d_norm, numPoints3); + return reduce_cub_atomic<blockSize>(d_data, d_norm, numPoints, true); + }, numPoints); + validate(); + + measure("split-cub-gsl-atomic", [&] { + calc_res<blockSize_x, blockSize_y>(d_data, d_norm, numPoints3); + return reduce_cub_gsl_atomic<blockSize>(d_data, d_norm, numPoints, blocks, true); + }, numPoints); + validate(); + + measure("split-cub-gsl-cascade", [&] { + calc_res<blockSize_x, blockSize_y>(d_data, d_tmp, numPoints3); + return reduce_cub_gsl_cascade<blockSize>(d_tmp, d_norm, numPoints, blocks); + }, numPoints); + validate(); + + measure("split-shared-gsl-atomic", [&] { + calc_res<blockSize_x, blockSize_y>(d_data, d_tmp, numPoints3); + return reduce_shared_gsl_atomic<blockSize>(d_tmp, d_norm, numPoints, blocks); + }, numPoints); + validate(); + + measure("split-shfl2-gsl-atomic", [&] { + calc_res<blockSize_x, blockSize_y>(d_data, d_tmp, numPoints3); + return reduce_shfl2_gsl_atomic<blockSize>(d_tmp, d_norm, numPoints, blocks); + }, numPoints); 
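+        // validate() below re-reads the reduced value from d_norm and compares it against the
+        // analytic result for the all-ones input: every missing neighbour on the domain boundary
+        // contributes -1, so the expected norm is -2 * numPoints3.x - 2 * numPoints3.y.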
+ validate(); + + measure("split-thrust", [&] { + calc_res<blockSize_x, blockSize_y>(d_data, d_tmp, numPoints3); + return reduce_thrust<blockSize>(d_tmp, h_norm, numPoints); + }, numPoints); + CUDA_CHECK(cudaMemcpy(d_norm, h_norm, sizeof(DATA_TYPE), cudaMemcpyHostToDevice)); + validate(); + + measure("fused-atomic", [&] { + return fused_atomic<blockSize_x, blockSize_y>(d_data, d_norm, numPoints3); + }, numPoints, 1); + validate(); + + measure("fused-atomic-gsl", [&] { + return fused_atomic_gsl<blockSize_x, blockSize_y>(d_data, d_norm, numPoints3, blocks3); + }, numPoints); + validate(); + + measure("fused-cub-atomic", [&] { + return fused_cub_atomic<blockSize_x, blockSize_y>(d_data, d_norm, numPoints3); + }, numPoints); + validate(); + + measure("fused-cub-gsl-atomic", [&] { + return fused_cub_gsl_atomic<blockSize_x, blockSize_y>(d_data, d_norm, numPoints3, blocks3); + }, numPoints); + validate(); + + measure("fused-shared-atomic", [&] { + return fused_shared_atomic<blockSize_x, blockSize_y>(d_data, d_norm, numPoints3); + }, numPoints); + validate(); + + measure("fused-shared-gsl-atomic", [&] { + return fused_shared_gsl_atomic<blockSize_x, blockSize_y>(d_data, d_norm, numPoints3, blocks3); + }, numPoints); + validate(); + + measure("fused-shfl-atomic", [&] { + return fused_shfl_atomic<blockSize_x, blockSize_y>(d_data, d_norm, numPoints3); + }, numPoints); + validate(); + + measure("fused-shfl-gsl-atomic", [&] { + return fused_shfl_gsl_atomic<blockSize_x, blockSize_y>(d_data, d_norm, numPoints3, blocks3); + }, numPoints); + validate(); + + measure("fused-shfl2-atomic", [&] { + return fused_shfl2_atomic<blockSize_x, blockSize_y>(d_data, d_norm, numPoints3); + }, numPoints); + validate(); + + measure("fused-shfl2-gsl-atomic", [&] { + return fused_shfl2_gsl_atomic<blockSize_x, blockSize_y>(d_data, d_norm, numPoints3, blocks3); + }, numPoints); + validate(); + } + + cudaFree(h_norm); + cudaFree(d_norm); + cudaFree(d_data); + cudaFree(d_tmp); + + return 0; +} diff --git a/src/red-atomic-gsl.h b/src/red-atomic-gsl.h new file mode 100644 index 0000000..d493780 --- /dev/null +++ b/src/red-atomic-gsl.h @@ -0,0 +1,27 @@ +#pragma once + +#include "util.h" + +__global__ void reduce_atomic_gsl_kernel(DATA_TYPE *data, DATA_TYPE *norm, size_t numElements, bool in_place) { + size_t i0Start = blockIdx.x * blockDim.x + threadIdx.x; + if (in_place) + i0Start += 1; + + DATA_TYPE acc = 0; + + for (size_t i0 = i0Start; i0 < numElements; i0 += gridDim.x * blockDim.x) + acc += data[i0]; + + if (i0Start < numElements) + atomicAdd(norm, acc); +} + +template<unsigned int blockSize> +void reduce_atomic_gsl(DATA_TYPE *data, DATA_TYPE *norm, size_t numElements, size_t blocks, bool in_place = false) { + blocks = std::min(blocks, sdiv(numElements, blockSize)); + + if (!in_place) + cudaMemset(norm, 0, sizeof(DATA_TYPE)); + + reduce_atomic_gsl_kernel<<<blocks, blockSize>>>(in_place ? 
norm : data, norm, numElements, in_place); +} diff --git a/src/red-atomic.h b/src/red-atomic.h new file mode 100644 index 0000000..e2adbb9 --- /dev/null +++ b/src/red-atomic.h @@ -0,0 +1,29 @@ +#pragma once + +#include "util.h" + +template<int chunkSize, int redStride, bool in_place> +__global__ void reduce_atomic_kernel(DATA_TYPE *data, DATA_TYPE *norm, size_t numChunks) { + size_t i0 = blockIdx.x * blockDim.x + threadIdx.x; + i0 *= redStride; + + if (!in_place) { + for (int redI = 0; redI < chunkSize; ++redI) + if (i0 < numChunks) + atomicAdd(&(norm[redI]), data[i0 * chunkSize + redI]); + } else { + if (i0 > 0) // skip very first thread + for (int redI = 0; redI < chunkSize; ++redI) + if (i0 < numChunks) + atomicAdd(&(norm[redI]), norm[i0 * chunkSize + redI]); + } +} + +template<int chunkSize, int redStride, bool in_place, unsigned int blockSize> +void reduce_atomic(DATA_TYPE *data, DATA_TYPE *norm, size_t numChunks) { + if (!in_place) + cudaMemset(norm, 0, sizeof(DATA_TYPE) * chunkSize); + + size_t blocks = sdiv(numChunks, blockSize); + reduce_atomic_kernel<chunkSize, redStride, in_place><<<blocks, blockSize>>>(data, norm, numChunks); +} diff --git a/src/red-base.h b/src/red-base.h new file mode 100644 index 0000000..d60761f --- /dev/null +++ b/src/red-base.h @@ -0,0 +1,27 @@ +#pragma once + +#include "util.h" + +__global__ void reduce_base_kernel(DATA_TYPE *data, size_t numElements, size_t halfStride) { + int i0 = ((blockIdx.x * blockDim.x) + threadIdx.x); + i0 *= (2 * halfStride); + + if (((i0 < 0) || (i0 >= numElements))) { + return; + } + + if (((i0 + halfStride) < numElements)) { + data[i0] = (data[(halfStride + i0)] + data[i0]); + } +} + +template<unsigned int blockSize> +void reduce_base(DATA_TYPE *data, DATA_TYPE *norm, size_t numElements, bool copy_in = true) { + if (copy_in) CUDA_CHECK(cudaMemcpy(norm, data, numElements * sizeof(DATA_TYPE), cudaMemcpyDefault)) + + for (size_t halfStride = 1; halfStride < numElements; halfStride *= 2) { + //std::cout << "Writing with half stride " << halfStride << std::endl; + size_t blocks = sdiv(numElements, 2 * blockSize * halfStride); + reduce_base_kernel<<<blocks, blockSize>>>(norm, numElements, halfStride); + } +} diff --git a/src/red-cub-atomic.h b/src/red-cub-atomic.h new file mode 100644 index 0000000..61939fa --- /dev/null +++ b/src/red-cub-atomic.h @@ -0,0 +1,40 @@ +#pragma once + +#include <cub/cub.cuh> + +#include "util.h" + +template<int blockSize> +__global__ void reduce_cub_atomic_kernel(DATA_TYPE *data, DATA_TYPE *norm, size_t numElements, bool in_place) { + size_t i0 = blockIdx.x * blockDim.x + threadIdx.x; + if (in_place) + i0 += 1; + + // Define BlockReduce type with as many threads per block as we use + typedef cub::BlockReduce <DATA_TYPE, blockSize, cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY> BlockReduce; + + // Allocate shared memory for block reduction + __shared__ + typename BlockReduce::TempStorage temp_storage; + + DATA_TYPE elem = 0.f; + + if (i0 < numElements) + elem = data[i0]; + + // Reduce over block (all threads must participate) + DATA_TYPE block_norm = BlockReduce(temp_storage).Sum(elem); + + // Only first thread in the block performs the atomic + if (0 == threadIdx.x && i0 < numElements) + atomicAdd(norm, block_norm); +} + +template<unsigned int blockSize> +void reduce_cub_atomic(DATA_TYPE *data, DATA_TYPE *norm, size_t numElements, bool in_place = false) { + if (!in_place) + cudaMemset(norm, 0, sizeof(DATA_TYPE)); + + size_t blocks = sdiv(numElements, blockSize); + 
reduce_cub_atomic_kernel<blockSize><<<blocks, blockSize>>>(in_place ? norm : data, norm, numElements, in_place); +} diff --git a/src/red-cub-gsl-atomic.h b/src/red-cub-gsl-atomic.h new file mode 100644 index 0000000..5c68dbd --- /dev/null +++ b/src/red-cub-gsl-atomic.h @@ -0,0 +1,41 @@ +#pragma once + +#include <cub/cub.cuh> + +#include "util.h" + +template<int blockSize> +__global__ void reduce_cub_gsl_atomic_kernel(DATA_TYPE *data, DATA_TYPE *norm, size_t numElements, bool in_place) { + size_t i0Start = blockIdx.x * blockDim.x + threadIdx.x; + if (in_place) + i0Start += 1; + + DATA_TYPE acc = 0; + + // Define BlockReduce type with as many threads per block as we use + typedef cub::BlockReduce <DATA_TYPE, blockSize, cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY> BlockReduce; + + // Allocate shared memory for block reduction + __shared__ + typename BlockReduce::TempStorage temp_storage; + + for (size_t i0 = i0Start; i0 < numElements; i0 += gridDim.x * blockDim.x) + acc += data[i0]; + + // Reduce over block (all threads must participate) + DATA_TYPE block_norm = BlockReduce(temp_storage).Sum(acc); + + // Only first thread in the block performs the atomic + if (0 == threadIdx.x && i0Start < numElements) + atomicAdd(norm, block_norm); +} + +template<unsigned int blockSize> +void reduce_cub_gsl_atomic(DATA_TYPE *data, DATA_TYPE *norm, size_t numElements, size_t blocks, bool in_place = false) { + if (!in_place) + cudaMemset(norm, 0, sizeof(DATA_TYPE)); + + blocks = std::min(blocks, sdiv(numElements, blockSize)); + + reduce_cub_gsl_atomic_kernel<blockSize><<<blocks, blockSize>>>(in_place ? norm : data, norm, numElements, in_place); +} diff --git a/src/red-cub-gsl-cascade.h b/src/red-cub-gsl-cascade.h new file mode 100644 index 0000000..a83b29b --- /dev/null +++ b/src/red-cub-gsl-cascade.h @@ -0,0 +1,38 @@ +#pragma once + +#include <cub/cub.cuh> + +#include "util.h" + +template<int blockSize> +__global__ void reduce_cub_gsl_cascade_kernel(DATA_TYPE *data, DATA_TYPE *norm, size_t numElements) { + size_t i0Start = blockIdx.x * blockDim.x + threadIdx.x; + + DATA_TYPE acc = 0; + + for (size_t i0 = i0Start; i0 < numElements; i0 += gridDim.x * blockDim.x) + acc += data[i0]; + + // define BlockReduce type with as many threads per block as we use + typedef cub::BlockReduce <DATA_TYPE, blockSize, cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY> BlockReduce; + + // allocate shared memory for block reduction + __shared__ + typename BlockReduce::TempStorage temp_storage; + + // reduce over block (all threads must participate) + DATA_TYPE block_norm = BlockReduce(temp_storage).Sum(acc); + + // only first thread in the block writes result + if (0 == threadIdx.x && i0Start < numElements) + norm[blockIdx.x] = block_norm; +} + +template<unsigned int blockSize> +void reduce_cub_gsl_cascade(DATA_TYPE *data, DATA_TYPE *norm, size_t numElements, size_t blocks) { + blocks = std::min(blocks, sdiv(numElements, blockSize)); + + reduce_cub_gsl_cascade_kernel<blockSize><<<blocks, blockSize>>>(data, norm, numElements); + if (blocks > 1) + reduce_cub_gsl_cascade_kernel<blockSize><<<1, blockSize>>>(norm, norm, blocks); +} diff --git a/src/red-shared-gsl-atomic.h b/src/red-shared-gsl-atomic.h new file mode 100644 index 0000000..750ac44 --- /dev/null +++ b/src/red-shared-gsl-atomic.h @@ -0,0 +1,47 @@ +#pragma once + +#include "util.h" + +template<unsigned int blockSize> +__global__ void reduce_shared_gsl_atomic_kernel(DATA_TYPE *data, DATA_TYPE *norm, size_t numElements) { + __shared__ + DATA_TYPE sharedData[blockSize]; + + unsigned int 
tid = threadIdx.x; + size_t i0Start = blockIdx.x * blockDim.x + threadIdx.x; + + sharedData[tid] = 0; + + for (size_t i0 = i0Start; i0 < numElements; i0 += blockDim.x * gridDim.x) + sharedData[tid] += data[i0]; + __syncthreads(); + +#pragma unroll + for (int stride = 512; stride > 32; stride /= 2) { + if (blockSize >= 2 * stride) { + if (tid < stride) + sharedData[tid] += sharedData[tid + stride]; + __syncthreads(); + } + } + +#pragma unroll + for (int stride = 32; stride > 0; stride /= 2) { + if (blockSize >= 2 * stride) { + if (tid < stride) + sharedData[tid] += sharedData[tid + stride]; + __syncwarp(); + } + } + + if (0 == tid) + atomicAdd(norm, sharedData[0]); +} + +template<unsigned int blockSize> +void reduce_shared_gsl_atomic(DATA_TYPE *data, DATA_TYPE *norm, size_t numElements, size_t blocks) { // does not work in place + blocks = std::min(blocks, sdiv(numElements, blockSize)); + + cudaMemset(norm, 0, sizeof(DATA_TYPE)); + reduce_shared_gsl_atomic_kernel<blockSize><<<blocks, blockSize>>>(data, norm, numElements); +} diff --git a/src/red-shared-gsl.h b/src/red-shared-gsl.h new file mode 100644 index 0000000..1f0b084 --- /dev/null +++ b/src/red-shared-gsl.h @@ -0,0 +1,52 @@ +#pragma once + +#include "util.h" + +template<unsigned int blockSize> +__global__ void reduce_shared_gsl_kernel(DATA_TYPE *data, DATA_TYPE *norm, size_t numElements) { + unsigned int tid = threadIdx.x; + size_t i0Start = blockIdx.x * blockDim.x + threadIdx.x; + + __shared__ + DATA_TYPE sharedData[blockSize]; + + // sum into shared memory + sharedData[tid] = 0; + + for (size_t i0 = i0Start; i0 < numElements; i0 += blockDim.x * gridDim.x) + sharedData[tid] += data[i0]; + __syncthreads(); + + // perform reduction across warps +#pragma unroll + for (int stride = 512; stride > 32; stride /= 2) { + if (blockSize >= 2 * stride) { + if (tid < stride) + sharedData[tid] += sharedData[tid + stride]; + __syncthreads(); + } + } + + // perform reduction within first warp +#pragma unroll + for (int stride = 32; stride > 0; stride /= 2) { + if (blockSize >= 2 * stride) { + if (tid < stride) + sharedData[tid] += sharedData[tid + stride]; + __syncwarp(); + } + } + + // write back result + if (0 == tid) + norm[blockIdx.x] = sharedData[0]; +} + +template<unsigned int blockSize> +void reduce_shared_gsl(DATA_TYPE *data, DATA_TYPE *norm, size_t numElements, size_t blocks) { // does not work in place + blocks = std::min(blocks, sdiv(numElements, blockSize)); + + reduce_shared_gsl_kernel<blockSize><<<blocks, blockSize>>>(data, norm, numElements); + if (blocks > 1) + reduce_shared_gsl_kernel<blockSize><<<1, blockSize>>>(norm, norm, blocks); +} diff --git a/src/red-shfl-gsl-atomic.h b/src/red-shfl-gsl-atomic.h new file mode 100644 index 0000000..13a5a03 --- /dev/null +++ b/src/red-shfl-gsl-atomic.h @@ -0,0 +1,26 @@ +#pragma once + +#include "util.h" + +template<unsigned int blockSize> +__global__ void reduce_shfl_gsl_atomic_kernel(DATA_TYPE *data, DATA_TYPE *norm, size_t numElements) { + size_t i0Start = blockIdx.x * blockDim.x + threadIdx.x; + + DATA_TYPE sum = 0; + for (size_t i0 = i0Start; i0 < numElements; i0 += blockDim.x * gridDim.x) + sum += data[i0]; + + unsigned int lane = threadIdx.x % warpSize; + + sum = warp_reduce(sum); + if (0 == lane && i0Start < numElements) + atomicAdd(norm, sum); +} + +template<unsigned int blockSize> +void reduce_shfl_gsl_atomic(DATA_TYPE *data, DATA_TYPE *norm, size_t numElements, size_t blocks) { + blocks = std::min(blocks, sdiv(numElements, blockSize)); + + cudaMemset(norm, 0, sizeof(DATA_TYPE)); + 
reduce_shfl_gsl_atomic_kernel<blockSize><<<blocks, blockSize>>>(data, norm, numElements); +} diff --git a/src/red-shfl2-gsl-atomic.h b/src/red-shfl2-gsl-atomic.h new file mode 100644 index 0000000..ec05e4e --- /dev/null +++ b/src/red-shfl2-gsl-atomic.h @@ -0,0 +1,45 @@ +#pragma once + +#include "util.h" + +template<unsigned int blockSize> +__global__ void reduce_shfl2_gsl_atomic_kernel(DATA_TYPE *data, DATA_TYPE *norm, size_t numElements) { + size_t i0Start = blockIdx.x * blockDim.x + threadIdx.x; + + DATA_TYPE sum = 0; + for (size_t i0 = i0Start; i0 < numElements; i0 += blockDim.x * gridDim.x) + sum += data[i0]; + + // warp reduction + sum = warp_reduce(sum); + + // store to shared memory + __shared__ + DATA_TYPE shared[blockSize / 32]; + unsigned int lane = threadIdx.x % warpSize; + unsigned int wid = threadIdx.x / warpSize; + + if (0 == lane) + shared[wid] = sum; + + // wait for all partial reductions + __syncthreads(); + + // only first warp - read from shared memory and perform reduction + if (0 == wid) { + sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0; + sum = warp_reduce(sum); + + // write back final result + if (threadIdx.x == 0) + atomicAdd(norm, sum); + } +} + +template<unsigned int blockSize> +void reduce_shfl2_gsl_atomic(DATA_TYPE *data, DATA_TYPE *norm, size_t numElements, size_t blocks) { // does not work in place + blocks = std::min(blocks, sdiv(numElements, blockSize)); + + cudaMemset(norm, 0, sizeof(DATA_TYPE)); + reduce_shfl2_gsl_atomic_kernel<blockSize><<<blocks, blockSize>>>(data, norm, numElements); +} diff --git a/src/red-shfl2-gsl.h b/src/red-shfl2-gsl.h new file mode 100644 index 0000000..ec94cd7 --- /dev/null +++ b/src/red-shfl2-gsl.h @@ -0,0 +1,47 @@ +#pragma once + +#include "util.h" + + +template<unsigned int blockSize> +__global__ void reduce_shfl2_gsl_kernel(DATA_TYPE *data, DATA_TYPE *norm, size_t numElements) { + size_t i0Start = blockIdx.x * blockDim.x + threadIdx.x; + + DATA_TYPE sum = 0; + for (size_t i0 = i0Start; i0 < numElements; i0 += blockDim.x * gridDim.x) + sum += data[i0]; + + // warp reduction + sum = warp_reduce(sum); + + // store to shared memory + __shared__ + DATA_TYPE shared[blockSize / 32]; + unsigned int lane = threadIdx.x % warpSize; + unsigned int wid = threadIdx.x / warpSize; + + if (0 == lane) + shared[wid] = sum; + + // wait for all partial reductions + __syncthreads(); + + // only first warp - read from shared memory and perform reduction + if (0 == wid) { + sum = (threadIdx.x < blockDim.x / warpSize) ? 
shared[lane] : 0; + sum = warp_reduce(sum); + + // write back final result + if (0 == threadIdx.x) + norm[blockIdx.x] = sum; + } +} + +template<unsigned int blockSize> +void reduce_shfl2_gsl(DATA_TYPE *data, DATA_TYPE *norm, size_t numElements, size_t blocks) { // does not work in place + blocks = std::min(blocks, sdiv(numElements, blockSize)); + + reduce_shfl2_gsl_kernel<blockSize><<<blocks, blockSize>>>(data, norm, numElements); + if (blocks > 1) + reduce_shfl2_gsl_kernel<blockSize><<<1, blockSize>>>(norm, norm, blocks); +} diff --git a/src/red-step-2.h b/src/red-step-2.h new file mode 100644 index 0000000..ca4d287 --- /dev/null +++ b/src/red-step-2.h @@ -0,0 +1,24 @@ +#pragma once + +#include "util.h" + +__global__ void reduce_step2_kernel(DATA_TYPE *data, size_t numElements, size_t toWrite) { + size_t i0 = blockIdx.x * blockDim.x + threadIdx.x; + size_t i0Add = i0 + toWrite; + + if (i0 < toWrite && i0Add < numElements) + data[i0] = data[i0] + data[i0Add]; +} + +template<unsigned int blockSize> +void reduce_step2(DATA_TYPE *data, DATA_TYPE *norm, size_t numElements, bool copy_in = true) { + if (copy_in) + cudaMemcpy(norm, data, numElements * sizeof(DATA_TYPE), cudaMemcpyDefault); + + for (size_t remaining = numElements; remaining > 1; remaining = (remaining + 1) / 2) { + size_t toWrite = (remaining + 1) / 2; + //std::cout << "Writing to " << toWrite << " elements" << std::endl; + size_t blocks = sdiv(toWrite, blockSize); + reduce_step2_kernel<<<blocks, blockSize>>>(norm, remaining, toWrite); + } +} diff --git a/src/red-step-n-gsl.h b/src/red-step-n-gsl.h new file mode 100644 index 0000000..742ec69 --- /dev/null +++ b/src/red-step-n-gsl.h @@ -0,0 +1,32 @@ +#pragma once + +#include "util.h" + +__global__ void reduce_stepN_gsl_kernel(DATA_TYPE *data, size_t numElements, size_t toWrite, int step) { + size_t i0Start = blockIdx.x * blockDim.x + threadIdx.x; + + DATA_TYPE acc = data[i0Start]; + + for (size_t i0 = i0Start; i0 < toWrite; i0 += gridDim.x * blockDim.x) { + for (int off = 1; off < step; ++off) { + int i0Add = i0 + off * toWrite; + + if (i0Add < numElements) + acc = acc + data[i0Add]; + } + } + + data[i0Start] = acc; +} + +template<unsigned int blockSize> +void reduce_stepN_gsl(DATA_TYPE *data, DATA_TYPE *norm, size_t numElements, int step, size_t blocks) { + cudaMemcpy(norm, data, numElements * sizeof(DATA_TYPE), cudaMemcpyDefault); + + for (size_t remaining = numElements; remaining > 1; remaining = (remaining + step - 1) / step) { + size_t toWrite = (remaining + step - 1) / step; + //std::cout << "Writing to " << toWrite << " elements" << std::endl; + blocks = std::min(blocks, sdiv(remaining, blockSize)); + reduce_stepN_kernel<<<blocks, blockSize>>>(norm, remaining, toWrite, step); + } +} diff --git a/src/red-step-n.h b/src/red-step-n.h new file mode 100644 index 0000000..161f0ec --- /dev/null +++ b/src/red-step-n.h @@ -0,0 +1,33 @@ +#pragma once + +#include "util.h" + +__global__ void reduce_stepN_kernel(DATA_TYPE *data, size_t numElements, size_t toWrite, int step) { + size_t i0 = blockIdx.x * blockDim.x + threadIdx.x; + + DATA_TYPE acc = data[i0]; + + if (i0 < toWrite) { + for (int off = 1; off < step; ++off) { + int i0Add = i0 + off * toWrite; + + if (i0Add < numElements) + acc = acc + data[i0Add]; + } + } + + data[i0] = acc; +} + +template<unsigned int blockSize> +void reduce_stepN(DATA_TYPE *data, DATA_TYPE *norm, size_t numElements, int step, bool copy_in = true) { + if (copy_in) + cudaMemcpy(norm, data, numElements * sizeof(DATA_TYPE), cudaMemcpyDefault); + + for (size_t 
remaining = numElements; remaining > 1; remaining = (remaining + step - 1) / step) { + size_t toWrite = (remaining + step - 1) / step; + //std::cout << "Writing to " << toWrite << " elements" << std::endl; + size_t blocks = sdiv(toWrite, blockSize); + reduce_stepN_kernel<<<blocks, blockSize>>>(norm, remaining, toWrite, step); + } +} diff --git a/src/red-thrust.h b/src/red-thrust.h new file mode 100644 index 0000000..139bbde --- /dev/null +++ b/src/red-thrust.h @@ -0,0 +1,15 @@ +#pragma once + +#include <thrust/device_ptr.h> +#include <thrust/reduce.h> +#include <thrust/execution_policy.h> + +#include "util.h" + +template<unsigned int blockSize> +void reduce_thrust(DATA_TYPE *data, DATA_TYPE *norm, size_t numElements) { + thrust::device_ptr <DATA_TYPE> dd(data); +// thrust::inclusive_scan(dd, dd + numElements, dn, thrust::plus<DATA_TYPE>()); +// std::cout << thrust::reduce(dd, dd + numElements) << std::endl; + *norm = thrust::reduce(dd, dd + numElements); +} diff --git a/src/util.h b/src/util.h new file mode 100644 index 0000000..f35f4dc --- /dev/null +++ b/src/util.h @@ -0,0 +1,79 @@ +#pragma once + +#include <iostream> +#include <fstream> +#include <cstdint> +#include <string> + +#include <cuda.h> + +typedef double DATA_TYPE; + +const bool benchmark = false; + +const size_t KILO = 1024; +const size_t MEGA = KILO * KILO; +const size_t GIGA = KILO * MEGA; + +#define CUDA_CHECK(x) { \ + cudaError_t err = x; \ + if (err != cudaSuccess) { \ + std::cout << "CUDA error: " << cudaGetErrorString(err) << " : " \ + << __FILE__ << ", line " << __LINE__ << std::endl; \ + exit(1); \ + } else { \ + cudaError_t err; \ + if ((err = cudaGetLastError()) != cudaSuccess) { \ + std::cout << "CUDA last error: " << cudaGetErrorString(err) << " : " \ + << __FILE__ << ", line " << __LINE__ << std::endl; \ + exit(1); \ + } \ + } \ +} + +size_t sdiv(size_t a, size_t b) { + return (a + b - 1) / b; +} + +dim3 sdiv(const dim3 &a, const dim3 &b) { + return dim3(sdiv(a.x, b.x), sdiv(a.y, b.y), sdiv(a.z, b.z)); +} + +struct Timer { + float time; + const int gpu; + cudaEvent_t ying, yang; + + Timer(int gpu = 0) : gpu(gpu) { + cudaSetDevice(gpu); + cudaEventCreate(&ying); + cudaEventCreate(&yang); + } + + ~Timer() { + cudaSetDevice(gpu); + cudaEventDestroy(ying); + cudaEventDestroy(yang); + } + + void start() { + cudaSetDevice(gpu); + cudaEventRecord(ying, 0); + } + + void stop() { + cudaSetDevice(gpu); + cudaEventRecord(yang, 0); + cudaEventSynchronize(yang); + cudaEventElapsedTime(&time, ying, yang); + } +}; + +__inline__ __device__ double warp_reduce(DATA_TYPE val) { +#pragma unroll + for (int i = 16; i >= 1; i /= 2) + val += __shfl_xor_sync(0xffffffff, val, i); +// or: val += __shfl_down_sync(0xffffffff, val, i); + + return val; +} -- GitLab
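
Usage note (not part of the patch): the headers above are self-contained apart from util.h, so a single reduction variant can be smoke-tested outside of main.cpp. Below is a minimal sketch of such a driver; it assumes the src/ directory is on the include path and reuses DATA_TYPE, CUDA_CHECK and sdiv exactly as defined in util.h, while the file name smoke-test.cpp, the chosen sizes and the block-count heuristic are illustrative only.

// smoke-test.cpp -- hypothetical standalone check, compiled e.g. with
//   nvcc -x cu -O3 -arch=sm_80 -o smoke-test smoke-test.cpp
#include <cstdio>
#include <vector>

#include "util.h"                  // DATA_TYPE, CUDA_CHECK, sdiv
#include "red-shfl2-gsl-atomic.h"  // reduce_shfl2_gsl_atomic

int main() {
    constexpr unsigned int blockSize = 1024;
    const size_t numElements = 1 << 20;

    // all-ones input, so the expected sum is simply numElements
    std::vector<DATA_TYPE> h_data(numElements, (DATA_TYPE) 1);

    DATA_TYPE *d_data, *d_norm;
    CUDA_CHECK(cudaMalloc(&d_data, numElements * sizeof(DATA_TYPE)));
    CUDA_CHECK(cudaMalloc(&d_norm, sizeof(DATA_TYPE)));
    CUDA_CHECK(cudaMemcpy(d_data, h_data.data(), numElements * sizeof(DATA_TYPE), cudaMemcpyHostToDevice));

    // grid-stride-loop variant: cap the block count at a small multiple of the SM count
    cudaDeviceProp prop;
    CUDA_CHECK(cudaGetDeviceProperties(&prop, 0));
    size_t blocks = 8 * prop.multiProcessorCount;

    // zeroes d_norm, clamps the block count and accumulates via atomics
    reduce_shfl2_gsl_atomic<blockSize>(d_data, d_norm, numElements, blocks);

    DATA_TYPE result;
    CUDA_CHECK(cudaMemcpy(&result, d_norm, sizeof(DATA_TYPE), cudaMemcpyDeviceToHost));
    std::printf("sum = %f, expected %zu\n", (double) result, numElements);

    CUDA_CHECK(cudaFree(d_data));
    CUDA_CHECK(cudaFree(d_norm));
    return 0;
}

The other red-* headers follow the same calling convention (data pointer, norm pointer, element count and, for the -gsl variants, a block count), while the calc-res and fused-* variants instead take a dim3 element count and dim3 block count, as exercised in main.cpp.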