diff --git a/generated_files/cone_backprojector_3D_CudaKernel.cu b/generated_files/cone_backprojector_3D_CudaKernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..4b899de4c1b8d8b0feb284dd6feea16612f4ebd1 --- /dev/null +++ b/generated_files/cone_backprojector_3D_CudaKernel.cu @@ -0,0 +1,177 @@ +/* + * Copyright [2019] [Christopher Syben] + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Voxel-driven cone-beam back-projector CUDA kernel using kernel interpolation + * Implementation adapted from CONRAD + * PYRO-NN is developed as an Open Source project under the Apache License, + * Version 2.0. + */ + +#include "helper_headers/helper_grid.h" +#include "helper_headers/helper_math.h" + +#define BLOCKSIZE_X 16 +#define BLOCKSIZE_Y 4 +#define BLOCKSIZE_Z 4 + +#include <cstdio> +texture<float, cudaTextureType2DLayered> sinogram_as_texture; + +#define CUDART_INF_F __int_as_float(0x7f800000) + +#define gpuErrchk(ans) \ + { gpuAssert((ans), __FILE__, __LINE__); } +inline void gpuAssert(cudaError_t code, const char *file, int line, + bool abort = true) { + if (code != cudaSuccess) { + fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, + line); + if (abort) + exit(code); + } +} + +inline __device__ float3 map(float3 coordinates, float *d_projection_matrices, + int n) { + const float *matrix = &(d_projection_matrices[n * 12]); + + return make_float3(matrix[0] * coordinates.x + matrix[1] * coordinates.y + + matrix[2] * coordinates.z + matrix[3], + matrix[4] * coordinates.x + matrix[5] * coordinates.y + + matrix[6] * coordinates.z + matrix[7], + matrix[8] * coordinates.x + matrix[9] * coordinates.y + + matrix[10] * coordinates.z + matrix[11]); +} + +inline __device__ float interp2D(const float *const_volume_ptr, + const float3 point, const uint3 pointer_offset, + const uint2 detector_size) { + + const int x_f = __float2int_rn(floor(point.x)); + const int y_f = __float2int_rn(floor(point.y)); + + const int x_c = x_f + 1; + const int y_c = y_f + 1; + + int i00 = (uint)point.z * pointer_offset.z + y_f * pointer_offset.y + x_f; + int i01 = (uint)point.z * pointer_offset.z + y_f * pointer_offset.y + x_c; + int i10 = (uint)point.z * pointer_offset.z + y_c * pointer_offset.y + x_f; + int i11 = (uint)point.z * pointer_offset.z + y_c * pointer_offset.y + x_c; + + float p00 = + (x_f < 0 || x_f > detector_size.x || y_f < 0 || y_f > detector_size.y) + ? 0.0f + : __ldg(&const_volume_ptr[i00]); + float p01 = + (x_c < 0 || x_c > detector_size.x || y_f < 0 || y_f > detector_size.y) + ? 0.0f + : __ldg(&const_volume_ptr[i01]); + float p10 = + (x_f < 0 || x_f > detector_size.x || y_c < 0 || y_c > detector_size.y) + ? 0.0f + : __ldg(&const_volume_ptr[i10]); + float p11 = + (x_c < 0 || x_c > detector_size.x || y_c < 0 || y_c > detector_size.y) + ? 
0.0f + : __ldg(&const_volume_ptr[i11]); + + const float x_d = (point.x - x_f); + const float y_d = (point.y - y_f); + + const float p0 = __fmaf_rn(p01, x_d, __fmaf_rn(p00, -x_d, p00)); + const float p1 = __fmaf_rn(p11, x_d, __fmaf_rn(p10, -x_d, p10)); + + const float p = __fmaf_rn(p1, y_d, __fmaf_rn(p0, -y_d, p0)); + + return p; +} + +__global__ void backproject_3Dcone_beam_kernel( + const float *sinogram_ptr, float *vol, float *d_projection_matrices, + const int number_of_projections, const uint3 volume_size, + const float3 volume_spacing, const float3 volume_origin, + const uint2 detector_size, const uint3 pointer_offsets, + const float projection_multiplier) { + const int i = blockIdx.x * blockDim.x + threadIdx.x; + const int j = blockIdx.y * blockDim.y + threadIdx.y; + const int k = blockIdx.z * blockDim.z + threadIdx.z; + + if (i >= volume_size.x || j >= volume_size.y || k >= volume_size.z) + return; + + const float3 coordinates = + index_to_physical(make_float3(i, j, k), volume_origin, volume_spacing); + + float val = 0.0f; + + for (int n = 0; n < number_of_projections; ++n) { + auto ip = map(coordinates, d_projection_matrices, n); + + ip.z = 1.0f / ip.z; + ip.x *= ip.z; + ip.y *= ip.z; + float3 point = make_float3(ip.x, ip.y, n); + + val += interp2D(sinogram_ptr, point, pointer_offsets, detector_size) * + ip.z * ip.z; + } + + // linear volume address + const unsigned int l = volume_size.x * (k * volume_size.y + j) + i; + vol[l] = (val * projection_multiplier); +} + +void Cone_Backprojection3D_Kernel_Launcher( + const float *sinogram_ptr, float *out, const float *projection_matrices, + const int number_of_projections, const int volume_width, + const int volume_height, const int volume_depth, + const float volume_spacing_x, const float volume_spacing_y, + const float volume_spacing_z, const float volume_origin_x, + const float volume_origin_y, const float volume_origin_z, + const int detector_width, const int detector_height, + const float projection_multiplier) { + // COPY matrix to graphics card as float array + auto matrices_size_b = number_of_projections * 12 * sizeof(float); + float *d_projection_matrices; + gpuErrchk(cudaMalloc(&d_projection_matrices, matrices_size_b)); + gpuErrchk(cudaMemcpy(d_projection_matrices, projection_matrices, + matrices_size_b, cudaMemcpyHostToDevice)); + + uint3 volume_size = make_uint3(volume_width, volume_height, volume_depth); + float3 volume_spacing = + make_float3(volume_spacing_x, volume_spacing_y, volume_spacing_z); + float3 volume_origin = + make_float3(volume_origin_x, volume_origin_y, volume_origin_z); + + uint2 detector_size = make_uint2(detector_width, detector_height); + + uint3 pointer_offsets = + make_uint3(1, detector_width, detector_height * detector_width); + + // launch kernel + const unsigned int gridsize_x = (volume_size.x - 1) / BLOCKSIZE_X + 1; + const unsigned int gridsize_y = (volume_size.y - 1) / BLOCKSIZE_Y + 1; + const unsigned int gridsize_z = (volume_size.z - 1) / BLOCKSIZE_Z + 1; + const dim3 grid = dim3(gridsize_x, gridsize_y, gridsize_z); + const dim3 block = dim3(BLOCKSIZE_X, BLOCKSIZE_Y, BLOCKSIZE_Z); + + backproject_3Dcone_beam_kernel<<<grid, block>>>( + sinogram_ptr, out, d_projection_matrices, number_of_projections, + volume_size, volume_spacing, volume_origin, detector_size, + pointer_offsets, projection_multiplier); + + gpuErrchk(cudaUnbindTexture(sinogram_as_texture)); + gpuErrchk(cudaFree(d_projection_matrices)); +} diff --git a/generated_files/cone_backprojector_3D_CudaKernel_hardware_interp.cu 
b/generated_files/cone_backprojector_3D_CudaKernel_hardware_interp.cu new file mode 100644 index 0000000000000000000000000000000000000000..0e528b1a956abebf1e43a9f18a9abba06e60e72a --- /dev/null +++ b/generated_files/cone_backprojector_3D_CudaKernel_hardware_interp.cu @@ -0,0 +1,178 @@ +/* + * Copyright [2019] [Christopher Syben] + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Voxel-driven cone-beam back-projector CUDA kernel using texture interpolation + * Implementation adapted from CONRAD + * PYRO-NN is developed as an Open Source project under the Apache License, + * Version 2.0. + */ + +#include "helper_headers/helper_grid.h" +#include "helper_headers/helper_math.h" +#include <cstdio> + +#define BLOCKSIZE_X 16 +#define BLOCKSIZE_Y 4 +#define BLOCKSIZE_Z 4 + +texture<float, cudaTextureType2DLayered> sinogram_as_texture; + +#define CUDART_INF_F __int_as_float(0x7f800000) + +#define gpuErrchk(ans) \ + { gpuAssert((ans), __FILE__, __LINE__); } +inline void gpuAssert(cudaError_t code, const char *file, int line, + bool abort = true) { + if (code != cudaSuccess) { + fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, + line); + if (abort) + exit(code); + } +} + +inline __device__ float3 map(float3 coordinates, float *d_projection_matrices, + int n) { + const float *matrix = &(d_projection_matrices[n * 12]); + + return make_float3(matrix[0] * coordinates.x + matrix[1] * coordinates.y + + matrix[2] * coordinates.z + matrix[3], + matrix[4] * coordinates.x + matrix[5] * coordinates.y + + matrix[6] * coordinates.z + matrix[7], + matrix[8] * coordinates.x + matrix[9] * coordinates.y + + matrix[10] * coordinates.z + matrix[11]); +} + +__global__ void backproject_3Dcone_beam_kernel_tex_interp( + float *vol, float *d_projection_matrices, const int number_of_projections, + const uint3 volume_size, const float3 volume_spacing, + const float3 volume_origin, const float projection_multiplier) { + const int i = blockIdx.x * blockDim.x + threadIdx.x; + const int j = blockIdx.y * blockDim.y + threadIdx.y; + const int k = blockIdx.z * blockDim.z + threadIdx.z; + + if (i >= volume_size.x || j >= volume_size.y || k >= volume_size.z) + return; + + const float3 coordinates = + index_to_physical(make_float3(i, j, k), volume_origin, volume_spacing); + + float val = 0.0f; + + for (int n = 0; n < number_of_projections; ++n) { + auto ip = map(coordinates, d_projection_matrices, n); + + ip.z = 1.0f / (float)ip.z; + ip.x *= ip.z; + ip.y *= ip.z; + val += tex2DLayered(sinogram_as_texture, ip.x + 0.5, ip.y + 0.5, n) * ip.z * + ip.z; + } + + // linear volume address + const unsigned int l = volume_size.x * (k * volume_size.y + j) + i; + vol[l] = (val * projection_multiplier); +} + +/*************** WARNING ******************./ + * + * Tensorflow is allocating the whole GPU memory for itself and just leave a + * small slack memory using cudaMalloc and cudaMalloc3D will allocate memory in + * this small slack memory ! 
Therefore, currently only small volumes can be used + * (they have to fit into the slack memory which TF does not allocae !) + * + * This is the kernel based on texture interpolation, thus, the allocations + * are not within the Tensorflow managed memory. If memory errors occure: + * 1. start Tensorflow with less gpu memory and allow growth + * 2. switch to software-based interpolation. + * + * TODO: use context->allocate_tmp and context->allocate_persistent instead of + * cudaMalloc for the projection_matrices array : + * https://stackoverflow.com/questions/48580580/tensorflow-new-op-cuda-kernel-memory-managment + * + */ +void Cone_Backprojection3D_Kernel_Tex_Interp_Launcher( + const float *sinogram_ptr, float *out, const float *projection_matrices, + const int number_of_projections, const int volume_width, + const int volume_height, const int volume_depth, + const float volume_spacing_x, const float volume_spacing_y, + const float volume_spacing_z, const float volume_origin_x, + const float volume_origin_y, const float volume_origin_z, + const int detector_width, const int detector_height, + const float projection_multiplier) { + // COPY matrix to graphics card as float array + auto matrices_size_b = number_of_projections * 12 * sizeof(float); + float *d_projection_matrices; + gpuErrchk(cudaMalloc(&d_projection_matrices, matrices_size_b)); + gpuErrchk(cudaMemcpy(d_projection_matrices, projection_matrices, + matrices_size_b, cudaMemcpyHostToDevice)); + + uint3 volume_size = make_uint3(volume_width, volume_height, volume_depth); + float3 volume_spacing = + make_float3(volume_spacing_x, volume_spacing_y, volume_spacing_z); + float3 volume_origin = + make_float3(volume_origin_x, volume_origin_y, volume_origin_z); + + uint2 detector_size = make_uint2(detector_width, detector_height); + + // COPY volume to graphics card + + // set texture properties + sinogram_as_texture.addressMode[0] = cudaAddressModeBorder; + sinogram_as_texture.addressMode[1] = cudaAddressModeBorder; + sinogram_as_texture.addressMode[2] = cudaAddressModeBorder; + sinogram_as_texture.filterMode = cudaFilterModeLinear; + sinogram_as_texture.normalized = false; + + // malloc cuda array for texture + cudaExtent projExtent = + make_cudaExtent(detector_size.x, detector_size.y, number_of_projections); + + cudaArray *projArray; + + static cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); + gpuErrchk(cudaMalloc3DArray(&projArray, &channelDesc, projExtent, + cudaArrayLayered)); + + auto pitch_ptr = make_cudaPitchedPtr(const_cast<float *>(sinogram_ptr), + detector_size.x * sizeof(float), + detector_size.x, detector_size.y); + // copy data to 3D array + cudaMemcpy3DParms copyParams = {0}; + copyParams.srcPtr = pitch_ptr; + copyParams.dstArray = projArray; + copyParams.extent = projExtent; + copyParams.kind = cudaMemcpyDeviceToDevice; + gpuErrchk(cudaMemcpy3D(©Params)); + + // bind texture reference + gpuErrchk( + cudaBindTextureToArray(sinogram_as_texture, projArray, channelDesc)); + + // launch kernel + const unsigned int gridsize_x = (volume_size.x - 1) / BLOCKSIZE_X + 1; + const unsigned int gridsize_y = (volume_size.y - 1) / BLOCKSIZE_Y + 1; + const unsigned int gridsize_z = (volume_size.z - 1) / BLOCKSIZE_Z + 1; + const dim3 grid = dim3(gridsize_x, gridsize_y, gridsize_z); + const dim3 block = dim3(BLOCKSIZE_X, BLOCKSIZE_Y, BLOCKSIZE_Z); + + backproject_3Dcone_beam_kernel_tex_interp<<<grid, block>>>( + out, d_projection_matrices, number_of_projections, volume_size, + volume_spacing, volume_origin, 
projection_multiplier); + + gpuErrchk(cudaUnbindTexture(sinogram_as_texture)); + gpuErrchk(cudaFreeArray(projArray)); + gpuErrchk(cudaFree(d_projection_matrices)); +} diff --git a/generated_files/cone_projector_3D_CudaKernel.cu b/generated_files/cone_projector_3D_CudaKernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..70813e9cbc26d360d8c10c8c6c20b972e9cd7b4d --- /dev/null +++ b/generated_files/cone_projector_3D_CudaKernel.cu @@ -0,0 +1,257 @@ +/* + * Copyright [2019] [Christopher Syben] + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Ray-driven cone-beam projector CUDA kernel using kernel interpolation + * Implementation adapted from CONRAD + * PYRO-NN is developed as an Open Source project under the Apache License, Version 2.0. +*/ + +#include "helper_headers/helper_grid.h" +#include "helper_headers/helper_math.h" +#define CUDART_INF_F __int_as_float(0x7f800000) + +#define BLOCKSIZE_X 16 +#define BLOCKSIZE_Y 16 + +#include <cstdio> + +#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } +inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) +{ + if (code != cudaSuccess) + { + fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); + exit(code); + } +} + +inline __device__ float interp3D(const float* const_volume_ptr, const float3 point, const uint3 pointer_offset, const uint3 volume_size){ + + const int x_f = __float2int_rn( floor(point.x) ); + const int y_f = __float2int_rn( floor(point.y) ); + const int z_f = __float2int_rn( floor(point.z) ); + + const int x_c = x_f+1; + const int y_c = y_f+1; + const int z_c = z_f+1; + + uint i000 = z_f*pointer_offset.z + y_f * pointer_offset.y + x_f; + uint i001 = z_f*pointer_offset.z + y_f * pointer_offset.y + x_c; + uint i010 = z_f*pointer_offset.z + y_c * pointer_offset.y + x_f; + uint i011 = z_f*pointer_offset.z + y_c * pointer_offset.y + x_c; + uint i100 = z_c*pointer_offset.z + y_f * pointer_offset.y + x_f; + uint i101 = z_c*pointer_offset.z + y_f * pointer_offset.y + x_c; + uint i110 = z_c*pointer_offset.z + y_c * pointer_offset.y + x_f; + uint i111 = z_c*pointer_offset.z + y_c * pointer_offset.y + x_c; + + float p000 = ( z_f < 0 || z_f >= volume_size.z || y_f < 0 || y_f >= volume_size.y || x_f < 0 || x_f >= volume_size.x ) ? 0.0f : __ldg(&const_volume_ptr[ i000 ]); + float p001 = ( z_f < 0 || z_f >= volume_size.z || y_f < 0 || y_f >= volume_size.y || x_c < 0 || x_c >= volume_size.x ) ? 0.0f : __ldg(&const_volume_ptr[ i001 ]); + float p010 = ( z_f < 0 || z_f >= volume_size.z || y_c < 0 || y_c >= volume_size.y || x_f < 0 || x_f >= volume_size.x ) ? 0.0f : __ldg(&const_volume_ptr[ i010 ]); + float p011 = ( z_f < 0 || z_f >= volume_size.z || y_c < 0 || y_c >= volume_size.y || x_c < 0 || x_c >= volume_size.x ) ? 0.0f : __ldg(&const_volume_ptr[ i011 ]); + float p100 = ( z_c < 0 || z_c >= volume_size.z || y_f < 0 || y_f >= volume_size.y || x_f < 0 || x_f >= volume_size.x ) ? 
0.0f : __ldg(&const_volume_ptr[ i100 ]); + float p101 = ( z_c < 0 || z_c >= volume_size.z || y_f < 0 || y_f >= volume_size.y || x_c < 0 || x_c >= volume_size.x ) ? 0.0f : __ldg(&const_volume_ptr[ i101 ]); + float p110 = ( z_c < 0 || z_c >= volume_size.z || y_c < 0 || y_c >= volume_size.y || x_f < 0 || x_f >= volume_size.x ) ? 0.0f : __ldg(&const_volume_ptr[ i110 ]); + float p111 = ( z_c < 0 || z_c >= volume_size.z || y_c < 0 || y_c >= volume_size.y || x_c < 0 || x_c >= volume_size.x ) ? 0.0f : __ldg(&const_volume_ptr[ i111 ]); + + const float x_d = (point.x - x_f); + const float y_d = (point.y - y_f); + const float z_d = (point.z - z_f); + + const float p00 = __fmaf_rn(p100,z_d,__fmaf_rn(p000,-z_d,p000)); + const float p01 = __fmaf_rn(p101,z_d,__fmaf_rn(p001,-z_d,p001)); + const float p10 = __fmaf_rn(p110,z_d,__fmaf_rn(p010,-z_d,p010)); + const float p11 = __fmaf_rn(p111,z_d,__fmaf_rn(p011,-z_d,p011)); + + const float p0 = __fmaf_rn(p10,y_d,__fmaf_rn(p00,-y_d,p00)); + const float p1 = __fmaf_rn(p11,y_d,__fmaf_rn(p01,-y_d,p01)); + + const float p = __fmaf_rn(p1,x_d,__fmaf_rn(p0,-x_d,p0)); + + return p; +} + +inline __device__ float kernel_project3D(const float* volume_ptr, const float3 source_point, const float3 ray_vector, + const float step_size, const uint3 volume_size, const uint3 pointer_offsets) +{ + float pixel = 0.0f; + // Step 1: compute alpha value at entry and exit point of the volume + float min_alpha, max_alpha; + min_alpha = 0; + max_alpha = CUDART_INF_F; + + if (0.0f != ray_vector.x) + { + float volume_min_edge_point = 0; + float volume_max_edge_point = volume_size.x; + + float reci = 1.0f / ray_vector.x; + float alpha0 = (volume_min_edge_point - source_point.x) * reci; + float alpha1 = (volume_max_edge_point - source_point.x) * reci; + min_alpha = fmin(alpha0, alpha1); + max_alpha = fmax(alpha0, alpha1); + } + + if (0.0f != ray_vector.y) + { + float volume_min_edge_point = 0; + float volume_max_edge_point = volume_size.y; + + float reci = 1.0f / ray_vector.y; + + float alpha0 = (volume_min_edge_point - source_point.y) * reci; + float alpha1 = (volume_max_edge_point - source_point.y) * reci; + min_alpha = fmax(min_alpha, fmin(alpha0, alpha1)); + max_alpha = fmin(max_alpha, fmax(alpha0, alpha1)); + } + + if (0.0f != ray_vector.z) + { + float volume_min_edge_point = 0; + float volume_max_edge_point = volume_size.z; + + float reci = 1.0f / ray_vector.z; + + float alpha0 = (volume_min_edge_point - source_point.z) * reci; + float alpha1 = (volume_max_edge_point - source_point.z) * reci; + min_alpha = fmax(min_alpha, fmin(alpha0, alpha1)); + max_alpha = fmin(max_alpha, fmax(alpha0, alpha1)); + } + // we start not at the exact entry point + // => we can be sure to be inside the volume + min_alpha += step_size * 0.5f; + // Step 2: Cast ray if it intersects the volume + // Trapezoidal rule (interpolating function = piecewise linear func) + + float3 point = make_float3(0,0,0); + // Entrance boundary + // For the initial interpolated value, only a half stepsize is + // considered in the computation. 
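+    // With step size h and samples f(a), f(a+h), ... along the ray, the code
+    // below accumulates the trapezoidal sum
+    //   h * (0.5f * f(a) + f(a+h) + f(a+2h) + ...),
+    // and the "last segment" block afterwards closes the integral at the exit
+    // point using the remaining (possibly shorter) step length.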
+ if (min_alpha < max_alpha) + { + point.x = source_point.x + min_alpha * ray_vector.x; + point.y = source_point.y + min_alpha * ray_vector.y; + point.z = source_point.z + min_alpha * ray_vector.z; + pixel += 0.5f * interp3D(volume_ptr, point, pointer_offsets, volume_size); + min_alpha += step_size; + } + + while (min_alpha < max_alpha) + { + point.x = source_point.x + min_alpha * ray_vector.x; + point.y = source_point.y + min_alpha * ray_vector.y; + point.z = source_point.z + min_alpha * ray_vector.z; + pixel += interp3D(volume_ptr, point, pointer_offsets, volume_size);; + min_alpha += step_size; + } + // Scaling by stepsize; + pixel *= step_size; + + //Last segment of the line + if (pixel > 0.0f) + { + pixel -= 0.5f * step_size * interp3D(volume_ptr, point, pointer_offsets, volume_size); + min_alpha -= step_size; + float last_step_size = max_alpha - min_alpha; + + pixel += 0.5f * last_step_size* interp3D(volume_ptr, point, pointer_offsets, volume_size); + + point.x = source_point.x + max_alpha * ray_vector.x; + point.y = source_point.y + max_alpha * ray_vector.y; + point.z = source_point.z + max_alpha * ray_vector.z; + + // The last segment of the line integral takes care of the + // varying length. + pixel += 0.5f * last_step_size * interp3D(volume_ptr, point , pointer_offsets, volume_size); + } + return pixel; +} +__global__ void project_3Dcone_beam_kernel( const float* volume_ptr, float *pSinogram, + const float *d_inv_AR_matrices, const float3 *d_src_points, const float sampling_step_size, + const uint3 volume_size, const float3 volume_spacing, const uint2 detector_size, const int number_of_projections, + const uint3 pointer_offsets) +{ + //return; + uint2 detector_idx = make_uint2( blockIdx.x * blockDim.x + threadIdx.x, blockIdx.y* blockDim.y + threadIdx.y ); + uint projection_number = blockIdx.z; + if (detector_idx.x >= detector_size.x || detector_idx.y >= detector_size.y || blockIdx.z >= number_of_projections) + { + return; + } + // //Preparations: + d_inv_AR_matrices += projection_number * 9; + float3 source_point = d_src_points[projection_number]; + //Compute ray direction + const float rx = d_inv_AR_matrices[2] + detector_idx.y * d_inv_AR_matrices[1] + detector_idx.x * d_inv_AR_matrices[0]; + const float ry = d_inv_AR_matrices[5] + detector_idx.y * d_inv_AR_matrices[4] + detector_idx.x * d_inv_AR_matrices[3]; + const float rz = d_inv_AR_matrices[8] + detector_idx.y * d_inv_AR_matrices[7] + detector_idx.x * d_inv_AR_matrices[6]; + + float3 ray_vector = make_float3(rx,ry,rz); + ray_vector = normalize(ray_vector); + + float pixel = kernel_project3D( + volume_ptr, + source_point, + ray_vector, + sampling_step_size, + volume_size, + pointer_offsets); + + unsigned sinogram_idx = projection_number * detector_size.y * detector_size.x + detector_idx.y * detector_size.x + detector_idx.x; + + pixel *= sqrt( (ray_vector.x * volume_spacing.x) * (ray_vector.x * volume_spacing.x) + + (ray_vector.y * volume_spacing.y) * (ray_vector.y * volume_spacing.y) + + (ray_vector.z * volume_spacing.z) * (ray_vector.z * volume_spacing.z) ); + + pSinogram[sinogram_idx] = pixel; + return; +} + +void Cone_Projection_Kernel_Launcher(const float* volume_ptr, float *out, const float *inv_AR_matrix, const float *src_points, + const int number_of_projections, const int volume_width, const int volume_height, const int volume_depth, + const float volume_spacing_x, const float volume_spacing_y, const float volume_spacing_z, + const int detector_width, const int detector_height, const float step_size) +{ + //COPY inv 
AR matrix to graphics card as float array + auto matrices_size_b = number_of_projections * 9 * sizeof(float); + float *d_inv_AR_matrices; + gpuErrchk(cudaMalloc(&d_inv_AR_matrices, matrices_size_b)); + gpuErrchk(cudaMemcpy(d_inv_AR_matrices, inv_AR_matrix, matrices_size_b, cudaMemcpyHostToDevice)); + //COPY source points to graphics card as float3 + auto src_points_size_b = number_of_projections * sizeof(float3); + + float3 *d_src_points; + gpuErrchk(cudaMalloc(&d_src_points, src_points_size_b)); + gpuErrchk(cudaMemcpy(d_src_points, src_points, src_points_size_b, cudaMemcpyHostToDevice)); + + uint3 volume_size = make_uint3(volume_width, volume_height, volume_depth); + float3 volume_spacing = make_float3(volume_spacing_x, volume_spacing_y, volume_spacing_z); + + uint2 detector_size = make_uint2(detector_width, detector_height); + uint3 pointer_offsets = make_uint3(1,volume_width,volume_width*volume_height); + + const dim3 blocksize = dim3( BLOCKSIZE_X, BLOCKSIZE_Y, 1 ); + const dim3 gridsize = dim3( detector_size.x / blocksize.x + 1, detector_size.y / blocksize.y + 1 , number_of_projections+1); + + project_3Dcone_beam_kernel<<<gridsize, blocksize>>>(volume_ptr, out, d_inv_AR_matrices, d_src_points, step_size, + volume_size,volume_spacing, detector_size,number_of_projections,pointer_offsets); + + cudaDeviceSynchronize(); + + // check for errors + gpuErrchk( cudaPeekAtLastError() ); + gpuErrchk(cudaFree(d_inv_AR_matrices)); + gpuErrchk(cudaFree(d_src_points)); +} + diff --git a/generated_files/cone_projector_3D_CudaKernel_hardware_interp.cu b/generated_files/cone_projector_3D_CudaKernel_hardware_interp.cu new file mode 100644 index 0000000000000000000000000000000000000000..a533371b8ef59be520053ac2ecdf0865f3a22ffd --- /dev/null +++ b/generated_files/cone_projector_3D_CudaKernel_hardware_interp.cu @@ -0,0 +1,280 @@ +/* + * Copyright [2019] [Christopher Syben] + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Ray-driven cone-beam projector CUDA kernel using texture inteprolation + * Implementation adapted from CONRAD + * PYRO-NN is developed as an Open Source project under the Apache License, + * Version 2.0. 
+ */ + +# +#include "helper_headers/helper_grid.h" +#include "helper_headers/helper_math.h" +texture<float, 3, cudaReadModeElementType> volume_as_texture; +#define CUDART_INF_F __int_as_float(0x7f800000) + +#include <cstdio> +#define BLOCKSIZE_X 16 +#define BLOCKSIZE_Y 16 + +#define gpuErrchk(ans) \ + { gpuAssert((ans), __FILE__, __LINE__); } +inline void gpuAssert(cudaError_t code, const char *file, int line, + bool abort = true) { + if (code != cudaSuccess) { + fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, + line); + exit(code); + } +} + +__device__ float kernel_project3D_tex_interp(const float3 source_point, + const float3 ray_vector, + const float step_size, + const uint3 volume_size) { + float pixel = 0.0f; + // Step 1: compute alpha value at entry and exit point of the volume + float min_alpha, max_alpha; + min_alpha = 0; + max_alpha = CUDART_INF_F; + + if (0.0f != ray_vector.x) { + float volume_min_edge_point = 0 - 0.5f; + float volume_max_edge_point = volume_size.x - 0.5f; + + float reci = 1.0f / ray_vector.x; + float alpha0 = (volume_min_edge_point - source_point.x) * reci; + float alpha1 = (volume_max_edge_point - source_point.x) * reci; + min_alpha = fmin(alpha0, alpha1); + max_alpha = fmax(alpha0, alpha1); + } + + if (0.0f != ray_vector.y) { + float volume_min_edge_point = 0 - 0.5f; + float volume_max_edge_point = volume_size.y - 0.5f; + + float reci = 1.0f / ray_vector.y; + float alpha0 = (volume_min_edge_point - source_point.y) * reci; + float alpha1 = (volume_max_edge_point - source_point.y) * reci; + min_alpha = fmax(min_alpha, fmin(alpha0, alpha1)); + max_alpha = fmin(max_alpha, fmax(alpha0, alpha1)); + } + + if (0.0f != ray_vector.z) { + float volume_min_edge_point = 0 - 0.5f; + float volume_max_edge_point = volume_size.z - 0.5f; + + float reci = 1.0f / ray_vector.z; + float alpha0 = (volume_min_edge_point - source_point.z) * reci; + float alpha1 = (volume_max_edge_point - source_point.z) * reci; + min_alpha = fmax(min_alpha, fmin(alpha0, alpha1)); + max_alpha = fmin(max_alpha, fmax(alpha0, alpha1)); + } + // we start not at the exact entry point + // => we can be sure to be inside the volume + min_alpha += step_size * 0.5f; + + // Step 2: Cast ray if it intersects the volume + // Trapezoidal rule (interpolating function = piecewise linear func) + + float px, py, pz; + + // Entrance boundary + // For the initial interpolated value, only a half stepsize is + // considered in the computation. 
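+  // The +0.5f added to the tex3D coordinates below accounts for CUDA's
+  // texture addressing convention: texel centers sit at half-integer
+  // coordinates, while the ray is parameterized in integer-centered voxel
+  // indices (see also the corresponding remark in the 2D fan-beam projector).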
+ if (min_alpha < max_alpha) { + px = source_point.x + min_alpha * ray_vector.x; + py = source_point.y + min_alpha * ray_vector.y; + pz = source_point.z + min_alpha * ray_vector.z; + + pixel += 0.5f * tex3D(volume_as_texture, px + 0.5f, py + 0.5f, pz + 0.5f); + + min_alpha += step_size; + } + // Mid segments + while (min_alpha < max_alpha) { + px = source_point.x + min_alpha * ray_vector.x; + py = source_point.y + min_alpha * ray_vector.y; + pz = source_point.z + min_alpha * ray_vector.z; + + pixel += tex3D(volume_as_texture, px + 0.5f, py + 0.5f, pz + 0.5f); + + min_alpha += step_size; + } + // Scaling by stepsize; + pixel *= step_size; + + // Last segment of the line + if (pixel > 0.0f) { + pixel -= 0.5f * step_size * + tex3D(volume_as_texture, px + 0.5f, py + 0.5f, pz + 0.5f); + + min_alpha -= step_size; + float last_step_size = max_alpha - min_alpha; + + pixel += 0.5f * last_step_size * + tex3D(volume_as_texture, px + 0.5f, py + 0.5f, pz + 0.5f); + + px = source_point.x + max_alpha * ray_vector.x; + py = source_point.y + max_alpha * ray_vector.y; + pz = source_point.z + max_alpha * ray_vector.z; + + // The last segment of the line integral takes care of the + // varying length. + pixel += 0.5f * last_step_size * + tex3D(volume_as_texture, px + 0.5f, py + 0.5f, pz + 0.5f); + } + return pixel; +} +__global__ void project_3Dcone_beam_kernel_tex_interp( + float *pSinogram, const float *d_inv_AR_matrices, + const float3 *d_src_points, const float sampling_step_size, + const uint3 volume_size, const float3 volume_spacing, + const uint2 detector_size, const int number_of_projections) { + uint2 detector_idx = make_uint2(blockIdx.x * blockDim.x + threadIdx.x, + blockIdx.y * blockDim.y + threadIdx.y); + uint projection_number = blockIdx.z; + if (detector_idx.x >= detector_size.x || detector_idx.y >= detector_size.y || + blockIdx.z >= number_of_projections) { + return; + } + // //Preparations: + d_inv_AR_matrices += projection_number * 9; + float3 source_point = d_src_points[projection_number]; + + // Compute ray direction + const float rx = d_inv_AR_matrices[2] + + detector_idx.y * d_inv_AR_matrices[1] + + detector_idx.x * d_inv_AR_matrices[0]; + const float ry = d_inv_AR_matrices[5] + + detector_idx.y * d_inv_AR_matrices[4] + + detector_idx.x * d_inv_AR_matrices[3]; + const float rz = d_inv_AR_matrices[8] + + detector_idx.y * d_inv_AR_matrices[7] + + detector_idx.x * d_inv_AR_matrices[6]; + + float3 ray_vector = make_float3(rx, ry, rz); + ray_vector = normalize(ray_vector); + + float pixel = kernel_project3D_tex_interp(source_point, ray_vector, + sampling_step_size, volume_size); + + pixel *= sqrt( + (ray_vector.x * volume_spacing.x) * (ray_vector.x * volume_spacing.x) + + (ray_vector.y * volume_spacing.y) * (ray_vector.y * volume_spacing.y) + + (ray_vector.z * volume_spacing.z) * (ray_vector.z * volume_spacing.z)); + + unsigned sinogram_idx = + projection_number * detector_size.y * detector_size.x + + detector_idx.y * detector_size.x + detector_idx.x; + + pSinogram[sinogram_idx] = pixel; + return; +} + +/*************** WARNING ******************./ + * + * Tensorflow is allocating the whole GPU memory for itself and just leave a + * small slack memory using cudaMalloc and cudaMalloc3D will allocate memory in + * this small slack memory ! Therefore, currently only small volumes can be used + * (they have to fit into the slack memory which TF does not allocae !) + * + * This is the kernel based on texture interpolation, thus, the allocations + * are not within the Tensorflow managed memory. 
If memory errors occure: + * 1. start Tensorflow with less gpu memory and allow growth + * 2. switch to software-based interpolation. + * + * TODO: use context->allocate_tmp and context->allocate_persistent instead of + * cudaMalloc for the inv_AR_matrix and src_points array : + * https://stackoverflow.com/questions/48580580/tensorflow-new-op-cuda-kernel-memory-managment + * + */ +void Cone_Projection_Kernel_Tex_Interp_Launcher( + const float *__restrict__ volume_ptr, float *out, + const float *inv_AR_matrix, const float *src_points, + const int number_of_projections, const int volume_width, + const int volume_height, const int volume_depth, + const float volume_spacing_x, const float volume_spacing_y, + const float volume_spacing_z, const int detector_width, + const int detector_height, const float step_size) { + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); + volume_as_texture.addressMode[0] = cudaAddressModeBorder; + volume_as_texture.addressMode[1] = cudaAddressModeBorder; + volume_as_texture.addressMode[2] = cudaAddressModeBorder; + volume_as_texture.filterMode = cudaFilterModeLinear; + volume_as_texture.normalized = false; + + // COPY inv AR matrix to graphics card as float array + auto matrices_size_b = number_of_projections * 9 * sizeof(float); + float *d_inv_AR_matrices; + gpuErrchk(cudaMalloc(&d_inv_AR_matrices, matrices_size_b)); + gpuErrchk(cudaMemcpy(d_inv_AR_matrices, inv_AR_matrix, matrices_size_b, + cudaMemcpyHostToDevice)); + // COPY source points to graphics card as float3 + auto src_points_size_b = number_of_projections * sizeof(float3); + float3 *d_src_points; + gpuErrchk(cudaMalloc(&d_src_points, src_points_size_b)); + gpuErrchk(cudaMemcpy(d_src_points, src_points, src_points_size_b, + cudaMemcpyHostToDevice)); + + // COPY volume to graphics card + // Malloc cuda array for texture + cudaExtent volume_extent = + make_cudaExtent(volume_width, volume_height, volume_depth); + cudaExtent volume_extent_byte = make_cudaExtent(sizeof(float) * volume_width, + volume_height, volume_depth); + + cudaPitchedPtr d_volumeMem = make_cudaPitchedPtr( + const_cast<float *>(volume_ptr), volume_width * sizeof(float), + volume_width, volume_height); + + cudaArray *volume_array; + gpuErrchk(cudaMalloc3DArray(&volume_array, &channelDesc, volume_extent)); + + cudaMemcpy3DParms copyParams = {0}; + copyParams.srcPtr = d_volumeMem; + copyParams.dstArray = volume_array; + copyParams.extent = volume_extent; + copyParams.kind = cudaMemcpyDeviceToDevice; + + gpuErrchk(cudaMemcpy3D(©Params)); + + gpuErrchk( + cudaBindTextureToArray(volume_as_texture, volume_array, channelDesc)) + + uint3 volume_size = make_uint3(volume_width, volume_height, volume_depth); + float3 volume_spacing = + make_float3(volume_spacing_x, volume_spacing_y, volume_spacing_z); + + uint2 detector_size = make_uint2(detector_width, detector_height); + + const dim3 blocksize = dim3(BLOCKSIZE_X, BLOCKSIZE_Y, 1); + const dim3 gridsize = + dim3(detector_size.x / blocksize.x + 1, detector_size.y / blocksize.y + 1, + number_of_projections + 1); + + project_3Dcone_beam_kernel_tex_interp<<<gridsize, blocksize>>>( + out, d_inv_AR_matrices, d_src_points, step_size, volume_size, + volume_spacing, detector_size, number_of_projections); + + cudaDeviceSynchronize(); + + // check for errors + gpuErrchk(cudaPeekAtLastError()); + gpuErrchk(cudaFreeArray(volume_array)); + gpuErrchk(cudaUnbindTexture(volume_as_texture)); + gpuErrchk(cudaFree(d_inv_AR_matrices)); + gpuErrchk(cudaFree(d_src_points)); +} diff --git 
a/generated_files/fan_backprojector_2D_CudaKernel.cu b/generated_files/fan_backprojector_2D_CudaKernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..b3833ca4bea4e28550092896493e6cd24641edcd --- /dev/null +++ b/generated_files/fan_backprojector_2D_CudaKernel.cu @@ -0,0 +1,134 @@ +/* + * Copyright [2019] [Christopher Syben] + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Voxel-driven fan-beam back-projection CUDA kernel. + * PYRO-NN is developed as an Open Source project under the Apache License, + * Version 2.0. + */ +#include "helper_headers/helper_geometry_gpu.h" +#include "helper_headers/helper_grid.h" +#include "helper_headers/helper_math.h" + + +texture<float, cudaTextureType2D, cudaReadModeElementType> sinogram_as_texture; +#define CUDART_INF_F __int_as_float(0x7f800000) + +__global__ void backproject_2Dfan_beam_kernel( + float *pVolume, const float2 *d_rays, const int number_of_projections, + const float sampling_step_size, const int2 volume_size, + const float2 volume_spacing, const float2 volume_origin, + const int detector_size, const float detector_spacing, + const float detector_origin, const float sid, const float sdd) { + const float pi = 3.14159265359f; + unsigned int volume_x = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int volume_y = blockIdx.y * blockDim.y + threadIdx.y; + if (volume_x >= volume_size.x || volume_y >= volume_size.y) { + return; + } + // Preparations: + const float2 pixel_coordinate = index_to_physical( + make_float2(volume_x, volume_y), volume_origin, volume_spacing); + float pixel_value = 0.0f; + + for (int n = 0; n < number_of_projections; n++) { + float2 central_ray = d_rays[n]; + float2 detector_vec = make_float2(-central_ray.y, central_ray.x); + + float2 source_position = central_ray * (-sid); + float2 central_point = source_position + central_ray * sdd; + + float2 intersection = + intersectLines2D(pixel_coordinate, source_position, central_point, + central_point + detector_vec); + float distance_weight = + 1.0f / (float)length(pixel_coordinate - source_position); + float s = dot(intersection, detector_vec); + float s_idx = physical_to_index(s, detector_origin, detector_spacing); + + pixel_value += tex2D(sinogram_as_texture, s_idx + 0.5f, n + 0.5f) * + distance_weight * distance_weight; + } + + const unsigned volume_linearized_idx = volume_y * volume_size.x + volume_x; + pVolume[volume_linearized_idx] = + sid * sdd * pi * pixel_value / number_of_projections; + + return; +} +/*************** WARNING ******************./ + * + * Tensorflow is allocating the whole GPU memory for itself and just leave a + * small slack memory using cudaMalloc and cudaMalloc3D will allocate memory in + * this small slack memory ! Therefore, currently only small volumes can be used + * (they have to fit into the slack memory which TF does not allocae !) + * + * This is the kernel based on texture interpolation, thus, the allocations + * are not within the Tensorflow managed memory. If memory errors occure: + * 1. 
start Tensorflow with less gpu memory and allow growth + * 2. TODO: no software interpolation based 2D verions are available yet + * + * TODO: use context->allocate_tmp and context->allocate_persistent instead of + * cudaMalloc for the ray_vectors array : + * https://stackoverflow.com/questions/48580580/tensorflow-new-op-cuda-kernel-memory-managment + * + */ +void Fan_Backprojection2D_Kernel_Launcher( + const float *sinogram_ptr, float *out, const float *ray_vectors, + const int number_of_projections, const int volume_width, + const int volume_height, const float volume_spacing_x, + const float volume_spacing_y, const float volume_origin_x, + const float volume_origin_y, const int detector_size, + const float detector_spacing, const float detector_origin, const float sid, + const float sdd) { + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); + sinogram_as_texture.addressMode[0] = cudaAddressModeBorder; + sinogram_as_texture.addressMode[1] = cudaAddressModeBorder; + sinogram_as_texture.filterMode = cudaFilterModeLinear; + sinogram_as_texture.normalized = false; + + cudaArray *sinogram_array; + cudaMallocArray(&sinogram_array, &channelDesc, detector_size, + number_of_projections); + cudaMemcpyToArray(sinogram_array, 0, 0, sinogram_ptr, + detector_size * number_of_projections * sizeof(float), + cudaMemcpyHostToDevice); + cudaBindTextureToArray(sinogram_as_texture, sinogram_array, channelDesc); + + auto ray_size_b = number_of_projections * sizeof(float2); + float2 *d_rays; + cudaMalloc(&d_rays, ray_size_b); + cudaMemcpy(d_rays, ray_vectors, ray_size_b, cudaMemcpyHostToDevice); + + float sampling_step_size = 1; + + int2 volume_size = make_int2(volume_width, volume_height); + float2 volume_spacing = make_float2(volume_spacing_x, volume_spacing_y); + float2 volume_origin = make_float2(volume_origin_x, volume_origin_y); + + const unsigned block_size = 16; + const dim3 threads_per_block = dim3(block_size, block_size); + const dim3 num_blocks = dim3(volume_width / threads_per_block.x + 1, + volume_height / threads_per_block.y + 1); + + backproject_2Dfan_beam_kernel<<<num_blocks, threads_per_block>>>( + out, d_rays, number_of_projections, sampling_step_size, volume_size, + volume_spacing, volume_origin, detector_size, detector_spacing, + detector_origin, sid, sdd); + + cudaUnbindTexture(sinogram_as_texture); + cudaFreeArray(sinogram_array); + cudaFree(d_rays); +} + diff --git a/generated_files/fan_projector_2D_CudaKernel.cu b/generated_files/fan_projector_2D_CudaKernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..562c6fe23ecddd55e625491ecb6e5cee5ffdecc4 --- /dev/null +++ b/generated_files/fan_projector_2D_CudaKernel.cu @@ -0,0 +1,239 @@ +/* + * Copyright [2019] [Christopher Syben] + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Ray-driven fan-beam projector CUDA kernel. + * PYRO-NN is developed as an Open Source project under the Apache License, + * Version 2.0. 
+ */ + +#include "helper_headers/helper_grid.h" +#include "helper_headers/helper_math.h" + +texture<float, cudaTextureType2D, cudaReadModeElementType> volume_as_texture; +#define CUDART_INF_F __int_as_float(0x7f800000) + +__device__ float kernel_project2D(const float2 source_point, + const float2 ray_vector, + const float step_size, const int2 volume_size, + const float2 volume_origin, + const float2 volume_spacing) { + + unsigned int detector_idx = blockIdx.x * blockDim.x + threadIdx.x; + int projection_idx = blockIdx.y; + + float pixel = 0.0f; + // Step 1: compute alpha value at entry and exit point of the volume + float min_alpha, max_alpha; + min_alpha = 0; + max_alpha = CUDART_INF_F; + + if (0.0f != ray_vector.x) { + float volume_min_edge_point = + index_to_physical(0, volume_origin.x, volume_spacing.x) - 0.5f; + float volume_max_edge_point = + index_to_physical(volume_size.x, volume_origin.x, volume_spacing.x) - + 0.5f; + + float reci = 1.0f / ray_vector.x; + float alpha0 = (volume_min_edge_point - source_point.x) * reci; + float alpha1 = (volume_max_edge_point - source_point.x) * reci; + min_alpha = fmin(alpha0, alpha1); + max_alpha = fmax(alpha0, alpha1); + } + + if (0.0f != ray_vector.y) { + float volume_min_edge_point = + index_to_physical(0, volume_origin.y, volume_spacing.y) - 0.5f; + float volume_max_edge_point = + index_to_physical(volume_size.y, volume_origin.y, volume_spacing.y) - + 0.5f; + + float reci = 1.0f / ray_vector.y; + float alpha0 = (volume_min_edge_point - source_point.y) * reci; + float alpha1 = (volume_max_edge_point - source_point.y) * reci; + min_alpha = fmax(min_alpha, fmin(alpha0, alpha1)); + max_alpha = fmin(max_alpha, fmax(alpha0, alpha1)); + } + + float px, py; + // pixel = source_point.x + min_alpha * ray_vector.x; + // Entrance boundary + // In CUDA, voxel centers are located at (xx.5, xx.5, xx.5), + // whereas, SwVolume has voxel centers at integers. + // For the initial interpolated value, only a half stepsize is + // considered in the computation. 
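+  // The tex2D lookups below first map the physical sample position into index
+  // space via physical_to_index() from helper_grid.h, i.e.
+  //   index = (physical - origin) / spacing,
+  // and then add 0.5f so that the coordinate addresses the texel center.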
+  if (min_alpha < max_alpha) {
+    px = source_point.x + min_alpha * ray_vector.x;
+    py = source_point.y + min_alpha * ray_vector.y;
+
+    pixel +=
+        0.5f *
+        tex2D(volume_as_texture,
+              physical_to_index(px, volume_origin.x, volume_spacing.x) + 0.5f,
+              physical_to_index(py, volume_origin.y, volume_spacing.y) + 0.5f);
+    min_alpha += step_size;
+  }
+  // Mid segments
+  while (min_alpha < max_alpha) {
+    px = source_point.x + min_alpha * ray_vector.x;
+    py = source_point.y + min_alpha * ray_vector.y;
+    float2 interp_point =
+        physical_to_index(make_float2(px, py), volume_origin, volume_spacing);
+    pixel +=
+        tex2D(volume_as_texture,
+              physical_to_index(px, volume_origin.x, volume_spacing.x) + 0.5f,
+              physical_to_index(py, volume_origin.y, volume_spacing.y) + 0.5f);
+    min_alpha += step_size;
+  }
+  // Scaling by stepsize
+  pixel *= step_size;
+
+  // Last segment of the line
+  if (pixel > 0.0f) {
+    pixel -=
+        0.5f * step_size *
+        tex2D(volume_as_texture,
+              physical_to_index(px, volume_origin.x, volume_spacing.x) + 0.5f,
+              physical_to_index(py, volume_origin.y, volume_spacing.y) + 0.5f);
+    min_alpha -= step_size;
+    float last_step_size = max_alpha - min_alpha;
+    pixel +=
+        0.5f * last_step_size *
+        tex2D(volume_as_texture,
+              physical_to_index(px, volume_origin.x, volume_spacing.x) + 0.5f,
+              physical_to_index(py, volume_origin.y, volume_spacing.y) + 0.5f);
+
+    px = source_point.x + max_alpha * ray_vector.x;
+    py = source_point.y + max_alpha * ray_vector.y;
+    // The last segment of the line integral takes care of the
+    // varying length.
+    pixel +=
+        0.5f * last_step_size *
+        tex2D(volume_as_texture,
+              physical_to_index(px, volume_origin.x, volume_spacing.x) + 0.5f,
+              physical_to_index(py, volume_origin.y, volume_spacing.y) + 0.5f);
+  }
+  return pixel;
+}
+
+__global__ void project_2Dfan_beam_kernel(
+    float *pSinogram, const float2 *d_rays, const int number_of_projections,
+    const float sampling_step_size, const int2 volume_size,
+    const float2 volume_spacing, const float2 volume_origin,
+    const int detector_size, const float detector_spacing,
+    const float detector_origin, const float sid, const float sdd) {
+  unsigned int detector_idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (detector_idx >= detector_size) {
+    return;
+  }
+  // Preparations:
+
+  // Assume a source-isocenter distance to compute the start of the ray,
+  // although sid is not necessary for a parallel-beam geometry
+  // TODO: use volume spacing to reduce ray length
+  int projection_idx = blockIdx.y;
+  float2 central_ray_vector = d_rays[projection_idx];
+
+  // create detector coordinate system (u,v) w.r.t. the ray
+  float2 u_vec = make_float2(-central_ray_vector.y, central_ray_vector.x);
+  // calculate physical coordinate of detector pixel
+  float u = index_to_physical(detector_idx, detector_origin, detector_spacing);
+  // float u = detector_idx * detector_spacing + detector_mid;
+
+  // Calculate "source" point (start point for the parallel ray), so we can use
+  // the projection kernel. Assume a source-isocenter distance to compute the
+  // start of the ray, although sid is not necessary for a parallel-beam
+  // geometry.
+  float2 source_point = central_ray_vector * (-sid);
+
+  float2 detector_point_world =
+      source_point + central_ray_vector * sdd + u_vec * u;
+  float2 ray_vector = normalize(detector_point_world - source_point);
+
+  float pixel = kernel_project2D(source_point, ray_vector,
+                                 sampling_step_size *
+                                     fmin(volume_spacing.x, volume_spacing.y),
+                                 volume_size, volume_origin, volume_spacing);
+
+  pixel *= sqrt(
+      (ray_vector.x * volume_spacing.x) * (ray_vector.x * volume_spacing.x) +
+      (ray_vector.y * volume_spacing.y) * (ray_vector.y * volume_spacing.y));
+
+  unsigned sinogram_idx = projection_idx * detector_size + detector_idx;
+  pSinogram[sinogram_idx] = pixel;
+
+  return;
+}
+/*************** WARNING ******************./
+ *
+ * Tensorflow allocates the whole GPU memory for itself and leaves only a
+ * small amount of slack memory; cudaMalloc and cudaMalloc3D will allocate
+ * memory in this small slack memory! Therefore, currently only small volumes
+ * can be used (they have to fit into the slack memory which TF does not
+ * allocate!)
+ *
+ * This is the kernel based on texture interpolation, thus, the allocations
+ * are not within the Tensorflow managed memory. If memory errors occur:
+ * 1. start Tensorflow with less gpu memory and allow growth
+ * 2. TODO: no software-interpolation-based 2D versions are available yet
+ *
+ * TODO: use context->allocate_tmp and context->allocate_persistent instead of
+ * cudaMalloc for the ray_vectors array :
+ * https://stackoverflow.com/questions/48580580/tensorflow-new-op-cuda-kernel-memory-managment
+ *
+ */
+void Fan_Projection_Kernel_Launcher(
+    const float *volume_ptr, float *out, const float *ray_vectors,
+    const int number_of_projections, const int volume_width,
+    const int volume_height, const float volume_spacing_x,
+    const float volume_spacing_y, const float volume_origin_x,
+    const float volume_origin_y, const int detector_size,
+    const float detector_spacing, const float detector_origin, const float sid,
+    const float sdd) {
+  cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+  volume_as_texture.addressMode[0] = cudaAddressModeBorder;
+  volume_as_texture.addressMode[1] = cudaAddressModeBorder;
+  volume_as_texture.filterMode = cudaFilterModeLinear;
+  volume_as_texture.normalized = false;
+
+  cudaArray *volume_array;
+  cudaMallocArray(&volume_array, &channelDesc, volume_width, volume_height);
+  cudaMemcpyToArray(volume_array, 0, 0, volume_ptr,
+                    volume_width * volume_height * sizeof(float),
+                    cudaMemcpyHostToDevice);
+  cudaBindTextureToArray(volume_as_texture, volume_array, channelDesc);
+
+  auto ray_size_b = number_of_projections * sizeof(float2);
+  float2 *d_rays;
+  cudaMalloc(&d_rays, ray_size_b);
+  cudaMemcpy(d_rays, ray_vectors, ray_size_b, cudaMemcpyHostToDevice);
+
+  float sampling_step_size = 0.2;
+
+  int2 volume_size = make_int2(volume_width, volume_height);
+  float2 volume_spacing = make_float2(volume_spacing_x, volume_spacing_y);
+  float2 volume_origin = make_float2(volume_origin_x, volume_origin_y);
+
+  const unsigned blocksize = 256;
+  const dim3 gridsize =
+      dim3((detector_size / blocksize) + 1, number_of_projections);
+  project_2Dfan_beam_kernel<<<gridsize, blocksize>>>(
+      out, d_rays, number_of_projections, sampling_step_size, volume_size,
+      volume_spacing, volume_origin, detector_size, detector_spacing,
+      detector_origin, sid, sdd);
+
+  cudaUnbindTexture(volume_as_texture);
+  cudaFreeArray(volume_array);
+  cudaFree(d_rays);
+}
diff --git a/generated_files/helper_headers/helper_eigen.h b/generated_files/helper_headers/helper_eigen.h
new file mode 100644
index 0000000000000000000000000000000000000000..b31f845ad283f49cdc54c80253c528f441650ace
--- /dev/null
+++ b/generated_files/helper_headers/helper_eigen.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright [2019] [Christopher Syben]
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Conversion from: https://stackoverflow.com/questions/48795789/eigen-unsupported-tensor-to-eigen-matrix + * PYRO-NN is developed as an Open Source project under the Apache License, Version 2.0. +*/ +#include <unsupported/Eigen/CXX11/Tensor> + +template<typename T> +using MatrixType = Eigen::Matrix<T,Eigen::Dynamic, Eigen::Dynamic>; + +template<typename Scalar,int rank, typename sizeType> +MatrixType<Scalar> Tensor_to_Matrix(const Eigen::Tensor<Scalar,rank> &tensor,const sizeType rows,const sizeType cols) +{ + return Eigen::Map<const MatrixType<Scalar>> (tensor.data(), rows,cols); +} + +template<typename Scalar, typename... Dims> +Eigen::Tensor< Scalar, sizeof... (Dims)> Matrix_to_Tensor(const MatrixType<Scalar> &matrix, Dims... dims) +{ + constexpr int rank = sizeof... (Dims); + return Eigen::TensorMap<Eigen::Tensor<const Scalar, rank>>(matrix.data(), {dims...}); +} \ No newline at end of file diff --git a/generated_files/helper_headers/helper_geometry_cpu.h b/generated_files/helper_headers/helper_geometry_cpu.h new file mode 100644 index 0000000000000000000000000000000000000000..cc51aaf32f7818ef6e12ad386bd7c8aa2289344c --- /dev/null +++ b/generated_files/helper_headers/helper_geometry_cpu.h @@ -0,0 +1,43 @@ +/* + * Copyright [2019] [Christopher Syben] + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Helper methods to prepare projection matrices for projector kernel + * PYRO-NN is developed as an Open Source project under the Apache License, Version 2.0. +*/ +#ifndef HELPER_GEOMETRY_CPU_H +#define HELPER_GEOMETRY_CPU_H +#pragma once +#include <Eigen/Dense> +#include <Eigen/SVD> + +namespace Geometry{ + + /// Compute right null-space of A + Eigen::VectorXf nullspace(const Eigen::MatrixXf& A) + { + Eigen::JacobiSVD<Eigen::MatrixXf> svd(A, Eigen::ComputeFullU | Eigen::ComputeFullV); + auto V=svd.matrixV(); + return V.col(V.cols()-1); + } + /// Extract the world coordinates of the camera center from a projection matrix. (SVD based implementation) + Eigen::Vector4f getCameraCenter(const Eigen::MatrixXf& P) + { + Eigen::Vector4f C = Geometry::nullspace(P); + if (C(3)<-1e-12 || C(3)>1e-12) + C=C/C(3); // Def:Camera centers are always positive. 
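+        // Example: a null-space vector C = (2, 4, 6, 2)^T is rescaled to
+        // (1, 2, 3, 1)^T, i.e. the camera center lies at (1, 2, 3) in world
+        // coordinates; if C(3) is numerically zero, the center is at infinity
+        // and C is returned unnormalized.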
+ return C; + } +} +#endif \ No newline at end of file diff --git a/generated_files/helper_headers/helper_geometry_gpu.h b/generated_files/helper_headers/helper_geometry_gpu.h new file mode 100755 index 0000000000000000000000000000000000000000..b47c87de9264e37e77f052bfbc19239dd782d7b1 --- /dev/null +++ b/generated_files/helper_headers/helper_geometry_gpu.h @@ -0,0 +1,44 @@ +/* + * Copyright [2019] [Christopher Syben] + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Computes line intersection for projector kernel + * Implementation is adapted from CONRAD + * PYRO-NN is developed as an Open Source project under the Apache License, Version 2.0. +*/ +#pragma once +#ifndef HELPER_GEOMETRY_GPU_H +#define HELPER_GEOMETRY_GPU_H + +#include "helper_math.h" + +inline __device__ float2 intersectLines2D(float2 p1, float2 p2, float2 p3, float2 p4) +{ + float dNom = (p1.x - p2.x) * (p3.y - p4.y) - (p1.y - p2.y) * (p3.x - p4.x); + + if (dNom < 0.000001f && dNom > -0.000001f) + { + float2 retValue = {NAN, NAN}; + return retValue; + } + float x = (p1.x * p2.y - p1.y * p2.x) * (p3.x - p4.x) - (p1.x - p2.x) * (p3.x * p4.y - p3.y * p4.x); + float y = (p1.x * p2.y - p1.y * p2.x) * (p3.y - p4.y) - (p1.y - p2.y) * (p3.x * p4.y - p3.y * p4.x); + + x /= dNom; + y /= dNom; + float2 isectPt = {x, y}; + return isectPt; +} + +#endif diff --git a/generated_files/helper_headers/helper_grid.h b/generated_files/helper_headers/helper_grid.h new file mode 100755 index 0000000000000000000000000000000000000000..61fa5e3d8b5b92d5043a547905c7b64b7b7b845e --- /dev/null +++ b/generated_files/helper_headers/helper_grid.h @@ -0,0 +1,55 @@ +/* + * Copyright [2019] [Christopher Syben] + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Helper methods to convert index to physical coordinates and vice versa + * PYRO-NN is developed as an Open Source project under the Apache License, Version 2.0. 
+*/ +#ifndef HELPER_GRID_H +#define HELPER_GRID_H + + +inline __host__ __device__ float index_to_physical(float index, float origin, float spacing) +{ + return index * spacing + origin; +} + +inline __host__ __device__ float physical_to_index(float physical, float origin, float spacing) +{ + return (physical - origin) / spacing; +} + +inline __host__ __device__ float2 index_to_physical(float2 index, float2 origin, float2 spacing) +{ + return make_float2(index.x * spacing.x + origin.x, index.y * spacing.y + origin.y); +} + +inline __host__ __device__ float2 physical_to_index(float2 physical, float2 origin, float2 spacing) +{ + return make_float2((physical.x - origin.x) / spacing.x, (physical.y - origin.y) / spacing.y); +} + +inline __host__ __device__ float3 index_to_physical(float3 index, float3 origin, float3 spacing) +{ + return make_float3(index.x * spacing.x + origin.x, index.y * spacing.y + origin.y, index.z * spacing.z + origin.z); +} + +inline __host__ __device__ float3 physical_to_index(float3 physical, float3 origin, float3 spacing) +{ + return make_float3((physical.x - origin.x) / spacing.x, (physical.y - origin.y) / spacing.y, (physical.z - origin.z) / spacing.z); +} + +#endif + + diff --git a/generated_files/helper_headers/helper_math.h b/generated_files/helper_headers/helper_math.h new file mode 100755 index 0000000000000000000000000000000000000000..d56d461674938e8e9190c42acfeec6b61bcd2110 --- /dev/null +++ b/generated_files/helper_headers/helper_math.h @@ -0,0 +1,1449 @@ +/** + * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + +/* + * This file implements common mathematical operations on vector types + * (float3, float4 etc.) since these are not provided as standard by CUDA. + * + * The syntax is modeled on the Cg standard library. + * + * This is part of the Helper library includes + * + * Thanks to Linh Hah for additions and fixes. + */ + +#ifndef HELPER_MATH_H +#define HELPER_MATH_H + +#include "cuda_runtime.h" + +typedef unsigned int uint; +typedef unsigned short ushort; + +#ifndef __CUDACC__ +#include <math.h> + +//////////////////////////////////////////////////////////////////////////////// +// host implementations of CUDA functions +//////////////////////////////////////////////////////////////////////////////// + +inline float fminf(float a, float b) +{ + return a < b ? a : b; +} + +inline float fmaxf(float a, float b) +{ + return a > b ? a : b; +} + +inline int max(int a, int b) +{ + return a > b ? a : b; +} + +inline int min(int a, int b) +{ + return a < b ? 
a : b; +} + +inline float rsqrtf(float x) +{ + return 1.0f / sqrtf(x); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// constructors +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 make_float2(float s) +{ + return make_float2(s, s); +} +inline __host__ __device__ float2 make_float2(float3 a) +{ + return make_float2(a.x, a.y); +} +inline __host__ __device__ float2 make_float2(int2 a) +{ + return make_float2(float(a.x), float(a.y)); +} +inline __host__ __device__ float2 make_float2(uint2 a) +{ + return make_float2(float(a.x), float(a.y)); +} + +inline __host__ __device__ int2 make_int2(int s) +{ + return make_int2(s, s); +} +inline __host__ __device__ int2 make_int2(int3 a) +{ + return make_int2(a.x, a.y); +} +inline __host__ __device__ int2 make_int2(uint2 a) +{ + return make_int2(int(a.x), int(a.y)); +} +inline __host__ __device__ int2 make_int2(float2 a) +{ + return make_int2(int(a.x), int(a.y)); +} + +inline __host__ __device__ uint2 make_uint2(uint s) +{ + return make_uint2(s, s); +} +inline __host__ __device__ uint2 make_uint2(uint3 a) +{ + return make_uint2(a.x, a.y); +} +inline __host__ __device__ uint2 make_uint2(int2 a) +{ + return make_uint2(uint(a.x), uint(a.y)); +} + +inline __host__ __device__ float3 make_float3(float s) +{ + return make_float3(s, s, s); +} +inline __host__ __device__ float3 make_float3(float2 a) +{ + return make_float3(a.x, a.y, 0.0f); +} +inline __host__ __device__ float3 make_float3(float2 a, float s) +{ + return make_float3(a.x, a.y, s); +} +inline __host__ __device__ float3 make_float3(float4 a) +{ + return make_float3(a.x, a.y, a.z); +} +inline __host__ __device__ float3 make_float3(int3 a) +{ + return make_float3(float(a.x), float(a.y), float(a.z)); +} +inline __host__ __device__ float3 make_float3(uint3 a) +{ + return make_float3(float(a.x), float(a.y), float(a.z)); +} + +inline __host__ __device__ int3 make_int3(int s) +{ + return make_int3(s, s, s); +} +inline __host__ __device__ int3 make_int3(int2 a) +{ + return make_int3(a.x, a.y, 0); +} +inline __host__ __device__ int3 make_int3(int2 a, int s) +{ + return make_int3(a.x, a.y, s); +} +inline __host__ __device__ int3 make_int3(uint3 a) +{ + return make_int3(int(a.x), int(a.y), int(a.z)); +} +inline __host__ __device__ int3 make_int3(float3 a) +{ + return make_int3(int(a.x), int(a.y), int(a.z)); +} + +inline __host__ __device__ uint3 make_uint3(uint s) +{ + return make_uint3(s, s, s); +} +inline __host__ __device__ uint3 make_uint3(uint2 a) +{ + return make_uint3(a.x, a.y, 0); +} +inline __host__ __device__ uint3 make_uint3(uint2 a, uint s) +{ + return make_uint3(a.x, a.y, s); +} +inline __host__ __device__ uint3 make_uint3(uint4 a) +{ + return make_uint3(a.x, a.y, a.z); +} +inline __host__ __device__ uint3 make_uint3(int3 a) +{ + return make_uint3(uint(a.x), uint(a.y), uint(a.z)); +} + +inline __host__ __device__ float4 make_float4(float s) +{ + return make_float4(s, s, s, s); +} +inline __host__ __device__ float4 make_float4(float3 a) +{ + return make_float4(a.x, a.y, a.z, 0.0f); +} +inline __host__ __device__ float4 make_float4(float3 a, float w) +{ + return make_float4(a.x, a.y, a.z, w); +} +inline __host__ __device__ float4 make_float4(int4 a) +{ + return make_float4(float(a.x), float(a.y), float(a.z), float(a.w)); +} +inline __host__ __device__ float4 make_float4(uint4 a) +{ + return make_float4(float(a.x), float(a.y), float(a.z), float(a.w)); +} + +inline __host__ __device__ 
int4 make_int4(int s) +{ + return make_int4(s, s, s, s); +} +inline __host__ __device__ int4 make_int4(int3 a) +{ + return make_int4(a.x, a.y, a.z, 0); +} +inline __host__ __device__ int4 make_int4(int3 a, int w) +{ + return make_int4(a.x, a.y, a.z, w); +} +inline __host__ __device__ int4 make_int4(uint4 a) +{ + return make_int4(int(a.x), int(a.y), int(a.z), int(a.w)); +} +inline __host__ __device__ int4 make_int4(float4 a) +{ + return make_int4(int(a.x), int(a.y), int(a.z), int(a.w)); +} + + +inline __host__ __device__ uint4 make_uint4(uint s) +{ + return make_uint4(s, s, s, s); +} +inline __host__ __device__ uint4 make_uint4(uint3 a) +{ + return make_uint4(a.x, a.y, a.z, 0); +} +inline __host__ __device__ uint4 make_uint4(uint3 a, uint w) +{ + return make_uint4(a.x, a.y, a.z, w); +} +inline __host__ __device__ uint4 make_uint4(int4 a) +{ + return make_uint4(uint(a.x), uint(a.y), uint(a.z), uint(a.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// negate +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 operator-(float2 &a) +{ + return make_float2(-a.x, -a.y); +} +inline __host__ __device__ int2 operator-(int2 &a) +{ + return make_int2(-a.x, -a.y); +} +inline __host__ __device__ float3 operator-(float3 &a) +{ + return make_float3(-a.x, -a.y, -a.z); +} +inline __host__ __device__ int3 operator-(int3 &a) +{ + return make_int3(-a.x, -a.y, -a.z); +} +inline __host__ __device__ float4 operator-(float4 &a) +{ + return make_float4(-a.x, -a.y, -a.z, -a.w); +} +inline __host__ __device__ int4 operator-(int4 &a) +{ + return make_int4(-a.x, -a.y, -a.z, -a.w); +} + +//////////////////////////////////////////////////////////////////////////////// +// addition +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 operator+(float2 a, float2 b) +{ + return make_float2(a.x + b.x, a.y + b.y); +} +inline __host__ __device__ void operator+=(float2 &a, float2 b) +{ + a.x += b.x; + a.y += b.y; +} +inline __host__ __device__ float2 operator+(float2 a, float b) +{ + return make_float2(a.x + b, a.y + b); +} +inline __host__ __device__ float2 operator+(float b, float2 a) +{ + return make_float2(a.x + b, a.y + b); +} +inline __host__ __device__ void operator+=(float2 &a, float b) +{ + a.x += b; + a.y += b; +} + +inline __host__ __device__ int2 operator+(int2 a, int2 b) +{ + return make_int2(a.x + b.x, a.y + b.y); +} +inline __host__ __device__ void operator+=(int2 &a, int2 b) +{ + a.x += b.x; + a.y += b.y; +} +inline __host__ __device__ int2 operator+(int2 a, int b) +{ + return make_int2(a.x + b, a.y + b); +} +inline __host__ __device__ int2 operator+(int b, int2 a) +{ + return make_int2(a.x + b, a.y + b); +} +inline __host__ __device__ void operator+=(int2 &a, int b) +{ + a.x += b; + a.y += b; +} + +inline __host__ __device__ uint2 operator+(uint2 a, uint2 b) +{ + return make_uint2(a.x + b.x, a.y + b.y); +} +inline __host__ __device__ void operator+=(uint2 &a, uint2 b) +{ + a.x += b.x; + a.y += b.y; +} +inline __host__ __device__ uint2 operator+(uint2 a, uint b) +{ + return make_uint2(a.x + b, a.y + b); +} +inline __host__ __device__ uint2 operator+(uint b, uint2 a) +{ + return make_uint2(a.x + b, a.y + b); +} +inline __host__ __device__ void operator+=(uint2 &a, uint b) +{ + a.x += b; + a.y += b; +} + + +inline __host__ __device__ float3 operator+(float3 a, float3 b) +{ + return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); +} +inline 
__host__ __device__ void operator+=(float3 &a, float3 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; +} +inline __host__ __device__ float3 operator+(float3 a, float b) +{ + return make_float3(a.x + b, a.y + b, a.z + b); +} +inline __host__ __device__ void operator+=(float3 &a, float b) +{ + a.x += b; + a.y += b; + a.z += b; +} + +inline __host__ __device__ int3 operator+(int3 a, int3 b) +{ + return make_int3(a.x + b.x, a.y + b.y, a.z + b.z); +} +inline __host__ __device__ void operator+=(int3 &a, int3 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; +} +inline __host__ __device__ int3 operator+(int3 a, int b) +{ + return make_int3(a.x + b, a.y + b, a.z + b); +} +inline __host__ __device__ void operator+=(int3 &a, int b) +{ + a.x += b; + a.y += b; + a.z += b; +} + +inline __host__ __device__ uint3 operator+(uint3 a, uint3 b) +{ + return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z); +} +inline __host__ __device__ void operator+=(uint3 &a, uint3 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; +} +inline __host__ __device__ uint3 operator+(uint3 a, uint b) +{ + return make_uint3(a.x + b, a.y + b, a.z + b); +} +inline __host__ __device__ void operator+=(uint3 &a, uint b) +{ + a.x += b; + a.y += b; + a.z += b; +} + +inline __host__ __device__ int3 operator+(int b, int3 a) +{ + return make_int3(a.x + b, a.y + b, a.z + b); +} +inline __host__ __device__ uint3 operator+(uint b, uint3 a) +{ + return make_uint3(a.x + b, a.y + b, a.z + b); +} +inline __host__ __device__ float3 operator+(float b, float3 a) +{ + return make_float3(a.x + b, a.y + b, a.z + b); +} + +inline __host__ __device__ float4 operator+(float4 a, float4 b) +{ + return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} +inline __host__ __device__ void operator+=(float4 &a, float4 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; + a.w += b.w; +} +inline __host__ __device__ float4 operator+(float4 a, float b) +{ + return make_float4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ float4 operator+(float b, float4 a) +{ + return make_float4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ void operator+=(float4 &a, float b) +{ + a.x += b; + a.y += b; + a.z += b; + a.w += b; +} + +inline __host__ __device__ int4 operator+(int4 a, int4 b) +{ + return make_int4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} +inline __host__ __device__ void operator+=(int4 &a, int4 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; + a.w += b.w; +} +inline __host__ __device__ int4 operator+(int4 a, int b) +{ + return make_int4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ int4 operator+(int b, int4 a) +{ + return make_int4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ void operator+=(int4 &a, int b) +{ + a.x += b; + a.y += b; + a.z += b; + a.w += b; +} + +inline __host__ __device__ uint4 operator+(uint4 a, uint4 b) +{ + return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} +inline __host__ __device__ void operator+=(uint4 &a, uint4 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; + a.w += b.w; +} +inline __host__ __device__ uint4 operator+(uint4 a, uint b) +{ + return make_uint4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ uint4 operator+(uint b, uint4 a) +{ + return make_uint4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ void operator+=(uint4 &a, uint b) +{ + a.x += b; + a.y += b; + a.z += b; + a.w += b; +} + +//////////////////////////////////////////////////////////////////////////////// +// subtract 
+//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 operator-(float2 a, float2 b) +{ + return make_float2(a.x - b.x, a.y - b.y); +} +inline __host__ __device__ void operator-=(float2 &a, float2 b) +{ + a.x -= b.x; + a.y -= b.y; +} +inline __host__ __device__ float2 operator-(float2 a, float b) +{ + return make_float2(a.x - b, a.y - b); +} +inline __host__ __device__ float2 operator-(float b, float2 a) +{ + return make_float2(b - a.x, b - a.y); +} +inline __host__ __device__ void operator-=(float2 &a, float b) +{ + a.x -= b; + a.y -= b; +} + +inline __host__ __device__ int2 operator-(int2 a, int2 b) +{ + return make_int2(a.x - b.x, a.y - b.y); +} +inline __host__ __device__ void operator-=(int2 &a, int2 b) +{ + a.x -= b.x; + a.y -= b.y; +} +inline __host__ __device__ int2 operator-(int2 a, int b) +{ + return make_int2(a.x - b, a.y - b); +} +inline __host__ __device__ int2 operator-(int b, int2 a) +{ + return make_int2(b - a.x, b - a.y); +} +inline __host__ __device__ void operator-=(int2 &a, int b) +{ + a.x -= b; + a.y -= b; +} + +inline __host__ __device__ uint2 operator-(uint2 a, uint2 b) +{ + return make_uint2(a.x - b.x, a.y - b.y); +} +inline __host__ __device__ void operator-=(uint2 &a, uint2 b) +{ + a.x -= b.x; + a.y -= b.y; +} +inline __host__ __device__ uint2 operator-(uint2 a, uint b) +{ + return make_uint2(a.x - b, a.y - b); +} +inline __host__ __device__ uint2 operator-(uint b, uint2 a) +{ + return make_uint2(b - a.x, b - a.y); +} +inline __host__ __device__ void operator-=(uint2 &a, uint b) +{ + a.x -= b; + a.y -= b; +} + +inline __host__ __device__ float3 operator-(float3 a, float3 b) +{ + return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); +} +inline __host__ __device__ void operator-=(float3 &a, float3 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; +} +inline __host__ __device__ float3 operator-(float3 a, float b) +{ + return make_float3(a.x - b, a.y - b, a.z - b); +} +inline __host__ __device__ float3 operator-(float b, float3 a) +{ + return make_float3(b - a.x, b - a.y, b - a.z); +} +inline __host__ __device__ void operator-=(float3 &a, float b) +{ + a.x -= b; + a.y -= b; + a.z -= b; +} + +inline __host__ __device__ int3 operator-(int3 a, int3 b) +{ + return make_int3(a.x - b.x, a.y - b.y, a.z - b.z); +} +inline __host__ __device__ void operator-=(int3 &a, int3 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; +} +inline __host__ __device__ int3 operator-(int3 a, int b) +{ + return make_int3(a.x - b, a.y - b, a.z - b); +} +inline __host__ __device__ int3 operator-(int b, int3 a) +{ + return make_int3(b - a.x, b - a.y, b - a.z); +} +inline __host__ __device__ void operator-=(int3 &a, int b) +{ + a.x -= b; + a.y -= b; + a.z -= b; +} + +inline __host__ __device__ uint3 operator-(uint3 a, uint3 b) +{ + return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z); +} +inline __host__ __device__ void operator-=(uint3 &a, uint3 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; +} +inline __host__ __device__ uint3 operator-(uint3 a, uint b) +{ + return make_uint3(a.x - b, a.y - b, a.z - b); +} +inline __host__ __device__ uint3 operator-(uint b, uint3 a) +{ + return make_uint3(b - a.x, b - a.y, b - a.z); +} +inline __host__ __device__ void operator-=(uint3 &a, uint b) +{ + a.x -= b; + a.y -= b; + a.z -= b; +} + +inline __host__ __device__ float4 operator-(float4 a, float4 b) +{ + return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); +} +inline __host__ __device__ void operator-=(float4 &a, float4 b) +{ + a.x -= b.x; + 
a.y -= b.y; + a.z -= b.z; + a.w -= b.w; +} +inline __host__ __device__ float4 operator-(float4 a, float b) +{ + return make_float4(a.x - b, a.y - b, a.z - b, a.w - b); +} +inline __host__ __device__ void operator-=(float4 &a, float b) +{ + a.x -= b; + a.y -= b; + a.z -= b; + a.w -= b; +} + +inline __host__ __device__ int4 operator-(int4 a, int4 b) +{ + return make_int4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); +} +inline __host__ __device__ void operator-=(int4 &a, int4 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; + a.w -= b.w; +} +inline __host__ __device__ int4 operator-(int4 a, int b) +{ + return make_int4(a.x - b, a.y - b, a.z - b, a.w - b); +} +inline __host__ __device__ int4 operator-(int b, int4 a) +{ + return make_int4(b - a.x, b - a.y, b - a.z, b - a.w); +} +inline __host__ __device__ void operator-=(int4 &a, int b) +{ + a.x -= b; + a.y -= b; + a.z -= b; + a.w -= b; +} + +inline __host__ __device__ uint4 operator-(uint4 a, uint4 b) +{ + return make_uint4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); +} +inline __host__ __device__ void operator-=(uint4 &a, uint4 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; + a.w -= b.w; +} +inline __host__ __device__ uint4 operator-(uint4 a, uint b) +{ + return make_uint4(a.x - b, a.y - b, a.z - b, a.w - b); +} +inline __host__ __device__ uint4 operator-(uint b, uint4 a) +{ + return make_uint4(b - a.x, b - a.y, b - a.z, b - a.w); +} +inline __host__ __device__ void operator-=(uint4 &a, uint b) +{ + a.x -= b; + a.y -= b; + a.z -= b; + a.w -= b; +} + +//////////////////////////////////////////////////////////////////////////////// +// multiply +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 operator*(float2 a, float2 b) +{ + return make_float2(a.x * b.x, a.y * b.y); +} +inline __host__ __device__ void operator*=(float2 &a, float2 b) +{ + a.x *= b.x; + a.y *= b.y; +} +inline __host__ __device__ float2 operator*(float2 a, float b) +{ + return make_float2(a.x * b, a.y * b); +} +inline __host__ __device__ float2 operator*(float b, float2 a) +{ + return make_float2(b * a.x, b * a.y); +} +inline __host__ __device__ void operator*=(float2 &a, float b) +{ + a.x *= b; + a.y *= b; +} + +inline __host__ __device__ int2 operator*(int2 a, int2 b) +{ + return make_int2(a.x * b.x, a.y * b.y); +} +inline __host__ __device__ void operator*=(int2 &a, int2 b) +{ + a.x *= b.x; + a.y *= b.y; +} +inline __host__ __device__ int2 operator*(int2 a, int b) +{ + return make_int2(a.x * b, a.y * b); +} +inline __host__ __device__ int2 operator*(int b, int2 a) +{ + return make_int2(b * a.x, b * a.y); +} +inline __host__ __device__ void operator*=(int2 &a, int b) +{ + a.x *= b; + a.y *= b; +} + +inline __host__ __device__ uint2 operator*(uint2 a, uint2 b) +{ + return make_uint2(a.x * b.x, a.y * b.y); +} +inline __host__ __device__ void operator*=(uint2 &a, uint2 b) +{ + a.x *= b.x; + a.y *= b.y; +} +inline __host__ __device__ uint2 operator*(uint2 a, uint b) +{ + return make_uint2(a.x * b, a.y * b); +} +inline __host__ __device__ uint2 operator*(uint b, uint2 a) +{ + return make_uint2(b * a.x, b * a.y); +} +inline __host__ __device__ void operator*=(uint2 &a, uint b) +{ + a.x *= b; + a.y *= b; +} + +inline __host__ __device__ float3 operator*(float3 a, float3 b) +{ + return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); +} +inline __host__ __device__ void operator*=(float3 &a, float3 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; +} +inline __host__ __device__ float3 operator*(float3 a, float b) +{ + 
return make_float3(a.x * b, a.y * b, a.z * b); +} +inline __host__ __device__ float3 operator*(float b, float3 a) +{ + return make_float3(b * a.x, b * a.y, b * a.z); +} +inline __host__ __device__ void operator*=(float3 &a, float b) +{ + a.x *= b; + a.y *= b; + a.z *= b; +} + +inline __host__ __device__ int3 operator*(int3 a, int3 b) +{ + return make_int3(a.x * b.x, a.y * b.y, a.z * b.z); +} +inline __host__ __device__ void operator*=(int3 &a, int3 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; +} +inline __host__ __device__ int3 operator*(int3 a, int b) +{ + return make_int3(a.x * b, a.y * b, a.z * b); +} +inline __host__ __device__ int3 operator*(int b, int3 a) +{ + return make_int3(b * a.x, b * a.y, b * a.z); +} +inline __host__ __device__ void operator*=(int3 &a, int b) +{ + a.x *= b; + a.y *= b; + a.z *= b; +} + +inline __host__ __device__ uint3 operator*(uint3 a, uint3 b) +{ + return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z); +} +inline __host__ __device__ void operator*=(uint3 &a, uint3 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; +} +inline __host__ __device__ uint3 operator*(uint3 a, uint b) +{ + return make_uint3(a.x * b, a.y * b, a.z * b); +} +inline __host__ __device__ uint3 operator*(uint b, uint3 a) +{ + return make_uint3(b * a.x, b * a.y, b * a.z); +} +inline __host__ __device__ void operator*=(uint3 &a, uint b) +{ + a.x *= b; + a.y *= b; + a.z *= b; +} + +inline __host__ __device__ float4 operator*(float4 a, float4 b) +{ + return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); +} +inline __host__ __device__ void operator*=(float4 &a, float4 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; + a.w *= b.w; +} +inline __host__ __device__ float4 operator*(float4 a, float b) +{ + return make_float4(a.x * b, a.y * b, a.z * b, a.w * b); +} +inline __host__ __device__ float4 operator*(float b, float4 a) +{ + return make_float4(b * a.x, b * a.y, b * a.z, b * a.w); +} +inline __host__ __device__ void operator*=(float4 &a, float b) +{ + a.x *= b; + a.y *= b; + a.z *= b; + a.w *= b; +} + +inline __host__ __device__ int4 operator*(int4 a, int4 b) +{ + return make_int4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); +} +inline __host__ __device__ void operator*=(int4 &a, int4 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; + a.w *= b.w; +} +inline __host__ __device__ int4 operator*(int4 a, int b) +{ + return make_int4(a.x * b, a.y * b, a.z * b, a.w * b); +} +inline __host__ __device__ int4 operator*(int b, int4 a) +{ + return make_int4(b * a.x, b * a.y, b * a.z, b * a.w); +} +inline __host__ __device__ void operator*=(int4 &a, int b) +{ + a.x *= b; + a.y *= b; + a.z *= b; + a.w *= b; +} + +inline __host__ __device__ uint4 operator*(uint4 a, uint4 b) +{ + return make_uint4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); +} +inline __host__ __device__ void operator*=(uint4 &a, uint4 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; + a.w *= b.w; +} +inline __host__ __device__ uint4 operator*(uint4 a, uint b) +{ + return make_uint4(a.x * b, a.y * b, a.z * b, a.w * b); +} +inline __host__ __device__ uint4 operator*(uint b, uint4 a) +{ + return make_uint4(b * a.x, b * a.y, b * a.z, b * a.w); +} +inline __host__ __device__ void operator*=(uint4 &a, uint b) +{ + a.x *= b; + a.y *= b; + a.z *= b; + a.w *= b; +} + +//////////////////////////////////////////////////////////////////////////////// +// divide +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 operator/(float2 a, float2 b) +{ + return make_float2(a.x / b.x, a.y / 
b.y); +} +inline __host__ __device__ void operator/=(float2 &a, float2 b) +{ + a.x /= b.x; + a.y /= b.y; +} +inline __host__ __device__ float2 operator/(float2 a, float b) +{ + return make_float2(a.x / b, a.y / b); +} +inline __host__ __device__ void operator/=(float2 &a, float b) +{ + a.x /= b; + a.y /= b; +} +inline __host__ __device__ float2 operator/(float b, float2 a) +{ + return make_float2(b / a.x, b / a.y); +} + +inline __host__ __device__ float3 operator/(float3 a, float3 b) +{ + return make_float3(a.x / b.x, a.y / b.y, a.z / b.z); +} +inline __host__ __device__ void operator/=(float3 &a, float3 b) +{ + a.x /= b.x; + a.y /= b.y; + a.z /= b.z; +} +inline __host__ __device__ float3 operator/(float3 a, float b) +{ + return make_float3(a.x / b, a.y / b, a.z / b); +} +inline __host__ __device__ void operator/=(float3 &a, float b) +{ + a.x /= b; + a.y /= b; + a.z /= b; +} +inline __host__ __device__ float3 operator/(float b, float3 a) +{ + return make_float3(b / a.x, b / a.y, b / a.z); +} + +inline __host__ __device__ float4 operator/(float4 a, float4 b) +{ + return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); +} +inline __host__ __device__ void operator/=(float4 &a, float4 b) +{ + a.x /= b.x; + a.y /= b.y; + a.z /= b.z; + a.w /= b.w; +} +inline __host__ __device__ float4 operator/(float4 a, float b) +{ + return make_float4(a.x / b, a.y / b, a.z / b, a.w / b); +} +inline __host__ __device__ void operator/=(float4 &a, float b) +{ + a.x /= b; + a.y /= b; + a.z /= b; + a.w /= b; +} +inline __host__ __device__ float4 operator/(float b, float4 a) +{ + return make_float4(b / a.x, b / a.y, b / a.z, b / a.w); +} + +//////////////////////////////////////////////////////////////////////////////// +// min +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 fminf(float2 a, float2 b) +{ + return make_float2(fminf(a.x,b.x), fminf(a.y,b.y)); +} +inline __host__ __device__ float3 fminf(float3 a, float3 b) +{ + return make_float3(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z)); +} +inline __host__ __device__ float4 fminf(float4 a, float4 b) +{ + return make_float4(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z), fminf(a.w,b.w)); +} + +inline __host__ __device__ int2 min(int2 a, int2 b) +{ + return make_int2(min(a.x,b.x), min(a.y,b.y)); +} +inline __host__ __device__ int3 min(int3 a, int3 b) +{ + return make_int3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z)); +} +inline __host__ __device__ int4 min(int4 a, int4 b) +{ + return make_int4(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z), min(a.w,b.w)); +} + +inline __host__ __device__ uint2 min(uint2 a, uint2 b) +{ + return make_uint2(min(a.x,b.x), min(a.y,b.y)); +} +inline __host__ __device__ uint3 min(uint3 a, uint3 b) +{ + return make_uint3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z)); +} +inline __host__ __device__ uint4 min(uint4 a, uint4 b) +{ + return make_uint4(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z), min(a.w,b.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// max +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 fmaxf(float2 a, float2 b) +{ + return make_float2(fmaxf(a.x,b.x), fmaxf(a.y,b.y)); +} +inline __host__ __device__ float3 fmaxf(float3 a, float3 b) +{ + return make_float3(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z)); +} +inline __host__ __device__ float4 fmaxf(float4 a, float4 b) +{ + return make_float4(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z), 
fmaxf(a.w,b.w)); +} + +inline __host__ __device__ int2 max(int2 a, int2 b) +{ + return make_int2(max(a.x,b.x), max(a.y,b.y)); +} +inline __host__ __device__ int3 max(int3 a, int3 b) +{ + return make_int3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z)); +} +inline __host__ __device__ int4 max(int4 a, int4 b) +{ + return make_int4(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z), max(a.w,b.w)); +} + +inline __host__ __device__ uint2 max(uint2 a, uint2 b) +{ + return make_uint2(max(a.x,b.x), max(a.y,b.y)); +} +inline __host__ __device__ uint3 max(uint3 a, uint3 b) +{ + return make_uint3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z)); +} +inline __host__ __device__ uint4 max(uint4 a, uint4 b) +{ + return make_uint4(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z), max(a.w,b.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// lerp +// - linear interpolation between a and b, based on value t in [0, 1] range +//////////////////////////////////////////////////////////////////////////////// + +inline __device__ __host__ float lerp(float a, float b, float t) +{ + return a + t*(b-a); +} +inline __device__ __host__ float2 lerp(float2 a, float2 b, float t) +{ + return a + t*(b-a); +} +inline __device__ __host__ float3 lerp(float3 a, float3 b, float t) +{ + return a + t*(b-a); +} +inline __device__ __host__ float4 lerp(float4 a, float4 b, float t) +{ + return a + t*(b-a); +} + +//////////////////////////////////////////////////////////////////////////////// +// clamp +// - clamp the value v to be in the range [a, b] +//////////////////////////////////////////////////////////////////////////////// + +inline __device__ __host__ float clamp(float f, float a, float b) +{ + return fmaxf(a, fminf(f, b)); +} +inline __device__ __host__ int clamp(int f, int a, int b) +{ + return max(a, min(f, b)); +} +inline __device__ __host__ uint clamp(uint f, uint a, uint b) +{ + return max(a, min(f, b)); +} + +inline __device__ __host__ float2 clamp(float2 v, float a, float b) +{ + return make_float2(clamp(v.x, a, b), clamp(v.y, a, b)); +} +inline __device__ __host__ float2 clamp(float2 v, float2 a, float2 b) +{ + return make_float2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y)); +} +inline __device__ __host__ float3 clamp(float3 v, float a, float b) +{ + return make_float3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b)); +} +inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b) +{ + return make_float3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z)); +} +inline __device__ __host__ float4 clamp(float4 v, float a, float b) +{ + return make_float4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b)); +} +inline __device__ __host__ float4 clamp(float4 v, float4 a, float4 b) +{ + return make_float4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w)); +} + +inline __device__ __host__ int2 clamp(int2 v, int a, int b) +{ + return make_int2(clamp(v.x, a, b), clamp(v.y, a, b)); +} +inline __device__ __host__ int2 clamp(int2 v, int2 a, int2 b) +{ + return make_int2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y)); +} +inline __device__ __host__ int3 clamp(int3 v, int a, int b) +{ + return make_int3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b)); +} +inline __device__ __host__ int3 clamp(int3 v, int3 a, int3 b) +{ + return make_int3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z)); +} +inline __device__ __host__ int4 clamp(int4 v, int a, int b) +{ + return make_int4(clamp(v.x, a, b), clamp(v.y, a, b), 
clamp(v.z, a, b), clamp(v.w, a, b)); +} +inline __device__ __host__ int4 clamp(int4 v, int4 a, int4 b) +{ + return make_int4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w)); +} + +inline __device__ __host__ uint2 clamp(uint2 v, uint a, uint b) +{ + return make_uint2(clamp(v.x, a, b), clamp(v.y, a, b)); +} +inline __device__ __host__ uint2 clamp(uint2 v, uint2 a, uint2 b) +{ + return make_uint2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y)); +} +inline __device__ __host__ uint3 clamp(uint3 v, uint a, uint b) +{ + return make_uint3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b)); +} +inline __device__ __host__ uint3 clamp(uint3 v, uint3 a, uint3 b) +{ + return make_uint3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z)); +} +inline __device__ __host__ uint4 clamp(uint4 v, uint a, uint b) +{ + return make_uint4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b)); +} +inline __device__ __host__ uint4 clamp(uint4 v, uint4 a, uint4 b) +{ + return make_uint4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// dot product +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float dot(float2 a, float2 b) +{ + return a.x * b.x + a.y * b.y; +} +inline __host__ __device__ float dot(float3 a, float3 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z; +} +inline __host__ __device__ float dot(float4 a, float4 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; +} + +inline __host__ __device__ int dot(int2 a, int2 b) +{ + return a.x * b.x + a.y * b.y; +} +inline __host__ __device__ int dot(int3 a, int3 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z; +} +inline __host__ __device__ int dot(int4 a, int4 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; +} + +inline __host__ __device__ uint dot(uint2 a, uint2 b) +{ + return a.x * b.x + a.y * b.y; +} +inline __host__ __device__ uint dot(uint3 a, uint3 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z; +} +inline __host__ __device__ uint dot(uint4 a, uint4 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; +} + +//////////////////////////////////////////////////////////////////////////////// +// length +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float length(float2 v) +{ + return sqrtf(dot(v, v)); +} +inline __host__ __device__ float length(float3 v) +{ + return sqrtf(dot(v, v)); +} +inline __host__ __device__ float length(float4 v) +{ + return sqrtf(dot(v, v)); +} + +//////////////////////////////////////////////////////////////////////////////// +// normalize +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 normalize(float2 v) +{ + float invLen = rsqrtf(dot(v, v)); + return v * invLen; +} +inline __host__ __device__ float3 normalize(float3 v) +{ + float invLen = rsqrtf(dot(v, v)); + return v * invLen; +} +inline __host__ __device__ float4 normalize(float4 v) +{ + float invLen = rsqrtf(dot(v, v)); + return v * invLen; +} + +//////////////////////////////////////////////////////////////////////////////// +// floor +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 floorf(float2 v) +{ + return make_float2(floorf(v.x), floorf(v.y)); +} +inline __host__ __device__ 
float3 floorf(float3 v) +{ + return make_float3(floorf(v.x), floorf(v.y), floorf(v.z)); +} +inline __host__ __device__ float4 floorf(float4 v) +{ + return make_float4(floorf(v.x), floorf(v.y), floorf(v.z), floorf(v.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// frac - returns the fractional portion of a scalar or each vector component +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float fracf(float v) +{ + return v - floorf(v); +} +inline __host__ __device__ float2 fracf(float2 v) +{ + return make_float2(fracf(v.x), fracf(v.y)); +} +inline __host__ __device__ float3 fracf(float3 v) +{ + return make_float3(fracf(v.x), fracf(v.y), fracf(v.z)); +} +inline __host__ __device__ float4 fracf(float4 v) +{ + return make_float4(fracf(v.x), fracf(v.y), fracf(v.z), fracf(v.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// fmod +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 fmodf(float2 a, float2 b) +{ + return make_float2(fmodf(a.x, b.x), fmodf(a.y, b.y)); +} +inline __host__ __device__ float3 fmodf(float3 a, float3 b) +{ + return make_float3(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z)); +} +inline __host__ __device__ float4 fmodf(float4 a, float4 b) +{ + return make_float4(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z), fmodf(a.w, b.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// absolute value +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 fabs(float2 v) +{ + return make_float2(fabs(v.x), fabs(v.y)); +} +inline __host__ __device__ float3 fabs(float3 v) +{ + return make_float3(fabs(v.x), fabs(v.y), fabs(v.z)); +} +inline __host__ __device__ float4 fabs(float4 v) +{ + return make_float4(fabs(v.x), fabs(v.y), fabs(v.z), fabs(v.w)); +} + +inline __host__ __device__ int2 abs(int2 v) +{ + return make_int2(abs(v.x), abs(v.y)); +} +inline __host__ __device__ int3 abs(int3 v) +{ + return make_int3(abs(v.x), abs(v.y), abs(v.z)); +} +inline __host__ __device__ int4 abs(int4 v) +{ + return make_int4(abs(v.x), abs(v.y), abs(v.z), abs(v.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// reflect +// - returns reflection of incident ray I around surface normal N +// - N should be normalized, reflected vector's length is equal to length of I +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float3 reflect(float3 i, float3 n) +{ + return i - 2.0f * n * dot(n,i); +} + +//////////////////////////////////////////////////////////////////////////////// +// cross product +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float3 cross(float3 a, float3 b) +{ + return make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x); +} + +//////////////////////////////////////////////////////////////////////////////// +// smoothstep +// - returns 0 if x < a +// - returns 1 if x > b +// - otherwise returns smooth interpolation between 0 and 1 based on x +//////////////////////////////////////////////////////////////////////////////// + +inline __device__ __host__ float smoothstep(float a, float b, float x) +{ + float y = clamp((x - a) / (b - a), 0.0f, 1.0f); + return (y*y*(3.0f - (2.0f*y))); +} +inline 
__device__ __host__ float2 smoothstep(float2 a, float2 b, float2 x) +{ + float2 y = clamp((x - a) / (b - a), 0.0f, 1.0f); + return (y*y*(make_float2(3.0f) - (make_float2(2.0f)*y))); +} +inline __device__ __host__ float3 smoothstep(float3 a, float3 b, float3 x) +{ + float3 y = clamp((x - a) / (b - a), 0.0f, 1.0f); + return (y*y*(make_float3(3.0f) - (make_float3(2.0f)*y))); +} +inline __device__ __host__ float4 smoothstep(float4 a, float4 b, float4 x) +{ + float4 y = clamp((x - a) / (b - a), 0.0f, 1.0f); + return (y*y*(make_float4(3.0f) - (make_float4(2.0f)*y))); +} + +#endif diff --git a/generated_files/par_backprojector_2D_CudaKernel.cu b/generated_files/par_backprojector_2D_CudaKernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..15c7c67601fe71f681cb64be25e5bfca4ee557fa --- /dev/null +++ b/generated_files/par_backprojector_2D_CudaKernel.cu @@ -0,0 +1,111 @@ +/* + * Copyright [2019] [Christopher Syben] + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Voxel-driven parllel-beam back-projector CUDA kernel + * Implementation partially adapted from CONRAD + * PYRO-NN is developed as an Open Source project under the Apache License, Version 2.0. +*/ + +#include "helper_headers/helper_grid.h" +#include "helper_headers/helper_math.h" + +texture<float, cudaTextureType2D, cudaReadModeElementType> sinogram_as_texture; +#define CUDART_INF_F __int_as_float(0x7f800000) + +__global__ void backproject_2Dpar_beam_kernel(float *pVolume, const float2 *d_rays, const int number_of_projections, + const int2 volume_size, const float2 volume_spacing, const float2 volume_origin, + const int detector_size, const float detector_spacing, const float detector_origin) +{ + const float pi = 3.14159265359f; + unsigned int volume_x = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int volume_y = blockIdx.y * blockDim.y + threadIdx.y; + if (volume_x >= volume_size.x || volume_y >= volume_size.y) + { + return; + } + //Preparations: + const float2 pixel_coordinate = index_to_physical(make_float2(volume_x, volume_y), volume_origin, volume_spacing); + float pixel_value = 0.0f; + + for (int n = 0; n < number_of_projections; n++) + { + float2 detector_normal = d_rays[n]; + float2 detector_vec = make_float2(-detector_normal.y, detector_normal.x); + + float s = dot(pixel_coordinate, detector_vec); + float s_idx = physical_to_index(s, detector_origin, detector_spacing); + + pixel_value += tex2D(sinogram_as_texture, s_idx + 0.5f, n + 0.5f); + } + + const unsigned volume_linearized_idx = volume_y * volume_size.x + volume_x; + pVolume[volume_linearized_idx] = pi * pixel_value / number_of_projections; + + return; +} +/*************** WARNING ******************./ + * + * Tensorflow is allocating the whole GPU memory for itself and just leave a small slack memory + * using cudaMalloc and cudaMalloc3D will allocate memory in this small slack memory ! + * Therefore, currently only small volumes can be used (they have to fit into the slack memory which TF does not allocae !) 
+ * + * This is the kernel based on texture interpolation, thus, the allocations are not within the Tensorflow managed memory. + * If memory errors occure: + * 1. start Tensorflow with less gpu memory and allow growth + * 2. TODO: no software interpolation based 2D verions are available yet + * + * TODO: use context->allocate_tmp and context->allocate_persistent instead of cudaMalloc for the ray_vectors array + * : https://stackoverflow.com/questions/48580580/tensorflow-new-op-cuda-kernel-memory-managment + * + */ +void Parallel_Backprojection2D_Kernel_Launcher(const float *sinogram_ptr, float *out, const float *ray_vectors, const int number_of_projections, + const int volume_width, const int volume_height, const float volume_spacing_x, const float volume_spacing_y, + const float volume_origin_x, const float volume_origin_y, + const int detector_size, const float detector_spacing, const float detector_origin) +{ + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); + sinogram_as_texture.addressMode[0] = cudaAddressModeBorder; + sinogram_as_texture.addressMode[1] = cudaAddressModeBorder; + sinogram_as_texture.filterMode = cudaFilterModeLinear; + sinogram_as_texture.normalized = false; + + cudaArray *sinogram_array; + cudaMallocArray(&sinogram_array, &channelDesc, detector_size, number_of_projections); + cudaMemcpyToArray(sinogram_array, 0, 0, sinogram_ptr, detector_size * number_of_projections * sizeof(float), cudaMemcpyHostToDevice); + cudaBindTextureToArray(sinogram_as_texture, sinogram_array, channelDesc); + + auto ray_size_b = number_of_projections * sizeof(float2); + float2 *d_rays; + cudaMalloc(&d_rays, ray_size_b); + cudaMemcpy(d_rays, ray_vectors, ray_size_b, cudaMemcpyHostToDevice); + + int2 volume_size = make_int2(volume_width, volume_height); + float2 volume_spacing = make_float2(volume_spacing_x, volume_spacing_y); + float2 volume_origin = make_float2(volume_origin_x, volume_origin_y); + + const unsigned block_size = 16; + const dim3 threads_per_block = dim3(block_size, block_size); + const dim3 num_blocks = dim3(volume_width / threads_per_block.x + 1, volume_height / threads_per_block.y + 1); + + backproject_2Dpar_beam_kernel<<<num_blocks, threads_per_block>>>(out, d_rays, number_of_projections, + volume_size, volume_spacing, volume_origin, + detector_size, detector_spacing, detector_origin); + + cudaUnbindTexture(sinogram_as_texture); + cudaFreeArray(sinogram_array); + cudaFree(d_rays); +} + + diff --git a/generated_files/par_projector_2D_CudaKernel.cu b/generated_files/par_projector_2D_CudaKernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..8fcae27ce8c80b8adad5a552e2b70df783d52270 --- /dev/null +++ b/generated_files/par_projector_2D_CudaKernel.cu @@ -0,0 +1,223 @@ +/* + * Copyright [2019] [Christopher Syben] + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * Voxel-driven parllel-beam projector CUDA kernel + * Implementation partially adapted from CONRAD + * PYRO-NN is developed as an Open Source project under the Apache License, + * Version 2.0. + */ +#include "helper_headers/helper_grid.h" +#include "helper_headers/helper_math.h" + +texture<float, cudaTextureType2D, cudaReadModeElementType> volume_as_texture; +#define CUDART_INF_F __int_as_float(0x7f800000) + +inline __device__ float +kernel_project2D(const float2 source_point, const float2 ray_vector, + const float step_size, const int2 volume_size, + const float2 volume_origin, const float2 volume_spacing) { + + unsigned int detector_idx = blockIdx.x * blockDim.x + threadIdx.x; + int projection_idx = blockIdx.y; + + float pixel = 0.0f; + // Step 1: compute alpha value at entry and exit point of the volume + float min_alpha, max_alpha; + min_alpha = 0; + max_alpha = CUDART_INF_F; + if (0.0f != ray_vector.x) { + float volume_min_edge_point = + index_to_physical(0, volume_origin.x, volume_spacing.x) - 0.5f; + float volume_max_edge_point = + index_to_physical(volume_size.x, volume_origin.x, volume_spacing.x) - + 0.5f; + + float reci = 1.0f / ray_vector.x; + float alpha0 = (volume_min_edge_point - source_point.x) * reci; + float alpha1 = (volume_max_edge_point - source_point.x) * reci; + min_alpha = fmin(alpha0, alpha1); + max_alpha = fmax(alpha0, alpha1); + } + + if (0.0f != ray_vector.y) { + float volume_min_edge_point = + index_to_physical(0, volume_origin.y, volume_spacing.y) - 0.5f; + float volume_max_edge_point = + index_to_physical(volume_size.y, volume_origin.y, volume_spacing.y) - + 0.5f; + + float reci = 1.0f / ray_vector.y; + float alpha0 = (volume_min_edge_point - source_point.y) * reci; + float alpha1 = (volume_max_edge_point - source_point.y) * reci; + min_alpha = fmax(min_alpha, fmin(alpha0, alpha1)); + max_alpha = fmin(max_alpha, fmax(alpha0, alpha1)); + } + + float px, py; + // pixel = source_point.x + min_alpha * ray_vector.x; + // Entrance boundary + // In CUDA, voxel centers are located at (xx.5, xx.5, xx.5), + // whereas, SwVolume has voxel centers at integers. + // For the initial interpolated value, only a half stepsize is + // considered in the computation. 
+ if (min_alpha < max_alpha) { + px = source_point.x + min_alpha * ray_vector.x; + py = source_point.y + min_alpha * ray_vector.y; + + pixel += + 0.5f * + tex2D(volume_as_texture, + physical_to_index(px, volume_origin.x, volume_spacing.x) + 0.5f, + physical_to_index(py, volume_origin.y, volume_spacing.y) + 0.5f); + min_alpha += step_size; + } + // Mid segments + while (min_alpha < max_alpha) { + px = source_point.x + min_alpha * ray_vector.x; + py = source_point.y + min_alpha * ray_vector.y; + float2 interp_point = + physical_to_index(make_float2(px, py), volume_origin, volume_spacing); + pixel += + tex2D(volume_as_texture, + physical_to_index(px, volume_origin.x, volume_spacing.x) + 0.5f, + physical_to_index(py, volume_origin.y, volume_spacing.y) + 0.5f); + min_alpha += step_size; + } + // Scaling by stepsize; + pixel *= step_size; + + // Last segment of the line + if (pixel > 0.0f) { + pixel -= + 0.5f * step_size * + tex2D(volume_as_texture, + physical_to_index(px, volume_origin.x, volume_spacing.x) + 0.5f, + physical_to_index(py, volume_origin.y, volume_spacing.y) + 0.5f); + min_alpha -= step_size; + float last_step_size = max_alpha - min_alpha; + pixel += + 0.5f * last_step_size * + tex2D(volume_as_texture, + physical_to_index(px, volume_origin.x, volume_spacing.x) + 0.5f, + physical_to_index(py, volume_origin.y, volume_spacing.y) + 0.5f); + + px = source_point.x + max_alpha * ray_vector.x; + py = source_point.y + max_alpha * ray_vector.y; + // The last segment of the line integral takes care of the + // varying length. + pixel += + 0.5f * last_step_size * + tex2D(volume_as_texture, + physical_to_index(px, volume_origin.x, volume_spacing.x) + 0.5f, + physical_to_index(py, volume_origin.y, volume_spacing.y) + 0.5f); + } + return pixel; +} + +__global__ void project_2Dpar_beam_kernel( + float *pSinogram, const float2 *d_rays, const int number_of_projections, + const float sampling_step_size, const int2 volume_size, + const float2 volume_spacing, const float2 volume_origin, + const int detector_size, const float detector_spacing, + const float detector_origin) { + unsigned int detector_idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (detector_idx >= detector_size) { + return; + } + // Preparations: + + // Assume a source isocenter distance to compute the start of the ray, + // although sid is not neseccary for a par beam geometry + float sid = sqrt((float)(volume_size.x * volume_spacing.x * volume_size.x * + volume_spacing.x) + + (volume_size.y * volume_spacing.y * volume_size.y * + volume_spacing.y)) * + 1.2f; + + int projection_idx = blockIdx.y; + float2 ray_vector = d_rays[projection_idx]; + + // create detector coordinate system (u,v) w.r.t the ray + float2 u_vec = make_float2(-ray_vector.y, ray_vector.x); + // calculate physical coordinate of detector pixel + float u = index_to_physical(detector_idx, detector_origin, detector_spacing); + + // Calculate "source"-Point (start point for the parallel ray), so we can use + // the projection kernel Assume a source isocenter distance to compute the + // start of the ray, although sid is not neseccary for a par beam geometry + float2 virtual_source_point = ray_vector * (-sid) + u_vec * u; + + float pixel = kernel_project2D( + virtual_source_point, ray_vector, + sampling_step_size, // * fmin(volume_spacing.x, volume_spacing.y), + volume_size, volume_origin, volume_spacing); + + pixel *= sqrt( + (ray_vector.x * volume_spacing.x) * (ray_vector.x * volume_spacing.x) + + (ray_vector.y * volume_spacing.y) * (ray_vector.y * volume_spacing.y)); 
+ + unsigned sinogram_idx = projection_idx * detector_size + detector_idx; + pSinogram[sinogram_idx] = pixel; + + return; +} + +void Parallel_Projection2D_Kernel_Launcher( + const float *volume_ptr, float *out, const float *ray_vectors, + const int number_of_projections, const int volume_width, + const int volume_height, const float volume_spacing_x, + const float volume_spacing_y, const float volume_origin_x, + const float volume_origin_y, const int detector_size, + const float detector_spacing, const float detector_origin) { + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); + volume_as_texture.addressMode[0] = cudaAddressModeBorder; + volume_as_texture.addressMode[1] = cudaAddressModeBorder; + volume_as_texture.filterMode = cudaFilterModeLinear; + volume_as_texture.normalized = false; + + cudaArray *volume_array; + cudaMallocArray(&volume_array, &channelDesc, volume_width, volume_height); + cudaMemcpyToArray(volume_array, 0, 0, volume_ptr, + volume_width * volume_height * sizeof(float), + cudaMemcpyHostToDevice); + cudaBindTextureToArray(volume_as_texture, volume_array, channelDesc); + + auto ray_size_b = number_of_projections * sizeof(float2); + float2 *d_rays; + cudaMalloc(&d_rays, ray_size_b); + cudaMemcpy(d_rays, ray_vectors, ray_size_b, cudaMemcpyHostToDevice); + + float sampling_step_size = 0.2; + + int2 volume_size = make_int2(volume_width, volume_height); + float2 volume_spacing = make_float2(volume_spacing_x, volume_spacing_y); + float2 volume_origin = make_float2(volume_origin_x, volume_origin_y); + + const unsigned blocksize = 256; + const dim3 gridsize = + dim3((detector_size / blocksize) + 1, number_of_projections); + project_2Dpar_beam_kernel<<<gridsize, blocksize>>>( + out, d_rays, number_of_projections, sampling_step_size, volume_size, + volume_spacing, volume_origin, detector_size, detector_spacing, + detector_origin); + + // cleanup + cudaUnbindTexture(volume_as_texture); + cudaFreeArray(volume_array); + cudaFree(d_rays); +} + + diff --git a/src/pyronn_torch/pyronn_torch.cpp b/generated_files/pyronn_torch.cpp similarity index 73% rename from src/pyronn_torch/pyronn_torch.cpp rename to generated_files/pyronn_torch.cpp index 411ee34186bfd8d9312547d27722a5d88f79cf4d..0d30c7a0ded8a3a638ab4289b3ea5e148a16b11f 100644 --- a/src/pyronn_torch/pyronn_torch.cpp +++ b/generated_files/pyronn_torch.cpp @@ -11,43 +11,43 @@ #include <torch/extension.h> -void Cone_Backprojection3D_Kernel_Launcher(const float *sinogram_ptr, float *out, const float *projection_matrices, const int number_of_projections, - const int volume_width, const int volume_height, const int volume_depth, - const float volume_spacing_x, const float volume_spacing_y, const float volume_spacing_z, - const float volume_origin_x, const float volume_origin_y, const float volume_origin_z, - const int detector_width, const int detector_height, const float projection_multiplier); - - -void Cone_Projection_Kernel_Launcher(const float* volume_ptr, float *out, const float *inv_AR_matrix, const float *src_points, - const int number_of_projections, const int volume_width, const int volume_height, const int volume_depth, - const float volume_spacing_x, const float volume_spacing_y, const float volume_spacing_z, - const int detector_width, const int detector_height, const float step_size); - - -void Cone_Projection_Kernel_Tex_Interp_Launcher( - const float *__restrict__ volume_ptr, float *out, - const float *inv_AR_matrix, const float *src_points, - const int number_of_projections, const int volume_width, - const 
int volume_height, const int volume_depth,
- const float volume_spacing_x, const float volume_spacing_y,
- const float volume_spacing_z, const int detector_width,
- const int detector_height, const float step_size);
-
-
-void Parallel_Backprojection2D_Kernel_Launcher(const float *sinogram_ptr, float *out, const float *ray_vectors, const int number_of_projections,
- const int volume_width, const int volume_height, const float volume_spacing_x, const float volume_spacing_y,
- const float volume_origin_x, const float volume_origin_y,
- const int detector_size, const float detector_spacing, const float detector_origin);
-
-
-void Parallel_Projection2D_Kernel_Launcher(
- const float *volume_ptr, float *out, const float *ray_vectors,
- const int number_of_projections, const int volume_width,
- const int volume_height, const float volume_spacing_x,
- const float volume_spacing_y, const float volume_origin_x,
- const float volume_origin_y, const int detector_size,
- const float detector_spacing, const float detector_origin);
-
+ void Cone_Backprojection3D_Kernel_Launcher(const float *sinogram_ptr, float *out, const float *projection_matrices, const int number_of_projections,
+ const int volume_width, const int volume_height, const int volume_depth,
+ const float volume_spacing_x, const float volume_spacing_y, const float volume_spacing_z,
+ const float volume_origin_x, const float volume_origin_y, const float volume_origin_z,
+ const int detector_width, const int detector_height, const float projection_multiplier);
+
+
+ void Cone_Projection_Kernel_Launcher(const float* volume_ptr, float *out, const float *inv_AR_matrix, const float *src_points,
+ const int number_of_projections, const int volume_width, const int volume_height, const int volume_depth,
+ const float volume_spacing_x, const float volume_spacing_y, const float volume_spacing_z,
+ const int detector_width, const int detector_height, const float step_size);
+
+
+ void Cone_Projection_Kernel_Tex_Interp_Launcher(
+ const float *__restrict__ volume_ptr, float *out,
+ const float *inv_AR_matrix, const float *src_points,
+ const int number_of_projections, const int volume_width,
+ const int volume_height, const int volume_depth,
+ const float volume_spacing_x, const float volume_spacing_y,
+ const float volume_spacing_z, const int detector_width,
+ const int detector_height, const float step_size);
+
+
+ void Parallel_Backprojection2D_Kernel_Launcher(const float *sinogram_ptr, float *out, const float *ray_vectors, const int number_of_projections,
+ const int volume_width, const int volume_height, const float volume_spacing_x, const float volume_spacing_y,
+ const float volume_origin_x, const float volume_origin_y,
+ const int detector_size, const float detector_spacing, const float detector_origin);
+
+
+ void Parallel_Projection2D_Kernel_Launcher(
+ const float *volume_ptr, float *out, const float *ray_vectors,
+ const int number_of_projections, const int volume_width,
+ const int volume_height, const float volume_spacing_x,
+ const float volume_spacing_y, const float volume_origin_x,
+ const float volume_origin_y, const int detector_size,
+ const float detector_spacing, const float detector_origin);
+
 using namespace pybind11::literals;
diff --git a/setup.py b/setup.py
index 5038bb1a45162f8c52bd20b0c3b3fd101a990cd6..8d9d7782680a1bc53d15e11f661091ca4129071a 100644
--- a/setup.py
+++ b/setup.py
@@ -1,8 +1,6 @@
 import sys
 from glob import glob
-from os import makedirs
-from os.path import basename, dirname, join
-from shutil import copyfile, copytree, rmtree
+from os.path import dirname, join
 
 from pkg_resources import VersionConflict, require
 from setuptools import setup
@@ -20,20 +18,9 @@ if __name__ == "__main__":
     object_cache = dirname(__file__)
     module_name = 'pyronn_torch_cpp'
 
-    source_files = glob(join(dirname(__file__), 'src', 'pyronn_torch', 'PYRO-NN-Layers', '*.cu.cc'))
+    cuda_sources = glob(join(dirname(__file__), 'generated_files', '*.cu'))
 
-    generated_file = join('src', 'pyronn_torch', 'pyronn_torch.cpp')
-
-    cuda_sources = []
-    makedirs(join(object_cache, module_name), exist_ok=True)
-    rmtree(join(object_cache, module_name, 'helper_headers'), ignore_errors=True)
-    copytree(join(dirname(__file__), 'src', 'pyronn_torch', 'PYRO-NN-Layers', 'helper_headers'),
-             join(object_cache, module_name, 'helper_headers'))
-
-    for s in source_files:
-        dst = join(object_cache, module_name, basename(s).replace('.cu.cc', '.cu'))
-        copyfile(s, dst)  # Torch only accepts *.cu as CUDA
-        cuda_sources.append(join(module_name, basename(s).replace('.cu.cc', '.cu')))
+    generated_file = join('generated_files', 'pyronn_torch.cpp')
 
     setup(use_pyscaffold=True,
           ext_modules=[
diff --git a/src/pyronn_torch/codegen.py b/src/pyronn_torch/codegen.py
index 632b72e8a88a58d85a881f473f9c95147f215b17..5f1e96e8bf7666d342333660a03c8e1d5dacf5b9 100644
--- a/src/pyronn_torch/codegen.py
+++ b/src/pyronn_torch/codegen.py
@@ -172,14 +172,15 @@
 def generate_shared_object(output_folder=None,
                            source_files=None,
                            show_code=False,
                            framework_module_class=TorchModule,
-                           generate_code_only=False):
+                           generate_code_only=False,
+                           update_repo_files=False):
 
     object_cache = get_cache_config()['object_cache']
     module_name = 'pyronn_torch_cpp'
 
     if not output_folder:
-        output_folder = dirname(__file__)
+        output_folder = join(dirname(__file__), '..', '..', 'generated_files')
 
     if not source_files:
         source_files = glob(join(dirname(__file__), 'PYRO-NN-Layers', '*.cu.cc'))
@@ -189,11 +190,18 @@
     rmtree(join(object_cache, module_name, 'helper_headers'), ignore_errors=True)
     copytree(join(dirname(__file__), 'PYRO-NN-Layers', 'helper_headers'),
              join(object_cache, module_name, 'helper_headers'))
+    if update_repo_files:
+        copytree(join(dirname(__file__), 'PYRO-NN-Layers', 'helper_headers'),
+                 join(output_folder, 'helper_headers'))
 
     for s in source_files:
         dst = join(object_cache, module_name, basename(s).replace('.cu.cc', '.cu'))
         copyfile(s, dst)  # Torch only accepts *.cu as CUDA
         cuda_sources.append(dst)
+        if update_repo_files:
+            dst = join(output_folder, basename(s).replace('.cu.cc', '.cu'))
+            copyfile(s, dst)  # Torch only accepts *.cu as CUDA
+
 
     module = framework_module_class(module_name, FUNCTIONS.values())
 
@@ -210,13 +218,15 @@
     shared_object_file = module.compiled_file
     copyfile(shared_object_file, join(output_folder, module_name + '.so'))
-    if show_code:
+
+    if update_repo_files:
         with open(join(output_folder, 'pyronn_torch.cpp'), 'w') as f:
             f.write(pystencils.get_code_str(module, custom_backend=FrameworkIntegrationPrinter()))
 
     return extension
 
+
+
 def compile_shared_object(output_folder=None, source_files=None):
     """
     Same as generate_shared_object without pystencils dependency
@@ -268,7 +278,7 @@ def main():
     parser.add_argument('--source-files', default=None)
     args = parser.parse_args()
 
-    generate_shared_object(args.output_folder, args.source_files, show_code=True)
+    generate_shared_object(args.output_folder, args.source_files, show_code=True, update_repo_files=True)
     # compile_shared_object(args.output_folder, args.source_files)
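
Usage note: with the codegen changes above, the checked-in sources under generated_files/ are refreshed by calling generate_shared_object with the new update_repo_files flag, and setup.py then builds directly from that folder. The snippet below is a minimal sketch of that call from Python; it assumes only what the diff shows (the generate_shared_object signature and its defaults) and that the package is importable as pyronn_torch with pystencils and the PYRO-NN-Layers sources available.

# Sketch: regenerate generated_files/ from the PYRO-NN-Layers sources.
# Passing None lets codegen.py fall back to the defaults introduced above:
# output_folder -> <package>/../../generated_files, source_files -> PYRO-NN-Layers/*.cu.cc
from pyronn_torch.codegen import generate_shared_object

generate_shared_object(output_folder=None,
                       source_files=None,
                       show_code=True,
                       update_repo_files=True)  # also copies helper_headers, *.cu and pyronn_torch.cpp into the repo

This mirrors what main() does after the change; once the files are regenerated, the glob over generated_files/*.cu in setup.py picks them up for the extension build.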