From 102157fde623eeb9c42c6d14c97339a4f497414a Mon Sep 17 00:00:00 2001 From: Stephan Seitz <stephan.seitz@fau.de> Date: Fri, 13 Mar 2020 14:52:38 +0100 Subject: [PATCH] Avoid copying of projection_matrices on each kernel call --- generated_files/cone_backprojector_3D_CudaKernel.cu | 11 +++-------- ...one_backprojector_3D_CudaKernel_hardware_interp.cu | 11 +++-------- generated_files/cone_projector_3D_CudaKernel.cu | 11 +---------- src/pyronn_torch/PYRO-NN-Layers | 2 +- src/pyronn_torch/codegen.py | 1 + 5 files changed, 9 insertions(+), 27 deletions(-) diff --git a/generated_files/cone_backprojector_3D_CudaKernel.cu b/generated_files/cone_backprojector_3D_CudaKernel.cu index 4b899de..dc92f4b 100644 --- a/generated_files/cone_backprojector_3D_CudaKernel.cu +++ b/generated_files/cone_backprojector_3D_CudaKernel.cu @@ -43,7 +43,7 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, } } -inline __device__ float3 map(float3 coordinates, float *d_projection_matrices, +inline __device__ float3 map(float3 coordinates, const float *d_projection_matrices, int n) { const float *matrix = &(d_projection_matrices[n * 12]); @@ -99,7 +99,7 @@ inline __device__ float interp2D(const float *const_volume_ptr, } __global__ void backproject_3Dcone_beam_kernel( - const float *sinogram_ptr, float *vol, float *d_projection_matrices, + const float *sinogram_ptr, float *vol, const float *d_projection_matrices, const int number_of_projections, const uint3 volume_size, const float3 volume_spacing, const float3 volume_origin, const uint2 detector_size, const uint3 pointer_offsets, @@ -144,10 +144,6 @@ void Cone_Backprojection3D_Kernel_Launcher( const float projection_multiplier) { // COPY matrix to graphics card as float array auto matrices_size_b = number_of_projections * 12 * sizeof(float); - float *d_projection_matrices; - gpuErrchk(cudaMalloc(&d_projection_matrices, matrices_size_b)); - gpuErrchk(cudaMemcpy(d_projection_matrices, projection_matrices, - matrices_size_b, cudaMemcpyHostToDevice)); uint3 volume_size = make_uint3(volume_width, volume_height, volume_depth); float3 volume_spacing = @@ -168,10 +164,9 @@ void Cone_Backprojection3D_Kernel_Launcher( const dim3 block = dim3(BLOCKSIZE_X, BLOCKSIZE_Y, BLOCKSIZE_Z); backproject_3Dcone_beam_kernel<<<grid, block>>>( - sinogram_ptr, out, d_projection_matrices, number_of_projections, + sinogram_ptr, out, projection_matrices, number_of_projections, volume_size, volume_spacing, volume_origin, detector_size, pointer_offsets, projection_multiplier); gpuErrchk(cudaUnbindTexture(sinogram_as_texture)); - gpuErrchk(cudaFree(d_projection_matrices)); } diff --git a/generated_files/cone_backprojector_3D_CudaKernel_hardware_interp.cu b/generated_files/cone_backprojector_3D_CudaKernel_hardware_interp.cu index 0e528b1..34682bb 100644 --- a/generated_files/cone_backprojector_3D_CudaKernel_hardware_interp.cu +++ b/generated_files/cone_backprojector_3D_CudaKernel_hardware_interp.cu @@ -43,7 +43,7 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, } } -inline __device__ float3 map(float3 coordinates, float *d_projection_matrices, +inline __device__ float3 map(float3 coordinates, const float *d_projection_matrices, int n) { const float *matrix = &(d_projection_matrices[n * 12]); @@ -56,7 +56,7 @@ inline __device__ float3 map(float3 coordinates, float *d_projection_matrices, } __global__ void backproject_3Dcone_beam_kernel_tex_interp( - float *vol, float *d_projection_matrices, const int number_of_projections, + float *vol, const float *d_projection_matrices, const int number_of_projections, const uint3 volume_size, const float3 volume_spacing, const float3 volume_origin, const float projection_multiplier) { const int i = blockIdx.x * blockDim.x + threadIdx.x; @@ -114,10 +114,6 @@ void Cone_Backprojection3D_Kernel_Tex_Interp_Launcher( const float projection_multiplier) { // COPY matrix to graphics card as float array auto matrices_size_b = number_of_projections * 12 * sizeof(float); - float *d_projection_matrices; - gpuErrchk(cudaMalloc(&d_projection_matrices, matrices_size_b)); - gpuErrchk(cudaMemcpy(d_projection_matrices, projection_matrices, - matrices_size_b, cudaMemcpyHostToDevice)); uint3 volume_size = make_uint3(volume_width, volume_height, volume_depth); float3 volume_spacing = @@ -169,10 +165,9 @@ void Cone_Backprojection3D_Kernel_Tex_Interp_Launcher( const dim3 block = dim3(BLOCKSIZE_X, BLOCKSIZE_Y, BLOCKSIZE_Z); backproject_3Dcone_beam_kernel_tex_interp<<<grid, block>>>( - out, d_projection_matrices, number_of_projections, volume_size, + out, projection_matrices, number_of_projections, volume_size, volume_spacing, volume_origin, projection_multiplier); gpuErrchk(cudaUnbindTexture(sinogram_as_texture)); gpuErrchk(cudaFreeArray(projArray)); - gpuErrchk(cudaFree(d_projection_matrices)); } diff --git a/generated_files/cone_projector_3D_CudaKernel.cu b/generated_files/cone_projector_3D_CudaKernel.cu index 70813e9..f5608f6 100644 --- a/generated_files/cone_projector_3D_CudaKernel.cu +++ b/generated_files/cone_projector_3D_CudaKernel.cu @@ -225,16 +225,9 @@ void Cone_Projection_Kernel_Launcher(const float* volume_ptr, float *out, const { //COPY inv AR matrix to graphics card as float array auto matrices_size_b = number_of_projections * 9 * sizeof(float); - float *d_inv_AR_matrices; - gpuErrchk(cudaMalloc(&d_inv_AR_matrices, matrices_size_b)); - gpuErrchk(cudaMemcpy(d_inv_AR_matrices, inv_AR_matrix, matrices_size_b, cudaMemcpyHostToDevice)); //COPY source points to graphics card as float3 auto src_points_size_b = number_of_projections * sizeof(float3); - float3 *d_src_points; - gpuErrchk(cudaMalloc(&d_src_points, src_points_size_b)); - gpuErrchk(cudaMemcpy(d_src_points, src_points, src_points_size_b, cudaMemcpyHostToDevice)); - uint3 volume_size = make_uint3(volume_width, volume_height, volume_depth); float3 volume_spacing = make_float3(volume_spacing_x, volume_spacing_y, volume_spacing_z); @@ -244,14 +237,12 @@ void Cone_Projection_Kernel_Launcher(const float* volume_ptr, float *out, const const dim3 blocksize = dim3( BLOCKSIZE_X, BLOCKSIZE_Y, 1 ); const dim3 gridsize = dim3( detector_size.x / blocksize.x + 1, detector_size.y / blocksize.y + 1 , number_of_projections+1); - project_3Dcone_beam_kernel<<<gridsize, blocksize>>>(volume_ptr, out, d_inv_AR_matrices, d_src_points, step_size, + project_3Dcone_beam_kernel<<<gridsize, blocksize>>>(volume_ptr, out, inv_AR_matrix, reinterpret_cast<const float3*>(src_points), step_size, volume_size,volume_spacing, detector_size,number_of_projections,pointer_offsets); cudaDeviceSynchronize(); // check for errors gpuErrchk( cudaPeekAtLastError() ); - gpuErrchk(cudaFree(d_inv_AR_matrices)); - gpuErrchk(cudaFree(d_src_points)); } diff --git a/src/pyronn_torch/PYRO-NN-Layers b/src/pyronn_torch/PYRO-NN-Layers index 1d10af3..2cf5bdb 160000 --- a/src/pyronn_torch/PYRO-NN-Layers +++ b/src/pyronn_torch/PYRO-NN-Layers @@ -1 +1 @@ -Subproject commit 1d10af309ff19d637bf4e991f4cb0aa58b561840 +Subproject commit 2cf5bdbf36e606eedf51ffb2ae84a7e3a4bb4492 diff --git a/src/pyronn_torch/codegen.py b/src/pyronn_torch/codegen.py index 97b3763..602937a 100644 --- a/src/pyronn_torch/codegen.py +++ b/src/pyronn_torch/codegen.py @@ -192,6 +192,7 @@ def generate_shared_object(output_folder=None, copytree(join(dirname(__file__), 'PYRO-NN-Layers', 'helper_headers'), join(object_cache, module_name, 'helper_headers')) if update_repo_files: + rmtree(join(output_folder, 'helper_headers'), ignore_errors=True) copytree(join(dirname(__file__), 'PYRO-NN-Layers', 'helper_headers'), join(output_folder, 'helper_headers')) -- GitLab