From 102157fde623eeb9c42c6d14c97339a4f497414a Mon Sep 17 00:00:00 2001
From: Stephan Seitz <stephan.seitz@fau.de>
Date: Fri, 13 Mar 2020 14:52:38 +0100
Subject: [PATCH] Avoid copying of projection_matrices on each kernel call

---
 generated_files/cone_backprojector_3D_CudaKernel.cu   | 11 +++--------
 ...one_backprojector_3D_CudaKernel_hardware_interp.cu | 11 +++--------
 generated_files/cone_projector_3D_CudaKernel.cu       | 11 +----------
 src/pyronn_torch/PYRO-NN-Layers                       |  2 +-
 src/pyronn_torch/codegen.py                           |  1 +
 5 files changed, 9 insertions(+), 27 deletions(-)

diff --git a/generated_files/cone_backprojector_3D_CudaKernel.cu b/generated_files/cone_backprojector_3D_CudaKernel.cu
index 4b899de..dc92f4b 100644
--- a/generated_files/cone_backprojector_3D_CudaKernel.cu
+++ b/generated_files/cone_backprojector_3D_CudaKernel.cu
@@ -43,7 +43,7 @@ inline void gpuAssert(cudaError_t code, const char *file, int line,
   }
 }
 
-inline __device__ float3 map(float3 coordinates, float *d_projection_matrices,
+inline __device__ float3 map(float3 coordinates, const float *d_projection_matrices,
                              int n) {
   const float *matrix = &(d_projection_matrices[n * 12]);
 
@@ -99,7 +99,7 @@ inline __device__ float interp2D(const float *const_volume_ptr,
 }
 
 __global__ void backproject_3Dcone_beam_kernel(
-    const float *sinogram_ptr, float *vol, float *d_projection_matrices,
+    const float *sinogram_ptr, float *vol, const float *d_projection_matrices,
     const int number_of_projections, const uint3 volume_size,
     const float3 volume_spacing, const float3 volume_origin,
     const uint2 detector_size, const uint3 pointer_offsets,
@@ -144,10 +144,6 @@ void Cone_Backprojection3D_Kernel_Launcher(
     const float projection_multiplier) {
   // COPY matrix to graphics card as float array
   auto matrices_size_b = number_of_projections * 12 * sizeof(float);
-  float *d_projection_matrices;
-  gpuErrchk(cudaMalloc(&d_projection_matrices, matrices_size_b));
-  gpuErrchk(cudaMemcpy(d_projection_matrices, projection_matrices,
-                       matrices_size_b, cudaMemcpyHostToDevice));
 
   uint3 volume_size = make_uint3(volume_width, volume_height, volume_depth);
   float3 volume_spacing =
@@ -168,10 +164,9 @@ void Cone_Backprojection3D_Kernel_Launcher(
   const dim3 block = dim3(BLOCKSIZE_X, BLOCKSIZE_Y, BLOCKSIZE_Z);
 
   backproject_3Dcone_beam_kernel<<<grid, block>>>(
-      sinogram_ptr, out, d_projection_matrices, number_of_projections,
+      sinogram_ptr, out, projection_matrices, number_of_projections,
       volume_size, volume_spacing, volume_origin, detector_size,
       pointer_offsets, projection_multiplier);
 
   gpuErrchk(cudaUnbindTexture(sinogram_as_texture));
-  gpuErrchk(cudaFree(d_projection_matrices));
 }
diff --git a/generated_files/cone_backprojector_3D_CudaKernel_hardware_interp.cu b/generated_files/cone_backprojector_3D_CudaKernel_hardware_interp.cu
index 0e528b1..34682bb 100644
--- a/generated_files/cone_backprojector_3D_CudaKernel_hardware_interp.cu
+++ b/generated_files/cone_backprojector_3D_CudaKernel_hardware_interp.cu
@@ -43,7 +43,7 @@ inline void gpuAssert(cudaError_t code, const char *file, int line,
   }
 }
 
-inline __device__ float3 map(float3 coordinates, float *d_projection_matrices,
+inline __device__ float3 map(float3 coordinates, const float *d_projection_matrices,
                              int n) {
   const float *matrix = &(d_projection_matrices[n * 12]);
 
@@ -56,7 +56,7 @@ inline __device__ float3 map(float3 coordinates, float *d_projection_matrices,
 }
 
 __global__ void backproject_3Dcone_beam_kernel_tex_interp(
-    float *vol, float *d_projection_matrices, const int number_of_projections,
+    float *vol, const float *d_projection_matrices, const int number_of_projections,
     const uint3 volume_size, const float3 volume_spacing,
     const float3 volume_origin, const float projection_multiplier) {
   const int i = blockIdx.x * blockDim.x + threadIdx.x;
@@ -114,10 +114,6 @@ void Cone_Backprojection3D_Kernel_Tex_Interp_Launcher(
     const float projection_multiplier) {
   // COPY matrix to graphics card as float array
   auto matrices_size_b = number_of_projections * 12 * sizeof(float);
-  float *d_projection_matrices;
-  gpuErrchk(cudaMalloc(&d_projection_matrices, matrices_size_b));
-  gpuErrchk(cudaMemcpy(d_projection_matrices, projection_matrices,
-                       matrices_size_b, cudaMemcpyHostToDevice));
 
   uint3 volume_size = make_uint3(volume_width, volume_height, volume_depth);
   float3 volume_spacing =
@@ -169,10 +165,9 @@ void Cone_Backprojection3D_Kernel_Tex_Interp_Launcher(
   const dim3 block = dim3(BLOCKSIZE_X, BLOCKSIZE_Y, BLOCKSIZE_Z);
 
   backproject_3Dcone_beam_kernel_tex_interp<<<grid, block>>>(
-      out, d_projection_matrices, number_of_projections, volume_size,
+      out, projection_matrices, number_of_projections, volume_size,
       volume_spacing, volume_origin, projection_multiplier);
 
   gpuErrchk(cudaUnbindTexture(sinogram_as_texture));
   gpuErrchk(cudaFreeArray(projArray));
-  gpuErrchk(cudaFree(d_projection_matrices));
 }
diff --git a/generated_files/cone_projector_3D_CudaKernel.cu b/generated_files/cone_projector_3D_CudaKernel.cu
index 70813e9..f5608f6 100644
--- a/generated_files/cone_projector_3D_CudaKernel.cu
+++ b/generated_files/cone_projector_3D_CudaKernel.cu
@@ -225,16 +225,9 @@ void Cone_Projection_Kernel_Launcher(const float* volume_ptr, float *out, const
 {
     //COPY inv AR matrix to graphics card as float array
     auto matrices_size_b = number_of_projections * 9 * sizeof(float);
-    float *d_inv_AR_matrices;
-    gpuErrchk(cudaMalloc(&d_inv_AR_matrices, matrices_size_b));
-    gpuErrchk(cudaMemcpy(d_inv_AR_matrices, inv_AR_matrix, matrices_size_b, cudaMemcpyHostToDevice));
     //COPY source points to graphics card as float3
     auto src_points_size_b = number_of_projections * sizeof(float3);
 
-    float3 *d_src_points;
-    gpuErrchk(cudaMalloc(&d_src_points, src_points_size_b));
-    gpuErrchk(cudaMemcpy(d_src_points, src_points, src_points_size_b, cudaMemcpyHostToDevice));
-
     uint3 volume_size = make_uint3(volume_width, volume_height, volume_depth);
     float3 volume_spacing = make_float3(volume_spacing_x, volume_spacing_y, volume_spacing_z);
 
@@ -244,14 +237,12 @@ void Cone_Projection_Kernel_Launcher(const float* volume_ptr, float *out, const
     const dim3 blocksize = dim3( BLOCKSIZE_X, BLOCKSIZE_Y, 1 );
     const dim3 gridsize = dim3( detector_size.x / blocksize.x + 1, detector_size.y / blocksize.y + 1 , number_of_projections+1);
 
-    project_3Dcone_beam_kernel<<<gridsize, blocksize>>>(volume_ptr, out, d_inv_AR_matrices, d_src_points, step_size,
+    project_3Dcone_beam_kernel<<<gridsize, blocksize>>>(volume_ptr, out, inv_AR_matrix, reinterpret_cast<const float3*>(src_points), step_size,
                                         volume_size,volume_spacing, detector_size,number_of_projections,pointer_offsets);
 
     cudaDeviceSynchronize();
 
     // check for errors
     gpuErrchk( cudaPeekAtLastError() );
-    gpuErrchk(cudaFree(d_inv_AR_matrices));
-    gpuErrchk(cudaFree(d_src_points));
 }
 
diff --git a/src/pyronn_torch/PYRO-NN-Layers b/src/pyronn_torch/PYRO-NN-Layers
index 1d10af3..2cf5bdb 160000
--- a/src/pyronn_torch/PYRO-NN-Layers
+++ b/src/pyronn_torch/PYRO-NN-Layers
@@ -1 +1 @@
-Subproject commit 1d10af309ff19d637bf4e991f4cb0aa58b561840
+Subproject commit 2cf5bdbf36e606eedf51ffb2ae84a7e3a4bb4492
diff --git a/src/pyronn_torch/codegen.py b/src/pyronn_torch/codegen.py
index 97b3763..602937a 100644
--- a/src/pyronn_torch/codegen.py
+++ b/src/pyronn_torch/codegen.py
@@ -192,6 +192,7 @@ def generate_shared_object(output_folder=None,
     copytree(join(dirname(__file__), 'PYRO-NN-Layers', 'helper_headers'),
              join(object_cache, module_name, 'helper_headers'))
     if update_repo_files:
+        rmtree(join(output_folder, 'helper_headers'), ignore_errors=True)
         copytree(join(dirname(__file__), 'PYRO-NN-Layers', 'helper_headers'),
                  join(output_folder, 'helper_headers'))
 
-- 
GitLab