diff --git a/generated_files/cone_backprojector_3D_CudaKernel.cu b/generated_files/cone_backprojector_3D_CudaKernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4b899de4c1b8d8b0feb284dd6feea16612f4ebd1
--- /dev/null
+++ b/generated_files/cone_backprojector_3D_CudaKernel.cu
@@ -0,0 +1,177 @@
+/*
+ * Copyright [2019] [Christopher Syben]
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Voxel-driven cone-beam back-projector CUDA kernel using kernel interpolation
+ * Implementation adapted from CONRAD
+ * PYRO-NN is developed as an Open Source project under the Apache License,
+ * Version 2.0.
+ */
+
+#include "helper_headers/helper_grid.h"
+#include "helper_headers/helper_math.h"
+
+#define BLOCKSIZE_X 16
+#define BLOCKSIZE_Y 4
+#define BLOCKSIZE_Z 4
+
+#include <cstdio>
+texture<float, cudaTextureType2DLayered> sinogram_as_texture;
+
+#define CUDART_INF_F __int_as_float(0x7f800000)
+
+#define gpuErrchk(ans)                                                         \
+  { gpuAssert((ans), __FILE__, __LINE__); }
+inline void gpuAssert(cudaError_t code, const char *file, int line,
+                      bool abort = true) {
+  if (code != cudaSuccess) {
+    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
+            line);
+    if (abort)
+      exit(code);
+  }
+}
+
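+// map() applies the 3x4 projection matrix of view n to the homogeneous world
+// point (x, y, z, 1) and returns the projective detector coordinate (u*w, v*w, w).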
+inline __device__ float3 map(float3 coordinates, float *d_projection_matrices,
+                             int n) {
+  const float *matrix = &(d_projection_matrices[n * 12]);
+
+  return make_float3(matrix[0] * coordinates.x + matrix[1] * coordinates.y +
+                         matrix[2] * coordinates.z + matrix[3],
+                     matrix[4] * coordinates.x + matrix[5] * coordinates.y +
+                         matrix[6] * coordinates.z + matrix[7],
+                     matrix[8] * coordinates.x + matrix[9] * coordinates.y +
+                         matrix[10] * coordinates.z + matrix[11]);
+}
+
+inline __device__ float interp2D(const float *const_volume_ptr,
+                                 const float3 point, const uint3 pointer_offset,
+                                 const uint2 detector_size) {
+
+  const int x_f = __float2int_rn(floor(point.x));
+  const int y_f = __float2int_rn(floor(point.y));
+
+  const int x_c = x_f + 1;
+  const int y_c = y_f + 1;
+
+  int i00 = (uint)point.z * pointer_offset.z + y_f * pointer_offset.y + x_f;
+  int i01 = (uint)point.z * pointer_offset.z + y_f * pointer_offset.y + x_c;
+  int i10 = (uint)point.z * pointer_offset.z + y_c * pointer_offset.y + x_f;
+  int i11 = (uint)point.z * pointer_offset.z + y_c * pointer_offset.y + x_c;
+
+  float p00 =
+      (x_f < 0 || x_f >= detector_size.x || y_f < 0 || y_f >= detector_size.y)
+          ? 0.0f
+          : __ldg(&const_volume_ptr[i00]);
+  float p01 =
+      (x_c < 0 || x_c >= detector_size.x || y_f < 0 || y_f >= detector_size.y)
+          ? 0.0f
+          : __ldg(&const_volume_ptr[i01]);
+  float p10 =
+      (x_f < 0 || x_f >= detector_size.x || y_c < 0 || y_c >= detector_size.y)
+          ? 0.0f
+          : __ldg(&const_volume_ptr[i10]);
+  float p11 =
+      (x_c < 0 || x_c >= detector_size.x || y_c < 0 || y_c >= detector_size.y)
+          ? 0.0f
+          : __ldg(&const_volume_ptr[i11]);
+
+  const float x_d = (point.x - x_f);
+  const float y_d = (point.y - y_f);
+
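+  // Bilinear interpolation composed of fused lerps (first along x, then y):
+  // lerp(a, b, t) = fma(b, t, fma(a, -t, a)) = a + t * (b - a).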
+  const float p0 = __fmaf_rn(p01, x_d, __fmaf_rn(p00, -x_d, p00));
+  const float p1 = __fmaf_rn(p11, x_d, __fmaf_rn(p10, -x_d, p10));
+
+  const float p = __fmaf_rn(p1, y_d, __fmaf_rn(p0, -y_d, p0));
+
+  return p;
+}
+
+__global__ void backproject_3Dcone_beam_kernel(
+    const float *sinogram_ptr, float *vol, float *d_projection_matrices,
+    const int number_of_projections, const uint3 volume_size,
+    const float3 volume_spacing, const float3 volume_origin,
+    const uint2 detector_size, const uint3 pointer_offsets,
+    const float projection_multiplier) {
+  const int i = blockIdx.x * blockDim.x + threadIdx.x;
+  const int j = blockIdx.y * blockDim.y + threadIdx.y;
+  const int k = blockIdx.z * blockDim.z + threadIdx.z;
+
+  if (i >= volume_size.x || j >= volume_size.y || k >= volume_size.z)
+    return;
+
+  const float3 coordinates =
+      index_to_physical(make_float3(i, j, k), volume_origin, volume_spacing);
+
+  float val = 0.0f;
+
+  for (int n = 0; n < number_of_projections; ++n) {
+    auto ip = map(coordinates, d_projection_matrices, n);
+
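+    // Perspective divide by the homogeneous coordinate; the ip.z * ip.z factor
+    // below applies the inverse-square distance weighting of the back-projection.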
+    ip.z = 1.0f / ip.z;
+    ip.x *= ip.z;
+    ip.y *= ip.z;
+    float3 point = make_float3(ip.x, ip.y, n);
+
+    val += interp2D(sinogram_ptr, point, pointer_offsets, detector_size) *
+           ip.z * ip.z;
+  }
+
+  // linear volume address
+  const unsigned int l = volume_size.x * (k * volume_size.y + j) + i;
+  vol[l] = (val * projection_multiplier);
+}
+
+void Cone_Backprojection3D_Kernel_Launcher(
+    const float *sinogram_ptr, float *out, const float *projection_matrices,
+    const int number_of_projections, const int volume_width,
+    const int volume_height, const int volume_depth,
+    const float volume_spacing_x, const float volume_spacing_y,
+    const float volume_spacing_z, const float volume_origin_x,
+    const float volume_origin_y, const float volume_origin_z,
+    const int detector_width, const int detector_height,
+    const float projection_multiplier) {
+  // COPY matrix to graphics card as float array
+  auto matrices_size_b = number_of_projections * 12 * sizeof(float);
+  float *d_projection_matrices;
+  gpuErrchk(cudaMalloc(&d_projection_matrices, matrices_size_b));
+  gpuErrchk(cudaMemcpy(d_projection_matrices, projection_matrices,
+                       matrices_size_b, cudaMemcpyHostToDevice));
+
+  uint3 volume_size = make_uint3(volume_width, volume_height, volume_depth);
+  float3 volume_spacing =
+      make_float3(volume_spacing_x, volume_spacing_y, volume_spacing_z);
+  float3 volume_origin =
+      make_float3(volume_origin_x, volume_origin_y, volume_origin_z);
+
+  uint2 detector_size = make_uint2(detector_width, detector_height);
+
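+  // Linear memory strides of the sinogram: 1 per detector column, detector_width
+  // per detector row, and detector_width * detector_height per projection.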
+  uint3 pointer_offsets =
+      make_uint3(1, detector_width, detector_height * detector_width);
+
+  // launch kernel
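+  // Integer ceil division: enough blocks to cover sizes that are not multiples of the block size.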
+  const unsigned int gridsize_x = (volume_size.x - 1) / BLOCKSIZE_X + 1;
+  const unsigned int gridsize_y = (volume_size.y - 1) / BLOCKSIZE_Y + 1;
+  const unsigned int gridsize_z = (volume_size.z - 1) / BLOCKSIZE_Z + 1;
+  const dim3 grid = dim3(gridsize_x, gridsize_y, gridsize_z);
+  const dim3 block = dim3(BLOCKSIZE_X, BLOCKSIZE_Y, BLOCKSIZE_Z);
+
+  backproject_3Dcone_beam_kernel<<<grid, block>>>(
+      sinogram_ptr, out, d_projection_matrices, number_of_projections,
+      volume_size, volume_spacing, volume_origin, detector_size,
+      pointer_offsets, projection_multiplier);
+
+  gpuErrchk(cudaUnbindTexture(sinogram_as_texture));
+  gpuErrchk(cudaFree(d_projection_matrices));
+}
diff --git a/generated_files/cone_backprojector_3D_CudaKernel_hardware_interp.cu b/generated_files/cone_backprojector_3D_CudaKernel_hardware_interp.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0e528b1a956abebf1e43a9f18a9abba06e60e72a
--- /dev/null
+++ b/generated_files/cone_backprojector_3D_CudaKernel_hardware_interp.cu
@@ -0,0 +1,178 @@
+/*
+ * Copyright [2019] [Christopher Syben]
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Voxel-driven cone-beam back-projector CUDA kernel using texture interpolation
+ * Implementation adapted from CONRAD
+ * PYRO-NN is developed as an Open Source project under the Apache License,
+ * Version 2.0.
+ */
+
+#include "helper_headers/helper_grid.h"
+#include "helper_headers/helper_math.h"
+#include <cstdio>
+
+#define BLOCKSIZE_X 16
+#define BLOCKSIZE_Y 4
+#define BLOCKSIZE_Z 4
+
+texture<float, cudaTextureType2DLayered> sinogram_as_texture;
+
+#define CUDART_INF_F __int_as_float(0x7f800000)
+
+#define gpuErrchk(ans)                                                         \
+  { gpuAssert((ans), __FILE__, __LINE__); }
+inline void gpuAssert(cudaError_t code, const char *file, int line,
+                      bool abort = true) {
+  if (code != cudaSuccess) {
+    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
+            line);
+    if (abort)
+      exit(code);
+  }
+}
+
+inline __device__ float3 map(float3 coordinates, float *d_projection_matrices,
+                             int n) {
+  const float *matrix = &(d_projection_matrices[n * 12]);
+
+  return make_float3(matrix[0] * coordinates.x + matrix[1] * coordinates.y +
+                         matrix[2] * coordinates.z + matrix[3],
+                     matrix[4] * coordinates.x + matrix[5] * coordinates.y +
+                         matrix[6] * coordinates.z + matrix[7],
+                     matrix[8] * coordinates.x + matrix[9] * coordinates.y +
+                         matrix[10] * coordinates.z + matrix[11]);
+}
+
+__global__ void backproject_3Dcone_beam_kernel_tex_interp(
+    float *vol, float *d_projection_matrices, const int number_of_projections,
+    const uint3 volume_size, const float3 volume_spacing,
+    const float3 volume_origin, const float projection_multiplier) {
+  const int i = blockIdx.x * blockDim.x + threadIdx.x;
+  const int j = blockIdx.y * blockDim.y + threadIdx.y;
+  const int k = blockIdx.z * blockDim.z + threadIdx.z;
+
+  if (i >= volume_size.x || j >= volume_size.y || k >= volume_size.z)
+    return;
+
+  const float3 coordinates =
+      index_to_physical(make_float3(i, j, k), volume_origin, volume_spacing);
+
+  float val = 0.0f;
+
+  for (int n = 0; n < number_of_projections; ++n) {
+    auto ip = map(coordinates, d_projection_matrices, n);
+
+    ip.z = 1.0f / (float)ip.z;
+    ip.x *= ip.z;
+    ip.y *= ip.z;
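+    // Unnormalized CUDA texture fetches sample texel centers at half-integer
+    // coordinates, hence the +0.5 offsets below.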
+    val += tex2DLayered(sinogram_as_texture, ip.x + 0.5, ip.y + 0.5, n) * ip.z *
+           ip.z;
+  }
+
+  // linear volume address
+  const unsigned int l = volume_size.x * (k * volume_size.y + j) + i;
+  vol[l] = (val * projection_multiplier);
+}
+
+/*************** WARNING ******************
+ *
+ *   TensorFlow allocates almost the whole GPU memory for itself and leaves only
+ * a small amount of slack memory; cudaMalloc and cudaMalloc3D allocate from this
+ * slack memory. Therefore, currently only small volumes can be used (they have
+ * to fit into the slack memory that TF does not allocate).
+ *
+ *   This kernel is based on texture interpolation, so these allocations are not
+ * within the TensorFlow-managed memory. If memory errors occur:
+ *    1. start TensorFlow with less GPU memory and allow growth, or
+ *    2. switch to software-based interpolation.
+ *
+ *   TODO: use context->allocate_temp and context->allocate_persistent instead of
+ * cudaMalloc for the projection_matrices array:
+ * https://stackoverflow.com/questions/48580580/tensorflow-new-op-cuda-kernel-memory-managment
+ *
+ */
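+/*
+ * Illustrative sketch only (an assumption, not part of this kernel): the TODO
+ * above could be addressed from the op's Compute() method, where a TensorFlow
+ * OpKernelContext *context is available, so that the projection matrices live
+ * in TF-managed memory instead of a raw cudaMalloc buffer:
+ *
+ *   Tensor matrices_tmp;  // hypothetical temporary tensor
+ *   OP_REQUIRES_OK(context, context->allocate_temp(
+ *       DT_FLOAT, TensorShape({number_of_projections, 12}), &matrices_tmp));
+ *   float *d_projection_matrices = matrices_tmp.flat<float>().data();
+ *   // copy projection_matrices into this buffer and pass it to the launcher
+ */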
+void Cone_Backprojection3D_Kernel_Tex_Interp_Launcher(
+    const float *sinogram_ptr, float *out, const float *projection_matrices,
+    const int number_of_projections, const int volume_width,
+    const int volume_height, const int volume_depth,
+    const float volume_spacing_x, const float volume_spacing_y,
+    const float volume_spacing_z, const float volume_origin_x,
+    const float volume_origin_y, const float volume_origin_z,
+    const int detector_width, const int detector_height,
+    const float projection_multiplier) {
+  // COPY matrix to graphics card as float array
+  auto matrices_size_b = number_of_projections * 12 * sizeof(float);
+  float *d_projection_matrices;
+  gpuErrchk(cudaMalloc(&d_projection_matrices, matrices_size_b));
+  gpuErrchk(cudaMemcpy(d_projection_matrices, projection_matrices,
+                       matrices_size_b, cudaMemcpyHostToDevice));
+
+  uint3 volume_size = make_uint3(volume_width, volume_height, volume_depth);
+  float3 volume_spacing =
+      make_float3(volume_spacing_x, volume_spacing_y, volume_spacing_z);
+  float3 volume_origin =
+      make_float3(volume_origin_x, volume_origin_y, volume_origin_z);
+
+  uint2 detector_size = make_uint2(detector_width, detector_height);
+
+  // COPY volume to graphics card
+
+  // set texture properties
+  sinogram_as_texture.addressMode[0] = cudaAddressModeBorder;
+  sinogram_as_texture.addressMode[1] = cudaAddressModeBorder;
+  sinogram_as_texture.addressMode[2] = cudaAddressModeBorder;
+  sinogram_as_texture.filterMode = cudaFilterModeLinear;
+  sinogram_as_texture.normalized = false;
+
+  // malloc cuda array for texture
+  cudaExtent projExtent =
+      make_cudaExtent(detector_size.x, detector_size.y, number_of_projections);
+
+  cudaArray *projArray;
+
+  static cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+  gpuErrchk(cudaMalloc3DArray(&projArray, &channelDesc, projExtent,
+                              cudaArrayLayered));
+
+  auto pitch_ptr = make_cudaPitchedPtr(const_cast<float *>(sinogram_ptr),
+                                       detector_size.x * sizeof(float),
+                                       detector_size.x, detector_size.y);
+  // copy data to 3D array
+  cudaMemcpy3DParms copyParams = {0};
+  copyParams.srcPtr = pitch_ptr;
+  copyParams.dstArray = projArray;
+  copyParams.extent = projExtent;
+  copyParams.kind = cudaMemcpyDeviceToDevice;
+  gpuErrchk(cudaMemcpy3D(&copyParams));
+
+  // bind texture reference
+  gpuErrchk(
+      cudaBindTextureToArray(sinogram_as_texture, projArray, channelDesc));
+
+  // launch kernel
+  const unsigned int gridsize_x = (volume_size.x - 1) / BLOCKSIZE_X + 1;
+  const unsigned int gridsize_y = (volume_size.y - 1) / BLOCKSIZE_Y + 1;
+  const unsigned int gridsize_z = (volume_size.z - 1) / BLOCKSIZE_Z + 1;
+  const dim3 grid = dim3(gridsize_x, gridsize_y, gridsize_z);
+  const dim3 block = dim3(BLOCKSIZE_X, BLOCKSIZE_Y, BLOCKSIZE_Z);
+
+  backproject_3Dcone_beam_kernel_tex_interp<<<grid, block>>>(
+      out, d_projection_matrices, number_of_projections, volume_size,
+      volume_spacing, volume_origin, projection_multiplier);
+
+  gpuErrchk(cudaUnbindTexture(sinogram_as_texture));
+  gpuErrchk(cudaFreeArray(projArray));
+  gpuErrchk(cudaFree(d_projection_matrices));
+}
diff --git a/generated_files/cone_projector_3D_CudaKernel.cu b/generated_files/cone_projector_3D_CudaKernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..70813e9cbc26d360d8c10c8c6c20b972e9cd7b4d
--- /dev/null
+++ b/generated_files/cone_projector_3D_CudaKernel.cu
@@ -0,0 +1,257 @@
+/*
+ * Copyright [2019] [Christopher Syben]
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * 
+ * Ray-driven cone-beam projector CUDA kernel using kernel interpolation
+ * Implementation adapted from CONRAD
+ * PYRO-NN is developed as an Open Source project under the Apache License, Version 2.0.
+*/
+
+#include "helper_headers/helper_grid.h"
+#include "helper_headers/helper_math.h"
+#define CUDART_INF_F __int_as_float(0x7f800000)
+
+#define BLOCKSIZE_X           16
+#define BLOCKSIZE_Y           16
+
+#include <cstdio>
+
+#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
+inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
+{
+   if (code != cudaSuccess) 
+   {        
+      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
+      if (abort)
+         exit(code);
+   }
+}
+
+inline __device__ float interp3D(const float* const_volume_ptr, const float3 point, const uint3 pointer_offset, const uint3 volume_size){
+
+    const int x_f = __float2int_rn( floor(point.x) ); 
+    const int y_f = __float2int_rn( floor(point.y) );
+    const int z_f = __float2int_rn( floor(point.z) );
+
+    const int x_c = x_f+1;
+    const int y_c = y_f+1;
+    const int z_c = z_f+1;
+     
+    uint i000 = z_f*pointer_offset.z + y_f * pointer_offset.y + x_f;
+    uint i001 = z_f*pointer_offset.z + y_f * pointer_offset.y + x_c;
+    uint i010 = z_f*pointer_offset.z + y_c * pointer_offset.y + x_f;
+    uint i011 = z_f*pointer_offset.z + y_c * pointer_offset.y + x_c;
+    uint i100 = z_c*pointer_offset.z + y_f * pointer_offset.y + x_f;
+    uint i101 = z_c*pointer_offset.z + y_f * pointer_offset.y + x_c;
+    uint i110 = z_c*pointer_offset.z + y_c * pointer_offset.y + x_f;
+    uint i111 = z_c*pointer_offset.z + y_c * pointer_offset.y + x_c;
+
+    float p000 = ( z_f < 0 || z_f >= volume_size.z || y_f < 0 || y_f >= volume_size.y || x_f < 0 || x_f >= volume_size.x ) ? 0.0f : __ldg(&const_volume_ptr[ i000 ]);
+    float p001 = ( z_f < 0 || z_f >= volume_size.z || y_f < 0 || y_f >= volume_size.y || x_c < 0 || x_c >= volume_size.x ) ? 0.0f : __ldg(&const_volume_ptr[ i001 ]);
+    float p010 = ( z_f < 0 || z_f >= volume_size.z || y_c < 0 || y_c >= volume_size.y || x_f < 0 || x_f >= volume_size.x ) ? 0.0f : __ldg(&const_volume_ptr[ i010 ]); 
+    float p011 = ( z_f < 0 || z_f >= volume_size.z || y_c < 0 || y_c >= volume_size.y || x_c < 0 || x_c >= volume_size.x ) ? 0.0f : __ldg(&const_volume_ptr[ i011 ]);
+    float p100 = ( z_c < 0 || z_c >= volume_size.z || y_f < 0 || y_f >= volume_size.y || x_f < 0 || x_f >= volume_size.x ) ? 0.0f : __ldg(&const_volume_ptr[ i100 ]);
+    float p101 = ( z_c < 0 || z_c >= volume_size.z || y_f < 0 || y_f >= volume_size.y || x_c < 0 || x_c >= volume_size.x ) ? 0.0f : __ldg(&const_volume_ptr[ i101 ]);
+    float p110 = ( z_c < 0 || z_c >= volume_size.z || y_c < 0 || y_c >= volume_size.y || x_f < 0 || x_f >= volume_size.x ) ? 0.0f : __ldg(&const_volume_ptr[ i110 ]);
+    float p111 = ( z_c < 0 || z_c >= volume_size.z || y_c < 0 || y_c >= volume_size.y || x_c < 0 || x_c >= volume_size.x ) ? 0.0f : __ldg(&const_volume_ptr[ i111 ]);
+
+    const float x_d = (point.x - x_f);
+    const float y_d = (point.y - y_f); 
+    const float z_d = (point.z - z_f);
+
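+    // Trilinear interpolation composed of fused lerps (first along z, then y, then x):
+    // lerp(a, b, t) = fma(b, t, fma(a, -t, a)) = a + t * (b - a).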
+    const float p00 = __fmaf_rn(p100,z_d,__fmaf_rn(p000,-z_d,p000));
+    const float p01 = __fmaf_rn(p101,z_d,__fmaf_rn(p001,-z_d,p001));
+    const float p10 = __fmaf_rn(p110,z_d,__fmaf_rn(p010,-z_d,p010));
+    const float p11 = __fmaf_rn(p111,z_d,__fmaf_rn(p011,-z_d,p011));
+
+    const float p0 = __fmaf_rn(p10,y_d,__fmaf_rn(p00,-y_d,p00));
+    const float p1 = __fmaf_rn(p11,y_d,__fmaf_rn(p01,-y_d,p01));
+
+    const float p = __fmaf_rn(p1,x_d,__fmaf_rn(p0,-x_d,p0));
+
+    return p;
+}
+
+inline __device__ float kernel_project3D(const float* volume_ptr, const float3 source_point, const float3 ray_vector,
+                                         const float step_size, const uint3 volume_size, const uint3 pointer_offsets)
+{
+    float pixel = 0.0f;
+    // Step 1: compute alpha value at entry and exit point of the volume
+    float min_alpha, max_alpha;
+    min_alpha = 0;
+    max_alpha = CUDART_INF_F;
+    
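+    // Slab method: intersect the ray with the axis-aligned volume bounds along each
+    // axis and keep the overlap [min_alpha, max_alpha] of the per-axis intervals.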
+    if (0.0f != ray_vector.x)
+    {
+        float volume_min_edge_point = 0;
+        float volume_max_edge_point = volume_size.x;
+
+        float reci = 1.0f / ray_vector.x;
+        float alpha0 = (volume_min_edge_point - source_point.x) * reci;
+        float alpha1 = (volume_max_edge_point - source_point.x) * reci;
+        min_alpha = fmin(alpha0, alpha1);
+        max_alpha = fmax(alpha0, alpha1);
+    }
+
+    if (0.0f != ray_vector.y)
+    {
+        float volume_min_edge_point = 0;
+        float volume_max_edge_point = volume_size.y;
+
+        float reci = 1.0f / ray_vector.y;
+
+        float alpha0 = (volume_min_edge_point - source_point.y) * reci;
+        float alpha1 = (volume_max_edge_point - source_point.y) * reci;
+        min_alpha = fmax(min_alpha, fmin(alpha0, alpha1));
+        max_alpha = fmin(max_alpha, fmax(alpha0, alpha1));
+    }
+
+    if (0.0f != ray_vector.z)
+    {
+        float volume_min_edge_point = 0;
+        float volume_max_edge_point = volume_size.z;
+
+        float reci = 1.0f / ray_vector.z;
+
+        float alpha0 = (volume_min_edge_point - source_point.z) * reci;
+        float alpha1 = (volume_max_edge_point - source_point.z) * reci;
+        min_alpha = fmax(min_alpha, fmin(alpha0, alpha1));
+        max_alpha = fmin(max_alpha, fmax(alpha0, alpha1));
+    }
+    // we start not at the exact entry point
+    // => we can be sure to be inside the volume
+    min_alpha += step_size * 0.5f;
+    // Step 2: Cast ray if it intersects the volume
+    // Trapezoidal rule (interpolating function = piecewise linear func)
+
+    float3 point = make_float3(0,0,0);
+    // Entrance boundary
+    // For the initial interpolated value, only a half stepsize is
+    //  considered in the computation.
+    if (min_alpha < max_alpha)
+    {
+        point.x = source_point.x + min_alpha * ray_vector.x;
+        point.y = source_point.y + min_alpha * ray_vector.y;
+        point.z = source_point.z + min_alpha * ray_vector.z;
+        pixel += 0.5f * interp3D(volume_ptr, point, pointer_offsets, volume_size);
+        min_alpha += step_size;        
+    }
+
+    while (min_alpha < max_alpha)
+    {
+        point.x = source_point.x + min_alpha * ray_vector.x;
+        point.y = source_point.y + min_alpha * ray_vector.y;
+        point.z = source_point.z + min_alpha * ray_vector.z;
+        pixel += interp3D(volume_ptr, point, pointer_offsets, volume_size);
+        min_alpha += step_size;
+    }    
+    // Scaling by stepsize;
+    pixel *= step_size;
+
+    //Last segment of the line
+    if (pixel > 0.0f)
+    {   
+        pixel -= 0.5f * step_size * interp3D(volume_ptr, point, pointer_offsets, volume_size);
+        min_alpha -= step_size;
+        float last_step_size = max_alpha - min_alpha;
+
+        pixel += 0.5f * last_step_size * interp3D(volume_ptr, point, pointer_offsets, volume_size);
+
+        point.x = source_point.x + max_alpha * ray_vector.x;
+        point.y = source_point.y + max_alpha * ray_vector.y;
+        point.z = source_point.z + max_alpha * ray_vector.z;
+
+        // The last segment of the line integral takes care of the
+        // varying length.
+        pixel += 0.5f * last_step_size * interp3D(volume_ptr, point , pointer_offsets, volume_size);
+    }
+    return pixel;
+}
+__global__ void project_3Dcone_beam_kernel( const float* volume_ptr, float *pSinogram, 
+                                            const float *d_inv_AR_matrices, const float3 *d_src_points, const float sampling_step_size,
+                                            const uint3 volume_size, const float3 volume_spacing, const uint2 detector_size, const int number_of_projections, 
+                                            const uint3 pointer_offsets)
+{
+    uint2 detector_idx = make_uint2( blockIdx.x * blockDim.x + threadIdx.x,  blockIdx.y* blockDim.y + threadIdx.y  );
+    uint projection_number = blockIdx.z;
+    if (detector_idx.x >= detector_size.x || detector_idx.y >= detector_size.y || blockIdx.z >= number_of_projections)
+    {
+        return;
+    }
+    // Preparations:
+    d_inv_AR_matrices += projection_number * 9;
+    float3 source_point = d_src_points[projection_number];
+    //Compute ray direction
+    const float rx = d_inv_AR_matrices[2] + detector_idx.y * d_inv_AR_matrices[1] + detector_idx.x * d_inv_AR_matrices[0];
+    const float ry = d_inv_AR_matrices[5] + detector_idx.y * d_inv_AR_matrices[4] + detector_idx.x * d_inv_AR_matrices[3];
+    const float rz = d_inv_AR_matrices[8] + detector_idx.y * d_inv_AR_matrices[7] + detector_idx.x * d_inv_AR_matrices[6];
+
+    float3 ray_vector = make_float3(rx,ry,rz);
+    ray_vector = normalize(ray_vector);
+
+    float pixel = kernel_project3D(
+        volume_ptr,
+        source_point,
+        ray_vector,
+        sampling_step_size,
+        volume_size,
+        pointer_offsets);
+
+    unsigned sinogram_idx = projection_number * detector_size.y * detector_size.x +  detector_idx.y * detector_size.x + detector_idx.x;
+
+     pixel *= sqrt(  (ray_vector.x * volume_spacing.x) * (ray_vector.x * volume_spacing.x) +
+                     (ray_vector.y * volume_spacing.y) * (ray_vector.y * volume_spacing.y) +
+                     (ray_vector.z * volume_spacing.z) * (ray_vector.z * volume_spacing.z)  );
+
+    pSinogram[sinogram_idx] = pixel;
+    return;
+}
+
+void Cone_Projection_Kernel_Launcher(const float* volume_ptr, float *out, const float *inv_AR_matrix, const float *src_points, 
+                                    const int number_of_projections, const int volume_width, const int volume_height, const int volume_depth, 
+                                    const float volume_spacing_x, const float volume_spacing_y, const float volume_spacing_z,
+                                    const int detector_width, const int detector_height, const float step_size)
+{
+    //COPY inv AR matrix to graphics card as float array
+    auto matrices_size_b = number_of_projections * 9 * sizeof(float);
+    float *d_inv_AR_matrices;
+    gpuErrchk(cudaMalloc(&d_inv_AR_matrices, matrices_size_b));
+    gpuErrchk(cudaMemcpy(d_inv_AR_matrices, inv_AR_matrix, matrices_size_b, cudaMemcpyHostToDevice));
+    //COPY source points to graphics card as float3
+    auto src_points_size_b = number_of_projections * sizeof(float3);
+
+    float3 *d_src_points;
+    gpuErrchk(cudaMalloc(&d_src_points, src_points_size_b));
+    gpuErrchk(cudaMemcpy(d_src_points, src_points, src_points_size_b, cudaMemcpyHostToDevice));
+
+    uint3 volume_size = make_uint3(volume_width, volume_height, volume_depth);
+    float3 volume_spacing = make_float3(volume_spacing_x, volume_spacing_y, volume_spacing_z);
+
+    uint2 detector_size = make_uint2(detector_width, detector_height);
+    uint3 pointer_offsets = make_uint3(1,volume_width,volume_width*volume_height);
+    
+    const dim3 blocksize = dim3( BLOCKSIZE_X, BLOCKSIZE_Y, 1 );
+    const dim3 gridsize = dim3( detector_size.x / blocksize.x + 1, detector_size.y / blocksize.y + 1 , number_of_projections+1);
+
+    project_3Dcone_beam_kernel<<<gridsize, blocksize>>>(volume_ptr, out, d_inv_AR_matrices, d_src_points, step_size,
+                                        volume_size,volume_spacing, detector_size,number_of_projections,pointer_offsets);
+
+    cudaDeviceSynchronize();
+
+    // check for errors
+    gpuErrchk( cudaPeekAtLastError() );
+    gpuErrchk(cudaFree(d_inv_AR_matrices));
+    gpuErrchk(cudaFree(d_src_points));
+}
+
diff --git a/generated_files/cone_projector_3D_CudaKernel_hardware_interp.cu b/generated_files/cone_projector_3D_CudaKernel_hardware_interp.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a533371b8ef59be520053ac2ecdf0865f3a22ffd
--- /dev/null
+++ b/generated_files/cone_projector_3D_CudaKernel_hardware_interp.cu
@@ -0,0 +1,280 @@
+/*
+ * Copyright [2019] [Christopher Syben]
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Ray-driven cone-beam projector CUDA kernel using texture interpolation
+ * Implementation adapted from CONRAD
+ * PYRO-NN is developed as an Open Source project under the Apache License,
+ * Version 2.0.
+ */
+
+#include "helper_headers/helper_grid.h"
+#include "helper_headers/helper_math.h"
+texture<float, 3, cudaReadModeElementType> volume_as_texture;
+#define CUDART_INF_F __int_as_float(0x7f800000)
+
+#include <cstdio>
+#define BLOCKSIZE_X 16
+#define BLOCKSIZE_Y 16
+
+#define gpuErrchk(ans)                                                         \
+  { gpuAssert((ans), __FILE__, __LINE__); }
+inline void gpuAssert(cudaError_t code, const char *file, int line,
+                      bool abort = true) {
+  if (code != cudaSuccess) {
+    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
+            line);
+    if (abort)
+      exit(code);
+  }
+}
+
+__device__ float kernel_project3D_tex_interp(const float3 source_point,
+                                             const float3 ray_vector,
+                                             const float step_size,
+                                             const uint3 volume_size) {
+  float pixel = 0.0f;
+  // Step 1: compute alpha value at entry and exit point of the volume
+  float min_alpha, max_alpha;
+  min_alpha = 0;
+  max_alpha = CUDART_INF_F;
+
+  if (0.0f != ray_vector.x) {
+    float volume_min_edge_point = 0 - 0.5f;
+    float volume_max_edge_point = volume_size.x - 0.5f;
+
+    float reci = 1.0f / ray_vector.x;
+    float alpha0 = (volume_min_edge_point - source_point.x) * reci;
+    float alpha1 = (volume_max_edge_point - source_point.x) * reci;
+    min_alpha = fmin(alpha0, alpha1);
+    max_alpha = fmax(alpha0, alpha1);
+  }
+
+  if (0.0f != ray_vector.y) {
+    float volume_min_edge_point = 0 - 0.5f;
+    float volume_max_edge_point = volume_size.y - 0.5f;
+
+    float reci = 1.0f / ray_vector.y;
+    float alpha0 = (volume_min_edge_point - source_point.y) * reci;
+    float alpha1 = (volume_max_edge_point - source_point.y) * reci;
+    min_alpha = fmax(min_alpha, fmin(alpha0, alpha1));
+    max_alpha = fmin(max_alpha, fmax(alpha0, alpha1));
+  }
+
+  if (0.0f != ray_vector.z) {
+    float volume_min_edge_point = 0 - 0.5f;
+    float volume_max_edge_point = volume_size.z - 0.5f;
+
+    float reci = 1.0f / ray_vector.z;
+    float alpha0 = (volume_min_edge_point - source_point.z) * reci;
+    float alpha1 = (volume_max_edge_point - source_point.z) * reci;
+    min_alpha = fmax(min_alpha, fmin(alpha0, alpha1));
+    max_alpha = fmin(max_alpha, fmax(alpha0, alpha1));
+  }
+  // we start not at the exact entry point
+  // => we can be sure to be inside the volume
+  min_alpha += step_size * 0.5f;
+
+  // Step 2: Cast ray if it intersects the volume
+  // Trapezoidal rule (interpolating function = piecewise linear func)
+
+  float px, py, pz;
+
+  // Entrance boundary
+  // For the initial interpolated value, only a half stepsize is
+  //  considered in the computation.
+  if (min_alpha < max_alpha) {
+    px = source_point.x + min_alpha * ray_vector.x;
+    py = source_point.y + min_alpha * ray_vector.y;
+    pz = source_point.z + min_alpha * ray_vector.z;
+
+    pixel += 0.5f * tex3D(volume_as_texture, px + 0.5f, py + 0.5f, pz + 0.5f);
+
+    min_alpha += step_size;
+  }
+  // Mid segments
+  while (min_alpha < max_alpha) {
+    px = source_point.x + min_alpha * ray_vector.x;
+    py = source_point.y + min_alpha * ray_vector.y;
+    pz = source_point.z + min_alpha * ray_vector.z;
+
+    pixel += tex3D(volume_as_texture, px + 0.5f, py + 0.5f, pz + 0.5f);
+
+    min_alpha += step_size;
+  }
+  // Scaling by stepsize;
+  pixel *= step_size;
+
+  // Last segment of the line
+  if (pixel > 0.0f) {
+    pixel -= 0.5f * step_size *
+             tex3D(volume_as_texture, px + 0.5f, py + 0.5f, pz + 0.5f);
+
+    min_alpha -= step_size;
+    float last_step_size = max_alpha - min_alpha;
+
+    pixel += 0.5f * last_step_size *
+             tex3D(volume_as_texture, px + 0.5f, py + 0.5f, pz + 0.5f);
+
+    px = source_point.x + max_alpha * ray_vector.x;
+    py = source_point.y + max_alpha * ray_vector.y;
+    pz = source_point.z + max_alpha * ray_vector.z;
+
+    // The last segment of the line integral takes care of the
+    // varying length.
+    pixel += 0.5f * last_step_size *
+             tex3D(volume_as_texture, px + 0.5f, py + 0.5f, pz + 0.5f);
+  }
+  return pixel;
+}
+__global__ void project_3Dcone_beam_kernel_tex_interp(
+    float *pSinogram, const float *d_inv_AR_matrices,
+    const float3 *d_src_points, const float sampling_step_size,
+    const uint3 volume_size, const float3 volume_spacing,
+    const uint2 detector_size, const int number_of_projections) {
+  uint2 detector_idx = make_uint2(blockIdx.x * blockDim.x + threadIdx.x,
+                                  blockIdx.y * blockDim.y + threadIdx.y);
+  uint projection_number = blockIdx.z;
+  if (detector_idx.x >= detector_size.x || detector_idx.y >= detector_size.y ||
+      blockIdx.z >= number_of_projections) {
+    return;
+  }
+  // Preparations:
+  d_inv_AR_matrices += projection_number * 9;
+  float3 source_point = d_src_points[projection_number];
+
+  // Compute ray direction
+  const float rx = d_inv_AR_matrices[2] +
+                   detector_idx.y * d_inv_AR_matrices[1] +
+                   detector_idx.x * d_inv_AR_matrices[0];
+  const float ry = d_inv_AR_matrices[5] +
+                   detector_idx.y * d_inv_AR_matrices[4] +
+                   detector_idx.x * d_inv_AR_matrices[3];
+  const float rz = d_inv_AR_matrices[8] +
+                   detector_idx.y * d_inv_AR_matrices[7] +
+                   detector_idx.x * d_inv_AR_matrices[6];
+
+  float3 ray_vector = make_float3(rx, ry, rz);
+  ray_vector = normalize(ray_vector);
+
+  float pixel = kernel_project3D_tex_interp(source_point, ray_vector,
+                                            sampling_step_size, volume_size);
+
+  pixel *= sqrt(
+      (ray_vector.x * volume_spacing.x) * (ray_vector.x * volume_spacing.x) +
+      (ray_vector.y * volume_spacing.y) * (ray_vector.y * volume_spacing.y) +
+      (ray_vector.z * volume_spacing.z) * (ray_vector.z * volume_spacing.z));
+
+  unsigned sinogram_idx =
+      projection_number * detector_size.y * detector_size.x +
+      detector_idx.y * detector_size.x + detector_idx.x;
+
+  pSinogram[sinogram_idx] = pixel;
+  return;
+}
+
+/*************** WARNING ******************
+ *
+ *   TensorFlow allocates almost the whole GPU memory for itself and leaves only
+ * a small amount of slack memory; cudaMalloc and cudaMalloc3D allocate from this
+ * slack memory. Therefore, currently only small volumes can be used (they have
+ * to fit into the slack memory that TF does not allocate).
+ *
+ *   This kernel is based on texture interpolation, so these allocations are not
+ * within the TensorFlow-managed memory. If memory errors occur:
+ *    1. start TensorFlow with less GPU memory and allow growth, or
+ *    2. switch to software-based interpolation.
+ *
+ *   TODO: use context->allocate_temp and context->allocate_persistent instead of
+ * cudaMalloc for the inv_AR_matrix and src_points arrays:
+ * https://stackoverflow.com/questions/48580580/tensorflow-new-op-cuda-kernel-memory-managment
+ *
+ */
+void Cone_Projection_Kernel_Tex_Interp_Launcher(
+    const float *__restrict__ volume_ptr, float *out,
+    const float *inv_AR_matrix, const float *src_points,
+    const int number_of_projections, const int volume_width,
+    const int volume_height, const int volume_depth,
+    const float volume_spacing_x, const float volume_spacing_y,
+    const float volume_spacing_z, const int detector_width,
+    const int detector_height, const float step_size) {
+  cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+  volume_as_texture.addressMode[0] = cudaAddressModeBorder;
+  volume_as_texture.addressMode[1] = cudaAddressModeBorder;
+  volume_as_texture.addressMode[2] = cudaAddressModeBorder;
+  volume_as_texture.filterMode = cudaFilterModeLinear;
+  volume_as_texture.normalized = false;
+
+  // COPY inv AR matrix to graphics card as float array
+  auto matrices_size_b = number_of_projections * 9 * sizeof(float);
+  float *d_inv_AR_matrices;
+  gpuErrchk(cudaMalloc(&d_inv_AR_matrices, matrices_size_b));
+  gpuErrchk(cudaMemcpy(d_inv_AR_matrices, inv_AR_matrix, matrices_size_b,
+                       cudaMemcpyHostToDevice));
+  // COPY source points to graphics card as float3
+  auto src_points_size_b = number_of_projections * sizeof(float3);
+  float3 *d_src_points;
+  gpuErrchk(cudaMalloc(&d_src_points, src_points_size_b));
+  gpuErrchk(cudaMemcpy(d_src_points, src_points, src_points_size_b,
+                       cudaMemcpyHostToDevice));
+
+  // COPY volume to graphics card
+  // Malloc cuda array for texture
+  cudaExtent volume_extent =
+      make_cudaExtent(volume_width, volume_height, volume_depth);
+  cudaExtent volume_extent_byte = make_cudaExtent(sizeof(float) * volume_width,
+                                                  volume_height, volume_depth);
+
+  cudaPitchedPtr d_volumeMem = make_cudaPitchedPtr(
+      const_cast<float *>(volume_ptr), volume_width * sizeof(float),
+      volume_width, volume_height);
+
+  cudaArray *volume_array;
+  gpuErrchk(cudaMalloc3DArray(&volume_array, &channelDesc, volume_extent));
+
+  cudaMemcpy3DParms copyParams = {0};
+  copyParams.srcPtr = d_volumeMem;
+  copyParams.dstArray = volume_array;
+  copyParams.extent = volume_extent;
+  copyParams.kind = cudaMemcpyDeviceToDevice;
+
+  gpuErrchk(cudaMemcpy3D(&copyParams));
+
+  gpuErrchk(
+      cudaBindTextureToArray(volume_as_texture, volume_array, channelDesc));
+
+  uint3 volume_size = make_uint3(volume_width, volume_height, volume_depth);
+  float3 volume_spacing =
+      make_float3(volume_spacing_x, volume_spacing_y, volume_spacing_z);
+
+  uint2 detector_size = make_uint2(detector_width, detector_height);
+
+  const dim3 blocksize = dim3(BLOCKSIZE_X, BLOCKSIZE_Y, 1);
+  const dim3 gridsize =
+      dim3(detector_size.x / blocksize.x + 1, detector_size.y / blocksize.y + 1,
+           number_of_projections + 1);
+
+  project_3Dcone_beam_kernel_tex_interp<<<gridsize, blocksize>>>(
+      out, d_inv_AR_matrices, d_src_points, step_size, volume_size,
+      volume_spacing, detector_size, number_of_projections);
+
+  cudaDeviceSynchronize();
+
+  // check for errors
+  gpuErrchk(cudaPeekAtLastError());
+  gpuErrchk(cudaUnbindTexture(volume_as_texture));
+  gpuErrchk(cudaFreeArray(volume_array));
+  gpuErrchk(cudaFree(d_inv_AR_matrices));
+  gpuErrchk(cudaFree(d_src_points));
+}
diff --git a/generated_files/fan_backprojector_2D_CudaKernel.cu b/generated_files/fan_backprojector_2D_CudaKernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b3833ca4bea4e28550092896493e6cd24641edcd
--- /dev/null
+++ b/generated_files/fan_backprojector_2D_CudaKernel.cu
@@ -0,0 +1,134 @@
+/*
+ * Copyright [2019] [Christopher Syben]
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Voxel-driven fan-beam back-projection CUDA kernel.
+ * PYRO-NN is developed as an Open Source project under the Apache License,
+ * Version 2.0.
+ */
+#include "helper_headers/helper_geometry_gpu.h"
+#include "helper_headers/helper_grid.h"
+#include "helper_headers/helper_math.h"
+
+
+texture<float, cudaTextureType2D, cudaReadModeElementType> sinogram_as_texture;
+#define CUDART_INF_F __int_as_float(0x7f800000)
+
+__global__ void backproject_2Dfan_beam_kernel(
+    float *pVolume, const float2 *d_rays, const int number_of_projections,
+    const float sampling_step_size, const int2 volume_size,
+    const float2 volume_spacing, const float2 volume_origin,
+    const int detector_size, const float detector_spacing,
+    const float detector_origin, const float sid, const float sdd) {
+  const float pi = 3.14159265359f;
+  unsigned int volume_x = blockIdx.x * blockDim.x + threadIdx.x;
+  unsigned int volume_y = blockIdx.y * blockDim.y + threadIdx.y;
+  if (volume_x >= volume_size.x || volume_y >= volume_size.y) {
+    return;
+  }
+  // Preparations:
+  const float2 pixel_coordinate = index_to_physical(
+      make_float2(volume_x, volume_y), volume_origin, volume_spacing);
+  float pixel_value = 0.0f;
+
+  for (int n = 0; n < number_of_projections; n++) {
+    float2 central_ray = d_rays[n];
+    float2 detector_vec = make_float2(-central_ray.y, central_ray.x);
+
+    float2 source_position = central_ray * (-sid);
+    float2 central_point = source_position + central_ray * sdd;
+
+    float2 intersection =
+        intersectLines2D(pixel_coordinate, source_position, central_point,
+                         central_point + detector_vec);
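+    // Back-projection weight: inverse square of the pixel-to-source distance.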
+    float distance_weight =
+        1.0f / (float)length(pixel_coordinate - source_position);
+    float s = dot(intersection, detector_vec);
+    float s_idx = physical_to_index(s, detector_origin, detector_spacing);
+
+    pixel_value += tex2D(sinogram_as_texture, s_idx + 0.5f, n + 0.5f) *
+                   distance_weight * distance_weight;
+  }
+
+  const unsigned volume_linearized_idx = volume_y * volume_size.x + volume_x;
+  pVolume[volume_linearized_idx] =
+      sid * sdd * pi * pixel_value / number_of_projections;
+
+  return;
+}
+/*************** WARNING ******************
+ *
+ *   TensorFlow allocates almost the whole GPU memory for itself and leaves only
+ * a small amount of slack memory; cudaMalloc and cudaMalloc3D allocate from this
+ * slack memory. Therefore, currently only small volumes can be used (they have
+ * to fit into the slack memory that TF does not allocate).
+ *
+ *   This kernel is based on texture interpolation, so these allocations are not
+ * within the TensorFlow-managed memory. If memory errors occur:
+ *    1. start TensorFlow with less GPU memory and allow growth
+ *    2. TODO: no software-interpolation-based 2D versions are available yet
+ *
+ *   TODO: use context->allocate_temp and context->allocate_persistent instead of
+ * cudaMalloc for the ray_vectors array:
+ * https://stackoverflow.com/questions/48580580/tensorflow-new-op-cuda-kernel-memory-managment
+ *
+ */
+void Fan_Backprojection2D_Kernel_Launcher(
+    const float *sinogram_ptr, float *out, const float *ray_vectors,
+    const int number_of_projections, const int volume_width,
+    const int volume_height, const float volume_spacing_x,
+    const float volume_spacing_y, const float volume_origin_x,
+    const float volume_origin_y, const int detector_size,
+    const float detector_spacing, const float detector_origin, const float sid,
+    const float sdd) {
+  cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+  sinogram_as_texture.addressMode[0] = cudaAddressModeBorder;
+  sinogram_as_texture.addressMode[1] = cudaAddressModeBorder;
+  sinogram_as_texture.filterMode = cudaFilterModeLinear;
+  sinogram_as_texture.normalized = false;
+
+  cudaArray *sinogram_array;
+  cudaMallocArray(&sinogram_array, &channelDesc, detector_size,
+                  number_of_projections);
+  cudaMemcpyToArray(sinogram_array, 0, 0, sinogram_ptr,
+                    detector_size * number_of_projections * sizeof(float),
+                    cudaMemcpyHostToDevice);
+  cudaBindTextureToArray(sinogram_as_texture, sinogram_array, channelDesc);
+
+  auto ray_size_b = number_of_projections * sizeof(float2);
+  float2 *d_rays;
+  cudaMalloc(&d_rays, ray_size_b);
+  cudaMemcpy(d_rays, ray_vectors, ray_size_b, cudaMemcpyHostToDevice);
+
+  float sampling_step_size = 1;
+
+  int2 volume_size = make_int2(volume_width, volume_height);
+  float2 volume_spacing = make_float2(volume_spacing_x, volume_spacing_y);
+  float2 volume_origin = make_float2(volume_origin_x, volume_origin_y);
+
+  const unsigned block_size = 16;
+  const dim3 threads_per_block = dim3(block_size, block_size);
+  const dim3 num_blocks = dim3(volume_width / threads_per_block.x + 1,
+                               volume_height / threads_per_block.y + 1);
+
+  backproject_2Dfan_beam_kernel<<<num_blocks, threads_per_block>>>(
+      out, d_rays, number_of_projections, sampling_step_size, volume_size,
+      volume_spacing, volume_origin, detector_size, detector_spacing,
+      detector_origin, sid, sdd);
+
+  cudaUnbindTexture(sinogram_as_texture);
+  cudaFreeArray(sinogram_array);
+  cudaFree(d_rays);
+}
+
diff --git a/generated_files/fan_projector_2D_CudaKernel.cu b/generated_files/fan_projector_2D_CudaKernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..562c6fe23ecddd55e625491ecb6e5cee5ffdecc4
--- /dev/null
+++ b/generated_files/fan_projector_2D_CudaKernel.cu
@@ -0,0 +1,239 @@
+/*
+ * Copyright [2019] [Christopher Syben]
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Ray-driven fan-beam projector CUDA kernel.
+ * PYRO-NN is developed as an Open Source project under the Apache License,
+ * Version 2.0.
+ */
+
+#include "helper_headers/helper_grid.h"
+#include "helper_headers/helper_math.h"
+
+texture<float, cudaTextureType2D, cudaReadModeElementType> volume_as_texture;
+#define CUDART_INF_F __int_as_float(0x7f800000)
+
+__device__ float kernel_project2D(const float2 source_point,
+                                  const float2 ray_vector,
+                                  const float step_size, const int2 volume_size,
+                                  const float2 volume_origin,
+                                  const float2 volume_spacing) {
+
+  float pixel = 0.0f;
+  // Step 1: compute alpha value at entry and exit point of the volume
+  float min_alpha, max_alpha;
+  min_alpha = 0;
+  max_alpha = CUDART_INF_F;
+
+  if (0.0f != ray_vector.x) {
+    float volume_min_edge_point =
+        index_to_physical(0, volume_origin.x, volume_spacing.x) - 0.5f;
+    float volume_max_edge_point =
+        index_to_physical(volume_size.x, volume_origin.x, volume_spacing.x) -
+        0.5f;
+
+    float reci = 1.0f / ray_vector.x;
+    float alpha0 = (volume_min_edge_point - source_point.x) * reci;
+    float alpha1 = (volume_max_edge_point - source_point.x) * reci;
+    min_alpha = fmin(alpha0, alpha1);
+    max_alpha = fmax(alpha0, alpha1);
+  }
+
+  if (0.0f != ray_vector.y) {
+    float volume_min_edge_point =
+        index_to_physical(0, volume_origin.y, volume_spacing.y) - 0.5f;
+    float volume_max_edge_point =
+        index_to_physical(volume_size.y, volume_origin.y, volume_spacing.y) -
+        0.5f;
+
+    float reci = 1.0f / ray_vector.y;
+    float alpha0 = (volume_min_edge_point - source_point.y) * reci;
+    float alpha1 = (volume_max_edge_point - source_point.y) * reci;
+    min_alpha = fmax(min_alpha, fmin(alpha0, alpha1));
+    max_alpha = fmin(max_alpha, fmax(alpha0, alpha1));
+  }
+
+  float px, py;
+  // pixel = source_point.x + min_alpha * ray_vector.x;
+  // Entrance boundary
+  // In CUDA, voxel centers are located at (xx.5, xx.5, xx.5),
+  //  whereas, SwVolume has voxel centers at integers.
+  // For the initial interpolated value, only a half stepsize is
+  //  considered in the computation.
+  if (min_alpha < max_alpha) {
+    px = source_point.x + min_alpha * ray_vector.x;
+    py = source_point.y + min_alpha * ray_vector.y;
+
+    pixel +=
+        0.5f *
+        tex2D(volume_as_texture,
+              physical_to_index(px, volume_origin.x, volume_spacing.x) + 0.5f,
+              physical_to_index(py, volume_origin.y, volume_spacing.y) + 0.5f);
+    min_alpha += step_size;
+  }
+  // Mid segments
+  while (min_alpha < max_alpha) {
+    px = source_point.x + min_alpha * ray_vector.x;
+    py = source_point.y + min_alpha * ray_vector.y;
+    pixel +=
+        tex2D(volume_as_texture,
+              physical_to_index(px, volume_origin.x, volume_spacing.x) + 0.5f,
+              physical_to_index(py, volume_origin.y, volume_spacing.y) + 0.5f);
+    min_alpha += step_size;
+  }
+  // Scaling by stepsize;
+  pixel *= step_size;
+
+  // Last segment of the line
+  if (pixel > 0.0f) {
+    pixel -=
+        0.5f * step_size *
+        tex2D(volume_as_texture,
+              physical_to_index(px, volume_origin.x, volume_spacing.x) + 0.5f,
+              physical_to_index(py, volume_origin.y, volume_spacing.y) + 0.5f);
+    min_alpha -= step_size;
+    float last_step_size = max_alpha - min_alpha;
+    pixel +=
+        0.5f * last_step_size *
+        tex2D(volume_as_texture,
+              physical_to_index(px, volume_origin.x, volume_spacing.x) + 0.5f,
+              physical_to_index(py, volume_origin.y, volume_spacing.y) + 0.5f);
+
+    px = source_point.x + max_alpha * ray_vector.x;
+    py = source_point.y + max_alpha * ray_vector.y;
+    // The last segment of the line integral takes care of the
+    // varying length.
+    pixel +=
+        0.5f * last_step_size *
+        tex2D(volume_as_texture,
+              physical_to_index(px, volume_origin.x, volume_spacing.x) + 0.5f,
+              physical_to_index(py, volume_origin.y, volume_spacing.y) + 0.5f);
+  }
+  return pixel;
+}
+
+__global__ void project_2Dfan_beam_kernel(
+    float *pSinogram, const float2 *d_rays, const int number_of_projections,
+    const float sampling_step_size, const int2 volume_size,
+    const float2 volume_spacing, const float2 volume_origin,
+    const int detector_size, const float detector_spacing,
+    const float detector_origin, const float sid, const float sdd) {
+  unsigned int detector_idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (detector_idx >= detector_size) {
+    return;
+  }
+  // Preparations:
+
+  // The ray starts at the source, which lies at distance sid from the
+  // isocenter along the central ray.
+  // TODO: use volume spacing to reduce ray length
+  int projection_idx = blockIdx.y;
+  float2 central_ray_vector = d_rays[projection_idx];
+
+  // create detector coordinate system (u,v) w.r.t the ray
+  float2 u_vec = make_float2(-central_ray_vector.y, central_ray_vector.x);
+  // calculate physical coordinate of detector pixel
+  float u = index_to_physical(detector_idx, detector_origin, detector_spacing);
+  // float u = detector_idx * detector_spacing + detector_mid;
+
+  // Calculate the source point (start point of the ray): the source lies at
+  // distance sid from the isocenter along the central ray direction.
+  float2 source_point = central_ray_vector * (-sid);
+
+  float2 detector_point_world =
+      source_point + central_ray_vector * sdd + u_vec * u;
+  float2 ray_vector = normalize(detector_point_world - source_point);
+
+  float pixel = kernel_project2D(source_point, ray_vector,
+                                 sampling_step_size *
+                                     fmin(volume_spacing.x, volume_spacing.y),
+                                 volume_size, volume_origin, volume_spacing);
+
+  pixel *= sqrt(
+      (ray_vector.x * volume_spacing.x) * (ray_vector.x * volume_spacing.x) +
+      (ray_vector.y * volume_spacing.y) * (ray_vector.y * volume_spacing.y));
+
+  unsigned sinogram_idx = projection_idx * detector_size + detector_idx;
+  pSinogram[sinogram_idx] = pixel;
+
+  return;
+}
+/*************** WARNING ******************
+ *
+ *   TensorFlow allocates almost the whole GPU memory for itself and leaves only
+ * a small amount of slack memory; cudaMalloc and cudaMalloc3D allocate from this
+ * slack memory. Therefore, currently only small volumes can be used (they have
+ * to fit into the slack memory that TF does not allocate).
+ *
+ *   This kernel is based on texture interpolation, so these allocations are not
+ * within the TensorFlow-managed memory. If memory errors occur:
+ *    1. start TensorFlow with less GPU memory and allow growth
+ *    2. TODO: no software-interpolation-based 2D versions are available yet
+ *
+ *   TODO: use context->allocate_temp and context->allocate_persistent instead of
+ * cudaMalloc for the ray_vectors array:
+ * https://stackoverflow.com/questions/48580580/tensorflow-new-op-cuda-kernel-memory-managment
+ *
+ */
+void Fan_Projection_Kernel_Launcher(
+    const float *volume_ptr, float *out, const float *ray_vectors,
+    const int number_of_projections, const int volume_width,
+    const int volume_height, const float volume_spacing_x,
+    const float volume_spacing_y, const float volume_origin_x,
+    const float volume_origin_y, const int detector_size,
+    const float detector_spacing, const float detector_origin, const float sid,
+    const float sdd) {
+  cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+  volume_as_texture.addressMode[0] = cudaAddressModeBorder;
+  volume_as_texture.addressMode[1] = cudaAddressModeBorder;
+  volume_as_texture.filterMode = cudaFilterModeLinear;
+  volume_as_texture.normalized = false;
+
+  cudaArray *volume_array;
+  cudaMallocArray(&volume_array, &channelDesc, volume_width, volume_height);
+  cudaMemcpyToArray(volume_array, 0, 0, volume_ptr,
+                    volume_width * volume_height * sizeof(float),
+                    cudaMemcpyHostToDevice);
+  cudaBindTextureToArray(volume_as_texture, volume_array, channelDesc);
+
+  auto ray_size_b = number_of_projections * sizeof(float2);
+  float2 *d_rays;
+  cudaMalloc(&d_rays, ray_size_b);
+  cudaMemcpy(d_rays, ray_vectors, ray_size_b, cudaMemcpyHostToDevice);
+
+  float sampling_step_size = 0.2;
+
+  int2 volume_size = make_int2(volume_width, volume_height);
+  float2 volume_spacing = make_float2(volume_spacing_x, volume_spacing_y);
+  float2 volume_origin = make_float2(volume_origin_x, volume_origin_y);
+
+  const unsigned blocksize = 256;
+  const dim3 gridsize =
+      dim3((detector_size / blocksize) + 1, number_of_projections);
+  project_2Dfan_beam_kernel<<<gridsize, blocksize>>>(
+      out, d_rays, number_of_projections, sampling_step_size, volume_size,
+      volume_spacing, volume_origin, detector_size, detector_spacing,
+      detector_origin, sid, sdd);
+
+  cudaUnbindTexture(volume_as_texture);
+  cudaFreeArray(volume_array);
+  cudaFree(d_rays);
+}
diff --git a/generated_files/helper_headers/helper_eigen.h b/generated_files/helper_headers/helper_eigen.h
new file mode 100644
index 0000000000000000000000000000000000000000..b31f845ad283f49cdc54c80253c528f441650ace
--- /dev/null
+++ b/generated_files/helper_headers/helper_eigen.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright [2019] [Christopher Syben]
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * 
+ * Conversion from: https://stackoverflow.com/questions/48795789/eigen-unsupported-tensor-to-eigen-matrix
+ * PYRO-NN is developed as an Open Source project under the Apache License, Version 2.0.
+*/
+#include <unsupported/Eigen/CXX11/Tensor>
+
+template<typename T>
+using  MatrixType = Eigen::Matrix<T,Eigen::Dynamic, Eigen::Dynamic>;
+
+template<typename Scalar,int rank, typename sizeType>
+MatrixType<Scalar> Tensor_to_Matrix(const Eigen::Tensor<Scalar,rank> &tensor,const sizeType rows,const sizeType cols)
+{
+    return Eigen::Map<const MatrixType<Scalar>> (tensor.data(), rows,cols);
+}
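+
+/*
+ * Minimal usage sketch (added for illustration, not part of the original
+ * header); shapes and variable names are arbitrary. Both helpers copy the
+ * data into the returned object, and Matrix_to_Tensor below performs the
+ * reverse mapping:
+ *
+ *   Eigen::Tensor<float, 2> t(2, 3);
+ *   t.setZero();
+ *   MatrixType<float> m = Tensor_to_Matrix(t, 2, 3);        // 2x3 matrix
+ *   Eigen::Tensor<float, 2> t2 = Matrix_to_Tensor(m, 2, 3); // back to a tensor
+ */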
+
+template<typename Scalar, typename... Dims>
+Eigen::Tensor< Scalar, sizeof... (Dims)> Matrix_to_Tensor(const MatrixType<Scalar> &matrix, Dims... dims)
+{
+    constexpr int rank = sizeof... (Dims);
+    return Eigen::TensorMap<Eigen::Tensor<const Scalar, rank>>(matrix.data(), {dims...});
+}
\ No newline at end of file
diff --git a/generated_files/helper_headers/helper_geometry_cpu.h b/generated_files/helper_headers/helper_geometry_cpu.h
new file mode 100644
index 0000000000000000000000000000000000000000..cc51aaf32f7818ef6e12ad386bd7c8aa2289344c
--- /dev/null
+++ b/generated_files/helper_headers/helper_geometry_cpu.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright [2019] [Christopher Syben]
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * 
+ * Helper methods to prepare projection matrices for projector kernel
+ * PYRO-NN is developed as an Open Source project under the Apache License, Version 2.0.
+*/
+#ifndef HELPER_GEOMETRY_CPU_H
+#define HELPER_GEOMETRY_CPU_H
+#pragma once
+#include <Eigen/Dense>
+#include <Eigen/SVD>
+
+namespace Geometry{
+    
+    /// Compute right null-space of A
+    inline Eigen::VectorXf nullspace(const Eigen::MatrixXf& A)
+    {
+        Eigen::JacobiSVD<Eigen::MatrixXf> svd(A, Eigen::ComputeFullU | Eigen::ComputeFullV);
+        auto V=svd.matrixV();
+        return V.col(V.cols()-1);
+    }
+    /// Extract the world coordinates of the camera center from a projection matrix. (SVD based implementation)
+    inline Eigen::Vector4f getCameraCenter(const Eigen::MatrixXf& P)
+    {
+        Eigen::Vector4f C = Geometry::nullspace(P);
+        if (C(3) < -1e-12 || C(3) > 1e-12)
+            C = C / C(3); // dehomogenize unless the homogeneous coordinate is numerically zero (camera at infinity)
+        return C;
+    }
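+    /*
+     * Illustrative example (added, not from the original file): for a
+     * projection matrix of the form P = [I | -c], i.e.
+     *   P = [1 0 0 -cx; 0 1 0 -cy; 0 0 1 -cz],
+     * we have P * (cx, cy, cz, 1)^T = 0, so getCameraCenter(P) returns the
+     * dehomogenized camera center (cx, cy, cz, 1).
+     */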
+}
+#endif
\ No newline at end of file
diff --git a/generated_files/helper_headers/helper_geometry_gpu.h b/generated_files/helper_headers/helper_geometry_gpu.h
new file mode 100755
index 0000000000000000000000000000000000000000..b47c87de9264e37e77f052bfbc19239dd782d7b1
--- /dev/null
+++ b/generated_files/helper_headers/helper_geometry_gpu.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright [2019] [Christopher Syben]
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * 
+ * Computes line intersection for projector kernel
+ * Implementation is adapted from CONRAD
+ * PYRO-NN is developed as an Open Source project under the Apache License, Version 2.0.
+*/
+#pragma once
+#ifndef HELPER_GEOMETRY_GPU_H
+#define HELPER_GEOMETRY_GPU_H
+
+#include "helper_math.h"
+
+inline __device__ float2 intersectLines2D(float2 p1, float2 p2, float2 p3, float2 p4)
+{
+    float dNom = (p1.x - p2.x) * (p3.y - p4.y) - (p1.y - p2.y) * (p3.x - p4.x);
+
+    if (dNom < 0.000001f && dNom > -0.000001f)
+    {
+        float2 retValue = {NAN, NAN};
+        return retValue;
+    }
+    float x = (p1.x * p2.y - p1.y * p2.x) * (p3.x - p4.x) - (p1.x - p2.x) * (p3.x * p4.y - p3.y * p4.x);
+    float y = (p1.x * p2.y - p1.y * p2.x) * (p3.y - p4.y) - (p1.y - p2.y) * (p3.x * p4.y - p3.y * p4.x);
+
+    x /= dNom;
+    y /= dNom;
+    float2 isectPt = {x, y};
+    return isectPt;
+}
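+
+/*
+ * Worked example (added for illustration, not from the original file): the
+ * line through (0,0) and (2,2) intersects the line through (0,2) and (2,0)
+ * at (1,1), so
+ *   intersectLines2D({0,0}, {2,2}, {0,2}, {2,0}) == {1,1}.
+ * For (nearly) parallel lines the function returns {NAN, NAN}; callers should
+ * check the result with isnan() before using it.
+ */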
+
+#endif
diff --git a/generated_files/helper_headers/helper_grid.h b/generated_files/helper_headers/helper_grid.h
new file mode 100755
index 0000000000000000000000000000000000000000..61fa5e3d8b5b92d5043a547905c7b64b7b7b845e
--- /dev/null
+++ b/generated_files/helper_headers/helper_grid.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright [2019] [Christopher Syben]
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * 
+ * Helper methods to convert index to physical coordinates and vice versa
+ * PYRO-NN is developed as an Open Source project under the Apache License, Version 2.0.
+*/
+#ifndef HELPER_GRID_H
+#define HELPER_GRID_H
+
+
+inline __host__ __device__ float index_to_physical(float index, float origin, float spacing)
+{
+    return index * spacing + origin;
+}
+
+inline __host__ __device__ float physical_to_index(float physical, float origin, float spacing)
+{
+    return (physical - origin) / spacing;
+}
+
+inline __host__ __device__ float2 index_to_physical(float2 index, float2 origin, float2 spacing)
+{
+    return make_float2(index.x * spacing.x + origin.x, index.y * spacing.y + origin.y);
+}
+
+inline __host__ __device__ float2 physical_to_index(float2 physical, float2 origin, float2 spacing)
+{
+    return make_float2((physical.x - origin.x) / spacing.x, (physical.y - origin.y) / spacing.y);
+}
+
+inline __host__ __device__ float3 index_to_physical(float3 index, float3 origin, float3 spacing)
+{
+    return make_float3(index.x * spacing.x + origin.x, index.y * spacing.y + origin.y, index.z * spacing.z + origin.z);
+}
+
+inline __host__ __device__ float3 physical_to_index(float3 physical, float3 origin, float3 spacing)
+{
+    return make_float3((physical.x - origin.x) / spacing.x, (physical.y - origin.y) / spacing.y, (physical.z - origin.z) / spacing.z);
+}
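+
+/*
+ * Worked example (added for illustration, not part of the original header):
+ * with origin = -127.5 and spacing = 0.5,
+ *   index_to_physical(10.0f, -127.5f, 0.5f)   == -122.5f
+ *   physical_to_index(-122.5f, -127.5f, 0.5f) ==  10.0f
+ * i.e. the two functions are inverses of each other.
+ */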
+
+#endif
+
+
diff --git a/generated_files/helper_headers/helper_math.h b/generated_files/helper_headers/helper_math.h
new file mode 100755
index 0000000000000000000000000000000000000000..d56d461674938e8e9190c42acfeec6b61bcd2110
--- /dev/null
+++ b/generated_files/helper_headers/helper_math.h
@@ -0,0 +1,1449 @@
+/**
+ * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+/*
+ *  This file implements common mathematical operations on vector types
+ *  (float3, float4 etc.) since these are not provided as standard by CUDA.
+ *
+ *  The syntax is modeled on the Cg standard library.
+ *
+ *  This is part of the Helper library includes
+ *
+ *    Thanks to Linh Hah for additions and fixes.
+ */
+
+#ifndef HELPER_MATH_H
+#define HELPER_MATH_H
+
+#include "cuda_runtime.h"
+
+typedef unsigned int uint;
+typedef unsigned short ushort;
+
+#ifndef __CUDACC__
+#include <math.h>
+
+////////////////////////////////////////////////////////////////////////////////
+// host implementations of CUDA functions
+////////////////////////////////////////////////////////////////////////////////
+
+inline float fminf(float a, float b)
+{
+    return a < b ? a : b;
+}
+
+inline float fmaxf(float a, float b)
+{
+    return a > b ? a : b;
+}
+
+inline int max(int a, int b)
+{
+    return a > b ? a : b;
+}
+
+inline int min(int a, int b)
+{
+    return a < b ? a : b;
+}
+
+inline float rsqrtf(float x)
+{
+    return 1.0f / sqrtf(x);
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// constructors
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float2 make_float2(float s)
+{
+    return make_float2(s, s);
+}
+inline __host__ __device__ float2 make_float2(float3 a)
+{
+    return make_float2(a.x, a.y);
+}
+inline __host__ __device__ float2 make_float2(int2 a)
+{
+    return make_float2(float(a.x), float(a.y));
+}
+inline __host__ __device__ float2 make_float2(uint2 a)
+{
+    return make_float2(float(a.x), float(a.y));
+}
+
+inline __host__ __device__ int2 make_int2(int s)
+{
+    return make_int2(s, s);
+}
+inline __host__ __device__ int2 make_int2(int3 a)
+{
+    return make_int2(a.x, a.y);
+}
+inline __host__ __device__ int2 make_int2(uint2 a)
+{
+    return make_int2(int(a.x), int(a.y));
+}
+inline __host__ __device__ int2 make_int2(float2 a)
+{
+    return make_int2(int(a.x), int(a.y));
+}
+
+inline __host__ __device__ uint2 make_uint2(uint s)
+{
+    return make_uint2(s, s);
+}
+inline __host__ __device__ uint2 make_uint2(uint3 a)
+{
+    return make_uint2(a.x, a.y);
+}
+inline __host__ __device__ uint2 make_uint2(int2 a)
+{
+    return make_uint2(uint(a.x), uint(a.y));
+}
+
+inline __host__ __device__ float3 make_float3(float s)
+{
+    return make_float3(s, s, s);
+}
+inline __host__ __device__ float3 make_float3(float2 a)
+{
+    return make_float3(a.x, a.y, 0.0f);
+}
+inline __host__ __device__ float3 make_float3(float2 a, float s)
+{
+    return make_float3(a.x, a.y, s);
+}
+inline __host__ __device__ float3 make_float3(float4 a)
+{
+    return make_float3(a.x, a.y, a.z);
+}
+inline __host__ __device__ float3 make_float3(int3 a)
+{
+    return make_float3(float(a.x), float(a.y), float(a.z));
+}
+inline __host__ __device__ float3 make_float3(uint3 a)
+{
+    return make_float3(float(a.x), float(a.y), float(a.z));
+}
+
+inline __host__ __device__ int3 make_int3(int s)
+{
+    return make_int3(s, s, s);
+}
+inline __host__ __device__ int3 make_int3(int2 a)
+{
+    return make_int3(a.x, a.y, 0);
+}
+inline __host__ __device__ int3 make_int3(int2 a, int s)
+{
+    return make_int3(a.x, a.y, s);
+}
+inline __host__ __device__ int3 make_int3(uint3 a)
+{
+    return make_int3(int(a.x), int(a.y), int(a.z));
+}
+inline __host__ __device__ int3 make_int3(float3 a)
+{
+    return make_int3(int(a.x), int(a.y), int(a.z));
+}
+
+inline __host__ __device__ uint3 make_uint3(uint s)
+{
+    return make_uint3(s, s, s);
+}
+inline __host__ __device__ uint3 make_uint3(uint2 a)
+{
+    return make_uint3(a.x, a.y, 0);
+}
+inline __host__ __device__ uint3 make_uint3(uint2 a, uint s)
+{
+    return make_uint3(a.x, a.y, s);
+}
+inline __host__ __device__ uint3 make_uint3(uint4 a)
+{
+    return make_uint3(a.x, a.y, a.z);
+}
+inline __host__ __device__ uint3 make_uint3(int3 a)
+{
+    return make_uint3(uint(a.x), uint(a.y), uint(a.z));
+}
+
+inline __host__ __device__ float4 make_float4(float s)
+{
+    return make_float4(s, s, s, s);
+}
+inline __host__ __device__ float4 make_float4(float3 a)
+{
+    return make_float4(a.x, a.y, a.z, 0.0f);
+}
+inline __host__ __device__ float4 make_float4(float3 a, float w)
+{
+    return make_float4(a.x, a.y, a.z, w);
+}
+inline __host__ __device__ float4 make_float4(int4 a)
+{
+    return make_float4(float(a.x), float(a.y), float(a.z), float(a.w));
+}
+inline __host__ __device__ float4 make_float4(uint4 a)
+{
+    return make_float4(float(a.x), float(a.y), float(a.z), float(a.w));
+}
+
+inline __host__ __device__ int4 make_int4(int s)
+{
+    return make_int4(s, s, s, s);
+}
+inline __host__ __device__ int4 make_int4(int3 a)
+{
+    return make_int4(a.x, a.y, a.z, 0);
+}
+inline __host__ __device__ int4 make_int4(int3 a, int w)
+{
+    return make_int4(a.x, a.y, a.z, w);
+}
+inline __host__ __device__ int4 make_int4(uint4 a)
+{
+    return make_int4(int(a.x), int(a.y), int(a.z), int(a.w));
+}
+inline __host__ __device__ int4 make_int4(float4 a)
+{
+    return make_int4(int(a.x), int(a.y), int(a.z), int(a.w));
+}
+
+
+inline __host__ __device__ uint4 make_uint4(uint s)
+{
+    return make_uint4(s, s, s, s);
+}
+inline __host__ __device__ uint4 make_uint4(uint3 a)
+{
+    return make_uint4(a.x, a.y, a.z, 0);
+}
+inline __host__ __device__ uint4 make_uint4(uint3 a, uint w)
+{
+    return make_uint4(a.x, a.y, a.z, w);
+}
+inline __host__ __device__ uint4 make_uint4(int4 a)
+{
+    return make_uint4(uint(a.x), uint(a.y), uint(a.z), uint(a.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// negate
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float2 operator-(float2 &a)
+{
+    return make_float2(-a.x, -a.y);
+}
+inline __host__ __device__ int2 operator-(int2 &a)
+{
+    return make_int2(-a.x, -a.y);
+}
+inline __host__ __device__ float3 operator-(float3 &a)
+{
+    return make_float3(-a.x, -a.y, -a.z);
+}
+inline __host__ __device__ int3 operator-(int3 &a)
+{
+    return make_int3(-a.x, -a.y, -a.z);
+}
+inline __host__ __device__ float4 operator-(float4 &a)
+{
+    return make_float4(-a.x, -a.y, -a.z, -a.w);
+}
+inline __host__ __device__ int4 operator-(int4 &a)
+{
+    return make_int4(-a.x, -a.y, -a.z, -a.w);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// addition
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float2 operator+(float2 a, float2 b)
+{
+    return make_float2(a.x + b.x, a.y + b.y);
+}
+inline __host__ __device__ void operator+=(float2 &a, float2 b)
+{
+    a.x += b.x;
+    a.y += b.y;
+}
+inline __host__ __device__ float2 operator+(float2 a, float b)
+{
+    return make_float2(a.x + b, a.y + b);
+}
+inline __host__ __device__ float2 operator+(float b, float2 a)
+{
+    return make_float2(a.x + b, a.y + b);
+}
+inline __host__ __device__ void operator+=(float2 &a, float b)
+{
+    a.x += b;
+    a.y += b;
+}
+
+inline __host__ __device__ int2 operator+(int2 a, int2 b)
+{
+    return make_int2(a.x + b.x, a.y + b.y);
+}
+inline __host__ __device__ void operator+=(int2 &a, int2 b)
+{
+    a.x += b.x;
+    a.y += b.y;
+}
+inline __host__ __device__ int2 operator+(int2 a, int b)
+{
+    return make_int2(a.x + b, a.y + b);
+}
+inline __host__ __device__ int2 operator+(int b, int2 a)
+{
+    return make_int2(a.x + b, a.y + b);
+}
+inline __host__ __device__ void operator+=(int2 &a, int b)
+{
+    a.x += b;
+    a.y += b;
+}
+
+inline __host__ __device__ uint2 operator+(uint2 a, uint2 b)
+{
+    return make_uint2(a.x + b.x, a.y + b.y);
+}
+inline __host__ __device__ void operator+=(uint2 &a, uint2 b)
+{
+    a.x += b.x;
+    a.y += b.y;
+}
+inline __host__ __device__ uint2 operator+(uint2 a, uint b)
+{
+    return make_uint2(a.x + b, a.y + b);
+}
+inline __host__ __device__ uint2 operator+(uint b, uint2 a)
+{
+    return make_uint2(a.x + b, a.y + b);
+}
+inline __host__ __device__ void operator+=(uint2 &a, uint b)
+{
+    a.x += b;
+    a.y += b;
+}
+
+
+inline __host__ __device__ float3 operator+(float3 a, float3 b)
+{
+    return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
+}
+inline __host__ __device__ void operator+=(float3 &a, float3 b)
+{
+    a.x += b.x;
+    a.y += b.y;
+    a.z += b.z;
+}
+inline __host__ __device__ float3 operator+(float3 a, float b)
+{
+    return make_float3(a.x + b, a.y + b, a.z + b);
+}
+inline __host__ __device__ void operator+=(float3 &a, float b)
+{
+    a.x += b;
+    a.y += b;
+    a.z += b;
+}
+
+inline __host__ __device__ int3 operator+(int3 a, int3 b)
+{
+    return make_int3(a.x + b.x, a.y + b.y, a.z + b.z);
+}
+inline __host__ __device__ void operator+=(int3 &a, int3 b)
+{
+    a.x += b.x;
+    a.y += b.y;
+    a.z += b.z;
+}
+inline __host__ __device__ int3 operator+(int3 a, int b)
+{
+    return make_int3(a.x + b, a.y + b, a.z + b);
+}
+inline __host__ __device__ void operator+=(int3 &a, int b)
+{
+    a.x += b;
+    a.y += b;
+    a.z += b;
+}
+
+inline __host__ __device__ uint3 operator+(uint3 a, uint3 b)
+{
+    return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z);
+}
+inline __host__ __device__ void operator+=(uint3 &a, uint3 b)
+{
+    a.x += b.x;
+    a.y += b.y;
+    a.z += b.z;
+}
+inline __host__ __device__ uint3 operator+(uint3 a, uint b)
+{
+    return make_uint3(a.x + b, a.y + b, a.z + b);
+}
+inline __host__ __device__ void operator+=(uint3 &a, uint b)
+{
+    a.x += b;
+    a.y += b;
+    a.z += b;
+}
+
+inline __host__ __device__ int3 operator+(int b, int3 a)
+{
+    return make_int3(a.x + b, a.y + b, a.z + b);
+}
+inline __host__ __device__ uint3 operator+(uint b, uint3 a)
+{
+    return make_uint3(a.x + b, a.y + b, a.z + b);
+}
+inline __host__ __device__ float3 operator+(float b, float3 a)
+{
+    return make_float3(a.x + b, a.y + b, a.z + b);
+}
+
+inline __host__ __device__ float4 operator+(float4 a, float4 b)
+{
+    return make_float4(a.x + b.x, a.y + b.y, a.z + b.z,  a.w + b.w);
+}
+inline __host__ __device__ void operator+=(float4 &a, float4 b)
+{
+    a.x += b.x;
+    a.y += b.y;
+    a.z += b.z;
+    a.w += b.w;
+}
+inline __host__ __device__ float4 operator+(float4 a, float b)
+{
+    return make_float4(a.x + b, a.y + b, a.z + b, a.w + b);
+}
+inline __host__ __device__ float4 operator+(float b, float4 a)
+{
+    return make_float4(a.x + b, a.y + b, a.z + b, a.w + b);
+}
+inline __host__ __device__ void operator+=(float4 &a, float b)
+{
+    a.x += b;
+    a.y += b;
+    a.z += b;
+    a.w += b;
+}
+
+inline __host__ __device__ int4 operator+(int4 a, int4 b)
+{
+    return make_int4(a.x + b.x, a.y + b.y, a.z + b.z,  a.w + b.w);
+}
+inline __host__ __device__ void operator+=(int4 &a, int4 b)
+{
+    a.x += b.x;
+    a.y += b.y;
+    a.z += b.z;
+    a.w += b.w;
+}
+inline __host__ __device__ int4 operator+(int4 a, int b)
+{
+    return make_int4(a.x + b, a.y + b, a.z + b,  a.w + b);
+}
+inline __host__ __device__ int4 operator+(int b, int4 a)
+{
+    return make_int4(a.x + b, a.y + b, a.z + b,  a.w + b);
+}
+inline __host__ __device__ void operator+=(int4 &a, int b)
+{
+    a.x += b;
+    a.y += b;
+    a.z += b;
+    a.w += b;
+}
+
+inline __host__ __device__ uint4 operator+(uint4 a, uint4 b)
+{
+    return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z,  a.w + b.w);
+}
+inline __host__ __device__ void operator+=(uint4 &a, uint4 b)
+{
+    a.x += b.x;
+    a.y += b.y;
+    a.z += b.z;
+    a.w += b.w;
+}
+inline __host__ __device__ uint4 operator+(uint4 a, uint b)
+{
+    return make_uint4(a.x + b, a.y + b, a.z + b,  a.w + b);
+}
+inline __host__ __device__ uint4 operator+(uint b, uint4 a)
+{
+    return make_uint4(a.x + b, a.y + b, a.z + b,  a.w + b);
+}
+inline __host__ __device__ void operator+=(uint4 &a, uint b)
+{
+    a.x += b;
+    a.y += b;
+    a.z += b;
+    a.w += b;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// subtract
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float2 operator-(float2 a, float2 b)
+{
+    return make_float2(a.x - b.x, a.y - b.y);
+}
+inline __host__ __device__ void operator-=(float2 &a, float2 b)
+{
+    a.x -= b.x;
+    a.y -= b.y;
+}
+inline __host__ __device__ float2 operator-(float2 a, float b)
+{
+    return make_float2(a.x - b, a.y - b);
+}
+inline __host__ __device__ float2 operator-(float b, float2 a)
+{
+    return make_float2(b - a.x, b - a.y);
+}
+inline __host__ __device__ void operator-=(float2 &a, float b)
+{
+    a.x -= b;
+    a.y -= b;
+}
+
+inline __host__ __device__ int2 operator-(int2 a, int2 b)
+{
+    return make_int2(a.x - b.x, a.y - b.y);
+}
+inline __host__ __device__ void operator-=(int2 &a, int2 b)
+{
+    a.x -= b.x;
+    a.y -= b.y;
+}
+inline __host__ __device__ int2 operator-(int2 a, int b)
+{
+    return make_int2(a.x - b, a.y - b);
+}
+inline __host__ __device__ int2 operator-(int b, int2 a)
+{
+    return make_int2(b - a.x, b - a.y);
+}
+inline __host__ __device__ void operator-=(int2 &a, int b)
+{
+    a.x -= b;
+    a.y -= b;
+}
+
+inline __host__ __device__ uint2 operator-(uint2 a, uint2 b)
+{
+    return make_uint2(a.x - b.x, a.y - b.y);
+}
+inline __host__ __device__ void operator-=(uint2 &a, uint2 b)
+{
+    a.x -= b.x;
+    a.y -= b.y;
+}
+inline __host__ __device__ uint2 operator-(uint2 a, uint b)
+{
+    return make_uint2(a.x - b, a.y - b);
+}
+inline __host__ __device__ uint2 operator-(uint b, uint2 a)
+{
+    return make_uint2(b - a.x, b - a.y);
+}
+inline __host__ __device__ void operator-=(uint2 &a, uint b)
+{
+    a.x -= b;
+    a.y -= b;
+}
+
+inline __host__ __device__ float3 operator-(float3 a, float3 b)
+{
+    return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
+}
+inline __host__ __device__ void operator-=(float3 &a, float3 b)
+{
+    a.x -= b.x;
+    a.y -= b.y;
+    a.z -= b.z;
+}
+inline __host__ __device__ float3 operator-(float3 a, float b)
+{
+    return make_float3(a.x - b, a.y - b, a.z - b);
+}
+inline __host__ __device__ float3 operator-(float b, float3 a)
+{
+    return make_float3(b - a.x, b - a.y, b - a.z);
+}
+inline __host__ __device__ void operator-=(float3 &a, float b)
+{
+    a.x -= b;
+    a.y -= b;
+    a.z -= b;
+}
+
+inline __host__ __device__ int3 operator-(int3 a, int3 b)
+{
+    return make_int3(a.x - b.x, a.y - b.y, a.z - b.z);
+}
+inline __host__ __device__ void operator-=(int3 &a, int3 b)
+{
+    a.x -= b.x;
+    a.y -= b.y;
+    a.z -= b.z;
+}
+inline __host__ __device__ int3 operator-(int3 a, int b)
+{
+    return make_int3(a.x - b, a.y - b, a.z - b);
+}
+inline __host__ __device__ int3 operator-(int b, int3 a)
+{
+    return make_int3(b - a.x, b - a.y, b - a.z);
+}
+inline __host__ __device__ void operator-=(int3 &a, int b)
+{
+    a.x -= b;
+    a.y -= b;
+    a.z -= b;
+}
+
+inline __host__ __device__ uint3 operator-(uint3 a, uint3 b)
+{
+    return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z);
+}
+inline __host__ __device__ void operator-=(uint3 &a, uint3 b)
+{
+    a.x -= b.x;
+    a.y -= b.y;
+    a.z -= b.z;
+}
+inline __host__ __device__ uint3 operator-(uint3 a, uint b)
+{
+    return make_uint3(a.x - b, a.y - b, a.z - b);
+}
+inline __host__ __device__ uint3 operator-(uint b, uint3 a)
+{
+    return make_uint3(b - a.x, b - a.y, b - a.z);
+}
+inline __host__ __device__ void operator-=(uint3 &a, uint b)
+{
+    a.x -= b;
+    a.y -= b;
+    a.z -= b;
+}
+
+inline __host__ __device__ float4 operator-(float4 a, float4 b)
+{
+    return make_float4(a.x - b.x, a.y - b.y, a.z - b.z,  a.w - b.w);
+}
+inline __host__ __device__ void operator-=(float4 &a, float4 b)
+{
+    a.x -= b.x;
+    a.y -= b.y;
+    a.z -= b.z;
+    a.w -= b.w;
+}
+inline __host__ __device__ float4 operator-(float4 a, float b)
+{
+    return make_float4(a.x - b, a.y - b, a.z - b,  a.w - b);
+}
+inline __host__ __device__ void operator-=(float4 &a, float b)
+{
+    a.x -= b;
+    a.y -= b;
+    a.z -= b;
+    a.w -= b;
+}
+
+inline __host__ __device__ int4 operator-(int4 a, int4 b)
+{
+    return make_int4(a.x - b.x, a.y - b.y, a.z - b.z,  a.w - b.w);
+}
+inline __host__ __device__ void operator-=(int4 &a, int4 b)
+{
+    a.x -= b.x;
+    a.y -= b.y;
+    a.z -= b.z;
+    a.w -= b.w;
+}
+inline __host__ __device__ int4 operator-(int4 a, int b)
+{
+    return make_int4(a.x - b, a.y - b, a.z - b,  a.w - b);
+}
+inline __host__ __device__ int4 operator-(int b, int4 a)
+{
+    return make_int4(b - a.x, b - a.y, b - a.z, b - a.w);
+}
+inline __host__ __device__ void operator-=(int4 &a, int b)
+{
+    a.x -= b;
+    a.y -= b;
+    a.z -= b;
+    a.w -= b;
+}
+
+inline __host__ __device__ uint4 operator-(uint4 a, uint4 b)
+{
+    return make_uint4(a.x - b.x, a.y - b.y, a.z - b.z,  a.w - b.w);
+}
+inline __host__ __device__ void operator-=(uint4 &a, uint4 b)
+{
+    a.x -= b.x;
+    a.y -= b.y;
+    a.z -= b.z;
+    a.w -= b.w;
+}
+inline __host__ __device__ uint4 operator-(uint4 a, uint b)
+{
+    return make_uint4(a.x - b, a.y - b, a.z - b,  a.w - b);
+}
+inline __host__ __device__ uint4 operator-(uint b, uint4 a)
+{
+    return make_uint4(b - a.x, b - a.y, b - a.z, b - a.w);
+}
+inline __host__ __device__ void operator-=(uint4 &a, uint b)
+{
+    a.x -= b;
+    a.y -= b;
+    a.z -= b;
+    a.w -= b;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// multiply
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float2 operator*(float2 a, float2 b)
+{
+    return make_float2(a.x * b.x, a.y * b.y);
+}
+inline __host__ __device__ void operator*=(float2 &a, float2 b)
+{
+    a.x *= b.x;
+    a.y *= b.y;
+}
+inline __host__ __device__ float2 operator*(float2 a, float b)
+{
+    return make_float2(a.x * b, a.y * b);
+}
+inline __host__ __device__ float2 operator*(float b, float2 a)
+{
+    return make_float2(b * a.x, b * a.y);
+}
+inline __host__ __device__ void operator*=(float2 &a, float b)
+{
+    a.x *= b;
+    a.y *= b;
+}
+
+inline __host__ __device__ int2 operator*(int2 a, int2 b)
+{
+    return make_int2(a.x * b.x, a.y * b.y);
+}
+inline __host__ __device__ void operator*=(int2 &a, int2 b)
+{
+    a.x *= b.x;
+    a.y *= b.y;
+}
+inline __host__ __device__ int2 operator*(int2 a, int b)
+{
+    return make_int2(a.x * b, a.y * b);
+}
+inline __host__ __device__ int2 operator*(int b, int2 a)
+{
+    return make_int2(b * a.x, b * a.y);
+}
+inline __host__ __device__ void operator*=(int2 &a, int b)
+{
+    a.x *= b;
+    a.y *= b;
+}
+
+inline __host__ __device__ uint2 operator*(uint2 a, uint2 b)
+{
+    return make_uint2(a.x * b.x, a.y * b.y);
+}
+inline __host__ __device__ void operator*=(uint2 &a, uint2 b)
+{
+    a.x *= b.x;
+    a.y *= b.y;
+}
+inline __host__ __device__ uint2 operator*(uint2 a, uint b)
+{
+    return make_uint2(a.x * b, a.y * b);
+}
+inline __host__ __device__ uint2 operator*(uint b, uint2 a)
+{
+    return make_uint2(b * a.x, b * a.y);
+}
+inline __host__ __device__ void operator*=(uint2 &a, uint b)
+{
+    a.x *= b;
+    a.y *= b;
+}
+
+inline __host__ __device__ float3 operator*(float3 a, float3 b)
+{
+    return make_float3(a.x * b.x, a.y * b.y, a.z * b.z);
+}
+inline __host__ __device__ void operator*=(float3 &a, float3 b)
+{
+    a.x *= b.x;
+    a.y *= b.y;
+    a.z *= b.z;
+}
+inline __host__ __device__ float3 operator*(float3 a, float b)
+{
+    return make_float3(a.x * b, a.y * b, a.z * b);
+}
+inline __host__ __device__ float3 operator*(float b, float3 a)
+{
+    return make_float3(b * a.x, b * a.y, b * a.z);
+}
+inline __host__ __device__ void operator*=(float3 &a, float b)
+{
+    a.x *= b;
+    a.y *= b;
+    a.z *= b;
+}
+
+inline __host__ __device__ int3 operator*(int3 a, int3 b)
+{
+    return make_int3(a.x * b.x, a.y * b.y, a.z * b.z);
+}
+inline __host__ __device__ void operator*=(int3 &a, int3 b)
+{
+    a.x *= b.x;
+    a.y *= b.y;
+    a.z *= b.z;
+}
+inline __host__ __device__ int3 operator*(int3 a, int b)
+{
+    return make_int3(a.x * b, a.y * b, a.z * b);
+}
+inline __host__ __device__ int3 operator*(int b, int3 a)
+{
+    return make_int3(b * a.x, b * a.y, b * a.z);
+}
+inline __host__ __device__ void operator*=(int3 &a, int b)
+{
+    a.x *= b;
+    a.y *= b;
+    a.z *= b;
+}
+
+inline __host__ __device__ uint3 operator*(uint3 a, uint3 b)
+{
+    return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z);
+}
+inline __host__ __device__ void operator*=(uint3 &a, uint3 b)
+{
+    a.x *= b.x;
+    a.y *= b.y;
+    a.z *= b.z;
+}
+inline __host__ __device__ uint3 operator*(uint3 a, uint b)
+{
+    return make_uint3(a.x * b, a.y * b, a.z * b);
+}
+inline __host__ __device__ uint3 operator*(uint b, uint3 a)
+{
+    return make_uint3(b * a.x, b * a.y, b * a.z);
+}
+inline __host__ __device__ void operator*=(uint3 &a, uint b)
+{
+    a.x *= b;
+    a.y *= b;
+    a.z *= b;
+}
+
+inline __host__ __device__ float4 operator*(float4 a, float4 b)
+{
+    return make_float4(a.x * b.x, a.y * b.y, a.z * b.z,  a.w * b.w);
+}
+inline __host__ __device__ void operator*=(float4 &a, float4 b)
+{
+    a.x *= b.x;
+    a.y *= b.y;
+    a.z *= b.z;
+    a.w *= b.w;
+}
+inline __host__ __device__ float4 operator*(float4 a, float b)
+{
+    return make_float4(a.x * b, a.y * b, a.z * b,  a.w * b);
+}
+inline __host__ __device__ float4 operator*(float b, float4 a)
+{
+    return make_float4(b * a.x, b * a.y, b * a.z, b * a.w);
+}
+inline __host__ __device__ void operator*=(float4 &a, float b)
+{
+    a.x *= b;
+    a.y *= b;
+    a.z *= b;
+    a.w *= b;
+}
+
+inline __host__ __device__ int4 operator*(int4 a, int4 b)
+{
+    return make_int4(a.x * b.x, a.y * b.y, a.z * b.z,  a.w * b.w);
+}
+inline __host__ __device__ void operator*=(int4 &a, int4 b)
+{
+    a.x *= b.x;
+    a.y *= b.y;
+    a.z *= b.z;
+    a.w *= b.w;
+}
+inline __host__ __device__ int4 operator*(int4 a, int b)
+{
+    return make_int4(a.x * b, a.y * b, a.z * b,  a.w * b);
+}
+inline __host__ __device__ int4 operator*(int b, int4 a)
+{
+    return make_int4(b * a.x, b * a.y, b * a.z, b * a.w);
+}
+inline __host__ __device__ void operator*=(int4 &a, int b)
+{
+    a.x *= b;
+    a.y *= b;
+    a.z *= b;
+    a.w *= b;
+}
+
+inline __host__ __device__ uint4 operator*(uint4 a, uint4 b)
+{
+    return make_uint4(a.x * b.x, a.y * b.y, a.z * b.z,  a.w * b.w);
+}
+inline __host__ __device__ void operator*=(uint4 &a, uint4 b)
+{
+    a.x *= b.x;
+    a.y *= b.y;
+    a.z *= b.z;
+    a.w *= b.w;
+}
+inline __host__ __device__ uint4 operator*(uint4 a, uint b)
+{
+    return make_uint4(a.x * b, a.y * b, a.z * b,  a.w * b);
+}
+inline __host__ __device__ uint4 operator*(uint b, uint4 a)
+{
+    return make_uint4(b * a.x, b * a.y, b * a.z, b * a.w);
+}
+inline __host__ __device__ void operator*=(uint4 &a, uint b)
+{
+    a.x *= b;
+    a.y *= b;
+    a.z *= b;
+    a.w *= b;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// divide
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float2 operator/(float2 a, float2 b)
+{
+    return make_float2(a.x / b.x, a.y / b.y);
+}
+inline __host__ __device__ void operator/=(float2 &a, float2 b)
+{
+    a.x /= b.x;
+    a.y /= b.y;
+}
+inline __host__ __device__ float2 operator/(float2 a, float b)
+{
+    return make_float2(a.x / b, a.y / b);
+}
+inline __host__ __device__ void operator/=(float2 &a, float b)
+{
+    a.x /= b;
+    a.y /= b;
+}
+inline __host__ __device__ float2 operator/(float b, float2 a)
+{
+    return make_float2(b / a.x, b / a.y);
+}
+
+inline __host__ __device__ float3 operator/(float3 a, float3 b)
+{
+    return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
+}
+inline __host__ __device__ void operator/=(float3 &a, float3 b)
+{
+    a.x /= b.x;
+    a.y /= b.y;
+    a.z /= b.z;
+}
+inline __host__ __device__ float3 operator/(float3 a, float b)
+{
+    return make_float3(a.x / b, a.y / b, a.z / b);
+}
+inline __host__ __device__ void operator/=(float3 &a, float b)
+{
+    a.x /= b;
+    a.y /= b;
+    a.z /= b;
+}
+inline __host__ __device__ float3 operator/(float b, float3 a)
+{
+    return make_float3(b / a.x, b / a.y, b / a.z);
+}
+
+inline __host__ __device__ float4 operator/(float4 a, float4 b)
+{
+    return make_float4(a.x / b.x, a.y / b.y, a.z / b.z,  a.w / b.w);
+}
+inline __host__ __device__ void operator/=(float4 &a, float4 b)
+{
+    a.x /= b.x;
+    a.y /= b.y;
+    a.z /= b.z;
+    a.w /= b.w;
+}
+inline __host__ __device__ float4 operator/(float4 a, float b)
+{
+    return make_float4(a.x / b, a.y / b, a.z / b,  a.w / b);
+}
+inline __host__ __device__ void operator/=(float4 &a, float b)
+{
+    a.x /= b;
+    a.y /= b;
+    a.z /= b;
+    a.w /= b;
+}
+inline __host__ __device__ float4 operator/(float b, float4 a)
+{
+    return make_float4(b / a.x, b / a.y, b / a.z, b / a.w);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// min
+////////////////////////////////////////////////////////////////////////////////
+
+inline  __host__ __device__ float2 fminf(float2 a, float2 b)
+{
+    return make_float2(fminf(a.x,b.x), fminf(a.y,b.y));
+}
+inline __host__ __device__ float3 fminf(float3 a, float3 b)
+{
+    return make_float3(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z));
+}
+inline  __host__ __device__ float4 fminf(float4 a, float4 b)
+{
+    return make_float4(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z), fminf(a.w,b.w));
+}
+
+inline __host__ __device__ int2 min(int2 a, int2 b)
+{
+    return make_int2(min(a.x,b.x), min(a.y,b.y));
+}
+inline __host__ __device__ int3 min(int3 a, int3 b)
+{
+    return make_int3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z));
+}
+inline __host__ __device__ int4 min(int4 a, int4 b)
+{
+    return make_int4(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z), min(a.w,b.w));
+}
+
+inline __host__ __device__ uint2 min(uint2 a, uint2 b)
+{
+    return make_uint2(min(a.x,b.x), min(a.y,b.y));
+}
+inline __host__ __device__ uint3 min(uint3 a, uint3 b)
+{
+    return make_uint3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z));
+}
+inline __host__ __device__ uint4 min(uint4 a, uint4 b)
+{
+    return make_uint4(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z), min(a.w,b.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// max
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float2 fmaxf(float2 a, float2 b)
+{
+    return make_float2(fmaxf(a.x,b.x), fmaxf(a.y,b.y));
+}
+inline __host__ __device__ float3 fmaxf(float3 a, float3 b)
+{
+    return make_float3(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z));
+}
+inline __host__ __device__ float4 fmaxf(float4 a, float4 b)
+{
+    return make_float4(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z), fmaxf(a.w,b.w));
+}
+
+inline __host__ __device__ int2 max(int2 a, int2 b)
+{
+    return make_int2(max(a.x,b.x), max(a.y,b.y));
+}
+inline __host__ __device__ int3 max(int3 a, int3 b)
+{
+    return make_int3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z));
+}
+inline __host__ __device__ int4 max(int4 a, int4 b)
+{
+    return make_int4(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z), max(a.w,b.w));
+}
+
+inline __host__ __device__ uint2 max(uint2 a, uint2 b)
+{
+    return make_uint2(max(a.x,b.x), max(a.y,b.y));
+}
+inline __host__ __device__ uint3 max(uint3 a, uint3 b)
+{
+    return make_uint3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z));
+}
+inline __host__ __device__ uint4 max(uint4 a, uint4 b)
+{
+    return make_uint4(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z), max(a.w,b.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// lerp
+// - linear interpolation between a and b, based on value t in [0, 1] range
+////////////////////////////////////////////////////////////////////////////////
+
+inline __device__ __host__ float lerp(float a, float b, float t)
+{
+    return a + t*(b-a);
+}
+inline __device__ __host__ float2 lerp(float2 a, float2 b, float t)
+{
+    return a + t*(b-a);
+}
+inline __device__ __host__ float3 lerp(float3 a, float3 b, float t)
+{
+    return a + t*(b-a);
+}
+inline __device__ __host__ float4 lerp(float4 a, float4 b, float t)
+{
+    return a + t*(b-a);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// clamp
+// - clamp the value v to be in the range [a, b]
+////////////////////////////////////////////////////////////////////////////////
+
+inline __device__ __host__ float clamp(float f, float a, float b)
+{
+    return fmaxf(a, fminf(f, b));
+}
+inline __device__ __host__ int clamp(int f, int a, int b)
+{
+    return max(a, min(f, b));
+}
+inline __device__ __host__ uint clamp(uint f, uint a, uint b)
+{
+    return max(a, min(f, b));
+}
+
+inline __device__ __host__ float2 clamp(float2 v, float a, float b)
+{
+    return make_float2(clamp(v.x, a, b), clamp(v.y, a, b));
+}
+inline __device__ __host__ float2 clamp(float2 v, float2 a, float2 b)
+{
+    return make_float2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
+}
+inline __device__ __host__ float3 clamp(float3 v, float a, float b)
+{
+    return make_float3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
+}
+inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b)
+{
+    return make_float3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
+}
+inline __device__ __host__ float4 clamp(float4 v, float a, float b)
+{
+    return make_float4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b));
+}
+inline __device__ __host__ float4 clamp(float4 v, float4 a, float4 b)
+{
+    return make_float4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w));
+}
+
+inline __device__ __host__ int2 clamp(int2 v, int a, int b)
+{
+    return make_int2(clamp(v.x, a, b), clamp(v.y, a, b));
+}
+inline __device__ __host__ int2 clamp(int2 v, int2 a, int2 b)
+{
+    return make_int2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
+}
+inline __device__ __host__ int3 clamp(int3 v, int a, int b)
+{
+    return make_int3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
+}
+inline __device__ __host__ int3 clamp(int3 v, int3 a, int3 b)
+{
+    return make_int3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
+}
+inline __device__ __host__ int4 clamp(int4 v, int a, int b)
+{
+    return make_int4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b));
+}
+inline __device__ __host__ int4 clamp(int4 v, int4 a, int4 b)
+{
+    return make_int4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w));
+}
+
+inline __device__ __host__ uint2 clamp(uint2 v, uint a, uint b)
+{
+    return make_uint2(clamp(v.x, a, b), clamp(v.y, a, b));
+}
+inline __device__ __host__ uint2 clamp(uint2 v, uint2 a, uint2 b)
+{
+    return make_uint2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
+}
+inline __device__ __host__ uint3 clamp(uint3 v, uint a, uint b)
+{
+    return make_uint3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
+}
+inline __device__ __host__ uint3 clamp(uint3 v, uint3 a, uint3 b)
+{
+    return make_uint3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
+}
+inline __device__ __host__ uint4 clamp(uint4 v, uint a, uint b)
+{
+    return make_uint4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b));
+}
+inline __device__ __host__ uint4 clamp(uint4 v, uint4 a, uint4 b)
+{
+    return make_uint4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// dot product
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float dot(float2 a, float2 b)
+{
+    return a.x * b.x + a.y * b.y;
+}
+inline __host__ __device__ float dot(float3 a, float3 b)
+{
+    return a.x * b.x + a.y * b.y + a.z * b.z;
+}
+inline __host__ __device__ float dot(float4 a, float4 b)
+{
+    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
+}
+
+inline __host__ __device__ int dot(int2 a, int2 b)
+{
+    return a.x * b.x + a.y * b.y;
+}
+inline __host__ __device__ int dot(int3 a, int3 b)
+{
+    return a.x * b.x + a.y * b.y + a.z * b.z;
+}
+inline __host__ __device__ int dot(int4 a, int4 b)
+{
+    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
+}
+
+inline __host__ __device__ uint dot(uint2 a, uint2 b)
+{
+    return a.x * b.x + a.y * b.y;
+}
+inline __host__ __device__ uint dot(uint3 a, uint3 b)
+{
+    return a.x * b.x + a.y * b.y + a.z * b.z;
+}
+inline __host__ __device__ uint dot(uint4 a, uint4 b)
+{
+    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// length
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float length(float2 v)
+{
+    return sqrtf(dot(v, v));
+}
+inline __host__ __device__ float length(float3 v)
+{
+    return sqrtf(dot(v, v));
+}
+inline __host__ __device__ float length(float4 v)
+{
+    return sqrtf(dot(v, v));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// normalize
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float2 normalize(float2 v)
+{
+    float invLen = rsqrtf(dot(v, v));
+    return v * invLen;
+}
+inline __host__ __device__ float3 normalize(float3 v)
+{
+    float invLen = rsqrtf(dot(v, v));
+    return v * invLen;
+}
+inline __host__ __device__ float4 normalize(float4 v)
+{
+    float invLen = rsqrtf(dot(v, v));
+    return v * invLen;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// floor
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float2 floorf(float2 v)
+{
+    return make_float2(floorf(v.x), floorf(v.y));
+}
+inline __host__ __device__ float3 floorf(float3 v)
+{
+    return make_float3(floorf(v.x), floorf(v.y), floorf(v.z));
+}
+inline __host__ __device__ float4 floorf(float4 v)
+{
+    return make_float4(floorf(v.x), floorf(v.y), floorf(v.z), floorf(v.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// frac - returns the fractional portion of a scalar or each vector component
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float fracf(float v)
+{
+    return v - floorf(v);
+}
+inline __host__ __device__ float2 fracf(float2 v)
+{
+    return make_float2(fracf(v.x), fracf(v.y));
+}
+inline __host__ __device__ float3 fracf(float3 v)
+{
+    return make_float3(fracf(v.x), fracf(v.y), fracf(v.z));
+}
+inline __host__ __device__ float4 fracf(float4 v)
+{
+    return make_float4(fracf(v.x), fracf(v.y), fracf(v.z), fracf(v.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// fmod
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float2 fmodf(float2 a, float2 b)
+{
+    return make_float2(fmodf(a.x, b.x), fmodf(a.y, b.y));
+}
+inline __host__ __device__ float3 fmodf(float3 a, float3 b)
+{
+    return make_float3(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z));
+}
+inline __host__ __device__ float4 fmodf(float4 a, float4 b)
+{
+    return make_float4(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z), fmodf(a.w, b.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// absolute value
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float2 fabs(float2 v)
+{
+    return make_float2(fabs(v.x), fabs(v.y));
+}
+inline __host__ __device__ float3 fabs(float3 v)
+{
+    return make_float3(fabs(v.x), fabs(v.y), fabs(v.z));
+}
+inline __host__ __device__ float4 fabs(float4 v)
+{
+    return make_float4(fabs(v.x), fabs(v.y), fabs(v.z), fabs(v.w));
+}
+
+inline __host__ __device__ int2 abs(int2 v)
+{
+    return make_int2(abs(v.x), abs(v.y));
+}
+inline __host__ __device__ int3 abs(int3 v)
+{
+    return make_int3(abs(v.x), abs(v.y), abs(v.z));
+}
+inline __host__ __device__ int4 abs(int4 v)
+{
+    return make_int4(abs(v.x), abs(v.y), abs(v.z), abs(v.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// reflect
+// - returns reflection of incident ray I around surface normal N
+// - N should be normalized, reflected vector's length is equal to length of I
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float3 reflect(float3 i, float3 n)
+{
+    return i - 2.0f * n * dot(n,i);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// cross product
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float3 cross(float3 a, float3 b)
+{
+    return make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// smoothstep
+// - returns 0 if x < a
+// - returns 1 if x > b
+// - otherwise returns smooth interpolation between 0 and 1 based on x
+////////////////////////////////////////////////////////////////////////////////
+
+inline __device__ __host__ float smoothstep(float a, float b, float x)
+{
+    float y = clamp((x - a) / (b - a), 0.0f, 1.0f);
+    return (y*y*(3.0f - (2.0f*y)));
+}
+inline __device__ __host__ float2 smoothstep(float2 a, float2 b, float2 x)
+{
+    float2 y = clamp((x - a) / (b - a), 0.0f, 1.0f);
+    return (y*y*(make_float2(3.0f) - (make_float2(2.0f)*y)));
+}
+inline __device__ __host__ float3 smoothstep(float3 a, float3 b, float3 x)
+{
+    float3 y = clamp((x - a) / (b - a), 0.0f, 1.0f);
+    return (y*y*(make_float3(3.0f) - (make_float3(2.0f)*y)));
+}
+inline __device__ __host__ float4 smoothstep(float4 a, float4 b, float4 x)
+{
+    float4 y = clamp((x - a) / (b - a), 0.0f, 1.0f);
+    return (y*y*(make_float4(3.0f) - (make_float4(2.0f)*y)));
+}
+
+#endif
diff --git a/generated_files/par_backprojector_2D_CudaKernel.cu b/generated_files/par_backprojector_2D_CudaKernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..15c7c67601fe71f681cb64be25e5bfca4ee557fa
--- /dev/null
+++ b/generated_files/par_backprojector_2D_CudaKernel.cu
@@ -0,0 +1,111 @@
+/*
+ * Copyright [2019] [Christopher Syben]
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * 
+ * Voxel-driven parallel-beam back-projector CUDA kernel
+ * Implementation partially adapted from CONRAD
+ * PYRO-NN is developed as an Open Source project under the Apache License, Version 2.0.
+*/
+
+#include "helper_headers/helper_grid.h"
+#include "helper_headers/helper_math.h"
+
+texture<float, cudaTextureType2D, cudaReadModeElementType> sinogram_as_texture;
+#define CUDART_INF_F __int_as_float(0x7f800000)
+
+__global__ void backproject_2Dpar_beam_kernel(float *pVolume, const float2 *d_rays, const int number_of_projections,
+                                              const int2 volume_size, const float2 volume_spacing, const float2 volume_origin,
+                                              const int detector_size, const float detector_spacing, const float detector_origin)
+{
+    const float pi = 3.14159265359f;
+    unsigned int volume_x = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int volume_y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (volume_x >= volume_size.x || volume_y >= volume_size.y)
+    {
+        return;
+    }
+    //Preparations:
+    const float2 pixel_coordinate = index_to_physical(make_float2(volume_x, volume_y), volume_origin, volume_spacing);
+    float pixel_value = 0.0f;
+
+    for (int n = 0; n < number_of_projections; n++)
+    {
+        float2 detector_normal = d_rays[n];
+        float2 detector_vec = make_float2(-detector_normal.y, detector_normal.x);
+
+        float s = dot(pixel_coordinate, detector_vec);
+        float s_idx = physical_to_index(s, detector_origin, detector_spacing);
+
+        pixel_value += tex2D(sinogram_as_texture, s_idx + 0.5f, n + 0.5f);
+    }
+
+    const unsigned volume_linearized_idx = volume_y * volume_size.x + volume_x;
+    pVolume[volume_linearized_idx] = pi * pixel_value / number_of_projections;
+
+    return;
+}
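+
+/*
+ * Note (added for clarity): the pi / number_of_projections factor above is the
+ * discretization of the continuous parallel-beam backprojection integral
+ *   f(x) = int_0^pi p_theta( <x, detector_vec> ) d_theta
+ *        ~ (pi / N) * sum_{n=0}^{N-1} p_theta_n(s_n),
+ * assuming the N projection angles are spread evenly over [0, pi).
+ */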
+/*************** WARNING ******************
+    *
+    *   TensorFlow allocates the whole GPU memory for itself and leaves only a small amount of slack memory;
+    *   using cudaMalloc and cudaMalloc3D will allocate memory in this small slack memory!
+    *   Therefore, currently only small volumes can be used (they have to fit into the slack memory which TF does not allocate!)
+    *
+    *   This kernel is based on texture interpolation, thus the allocations are not within the TensorFlow-managed memory.
+    *   If memory errors occur:
+    *    1. start TensorFlow with less GPU memory and allow growth
+    *    2. TODO: no software-interpolation-based 2D versions are available yet
+    *
+    *   TODO: use context->allocate_temp and context->allocate_persistent instead of cudaMalloc for the ray_vectors array
+    *       : https://stackoverflow.com/questions/48580580/tensorflow-new-op-cuda-kernel-memory-managment
+    * 
+    */
+void Parallel_Backprojection2D_Kernel_Launcher(const float *sinogram_ptr, float *out, const float *ray_vectors, const int number_of_projections,
+                                               const int volume_width, const int volume_height, const float volume_spacing_x, const float volume_spacing_y,
+                                               const float volume_origin_x, const float volume_origin_y,
+                                               const int detector_size, const float detector_spacing, const float detector_origin)
+{
+    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+    sinogram_as_texture.addressMode[0] = cudaAddressModeBorder;
+    sinogram_as_texture.addressMode[1] = cudaAddressModeBorder;
+    sinogram_as_texture.filterMode = cudaFilterModeLinear;
+    sinogram_as_texture.normalized = false;
+
+    cudaArray *sinogram_array;
+    cudaMallocArray(&sinogram_array, &channelDesc, detector_size, number_of_projections);
+    cudaMemcpyToArray(sinogram_array, 0, 0, sinogram_ptr, detector_size * number_of_projections * sizeof(float), cudaMemcpyHostToDevice);
+    cudaBindTextureToArray(sinogram_as_texture, sinogram_array, channelDesc);
+
+    auto ray_size_b = number_of_projections * sizeof(float2);
+    float2 *d_rays;
+    cudaMalloc(&d_rays, ray_size_b);
+    cudaMemcpy(d_rays, ray_vectors, ray_size_b, cudaMemcpyHostToDevice);
+
+    int2 volume_size = make_int2(volume_width, volume_height);
+    float2 volume_spacing = make_float2(volume_spacing_x, volume_spacing_y);
+    float2 volume_origin = make_float2(volume_origin_x, volume_origin_y);
+
+    const unsigned block_size = 16;
+    const dim3 threads_per_block = dim3(block_size, block_size);
+    const dim3 num_blocks = dim3(volume_width / threads_per_block.x + 1, volume_height / threads_per_block.y + 1);
+
+    backproject_2Dpar_beam_kernel<<<num_blocks, threads_per_block>>>(out, d_rays, number_of_projections,
+                                                                     volume_size, volume_spacing, volume_origin,
+                                                                     detector_size, detector_spacing, detector_origin);
+
+    cudaUnbindTexture(sinogram_as_texture);
+    cudaFreeArray(sinogram_array);
+    cudaFree(d_rays);
+}
+
+
diff --git a/generated_files/par_projector_2D_CudaKernel.cu b/generated_files/par_projector_2D_CudaKernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8fcae27ce8c80b8adad5a552e2b70df783d52270
--- /dev/null
+++ b/generated_files/par_projector_2D_CudaKernel.cu
@@ -0,0 +1,223 @@
+/*
+ * Copyright [2019] [Christopher Syben]
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Voxel-driven parallel-beam projector CUDA kernel
+ * Implementation partially adapted from CONRAD
+ * PYRO-NN is developed as an Open Source project under the Apache License,
+ * Version 2.0.
+ */
+#include "helper_headers/helper_grid.h"
+#include "helper_headers/helper_math.h"
+
+texture<float, cudaTextureType2D, cudaReadModeElementType> volume_as_texture;
+#define CUDART_INF_F __int_as_float(0x7f800000)
+
+inline __device__ float
+kernel_project2D(const float2 source_point, const float2 ray_vector,
+                 const float step_size, const int2 volume_size,
+                 const float2 volume_origin, const float2 volume_spacing) {
+
+  unsigned int detector_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  int projection_idx = blockIdx.y;
+
+  float pixel = 0.0f;
+  // Step 1: compute alpha value at entry and exit point of the volume
+  float min_alpha, max_alpha;
+  min_alpha = 0;
+  max_alpha = CUDART_INF_F;
+  if (0.0f != ray_vector.x) {
+    float volume_min_edge_point =
+        index_to_physical(0, volume_origin.x, volume_spacing.x) - 0.5f;
+    float volume_max_edge_point =
+        index_to_physical(volume_size.x, volume_origin.x, volume_spacing.x) -
+        0.5f;
+
+    float reci = 1.0f / ray_vector.x;
+    float alpha0 = (volume_min_edge_point - source_point.x) * reci;
+    float alpha1 = (volume_max_edge_point - source_point.x) * reci;
+    min_alpha = fmin(alpha0, alpha1);
+    max_alpha = fmax(alpha0, alpha1);
+  }
+
+  if (0.0f != ray_vector.y) {
+    float volume_min_edge_point =
+        index_to_physical(0, volume_origin.y, volume_spacing.y) - 0.5f;
+    float volume_max_edge_point =
+        index_to_physical(volume_size.y, volume_origin.y, volume_spacing.y) -
+        0.5f;
+
+    float reci = 1.0f / ray_vector.y;
+    float alpha0 = (volume_min_edge_point - source_point.y) * reci;
+    float alpha1 = (volume_max_edge_point - source_point.y) * reci;
+    min_alpha = fmax(min_alpha, fmin(alpha0, alpha1));
+    max_alpha = fmin(max_alpha, fmax(alpha0, alpha1));
+  }
+
+  float px, py;
+  // pixel = source_point.x + min_alpha * ray_vector.x;
+  // Entrance boundary
+  // In CUDA, voxel centers are located at (xx.5, xx.5, xx.5),
+  //  whereas SwVolume has voxel centers at integers.
+  // For the initial interpolated value, only a half stepsize is
+  //  considered in the computation.
+  if (min_alpha < max_alpha) {
+    px = source_point.x + min_alpha * ray_vector.x;
+    py = source_point.y + min_alpha * ray_vector.y;
+
+    pixel +=
+        0.5f *
+        tex2D(volume_as_texture,
+              physical_to_index(px, volume_origin.x, volume_spacing.x) + 0.5f,
+              physical_to_index(py, volume_origin.y, volume_spacing.y) + 0.5f);
+    min_alpha += step_size;
+  }
+  // Mid segments
+  while (min_alpha < max_alpha) {
+    px = source_point.x + min_alpha * ray_vector.x;
+    py = source_point.y + min_alpha * ray_vector.y;
+    float2 interp_point =
+        physical_to_index(make_float2(px, py), volume_origin, volume_spacing);
+    pixel +=
+        tex2D(volume_as_texture,
+              physical_to_index(px, volume_origin.x, volume_spacing.x) + 0.5f,
+              physical_to_index(py, volume_origin.y, volume_spacing.y) + 0.5f);
+    min_alpha += step_size;
+  }
+  // Scaling by stepsize;
+  pixel *= step_size;
+
+  // Last segment of the line
+  if (pixel > 0.0f) {
+    pixel -=
+        0.5f * step_size *
+        tex2D(volume_as_texture,
+              physical_to_index(px, volume_origin.x, volume_spacing.x) + 0.5f,
+              physical_to_index(py, volume_origin.y, volume_spacing.y) + 0.5f);
+    min_alpha -= step_size;
+    float last_step_size = max_alpha - min_alpha;
+    pixel +=
+        0.5f * last_step_size *
+        tex2D(volume_as_texture,
+              physical_to_index(px, volume_origin.x, volume_spacing.x) + 0.5f,
+              physical_to_index(py, volume_origin.y, volume_spacing.y) + 0.5f);
+
+    px = source_point.x + max_alpha * ray_vector.x;
+    py = source_point.y + max_alpha * ray_vector.y;
+    // The last segment of the line integral takes care of the
+    // varying length.
+    pixel +=
+        0.5f * last_step_size *
+        tex2D(volume_as_texture,
+              physical_to_index(px, volume_origin.x, volume_spacing.x) + 0.5f,
+              physical_to_index(py, volume_origin.y, volume_spacing.y) + 0.5f);
+  }
+  return pixel;
+}
+
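+// One thread integrates one (projection, detector pixel) pair: blockIdx.y
+// selects the projection, blockIdx.x * blockDim.x + threadIdx.x selects the
+// detector pixel, and kernel_project2D above accumulates the ray integral.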
+__global__ void project_2Dpar_beam_kernel(
+    float *pSinogram, const float2 *d_rays, const int number_of_projections,
+    const float sampling_step_size, const int2 volume_size,
+    const float2 volume_spacing, const float2 volume_origin,
+    const int detector_size, const float detector_spacing,
+    const float detector_origin) {
+  unsigned int detector_idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (detector_idx >= detector_size) {
+    return;
+  }
+  // Preparations:
+
+  // Assume a source-isocenter distance (sid) to compute the start of the ray,
+  // although sid is not necessary for a parallel-beam geometry.
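+  // sid is chosen as 1.2 x the physical volume diagonal, so the ray start
+  // point always lies outside the volume.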
+  float sid = sqrt((float)(volume_size.x * volume_spacing.x * volume_size.x *
+                           volume_spacing.x) +
+                   (volume_size.y * volume_spacing.y * volume_size.y *
+                    volume_spacing.y)) *
+              1.2f;
+
+  int projection_idx = blockIdx.y;
+  float2 ray_vector = d_rays[projection_idx];
+
+  // create detector coordinate system (u,v) w.r.t the ray
+  float2 u_vec = make_float2(-ray_vector.y, ray_vector.x);
+  // calculate physical coordinate of detector pixel
+  float u = index_to_physical(detector_idx, detector_origin, detector_spacing);
+
+  // Calculate a virtual "source" point (the start point of the parallel ray)
+  // at distance sid behind the isocenter, so the projection kernel above can
+  // be reused for the parallel-beam case.
+  float2 virtual_source_point = ray_vector * (-sid) + u_vec * u;
+
+  float pixel = kernel_project2D(
+      virtual_source_point, ray_vector,
+      sampling_step_size, // * fmin(volume_spacing.x, volume_spacing.y),
+      volume_size, volume_origin, volume_spacing);
+
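+  // Weight the accumulated line integral by the length of the ray direction
+  // scaled component-wise by the volume spacing (anisotropic spacing
+  // correction).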
+  pixel *= sqrt(
+      (ray_vector.x * volume_spacing.x) * (ray_vector.x * volume_spacing.x) +
+      (ray_vector.y * volume_spacing.y) * (ray_vector.y * volume_spacing.y));
+
+  unsigned sinogram_idx = projection_idx * detector_size + detector_idx;
+  pSinogram[sinogram_idx] = pixel;
+
+  return;
+}
+
+void Parallel_Projection2D_Kernel_Launcher(
+    const float *volume_ptr, float *out, const float *ray_vectors,
+    const int number_of_projections, const int volume_width,
+    const int volume_height, const float volume_spacing_x,
+    const float volume_spacing_y, const float volume_origin_x,
+    const float volume_origin_y, const int detector_size,
+    const float detector_spacing, const float detector_origin) {
+  cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+  volume_as_texture.addressMode[0] = cudaAddressModeBorder;
+  volume_as_texture.addressMode[1] = cudaAddressModeBorder;
+  volume_as_texture.filterMode = cudaFilterModeLinear;
+  volume_as_texture.normalized = false;
+
+  cudaArray *volume_array;
+  cudaMallocArray(&volume_array, &channelDesc, volume_width, volume_height);
+  cudaMemcpyToArray(volume_array, 0, 0, volume_ptr,
+                    volume_width * volume_height * sizeof(float),
+                    cudaMemcpyHostToDevice);
+  cudaBindTextureToArray(volume_as_texture, volume_array, channelDesc);
+
+  auto ray_size_b = number_of_projections * sizeof(float2);
+  float2 *d_rays;
+  cudaMalloc(&d_rays, ray_size_b);
+  cudaMemcpy(d_rays, ray_vectors, ray_size_b, cudaMemcpyHostToDevice);
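+  // As written, both copies use cudaMemcpyHostToDevice, i.e. the volume and
+  // the ray vectors are expected in host memory, while `out` is written
+  // directly by the kernel and must therefore be a device pointer.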
+
+  float sampling_step_size = 0.2f;
+
+  int2 volume_size = make_int2(volume_width, volume_height);
+  float2 volume_spacing = make_float2(volume_spacing_x, volume_spacing_y);
+  float2 volume_origin = make_float2(volume_origin_x, volume_origin_y);
+
+  const unsigned blocksize = 256;
+  const dim3 gridsize =
+      dim3((detector_size / blocksize) + 1, number_of_projections);
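+  // One 256-thread block per 256 detector pixels (rounded up); grid.y spans
+  // the projections. Out-of-range threads exit early in the kernel.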
+  project_2Dpar_beam_kernel<<<gridsize, blocksize>>>(
+      out, d_rays, number_of_projections, sampling_step_size, volume_size,
+      volume_spacing, volume_origin, detector_size, detector_spacing,
+      detector_origin);
+
+  // cleanup
+  cudaUnbindTexture(volume_as_texture);
+  cudaFreeArray(volume_array);
+  cudaFree(d_rays);
+}
+
+
diff --git a/src/pyronn_torch/pyronn_torch.cpp b/generated_files/pyronn_torch.cpp
similarity index 73%
rename from src/pyronn_torch/pyronn_torch.cpp
rename to generated_files/pyronn_torch.cpp
index 411ee34186bfd8d9312547d27722a5d88f79cf4d..0d30c7a0ded8a3a638ab4289b3ea5e148a16b11f 100644
--- a/src/pyronn_torch/pyronn_torch.cpp
+++ b/generated_files/pyronn_torch.cpp
@@ -11,43 +11,43 @@
 #include <torch/extension.h>
 
 
-void Cone_Backprojection3D_Kernel_Launcher(const float *sinogram_ptr, float *out, const float *projection_matrices, const int number_of_projections,
-                                          const int volume_width, const int volume_height, const int volume_depth,
-                                          const float volume_spacing_x, const float volume_spacing_y, const float volume_spacing_z,
-                                          const float volume_origin_x, const float volume_origin_y, const float volume_origin_z,
-                                          const int detector_width, const int detector_height, const float projection_multiplier);
-
-
-void Cone_Projection_Kernel_Launcher(const float* volume_ptr, float *out, const float *inv_AR_matrix, const float *src_points, 
-                                    const int number_of_projections, const int volume_width, const int volume_height, const int volume_depth, 
-                                    const float volume_spacing_x, const float volume_spacing_y, const float volume_spacing_z,
-                                    const int detector_width, const int detector_height, const float step_size);
-
-
-void Cone_Projection_Kernel_Tex_Interp_Launcher(
-    const float *__restrict__ volume_ptr, float *out,
-    const float *inv_AR_matrix, const float *src_points,
-    const int number_of_projections, const int volume_width,
-    const int volume_height, const int volume_depth,
-    const float volume_spacing_x, const float volume_spacing_y,
-    const float volume_spacing_z, const int detector_width,
-    const int detector_height, const float step_size);
-
-
-void Parallel_Backprojection2D_Kernel_Launcher(const float *sinogram_ptr, float *out, const float *ray_vectors, const int number_of_projections,
-                                               const int volume_width, const int volume_height, const float volume_spacing_x, const float volume_spacing_y,
-                                               const float volume_origin_x, const float volume_origin_y,
-                                               const int detector_size, const float detector_spacing, const float detector_origin);
-
-
-void Parallel_Projection2D_Kernel_Launcher(
-    const float *volume_ptr, float *out, const float *ray_vectors,
-    const int number_of_projections, const int volume_width,
-    const int volume_height, const float volume_spacing_x,
-    const float volume_spacing_y, const float volume_origin_x,
-    const float volume_origin_y, const int detector_size,
-    const float detector_spacing, const float detector_origin);
-
+    void Cone_Backprojection3D_Kernel_Launcher(const float *sinogram_ptr, float *out, const float *projection_matrices, const int number_of_projections,
+                                              const int volume_width, const int volume_height, const int volume_depth,
+                                              const float volume_spacing_x, const float volume_spacing_y, const float volume_spacing_z,
+                                              const float volume_origin_x, const float volume_origin_y, const float volume_origin_z,
+                                              const int detector_width, const int detector_height, const float projection_multiplier);
+    
+
+    void Cone_Projection_Kernel_Launcher(const float* volume_ptr, float *out, const float *inv_AR_matrix, const float *src_points, 
+                                        const int number_of_projections, const int volume_width, const int volume_height, const int volume_depth, 
+                                        const float volume_spacing_x, const float volume_spacing_y, const float volume_spacing_z,
+                                        const int detector_width, const int detector_height, const float step_size);
+    
+
+    void Cone_Projection_Kernel_Tex_Interp_Launcher(
+        const float *__restrict__ volume_ptr, float *out,
+        const float *inv_AR_matrix, const float *src_points,
+        const int number_of_projections, const int volume_width,
+        const int volume_height, const int volume_depth,
+        const float volume_spacing_x, const float volume_spacing_y,
+        const float volume_spacing_z, const int detector_width,
+        const int detector_height, const float step_size);
+    
+
+    void Parallel_Backprojection2D_Kernel_Launcher(const float *sinogram_ptr, float *out, const float *ray_vectors, const int number_of_projections,
+                                                   const int volume_width, const int volume_height, const float volume_spacing_x, const float volume_spacing_y,
+                                                   const float volume_origin_x, const float volume_origin_y,
+                                                   const int detector_size, const float detector_spacing, const float detector_origin);
+    
+
+    void Parallel_Projection2D_Kernel_Launcher(
+        const float *volume_ptr, float *out, const float *ray_vectors,
+        const int number_of_projections, const int volume_width,
+        const int volume_height, const float volume_spacing_x,
+        const float volume_spacing_y, const float volume_origin_x,
+        const float volume_origin_y, const int detector_size,
+        const float detector_spacing, const float detector_origin);
+    
 using namespace pybind11::literals;
 
 
diff --git a/setup.py b/setup.py
index 5038bb1a45162f8c52bd20b0c3b3fd101a990cd6..8d9d7782680a1bc53d15e11f661091ca4129071a 100644
--- a/setup.py
+++ b/setup.py
@@ -1,8 +1,6 @@
 import sys
 from glob import glob
-from os import makedirs
-from os.path import basename, dirname, join
-from shutil import copyfile, copytree, rmtree
+from os.path import dirname, join
 
 from pkg_resources import VersionConflict, require
 from setuptools import setup
@@ -20,20 +18,9 @@ if __name__ == "__main__":
     object_cache = dirname(__file__)
     module_name = 'pyronn_torch_cpp'
 
-    source_files = glob(join(dirname(__file__), 'src', 'pyronn_torch', 'PYRO-NN-Layers', '*.cu.cc'))
+    cuda_sources = glob(join(dirname(__file__), 'generated_files', '*.cu'))
 
-    generated_file = join('src', 'pyronn_torch', 'pyronn_torch.cpp')
-
-    cuda_sources = []
-    makedirs(join(object_cache, module_name), exist_ok=True)
-    rmtree(join(object_cache, module_name, 'helper_headers'), ignore_errors=True)
-    copytree(join(dirname(__file__), 'src', 'pyronn_torch',  'PYRO-NN-Layers', 'helper_headers'),
-             join(object_cache, module_name, 'helper_headers'))
-
-    for s in source_files:
-        dst = join(object_cache, module_name, basename(s).replace('.cu.cc', '.cu'))
-        copyfile(s, dst)  # Torch only accepts *.cu as CUDA
-        cuda_sources.append(join(module_name, basename(s).replace('.cu.cc', '.cu')))
+    generated_file = join('generated_files', 'pyronn_torch.cpp')
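+    # Both the CUDA kernels and the pybind11/torch wrapper are pre-generated
+    # into generated_files/ (see src/pyronn_torch/codegen.py), so the build no
+    # longer needs to copy sources out of the PYRO-NN-Layers submodule.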
 
     setup(use_pyscaffold=True,
           ext_modules=[
diff --git a/src/pyronn_torch/codegen.py b/src/pyronn_torch/codegen.py
index 632b72e8a88a58d85a881f473f9c95147f215b17..5f1e96e8bf7666d342333660a03c8e1d5dacf5b9 100644
--- a/src/pyronn_torch/codegen.py
+++ b/src/pyronn_torch/codegen.py
@@ -172,14 +172,15 @@ def generate_shared_object(output_folder=None,
                            source_files=None,
                            show_code=False,
                            framework_module_class=TorchModule,
-                           generate_code_only=False):
+                           generate_code_only=False,
+                           update_repo_files=False):
 
     object_cache = get_cache_config()['object_cache']
 
     module_name = 'pyronn_torch_cpp'
 
     if not output_folder:
-        output_folder = dirname(__file__)
+        output_folder = join(dirname(__file__), '..', '..', 'generated_files')
 
     if not source_files:
         source_files = glob(join(dirname(__file__), 'PYRO-NN-Layers', '*.cu.cc'))
@@ -189,11 +190,18 @@ def generate_shared_object(output_folder=None,
     rmtree(join(object_cache, module_name, 'helper_headers'), ignore_errors=True)
     copytree(join(dirname(__file__), 'PYRO-NN-Layers', 'helper_headers'),
              join(object_cache, module_name, 'helper_headers'))
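+    # When update_repo_files is set, mirror the generated sources and helper
+    # headers into the repository's generated_files/ folder so that setup.py
+    # can build the extension without regenerating the code.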
+    if update_repo_files:
+        # copytree fails if the destination exists, so clear any previous copy
+        rmtree(join(output_folder, 'helper_headers'), ignore_errors=True)
+        copytree(join(dirname(__file__), 'PYRO-NN-Layers', 'helper_headers'),
+                 join(output_folder, 'helper_headers'))
 
     for s in source_files:
         dst = join(object_cache, module_name, basename(s).replace('.cu.cc', '.cu'))
         copyfile(s, dst)  # Torch only accepts *.cu as CUDA
         cuda_sources.append(dst)
+        if update_repo_files:
+            # Also mirror the renamed *.cu file into the repository output folder
+            copyfile(s, join(output_folder, basename(s).replace('.cu.cc', '.cu')))
+
 
     module = framework_module_class(module_name, FUNCTIONS.values())
 
@@ -210,13 +218,15 @@ def generate_shared_object(output_folder=None,
 
     shared_object_file = module.compiled_file
     copyfile(shared_object_file, join(output_folder, module_name + '.so'))
-    if show_code:
+
+    if update_repo_files:
         with open(join(output_folder, 'pyronn_torch.cpp'), 'w') as f:
             f.write(pystencils.get_code_str(module, custom_backend=FrameworkIntegrationPrinter()))
 
     return extension
 
 
 def compile_shared_object(output_folder=None, source_files=None):
     """
     Same as generate_shared_object without pystencils dependency
@@ -268,7 +278,7 @@ def main():
     parser.add_argument('--source-files',  default=None)
     args = parser.parse_args()
 
-    generate_shared_object(args.output_folder, args.source_files, show_code=True)
+    generate_shared_object(args.output_folder, args.source_files, show_code=True, update_repo_files=True)
     # compile_shared_object(args.output_folder, args.source_files)