diff --git a/src/pystencils_autodiff/backends/astnodes.py b/src/pystencils_autodiff/backends/astnodes.py
index acd536ff889e0d8506a5b039444c1ae03ca3387a..bb24dc510799309970b484692971b693ba50096c 100644
--- a/src/pystencils_autodiff/backends/astnodes.py
+++ b/src/pystencils_autodiff/backends/astnodes.py
@@ -99,18 +99,18 @@ class TorchModule(JinjaCppFile):
         file_extension = '.cu' if self.is_cuda else '.cpp'
         source_code = str(self)
         hash = _hash(source_code.encode()).hexdigest()
-        file_name = join(get_cache_config()['object_cache'], f'{hash}{file_extension}')
+        build_dir = join(get_cache_config()['object_cache'], self.module_name)
+        os.makedirs(build_dir, exist_ok=True)
+        file_name = join(build_dir, f'{hash}{file_extension}')
 
         if not exists(file_name):
             write_file(file_name, source_code)
-        build_dir = join(get_cache_config()['object_cache'], self.module_name)
-        os.makedirs(build_dir, exist_ok=True)
-
         torch_extension = load(hash,
                                [file_name],
                                with_cuda=self.is_cuda,
                                extra_cflags=['--std=c++14'],
+                               extra_cuda_cflags=['-std=c++14'],
                                build_directory=build_dir,
                                extra_include_paths=[get_pycuda_include_path(), get_pystencils_include_path()])
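The new TorchModule.compile() hunk above names the generated source file after a hash of its contents and gives every module its own build directory inside the object cache, so unchanged kernels are not recompiled. A minimal sketch of that caching pattern, using hashlib in place of the project's _hash and a made-up cache root (illustrative only, not part of the patch):

    import hashlib
    import os

    def cached_source_path(source_code, module_name,
                           cache_root='/tmp/object_cache',  # hypothetical cache root
                           extension='.cpp'):
        # One build directory per module, one file per source hash.
        digest = hashlib.sha256(source_code.encode()).hexdigest()
        build_dir = os.path.join(cache_root, module_name)
        os.makedirs(build_dir, exist_ok=True)
        file_name = os.path.join(build_dir, f'{digest}{extension}')
        if not os.path.exists(file_name):
            with open(file_name, 'w') as f:
                f.write(source_code)
        return file_name

The load() call (presumably torch.utils.cpp_extension.load) is then pointed at that file with build_directory=build_dir, so repeated calls with identical sources reuse the cached build.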
diff --git a/src/pystencils_autodiff/framework_integration/astnodes.py b/src/pystencils_autodiff/framework_integration/astnodes.py
index ed16a4dbb7054750be640faa70645e1a854cd6db..3e5e7a35366b371df389ee04e1a708ac5fe84094 100644
--- a/src/pystencils_autodiff/framework_integration/astnodes.py
+++ b/src/pystencils_autodiff/framework_integration/astnodes.py
@@ -90,76 +90,198 @@ class DestructuringBindingsForFieldClass(Node):
 
 
 class NativeTextureBinding(pystencils.backends.cbackend.CustomCodeNode):
-    CODE_TEMPLATE = """cudaResourceDesc {resource_desc}{{}};
-{resource_desc}.resType = cudaResourceTypeLinear;
-{resource_desc}.res.linear.devPtr = {device_ptr};
-{resource_desc}.res.linear.desc.f = {cuda_channel_format};
-{resource_desc}.res.linear.desc.x = {bits_per_channel}; // bits per channel
-{resource_desc}.res.linear.sizeInBytes = {total_size};
-
-cudaTextureDesc {texture_desc}{{}};
-cudaTextureObject_t {texture_object}=0;
-cudaCreateTextureObject(&{texture_object}, &{resource_desc}, &{texture_desc}, nullptr);
-{texture_desc}.readMode = cudaReadModeElementType;
-auto {texture_object}Destroyer = [&](){{
-    cudaDestroyTextureObject({texture_object});
-}};
-"""
-
-    def _get_channel_format_string(self):
-        """
-        From CUDA API documentation:
-
-        ``enum cudaChannelFormatKind``
-
-        Channel format kind
-
-        Enumerator:
-        ============================= ========================
-        cudaChannelFormatKindSigned   Signed channel format.
-        cudaChannelFormatKindUnsigned Unsigned channel format.
-        cudaChannelFormatKindFloat    Float channel format.
-        cudaChannelFormatKindNone     No channel format.
-        ============================= ========================
-        """
-        dtype = self._device_ptr.dtype.base_type
-        if np.issubdtype(dtype.numpy_dtype, np.signedinteger):
-            return 'cudaChannelFormatKindSigned'
-        elif np.issubdtype(dtype.numpy_dtype, np.unsignedinteger):
-            return 'cudaChannelFormatKindUnsigned'
-        elif np.issubdtype(dtype.numpy_dtype, np.float32):
-            return 'cudaChannelFormatKindFloat'
-        elif np.issubdtype(dtype.numpy_dtype, np.float64):
-            # PyCUDA double texture hack! See pystencils/include/pycuda-helper-modified.hpp
-            return 'cudaChannelFormatKindSigned'
-        else:
-            raise NotImplementedError('dtype not supported for CUDA textures')
+    """
+    Binds a texture to a CUDA device pointer.
+
+    Recommended reading: https://devblogs.nvidia.com/cuda-pro-tip-kepler-texture-objects-improve-performance-and-flexibility/
+
+    The definitions of ``cudaResourceDesc`` and ``cudaTextureDesc``:
+
+    .. code:: c
+
+        /**
+         * CUDA resource descriptor
+         */
+        struct __device_builtin__ cudaResourceDesc {
+            enum cudaResourceType resType;             /**< Resource type */
+
+            union {
+                struct {
+                    cudaArray_t array;                 /**< CUDA array */
+                } array;
+                struct {
+                    cudaMipmappedArray_t mipmap;       /**< CUDA mipmapped array */
+                } mipmap;
+                struct {
+                    void *devPtr;                      /**< Device pointer */
+                    struct cudaChannelFormatDesc desc; /**< Channel descriptor */
+                    size_t sizeInBytes;                /**< Size in bytes */
+                } linear;
+                struct {
+                    void *devPtr;                      /**< Device pointer */
+                    struct cudaChannelFormatDesc desc; /**< Channel descriptor */
+                    size_t width;                      /**< Width of the array in elements */
+                    size_t height;                     /**< Height of the array in elements */
+                    size_t pitchInBytes;               /**< Pitch between two rows in bytes */
+                } pitch2D;
+            } res;
+        };
+
+    .. code:: c
+
+        /**
+         * CUDA texture descriptor
+         */
+        struct __device_builtin__ cudaTextureDesc
+        {
+            /**
+             * Texture address mode for up to 3 dimensions
+             */
+            enum cudaTextureAddressMode addressMode[3];
+            /**
+             * Texture filter mode
+             */
+            enum cudaTextureFilterMode filterMode;
+            /**
+             * Texture read mode
+             */
+            enum cudaTextureReadMode readMode;
+            /**
+             * Perform sRGB->linear conversion during texture read
+             */
+            int sRGB;
+            /**
+             * Texture Border Color
+             */
+            float borderColor[4];
+            /**
+             * Indicates whether texture reads are normalized or not
+             */
+            int normalizedCoords;
+            /**
+             * Limit to the anisotropy ratio
+             */
+            unsigned int maxAnisotropy;
+            /**
+             * Mipmap filter mode
+             */
+            enum cudaTextureFilterMode mipmapFilterMode;
+            /**
+             * Offset applied to the supplied mipmap level
+             */
+            float mipmapLevelBias;
+            /**
+             * Lower end of the mipmap level range to clamp access to
+             */
+            float minMipmapLevelClamp;
+            /**
+             * Upper end of the mipmap level range to clamp access to
+             */
+            float maxMipmapLevelClamp;
+        };
+    """  # noqa
+    CODE_TEMPLATE_LINEAR = jinja2.Template("""cudaResourceDesc {{resource_desc}}{};
+{{resource_desc}}.resType = cudaResourceTypeLinear;
+{{resource_desc}}.res.linear.devPtr = {{device_ptr}};
+{{resource_desc}}.res.linear.desc.f = {{cuda_channel_format}};
+{{resource_desc}}.res.linear.desc.x = {{bits_per_channel}}; // bits per channel
+{{resource_desc}}.res.linear.sizeInBytes = {{total_size}};
+
+cudaTextureDesc {{texture_desc}}{};
+{{texture_desc}}.readMode = cudaReadModeElementType;
+cudaTextureObject_t {{texture_object}} = 0;
+cudaCreateTextureObject(&{{texture_object}}, &{{resource_desc}}, &{{texture_desc}}, nullptr);
+std::shared_ptr<void> {{texture_object}}Destroyer(nullptr, [&](...) {
+    cudaDestroyTextureObject({{texture_object}});
+});
+""")
+    CODE_TEMPLATE_PITCHED2D = jinja2.Template(""" !!! TODO!!! """)
+    CODE_TEMPLATE_CUDA_ARRAY = jinja2.Template("""
+auto channel_desc_{{texture_name}} = {{channel_desc}};
+{{ create_array }}
+{{ copy_array }}
+{{ texture_name }}.addressMode[0] = cudaAddressModeBorder;
+{{ texture_name }}.addressMode[1] = cudaAddressModeBorder;
+{{ texture_name }}.addressMode[2] = cudaAddressModeBorder;
+{{ texture_name }}.filterMode = cudaFilterModeLinear;
+{{ texture_name }}.normalized = false;
+cudaBindTextureToArray(&{{texture_name}}, {{array}}, &channel_desc_{{texture_name}});
+std::shared_ptr<void> {{array}}Destroyer(nullptr, [&](...) {
+    cudaFreeArray({{array}});
+    cudaUnbindTexture({{texture_name}});
+});
+""")
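Note the escaping in the Jinja templates above: a literal C++ brace pair is written as {} while {{ ... }} marks a substitution, the reverse of the str.format template they replace ({var} and {{}}). A tiny, self-contained illustration of how such a template expands (placeholder values are examples only):

    import jinja2

    snippet = jinja2.Template(
        "cudaResourceDesc {{resource_desc}}{};\n"
        "{{resource_desc}}.res.linear.devPtr = {{device_ptr}};")
    print(snippet.render(resource_desc='resDesc_tex0', device_ptr='_data_field'))
    # cudaResourceDesc resDesc_tex0{};
    # resDesc_tex0.res.linear.devPtr = _data_field;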
""") + CODE_TEMPLATE_CUDA_ARRAY = jinja2.Template(""" +auto channel_desc_{{texture_name}} = {{channel_desc}}; +{{ create_array }} +{{ copy_array }} +{{ texture_name }}.addressMode[0] = cudaAddressModeBorder; +{{ texture_name }}.addressMode[1] = cudaAddressModeBorder; +{{ texture_name }}.addressMode[2] = cudaAddressModeBorder; +{{ texture_name }}.filterMode = cudaFilterModeLinear; +{{ texture_name }}.normalized = false; +cudaBindTextureToArray(&{{texture_name}}, {{array}}, &channel_desc_{{texture_name}}); +std::shared_ptr<void> {{array}}Destroyer(nullptr, [&](...){ + cudaFreeArray({{array}}); + cudaUnbindTexture({{texture_name}}); +}); + """) def __init__(self, texture, device_data_ptr, use_texture_objects=True): self._texture = texture self._device_ptr = device_data_ptr - self._dtype = self._device_ptr.dtype.base_type.numpy_dtype + self._dtype = self._device_ptr.dtype.base_type self._shape = tuple(sp.S(s) for s in self._texture.field.shape) + self._ndim = texture.field.ndim assert use_texture_objects, "without texture objects is not implemented" super().__init__(self.get_code(dialect='c', vector_instruction_set=None), symbols_read={device_data_ptr, *[s for s in self._shape if isinstance(s, sp.Symbol)]}, symbols_defined={}) - self.headers.append("<cuda.h>") + self.headers = ['<memory>', '<cuda.h>', '<cuda_runtime_api.h>'] def get_code(self, dialect, vector_instruction_set): texture_name = self._texture.symbol.name - code = self.CODE_TEMPLATE.format( + code = self.CODE_TEMPLATE_CUDA_ARRAY.render( resource_desc='resDesc_' + texture_name, texture_desc='texDesc_' + texture_name, + channel_desc=f'cudaCreateChannelDesc<{self._dtype}>()', # noqa texture_object='tex_' + texture_name, + array='array_' + texture_name, + texture_name=texture_name, + ndim=self._ndim, device_ptr=self._device_ptr, - cuda_channel_format=self._get_channel_format_string(), - bits_per_channel=self._dtype.itemsize * 8, - total_size=self._dtype.itemsize * reduce(lambda x, y: x * y, self._shape, 1)) + create_array=self._get_create_array_call(), + copy_array=self._get_copy_array_call(), + dtype=self._dtype, + bits_per_channel=self._dtype.numpy_dtype.itemsize * 8, + total_size=self._dtype.numpy_dtype.itemsize * reduce(lambda x, y: x * y, self._shape, 1)) return code + def _get_create_array_call(self): + texture_name = self._texture.symbol.name + ndim = '' if self._ndim <= 2 else f'{self._ndim}D' + array = 'array_' + texture_name + return f"""cudaArray * {array}; +cudaMalloc{ndim}Array(&{array}, &channel_desc_{texture_name}, """ + ( + (f'{{{", ".join(str(s) for s in reversed(self._shape))}}});' + if self._ndim == 3 + else f'{", ".join(str(s) for s in reversed(self._shape))});')) + + def _get_copy_array_call(self): + texture_name = self._texture.symbol.name + array = 'array_' + texture_name + if self._texture.field.ndim == 3: + copy_params = f'cpy_{texture_name}_params' + return f"""cudaMemcpy3DParams {copy_params}{{}}; +{copy_params}.srcPtr = {{{self._device_ptr}, + {self._texture.field.strides[-1] * self._texture.field.shape[-1] * self._dtype.numpy_dtype.itemsize}, + {self._texture.field.shape[-1], self._texture.field.shape[-2]}}}; +{copy_params}.dsrArray = {array}; +copyParams.extent = {{{", ".join(reversed(self._shape))}}}; +copyParams.kind = cudaMemcpyDeviceToDevice; +cudaMemcpy3D(&{{copy_params}});""" # noqa + elif self._texture.field.ndim == 2: + # cudaMemcpy2DToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind); + + return 
f"""cudaMemcpy2DToArray({array}, + 0u, + 0u, + {self._device_ptr}, + {self._texture.field.strides[-1] * self._texture.field.shape[-1] * self._dtype.numpy_dtype.itemsize}, + {self._texture.field.shape[-1]}, + {self._texture.field.shape[-2]}, + cudaMemcpyDeviceToDevice); + """ + else: + raise NotImplementedError() + class KernelFunctionCall(Node): """