Progress for 2d textures

4ab12dea · Stephan Seitz · 40ea9ea6 · 4ab12dea · 4ab12dea
Commit 4ab12dea authored 5 years ago by Stephan Seitz
--- a/src/pystencils_autodiff/backends/astnodes.py
+++ b/src/pystencils_autodiff/backends/astnodes.py
@@ -99,18 +99,18 @@ class TorchModule(JinjaCppFile):
        file_extension = '.cu' if self.is_cuda else '.cpp'
        source_code = str(self)
        hash = _hash(source_code.encode()).hexdigest()
-        file_name = join(get_cache_config()['object_cache'], f'{hash}{file_extension}')
+        build_dir = join(get_cache_config()['object_cache'], self.module_name)
+        os.makedirs(build_dir, exist_ok=True)
+        file_name = join(build_dir, f'{hash}{file_extension}')
        if not exists(file_name):
            write_file(file_name, source_code)
-        build_dir = join(get_cache_config()['object_cache'], self.module_name)
-        os.makedirs(build_dir, exist_ok=True)
        torch_extension = load(hash,
                               [file_name],
                               with_cuda=self.is_cuda,
                               extra_cflags=['--std=c++14'],
+                               extra_cuda_cflags=['-std=c++14'],
                               build_directory=build_dir,
                               extra_include_paths=[get_pycuda_include_path(),
                                                    get_pystencils_include_path()])

--- a/src/pystencils_autodiff/framework_integration/astnodes.py
+++ b/src/pystencils_autodiff/framework_integration/astnodes.py
@@ -90,76 +90,198 @@ class DestructuringBindingsForFieldClass(Node):
 class NativeTextureBinding(pystencils.backends.cbackend.CustomCodeNode):
-    CODE_TEMPLATE = """cudaResourceDesc {resource_desc}{{}};
-{resource_desc}.resType = cudaResourceTypeLinear;
-{resource_desc}.res.linear.devPtr = {device_ptr};
-{resource_desc}.res.linear.desc.f = {cuda_channel_format};
-{resource_desc}.res.linear.desc.x = {bits_per_channel}; // bits per channel
-{resource_desc}.res.linear.sizeInBytes = {total_size};
-cudaTextureDesc {texture_desc}{{}};
-cudaTextureObject_t {texture_object}=0;
-cudaCreateTextureObject(&{texture_object}, &{resource_desc}, &{texture_desc}, nullptr);
-{texture_desc}.readMode = cudaReadModeElementType;
-auto {texture_object}Destroyer = [&](){{
-   cudaDestroyTextureObject({texture_object});
-}};
    """
+    Bind texture to CUDA device pointer
-    def _get_channel_format_string(self):
-        """
+    Recommended read: https://devblogs.nvidia.com/cuda-pro-tip-kepler-texture-objects-improve-performance-and-flexibility/
-        From CUDA API documentation:
+    The definition from cudaResourceDesc and cudaTextureDesc
-        ``enum cudaChannelFormatKind``
+    .. code:: c
-        Channel format kind
+        /**
-        Enumerator:
+         * CUDA resource descriptor
-            =============================   ========================
+         */
-            cudaChannelFormatKindSigned     Signed channel format.
+        struct __device_builtin__ cudaResourceDesc {
-            cudaChannelFormatKindUnsigned   Unsigned channel format.
+            enum cudaResourceType resType;             /**< Resource type */
-            cudaChannelFormatKindFloat      Float channel format.
-            cudaChannelFormatKindNone       No channel format.
+            union {
-            =============================   ========================
+                struct {
-        """
+                    cudaArray_t array;                 /**< CUDA array */
-        dtype = self._device_ptr.dtype.base_type
+                } array;
-        if np.issubdtype(dtype.numpy_dtype, np.signedinteger):
+                struct {
-            return 'cudaChannelFormatKindSigned'
+                    cudaMipmappedArray_t mipmap;       /**< CUDA mipmapped array */
-        elif np.issubdtype(dtype.numpy_dtype, np.unsignedinteger):
+                } mipmap;
-            return 'cudaChannelFormatKindUnsigned'
+                struct {
-        elif np.issubdtype(dtype.numpy_dtype, np.float32):
+                    void *devPtr;                      /**< Device pointer */
-            return 'cudaChannelFormatKindFloat'
+                    struct cudaChannelFormatDesc desc; /**< Channel descriptor */
-        elif np.issubdtype(dtype.numpy_dtype, np.float64):
+                    size_t sizeInBytes;                /**< Size in bytes */
-            # PyCUDA double texture hack! See pystencils/include/pycuda-helper-modified.hpp
+                } linear;
-            return 'cudaChannelFormatKindSigned'
+                struct {
-        else:
+                    void *devPtr;                      /**< Device pointer */
-            raise NotImplementedError('dtype not supported for CUDA textures')
+                    struct cudaChannelFormatDesc desc; /**< Channel descriptor */
+                    size_t width;                      /**< Width of the array in elements */
+                    size_t height;                     /**< Height of the array in elements */
+                    size_t pitchInBytes;               /**< Pitch between two rows in bytes */
+                } pitch2D;
+            } res;
+        };
+    .. code:: c
+        /**
+         * CUDA texture descriptor
+         */
+        struct __device_builtin__ cudaTextureDesc
+        {
+            /**
+             * Texture address mode for up to 3 dimensions
+             */
+            enum cudaTextureAddressMode addressMode[3];
+            /**
+             * Texture filter mode
+             */
+            enum cudaTextureFilterMode  filterMode;
+            /**
+             * Texture read mode
+             */
+            enum cudaTextureReadMode    readMode;
+            /**
+             * Perform sRGB->linear conversion during texture read
+             */
+            int                         sRGB;
+            /**
+             * Texture Border Color
+             */
+            float                       borderColor[4];
+            /**
+             * Indicates whether texture reads are normalized or not
+             */
+            int                         normalizedCoords;
+            /**
+             * Limit to the anisotropy ratio
+             */
+            unsigned int                maxAnisotropy;
+            /**
+             * Mipmap filter mode
+             */
+            enum cudaTextureFilterMode  mipmapFilterMode;
+            /**
+             * Offset applied to the supplied mipmap level
+             */
+            float                       mipmapLevelBias;
+            /**
+             * Lower end of the mipmap level range to clamp access to
+             */
+            float                       minMipmapLevelClamp;
+            /**
+             * Upper end of the mipmap level range to clamp access to
+             */
+            float                       maxMipmapLevelClamp;
+        };
+    """  # noqa
+    CODE_TEMPLATE_LINEAR = jinja2.Template("""cudaResourceDesc {{resource_desc}}{};
+{{resource_desc}}.resType = cudaResourceTypeLinear;
+{{resource_desc}}.res.linear.devPtr = {{device_ptr}};
+{{resource_desc}}.res.linear.desc.f = {{cuda_channel_format}};
+{{resource_desc}}.res.linear.desc.x = {{bits_per_channel}}; // bits per channel
+{{resource_desc}}.res.linear.sizeInBytes = {{total_size}};
+cudaTextureDesc {{texture_desc}}{};
+cudaTextureObject_t {{texture_object}}=0;
+cudaCreateTextureObject(&{{texture_object}}, &{{resource_desc}}, &{{texture_desc}}, nullptr);
+{{texture_desc}}.readMode = cudaReadModeElementType;
+auto {{texture_object}}Destroyer = std::unique_ptr(nullptr, [&](){
+   cudaDestroyTextureObject({{texture_object}});
+});
+    """)
+    CODE_TEMPLATE_PITCHED2D = jinja2.Template(""" !!! TODO!!! """)
+    CODE_TEMPLATE_CUDA_ARRAY = jinja2.Template("""
+auto channel_desc_{{texture_name}} = {{channel_desc}};
+{{ create_array }}
+{{ copy_array }}
+{{ texture_name }}.addressMode[0] = cudaAddressModeBorder;
+{{ texture_name }}.addressMode[1] = cudaAddressModeBorder;
+{{ texture_name }}.addressMode[2] = cudaAddressModeBorder;
+{{ texture_name }}.filterMode = cudaFilterModeLinear;
+{{ texture_name }}.normalized = false;
+cudaBindTextureToArray(&{{texture_name}}, {{array}}, &channel_desc_{{texture_name}});
+std::shared_ptr<void> {{array}}Destroyer(nullptr, [&](...){
+    cudaFreeArray({{array}});
+    cudaUnbindTexture({{texture_name}});
+});
+    """)
    def __init__(self, texture, device_data_ptr, use_texture_objects=True):
        self._texture = texture
        self._device_ptr = device_data_ptr
-        self._dtype = self._device_ptr.dtype.base_type.numpy_dtype
+        self._dtype = self._device_ptr.dtype.base_type
        self._shape = tuple(sp.S(s) for s in self._texture.field.shape)
+        self._ndim = texture.field.ndim
        assert use_texture_objects, "without texture objects is not implemented"
        super().__init__(self.get_code(dialect='c', vector_instruction_set=None),
                         symbols_read={device_data_ptr,
                                       *[s for s in self._shape if isinstance(s, sp.Symbol)]},
                         symbols_defined={})
-        self.headers.append("<cuda.h>")
+        self.headers = ['<memory>', '<cuda.h>', '<cuda_runtime_api.h>']
    def get_code(self, dialect, vector_instruction_set):
        texture_name = self._texture.symbol.name
-        code = self.CODE_TEMPLATE.format(
+        code = self.CODE_TEMPLATE_CUDA_ARRAY.render(
            resource_desc='resDesc_' + texture_name,
            texture_desc='texDesc_' + texture_name,
+            channel_desc=f'cudaCreateChannelDesc<{self._dtype}>()',  # noqa
            texture_object='tex_' + texture_name,
+            array='array_' + texture_name,
+            texture_name=texture_name,
+            ndim=self._ndim,
            device_ptr=self._device_ptr,
-            cuda_channel_format=self._get_channel_format_string(),
+            create_array=self._get_create_array_call(),
-            bits_per_channel=self._dtype.itemsize * 8,
+            copy_array=self._get_copy_array_call(),
-            total_size=self._dtype.itemsize * reduce(lambda x, y: x * y, self._shape, 1))
+            dtype=self._dtype,
+            bits_per_channel=self._dtype.numpy_dtype.itemsize * 8,
+            total_size=self._dtype.numpy_dtype.itemsize * reduce(lambda x, y: x * y, self._shape, 1))
        return code
+    def _get_create_array_call(self):
+        texture_name = self._texture.symbol.name
+        ndim = '' if self._ndim <= 2 else f'{self._ndim}D'
+        array = 'array_' + texture_name
+        return f"""cudaArray * {array};
+cudaMalloc{ndim}Array(&{array}, &channel_desc_{texture_name}, """ + (
+            (f'{{{", ".join(str(s) for s in reversed(self._shape))}}});'
+                if self._ndim == 3
+                else f'{", ".join(str(s) for s in reversed(self._shape))});'))
+    def _get_copy_array_call(self):
+        texture_name = self._texture.symbol.name
+        array = 'array_' + texture_name
+        if self._texture.field.ndim == 3:
+            copy_params = f'cpy_{texture_name}_params'
+            return f"""cudaMemcpy3DParams {copy_params}{{}};
+{copy_params}.srcPtr = {{{self._device_ptr},
+                       {self._texture.field.strides[-1] * self._texture.field.shape[-1] * self._dtype.numpy_dtype.itemsize},
+                       {self._texture.field.shape[-1], self._texture.field.shape[-2]}}};
+{copy_params}.dsrArray = {array};
+copyParams.extent = {{{", ".join(reversed(self._shape))}}};
+copyParams.kind = cudaMemcpyDeviceToDevice;
+cudaMemcpy3D(&{{copy_params}});"""  # noqa
+        elif self._texture.field.ndim == 2:
+            # cudaMemcpy2DToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind);
+            return f"""cudaMemcpy2DToArray({array},
+                    0u,
+                    0u,
+                    {self._device_ptr},
+                    {self._texture.field.strides[-1] * self._texture.field.shape[-1] * self._dtype.numpy_dtype.itemsize},
+                    {self._texture.field.shape[-1]},
+                    {self._texture.field.shape[-2]},
+                    cudaMemcpyDeviceToDevice);
+ """
+        else:
+            raise NotImplementedError()
 class KernelFunctionCall(Node):
    """