Move NativeTextureBinding to own file

4c6b251a · Stephan Seitz · c6de524f · 4c6b251a · 4c6b251a · 4c6b251a
Commit 4c6b251a authored 5 years ago by Stephan Seitz
--- a/src/pystencils_autodiff/backends/_torch_native.py
+++ b/src/pystencils_autodiff/backends/_torch_native.py
 from collections import OrderedDict
+from itertools import chain

 from pystencils_autodiff.backends._pytorch import numpy_dtype_to_torch
 from pystencils_autodiff.backends.astnodes import TorchModule
@@ -63,7 +64,7 @@ def create_autograd_function(autodiff_obj, use_cuda):
                kwargs[field.name] = torch.zeros(
                    field.shape,
                    dtype=numpy_dtype_to_torch(field.dtype.numpy_dtype),
-                    device=args[0].device)
+                    device=chain(args[0], kwargs.values()).device)
        output_tensors = OrderedDict({f.name:
                                      field_to_tensor_dict.get(f, kwargs[f.name])
                                      for f in autodiff_obj.forward_output_fields})

--- a/src/pystencils_autodiff/framework_integration/astnodes.py
+++ b/src/pystencils_autodiff/framework_integration/astnodes.py
@@ -11,11 +11,9 @@ waLBerla currently uses `pystencils-walberla <https://pypi.org/project/pystencil
 """
 import itertools
 from collections.abc import Iterable
-from functools import reduce
 from typing import Any, List, Set

 import jinja2
-import numpy as np

 import pystencils
 import sympy as sp
@@ -23,6 +21,7 @@ from pystencils.astnodes import KernelFunction, Node, NodeOrExpr, ResolvedFieldA
 from pystencils.data_types import TypedSymbol
 from pystencils.kernelparameters import FieldPointerSymbol, FieldShapeSymbol, FieldStrideSymbol
 from pystencils_autodiff.framework_integration.printer import FrameworkIntegrationPrinter
+from pystencils_autodiff.framework_integration.texture_astnodes import NativeTextureBinding


 class DestructuringBindingsForFieldClass(Node):
@@ -89,78 +88,6 @@ class DestructuringBindingsForFieldClass(Node):
        return self.body.atoms(arg_type) | {s for s in self.symbols_defined if isinstance(s, arg_type)}


-class NativeTextureBinding(pystencils.backends.cbackend.CustomCodeNode):
-    CODE_TEMPLATE = """cudaResourceDesc {resource_desc}{{}};
-{resource_desc}.resType = cudaResourceTypeLinear;
-{resource_desc}.res.linear.devPtr = {device_ptr};
-{resource_desc}.res.linear.desc.f = {cuda_channel_format};
-{resource_desc}.res.linear.desc.x = {bits_per_channel}; // bits per channel
-{resource_desc}.res.linear.sizeInBytes = {total_size};
-
-cudaTextureDesc {texture_desc}{{}};
-cudaTextureObject_t {texture_object}=0;
-cudaCreateTextureObject(&{texture_object}, &{resource_desc}, &{texture_desc}, nullptr);
-{texture_desc}.readMode = cudaReadModeElementType;
-auto {texture_object}Destroyer = [&](){{
-   cudaDestroyTextureObject({texture_object});
-}};
-    """
-
-    def _get_channel_format_string(self):
-        """
-        From CUDA API documentation:
-
-        ``enum cudaChannelFormatKind``
-
-        Channel format kind
-
-        Enumerator:
-            =============================   ========================
-            cudaChannelFormatKindSigned     Signed channel format.
-            cudaChannelFormatKindUnsigned   Unsigned channel format.
-            cudaChannelFormatKindFloat      Float channel format.
-            cudaChannelFormatKindNone       No channel format.
-            =============================   ========================
-        """
-        dtype = self._device_ptr.dtype.base_type
-        if np.issubdtype(dtype.numpy_dtype, np.signedinteger):
-            return 'cudaChannelFormatKindSigned'
-        elif np.issubdtype(dtype.numpy_dtype, np.unsignedinteger):
-            return 'cudaChannelFormatKindUnsigned'
-        elif np.issubdtype(dtype.numpy_dtype, np.float32):
-            return 'cudaChannelFormatKindFloat'
-        elif np.issubdtype(dtype.numpy_dtype, np.float64):
-            # PyCUDA double texture hack! See pystencils/include/pycuda-helper-modified.hpp
-            return 'cudaChannelFormatKindSigned'
-        else:
-            raise NotImplementedError('dtype not supported for CUDA textures')
-
-    def __init__(self, texture, device_data_ptr, use_texture_objects=True):
-        self._texture = texture
-        self._device_ptr = device_data_ptr
-        self._dtype = self._device_ptr.dtype.base_type.numpy_dtype
-        self._shape = tuple(sp.S(s) for s in self._texture.field.shape)
-        assert use_texture_objects, "without texture objects is not implemented"
-
-        super().__init__(self.get_code(dialect='c', vector_instruction_set=None),
-                         symbols_read={device_data_ptr,
-                                       *[s for s in self._shape if isinstance(s, sp.Symbol)]},
-                         symbols_defined={})
-        self.headers.append("<cuda.h>")
-
-    def get_code(self, dialect, vector_instruction_set):
-        texture_name = self._texture.symbol.name
-        code = self.CODE_TEMPLATE.format(
-            resource_desc='resDesc_' + texture_name,
-            texture_desc='texDesc_' + texture_name,
-            texture_object='tex_' + texture_name,
-            device_ptr=self._device_ptr,
-            cuda_channel_format=self._get_channel_format_string(),
-            bits_per_channel=self._dtype.itemsize * 8,
-            total_size=self._dtype.itemsize * reduce(lambda x, y: x * y, self._shape, 1))
-        return code
-
-
 class KernelFunctionCall(Node):
    """
    AST nodes representing a call of a :class:`pystencils.astnodes.KernelFunction`

--- a/src/pystencils_autodiff/framework_integration/texture_astnodes.py
+++ b/src/pystencils_autodiff/framework_integration/texture_astnodes.py
+# -*- coding: utf-8 -*-
+#
+# Copyright © 2019 Stephan Seitz <stephan.seitz@fau.de>
+#
+# Distributed under terms of the GPLv3 license.
+
+"""
+
+"""
+from functools import reduce
+
+import jinja2
+import sympy as sp
+
+import pystencils.backends
+
+
+class NativeTextureBinding(pystencils.backends.cbackend.CustomCodeNode):
+    """
+    Bind texture to CUDA device pointer
+
+    Recommended read: https://devblogs.nvidia.com/cuda-pro-tip-kepler-texture-objects-improve-performance-and-flexibility/
+
+    The definition from cudaResourceDesc and cudaTextureDesc
+
+    .. code:: c
+
+        /**
+         * CUDA resource descriptor
+         */
+        struct __device_builtin__ cudaResourceDesc {
+            enum cudaResourceType resType;             /**< Resource type */
+
+            union {
+                struct {
+                    cudaArray_t array;                 /**< CUDA array */
+                } array;
+                struct {
+                    cudaMipmappedArray_t mipmap;       /**< CUDA mipmapped array */
+                } mipmap;
+                struct {
+                    void *devPtr;                      /**< Device pointer */
+                    struct cudaChannelFormatDesc desc; /**< Channel descriptor */
+                    size_t sizeInBytes;                /**< Size in bytes */
+                } linear;
+                struct {
+                    void *devPtr;                      /**< Device pointer */
+                    struct cudaChannelFormatDesc desc; /**< Channel descriptor */
+                    size_t width;                      /**< Width of the array in elements */
+                    size_t height;                     /**< Height of the array in elements */
+                    size_t pitchInBytes;               /**< Pitch between two rows in bytes */
+                } pitch2D;
+            } res;
+        };
+
+    .. code:: c
+
+        /**
+         * CUDA texture descriptor
+         */
+        struct __device_builtin__ cudaTextureDesc
+        {
+            /**
+             * Texture address mode for up to 3 dimensions
+             */
+            enum cudaTextureAddressMode addressMode[3];
+            /**
+             * Texture filter mode
+             */
+            enum cudaTextureFilterMode  filterMode;
+            /**
+             * Texture read mode
+             */
+            enum cudaTextureReadMode    readMode;
+            /**
+             * Perform sRGB->linear conversion during texture read
+             */
+            int                         sRGB;
+            /**
+             * Texture Border Color
+             */
+            float                       borderColor[4];
+            /**
+             * Indicates whether texture reads are normalized or not
+             */
+            int                         normalizedCoords;
+            /**
+             * Limit to the anisotropy ratio
+             */
+            unsigned int                maxAnisotropy;
+            /**
+             * Mipmap filter mode
+             */
+            enum cudaTextureFilterMode  mipmapFilterMode;
+            /**
+             * Offset applied to the supplied mipmap level
+             */
+            float                       mipmapLevelBias;
+            /**
+             * Lower end of the mipmap level range to clamp access to
+             */
+            float                       minMipmapLevelClamp;
+            /**
+             * Upper end of the mipmap level range to clamp access to
+             */
+            float                       maxMipmapLevelClamp;
+        };
+    """  # noqa
+    CODE_TEMPLATE_LINEAR = jinja2.Template("""cudaResourceDesc {{resource_desc}}{};
+{{resource_desc}}.resType = cudaResourceTypeLinear;
+{{resource_desc}}.res.linear.devPtr = {{device_ptr}};
+{{resource_desc}}.res.linear.desc.f = {{cuda_channel_format}};
+{{resource_desc}}.res.linear.desc.x = {{bits_per_channel}}; // bits per channel
+{{resource_desc}}.res.linear.sizeInBytes = {{total_size}};
+
+cudaTextureDesc {{texture_desc}}{};
+cudaTextureObject_t {{texture_object}}=0;
+cudaCreateTextureObject(&{{texture_object}}, &{{resource_desc}}, &{{texture_desc}}, nullptr);
+{{texture_desc}}.readMode = cudaReadModeElementType;
+auto {{texture_object}}Destroyer = std::unique_ptr(nullptr, [&](){
+   cudaDestroyTextureObject({{texture_object}});
+});
+    """)
+    CODE_TEMPLATE_PITCHED2D = jinja2.Template(""" !!! TODO!!! """)
+    CODE_TEMPLATE_CUDA_ARRAY = jinja2.Template("""
+auto channel_desc_{{texture_name}} = {{channel_desc}};
+{{ create_array }}
+{{ copy_array }}
+    cudaDeviceSynchronize();
+{{ texture_name }}.addressMode[0] = cudaAddressModeBorder;
+{{ texture_name }}.addressMode[1] = cudaAddressModeBorder;
+{{ texture_name }}.addressMode[2] = cudaAddressModeBorder;
+{{ texture_name }}.filterMode = cudaFilterModeLinear;
+{{ texture_name }}.normalized = false;
+cudaBindTextureToArray(&{{texture_name}}, {{array}}, &channel_desc_{{texture_name}});
+std::shared_ptr<void> {{array}}Destroyer(nullptr, [&](...){
+    cudaDeviceSynchronize();
+    cudaFreeArray({{array}});
+    cudaUnbindTexture({{texture_name}});
+});
+    """)
+
+    def __init__(self, texture, device_data_ptr, use_texture_objects=True):
+        self._texture = texture
+        self._device_ptr = device_data_ptr
+        self._dtype = self._device_ptr.dtype.base_type
+        self._shape = tuple(sp.S(s) for s in self._texture.field.shape)
+        self._ndim = texture.field.ndim
+        assert use_texture_objects, "without texture objects is not implemented"
+
+        super().__init__(self.get_code(dialect='c', vector_instruction_set=None),
+                         symbols_read={device_data_ptr,
+                                       *[s for s in self._shape if isinstance(s, sp.Symbol)]},
+                         symbols_defined={})
+        self.headers = ['<memory>', '<cuda.h>', '<cuda_runtime_api.h>']
+
+    def get_code(self, dialect, vector_instruction_set):
+        texture_name = self._texture.symbol.name
+        code = self.CODE_TEMPLATE_CUDA_ARRAY.render(
+            resource_desc='resDesc_' + texture_name,
+            texture_desc='texDesc_' + texture_name,
+            channel_desc=f'cudaCreateChannelDesc<{self._dtype}>()',  # noqa
+            texture_object='tex_' + texture_name,
+            array='array_' + texture_name,
+            texture_name=texture_name,
+            ndim=self._ndim,
+            device_ptr=self._device_ptr,
+            create_array=self._get_create_array_call(),
+            copy_array=self._get_copy_array_call(),
+            dtype=self._dtype,
+            bits_per_channel=self._dtype.numpy_dtype.itemsize * 8,
+            total_size=self._dtype.numpy_dtype.itemsize * reduce(lambda x, y: x * y, self._shape, 1))
+        return code
+
+    def _get_create_array_call(self):
+        texture_name = self._texture.symbol.name
+        ndim = '' if self._ndim <= 2 else f'{self._ndim}D'
+        array = 'array_' + texture_name
+        return f"""cudaArray * {array};
+cudaMalloc{ndim}Array(&{array}, &channel_desc_{texture_name}, """ + (
+            (f'{{{", ".join(str(s) for s in reversed(self._shape))}}});'
+                if self._ndim == 3
+                else f'{", ".join(str(s) for s in reversed(self._shape))});'))
+
+    def _get_copy_array_call(self):
+        texture_name = self._texture.symbol.name
+        array = 'array_' + texture_name
+        if self._texture.field.ndim == 3:
+            copy_params = f'cpy_{texture_name}_params'
+            return f"""cudaMemcpy3DParms {copy_params}{{}};
+{copy_params}.srcPtr = {{{self._device_ptr},
+                        {self._texture.field.strides[-2] * self._texture.field.dtype.numpy_dtype.itemsize},
+                        {self._texture.field.shape[-1]},
+                        {self._texture.field.shape[-2]}}};
+{copy_params}.dstArray = {array};
+{copy_params}.extent = {{{", ".join(str(s) for s in reversed(self._shape))}}};
+{copy_params}.kind = cudaMemcpyDeviceToDevice;
+cudaMemcpy3D(&{copy_params});"""  # noqa
+        elif self._texture.field.ndim == 2:
+            # noqa: cudaMemcpy2DToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind);
+            return f"""cudaMemcpy2DToArray({array},
+                    0u,
+                    0u,
+                    {self._device_ptr},
+                    {self._texture.field.strides[-2] * self._texture.field.dtype.numpy_dtype.itemsize},
+                    {self._texture.field.shape[-1] * self._texture.field.dtype.numpy_dtype.itemsize}, // Dafaq, this has to be in bytes, but only columns and only in memcpy2D
+                    {self._texture.field.shape[-2]},
+                    cudaMemcpyDeviceToDevice);
+ """  # noqa
+        else:
+            raise NotImplementedError()