diff --git a/src/pystencils_autodiff/backends/_torch_native.py b/src/pystencils_autodiff/backends/_torch_native.py
index 4bef57312a4ca5c8e3e4d0f52330e3a3b60cf530..a5701b154c02d9b94ceae7a7068cf4b35bc3356c 100644
--- a/src/pystencils_autodiff/backends/_torch_native.py
+++ b/src/pystencils_autodiff/backends/_torch_native.py
@@ -1,4 +1,5 @@
 from collections import OrderedDict
+from itertools import chain
 
 from pystencils_autodiff.backends._pytorch import numpy_dtype_to_torch
 from pystencils_autodiff.backends.astnodes import TorchModule
@@ -63,7 +64,7 @@ def create_autograd_function(autodiff_obj, use_cuda):
                 kwargs[field.name] = torch.zeros(
                     field.shape,
                     dtype=numpy_dtype_to_torch(field.dtype.numpy_dtype),
-                    device=args[0].device)
+                    device=next(chain(args, kwargs.values())).device)
 
         output_tensors = OrderedDict({f.name: field_to_tensor_dict.get(f, kwargs[f.name])
                                       for f in autodiff_obj.forward_output_fields})
diff --git a/src/pystencils_autodiff/framework_integration/astnodes.py b/src/pystencils_autodiff/framework_integration/astnodes.py
index ed16a4dbb7054750be640faa70645e1a854cd6db..e63a3a610d2498bf7e609ddd3ae8c628e7cf3d93 100644
--- a/src/pystencils_autodiff/framework_integration/astnodes.py
+++ b/src/pystencils_autodiff/framework_integration/astnodes.py
@@ -11,11 +11,9 @@ waLBerla currently uses `pystencils-walberla <https://pypi.org/project/pystencil
 """
 import itertools
 from collections.abc import Iterable
-from functools import reduce
 from typing import Any, List, Set
 
 import jinja2
-import numpy as np
 import pystencils
 import sympy as sp
 
@@ -23,6 +21,7 @@ from pystencils.astnodes import KernelFunction, Node, NodeOrExpr, ResolvedFieldA
 from pystencils.data_types import TypedSymbol
 from pystencils.kernelparameters import FieldPointerSymbol, FieldShapeSymbol, FieldStrideSymbol
 from pystencils_autodiff.framework_integration.printer import FrameworkIntegrationPrinter
+from pystencils_autodiff.framework_integration.texture_astnodes import NativeTextureBinding
 
 
 class DestructuringBindingsForFieldClass(Node):
@@ -89,78 +88,6 @@ class DestructuringBindingsForFieldClass(Node):
         return self.body.atoms(arg_type) | {s for s in self.symbols_defined if isinstance(s, arg_type)}
 
 
-class NativeTextureBinding(pystencils.backends.cbackend.CustomCodeNode):
-    CODE_TEMPLATE = """cudaResourceDesc {resource_desc}{{}};
-{resource_desc}.resType = cudaResourceTypeLinear;
-{resource_desc}.res.linear.devPtr = {device_ptr};
-{resource_desc}.res.linear.desc.f = {cuda_channel_format};
-{resource_desc}.res.linear.desc.x = {bits_per_channel}; // bits per channel
-{resource_desc}.res.linear.sizeInBytes = {total_size};
-
-cudaTextureDesc {texture_desc}{{}};
-cudaTextureObject_t {texture_object}=0;
-cudaCreateTextureObject(&{texture_object}, &{resource_desc}, &{texture_desc}, nullptr);
-{texture_desc}.readMode = cudaReadModeElementType;
-auto {texture_object}Destroyer = [&](){{
-    cudaDestroyTextureObject({texture_object});
-}};
-    """
-
-    def _get_channel_format_string(self):
-        """
-        From CUDA API documentation:
-
-        ``enum cudaChannelFormatKind``
-
-        Channel format kind
-
-        Enumerator:
-        ============================= ========================
-        cudaChannelFormatKindSigned   Signed channel format.
-        cudaChannelFormatKindUnsigned Unsigned channel format.
-        cudaChannelFormatKindFloat    Float channel format.
-        cudaChannelFormatKindNone     No channel format.
-        ============================= ========================
-        """
-        dtype = self._device_ptr.dtype.base_type
-        if np.issubdtype(dtype.numpy_dtype, np.signedinteger):
-            return 'cudaChannelFormatKindSigned'
-        elif np.issubdtype(dtype.numpy_dtype, np.unsignedinteger):
-            return 'cudaChannelFormatKindUnsigned'
-        elif np.issubdtype(dtype.numpy_dtype, np.float32):
-            return 'cudaChannelFormatKindFloat'
-        elif np.issubdtype(dtype.numpy_dtype, np.float64):
-            # PyCUDA double texture hack! See pystencils/include/pycuda-helper-modified.hpp
-            return 'cudaChannelFormatKindSigned'
-        else:
-            raise NotImplementedError('dtype not supported for CUDA textures')
-
-    def __init__(self, texture, device_data_ptr, use_texture_objects=True):
-        self._texture = texture
-        self._device_ptr = device_data_ptr
-        self._dtype = self._device_ptr.dtype.base_type.numpy_dtype
-        self._shape = tuple(sp.S(s) for s in self._texture.field.shape)
-        assert use_texture_objects, "without texture objects is not implemented"
-
-        super().__init__(self.get_code(dialect='c', vector_instruction_set=None),
-                         symbols_read={device_data_ptr,
-                                       *[s for s in self._shape if isinstance(s, sp.Symbol)]},
-                         symbols_defined={})
-        self.headers.append("<cuda.h>")
-
-    def get_code(self, dialect, vector_instruction_set):
-        texture_name = self._texture.symbol.name
-        code = self.CODE_TEMPLATE.format(
-            resource_desc='resDesc_' + texture_name,
-            texture_desc='texDesc_' + texture_name,
-            texture_object='tex_' + texture_name,
-            device_ptr=self._device_ptr,
-            cuda_channel_format=self._get_channel_format_string(),
-            bits_per_channel=self._dtype.itemsize * 8,
-            total_size=self._dtype.itemsize * reduce(lambda x, y: x * y, self._shape, 1))
-        return code
-
-
 class KernelFunctionCall(Node):
     """
     AST nodes representing a call of a :class:`pystencils.astnodes.KernelFunction`
diff --git a/src/pystencils_autodiff/framework_integration/texture_astnodes.py b/src/pystencils_autodiff/framework_integration/texture_astnodes.py
new file mode 100644
index 0000000000000000000000000000000000000000..cbc58bbacfc39ef14aa87a78a8047c6337d0843f
--- /dev/null
+++ b/src/pystencils_autodiff/framework_integration/texture_astnodes.py
@@ -0,0 +1,211 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright © 2019 Stephan Seitz <stephan.seitz@fau.de>
+#
+# Distributed under terms of the GPLv3 license.
+
+"""
+AST nodes for binding CUDA textures to device memory.
+"""
+from functools import reduce
+
+import jinja2
+import sympy as sp
+
+import pystencils.backends
+
+
+class NativeTextureBinding(pystencils.backends.cbackend.CustomCodeNode):
+    """
+    Bind a texture to a CUDA device pointer.
+
+    Recommended read: https://devblogs.nvidia.com/cuda-pro-tip-kepler-texture-objects-improve-performance-and-flexibility/
+
+    The definitions of ``cudaResourceDesc`` and ``cudaTextureDesc``:
+
+    .. code:: c
+
+        /**
+         * CUDA resource descriptor
+         */
+        struct __device_builtin__ cudaResourceDesc {
+            enum cudaResourceType resType;             /**< Resource type */
+
+            union {
+                struct {
+                    cudaArray_t array;                 /**< CUDA array */
+                } array;
+                struct {
+                    cudaMipmappedArray_t mipmap;       /**< CUDA mipmapped array */
+                } mipmap;
+                struct {
+                    void *devPtr;                      /**< Device pointer */
+                    struct cudaChannelFormatDesc desc; /**< Channel descriptor */
+                    size_t sizeInBytes;                /**< Size in bytes */
+                } linear;
+                struct {
+                    void *devPtr;                      /**< Device pointer */
+                    struct cudaChannelFormatDesc desc; /**< Channel descriptor */
+                    size_t width;                      /**< Width of the array in elements */
+                    size_t height;                     /**< Height of the array in elements */
+                    size_t pitchInBytes;               /**< Pitch between two rows in bytes */
+                } pitch2D;
+            } res;
+        };
+
+    .. code:: c
+
+        /**
+         * CUDA texture descriptor
+         */
+        struct __device_builtin__ cudaTextureDesc
+        {
+            /**
+             * Texture address mode for up to 3 dimensions
+             */
+            enum cudaTextureAddressMode addressMode[3];
+            /**
+             * Texture filter mode
+             */
+            enum cudaTextureFilterMode filterMode;
+            /**
+             * Texture read mode
+             */
+            enum cudaTextureReadMode readMode;
+            /**
+             * Perform sRGB->linear conversion during texture read
+             */
+            int sRGB;
+            /**
+             * Texture Border Color
+             */
+            float borderColor[4];
+            /**
+             * Indicates whether texture reads are normalized or not
+             */
+            int normalizedCoords;
+            /**
+             * Limit to the anisotropy ratio
+             */
+            unsigned int maxAnisotropy;
+            /**
+             * Mipmap filter mode
+             */
+            enum cudaTextureFilterMode mipmapFilterMode;
+            /**
+             * Offset applied to the supplied mipmap level
+             */
+            float mipmapLevelBias;
+            /**
+             * Lower end of the mipmap level range to clamp access to
+             */
+            float minMipmapLevelClamp;
+            /**
+             * Upper end of the mipmap level range to clamp access to
+             */
+            float maxMipmapLevelClamp;
+        };
+    """ # noqa
+    CODE_TEMPLATE_LINEAR = jinja2.Template("""cudaResourceDesc {{resource_desc}}{};
+{{resource_desc}}.resType = cudaResourceTypeLinear;
+{{resource_desc}}.res.linear.devPtr = {{device_ptr}};
+{{resource_desc}}.res.linear.desc.f = {{cuda_channel_format}};
+{{resource_desc}}.res.linear.desc.x = {{bits_per_channel}}; // bits per channel
+{{resource_desc}}.res.linear.sizeInBytes = {{total_size}};
+
+cudaTextureDesc {{texture_desc}}{};
+{{texture_desc}}.readMode = cudaReadModeElementType; // must be set before cudaCreateTextureObject
+cudaTextureObject_t {{texture_object}}=0;
+cudaCreateTextureObject(&{{texture_object}}, &{{resource_desc}}, &{{texture_desc}}, nullptr);
+std::shared_ptr<void> {{texture_object}}Destroyer(nullptr, [&](...){
+    cudaDestroyTextureObject({{texture_object}});
+});
+    """)
+    CODE_TEMPLATE_PITCHED2D = jinja2.Template(""" !!! TODO!!! """)
""") + CODE_TEMPLATE_CUDA_ARRAY = jinja2.Template(""" +auto channel_desc_{{texture_name}} = {{channel_desc}}; +{{ create_array }} +{{ copy_array }} + cudaDeviceSynchronize(); +{{ texture_name }}.addressMode[0] = cudaAddressModeBorder; +{{ texture_name }}.addressMode[1] = cudaAddressModeBorder; +{{ texture_name }}.addressMode[2] = cudaAddressModeBorder; +{{ texture_name }}.filterMode = cudaFilterModeLinear; +{{ texture_name }}.normalized = false; +cudaBindTextureToArray(&{{texture_name}}, {{array}}, &channel_desc_{{texture_name}}); +std::shared_ptr<void> {{array}}Destroyer(nullptr, [&](...){ + cudaDeviceSynchronize(); + cudaFreeArray({{array}}); + cudaUnbindTexture({{texture_name}}); +}); + """) + + def __init__(self, texture, device_data_ptr, use_texture_objects=True): + self._texture = texture + self._device_ptr = device_data_ptr + self._dtype = self._device_ptr.dtype.base_type + self._shape = tuple(sp.S(s) for s in self._texture.field.shape) + self._ndim = texture.field.ndim + assert use_texture_objects, "without texture objects is not implemented" + + super().__init__(self.get_code(dialect='c', vector_instruction_set=None), + symbols_read={device_data_ptr, + *[s for s in self._shape if isinstance(s, sp.Symbol)]}, + symbols_defined={}) + self.headers = ['<memory>', '<cuda.h>', '<cuda_runtime_api.h>'] + + def get_code(self, dialect, vector_instruction_set): + texture_name = self._texture.symbol.name + code = self.CODE_TEMPLATE_CUDA_ARRAY.render( + resource_desc='resDesc_' + texture_name, + texture_desc='texDesc_' + texture_name, + channel_desc=f'cudaCreateChannelDesc<{self._dtype}>()', # noqa + texture_object='tex_' + texture_name, + array='array_' + texture_name, + texture_name=texture_name, + ndim=self._ndim, + device_ptr=self._device_ptr, + create_array=self._get_create_array_call(), + copy_array=self._get_copy_array_call(), + dtype=self._dtype, + bits_per_channel=self._dtype.numpy_dtype.itemsize * 8, + total_size=self._dtype.numpy_dtype.itemsize * reduce(lambda x, y: x * y, self._shape, 1)) + return code + + def _get_create_array_call(self): + texture_name = self._texture.symbol.name + ndim = '' if self._ndim <= 2 else f'{self._ndim}D' + array = 'array_' + texture_name + return f"""cudaArray * {array}; +cudaMalloc{ndim}Array(&{array}, &channel_desc_{texture_name}, """ + ( + (f'{{{", ".join(str(s) for s in reversed(self._shape))}}});' + if self._ndim == 3 + else f'{", ".join(str(s) for s in reversed(self._shape))});')) + + def _get_copy_array_call(self): + texture_name = self._texture.symbol.name + array = 'array_' + texture_name + if self._texture.field.ndim == 3: + copy_params = f'cpy_{texture_name}_params' + return f"""cudaMemcpy3DParms {copy_params}{{}}; +{copy_params}.srcPtr = {{{self._device_ptr}, + {self._texture.field.strides[-2] * self._texture.field.dtype.numpy_dtype.itemsize}, + {self._texture.field.shape[-1]}, + {self._texture.field.shape[-2]}}}; +{copy_params}.dstArray = {array}; +{copy_params}.extent = {{{", ".join(str(s) for s in reversed(self._shape))}}}; +{copy_params}.kind = cudaMemcpyDeviceToDevice; +cudaMemcpy3D(&{copy_params});""" # noqa + elif self._texture.field.ndim == 2: + # noqa: cudaMemcpy2DToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind); + return f"""cudaMemcpy2DToArray({array}, + 0u, + 0u, + {self._device_ptr}, + {self._texture.field.strides[-2] * self._texture.field.dtype.numpy_dtype.itemsize}, + {self._texture.field.shape[-1] * 
+                    {self._texture.field.shape[-2]},
+                    cudaMemcpyDeviceToDevice);
+    """  # noqa
+        else:
+            raise NotImplementedError('only 2D and 3D CUDA textures are supported')
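
Note on the device-inference change in _torch_native.py: next(chain(args, kwargs.values())) yields the first tensor passed either positionally or by keyword, so freshly allocated output tensors land on the same device even when every input arrives as a keyword argument. Below is a minimal sketch of that pattern; the helper infer_device and the tensors are hypothetical, for illustration only, and not part of this patch:

    from itertools import chain

    import torch

    def infer_device(*args, **kwargs):
        # Device of the first tensor argument, positional or keyword.
        return next(chain(args, kwargs.values())).device

    a = torch.ones(3)                       # CPU tensor
    assert infer_device(a) == a.device      # positional argument
    assert infer_device(x=a) == a.device    # keyword-only call also works

Note that next() raises StopIteration when no tensor is passed at all; the old args[0].device raised IndexError in the same situation, so the error behavior is comparable.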