diff --git a/src/pystencils_autodiff/framework_integration/astnodes.py b/src/pystencils_autodiff/framework_integration/astnodes.py index d8547e534e449b5e847acdc7a2d98f387ba414a9..d8285c48662861d1fec7bc09d0d711953c21daa4 100644 --- a/src/pystencils_autodiff/framework_integration/astnodes.py +++ b/src/pystencils_autodiff/framework_integration/astnodes.py @@ -11,17 +11,17 @@ waLBerla currently uses `pystencils-walberla <https://pypi.org/project/pystencil """ import itertools from collections.abc import Iterable -from functools import reduce from typing import Any, List, Set import jinja2 +import sympy as sp import pystencils -import sympy as sp from pystencils.astnodes import KernelFunction, Node, NodeOrExpr, ResolvedFieldAccess from pystencils.data_types import TypedSymbol from pystencils.kernelparameters import FieldPointerSymbol, FieldShapeSymbol, FieldStrideSymbol from pystencils_autodiff.framework_integration.printer import FrameworkIntegrationPrinter +from pystencils_autodiff.framework_integration.texture_astnodes import NativeTextureBinding class DestructuringBindingsForFieldClass(Node): @@ -88,201 +88,6 @@ class DestructuringBindingsForFieldClass(Node): return self.body.atoms(arg_type) | {s for s in self.symbols_defined if isinstance(s, arg_type)} -class NativeTextureBinding(pystencils.backends.cbackend.CustomCodeNode): - """ - Bind texture to CUDA device pointer - - Recommended read: https://devblogs.nvidia.com/cuda-pro-tip-kepler-texture-objects-improve-performance-and-flexibility/ - - The definition from cudaResourceDesc and cudaTextureDesc - - .. code:: c - - /** - * CUDA resource descriptor - */ - struct __device_builtin__ cudaResourceDesc { - enum cudaResourceType resType; /**< Resource type */ - - union { - struct { - cudaArray_t array; /**< CUDA array */ - } array; - struct { - cudaMipmappedArray_t mipmap; /**< CUDA mipmapped array */ - } mipmap; - struct { - void *devPtr; /**< Device pointer */ - struct cudaChannelFormatDesc desc; /**< Channel descriptor */ - size_t sizeInBytes; /**< Size in bytes */ - } linear; - struct { - void *devPtr; /**< Device pointer */ - struct cudaChannelFormatDesc desc; /**< Channel descriptor */ - size_t width; /**< Width of the array in elements */ - size_t height; /**< Height of the array in elements */ - size_t pitchInBytes; /**< Pitch between two rows in bytes */ - } pitch2D; - } res; - }; - - .. code:: c - - /** - * CUDA texture descriptor - */ - struct __device_builtin__ cudaTextureDesc - { - /** - * Texture address mode for up to 3 dimensions - */ - enum cudaTextureAddressMode addressMode[3]; - /** - * Texture filter mode - */ - enum cudaTextureFilterMode filterMode; - /** - * Texture read mode - */ - enum cudaTextureReadMode readMode; - /** - * Perform sRGB->linear conversion during texture read - */ - int sRGB; - /** - * Texture Border Color - */ - float borderColor[4]; - /** - * Indicates whether texture reads are normalized or not - */ - int normalizedCoords; - /** - * Limit to the anisotropy ratio - */ - unsigned int maxAnisotropy; - /** - * Mipmap filter mode - */ - enum cudaTextureFilterMode mipmapFilterMode; - /** - * Offset applied to the supplied mipmap level - */ - float mipmapLevelBias; - /** - * Lower end of the mipmap level range to clamp access to - */ - float minMipmapLevelClamp; - /** - * Upper end of the mipmap level range to clamp access to - */ - float maxMipmapLevelClamp; - }; - """ # noqa - CODE_TEMPLATE_LINEAR = jinja2.Template("""cudaResourceDesc {{resource_desc}}{}; -{{resource_desc}}.resType = cudaResourceTypeLinear; -{{resource_desc}}.res.linear.devPtr = {{device_ptr}}; -{{resource_desc}}.res.linear.desc.f = {{cuda_channel_format}}; -{{resource_desc}}.res.linear.desc.x = {{bits_per_channel}}; // bits per channel -{{resource_desc}}.res.linear.sizeInBytes = {{total_size}}; - -cudaTextureDesc {{texture_desc}}{}; -cudaTextureObject_t {{texture_object}}=0; -cudaCreateTextureObject(&{{texture_object}}, &{{resource_desc}}, &{{texture_desc}}, nullptr); -{{texture_desc}}.readMode = cudaReadModeElementType; -auto {{texture_object}}Destroyer = std::unique_ptr(nullptr, [&](){ - cudaDestroyTextureObject({{texture_object}}); -}); - """) - CODE_TEMPLATE_PITCHED2D = jinja2.Template(""" !!! TODO!!! """) - CODE_TEMPLATE_CUDA_ARRAY = jinja2.Template(""" -auto channel_desc_{{texture_name}} = {{channel_desc}}; -{{ create_array }} -{{ copy_array }} -{{ texture_name }}.addressMode[0] = cudaAddressModeBorder; -{{ texture_name }}.addressMode[1] = cudaAddressModeBorder; -{{ texture_name }}.addressMode[2] = cudaAddressModeBorder; -{{ texture_name }}.filterMode = cudaFilterModeLinear; -{{ texture_name }}.normalized = false; -cudaBindTextureToArray(&{{texture_name}}, {{array}}, &channel_desc_{{texture_name}}); -std::shared_ptr<void> {{array}}Destroyer(nullptr, [&](...){ - cudaFreeArray({{array}}); - cudaUnbindTexture({{texture_name}}); -}); - """) - - def __init__(self, texture, device_data_ptr, use_texture_objects=True): - self._texture = texture - self._device_ptr = device_data_ptr - self._dtype = self._device_ptr.dtype.base_type - self._shape = tuple(sp.S(s) for s in self._texture.field.shape) - self._ndim = texture.field.ndim - assert use_texture_objects, "without texture objects is not implemented" - - super().__init__(self.get_code(dialect='c', vector_instruction_set=None), - symbols_read={device_data_ptr, - *[s for s in self._shape if isinstance(s, sp.Symbol)]}, - symbols_defined={}) - self.headers = ['<memory>', '<cuda.h>', '<cuda_runtime_api.h>'] - - def get_code(self, dialect, vector_instruction_set): - texture_name = self._texture.symbol.name - code = self.CODE_TEMPLATE_CUDA_ARRAY.render( - resource_desc='resDesc_' + texture_name, - texture_desc='texDesc_' + texture_name, - channel_desc=f'cudaCreateChannelDesc<{self._dtype}>()', # noqa - texture_object='tex_' + texture_name, - array='array_' + texture_name, - texture_name=texture_name, - ndim=self._ndim, - device_ptr=self._device_ptr, - create_array=self._get_create_array_call(), - copy_array=self._get_copy_array_call(), - dtype=self._dtype, - bits_per_channel=self._dtype.numpy_dtype.itemsize * 8, - total_size=self._dtype.numpy_dtype.itemsize * reduce(lambda x, y: x * y, self._shape, 1)) - return code - - def _get_create_array_call(self): - texture_name = self._texture.symbol.name - ndim = '' if self._ndim <= 2 else f'{self._ndim}D' - array = 'array_' + texture_name - return f"""cudaArray * {array}; -cudaMalloc{ndim}Array(&{array}, &channel_desc_{texture_name}, """ + ( - (f'{{{", ".join(str(s) for s in reversed(self._shape))}}});' - if self._ndim == 3 - else f'{", ".join(str(s) for s in reversed(self._shape))});')) - - def _get_copy_array_call(self): - texture_name = self._texture.symbol.name - array = 'array_' + texture_name - if self._texture.field.ndim == 3: - copy_params = f'cpy_{texture_name}_params' - return f"""cudaMemcpy3DParms {copy_params}{{}}; -{copy_params}.srcPtr = {{{self._device_ptr}, - {self._texture.field.strides[-1] * self._texture.field.shape[-1] * self._dtype.numpy_dtype.itemsize}, - {self._texture.field.shape[-1]}, - {self._texture.field.shape[-2]}}}; -{copy_params}.dstArray = {array}; -{copy_params}.extent = {{{", ".join(str(s) for s in reversed(self._shape))}}}; -{copy_params}.kind = cudaMemcpyDeviceToDevice; -cudaMemcpy3D(&{copy_params});""" # noqa - elif self._texture.field.ndim == 2: - # noqa: cudaMemcpy2DToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind); - - return f"""cudaMemcpy2DToArray({array}, - 0u, - 0u, - {self._device_ptr}, - {self._texture.field.strides[-1] * self._texture.field.shape[-1] * self._dtype.numpy_dtype.itemsize}, - {self._texture.field.shape[-1]}, - {self._texture.field.shape[-2]}, - cudaMemcpyDeviceToDevice); - """ # noqa - else: - raise NotImplementedError() - - class KernelFunctionCall(Node): """ AST nodes representing a call of a :class:`pystencils.astnodes.KernelFunction` diff --git a/src/pystencils_autodiff/framework_integration/texture_astnodes.py b/src/pystencils_autodiff/framework_integration/texture_astnodes.py new file mode 100644 index 0000000000000000000000000000000000000000..f6b2f532f6348bec468ffd6e1377346019899c0c --- /dev/null +++ b/src/pystencils_autodiff/framework_integration/texture_astnodes.py @@ -0,0 +1,210 @@ +# -*- coding: utf-8 -*- +# +# Copyright © 2019 Stephan Seitz <stephan.seitz@fau.de> +# +# Distributed under terms of the GPLv3 license. + +""" + +""" +from functools import reduce + +import jinja2 +import sympy as sp + +import pystencils.backends + + +class NativeTextureBinding(pystencils.backends.cbackend.CustomCodeNode): + """ + Bind texture to CUDA device pointer + + Recommended read: https://devblogs.nvidia.com/cuda-pro-tip-kepler-texture-objects-improve-performance-and-flexibility/ + + The definition from cudaResourceDesc and cudaTextureDesc + + .. code:: c + + /** + * CUDA resource descriptor + */ + struct __device_builtin__ cudaResourceDesc { + enum cudaResourceType resType; /**< Resource type */ + + union { + struct { + cudaArray_t array; /**< CUDA array */ + } array; + struct { + cudaMipmappedArray_t mipmap; /**< CUDA mipmapped array */ + } mipmap; + struct { + void *devPtr; /**< Device pointer */ + struct cudaChannelFormatDesc desc; /**< Channel descriptor */ + size_t sizeInBytes; /**< Size in bytes */ + } linear; + struct { + void *devPtr; /**< Device pointer */ + struct cudaChannelFormatDesc desc; /**< Channel descriptor */ + size_t width; /**< Width of the array in elements */ + size_t height; /**< Height of the array in elements */ + size_t pitchInBytes; /**< Pitch between two rows in bytes */ + } pitch2D; + } res; + }; + + .. code:: c + + /** + * CUDA texture descriptor + */ + struct __device_builtin__ cudaTextureDesc + { + /** + * Texture address mode for up to 3 dimensions + */ + enum cudaTextureAddressMode addressMode[3]; + /** + * Texture filter mode + */ + enum cudaTextureFilterMode filterMode; + /** + * Texture read mode + */ + enum cudaTextureReadMode readMode; + /** + * Perform sRGB->linear conversion during texture read + */ + int sRGB; + /** + * Texture Border Color + */ + float borderColor[4]; + /** + * Indicates whether texture reads are normalized or not + */ + int normalizedCoords; + /** + * Limit to the anisotropy ratio + */ + unsigned int maxAnisotropy; + /** + * Mipmap filter mode + */ + enum cudaTextureFilterMode mipmapFilterMode; + /** + * Offset applied to the supplied mipmap level + */ + float mipmapLevelBias; + /** + * Lower end of the mipmap level range to clamp access to + */ + float minMipmapLevelClamp; + /** + * Upper end of the mipmap level range to clamp access to + */ + float maxMipmapLevelClamp; + }; + """ # noqa + CODE_TEMPLATE_LINEAR = jinja2.Template("""cudaResourceDesc {{resource_desc}}{}; +{{resource_desc}}.resType = cudaResourceTypeLinear; +{{resource_desc}}.res.linear.devPtr = {{device_ptr}}; +{{resource_desc}}.res.linear.desc.f = {{cuda_channel_format}}; +{{resource_desc}}.res.linear.desc.x = {{bits_per_channel}}; // bits per channel +{{resource_desc}}.res.linear.sizeInBytes = {{total_size}}; + +cudaTextureDesc {{texture_desc}}{}; +cudaTextureObject_t {{texture_object}}=0; +cudaCreateTextureObject(&{{texture_object}}, &{{resource_desc}}, &{{texture_desc}}, nullptr); +{{texture_desc}}.readMode = cudaReadModeElementType; +auto {{texture_object}}Destroyer = std::unique_ptr(nullptr, [&](){ + cudaDestroyTextureObject({{texture_object}}); +}); + """) + CODE_TEMPLATE_PITCHED2D = jinja2.Template(""" !!! TODO!!! """) + CODE_TEMPLATE_CUDA_ARRAY = jinja2.Template(""" +auto channel_desc_{{texture_name}} = {{channel_desc}}; +{{ create_array }} +{{ copy_array }} +{{ texture_name }}.addressMode[0] = cudaAddressModeBorder; +{{ texture_name }}.addressMode[1] = cudaAddressModeBorder; +{{ texture_name }}.addressMode[2] = cudaAddressModeBorder; +{{ texture_name }}.filterMode = cudaFilterModeLinear; +{{ texture_name }}.normalized = false; +cudaBindTextureToArray(&{{texture_name}}, {{array}}, &channel_desc_{{texture_name}}); +std::shared_ptr<void> {{array}}Destroyer(nullptr, [&](...){ + cudaFreeArray({{array}}); + cudaUnbindTexture({{texture_name}}); +}); + """) + + def __init__(self, texture, device_data_ptr, use_texture_objects=True): + self._texture = texture + self._device_ptr = device_data_ptr + self._dtype = self._device_ptr.dtype.base_type + self._shape = tuple(sp.S(s) for s in self._texture.field.shape) + self._ndim = texture.field.ndim + assert use_texture_objects, "without texture objects is not implemented" + + super().__init__(self.get_code(dialect='c', vector_instruction_set=None), + symbols_read={device_data_ptr, + *[s for s in self._shape if isinstance(s, sp.Symbol)]}, + symbols_defined={}) + self.headers = ['<memory>', '<cuda.h>', '<cuda_runtime_api.h>'] + + def get_code(self, dialect, vector_instruction_set): + texture_name = self._texture.symbol.name + code = self.CODE_TEMPLATE_CUDA_ARRAY.render( + resource_desc='resDesc_' + texture_name, + texture_desc='texDesc_' + texture_name, + channel_desc=f'cudaCreateChannelDesc<{self._dtype}>()', # noqa + texture_object='tex_' + texture_name, + array='array_' + texture_name, + texture_name=texture_name, + ndim=self._ndim, + device_ptr=self._device_ptr, + create_array=self._get_create_array_call(), + copy_array=self._get_copy_array_call(), + dtype=self._dtype, + bits_per_channel=self._dtype.numpy_dtype.itemsize * 8, + total_size=self._dtype.numpy_dtype.itemsize * reduce(lambda x, y: x * y, self._shape, 1)) + return code + + def _get_create_array_call(self): + texture_name = self._texture.symbol.name + ndim = '' if self._ndim <= 2 else f'{self._ndim}D' + array = 'array_' + texture_name + return f"""cudaArray * {array}; +cudaMalloc{ndim}Array(&{array}, &channel_desc_{texture_name}, """ + ( + (f'{{{", ".join(str(s) for s in reversed(self._shape))}}});' + if self._ndim == 3 + else f'{", ".join(str(s) for s in reversed(self._shape))});')) + + def _get_copy_array_call(self): + texture_name = self._texture.symbol.name + array = 'array_' + texture_name + if self._texture.field.ndim == 3: + copy_params = f'cpy_{texture_name}_params' + return f"""cudaMemcpy3DParms {copy_params}{{}}; +{copy_params}.srcPtr = {{{self._device_ptr}, + {self._texture.field.strides[-1] * self._texture.field.shape[-1] * self._dtype.numpy_dtype.itemsize}, + {self._texture.field.shape[-1]}, + {self._texture.field.shape[-2]}}}; +{copy_params}.dstArray = {array}; +{copy_params}.extent = {{{", ".join(str(s) for s in reversed(self._shape))}}}; +{copy_params}.kind = cudaMemcpyDeviceToDevice; +cudaMemcpy3D(&{copy_params});""" # noqa + elif self._texture.field.ndim == 2: + # noqa: cudaMemcpy2DToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind); + + return f"""cudaMemcpy2DToArray({array}, + 0u, + 0u, + {self._device_ptr}, + {self._texture.field.strides[-1] * self._texture.field.shape[-1] * self._dtype.numpy_dtype.itemsize}, + {self._texture.field.shape[-1]}, + {self._texture.field.shape[-2]}, + cudaMemcpyDeviceToDevice); + """ # noqa + else: + raise NotImplementedError()