diff --git a/src/pystencils_autodiff/backends/astnodes.py b/src/pystencils_autodiff/backends/astnodes.py
index acd536ff889e0d8506a5b039444c1ae03ca3387a..bb24dc510799309970b484692971b693ba50096c 100644
--- a/src/pystencils_autodiff/backends/astnodes.py
+++ b/src/pystencils_autodiff/backends/astnodes.py
@@ -99,18 +99,18 @@ class TorchModule(JinjaCppFile):
         file_extension = '.cu' if self.is_cuda else '.cpp'
         source_code = str(self)
         hash = _hash(source_code.encode()).hexdigest()
-        file_name = join(get_cache_config()['object_cache'], f'{hash}{file_extension}')
+        build_dir = join(get_cache_config()['object_cache'], self.module_name)
+        os.makedirs(build_dir, exist_ok=True)
+        file_name = join(build_dir, f'{hash}{file_extension}')
 
         if not exists(file_name):
             write_file(file_name, source_code)
-        build_dir = join(get_cache_config()['object_cache'], self.module_name)
-        os.makedirs(build_dir, exist_ok=True)
-
         torch_extension = load(hash,
                                [file_name],
                                with_cuda=self.is_cuda,
                                extra_cflags=['--std=c++14'],
+                               extra_cuda_cflags=['-std=c++14'],
                                build_directory=build_dir,
                                extra_include_paths=[get_pycuda_include_path(), get_pystencils_include_path()])
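The new TorchModule.compile() hunk above names the generated source file after a hash of its contents and gives every module its own build directory inside the object cache, so unchanged kernels are not recompiled. A minimal sketch of that caching pattern, using hashlib in place of the project's _hash and a made-up cache root (illustrative only, not part of the patch):

    import hashlib
    import os

    def cached_source_path(source_code, module_name,
                           cache_root='/tmp/object_cache',  # hypothetical cache root
                           extension='.cpp'):
        # One build directory per module, one file per source hash.
        digest = hashlib.sha256(source_code.encode()).hexdigest()
        build_dir = os.path.join(cache_root, module_name)
        os.makedirs(build_dir, exist_ok=True)
        file_name = os.path.join(build_dir, f'{digest}{extension}')
        if not os.path.exists(file_name):
            with open(file_name, 'w') as f:
                f.write(source_code)
        return file_name

The load() call (presumably torch.utils.cpp_extension.load) is then pointed at that file with build_directory=build_dir, so repeated calls with identical sources reuse the cached build.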
diff --git a/src/pystencils_autodiff/framework_integration/astnodes.py b/src/pystencils_autodiff/framework_integration/astnodes.py
index ed16a4dbb7054750be640faa70645e1a854cd6db..3e5e7a35366b371df389ee04e1a708ac5fe84094 100644
--- a/src/pystencils_autodiff/framework_integration/astnodes.py
+++ b/src/pystencils_autodiff/framework_integration/astnodes.py
@@ -90,76 +90,198 @@ class DestructuringBindingsForFieldClass(Node):
 
 
 class NativeTextureBinding(pystencils.backends.cbackend.CustomCodeNode):
-    CODE_TEMPLATE = """cudaResourceDesc {resource_desc}{{}};
-{resource_desc}.resType = cudaResourceTypeLinear;
-{resource_desc}.res.linear.devPtr = {device_ptr};
-{resource_desc}.res.linear.desc.f = {cuda_channel_format};
-{resource_desc}.res.linear.desc.x = {bits_per_channel}; // bits per channel
-{resource_desc}.res.linear.sizeInBytes = {total_size};
-
-cudaTextureDesc {texture_desc}{{}};
-cudaTextureObject_t {texture_object}=0;
-cudaCreateTextureObject(&{texture_object}, &{resource_desc}, &{texture_desc}, nullptr);
-{texture_desc}.readMode = cudaReadModeElementType;
-auto {texture_object}Destroyer = [&](){{
-    cudaDestroyTextureObject({texture_object});
-}};
-"""
-
-    def _get_channel_format_string(self):
-        """
-        From CUDA API documentation:
-
-        ``enum cudaChannelFormatKind``
-
-        Channel format kind
-
-        Enumerator:
-        ============================= ========================
-        cudaChannelFormatKindSigned   Signed channel format.
-        cudaChannelFormatKindUnsigned Unsigned channel format.
-        cudaChannelFormatKindFloat    Float channel format.
-        cudaChannelFormatKindNone     No channel format.
-        ============================= ========================
-        """
-        dtype = self._device_ptr.dtype.base_type
-        if np.issubdtype(dtype.numpy_dtype, np.signedinteger):
-            return 'cudaChannelFormatKindSigned'
-        elif np.issubdtype(dtype.numpy_dtype, np.unsignedinteger):
-            return 'cudaChannelFormatKindUnsigned'
-        elif np.issubdtype(dtype.numpy_dtype, np.float32):
-            return 'cudaChannelFormatKindFloat'
-        elif np.issubdtype(dtype.numpy_dtype, np.float64):
-            # PyCUDA double texture hack! See pystencils/include/pycuda-helper-modified.hpp
-            return 'cudaChannelFormatKindSigned'
-        else:
-            raise NotImplementedError('dtype not supported for CUDA textures')
+    """
+    Binds a texture to a CUDA device pointer.
+
+    Recommended reading: https://devblogs.nvidia.com/cuda-pro-tip-kepler-texture-objects-improve-performance-and-flexibility/
+
+    The definitions of ``cudaResourceDesc`` and ``cudaTextureDesc``:
+
+    .. code:: c
+
+        /**
+         * CUDA resource descriptor
+         */
+        struct __device_builtin__ cudaResourceDesc {
+            enum cudaResourceType resType;             /**< Resource type */
+
+            union {
+                struct {
+                    cudaArray_t array;                 /**< CUDA array */
+                } array;
+                struct {
+                    cudaMipmappedArray_t mipmap;       /**< CUDA mipmapped array */
+                } mipmap;
+                struct {
+                    void *devPtr;                      /**< Device pointer */
+                    struct cudaChannelFormatDesc desc; /**< Channel descriptor */
+                    size_t sizeInBytes;                /**< Size in bytes */
+                } linear;
+                struct {
+                    void *devPtr;                      /**< Device pointer */
+                    struct cudaChannelFormatDesc desc; /**< Channel descriptor */
+                    size_t width;                      /**< Width of the array in elements */
+                    size_t height;                     /**< Height of the array in elements */
+                    size_t pitchInBytes;               /**< Pitch between two rows in bytes */
+                } pitch2D;
+            } res;
+        };
+
+    .. code:: c
+
+        /**
+         * CUDA texture descriptor
+         */
+        struct __device_builtin__ cudaTextureDesc
+        {
+            /**
+             * Texture address mode for up to 3 dimensions
+             */
+            enum cudaTextureAddressMode addressMode[3];
+            /**
+             * Texture filter mode
+             */
+            enum cudaTextureFilterMode filterMode;
+            /**
+             * Texture read mode
+             */
+            enum cudaTextureReadMode readMode;
+            /**
+             * Perform sRGB->linear conversion during texture read
+             */
+            int sRGB;
+            /**
+             * Texture Border Color
+             */
+            float borderColor[4];
+            /**
+             * Indicates whether texture reads are normalized or not
+             */
+            int normalizedCoords;
+            /**
+             * Limit to the anisotropy ratio
+             */
+            unsigned int maxAnisotropy;
+            /**
+             * Mipmap filter mode
+             */
+            enum cudaTextureFilterMode mipmapFilterMode;
+            /**
+             * Offset applied to the supplied mipmap level
+             */
+            float mipmapLevelBias;
+            /**
+             * Lower end of the mipmap level range to clamp access to
+             */
+            float minMipmapLevelClamp;
+            /**
+             * Upper end of the mipmap level range to clamp access to
+             */
+            float maxMipmapLevelClamp;
+        };
+    """  # noqa
+    CODE_TEMPLATE_LINEAR = jinja2.Template("""cudaResourceDesc {{resource_desc}}{};
+{{resource_desc}}.resType = cudaResourceTypeLinear;
+{{resource_desc}}.res.linear.devPtr = {{device_ptr}};
+{{resource_desc}}.res.linear.desc.f = {{cuda_channel_format}};
+{{resource_desc}}.res.linear.desc.x = {{bits_per_channel}}; // bits per channel
+{{resource_desc}}.res.linear.sizeInBytes = {{total_size}};
+
+cudaTextureDesc {{texture_desc}}{};
+{{texture_desc}}.readMode = cudaReadModeElementType;
+cudaTextureObject_t {{texture_object}} = 0;
+cudaCreateTextureObject(&{{texture_object}}, &{{resource_desc}}, &{{texture_desc}}, nullptr);
+std::shared_ptr<void> {{texture_object}}Destroyer(nullptr, [&](...) {
+    cudaDestroyTextureObject({{texture_object}});
+});
+""")
+    CODE_TEMPLATE_PITCHED2D = jinja2.Template(""" !!! TODO!!! """)
+    CODE_TEMPLATE_CUDA_ARRAY = jinja2.Template("""
+auto channel_desc_{{texture_name}} = {{channel_desc}};
+{{ create_array }}
+{{ copy_array }}
+{{ texture_name }}.addressMode[0] = cudaAddressModeBorder;
+{{ texture_name }}.addressMode[1] = cudaAddressModeBorder;
+{{ texture_name }}.addressMode[2] = cudaAddressModeBorder;
+{{ texture_name }}.filterMode = cudaFilterModeLinear;
+{{ texture_name }}.normalized = false;
+cudaBindTextureToArray(&{{texture_name}}, {{array}}, &channel_desc_{{texture_name}});
+std::shared_ptr<void> {{array}}Destroyer(nullptr, [&](...) {
+    cudaFreeArray({{array}});
+    cudaUnbindTexture({{texture_name}});
+});
+""")
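Note the escaping in the Jinja templates above: a literal C++ brace pair is written as {} while {{ ... }} marks a substitution, the reverse of the str.format template they replace ({var} and {{}}). A tiny, self-contained illustration of how such a template expands (placeholder values are examples only):

    import jinja2

    snippet = jinja2.Template(
        "cudaResourceDesc {{resource_desc}}{};\n"
        "{{resource_desc}}.res.linear.devPtr = {{device_ptr}};")
    print(snippet.render(resource_desc='resDesc_tex0', device_ptr='_data_field'))
    # cudaResourceDesc resDesc_tex0{};
    # resDesc_tex0.res.linear.devPtr = _data_field;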
""") + CODE_TEMPLATE_CUDA_ARRAY = jinja2.Template(""" +auto channel_desc_{{texture_name}} = {{channel_desc}}; +{{ create_array }} +{{ copy_array }} +{{ texture_name }}.addressMode[0] = cudaAddressModeBorder; +{{ texture_name }}.addressMode[1] = cudaAddressModeBorder; +{{ texture_name }}.addressMode[2] = cudaAddressModeBorder; +{{ texture_name }}.filterMode = cudaFilterModeLinear; +{{ texture_name }}.normalized = false; +cudaBindTextureToArray(&{{texture_name}}, {{array}}, &channel_desc_{{texture_name}}); +std::shared_ptr<void> {{array}}Destroyer(nullptr, [&](...){ + cudaFreeArray({{array}}); + cudaUnbindTexture({{texture_name}}); +}); + """) def __init__(self, texture, device_data_ptr, use_texture_objects=True): self._texture = texture self._device_ptr = device_data_ptr - self._dtype = self._device_ptr.dtype.base_type.numpy_dtype + self._dtype = self._device_ptr.dtype.base_type self._shape = tuple(sp.S(s) for s in self._texture.field.shape) + self._ndim = texture.field.ndim assert use_texture_objects, "without texture objects is not implemented" super().__init__(self.get_code(dialect='c', vector_instruction_set=None), symbols_read={device_data_ptr, *[s for s in self._shape if isinstance(s, sp.Symbol)]}, symbols_defined={}) - self.headers.append("<cuda.h>") + self.headers = ['<memory>', '<cuda.h>', '<cuda_runtime_api.h>'] def get_code(self, dialect, vector_instruction_set): texture_name = self._texture.symbol.name - code = self.CODE_TEMPLATE.format( + code = self.CODE_TEMPLATE_CUDA_ARRAY.render( resource_desc='resDesc_' + texture_name, texture_desc='texDesc_' + texture_name, + channel_desc=f'cudaCreateChannelDesc<{self._dtype}>()', # noqa texture_object='tex_' + texture_name, + array='array_' + texture_name, + texture_name=texture_name, + ndim=self._ndim, device_ptr=self._device_ptr, - cuda_channel_format=self._get_channel_format_string(), - bits_per_channel=self._dtype.itemsize * 8, - total_size=self._dtype.itemsize * reduce(lambda x, y: x * y, self._shape, 1)) + create_array=self._get_create_array_call(), + copy_array=self._get_copy_array_call(), + dtype=self._dtype, + bits_per_channel=self._dtype.numpy_dtype.itemsize * 8, + total_size=self._dtype.numpy_dtype.itemsize * reduce(lambda x, y: x * y, self._shape, 1)) return code + def _get_create_array_call(self): + texture_name = self._texture.symbol.name + ndim = '' if self._ndim <= 2 else f'{self._ndim}D' + array = 'array_' + texture_name + return f"""cudaArray * {array}; +cudaMalloc{ndim}Array(&{array}, &channel_desc_{texture_name}, """ + ( + (f'{{{", ".join(str(s) for s in reversed(self._shape))}}});' + if self._ndim == 3 + else f'{", ".join(str(s) for s in reversed(self._shape))});')) + + def _get_copy_array_call(self): + texture_name = self._texture.symbol.name + array = 'array_' + texture_name + if self._texture.field.ndim == 3: + copy_params = f'cpy_{texture_name}_params' + return f"""cudaMemcpy3DParams {copy_params}{{}}; +{copy_params}.srcPtr = {{{self._device_ptr}, + {self._texture.field.strides[-1] * self._texture.field.shape[-1] * self._dtype.numpy_dtype.itemsize}, + {self._texture.field.shape[-1], self._texture.field.shape[-2]}}}; +{copy_params}.dsrArray = {array}; +copyParams.extent = {{{", ".join(reversed(self._shape))}}}; +copyParams.kind = cudaMemcpyDeviceToDevice; +cudaMemcpy3D(&{{copy_params}});""" # noqa + elif self._texture.field.ndim == 2: + # cudaMemcpy2DToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind); + + return 
f"""cudaMemcpy2DToArray({array}, + 0u, + 0u, + {self._device_ptr}, + {self._texture.field.strides[-1] * self._texture.field.shape[-1] * self._dtype.numpy_dtype.itemsize}, + {self._texture.field.shape[-1]}, + {self._texture.field.shape[-2]}, + cudaMemcpyDeviceToDevice); + """ + else: + raise NotImplementedError() + class KernelFunctionCall(Node): """