Skip to content
Snippets Groups Projects
Commit 4ab12dea authored by Stephan Seitz's avatar Stephan Seitz
Browse files

Progress for 2d textures

parent 40ea9ea6
Branches
Tags
No related merge requests found
...@@ -99,18 +99,18 @@ class TorchModule(JinjaCppFile): ...@@ -99,18 +99,18 @@ class TorchModule(JinjaCppFile):
file_extension = '.cu' if self.is_cuda else '.cpp' file_extension = '.cu' if self.is_cuda else '.cpp'
source_code = str(self) source_code = str(self)
hash = _hash(source_code.encode()).hexdigest() hash = _hash(source_code.encode()).hexdigest()
file_name = join(get_cache_config()['object_cache'], f'{hash}{file_extension}') build_dir = join(get_cache_config()['object_cache'], self.module_name)
os.makedirs(build_dir, exist_ok=True)
file_name = join(build_dir, f'{hash}{file_extension}')
if not exists(file_name): if not exists(file_name):
write_file(file_name, source_code) write_file(file_name, source_code)
build_dir = join(get_cache_config()['object_cache'], self.module_name)
os.makedirs(build_dir, exist_ok=True)
torch_extension = load(hash, torch_extension = load(hash,
[file_name], [file_name],
with_cuda=self.is_cuda, with_cuda=self.is_cuda,
extra_cflags=['--std=c++14'], extra_cflags=['--std=c++14'],
extra_cuda_cflags=['-std=c++14'],
build_directory=build_dir, build_directory=build_dir,
extra_include_paths=[get_pycuda_include_path(), extra_include_paths=[get_pycuda_include_path(),
get_pystencils_include_path()]) get_pystencils_include_path()])
......
...@@ -90,76 +90,198 @@ class DestructuringBindingsForFieldClass(Node): ...@@ -90,76 +90,198 @@ class DestructuringBindingsForFieldClass(Node):
class NativeTextureBinding(pystencils.backends.cbackend.CustomCodeNode): class NativeTextureBinding(pystencils.backends.cbackend.CustomCodeNode):
CODE_TEMPLATE = """cudaResourceDesc {resource_desc}{{}};
{resource_desc}.resType = cudaResourceTypeLinear;
{resource_desc}.res.linear.devPtr = {device_ptr};
{resource_desc}.res.linear.desc.f = {cuda_channel_format};
{resource_desc}.res.linear.desc.x = {bits_per_channel}; // bits per channel
{resource_desc}.res.linear.sizeInBytes = {total_size};
cudaTextureDesc {texture_desc}{{}};
cudaTextureObject_t {texture_object}=0;
cudaCreateTextureObject(&{texture_object}, &{resource_desc}, &{texture_desc}, nullptr);
{texture_desc}.readMode = cudaReadModeElementType;
auto {texture_object}Destroyer = [&](){{
cudaDestroyTextureObject({texture_object});
}};
""" """
Bind texture to CUDA device pointer
def _get_channel_format_string(self):
""" Recommended read: https://devblogs.nvidia.com/cuda-pro-tip-kepler-texture-objects-improve-performance-and-flexibility/
From CUDA API documentation:
The definition from cudaResourceDesc and cudaTextureDesc
``enum cudaChannelFormatKind``
.. code:: c
Channel format kind
/**
Enumerator: * CUDA resource descriptor
============================= ======================== */
cudaChannelFormatKindSigned Signed channel format. struct __device_builtin__ cudaResourceDesc {
cudaChannelFormatKindUnsigned Unsigned channel format. enum cudaResourceType resType; /**< Resource type */
cudaChannelFormatKindFloat Float channel format.
cudaChannelFormatKindNone No channel format. union {
============================= ======================== struct {
""" cudaArray_t array; /**< CUDA array */
dtype = self._device_ptr.dtype.base_type } array;
if np.issubdtype(dtype.numpy_dtype, np.signedinteger): struct {
return 'cudaChannelFormatKindSigned' cudaMipmappedArray_t mipmap; /**< CUDA mipmapped array */
elif np.issubdtype(dtype.numpy_dtype, np.unsignedinteger): } mipmap;
return 'cudaChannelFormatKindUnsigned' struct {
elif np.issubdtype(dtype.numpy_dtype, np.float32): void *devPtr; /**< Device pointer */
return 'cudaChannelFormatKindFloat' struct cudaChannelFormatDesc desc; /**< Channel descriptor */
elif np.issubdtype(dtype.numpy_dtype, np.float64): size_t sizeInBytes; /**< Size in bytes */
# PyCUDA double texture hack! See pystencils/include/pycuda-helper-modified.hpp } linear;
return 'cudaChannelFormatKindSigned' struct {
else: void *devPtr; /**< Device pointer */
raise NotImplementedError('dtype not supported for CUDA textures') struct cudaChannelFormatDesc desc; /**< Channel descriptor */
size_t width; /**< Width of the array in elements */
size_t height; /**< Height of the array in elements */
size_t pitchInBytes; /**< Pitch between two rows in bytes */
} pitch2D;
} res;
};
.. code:: c
/**
* CUDA texture descriptor
*/
struct __device_builtin__ cudaTextureDesc
{
/**
* Texture address mode for up to 3 dimensions
*/
enum cudaTextureAddressMode addressMode[3];
/**
* Texture filter mode
*/
enum cudaTextureFilterMode filterMode;
/**
* Texture read mode
*/
enum cudaTextureReadMode readMode;
/**
* Perform sRGB->linear conversion during texture read
*/
int sRGB;
/**
* Texture Border Color
*/
float borderColor[4];
/**
* Indicates whether texture reads are normalized or not
*/
int normalizedCoords;
/**
* Limit to the anisotropy ratio
*/
unsigned int maxAnisotropy;
/**
* Mipmap filter mode
*/
enum cudaTextureFilterMode mipmapFilterMode;
/**
* Offset applied to the supplied mipmap level
*/
float mipmapLevelBias;
/**
* Lower end of the mipmap level range to clamp access to
*/
float minMipmapLevelClamp;
/**
* Upper end of the mipmap level range to clamp access to
*/
float maxMipmapLevelClamp;
};
""" # noqa
CODE_TEMPLATE_LINEAR = jinja2.Template("""cudaResourceDesc {{resource_desc}}{};
{{resource_desc}}.resType = cudaResourceTypeLinear;
{{resource_desc}}.res.linear.devPtr = {{device_ptr}};
{{resource_desc}}.res.linear.desc.f = {{cuda_channel_format}};
{{resource_desc}}.res.linear.desc.x = {{bits_per_channel}}; // bits per channel
{{resource_desc}}.res.linear.sizeInBytes = {{total_size}};
cudaTextureDesc {{texture_desc}}{};
cudaTextureObject_t {{texture_object}}=0;
cudaCreateTextureObject(&{{texture_object}}, &{{resource_desc}}, &{{texture_desc}}, nullptr);
{{texture_desc}}.readMode = cudaReadModeElementType;
auto {{texture_object}}Destroyer = std::unique_ptr(nullptr, [&](){
cudaDestroyTextureObject({{texture_object}});
});
""")
CODE_TEMPLATE_PITCHED2D = jinja2.Template(""" !!! TODO!!! """)
CODE_TEMPLATE_CUDA_ARRAY = jinja2.Template("""
auto channel_desc_{{texture_name}} = {{channel_desc}};
{{ create_array }}
{{ copy_array }}
{{ texture_name }}.addressMode[0] = cudaAddressModeBorder;
{{ texture_name }}.addressMode[1] = cudaAddressModeBorder;
{{ texture_name }}.addressMode[2] = cudaAddressModeBorder;
{{ texture_name }}.filterMode = cudaFilterModeLinear;
{{ texture_name }}.normalized = false;
cudaBindTextureToArray(&{{texture_name}}, {{array}}, &channel_desc_{{texture_name}});
std::shared_ptr<void> {{array}}Destroyer(nullptr, [&](...){
cudaFreeArray({{array}});
cudaUnbindTexture({{texture_name}});
});
""")
def __init__(self, texture, device_data_ptr, use_texture_objects=True): def __init__(self, texture, device_data_ptr, use_texture_objects=True):
self._texture = texture self._texture = texture
self._device_ptr = device_data_ptr self._device_ptr = device_data_ptr
self._dtype = self._device_ptr.dtype.base_type.numpy_dtype self._dtype = self._device_ptr.dtype.base_type
self._shape = tuple(sp.S(s) for s in self._texture.field.shape) self._shape = tuple(sp.S(s) for s in self._texture.field.shape)
self._ndim = texture.field.ndim
assert use_texture_objects, "without texture objects is not implemented" assert use_texture_objects, "without texture objects is not implemented"
super().__init__(self.get_code(dialect='c', vector_instruction_set=None), super().__init__(self.get_code(dialect='c', vector_instruction_set=None),
symbols_read={device_data_ptr, symbols_read={device_data_ptr,
*[s for s in self._shape if isinstance(s, sp.Symbol)]}, *[s for s in self._shape if isinstance(s, sp.Symbol)]},
symbols_defined={}) symbols_defined={})
self.headers.append("<cuda.h>") self.headers = ['<memory>', '<cuda.h>', '<cuda_runtime_api.h>']
def get_code(self, dialect, vector_instruction_set): def get_code(self, dialect, vector_instruction_set):
texture_name = self._texture.symbol.name texture_name = self._texture.symbol.name
code = self.CODE_TEMPLATE.format( code = self.CODE_TEMPLATE_CUDA_ARRAY.render(
resource_desc='resDesc_' + texture_name, resource_desc='resDesc_' + texture_name,
texture_desc='texDesc_' + texture_name, texture_desc='texDesc_' + texture_name,
channel_desc=f'cudaCreateChannelDesc<{self._dtype}>()', # noqa
texture_object='tex_' + texture_name, texture_object='tex_' + texture_name,
array='array_' + texture_name,
texture_name=texture_name,
ndim=self._ndim,
device_ptr=self._device_ptr, device_ptr=self._device_ptr,
cuda_channel_format=self._get_channel_format_string(), create_array=self._get_create_array_call(),
bits_per_channel=self._dtype.itemsize * 8, copy_array=self._get_copy_array_call(),
total_size=self._dtype.itemsize * reduce(lambda x, y: x * y, self._shape, 1)) dtype=self._dtype,
bits_per_channel=self._dtype.numpy_dtype.itemsize * 8,
total_size=self._dtype.numpy_dtype.itemsize * reduce(lambda x, y: x * y, self._shape, 1))
return code return code
def _get_create_array_call(self):
texture_name = self._texture.symbol.name
ndim = '' if self._ndim <= 2 else f'{self._ndim}D'
array = 'array_' + texture_name
return f"""cudaArray * {array};
cudaMalloc{ndim}Array(&{array}, &channel_desc_{texture_name}, """ + (
(f'{{{", ".join(str(s) for s in reversed(self._shape))}}});'
if self._ndim == 3
else f'{", ".join(str(s) for s in reversed(self._shape))});'))
def _get_copy_array_call(self):
texture_name = self._texture.symbol.name
array = 'array_' + texture_name
if self._texture.field.ndim == 3:
copy_params = f'cpy_{texture_name}_params'
return f"""cudaMemcpy3DParams {copy_params}{{}};
{copy_params}.srcPtr = {{{self._device_ptr},
{self._texture.field.strides[-1] * self._texture.field.shape[-1] * self._dtype.numpy_dtype.itemsize},
{self._texture.field.shape[-1], self._texture.field.shape[-2]}}};
{copy_params}.dsrArray = {array};
copyParams.extent = {{{", ".join(reversed(self._shape))}}};
copyParams.kind = cudaMemcpyDeviceToDevice;
cudaMemcpy3D(&{{copy_params}});""" # noqa
elif self._texture.field.ndim == 2:
# cudaMemcpy2DToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind);
return f"""cudaMemcpy2DToArray({array},
0u,
0u,
{self._device_ptr},
{self._texture.field.strides[-1] * self._texture.field.shape[-1] * self._dtype.numpy_dtype.itemsize},
{self._texture.field.shape[-1]},
{self._texture.field.shape[-2]},
cudaMemcpyDeviceToDevice);
"""
else:
raise NotImplementedError()
class KernelFunctionCall(Node): class KernelFunctionCall(Node):
""" """
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment