Skip to content
Snippets Groups Projects

WIP: Cuda autotune

Closed Stephan Seitz requested to merge seitz/pystencils:cuda-autotune into master
Viewing commit 0800d84a
Next
Show latest version
5 files
+ 135
7
Preferences
Compare changes
Files
5
from functools import partial
import numpy as np
import pystencils
from pystencils.backends.cbackend import generate_c, get_headers
from pystencils.data_types import StructType
from pystencils.field import FieldType
@@ -77,11 +80,6 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
full_arguments.update(kwargs)
shape = _check_arguments(parameters, full_arguments)
indexing = kernel_function_node.indexing
block_and_thread_numbers = indexing.call_parameters(shape)
block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])
# TODO: use texture objects:
# https://devblogs.nvidia.com/cuda-pro-tip-kepler-texture-objects-improve-performance-and-flexibility/
for tex in textures:
@@ -89,6 +87,21 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
ndarray_to_tex(tex_ref, full_arguments[tex.field.name], tex.address_mode,
tex.filter_mode, tex.use_normalized_coordinates, tex.read_as_integer)
args = _build_numpy_argument_list(parameters, full_arguments)
indexing = kernel_function_node.indexing
if kernel_function_node.do_cudaautotune:
block_and_thread_numbers = (
indexing.autotune_call_parameters(partial(func, *args),
shape,
kernel_function_node.function_name,
tuple((k, v.strides, v.shape)
for k, v in kwargs.items()
if (isinstance(v, pycuda.gpuarray.GPUArray)))
+ (str(pystencils.show_code(kernel_function_node)),)))
else:
block_and_thread_numbers = indexing.call_parameters(shape)
block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])
cache[key] = (args, block_and_thread_numbers)
cache_values.append(kwargs) # keep objects alive such that ids remain unique
func(*args, **block_and_thread_numbers)