Commit 0800d84a authored by Stephan Seitz

Add auto-tuning for CUDA call parameters

parent f9ba7391
Merge request !106: WIP: Cuda autotune
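For context, a minimal usage sketch of the new option. This is hedged: the field setup, the averaging stencil, `target='gpu'` and `ast.compile()` follow the usual pystencils workflow and are not part of this diff; only the `autotune_cuda_callparameters` flag is introduced by this commit.

import numpy as np
import pycuda.autoinit  # noqa: F401  (creates a CUDA context)
import pycuda.gpuarray as gpuarray

import pystencils as ps

# Two fields for a simple 4-point averaging stencil (example data, not from the diff)
src_arr = np.random.rand(258, 258)
dst_arr = np.zeros_like(src_arr)
src = ps.Field.create_from_numpy_array('src', src_arr)
dst = ps.Field.create_from_numpy_array('dst', dst_arr)

update = ps.Assignment(dst[0, 0], (src[0, 1] + src[0, -1] + src[1, 0] + src[-1, 0]) / 4)

# The new flag tags the AST; the first call of the compiled kernel then searches the
# candidate block sizes (AUTOTUNE_BLOCK_SIZES) and caches the fastest block/grid setting.
ast = ps.create_kernel([update], target='gpu', autotune_cuda_callparameters=True)
kernel = ast.compile()

gpu_src, gpu_dst = gpuarray.to_gpu(src_arr), gpuarray.to_gpu(dst_arr)
kernel(src=gpu_src, dst=gpu_dst)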
@@ -178,6 +178,11 @@ class KernelFunction(Node):
         self.instruction_set = None  # used in `vectorize` function to tell the backend which i.s. (SSE,AVX) to use
         # function that compiles the node to a Python callable, is set by the backends
         self._compile_function = compile_function
+        self._autotune_options = None
 
+    @property
+    def do_cudaautotune(self):
+        return self._autotune_options is not None
+
     @property
     def target(self):
...
+from functools import partial
 import numpy as np
+import pystencils
 from pystencils.backends.cbackend import generate_c, get_headers
 from pystencils.data_types import StructType
 from pystencils.field import FieldType
@@ -77,11 +80,6 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
             full_arguments.update(kwargs)
             shape = _check_arguments(parameters, full_arguments)
 
-            indexing = kernel_function_node.indexing
-            block_and_thread_numbers = indexing.call_parameters(shape)
-            block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
-            block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])
-
             # TODO: use texture objects:
             # https://devblogs.nvidia.com/cuda-pro-tip-kepler-texture-objects-improve-performance-and-flexibility/
             for tex in textures:
@@ -89,6 +87,21 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
                 ndarray_to_tex(tex_ref, full_arguments[tex.field.name], tex.address_mode,
                                tex.filter_mode, tex.use_normalized_coordinates, tex.read_as_integer)
             args = _build_numpy_argument_list(parameters, full_arguments)
+
+            indexing = kernel_function_node.indexing
+            if kernel_function_node.do_cudaautotune:
+                block_and_thread_numbers = (
+                    indexing.autotune_call_parameters(partial(func, *args),
+                                                      shape,
+                                                      kernel_function_node.function_name,
+                                                      tuple((k, v.strides, v.shape)
+                                                            for k, v in kwargs.items()
+                                                            if (isinstance(v, pycuda.gpuarray.GPUArray)))
+                                                      + (str(pystencils.show_code(kernel_function_node)),)))
+            else:
+                block_and_thread_numbers = indexing.call_parameters(shape)
+            block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
+            block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])
             cache[key] = (args, block_and_thread_numbers)
             cache_values.append(kwargs)  # keep objects alive such that ids remain unique
             func(*args, **block_and_thread_numbers)
...
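The hunk above defers choosing ``block``/``grid``: when ``do_cudaautotune`` is set, the kernel (bound to its arguments via ``functools.partial``) is handed to ``indexing.autotune_call_parameters`` together with a tuning key built from the strides and shapes of all GPUArray arguments plus the generated kernel code, so tuning results can be reused across runs via ``disk_cache``. A small illustrative helper showing the same key construction in isolation (``make_tuning_key`` is a hypothetical name, not part of the diff):

import pycuda.gpuarray


def make_tuning_key(kwargs, kernel_source):
    """Mirror of the inline key above: array layouts plus generated code identify a tuning result."""
    array_part = tuple((name, arr.strides, arr.shape)
                       for name, arr in kwargs.items()
                       if isinstance(arr, pycuda.gpuarray.GPUArray))
    # kernel_source corresponds to str(pystencils.show_code(kernel_function_node)) in the hunk above
    return array_part + (kernel_source,)

Only the memory layout (strides and shape), not the data itself, enters the key, so the expensive search is repeated only when the layout or the generated kernel changes.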
 import abc
+import timeit
 from functools import partial
 import sympy as sp
 from sympy.core.cache import cacheit
 from pystencils.astnodes import Block, Conditional
+from pystencils.cache import disk_cache
 from pystencils.data_types import TypedSymbol, create_type
 from pystencils.integer_functions import div_ceil, div_floor
 from pystencils.slicing import normalize_slice
@@ -83,6 +85,60 @@ class AbstractIndexing(abc.ABC):
     def symbolic_parameters(self):
         """Set of symbols required in call_parameters code"""
 
+    def autotune_call_parameters(self, partial_function, call_shape, function_name, magic_hash):
+        """Autotune the call parameters for a specific kernel call.
+
+        Tries to find the optimal call parameters ``block`` and ``grid`` for a kernel function.
+
+        Args:
+            partial_function: partial PyCUDA kernel function with the ``block`` and ``grid`` arguments still unbound
+        """
+        import pycuda.driver
+
+        @disk_cache
+        def _autotune_call_parameters(self,
+                                      call_shape,
+                                      num_profile_calls,
+                                      function_name,
+                                      block_sizes,
+                                      magic_hash  # needed so the disk_cache key reflects kernel and data layout
+                                      ):
+            BIG_NUMBER = 100000000
+            current_best = self.call_parameters(call_shape)
+            best_timing = BIG_NUMBER
+            print(f'Autotuning function {function_name}')
+            for block_size in block_sizes:
+                self._block_size = block_size
+                if isinstance(self, BlockIndexing):
+                    self._block_size = (
+                        BlockIndexing.permute_block_size_according_to_layout(self._block_size, self._layout))
+                block_and_thread_numbers = self.call_parameters(call_shape)
+                block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
+                block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])
+
+                # TODO(seitz): can we use the CUDA profiler? pycuda.driver.start_profiler()
+                def profile_call():
+                    for i in range(num_profile_calls):
+                        partial_function(**block_and_thread_numbers)
+                    pycuda.driver.Context.synchronize()
+
+                current_time = timeit.timeit(profile_call, number=1)
+                print(f'{block_size} takes {current_time} ({num_profile_calls} calls)')
+                if current_time < best_timing:
+                    best_timing = current_time
+                    current_best = block_and_thread_numbers
+
+            print(f'{current_best} is the best out of {self._autotune_block_sizes or self.AUTOTUNE_BLOCK_SIZES}')
+            self._block_size = current_best['block']
+            return current_best
+
+        return _autotune_call_parameters(self,
+                                         call_shape,
+                                         self.AUTOTUNE_NUM_CALLS,
+                                         function_name,
+                                         self._autotune_block_sizes or self.AUTOTUNE_BLOCK_SIZES,
+                                         magic_hash)
 # -------------------------------------------- Implementations ---------------------------------------------------------
@@ -97,6 +153,8 @@ class BlockIndexing(AbstractIndexing):
                              gets the largest amount of threads
         compile_time_block_size: compile in concrete block size, otherwise the cuda variable 'blockDim' is used
     """
+    AUTOTUNE_BLOCK_SIZES = ((16, 16, 1), (32, 1, 1), (64, 1, 1), (96, 1, 1), (128, 1, 1), (160, 1, 1), (192, 1, 1),)
+    AUTOTUNE_NUM_CALLS = 10
 
     def __init__(self, field, iteration_slice,
                  block_size=(16, 16, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False,
@@ -118,14 +176,17 @@ class BlockIndexing(AbstractIndexing):
             maximum_block_size = tuple(device.get_attribute(a)
                                        for a in (da.MAX_BLOCK_DIM_X, da.MAX_BLOCK_DIM_Y, da.MAX_BLOCK_DIM_Z))
 
+        self._layout = field.layout
         self._maximum_block_size = maximum_block_size
         self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
         self._dim = field.spatial_dimensions
         self._symbolic_shape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]
         self._compile_time_block_size = compile_time_block_size
+        self._autotune_block_sizes = None
 
     @property
     def coordinates(self):
+        # TODO(seitz): require layout in constructor to rotate the thread indices: thread_idx == fastest
         offsets = _get_start_from_slice(self._iterationSlice)
         block_size = self._block_size if self._compile_time_block_size else BLOCK_DIM
         coordinates = [block_index * bs + thread_idx + off
@@ -227,6 +288,8 @@ class LineIndexing(AbstractIndexing):
     This indexing scheme supports up to 4 spatial dimensions, where the innermost dimension is not larger than the
     maximum amount of threads allowed in a CUDA block (which depends on device).
     """
+    AUTOTUNE_BLOCK_SIZES = ((16, 1, 1), (32, 1, 1), (64, 1, 1), (96, 1, 1), (128, 1, 1), (160, 1, 1), (192, 1, 1),)
+    AUTOTUNE_NUM_CALLS = 10
 
     def __init__(self, field, iteration_slice):
         available_indices = [THREAD_IDX[0]] + BLOCK_IDX
...
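The new ``autotune_call_parameters`` method above is a brute-force search: each candidate block size is turned into a full launch configuration, ``AUTOTUNE_NUM_CALLS`` launches are timed (with a context synchronize so the measurement covers actual GPU execution, not just launch overhead), and the fastest configuration wins and is persisted via ``disk_cache``. The same pattern, stripped of the indexing-class plumbing, as a sketch; ``launch`` is a hypothetical stand-in for any PyCUDA kernel call that accepts ``block`` and ``grid`` keyword arguments:

import timeit

import pycuda.driver


def pick_best_configuration(launch, configurations, num_calls=10):
    """Return the {'block': ..., 'grid': ...} dict with the lowest measured runtime."""
    best_config, best_time = None, float('inf')
    for config in configurations:
        def profile_call():
            for _ in range(num_calls):
                launch(**config)
            pycuda.driver.Context.synchronize()  # kernel launches are asynchronous

        elapsed = timeit.timeit(profile_call, number=1)
        if elapsed < best_time:
            best_config, best_time = config, elapsed
    return best_config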
-from types import MappingProxyType
 from itertools import combinations
+from types import MappingProxyType
 import sympy as sp
@@ -27,7 +27,8 @@ def create_kernel(assignments,
                   gpu_indexing_params=MappingProxyType({}),
                   use_textures_for_interpolation=True,
                   cpu_prepend_optimizations=[],
-                  use_auto_for_assignments=False):
+                  use_auto_for_assignments=False,
+                  autotune_cuda_callparameters=False):
     """
     Creates abstract syntax tree (AST) of kernel, using a list of update equations.
@@ -121,6 +122,8 @@ def create_kernel(assignments,
     for a in ast.atoms(SympyAssignment):
         a.use_auto = True
+    if autotune_cuda_callparameters:
+        ast._autotune_options = True
     return ast
...
 import numpy as np
 import pycuda.gpuarray as gpuarray
+import pytest
 import sympy as sp
 from scipy.ndimage import convolve
@@ -35,6 +36,49 @@ def test_averaging_kernel():
     np.testing.assert_almost_equal(reference, dst_arr)
+
+
+@pytest.mark.parametrize('use_3d', ('use_3d', False))
+@pytest.mark.parametrize('use_fortran_layout', ('use_fortran_layout', False))
+def test_autotuning(use_fortran_layout, use_3d):
+    print(f'Use Fortran layout: {use_fortran_layout}')
+    if use_3d:
+        size = (256, 256, 256)
+    else:
+        size = (256, 256)
+
+    src_arr = np.random.rand(*size)
+    if use_fortran_layout:
+        src_arr = np.asfortranarray(src_arr)
+    src_arr = add_ghost_layers(src_arr)
+    print(src_arr.strides)
+    dst_arr = np.zeros_like(src_arr)
+    src_field = Field.create_from_numpy_array('src', src_arr)
+    dst_field = Field.create_from_numpy_array('dst', dst_arr)
+
+    if use_3d:
+        update_rules = (Assignment(dst_field[0, 0, 0],
+                                   (src_field[0, 0, 1] + src_field[0, 0, -1] + src_field[0, 1, 0] + src_field[0, -1, 0])
+                                   / 4),
+                        Assignment(dst_field[0, 0, 0],
+                                   (src_field[1, 0, 0] + src_field[-1, 0, 0] + src_field[0, 1, 0] + src_field[0, -1, 0])
+                                   / 4))
+    else:
+        update_rules = (Assignment(dst_field[0, 0],
+                                   (src_field[0, 1] + src_field[0, -1] + src_field[1, 0] + src_field[-1, 0])
+                                   / 4),
+                        Assignment(dst_field[0, 0],
+                                   (src_field[1, 0] + src_field[-1, 0] + src_field[0, 1] + src_field[0, -1])
+                                   / 4))
+
+    for i in range(2):
+        ast = create_cuda_kernel(sympy_cse_on_assignment_list([update_rules[i]]))
+        ast._autotune_options = 1
+        kernel = make_python_function(ast)
+
+        gpu_src_arr = gpuarray.to_gpu(src_arr)
+        gpu_dst_arr = gpuarray.to_gpu(dst_arr)
+        kernel(src=gpu_src_arr, dst=gpu_dst_arr)
+        gpu_dst_arr.get(dst_arr)
 
 
 def test_variable_sized_fields():
     src_field = Field.create_generic('src', spatial_dimensions=2)
     dst_field = Field.create_generic('dst', spatial_dimensions=2)
...
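The new ``test_autotuning`` above only exercises the tuning path; it does not compare the result against a reference. Since autotuning changes only the launch configuration, not the computed values, one possible extension (a suggestion, not part of this commit) is to run the same kernel untuned and assert that both outputs agree. The fragment below is meant to sit inside the test's for-loop and reuses its local variables (``update_rules``, ``i``, ``src_arr``, ``gpu_src_arr``, ``gpu_dst_arr``):

# inside the for-loop of test_autotuning, after gpu_dst_arr.get(dst_arr)
ast_reference = create_cuda_kernel(sympy_cse_on_assignment_list([update_rules[i]]))
reference_kernel = make_python_function(ast_reference)  # no _autotune_options set, default block/grid

gpu_dst_reference = gpuarray.to_gpu(np.zeros_like(src_arr))
reference_kernel(src=gpu_src_arr, dst=gpu_dst_reference)

np.testing.assert_almost_equal(gpu_dst_reference.get(), gpu_dst_arr.get())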