From 0800d84a34d8a370e80f4f08e41074e576a0aa96 Mon Sep 17 00:00:00 2001
From: Stephan Seitz <stephan.seitz@fau.de>
Date: Wed, 11 Dec 2019 10:28:19 +0100
Subject: [PATCH] Add auto-tuning for CUDA call parameters
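
Auto-tuning is opt-in via `create_kernel(..., autotune_cuda_callparameters=True)`.
When enabled, the GPU wrapper in cudajit.py calls
`indexing.autotune_call_parameters(...)`, which times the candidate block sizes
from `AUTOTUNE_BLOCK_SIZES` and keeps the fastest launch configuration; the
result is memoized on disk via `pystencils.cache.disk_cache`.

Illustrative usage (a sketch, not part of this patch; the field setup, array
sizes and the Jacobi-style update rule are made up for demonstration):

    import numpy as np
    import pycuda.gpuarray as gpuarray
    import pystencils as ps

    # Two 2D double-precision fields and a simple 4-point average update
    src, dst = ps.fields('src, dst: double[2D]')
    update = ps.Assignment(dst[0, 0],
                           (src[1, 0] + src[-1, 0] + src[0, 1] + src[0, -1]) / 4)

    ast = ps.create_kernel(update, target='gpu', autotune_cuda_callparameters=True)
    kernel = ast.compile()

    gpu_src = gpuarray.to_gpu(np.random.rand(64, 64))
    gpu_dst = gpuarray.to_gpu(np.zeros((64, 64)))
    # First call profiles the candidate block sizes, later calls reuse the cached best
    kernel(src=gpu_src, dst=gpu_dst)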
---
 pystencils/astnodes.py           |  5 +++
 pystencils/gpucuda/cudajit.py    | 23 +++++++++---
 pystencils/gpucuda/indexing.py   | 63 ++++++++++++++++++++++++++++++++
 pystencils/kernelcreation.py     |  7 +++-
 pystencils_tests/test_cudagpu.py | 44 ++++++++++++++++++++++
 5 files changed, 135 insertions(+), 7 deletions(-)

diff --git a/pystencils/astnodes.py b/pystencils/astnodes.py
index cc58ebd3..155487be 100644
--- a/pystencils/astnodes.py
+++ b/pystencils/astnodes.py
@@ -178,6 +178,11 @@ class KernelFunction(Node):
         self.instruction_set = None  # used in `vectorize` function to tell the backend which i.s. (SSE,AVX) to use
         # function that compiles the node to a Python callable, is set by the backends
         self._compile_function = compile_function
+        self._autotune_options = None
+
+    @property
+    def do_cudaautotune(self):
+        return self._autotune_options is not None
 
     @property
     def target(self):
diff --git a/pystencils/gpucuda/cudajit.py b/pystencils/gpucuda/cudajit.py
index 28ec47d0..63801060 100644
--- a/pystencils/gpucuda/cudajit.py
+++ b/pystencils/gpucuda/cudajit.py
@@ -1,5 +1,8 @@
+from functools import partial
+
 import numpy as np
 
+import pystencils
 from pystencils.backends.cbackend import generate_c, get_headers
 from pystencils.data_types import StructType
 from pystencils.field import FieldType
@@ -77,11 +80,6 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
             full_arguments.update(kwargs)
             shape = _check_arguments(parameters, full_arguments)
 
-            indexing = kernel_function_node.indexing
-            block_and_thread_numbers = indexing.call_parameters(shape)
-            block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
-            block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])
-
             # TODO: use texture objects:
             # https://devblogs.nvidia.com/cuda-pro-tip-kepler-texture-objects-improve-performance-and-flexibility/
             for tex in textures:
@@ -89,6 +87,21 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
                 ndarray_to_tex(tex_ref, full_arguments[tex.field.name], tex.address_mode, tex.filter_mode,
                                tex.use_normalized_coordinates, tex.read_as_integer)
             args = _build_numpy_argument_list(parameters, full_arguments)
+            indexing = kernel_function_node.indexing
+            if kernel_function_node.do_cudaautotune:
+                block_and_thread_numbers = (
+                    indexing.autotune_call_parameters(partial(func, *args),
+                                                      shape,
+                                                      kernel_function_node.function_name,
+                                                      tuple((k, v.strides, v.shape)
+                                                            for k, v in kwargs.items()
+                                                            if (isinstance(v, pycuda.gpuarray.GPUArray)))
+                                                      + (str(pystencils.show_code(kernel_function_node)),)))
+            else:
+                block_and_thread_numbers = indexing.call_parameters(shape)
+            block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
+            block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])
+
             cache[key] = (args, block_and_thread_numbers)
             cache_values.append(kwargs)  # keep objects alive such that ids remain unique
             func(*args, **block_and_thread_numbers)
diff --git a/pystencils/gpucuda/indexing.py b/pystencils/gpucuda/indexing.py
index eb212119..bf8b5302 100644
--- a/pystencils/gpucuda/indexing.py
+++ b/pystencils/gpucuda/indexing.py
@@ -1,10 +1,12 @@
 import abc
+import timeit
 from functools import partial
 
 import sympy as sp
 from sympy.core.cache import cacheit
 
 from pystencils.astnodes import Block, Conditional
+from pystencils.cache import disk_cache
 from pystencils.data_types import TypedSymbol, create_type
 from pystencils.integer_functions import div_ceil, div_floor
 from pystencils.slicing import normalize_slice
@@ -83,6 +85,60 @@ class AbstractIndexing(abc.ABC):
     def symbolic_parameters(self):
         """Set of symbols required in call_parameters code"""
 
+    def autotune_call_parameters(self, partial_function, call_shape, function_name, magic_hash):
+        """Autotune call parameters for a specific kernel call
+
+        Tries to find the optimum call parameters ``block``, ``grid`` for a kernel function.
+
+        Args:
+            partial_function: Partial PyCUDA function with parameters block and grid missing
+        """
+        import pycuda.driver
+
+        @disk_cache
+        def _autotune_call_parameters(self,
+                                      call_shape,
+                                      num_profile_calls,
+                                      function_name,
+                                      block_sizes,
+                                      magic_hash  # needed for disk_cache
+                                      ):
+            BIG_NUMBER = 100000000
+            current_best = self.call_parameters(call_shape)
+            best_timing = BIG_NUMBER
+
+            print(f'Autotuning function {function_name}')
+            for block_size in block_sizes:
+                self._block_size = block_size
+                if isinstance(self, BlockIndexing):
+                    self._block_size = (
+                        BlockIndexing.permute_block_size_according_to_layout(self._block_size, self._layout))
+                block_and_thread_numbers = self.call_parameters(call_shape)
+                block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
+                block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])
+
+                # TODO(seitz) can we use the CUDA profiler?: pycuda.driver.start_profiler()
+                def profile_call():
+                    for i in range(num_profile_calls):
+                        partial_function(**block_and_thread_numbers)
+                    pycuda.driver.Context.synchronize()
+
+                current_time = timeit.timeit(profile_call, number=1)
+                print(f'{block_size} takes {current_time} ({num_profile_calls})')
+                if current_time < best_timing:
+                    best_timing = current_time
+                    current_best = block_and_thread_numbers
+
+            print(f'{current_best} is the best out of {self._autotune_block_sizes or self.AUTOTUNE_BLOCK_SIZES}')
+            self._block_size = current_best
+            return current_best
+        return _autotune_call_parameters(self,
+                                         call_shape,
+                                         self.AUTOTUNE_NUM_CALLS,
+                                         function_name,
+                                         self._autotune_block_sizes or self.AUTOTUNE_BLOCK_SIZES,
+                                         magic_hash)
+
 
 # -------------------------------------------- Implementations ---------------------------------------------------------
 
@@ -97,6 +153,8 @@ class BlockIndexing(AbstractIndexing):
                                                 gets the largest amount of threads
         compile_time_block_size: compile in concrete block size, otherwise the cuda variable 'blockDim' is used
     """
+    AUTOTUNE_BLOCK_SIZES = ((16, 16, 1), (32, 1, 1), (64, 1, 1), (96, 1, 1), (128, 1, 1), (160, 1, 1), (192, 1, 1),)
+    AUTOTUNE_NUM_CALLS = 10
 
     def __init__(self, field, iteration_slice,
                  block_size=(16, 16, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False,
@@ -118,14 +176,17 @@
             maximum_block_size = tuple(device.get_attribute(a)
                                        for a in (da.MAX_BLOCK_DIM_X, da.MAX_BLOCK_DIM_Y, da.MAX_BLOCK_DIM_Z))
 
+        self._layout = field.layout
         self._maximum_block_size = maximum_block_size
         self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
         self._dim = field.spatial_dimensions
         self._symbolic_shape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]
         self._compile_time_block_size = compile_time_block_size
+        self._autotune_block_sizes = None
 
     @property
     def coordinates(self):
+        # TODO(seitz): require layout in constructor to rotate the thread indices: thread_idx == fastest
         offsets = _get_start_from_slice(self._iterationSlice)
         block_size = self._block_size if self._compile_time_block_size else BLOCK_DIM
         coordinates = [block_index * bs + thread_idx + off
@@ -227,6 +288,8 @@ class LineIndexing(AbstractIndexing):
     This indexing scheme supports up to 4 spatial dimensions, where the innermost dimensions is not larger than the
     maximum amount of threads allowed in a CUDA block (which depends on device).
     """
+    AUTOTUNE_BLOCK_SIZES = ((16, 1, 1), (32, 1, 1), (64, 1, 1), (96, 1, 1), (128, 1, 1), (160, 1, 1), (192, 1, 1),)
+    AUTOTUNE_NUM_CALLS = 10
 
     def __init__(self, field, iteration_slice):
         available_indices = [THREAD_IDX[0]] + BLOCK_IDX
diff --git a/pystencils/kernelcreation.py b/pystencils/kernelcreation.py
index 408cc506..cc71cdce 100644
--- a/pystencils/kernelcreation.py
+++ b/pystencils/kernelcreation.py
@@ -1,5 +1,5 @@
-from types import MappingProxyType
 from itertools import combinations
+from types import MappingProxyType
 
 import sympy as sp
 
@@ -27,7 +27,8 @@ def create_kernel(assignments,
                   gpu_indexing_params=MappingProxyType({}),
                   use_textures_for_interpolation=True,
                   cpu_prepend_optimizations=[],
-                  use_auto_for_assignments=False):
+                  use_auto_for_assignments=False,
+                  autotune_cuda_callparameters=False):
     """
     Creates abstract syntax tree (AST) of kernel, using a list of update equations.
 
@@ -121,6 +122,8 @@ def create_kernel(assignments,
         for a in ast.atoms(SympyAssignment):
             a.use_auto = True
 
+    if autotune_cuda_callparameters:
+        ast._autotune_options = True
     return ast
 
 
diff --git a/pystencils_tests/test_cudagpu.py b/pystencils_tests/test_cudagpu.py
index 77790229..a8ca3126 100644
--- a/pystencils_tests/test_cudagpu.py
+++ b/pystencils_tests/test_cudagpu.py
@@ -1,5 +1,6 @@
 import numpy as np
 import pycuda.gpuarray as gpuarray
+import pytest
 import sympy as sp
 from scipy.ndimage import convolve
 
@@ -35,6 +36,49 @@ def test_averaging_kernel():
     np.testing.assert_almost_equal(reference, dst_arr)
 
 
+@pytest.mark.parametrize('use_3d', ('use_3d', False))
+@pytest.mark.parametrize('use_fortran_layout', ('use_fortran_layout', False))
+def test_autotuning(use_fortran_layout, use_3d):
+    print(f'Use Fortan layout: {use_fortran_layout}')
+    if use_3d:
+        size = (256, 256, 256)
+    else:
+        size = (256, 256)
+    src_arr = np.random.rand(*size)
+    if use_fortran_layout:
+        src_arr = np.asfortranarray(src_arr)
+    src_arr = add_ghost_layers(src_arr)
+    print(src_arr.strides)
+    dst_arr = np.zeros_like(src_arr)
+    src_field = Field.create_from_numpy_array('src', src_arr)
+    dst_field = Field.create_from_numpy_array('dst', dst_arr)
+
+    if use_3d:
+        update_rules = (Assignment(dst_field[0, 0, 0],
+                                   (src_field[0, 0, 1] + src_field[0, 0, -1] + src_field[0, 1, 0] + src_field[0, -1, 0])
+                                   / 4),
+                        Assignment(dst_field[0, 0, 0],
+                                   (src_field[1, 0, 0] + src_field[-1, 0, 0] + src_field[0, 1, 0] + src_field[0, -1, 0])
+                                   / 4))
+    else:
+        update_rules = (Assignment(dst_field[0, 0],
+                                   (src_field[0, 1] + src_field[0, -1] + src_field[1, 0] + src_field[-1, 0])
+                                   / 4),
+                        Assignment(dst_field[0, 0],
+                                   (src_field[1, 0] + src_field[-1, 0] + src_field[0, 1] + src_field[0, -1])
+                                   / 4))
+
+    for i in range(2):
+        ast = create_cuda_kernel(sympy_cse_on_assignment_list([update_rules[i]]))
+        ast._autotune_options = 1
+        kernel = make_python_function(ast)
+
+        gpu_src_arr = gpuarray.to_gpu(src_arr)
+        gpu_dst_arr = gpuarray.to_gpu(dst_arr)
+        kernel(src=gpu_src_arr, dst=gpu_dst_arr)
+        gpu_dst_arr.get(dst_arr)
+
+
 def test_variable_sized_fields():
     src_field = Field.create_generic('src', spatial_dimensions=2)
     dst_field = Field.create_generic('dst', spatial_dimensions=2)
-- 
GitLab