Commit 0800d84a authored by Stephan Seitz

Add auto-tuning for CUDA call parameters

parent f9ba7391
Merge request !106: WIP: Cuda autotune
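For context, a minimal usage sketch of the new option. This is hedged: the field setup, the averaging stencil, `target='gpu'` and `ast.compile()` follow the usual pystencils workflow and are not part of this diff; only the `autotune_cuda_callparameters` flag is introduced by this commit.

import numpy as np
import pycuda.autoinit  # noqa: F401  (creates a CUDA context)
import pycuda.gpuarray as gpuarray

import pystencils as ps

# Two fields for a simple 4-point averaging stencil (example data, not from the diff)
src_arr = np.random.rand(258, 258)
dst_arr = np.zeros_like(src_arr)
src = ps.Field.create_from_numpy_array('src', src_arr)
dst = ps.Field.create_from_numpy_array('dst', dst_arr)

update = ps.Assignment(dst[0, 0], (src[0, 1] + src[0, -1] + src[1, 0] + src[-1, 0]) / 4)

# The new flag tags the AST; the first call of the compiled kernel then searches the
# candidate block sizes (AUTOTUNE_BLOCK_SIZES) and caches the fastest block/grid setting.
ast = ps.create_kernel([update], target='gpu', autotune_cuda_callparameters=True)
kernel = ast.compile()

gpu_src, gpu_dst = gpuarray.to_gpu(src_arr), gpuarray.to_gpu(dst_arr)
kernel(src=gpu_src, dst=gpu_dst)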
@@ -178,6 +178,11 @@ class KernelFunction(Node):
         self.instruction_set = None  # used in `vectorize` function to tell the backend which i.s. (SSE,AVX) to use
         # function that compiles the node to a Python callable, is set by the backends
         self._compile_function = compile_function
+        self._autotune_options = None
 
+    @property
+    def do_cudaautotune(self):
+        return self._autotune_options is not None
+
     @property
     def target(self):
...
+from functools import partial
 import numpy as np
+import pystencils
 from pystencils.backends.cbackend import generate_c, get_headers
 from pystencils.data_types import StructType
 from pystencils.field import FieldType
@@ -77,11 +80,6 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
             full_arguments.update(kwargs)
             shape = _check_arguments(parameters, full_arguments)
 
-            indexing = kernel_function_node.indexing
-            block_and_thread_numbers = indexing.call_parameters(shape)
-            block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
-            block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])
-
             # TODO: use texture objects:
             # https://devblogs.nvidia.com/cuda-pro-tip-kepler-texture-objects-improve-performance-and-flexibility/
             for tex in textures:
@@ -89,6 +87,21 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
                 ndarray_to_tex(tex_ref, full_arguments[tex.field.name], tex.address_mode,
                                tex.filter_mode, tex.use_normalized_coordinates, tex.read_as_integer)
             args = _build_numpy_argument_list(parameters, full_arguments)
+
+            indexing = kernel_function_node.indexing
+            if kernel_function_node.do_cudaautotune:
+                block_and_thread_numbers = (
+                    indexing.autotune_call_parameters(partial(func, *args),
+                                                      shape,
+                                                      kernel_function_node.function_name,
+                                                      tuple((k, v.strides, v.shape)
+                                                            for k, v in kwargs.items()
+                                                            if (isinstance(v, pycuda.gpuarray.GPUArray)))
+                                                      + (str(pystencils.show_code(kernel_function_node)),)))
+            else:
+                block_and_thread_numbers = indexing.call_parameters(shape)
+            block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
+            block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])
             cache[key] = (args, block_and_thread_numbers)
             cache_values.append(kwargs)  # keep objects alive such that ids remain unique
             func(*args, **block_and_thread_numbers)
...
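The hunk above defers choosing ``block``/``grid``: when ``do_cudaautotune`` is set, the kernel (bound to its arguments via ``functools.partial``) is handed to ``indexing.autotune_call_parameters`` together with a tuning key built from the strides and shapes of all GPUArray arguments plus the generated kernel code, so tuning results can be reused across runs via ``disk_cache``. A small illustrative helper showing the same key construction in isolation (``make_tuning_key`` is a hypothetical name, not part of the diff):

import pycuda.gpuarray


def make_tuning_key(kwargs, kernel_source):
    """Mirror of the inline key above: array layouts plus generated code identify a tuning result."""
    array_part = tuple((name, arr.strides, arr.shape)
                       for name, arr in kwargs.items()
                       if isinstance(arr, pycuda.gpuarray.GPUArray))
    # kernel_source corresponds to str(pystencils.show_code(kernel_function_node)) in the hunk above
    return array_part + (kernel_source,)

Only the memory layout (strides and shape), not the data itself, enters the key, so the expensive search is repeated only when the layout or the generated kernel changes.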
 import abc
+import timeit
 from functools import partial
 import sympy as sp
 from sympy.core.cache import cacheit
 from pystencils.astnodes import Block, Conditional
+from pystencils.cache import disk_cache
 from pystencils.data_types import TypedSymbol, create_type
 from pystencils.integer_functions import div_ceil, div_floor
 from pystencils.slicing import normalize_slice
@@ -83,6 +85,60 @@ class AbstractIndexing(abc.ABC):
     def symbolic_parameters(self):
         """Set of symbols required in call_parameters code"""
 
+    def autotune_call_parameters(self, partial_function, call_shape, function_name, magic_hash):
+        """Autotune the call parameters for a specific kernel call.
+
+        Tries to find the optimal call parameters ``block`` and ``grid`` for a kernel function.
+
+        Args:
+            partial_function: partial PyCUDA kernel function with the ``block`` and ``grid`` arguments still unbound
+        """
+        import pycuda.driver
+
+        @disk_cache
+        def _autotune_call_parameters(self,
+                                      call_shape,
+                                      num_profile_calls,
+                                      function_name,
+                                      block_sizes,
+                                      magic_hash  # needed so the disk_cache key reflects kernel and data layout
+                                      ):
+            BIG_NUMBER = 100000000
+            current_best = self.call_parameters(call_shape)
+            best_timing = BIG_NUMBER
+            print(f'Autotuning function {function_name}')
+            for block_size in block_sizes:
+                self._block_size = block_size
+                if isinstance(self, BlockIndexing):
+                    self._block_size = (
+                        BlockIndexing.permute_block_size_according_to_layout(self._block_size, self._layout))
+                block_and_thread_numbers = self.call_parameters(call_shape)
+                block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
+                block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])
+
+                # TODO(seitz): can we use the CUDA profiler? pycuda.driver.start_profiler()
+                def profile_call():
+                    for i in range(num_profile_calls):
+                        partial_function(**block_and_thread_numbers)
+                    pycuda.driver.Context.synchronize()
+
+                current_time = timeit.timeit(profile_call, number=1)
+                print(f'{block_size} takes {current_time} ({num_profile_calls} calls)')
+                if current_time < best_timing:
+                    best_timing = current_time
+                    current_best = block_and_thread_numbers
+
+            print(f'{current_best} is the best out of {self._autotune_block_sizes or self.AUTOTUNE_BLOCK_SIZES}')
+            self._block_size = current_best['block']
+            return current_best
+
+        return _autotune_call_parameters(self,
+                                         call_shape,
+                                         self.AUTOTUNE_NUM_CALLS,
+                                         function_name,
+                                         self._autotune_block_sizes or self.AUTOTUNE_BLOCK_SIZES,
+                                         magic_hash)
 # -------------------------------------------- Implementations ---------------------------------------------------------
@@ -97,6 +153,8 @@ class BlockIndexing(AbstractIndexing):
                              gets the largest amount of threads
         compile_time_block_size: compile in concrete block size, otherwise the cuda variable 'blockDim' is used
     """
+    AUTOTUNE_BLOCK_SIZES = ((16, 16, 1), (32, 1, 1), (64, 1, 1), (96, 1, 1), (128, 1, 1), (160, 1, 1), (192, 1, 1),)
+    AUTOTUNE_NUM_CALLS = 10
 
     def __init__(self, field, iteration_slice,
                  block_size=(16, 16, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False,
@@ -118,14 +176,17 @@ class BlockIndexing(AbstractIndexing):
             maximum_block_size = tuple(device.get_attribute(a)
                                        for a in (da.MAX_BLOCK_DIM_X, da.MAX_BLOCK_DIM_Y, da.MAX_BLOCK_DIM_Z))
 
+        self._layout = field.layout
         self._maximum_block_size = maximum_block_size
         self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
         self._dim = field.spatial_dimensions
         self._symbolic_shape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]
         self._compile_time_block_size = compile_time_block_size
+        self._autotune_block_sizes = None
 
     @property
     def coordinates(self):
+        # TODO(seitz): require layout in constructor to rotate the thread indices: thread_idx == fastest
         offsets = _get_start_from_slice(self._iterationSlice)
         block_size = self._block_size if self._compile_time_block_size else BLOCK_DIM
         coordinates = [block_index * bs + thread_idx + off
@@ -227,6 +288,8 @@ class LineIndexing(AbstractIndexing):
     This indexing scheme supports up to 4 spatial dimensions, where the innermost dimension is not larger than the
     maximum amount of threads allowed in a CUDA block (which depends on device).
     """
+    AUTOTUNE_BLOCK_SIZES = ((16, 1, 1), (32, 1, 1), (64, 1, 1), (96, 1, 1), (128, 1, 1), (160, 1, 1), (192, 1, 1),)
+    AUTOTUNE_NUM_CALLS = 10
 
     def __init__(self, field, iteration_slice):
         available_indices = [THREAD_IDX[0]] + BLOCK_IDX
...
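The new ``autotune_call_parameters`` method above is a brute-force search: each candidate block size is turned into a full launch configuration, ``AUTOTUNE_NUM_CALLS`` launches are timed (with a context synchronize so the measurement covers actual GPU execution, not just launch overhead), and the fastest configuration wins and is persisted via ``disk_cache``. The same pattern, stripped of the indexing-class plumbing, as a sketch; ``launch`` is a hypothetical stand-in for any PyCUDA kernel call that accepts ``block`` and ``grid`` keyword arguments:

import timeit

import pycuda.driver


def pick_best_configuration(launch, configurations, num_calls=10):
    """Return the {'block': ..., 'grid': ...} dict with the lowest measured runtime."""
    best_config, best_time = None, float('inf')
    for config in configurations:
        def profile_call():
            for _ in range(num_calls):
                launch(**config)
            pycuda.driver.Context.synchronize()  # kernel launches are asynchronous

        elapsed = timeit.timeit(profile_call, number=1)
        if elapsed < best_time:
            best_config, best_time = config, elapsed
    return best_config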
-from types import MappingProxyType
 from itertools import combinations
+from types import MappingProxyType
 import sympy as sp
@@ -27,7 +27,8 @@ def create_kernel(assignments,
                   gpu_indexing_params=MappingProxyType({}),
                   use_textures_for_interpolation=True,
                   cpu_prepend_optimizations=[],
-                  use_auto_for_assignments=False):
+                  use_auto_for_assignments=False,
+                  autotune_cuda_callparameters=False):
     """
     Creates abstract syntax tree (AST) of kernel, using a list of update equations.
@@ -121,6 +122,8 @@ def create_kernel(assignments,
     for a in ast.atoms(SympyAssignment):
         a.use_auto = True
+    if autotune_cuda_callparameters:
+        ast._autotune_options = True
     return ast
...
 import numpy as np
 import pycuda.gpuarray as gpuarray
+import pytest
 import sympy as sp
 from scipy.ndimage import convolve
@@ -35,6 +36,49 @@ def test_averaging_kernel():
     np.testing.assert_almost_equal(reference, dst_arr)
+
+
+@pytest.mark.parametrize('use_3d', ('use_3d', False))
+@pytest.mark.parametrize('use_fortran_layout', ('use_fortran_layout', False))
+def test_autotuning(use_fortran_layout, use_3d):
+    print(f'Use Fortran layout: {use_fortran_layout}')
+    if use_3d:
+        size = (256, 256, 256)
+    else:
+        size = (256, 256)
+
+    src_arr = np.random.rand(*size)
+    if use_fortran_layout:
+        src_arr = np.asfortranarray(src_arr)
+    src_arr = add_ghost_layers(src_arr)
+    print(src_arr.strides)
+    dst_arr = np.zeros_like(src_arr)
+    src_field = Field.create_from_numpy_array('src', src_arr)
+    dst_field = Field.create_from_numpy_array('dst', dst_arr)
+
+    if use_3d:
+        update_rules = (Assignment(dst_field[0, 0, 0],
+                                   (src_field[0, 0, 1] + src_field[0, 0, -1] + src_field[0, 1, 0] + src_field[0, -1, 0])
+                                   / 4),
+                        Assignment(dst_field[0, 0, 0],
+                                   (src_field[1, 0, 0] + src_field[-1, 0, 0] + src_field[0, 1, 0] + src_field[0, -1, 0])
+                                   / 4))
+    else:
+        update_rules = (Assignment(dst_field[0, 0],
+                                   (src_field[0, 1] + src_field[0, -1] + src_field[1, 0] + src_field[-1, 0])
+                                   / 4),
+                        Assignment(dst_field[0, 0],
+                                   (src_field[1, 0] + src_field[-1, 0] + src_field[0, 1] + src_field[0, -1])
+                                   / 4))
+
+    for i in range(2):
+        ast = create_cuda_kernel(sympy_cse_on_assignment_list([update_rules[i]]))
+        ast._autotune_options = 1
+        kernel = make_python_function(ast)
+
+        gpu_src_arr = gpuarray.to_gpu(src_arr)
+        gpu_dst_arr = gpuarray.to_gpu(dst_arr)
+        kernel(src=gpu_src_arr, dst=gpu_dst_arr)
+        gpu_dst_arr.get(dst_arr)
 
 
 def test_variable_sized_fields():
     src_field = Field.create_generic('src', spatial_dimensions=2)
     dst_field = Field.create_generic('dst', spatial_dimensions=2)
...
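The new ``test_autotuning`` above only exercises the tuning path; it does not compare the result against a reference. Since autotuning changes only the launch configuration, not the computed values, one possible extension (a suggestion, not part of this commit) is to run the same kernel untuned and assert that both outputs agree. The fragment below is meant to sit inside the test's for-loop and reuses its local variables (``update_rules``, ``i``, ``src_arr``, ``gpu_src_arr``, ``gpu_dst_arr``):

# inside the for-loop of test_autotuning, after gpu_dst_arr.get(dst_arr)
ast_reference = create_cuda_kernel(sympy_cse_on_assignment_list([update_rules[i]]))
reference_kernel = make_python_function(ast_reference)  # no _autotune_options set, default block/grid

gpu_dst_reference = gpuarray.to_gpu(np.zeros_like(src_arr))
reference_kernel(src=gpu_src_arr, dst=gpu_dst_reference)

np.testing.assert_almost_equal(gpu_dst_reference.get(), gpu_dst_arr.get())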