From 0800d84a34d8a370e80f4f08e41074e576a0aa96 Mon Sep 17 00:00:00 2001
From: Stephan Seitz <stephan.seitz@fau.de>
Date: Wed, 11 Dec 2019 10:28:19 +0100
Subject: [PATCH] Add auto-tuning for CUDA call parameters
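
Kernels created with autotune_cuda_callparameters=True time a fixed set
of candidate block sizes on their first call and keep the fastest
block/grid configuration. The result is persisted via disk_cache, keyed
by the kernel name, the iteration shape, the strides/shapes of the GPU
array arguments and the generated code, so the tuning cost is paid only
once per kernel and field layout.

A minimal usage sketch (assignments, gpu_src_arr and gpu_dst_arr stand
in for an existing assignment collection on fields named src/dst and
matching pycuda GPU arrays):

    import pystencils as ps

    ast = ps.create_kernel(assignments, target='gpu',
                           autotune_cuda_callparameters=True)
    kernel = ast.compile()
    # the first call benchmarks the candidate block sizes and caches the winner
    kernel(src=gpu_src_arr, dst=gpu_dst_arr)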

---
 pystencils/astnodes.py           |  5 +++
 pystencils/gpucuda/cudajit.py    | 25 ++++++++++---
 pystencils/gpucuda/indexing.py   | 63 ++++++++++++++++++++++++++++++++
 pystencils/kernelcreation.py     |  7 +++-
 pystencils_tests/test_cudagpu.py | 44 ++++++++++++++++++++++
 5 files changed, 137 insertions(+), 7 deletions(-)

diff --git a/pystencils/astnodes.py b/pystencils/astnodes.py
index cc58ebd3..155487be 100644
--- a/pystencils/astnodes.py
+++ b/pystencils/astnodes.py
@@ -178,6 +178,11 @@ class KernelFunction(Node):
         self.instruction_set = None  # used in `vectorize` function to tell the backend which i.s. (SSE,AVX) to use
         # function that compiles the node to a Python callable, is set by the backends
         self._compile_function = compile_function
+        self._autotune_options = None  # set to a non-None value to enable CUDA call-parameter auto-tuning
+
+    @property
+    def do_cudaautotune(self):
+        return self._autotune_options is not None
 
     @property
     def target(self):
diff --git a/pystencils/gpucuda/cudajit.py b/pystencils/gpucuda/cudajit.py
index 28ec47d0..63801060 100644
--- a/pystencils/gpucuda/cudajit.py
+++ b/pystencils/gpucuda/cudajit.py
@@ -1,5 +1,8 @@
+from functools import partial
+
 import numpy as np
 
+import pystencils
 from pystencils.backends.cbackend import generate_c, get_headers
 from pystencils.data_types import StructType
 from pystencils.field import FieldType
@@ -77,11 +80,6 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
             full_arguments.update(kwargs)
             shape = _check_arguments(parameters, full_arguments)
 
-            indexing = kernel_function_node.indexing
-            block_and_thread_numbers = indexing.call_parameters(shape)
-            block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
-            block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])
-
             # TODO: use texture objects:
             # https://devblogs.nvidia.com/cuda-pro-tip-kepler-texture-objects-improve-performance-and-flexibility/
             for tex in textures:
@@ -89,6 +87,23 @@
                 ndarray_to_tex(tex_ref, full_arguments[tex.field.name], tex.address_mode,
                                tex.filter_mode, tex.use_normalized_coordinates, tex.read_as_integer)
             args = _build_numpy_argument_list(parameters, full_arguments)
+            indexing = kernel_function_node.indexing
+            if kernel_function_node.do_cudaautotune:
+                import pycuda.gpuarray  # used for the GPUArray type check below
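+                # strides/shapes of the GPU array arguments and the generated code make the disk_cache key unique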
+                block_and_thread_numbers = (
+                    indexing.autotune_call_parameters(partial(func, *args),
+                                                      shape,
+                                                      kernel_function_node.function_name,
+                                                      tuple((k, v.strides, v.shape)
+                                                            for k, v in full_arguments.items()
+                                                            if isinstance(v, pycuda.gpuarray.GPUArray))
+                                                      + (str(pystencils.show_code(kernel_function_node)),)))
+            else:
+                block_and_thread_numbers = indexing.call_parameters(shape)
+                block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
+                block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])
+
             cache[key] = (args, block_and_thread_numbers)
             cache_values.append(kwargs)  # keep objects alive such that ids remain unique
             func(*args, **block_and_thread_numbers)
diff --git a/pystencils/gpucuda/indexing.py b/pystencils/gpucuda/indexing.py
index eb212119..bf8b5302 100644
--- a/pystencils/gpucuda/indexing.py
+++ b/pystencils/gpucuda/indexing.py
@@ -1,10 +1,12 @@
 import abc
+import timeit
 from functools import partial
 
 import sympy as sp
 from sympy.core.cache import cacheit
 
 from pystencils.astnodes import Block, Conditional
+from pystencils.cache import disk_cache
 from pystencils.data_types import TypedSymbol, create_type
 from pystencils.integer_functions import div_ceil, div_floor
 from pystencils.slicing import normalize_slice
@@ -83,6 +85,60 @@ class AbstractIndexing(abc.ABC):
     def symbolic_parameters(self):
         """Set of symbols required in call_parameters code"""
 
+    def autotune_call_parameters(self, partial_function, call_shape, function_name, magic_hash):
+        """Autotune the CUDA launch configuration (``block``, ``grid``) for a specific kernel call.
+
+        Args:
+            partial_function: partial PyCUDA kernel function with only ``block`` and ``grid`` left unbound
+            call_shape: spatial shape of the iteration space
+            function_name: name of the kernel, used for log output and as part of the cache key
+            magic_hash: extra hashable data (strides, shapes, generated code) that makes the disk_cache key unique
+        """
+        import pycuda.driver
+
+        @disk_cache
+        def _autotune_call_parameters(self,
+                                      call_shape,
+                                      num_profile_calls,
+                                      function_name,
+                                      block_sizes,
+                                      magic_hash  # needed for disk_cache
+                                      ):
+            current_best = self.call_parameters(call_shape)
+            best_timing = float('inf')  # any measured run time beats the initial value
+
+            print(f'Autotuning function {function_name}')
+            for block_size in block_sizes:
+                self._block_size = block_size
+                if isinstance(self, BlockIndexing):
+                    self._block_size = (
+                        BlockIndexing.permute_block_size_according_to_layout(self._block_size, self._layout))
+                block_and_thread_numbers = self.call_parameters(call_shape)
+                block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
+                block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])
+
+                # TODO(seitz) can we use the CUDA profiler?: pycuda.driver.start_profiler()
+                def profile_call():
+                    for i in range(num_profile_calls):
+                        partial_function(**block_and_thread_numbers)
+                    pycuda.driver.Context.synchronize()  # launches are asynchronous; block until all calls finished
+
+                current_time = timeit.timeit(profile_call, number=1)
+                print(f'{block_size} took {current_time:.6f}s for {num_profile_calls} calls')
+                if current_time < best_timing:
+                    best_timing = current_time
+                    current_best = block_and_thread_numbers
+
+            print(f'{current_best} is the best out of {block_sizes}')
+            self._block_size = current_best['block']
+            return current_best
+        return _autotune_call_parameters(self,
+                                         call_shape,
+                                         self.AUTOTUNE_NUM_CALLS,
+                                         function_name,
+                                         getattr(self, '_autotune_block_sizes', None) or self.AUTOTUNE_BLOCK_SIZES,
+                                         magic_hash)
+
 
 # -------------------------------------------- Implementations ---------------------------------------------------------
 
@@ -97,6 +153,8 @@ class BlockIndexing(AbstractIndexing):
                                                 gets the largest amount of threads
         compile_time_block_size: compile in concrete block size, otherwise the cuda variable 'blockDim' is used
     """
+    AUTOTUNE_BLOCK_SIZES = ((16, 16, 1), (32, 1, 1), (64, 1, 1), (96, 1, 1), (128, 1, 1), (160, 1, 1), (192, 1, 1),)
+    AUTOTUNE_NUM_CALLS = 10
 
     def __init__(self, field, iteration_slice,
                  block_size=(16, 16, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False,
@@ -118,14 +176,17 @@ class BlockIndexing(AbstractIndexing):
             maximum_block_size = tuple(device.get_attribute(a)
                                        for a in (da.MAX_BLOCK_DIM_X, da.MAX_BLOCK_DIM_Y, da.MAX_BLOCK_DIM_Z))
 
+        self._layout = field.layout
         self._maximum_block_size = maximum_block_size
         self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
         self._dim = field.spatial_dimensions
         self._symbolic_shape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]
         self._compile_time_block_size = compile_time_block_size
+        self._autotune_block_sizes = None
 
     @property
     def coordinates(self):
+        # TODO(seitz): require layout in constructor to rotate the thread indices: thread_idx == fastest
         offsets = _get_start_from_slice(self._iterationSlice)
         block_size = self._block_size if self._compile_time_block_size else BLOCK_DIM
         coordinates = [block_index * bs + thread_idx + off
@@ -227,6 +288,8 @@ class LineIndexing(AbstractIndexing):
     This indexing scheme supports up to 4 spatial dimensions, where the innermost dimensions is not larger than the
     maximum amount of threads allowed in a CUDA block (which depends on device).
     """
+    AUTOTUNE_BLOCK_SIZES = ((16, 1, 1), (32, 1, 1), (64, 1, 1), (96, 1, 1), (128, 1, 1), (160, 1, 1), (192, 1, 1),)
+    AUTOTUNE_NUM_CALLS = 10
 
     def __init__(self, field, iteration_slice):
         available_indices = [THREAD_IDX[0]] + BLOCK_IDX
diff --git a/pystencils/kernelcreation.py b/pystencils/kernelcreation.py
index 408cc506..cc71cdce 100644
--- a/pystencils/kernelcreation.py
+++ b/pystencils/kernelcreation.py
@@ -1,5 +1,5 @@
-from types import MappingProxyType
 from itertools import combinations
+from types import MappingProxyType
 
 import sympy as sp
 
@@ -27,7 +27,8 @@ def create_kernel(assignments,
                   gpu_indexing_params=MappingProxyType({}),
                   use_textures_for_interpolation=True,
                   cpu_prepend_optimizations=[],
-                  use_auto_for_assignments=False):
+                  use_auto_for_assignments=False,
+                  autotune_cuda_callparameters=False):
     """
     Creates abstract syntax tree (AST) of kernel, using a list of update equations.
 
@@ -121,6 +122,8 @@ def create_kernel(assignments,
         for a in ast.atoms(SympyAssignment):
             a.use_auto = True
 
+    if autotune_cuda_callparameters:
+        ast._autotune_options = True
     return ast
 
 
diff --git a/pystencils_tests/test_cudagpu.py b/pystencils_tests/test_cudagpu.py
index 77790229..a8ca3126 100644
--- a/pystencils_tests/test_cudagpu.py
+++ b/pystencils_tests/test_cudagpu.py
@@ -1,5 +1,6 @@
 import numpy as np
 import pycuda.gpuarray as gpuarray
+import pytest
 import sympy as sp
 from scipy.ndimage import convolve
 
@@ -35,6 +36,49 @@ def test_averaging_kernel():
     np.testing.assert_almost_equal(reference, dst_arr)
 
 
+@pytest.mark.parametrize('use_3d', ('use_3d', False))
+@pytest.mark.parametrize('use_fortran_layout', ('use_fortran_layout', False))
+def test_autotuning(use_fortran_layout, use_3d):
+    print(f'Use Fortran layout: {use_fortran_layout}')
+    if use_3d:
+        size = (256, 256, 256)
+    else:
+        size = (256, 256)
+    src_arr = np.random.rand(*size)
+    if use_fortran_layout:
+        src_arr = np.asfortranarray(src_arr)
+    src_arr = add_ghost_layers(src_arr)
+    print(src_arr.strides)
+    dst_arr = np.zeros_like(src_arr)
+    src_field = Field.create_from_numpy_array('src', src_arr)
+    dst_field = Field.create_from_numpy_array('dst', dst_arr)
+
+    if use_3d:
+        update_rules = (Assignment(dst_field[0, 0, 0],
+                                   (src_field[0, 0, 1] + src_field[0, 0, -1] + src_field[0, 1, 0] + src_field[0, -1, 0])
+                                   / 4),
+                        Assignment(dst_field[0, 0, 0],
+                                   (src_field[1, 0, 0] + src_field[-1, 0, 0] + src_field[0, 1, 0] + src_field[0, -1, 0])
+                                   / 4))
+    else:
+        update_rules = (Assignment(dst_field[0, 0],
+                                   (src_field[0, 1] + src_field[0, -1] + src_field[1, 0] + src_field[-1, 0])
+                                   / 4),
+                        Assignment(dst_field[0, 0],
+                                   (src_field[1, 0] + src_field[-1, 0] + src_field[0, 1] + src_field[0, -1])
+                                   / 4))
+
+    for i in range(2):
+        ast = create_cuda_kernel(sympy_cse_on_assignment_list([update_rules[i]]))
+        ast._autotune_options = True
+        kernel = make_python_function(ast)
+
+        gpu_src_arr = gpuarray.to_gpu(src_arr)
+        gpu_dst_arr = gpuarray.to_gpu(dst_arr)
+        kernel(src=gpu_src_arr, dst=gpu_dst_arr)
+        gpu_dst_arr.get(dst_arr)
+
+
 def test_variable_sized_fields():
     src_field = Field.create_generic('src', spatial_dimensions=2)
     dst_field = Field.create_generic('dst', spatial_dimensions=2)
-- 
GitLab