From ba7b20ac7e1021dc5fd7007637ab4597cc0dd453 Mon Sep 17 00:00:00 2001
From: Stephan Seitz <stephan.seitz@fau.de>
Date: Thu, 12 Dec 2019 15:00:41 +0100
Subject: [PATCH] Add 'cuda' config section (with preferred_block_size and
 always_autotune)

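The options live in a new 'cuda' section of the pystencils
configuration file, alongside the existing 'compiler' and 'cache'
sections, and are exposed through a new get_cuda_config() accessor:

- 'always_autotune' (default: False): autotune the CUDA call
  parameters on the first kernel call even for kernels that were not
  generated with do_cudaautotune set.
- 'preferred_block_size' (default: (16, 16, 1)): the default block
  size used by BlockIndexing, previously hard-coded.

As an illustration (assuming the JSON config file read by
read_config() at the location reported by
get_configuration_file_path()), the defaults can be overridden with:

    {
        "cuda": {
            "always_autotune": true,
            "preferred_block_size": [32, 8, 1]
        }
    }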
---
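Notes (not part of the commit message): a minimal sketch of the
intended use of the new accessor; only get_cuda_config() and the two
keys come from this patch, the surrounding snippet is illustrative.

    from pystencils.cpu.cpujit import get_cuda_config

    cuda_cfg = get_cuda_config()
    cuda_cfg['always_autotune']       # False unless overridden
    cuda_cfg['preferred_block_size']  # (16, 16, 1) unless overridden
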
 pystencils/cpu/cpujit.py       | 12 +++++++++++-
 pystencils/gpucuda/cudajit.py  |  3 ++-
 pystencils/gpucuda/indexing.py |  8 ++++++--
 3 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/pystencils/cpu/cpujit.py b/pystencils/cpu/cpujit.py
index 6376ffb85..80d8b96ff 100644
--- a/pystencils/cpu/cpujit.py
+++ b/pystencils/cpu/cpujit.py
@@ -175,9 +175,15 @@ def read_config():
         ('object_cache', os.path.join(user_cache_dir('pystencils'), 'objectcache')),
         ('clear_cache_on_start', False),
     ])
+    default_cuda_config = OrderedDict([
+        ('always_autotune', False),
+        ('preferred_block_size', (16, 16, 1)),
+    ])
 
     default_config = OrderedDict([('compiler', default_compiler_config),
-                                  ('cache', default_cache_config)])
+                                  ('cache', default_cache_config),
+                                  ('cuda', default_cuda_config)
+                                  ])
 
     config_path, config_exists = get_configuration_file_path()
     config = default_config.copy()
@@ -219,6 +225,10 @@ def get_cache_config():
     return _config['cache']
 
 
+def get_cuda_config():
+    return _config['cuda']
+
+
 def add_or_change_compiler_flags(flags):
     if not isinstance(flags, list) and not isinstance(flags, tuple):
         flags = [flags]
diff --git a/pystencils/gpucuda/cudajit.py b/pystencils/gpucuda/cudajit.py
index 638010609..c249a6756 100644
--- a/pystencils/gpucuda/cudajit.py
+++ b/pystencils/gpucuda/cudajit.py
@@ -4,6 +4,7 @@ import numpy as np
 
 import pystencils
 from pystencils.backends.cbackend import generate_c, get_headers
+from pystencils.cpu.cpujit import get_cuda_config
 from pystencils.data_types import StructType
 from pystencils.field import FieldType
 from pystencils.gpucuda.texture_utils import ndarray_to_tex
@@ -88,7 +89,7 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
                                tex.filter_mode, tex.use_normalized_coordinates, tex.read_as_integer)
             args = _build_numpy_argument_list(parameters, full_arguments)
             indexing = kernel_function_node.indexing
-            if kernel_function_node.do_cudaautotune:
+            if kernel_function_node.do_cudaautotune or get_cuda_config()['always_autotune']:
                 block_and_thread_numbers = (
                     indexing.autotune_call_parameters(partial(func, *args),
                                                       shape,
diff --git a/pystencils/gpucuda/indexing.py b/pystencils/gpucuda/indexing.py
index bf8b53027..0cd18e9ba 100644
--- a/pystencils/gpucuda/indexing.py
+++ b/pystencils/gpucuda/indexing.py
@@ -7,6 +7,7 @@ from sympy.core.cache import cacheit
 
 from pystencils.astnodes import Block, Conditional
 from pystencils.cache import disk_cache
+from pystencils.cpu.cpujit import get_cuda_config
 from pystencils.data_types import TypedSymbol, create_type
 from pystencils.integer_functions import div_ceil, div_floor
 from pystencils.slicing import normalize_slice
@@ -130,7 +131,7 @@ class AbstractIndexing(abc.ABC):
                     current_best = block_and_thread_numbers
 
             print(f'{current_best} is the best out of {self._autotune_block_sizes or self.AUTOTUNE_BLOCK_SIZES}')
-            self._block_size = current_best
+            self._block_size = current_best['block']
             return current_best
         return _autotune_call_parameters(self,
                                          call_shape,
@@ -157,7 +158,9 @@ class BlockIndexing(AbstractIndexing):
     AUTOTUNE_NUM_CALLS = 10
 
     def __init__(self, field, iteration_slice,
-                 block_size=(16, 16, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False,
+                 block_size=tuple(get_cuda_config()['preferred_block_size']),
+                 permute_block_size_dependent_on_layout=True,
+                 compile_time_block_size=False,
                  maximum_block_size=(1024, 1024, 64)):
         if field.spatial_dimensions > 3:
             raise NotImplementedError("This indexing scheme supports at most 3 spatial dimensions")
@@ -304,6 +307,7 @@ class LineIndexing(AbstractIndexing):
         self._coordinates = coordinates
         self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
         self._symbolicShape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]
+        self._autotune_block_sizes = None
 
     @property
     def coordinates(self):
-- 
GitLab