pycodegen / pystencils · Commits

Commit ba7b20ac, authored 5 years ago by Stephan Seitz:
Add 'cuda' compiler config (with preferred_block_size and always_autotune)
Parent: 0800d84a
No related branches or tags found.
Merge request: !106 (WIP: Cuda autotune)
Showing 3 changed files, with 20 additions and 4 deletions:

    pystencils/cpu/cpujit.py          +11  −1
    pystencils/gpucuda/cudajit.py      +2  −1
    pystencils/gpucuda/indexing.py     +7  −2
pystencils/cpu/cpujit.py (+11 −1)
@@ -175,9 +175,15 @@ def read_config():
         ('object_cache', os.path.join(user_cache_dir('pystencils'), 'objectcache')),
         ('clear_cache_on_start', False),
     ])
+    default_cuda_config = OrderedDict([
+        ('always_autotune', False),
+        ('preferred_block_size', (16, 16, 1)),
+    ])
     default_config = OrderedDict([('compiler', default_compiler_config),
-                                  ('cache', default_cache_config)])
+                                  ('cache', default_cache_config),
+                                  ('cuda', default_cuda_config)
+                                  ])
     config_path, config_exists = get_configuration_file_path()
     config = default_config.copy()

@@ -219,6 +225,10 @@ def get_cache_config():
     return _config['cache']
+
+
+def get_cuda_config():
+    return _config['cuda']
 
 
 def add_or_change_compiler_flags(flags):
     if not isinstance(flags, list) and not isinstance(flags, tuple):
         flags = [flags]
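Taken together, these hunks register a third top-level config section alongside 'compiler' and 'cache'. A minimal usage sketch, assuming this commit is applied; since get_cuda_config() hands back _config['cuda'] directly (no defensive copy is made above), mutating the returned dict changes the in-memory config process-wide:

    from pystencils.cpu.cpujit import get_cuda_config

    cuda_config = get_cuda_config()
    print(cuda_config['always_autotune'])       # -> False (default)
    print(cuda_config['preferred_block_size'])  # -> (16, 16, 1) (default)

    # The returned dict is the live config itself, so overriding a value
    # here affects all subsequently compiled/launched kernels:
    cuda_config['always_autotune'] = True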
pystencils/gpucuda/cudajit.py (+2 −1)
@@ -4,6 +4,7 @@ import numpy as np
 import pystencils
 from pystencils.backends.cbackend import generate_c, get_headers
+from pystencils.cpu.cpujit import get_cuda_config
 from pystencils.data_types import StructType
 from pystencils.field import FieldType
 from pystencils.gpucuda.texture_utils import ndarray_to_tex

@@ -88,7 +89,7 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
                               tex.filter_mode, tex.use_normalized_coordinates, tex.read_as_integer)
         args = _build_numpy_argument_list(parameters, full_arguments)
         indexing = kernel_function_node.indexing
-        if kernel_function_node.do_cudaautotune:
+        if kernel_function_node.do_cudaautotune or get_cuda_config()['always_autotune']:
             block_and_thread_numbers = (
                 indexing.autotune_call_parameters(partial(func, *args),
                                                   shape,
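The behavioral change here is confined to the autotune trigger: tuning now runs if either the kernel node opted in or the global config switch is set. A hedged sketch of the equivalent predicate (the helper name should_autotune is hypothetical, not part of the commit):

    from pystencils.cpu.cpujit import get_cuda_config

    def should_autotune(kernel_function_node):
        # Mirrors the condition in make_python_function after this commit:
        # a per-kernel opt-in OR the global 'always_autotune' config switch.
        return (kernel_function_node.do_cudaautotune
                or get_cuda_config()['always_autotune'])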
pystencils/gpucuda/indexing.py (+7 −2)
@@ -7,6 +7,7 @@ from sympy.core.cache import cacheit
 from pystencils.astnodes import Block, Conditional
 from pystencils.cache import disk_cache
+from pystencils.cpu.cpujit import get_cuda_config
 from pystencils.data_types import TypedSymbol, create_type
 from pystencils.integer_functions import div_ceil, div_floor
 from pystencils.slicing import normalize_slice

@@ -130,7 +131,7 @@ class AbstractIndexing(abc.ABC):
                     current_best = block_and_thread_numbers
             print(f'{current_best} is the best out of {self._autotune_block_sizes or self.AUTOTUNE_BLOCK_SIZES}')
-            self._block_size = current_best
+            self._block_size = current_best['block']
             return current_best
         return _autotune_call_parameters(self,
                                          call_shape,

@@ -157,7 +158,10 @@ class BlockIndexing(AbstractIndexing):
     AUTOTUNE_NUM_CALLS = 10
 
     def __init__(self, field, iteration_slice,
-                 block_size=(16, 16, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False,
+                 block_size=tuple(get_cuda_config()['preferred_block_size']),
+                 permute_block_size_dependent_on_layout=True,
+                 compile_time_block_size=False,
                  maximum_block_size=(1024, 1024, 64)):
         if field.spatial_dimensions > 3:
             raise NotImplementedError("This indexing scheme supports at most 3 spatial dimensions")

@@ -304,6 +308,7 @@ class LineIndexing(AbstractIndexing):
         self._coordinates = coordinates
         self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
         self._symbolicShape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]
+        self._autotune_block_sizes = None
 
     @property
     def coordinates(self):
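Two details worth noting in the BlockIndexing change. First, the tuple(...) wrapper normalizes the configured value, so a preferred_block_size that deserializes as a list (an assumption about how a file-backed config would round-trip) behaves like the old (16, 16, 1) literal. Second, as with any Python default argument, it is evaluated once, when indexing.py is imported, so config changes made later in the process do not alter the default. A small sketch of the normalization:

    from pystencils.cpu.cpujit import get_cuda_config

    # A config section read back from disk would typically hold a list,
    # e.g. [16, 16, 1]; tuple() makes it interchangeable with the old literal.
    block_size = tuple(get_cuda_config()['preferred_block_size'])
    assert block_size == (16, 16, 1)  # holds for the defaults added in this commit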