Merge branch 'halfP' into 'master'

Add experimental half precison support See merge request !332

Merge branch 'halfP' into 'master'
89e6b0f2 · Michael Kuron · e3f6d8da · d5c1c566 · 89e6b0f2 · 89e6b0f2
Commit 89e6b0f2 authored 2 years ago by Michael Kuron
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -143,7 +143,7 @@ ubuntu:
    - sed -i 's/--doctest-modules //g' pytest.ini
    - env
    - pip3 list
-    - pytest-3 -v -n $NUM_CORES --junitxml=report.xml pystencils_tests/test_*vec*.py pystencils_tests/test_random.py
+    - pytest-3 -v -n $NUM_CORES --junitxml=report.xml pystencils_tests/test_*vec*.py pystencils_tests/test_random.py pystencils_tests/test_half_precision.py
  tags:
    - docker
    - AVX
@@ -170,6 +170,7 @@ ppc64le:

 arm64v9:
  # SVE support is still unreliable in GCC 11 (incorrect code for fixed-width vectors, internal compiler errors).
+  # For half precision Clang is necessary
  extends: .multiarch_template
  image: i10git.cs.fau.de:5005/pycodegen/pycodegen/arm64
  before_script:

--- a/pystencils/cpu/cpujit.py
+++ b/pystencils/cpu/cpujit.py
@@ -43,26 +43,30 @@ Then 'cl.exe' is used to compile.
  For Windows compilers the qualifier should be ``__restrict``

 """
+from appdirs import user_cache_dir, user_config_dir
+from collections import OrderedDict
 import hashlib
+import importlib.util
 import json
 import os
 import platform
 import shutil
 import subprocess
+import sysconfig
+import tempfile
 import textwrap
-from collections import OrderedDict
-from sysconfig import get_paths
-from tempfile import TemporaryDirectory, NamedTemporaryFile
+import time
+import warnings

 import numpy as np
-from appdirs import user_cache_dir, user_config_dir

 from pystencils import FieldType
 from pystencils.astnodes import LoopOverCoordinate
 from pystencils.backends.cbackend import generate_c, get_headers, CFunction
-from pystencils.typing import CastFunc, VectorType, VectorMemoryAccess
+from pystencils.cpu.msvc_detection import get_environment
 from pystencils.include import get_pystencils_include_path
 from pystencils.kernel_wrapper import KernelWrapper
+from pystencils.typing import BasicType, CastFunc, VectorType, VectorMemoryAccess
 from pystencils.utils import atomic_file_write, recursive_dict_update


@@ -216,12 +220,11 @@ def read_config():
            shutil.rmtree(config['cache']['object_cache'], ignore_errors=True)

        create_folder(config['cache']['object_cache'], False)
-        with NamedTemporaryFile('w', dir=os.path.dirname(cache_status_file), delete=False) as f:
+        with tempfile.NamedTemporaryFile('w', dir=os.path.dirname(cache_status_file), delete=False) as f:
            json.dump(config['compiler'], f, indent=4)
        os.replace(f.name, cache_status_file)

    if config['compiler']['os'] == 'windows':
-        from pystencils.cpu.msvc_detection import get_environment
        msvc_env = get_environment(config['compiler']['msvc_version'], config['compiler']['arch'])
        if 'env' not in config['compiler']:
            config['compiler']['env'] = {}
@@ -470,18 +473,15 @@ def create_module_boilerplate_code(module_name, names):


 def load_kernel_from_file(module_name, function_name, path):
-    from importlib.util import spec_from_file_location, module_from_spec
    try:
-        spec = spec_from_file_location(name=module_name, location=path)
-        mod = module_from_spec(spec)
+        spec = importlib.util.spec_from_file_location(name=module_name, location=path)
+        mod = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(mod)
    except ImportError:
-        import time
-        import warnings
        warnings.warn(f"Could not load {path}, trying on more time in 5 seconds ...")
        time.sleep(5)
-        spec = spec_from_file_location(name=module_name, location=path)
-        mod = module_from_spec(spec)
+        spec = importlib.util.spec_from_file_location(name=module_name, location=path)
+        mod = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(mod)

    return getattr(mod, function_name)
@@ -520,9 +520,13 @@ class ExtensionModuleCode:

        headers = {'<math.h>', '<stdint.h>'}
        for ast in self._ast_nodes:
+            for field in ast.fields_accessed:
+                if isinstance(field.dtype, BasicType) and field.dtype.is_half():
+                    # Add the half precision header only if half precision numbers occur in the AST
+                    headers.add('"half_precision.h"')
            headers.update(get_headers(ast))
-        header_list = list(headers)
-        header_list.sort()
+
+        header_list = sorted(headers)
        header_list.insert(0, '"Python.h"')
        ps_headers = [os.path.join(os.path.dirname(__file__), '..', 'include', h[1:-1]) for h in header_list
                      if os.path.exists(os.path.join(os.path.dirname(__file__), '..', 'include', h[1:-1]))]
@@ -559,7 +563,7 @@ def compile_module(code, code_hash, base_dir, compile_flags=None):
        compile_flags = []

    compiler_config = get_compiler_config()
-    extra_flags = ['-I' + get_paths()['include'], '-I' + get_pystencils_include_path()] + compile_flags
+    extra_flags = ['-I' + sysconfig.get_paths()['include'], '-I' + get_pystencils_include_path()] + compile_flags

    if compiler_config['os'].lower() == 'windows':
        lib_suffix = '.pyd'
@@ -593,7 +597,6 @@ def compile_module(code, code_hash, base_dir, compile_flags=None):

        # Linking
        if windows:
-            import sysconfig
            config_vars = sysconfig.get_config_vars()
            py_lib = os.path.join(config_vars["installed_base"], "libs",
                                  f"python{config_vars['py_version_nodot']}.lib")
@@ -627,7 +630,7 @@ def compile_and_load(ast, custom_backend=None):
        compile_flags = ast.instruction_set['compile_flags']

    if cache_config['object_cache'] is False:
-        with TemporaryDirectory() as base_dir:
+        with tempfile.TemporaryDirectory() as base_dir:
            lib_file = compile_module(code, code_hash_str, base_dir, compile_flags=compile_flags)
            result = load_kernel_from_file(code_hash_str, ast.function_name, lib_file)
    else:

--- a/pystencils/gpu/__init__.py
+++ b/pystencils/gpu/__init__.py
 from pystencils.gpu.gpu_array_handler import GPUArrayHandler, GPUNotAvailableHandler
-from pystencils.gpu.cudajit import make_python_function
+from pystencils.gpu.gpujit import make_python_function
 from pystencils.gpu.kernelcreation import create_cuda_kernel, created_indexed_cuda_kernel

 from .indexing import AbstractIndexing, BlockIndexing, LineIndexing

--- a/pystencils/gpu/cudajit.py
+++ b/pystencils/gpu/cudajit.py
@@ -7,7 +7,7 @@ from pystencils.typing import StructType
 from pystencils.field import FieldType
 from pystencils.include import get_pystencils_include_path
 from pystencils.kernel_wrapper import KernelWrapper
-from pystencils.typing.typed_sympy import FieldPointerSymbol
+from pystencils.typing import BasicType, FieldPointerSymbol

 USE_FAST_MATH = True

@@ -39,26 +39,29 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
    if argument_dict is None:
        argument_dict = {}

+    headers = get_headers(kernel_function_node)
    if cp.cuda.runtime.is_hip:
-        header_list = ['"gpu_defines.h"'] + list(get_headers(kernel_function_node))
+        headers.add('"gpu_defines.h"')
    else:
-        header_list = ['"gpu_defines.h"', '<cstdint>'] + list(get_headers(kernel_function_node))
+        headers.update({'"gpu_defines.h"', '<cstdint>'})
+        for field in kernel_function_node.fields_accessed:
+            if isinstance(field.dtype, BasicType) and field.dtype.is_half():
+                headers.add('<cuda_fp16.h>')
+
+    header_list = sorted(headers)
    includes = "\n".join([f"#include {include_file}" for include_file in header_list])

    code = includes + "\n"
    code += "#define FUNC_PREFIX __global__\n"
    code += "#define RESTRICT __restrict__\n\n"
-    code += str(generate_cuda(kernel_function_node, custom_backend=custom_backend))
-    code = 'extern "C" {\n%s\n}\n' % code
+    code += 'extern "C" {\n%s\n}\n' % str(generate_cuda(kernel_function_node, custom_backend=custom_backend))

    options = ["-w", "-std=c++11"]
    if USE_FAST_MATH:
        options.append("-use_fast_math")
    options.append("-I" + get_pystencils_include_path())

-    mod = cp.RawModule(code=code, options=tuple(options), backend="nvrtc", jitify=True)
-    func = mod.get_function(kernel_function_node.function_name)
-
+    func = cp.RawKernel(code, kernel_function_node.function_name, options=tuple(options), backend="nvrtc", jitify=True)
    parameters = kernel_function_node.get_parameters()

    cache = {}

--- a/pystencils/gpu/kernelcreation.py
+++ b/pystencils/gpu/kernelcreation.py
@@ -8,7 +8,7 @@ from pystencils.typing import StructType, TypedSymbol
 from pystencils.typing.transformations import add_types
 from pystencils.field import Field, FieldType
 from pystencils.enums import Target, Backend
-from pystencils.gpu.cudajit import make_python_function
+from pystencils.gpu.gpujit import make_python_function
 from pystencils.node_collection import NodeCollection
 from pystencils.gpu.indexing import indexing_creator_from_params
 from pystencils.simp.assignment_collection import AssignmentCollection

--- a/pystencils/include/half_precision.h
+++ b/pystencils/include/half_precision.h
+/// Half precision support. Experimental. Use carefully.
+///
+/// This feature is experimental, since it strictly depends on the underlying architecture and compiler support.
+/// On x86 architectures, what you can expect is that the data format is supported natively only for storage and
+/// interchange. Arithmetic operations will likely involve casting to fp32 (C++ float) and truncation to fp16.
+/// Only bandwidth bound code may therefore benefit. None of this is guaranteed, and may change in the future.
+
+/// Clang version must be 15 or higher for x86 half precision support.
+/// GCC version must be 12 or higher for x86 half precision support.
+/// Also support seems to require SSE, so ensure that respective instruction sets are enabled.
+/// See
+///   https://clang.llvm.org/docs/LanguageExtensions.html#half-precision-floating-point
+///   https://gcc.gnu.org/onlinedocs/gcc/Half-Precision.html
+/// for more information.
+
+#pragma once
+using half    = _Float16;
--- a/pystencils/typing/types.py
+++ b/pystencils/typing/types.py
@@ -25,6 +25,8 @@ def numpy_name_to_c(name: str) -> str:
        return 'double'
    elif name == 'float32':
        return 'float'
+    elif name == 'float16' or name == 'half':
+        return 'half'
    elif name.startswith('int'):
        width = int(name[len("int"):])
        return f"int{width}_t"
@@ -94,6 +96,9 @@ class BasicType(AbstractType):
    def is_float(self):
        return issubclass(self.numpy_dtype.type, np.floating)

+    def is_half(self):
+        return issubclass(self.numpy_dtype.type, np.half)
+
    def is_int(self):
        return issubclass(self.numpy_dtype.type, np.integer)


--- a/pystencils_tests/test_custom_backends.py
+++ b/pystencils_tests/test_custom_backends.py
@@ -38,7 +38,7 @@ def test_custom_backends_cpu():
 def test_custom_backends_gpu():
    pytest.importorskip('cupy')
    import cupy
-    import pystencils.gpu.cudajit
+    import pystencils.gpu.gpujit

    z, x, y = pystencils.fields("z, y, x: [2d]")

@@ -48,4 +48,4 @@ def test_custom_backends_gpu():
    ast = pystencils.create_kernel(normal_assignments, target=Target.GPU)
    pystencils.show_code(ast, ScreamingGpuBackend())
    with pytest.raises(cupy.cuda.compiler.JitifyException):
-        pystencils.gpu.cudajit.make_python_function(ast, custom_backend=ScreamingGpuBackend())
+        pystencils.gpu.gpujit.make_python_function(ast, custom_backend=ScreamingGpuBackend())
--- a/pystencils_tests/test_half_precision.py
+++ b/pystencils_tests/test_half_precision.py
+import pytest
+import platform
+
+import numpy as np
+import pystencils as ps
+
+
+@pytest.mark.parametrize('target', (ps.Target.CPU, ps.Target.GPU))
+def test_half_precison(target):
+    if target == ps.Target.CPU:
+        if not platform.machine() in ['arm64', 'aarch64']:
+            pytest.xfail("skipping half precision test on non arm platform")
+
+        if 'clang' not in ps.cpu.cpujit.get_compiler_config()['command']:
+            pytest.xfail("skipping half precision because clang compiler is not used")
+
+    if target == ps.Target.GPU:
+        pytest.importorskip("cupy")
+
+    dh = ps.create_data_handling(domain_size=(10, 10), default_target=target)
+
+    f1 = dh.add_array("f1", values_per_cell=1, dtype=np.float16)
+    dh.fill("f1", 1.0, ghost_layers=True)
+    f2 = dh.add_array("f2", values_per_cell=1, dtype=np.float16)
+    dh.fill("f2", 2.0, ghost_layers=True)
+
+    f3 = dh.add_array("f3", values_per_cell=1, dtype=np.float16)
+    dh.fill("f3", 0.0, ghost_layers=True)
+
+    up = ps.Assignment(f3.center, f1.center + 2.1 * f2.center)
+
+    config = ps.CreateKernelConfig(target=dh.default_target, default_number_float=np.float32)
+    ast = ps.create_kernel(up, config=config)
+
+    kernel = ast.compile()
+
+    dh.run_kernel(kernel)
+    dh.all_to_cpu()
+
+    assert np.all(dh.cpu_arrays[f3.name] == 5.2)
+    assert dh.cpu_arrays[f3.name].dtype == np.float16