From d5c1c566378a38f7f71a801fedaa9d41e781d2c1 Mon Sep 17 00:00:00 2001
From: Markus Holzer <markus.holzer@fau.de>
Date: Wed, 28 Jun 2023 20:31:47 +0200
Subject: [PATCH] Add experimental half precison support

---
 .gitlab-ci.yml                           |  3 +-
 pystencils/cpu/cpujit.py                 | 41 +++++++++++++-----------
 pystencils/gpu/__init__.py               |  2 +-
 pystencils/gpu/{cudajit.py => gpujit.py} | 19 ++++++-----
 pystencils/gpu/kernelcreation.py         |  2 +-
 pystencils/include/half_precision.h      | 17 ++++++++++
 pystencils/typing/types.py               |  5 +++
 pystencils_tests/test_custom_backends.py |  4 +--
 pystencils_tests/test_half_precision.py  | 41 ++++++++++++++++++++++++
 9 files changed, 102 insertions(+), 32 deletions(-)
 rename pystencils/gpu/{cudajit.py => gpujit.py} (92%)
 create mode 100644 pystencils/include/half_precision.h
 create mode 100644 pystencils_tests/test_half_precision.py

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index f338f2740..12ef5bf48 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -143,7 +143,7 @@ ubuntu:
     - sed -i 's/--doctest-modules //g' pytest.ini
     - env
     - pip3 list
-    - pytest-3 -v -n $NUM_CORES --junitxml=report.xml pystencils_tests/test_*vec*.py pystencils_tests/test_random.py
+    - pytest-3 -v -n $NUM_CORES --junitxml=report.xml pystencils_tests/test_*vec*.py pystencils_tests/test_random.py pystencils_tests/test_half_precision.py
   tags:
     - docker
     - AVX
@@ -170,6 +170,7 @@ ppc64le:
 
 arm64v9:
   # SVE support is still unreliable in GCC 11 (incorrect code for fixed-width vectors, internal compiler errors).
+  # For half precision Clang is necessary
   extends: .multiarch_template
   image: i10git.cs.fau.de:5005/pycodegen/pycodegen/arm64
   before_script:
diff --git a/pystencils/cpu/cpujit.py b/pystencils/cpu/cpujit.py
index 4c3febe29..b839f87cf 100644
--- a/pystencils/cpu/cpujit.py
+++ b/pystencils/cpu/cpujit.py
@@ -43,26 +43,30 @@ Then 'cl.exe' is used to compile.
   For Windows compilers the qualifier should be ``__restrict``
 
 """
+from appdirs import user_cache_dir, user_config_dir
+from collections import OrderedDict
 import hashlib
+import importlib.util
 import json
 import os
 import platform
 import shutil
 import subprocess
+import sysconfig
+import tempfile
 import textwrap
-from collections import OrderedDict
-from sysconfig import get_paths
-from tempfile import TemporaryDirectory, NamedTemporaryFile
+import time
+import warnings
 
 import numpy as np
-from appdirs import user_cache_dir, user_config_dir
 
 from pystencils import FieldType
 from pystencils.astnodes import LoopOverCoordinate
 from pystencils.backends.cbackend import generate_c, get_headers, CFunction
-from pystencils.typing import CastFunc, VectorType, VectorMemoryAccess
+from pystencils.cpu.msvc_detection import get_environment
 from pystencils.include import get_pystencils_include_path
 from pystencils.kernel_wrapper import KernelWrapper
+from pystencils.typing import BasicType, CastFunc, VectorType, VectorMemoryAccess
 from pystencils.utils import atomic_file_write, recursive_dict_update
 
 
@@ -216,12 +220,11 @@ def read_config():
             shutil.rmtree(config['cache']['object_cache'], ignore_errors=True)
 
         create_folder(config['cache']['object_cache'], False)
-        with NamedTemporaryFile('w', dir=os.path.dirname(cache_status_file), delete=False) as f:
+        with tempfile.NamedTemporaryFile('w', dir=os.path.dirname(cache_status_file), delete=False) as f:
             json.dump(config['compiler'], f, indent=4)
         os.replace(f.name, cache_status_file)
 
     if config['compiler']['os'] == 'windows':
-        from pystencils.cpu.msvc_detection import get_environment
         msvc_env = get_environment(config['compiler']['msvc_version'], config['compiler']['arch'])
         if 'env' not in config['compiler']:
             config['compiler']['env'] = {}
@@ -470,18 +473,15 @@ def create_module_boilerplate_code(module_name, names):
 
 
 def load_kernel_from_file(module_name, function_name, path):
-    from importlib.util import spec_from_file_location, module_from_spec
     try:
-        spec = spec_from_file_location(name=module_name, location=path)
-        mod = module_from_spec(spec)
+        spec = importlib.util.spec_from_file_location(name=module_name, location=path)
+        mod = importlib.util.module_from_spec(spec)
         spec.loader.exec_module(mod)
     except ImportError:
-        import time
-        import warnings
         warnings.warn(f"Could not load {path}, trying on more time in 5 seconds ...")
         time.sleep(5)
-        spec = spec_from_file_location(name=module_name, location=path)
-        mod = module_from_spec(spec)
+        spec = importlib.util.spec_from_file_location(name=module_name, location=path)
+        mod = importlib.util.module_from_spec(spec)
         spec.loader.exec_module(mod)
 
     return getattr(mod, function_name)
@@ -520,9 +520,13 @@ class ExtensionModuleCode:
 
         headers = {'<math.h>', '<stdint.h>'}
         for ast in self._ast_nodes:
+            for field in ast.fields_accessed:
+                if isinstance(field.dtype, BasicType) and field.dtype.is_half():
+                    # Add the half precision header only if half precision numbers occur in the AST
+                    headers.add('"half_precision.h"')
             headers.update(get_headers(ast))
-        header_list = list(headers)
-        header_list.sort()
+
+        header_list = sorted(headers)
         header_list.insert(0, '"Python.h"')
         ps_headers = [os.path.join(os.path.dirname(__file__), '..', 'include', h[1:-1]) for h in header_list
                       if os.path.exists(os.path.join(os.path.dirname(__file__), '..', 'include', h[1:-1]))]
@@ -559,7 +563,7 @@ def compile_module(code, code_hash, base_dir, compile_flags=None):
         compile_flags = []
 
     compiler_config = get_compiler_config()
-    extra_flags = ['-I' + get_paths()['include'], '-I' + get_pystencils_include_path()] + compile_flags
+    extra_flags = ['-I' + sysconfig.get_paths()['include'], '-I' + get_pystencils_include_path()] + compile_flags
 
     if compiler_config['os'].lower() == 'windows':
         lib_suffix = '.pyd'
@@ -593,7 +597,6 @@ def compile_module(code, code_hash, base_dir, compile_flags=None):
 
         # Linking
         if windows:
-            import sysconfig
             config_vars = sysconfig.get_config_vars()
             py_lib = os.path.join(config_vars["installed_base"], "libs",
                                   f"python{config_vars['py_version_nodot']}.lib")
@@ -627,7 +630,7 @@ def compile_and_load(ast, custom_backend=None):
         compile_flags = ast.instruction_set['compile_flags']
 
     if cache_config['object_cache'] is False:
-        with TemporaryDirectory() as base_dir:
+        with tempfile.TemporaryDirectory() as base_dir:
             lib_file = compile_module(code, code_hash_str, base_dir, compile_flags=compile_flags)
             result = load_kernel_from_file(code_hash_str, ast.function_name, lib_file)
     else:
diff --git a/pystencils/gpu/__init__.py b/pystencils/gpu/__init__.py
index 8a3093731..c0d1fd34d 100644
--- a/pystencils/gpu/__init__.py
+++ b/pystencils/gpu/__init__.py
@@ -1,5 +1,5 @@
 from pystencils.gpu.gpu_array_handler import GPUArrayHandler, GPUNotAvailableHandler
-from pystencils.gpu.cudajit import make_python_function
+from pystencils.gpu.gpujit import make_python_function
 from pystencils.gpu.kernelcreation import create_cuda_kernel, created_indexed_cuda_kernel
 
 from .indexing import AbstractIndexing, BlockIndexing, LineIndexing
diff --git a/pystencils/gpu/cudajit.py b/pystencils/gpu/gpujit.py
similarity index 92%
rename from pystencils/gpu/cudajit.py
rename to pystencils/gpu/gpujit.py
index 950445ff2..e29f85d43 100644
--- a/pystencils/gpu/cudajit.py
+++ b/pystencils/gpu/gpujit.py
@@ -7,7 +7,7 @@ from pystencils.typing import StructType
 from pystencils.field import FieldType
 from pystencils.include import get_pystencils_include_path
 from pystencils.kernel_wrapper import KernelWrapper
-from pystencils.typing.typed_sympy import FieldPointerSymbol
+from pystencils.typing import BasicType, FieldPointerSymbol
 
 USE_FAST_MATH = True
 
@@ -39,26 +39,29 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
     if argument_dict is None:
         argument_dict = {}
 
+    headers = get_headers(kernel_function_node)
     if cp.cuda.runtime.is_hip:
-        header_list = ['"gpu_defines.h"'] + list(get_headers(kernel_function_node))
+        headers.add('"gpu_defines.h"')
     else:
-        header_list = ['"gpu_defines.h"', '<cstdint>'] + list(get_headers(kernel_function_node))
+        headers.update({'"gpu_defines.h"', '<cstdint>'})
+        for field in kernel_function_node.fields_accessed:
+            if isinstance(field.dtype, BasicType) and field.dtype.is_half():
+                headers.add('<cuda_fp16.h>')
+
+    header_list = sorted(headers)
     includes = "\n".join([f"#include {include_file}" for include_file in header_list])
 
     code = includes + "\n"
     code += "#define FUNC_PREFIX __global__\n"
     code += "#define RESTRICT __restrict__\n\n"
-    code += str(generate_cuda(kernel_function_node, custom_backend=custom_backend))
-    code = 'extern "C" {\n%s\n}\n' % code
+    code += 'extern "C" {\n%s\n}\n' % str(generate_cuda(kernel_function_node, custom_backend=custom_backend))
 
     options = ["-w", "-std=c++11"]
     if USE_FAST_MATH:
         options.append("-use_fast_math")
     options.append("-I" + get_pystencils_include_path())
 
-    mod = cp.RawModule(code=code, options=tuple(options), backend="nvrtc", jitify=True)
-    func = mod.get_function(kernel_function_node.function_name)
-
+    func = cp.RawKernel(code, kernel_function_node.function_name, options=tuple(options), backend="nvrtc", jitify=True)
     parameters = kernel_function_node.get_parameters()
 
     cache = {}
diff --git a/pystencils/gpu/kernelcreation.py b/pystencils/gpu/kernelcreation.py
index 78a7355c6..066038cde 100644
--- a/pystencils/gpu/kernelcreation.py
+++ b/pystencils/gpu/kernelcreation.py
@@ -8,7 +8,7 @@ from pystencils.typing import StructType, TypedSymbol
 from pystencils.typing.transformations import add_types
 from pystencils.field import Field, FieldType
 from pystencils.enums import Target, Backend
-from pystencils.gpu.cudajit import make_python_function
+from pystencils.gpu.gpujit import make_python_function
 from pystencils.node_collection import NodeCollection
 from pystencils.gpu.indexing import indexing_creator_from_params
 from pystencils.simp.assignment_collection import AssignmentCollection
diff --git a/pystencils/include/half_precision.h b/pystencils/include/half_precision.h
new file mode 100644
index 000000000..0bba7b86d
--- /dev/null
+++ b/pystencils/include/half_precision.h
@@ -0,0 +1,17 @@
+/// Half precision support. Experimental. Use carefully.
+///
+/// This feature is experimental, since it strictly depends on the underlying architecture and compiler support.
+/// On x86 architectures, what you can expect is that the data format is supported natively only for storage and
+/// interchange. Arithmetic operations will likely involve casting to fp32 (C++ float) and truncation to fp16.
+/// Only bandwidth bound code may therefore benefit. None of this is guaranteed, and may change in the future.
+
+/// Clang version must be 15 or higher for x86 half precision support.
+/// GCC version must be 12 or higher for x86 half precision support.
+/// Also support seems to require SSE, so ensure that respective instruction sets are enabled.
+/// See
+///   https://clang.llvm.org/docs/LanguageExtensions.html#half-precision-floating-point
+///   https://gcc.gnu.org/onlinedocs/gcc/Half-Precision.html
+/// for more information.
+
+#pragma once
+using half    = _Float16;
diff --git a/pystencils/typing/types.py b/pystencils/typing/types.py
index d1c473a0a..531a8e290 100644
--- a/pystencils/typing/types.py
+++ b/pystencils/typing/types.py
@@ -25,6 +25,8 @@ def numpy_name_to_c(name: str) -> str:
         return 'double'
     elif name == 'float32':
         return 'float'
+    elif name == 'float16' or name == 'half':
+        return 'half'
     elif name.startswith('int'):
         width = int(name[len("int"):])
         return f"int{width}_t"
@@ -94,6 +96,9 @@ class BasicType(AbstractType):
     def is_float(self):
         return issubclass(self.numpy_dtype.type, np.floating)
 
+    def is_half(self):
+        return issubclass(self.numpy_dtype.type, np.half)
+
     def is_int(self):
         return issubclass(self.numpy_dtype.type, np.integer)
 
diff --git a/pystencils_tests/test_custom_backends.py b/pystencils_tests/test_custom_backends.py
index 645fbae86..9b625f8f9 100644
--- a/pystencils_tests/test_custom_backends.py
+++ b/pystencils_tests/test_custom_backends.py
@@ -38,7 +38,7 @@ def test_custom_backends_cpu():
 def test_custom_backends_gpu():
     pytest.importorskip('cupy')
     import cupy
-    import pystencils.gpu.cudajit
+    import pystencils.gpu.gpujit
 
     z, x, y = pystencils.fields("z, y, x: [2d]")
 
@@ -48,4 +48,4 @@ def test_custom_backends_gpu():
     ast = pystencils.create_kernel(normal_assignments, target=Target.GPU)
     pystencils.show_code(ast, ScreamingGpuBackend())
     with pytest.raises(cupy.cuda.compiler.JitifyException):
-        pystencils.gpu.cudajit.make_python_function(ast, custom_backend=ScreamingGpuBackend())
+        pystencils.gpu.gpujit.make_python_function(ast, custom_backend=ScreamingGpuBackend())
diff --git a/pystencils_tests/test_half_precision.py b/pystencils_tests/test_half_precision.py
new file mode 100644
index 000000000..6d55d1f0e
--- /dev/null
+++ b/pystencils_tests/test_half_precision.py
@@ -0,0 +1,41 @@
+import pytest
+import platform
+
+import numpy as np
+import pystencils as ps
+
+
+@pytest.mark.parametrize('target', (ps.Target.CPU, ps.Target.GPU))
+def test_half_precison(target):
+    if target == ps.Target.CPU:
+        if not platform.machine() in ['arm64', 'aarch64']:
+            pytest.xfail("skipping half precision test on non arm platform")
+
+        if 'clang' not in ps.cpu.cpujit.get_compiler_config()['command']:
+            pytest.xfail("skipping half precision because clang compiler is not used")
+
+    if target == ps.Target.GPU:
+        pytest.importorskip("cupy")
+
+    dh = ps.create_data_handling(domain_size=(10, 10), default_target=target)
+
+    f1 = dh.add_array("f1", values_per_cell=1, dtype=np.float16)
+    dh.fill("f1", 1.0, ghost_layers=True)
+    f2 = dh.add_array("f2", values_per_cell=1, dtype=np.float16)
+    dh.fill("f2", 2.0, ghost_layers=True)
+
+    f3 = dh.add_array("f3", values_per_cell=1, dtype=np.float16)
+    dh.fill("f3", 0.0, ghost_layers=True)
+
+    up = ps.Assignment(f3.center, f1.center + 2.1 * f2.center)
+
+    config = ps.CreateKernelConfig(target=dh.default_target, default_number_float=np.float32)
+    ast = ps.create_kernel(up, config=config)
+
+    kernel = ast.compile()
+
+    dh.run_kernel(kernel)
+    dh.all_to_cpu()
+
+    assert np.all(dh.cpu_arrays[f3.name] == 5.2)
+    assert dh.cpu_arrays[f3.name].dtype == np.float16
-- 
GitLab