diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index f338f2740a72d98e046a0b14c300e3d74688f839..12ef5bf483ed593926bd26f4d257f8f0028e6e69 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -143,7 +143,7 @@ ubuntu:
     - sed -i 's/--doctest-modules //g' pytest.ini
     - env
     - pip3 list
-    - pytest-3 -v -n $NUM_CORES --junitxml=report.xml pystencils_tests/test_*vec*.py pystencils_tests/test_random.py
+    - pytest-3 -v -n $NUM_CORES --junitxml=report.xml pystencils_tests/test_*vec*.py pystencils_tests/test_random.py pystencils_tests/test_half_precision.py
   tags:
     - docker
    - AVX
@@ -170,6 +170,7 @@ ppc64le:

arm64v9:
  # SVE support is still unreliable in GCC 11 (incorrect code for fixed-width vectors, internal compiler errors).
+  # For half precision, Clang is necessary.
  extends: .multiarch_template
  image: i10git.cs.fau.de:5005/pycodegen/pycodegen/arm64
  before_script:
diff --git a/pystencils/cpu/cpujit.py b/pystencils/cpu/cpujit.py
index 4c3febe2981562f6b0c923e8d68417c722ca7d61..b839f87cfc06385e0c48080795c339ce4c88556d 100644
--- a/pystencils/cpu/cpujit.py
+++ b/pystencils/cpu/cpujit.py
@@ -43,26 +43,30 @@ Then 'cl.exe' is used to compile.
For Windows compilers the qualifier should be ``__restrict``
"""
+from appdirs import user_cache_dir, user_config_dir
+from collections import OrderedDict
 import hashlib
+import importlib.util
 import json
 import os
 import platform
 import shutil
 import subprocess
+import sysconfig
+import tempfile
 import textwrap
-from collections import OrderedDict
-from sysconfig import get_paths
-from tempfile import TemporaryDirectory, NamedTemporaryFile
+import time
+import warnings

 import numpy as np
-from appdirs import user_cache_dir, user_config_dir

 from pystencils import FieldType
 from pystencils.astnodes import LoopOverCoordinate
 from pystencils.backends.cbackend import generate_c, get_headers, CFunction
-from pystencils.typing import CastFunc, VectorType, VectorMemoryAccess
+from pystencils.cpu.msvc_detection import get_environment
 from pystencils.include import get_pystencils_include_path
 from pystencils.kernel_wrapper import KernelWrapper
+from pystencils.typing import BasicType, CastFunc, VectorType, VectorMemoryAccess
 from pystencils.utils import atomic_file_write, recursive_dict_update
@@ -216,12 +220,11 @@ def read_config():
            shutil.rmtree(config['cache']['object_cache'], ignore_errors=True)
            create_folder(config['cache']['object_cache'], False)
-        with NamedTemporaryFile('w', dir=os.path.dirname(cache_status_file), delete=False) as f:
+        with tempfile.NamedTemporaryFile('w', dir=os.path.dirname(cache_status_file), delete=False) as f:
            json.dump(config['compiler'], f, indent=4)
        os.replace(f.name, cache_status_file)

    if config['compiler']['os'] == 'windows':
-        from pystencils.cpu.msvc_detection import get_environment
        msvc_env = get_environment(config['compiler']['msvc_version'], config['compiler']['arch'])
        if 'env' not in config['compiler']:
            config['compiler']['env'] = {}
@@ -470,18 +473,15 @@ def create_module_boilerplate_code(module_name, names):


def load_kernel_from_file(module_name, function_name, path):
-    from importlib.util import spec_from_file_location, module_from_spec
    try:
-        spec = spec_from_file_location(name=module_name, location=path)
-        mod = module_from_spec(spec)
+        spec = importlib.util.spec_from_file_location(name=module_name, location=path)
+        mod = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(mod)
    except ImportError:
-        import time
-        import warnings
        warnings.warn(f"Could not load {path}, trying on more time in 5 seconds ...")
        time.sleep(5)
-        spec = spec_from_file_location(name=module_name, location=path)
-        mod = module_from_spec(spec)
+        spec = importlib.util.spec_from_file_location(name=module_name, location=path)
+        mod = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(mod)

    return getattr(mod, function_name)
@@ -520,9 +520,13 @@ class ExtensionModuleCode:
        headers = {'<math.h>', '<stdint.h>'}
        for ast in self._ast_nodes:
+            for field in ast.fields_accessed:
+                if isinstance(field.dtype, BasicType) and field.dtype.is_half():
+                    # Add the half precision header only if half precision numbers occur in the AST
+                    headers.add('"half_precision.h"')
            headers.update(get_headers(ast))
-        header_list = list(headers)
-        header_list.sort()
+
+        header_list = sorted(headers)
        header_list.insert(0, '"Python.h"')

        ps_headers = [os.path.join(os.path.dirname(__file__), '..', 'include', h[1:-1]) for h in header_list
                      if os.path.exists(os.path.join(os.path.dirname(__file__), '..', 'include', h[1:-1]))]
@@ -559,7 +563,7 @@ def compile_module(code, code_hash, base_dir, compile_flags=None):
        compile_flags = []

    compiler_config = get_compiler_config()
-    extra_flags = ['-I' + get_paths()['include'], '-I' + get_pystencils_include_path()] + compile_flags
+    extra_flags = ['-I' + sysconfig.get_paths()['include'], '-I' + get_pystencils_include_path()] + compile_flags

    if compiler_config['os'].lower() == 'windows':
        lib_suffix = '.pyd'
@@ -593,7 +597,6 @@ def compile_module(code, code_hash, base_dir, compile_flags=None):

    # Linking
    if windows:
-        import sysconfig
        config_vars = sysconfig.get_config_vars()
        py_lib = os.path.join(config_vars["installed_base"], "libs",
                              f"python{config_vars['py_version_nodot']}.lib")
@@ -627,7 +630,7 @@ def compile_and_load(ast, custom_backend=None):
        compile_flags = ast.instruction_set['compile_flags']

    if cache_config['object_cache'] is False:
-        with TemporaryDirectory() as base_dir:
+        with tempfile.TemporaryDirectory() as base_dir:
            lib_file = compile_module(code, code_hash_str, base_dir, compile_flags=compile_flags)
            result = load_kernel_from_file(code_hash_str, ast.function_name, lib_file)
    else:
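Reviewer note on the header gate in `ExtensionModuleCode` above: the `is_half()` check pulls in `half_precision.h` only when a kernel actually accesses fp16 data. A minimal sketch of that check in isolation; the two fields here are illustrative, not part of this diff:

```python
import numpy as np

from pystencils import Field
from pystencils.typing import BasicType

# A half precision field should trigger the include, a double field should not.
f16 = Field.create_generic('f', spatial_dimensions=2, dtype=np.float16)
f64 = Field.create_generic('g', spatial_dimensions=2, dtype=np.float64)

for field in (f16, f64):
    # Same predicate as in ExtensionModuleCode above
    needs_header = isinstance(field.dtype, BasicType) and field.dtype.is_half()
    print(field.name, needs_header)  # expected: f True, g False
```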
diff --git a/pystencils/gpu/__init__.py b/pystencils/gpu/__init__.py
index 8a30937318b554115d66712c8dd1d986a4d29d51..c0d1fd34d7d8028469d68aaa694f88c3e607e9ee 100644
--- a/pystencils/gpu/__init__.py
+++ b/pystencils/gpu/__init__.py
@@ -1,5 +1,5 @@
 from pystencils.gpu.gpu_array_handler import GPUArrayHandler, GPUNotAvailableHandler
-from pystencils.gpu.cudajit import make_python_function
+from pystencils.gpu.gpujit import make_python_function
 from pystencils.gpu.kernelcreation import create_cuda_kernel, created_indexed_cuda_kernel

 from .indexing import AbstractIndexing, BlockIndexing, LineIndexing
diff --git a/pystencils/gpu/cudajit.py b/pystencils/gpu/gpujit.py
similarity index 92%
rename from pystencils/gpu/cudajit.py
rename to pystencils/gpu/gpujit.py
index 950445ff2b4f4f51cbd1584a09fd3f9e07bd2a29..e29f85d4369b7dfc38bba9259a39bfc666dc0977 100644
--- a/pystencils/gpu/cudajit.py
+++ b/pystencils/gpu/gpujit.py
@@ -7,7 +7,7 @@ from pystencils.typing import StructType
 from pystencils.field import FieldType
 from pystencils.include import get_pystencils_include_path
 from pystencils.kernel_wrapper import KernelWrapper
-from pystencils.typing.typed_sympy import FieldPointerSymbol
+from pystencils.typing import BasicType, FieldPointerSymbol

 USE_FAST_MATH = True
@@ -39,26 +39,29 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backend=None):
    if argument_dict is None:
        argument_dict = {}

+    headers = get_headers(kernel_function_node)
    if cp.cuda.runtime.is_hip:
-        header_list = ['"gpu_defines.h"'] + list(get_headers(kernel_function_node))
+        headers.add('"gpu_defines.h"')
    else:
-        header_list = ['"gpu_defines.h"', '<cstdint>'] + list(get_headers(kernel_function_node))
+        headers.update({'"gpu_defines.h"', '<cstdint>'})
+        for field in kernel_function_node.fields_accessed:
+            if isinstance(field.dtype, BasicType) and field.dtype.is_half():
+                headers.add('<cuda_fp16.h>')
+
+    header_list = sorted(headers)

    includes = "\n".join([f"#include {include_file}" for include_file in header_list])

    code = includes + "\n"
    code += "#define FUNC_PREFIX __global__\n"
    code += "#define RESTRICT __restrict__\n\n"
-    code += str(generate_cuda(kernel_function_node, custom_backend=custom_backend))
-    code = 'extern "C" {\n%s\n}\n' % code
+    code += 'extern "C" {\n%s\n}\n' % str(generate_cuda(kernel_function_node, custom_backend=custom_backend))

    options = ["-w", "-std=c++11"]
    if USE_FAST_MATH:
        options.append("-use_fast_math")
    options.append("-I" + get_pystencils_include_path())
-    mod = cp.RawModule(code=code, options=tuple(options), backend="nvrtc", jitify=True)
-    func = mod.get_function(kernel_function_node.function_name)
-
+    func = cp.RawKernel(code, kernel_function_node.function_name, options=tuple(options), backend="nvrtc", jitify=True)
    parameters = kernel_function_node.get_parameters()

    cache = {}
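Reviewer note on the `cp.RawModule` → `cp.RawKernel` change: for a module that exports a single `extern "C"` kernel the two are interchangeable, and `RawKernel` avoids the extra `get_function` lookup. A rough standalone sketch of the `RawKernel` usage pattern; the toy kernel is not taken from this diff:

```python
import cupy as cp

code = r'''
extern "C" __global__ void scale(float *data, float factor) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    data[i] *= factor;
}
'''

# RawKernel wraps exactly one kernel; NVRTC compiles it lazily on the first
# launch, and cupy caches the compiled binary on disk.
kernel = cp.RawKernel(code, 'scale', options=('-w', '-std=c++11'), backend='nvrtc')

data = cp.ones(32, dtype=cp.float32)
kernel((1,), (32,), (data, cp.float32(2.0)))  # (grid, block, args)
assert cp.allclose(data, 2.0)
```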
diff --git a/pystencils/gpu/kernelcreation.py b/pystencils/gpu/kernelcreation.py
index 78a7355c6e3a1f92db8d0ccb017dca4ce9909c19..066038cde246934d1694ee20613434ac9e3dc678 100644
--- a/pystencils/gpu/kernelcreation.py
+++ b/pystencils/gpu/kernelcreation.py
@@ -8,7 +8,7 @@ from pystencils.typing import StructType, TypedSymbol
 from pystencils.typing.transformations import add_types
 from pystencils.field import Field, FieldType
 from pystencils.enums import Target, Backend
-from pystencils.gpu.cudajit import make_python_function
+from pystencils.gpu.gpujit import make_python_function
 from pystencils.node_collection import NodeCollection
 from pystencils.gpu.indexing import indexing_creator_from_params
 from pystencils.simp.assignment_collection import AssignmentCollection
diff --git a/pystencils/include/half_precision.h b/pystencils/include/half_precision.h
new file mode 100644
index 0000000000000000000000000000000000000000..0bba7b86d24e2caab295c92ee1cdff5b4f103a8c
--- /dev/null
+++ b/pystencils/include/half_precision.h
@@ -0,0 +1,17 @@
+/// Half precision support. Experimental. Use carefully.
+///
+/// This feature is experimental, since it strictly depends on the underlying architecture and compiler support.
+/// On x86 architectures, what you can expect is that the data format is supported natively only for storage and
+/// interchange. Arithmetic operations will likely involve casting to fp32 (C++ float) and truncation back to fp16.
+/// Only bandwidth-bound code may therefore benefit. None of this is guaranteed, and it may change in the future.
+///
+/// Clang version must be 15 or higher for x86 half precision support.
+/// GCC version must be 12 or higher for x86 half precision support.
+/// Support also seems to require SSE, so ensure that the respective instruction sets are enabled.
+/// See
+/// https://clang.llvm.org/docs/LanguageExtensions.html#half-precision-floating-point
+/// https://gcc.gnu.org/onlinedocs/gcc/Half-Precision.html
+/// for more information.
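Reviewer note: since the header above only documents compiler requirements, it is worth checking which compiler the CPU JIT will actually invoke; the new test below guards on exactly this. A small sketch:

```python
import pystencils as ps

# The CPU JIT reads its compiler command from the pystencils configuration.
# Per the notes above, x86 half precision needs clang >= 15 (or gcc >= 12).
command = ps.cpu.cpujit.get_compiler_config()['command']
if 'clang' not in command:
    print(f"half precision kernels may not compile with '{command}'")
```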
+
+#pragma once
+using half = _Float16;
diff --git a/pystencils/typing/types.py b/pystencils/typing/types.py
index d1c473a0aa06bacf14bf4ef84d577abee69e4155..531a8e2902fe5f44db911ff5fe4ee6eb4f11737b 100644
--- a/pystencils/typing/types.py
+++ b/pystencils/typing/types.py
@@ -25,6 +25,8 @@ def numpy_name_to_c(name: str) -> str:
        return 'double'
    elif name == 'float32':
        return 'float'
+    elif name == 'float16' or name == 'half':
+        return 'half'
    elif name.startswith('int'):
        width = int(name[len("int"):])
        return f"int{width}_t"
@@ -94,6 +96,9 @@ class BasicType(AbstractType):
    def is_float(self):
        return issubclass(self.numpy_dtype.type, np.floating)

+    def is_half(self):
+        return issubclass(self.numpy_dtype.type, np.half)
+
    def is_int(self):
        return issubclass(self.numpy_dtype.type, np.integer)
diff --git a/pystencils_tests/test_custom_backends.py b/pystencils_tests/test_custom_backends.py
index 645fbae868a30c837fc96882fd5b53814037f3f8..9b625f8f9020e42d6528e4fdf64da7e89d777bae 100644
--- a/pystencils_tests/test_custom_backends.py
+++ b/pystencils_tests/test_custom_backends.py
@@ -38,7 +38,7 @@ def test_custom_backends_cpu():
 def test_custom_backends_gpu():
     pytest.importorskip('cupy')
     import cupy
-    import pystencils.gpu.cudajit
+    import pystencils.gpu.gpujit

     z, x, y = pystencils.fields("z, y, x: [2d]")

@@ -48,4 +48,4 @@ def test_custom_backends_gpu():
     ast = pystencils.create_kernel(normal_assignments, target=Target.GPU)
     pystencils.show_code(ast, ScreamingGpuBackend())
     with pytest.raises(cupy.cuda.compiler.JitifyException):
-        pystencils.gpu.cudajit.make_python_function(ast, custom_backend=ScreamingGpuBackend())
+        pystencils.gpu.gpujit.make_python_function(ast, custom_backend=ScreamingGpuBackend())
diff --git a/pystencils_tests/test_half_precision.py b/pystencils_tests/test_half_precision.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d55d1f0e292fec7b27e9b5837b9771f6890d6b3
--- /dev/null
+++ b/pystencils_tests/test_half_precision.py
@@ -0,0 +1,41 @@
+import pytest
+import platform
+
+import numpy as np
+import pystencils as ps
+
+
+@pytest.mark.parametrize('target', (ps.Target.CPU, ps.Target.GPU))
+def test_half_precision(target):
+    if target == ps.Target.CPU:
+        if platform.machine() not in ['arm64', 'aarch64']:
+            pytest.xfail("skipping half precision test on non-ARM platforms")
+
+        if 'clang' not in ps.cpu.cpujit.get_compiler_config()['command']:
+            pytest.xfail("skipping half precision test because clang is not used")
+
+    if target == ps.Target.GPU:
+        pytest.importorskip("cupy")
+
+    dh = ps.create_data_handling(domain_size=(10, 10), default_target=target)
+
+    f1 = dh.add_array("f1", values_per_cell=1, dtype=np.float16)
+    dh.fill("f1", 1.0, ghost_layers=True)
+    f2 = dh.add_array("f2", values_per_cell=1, dtype=np.float16)
+    dh.fill("f2", 2.0, ghost_layers=True)
+
+    f3 = dh.add_array("f3", values_per_cell=1, dtype=np.float16)
+    dh.fill("f3", 0.0, ghost_layers=True)
+
+    up = ps.Assignment(f3.center, f1.center + 2.1 * f2.center)
+
+    config = ps.CreateKernelConfig(target=dh.default_target, default_number_float=np.float32)
+    ast = ps.create_kernel(up, config=config)
+
+    kernel = ast.compile()
+
+    dh.run_kernel(kernel)
+    dh.all_to_cpu()
+
+    assert np.all(dh.cpu_arrays[f3.name] == 5.2)
+    assert dh.cpu_arrays[f3.name].dtype == np.float16
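Reviewer note on the final assertion: `== 5.2` holds even though 5.2 is not exactly representable in fp16, because numpy rounds the comparison operand to float16 as well. A plain-numpy sketch of the rounding involved; no pystencils required:

```python
import numpy as np

# 2.1 is not representable in fp16; it rounds to the nearest half value.
c = np.float16(2.1)
print(f'{float(c):.10f}')   # 2.0996093750

# 1.0 + 2.1 * 2.0 evaluated with fp16 storage gives exactly 5.19921875 ...
r = np.float16(1.0) + c * np.float16(2.0)
print(r.dtype, float(r))    # float16 5.19921875

# ... which compares equal to 5.2 once 5.2 is rounded to fp16 too.
assert r == np.float16(5.2)
```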