Skip to content
Snippets Groups Projects
Commit 89e6b0f2 authored by Michael Kuron's avatar Michael Kuron :mortar_board:
Browse files

Merge branch 'halfP' into 'master'

Add experimental half precison support

See merge request !332
parents e3f6d8da d5c1c566
No related branches found
No related tags found
1 merge request!332Add experimental half precison support
Pipeline #53916 passed
......@@ -143,7 +143,7 @@ ubuntu:
- sed -i 's/--doctest-modules //g' pytest.ini
- env
- pip3 list
- pytest-3 -v -n $NUM_CORES --junitxml=report.xml pystencils_tests/test_*vec*.py pystencils_tests/test_random.py
- pytest-3 -v -n $NUM_CORES --junitxml=report.xml pystencils_tests/test_*vec*.py pystencils_tests/test_random.py pystencils_tests/test_half_precision.py
tags:
- docker
- AVX
......@@ -170,6 +170,7 @@ ppc64le:
arm64v9:
# SVE support is still unreliable in GCC 11 (incorrect code for fixed-width vectors, internal compiler errors).
# For half precision Clang is necessary
extends: .multiarch_template
image: i10git.cs.fau.de:5005/pycodegen/pycodegen/arm64
before_script:
......
......@@ -43,26 +43,30 @@ Then 'cl.exe' is used to compile.
For Windows compilers the qualifier should be ``__restrict``
"""
from appdirs import user_cache_dir, user_config_dir
from collections import OrderedDict
import hashlib
import importlib.util
import json
import os
import platform
import shutil
import subprocess
import sysconfig
import tempfile
import textwrap
from collections import OrderedDict
from sysconfig import get_paths
from tempfile import TemporaryDirectory, NamedTemporaryFile
import time
import warnings
import numpy as np
from appdirs import user_cache_dir, user_config_dir
from pystencils import FieldType
from pystencils.astnodes import LoopOverCoordinate
from pystencils.backends.cbackend import generate_c, get_headers, CFunction
from pystencils.typing import CastFunc, VectorType, VectorMemoryAccess
from pystencils.cpu.msvc_detection import get_environment
from pystencils.include import get_pystencils_include_path
from pystencils.kernel_wrapper import KernelWrapper
from pystencils.typing import BasicType, CastFunc, VectorType, VectorMemoryAccess
from pystencils.utils import atomic_file_write, recursive_dict_update
......@@ -216,12 +220,11 @@ def read_config():
shutil.rmtree(config['cache']['object_cache'], ignore_errors=True)
create_folder(config['cache']['object_cache'], False)
with NamedTemporaryFile('w', dir=os.path.dirname(cache_status_file), delete=False) as f:
with tempfile.NamedTemporaryFile('w', dir=os.path.dirname(cache_status_file), delete=False) as f:
json.dump(config['compiler'], f, indent=4)
os.replace(f.name, cache_status_file)
if config['compiler']['os'] == 'windows':
from pystencils.cpu.msvc_detection import get_environment
msvc_env = get_environment(config['compiler']['msvc_version'], config['compiler']['arch'])
if 'env' not in config['compiler']:
config['compiler']['env'] = {}
......@@ -470,18 +473,15 @@ def create_module_boilerplate_code(module_name, names):
def load_kernel_from_file(module_name, function_name, path):
from importlib.util import spec_from_file_location, module_from_spec
try:
spec = spec_from_file_location(name=module_name, location=path)
mod = module_from_spec(spec)
spec = importlib.util.spec_from_file_location(name=module_name, location=path)
mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod)
except ImportError:
import time
import warnings
warnings.warn(f"Could not load {path}, trying on more time in 5 seconds ...")
time.sleep(5)
spec = spec_from_file_location(name=module_name, location=path)
mod = module_from_spec(spec)
spec = importlib.util.spec_from_file_location(name=module_name, location=path)
mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod)
return getattr(mod, function_name)
......@@ -520,9 +520,13 @@ class ExtensionModuleCode:
headers = {'<math.h>', '<stdint.h>'}
for ast in self._ast_nodes:
for field in ast.fields_accessed:
if isinstance(field.dtype, BasicType) and field.dtype.is_half():
# Add the half precision header only if half precision numbers occur in the AST
headers.add('"half_precision.h"')
headers.update(get_headers(ast))
header_list = list(headers)
header_list.sort()
header_list = sorted(headers)
header_list.insert(0, '"Python.h"')
ps_headers = [os.path.join(os.path.dirname(__file__), '..', 'include', h[1:-1]) for h in header_list
if os.path.exists(os.path.join(os.path.dirname(__file__), '..', 'include', h[1:-1]))]
......@@ -559,7 +563,7 @@ def compile_module(code, code_hash, base_dir, compile_flags=None):
compile_flags = []
compiler_config = get_compiler_config()
extra_flags = ['-I' + get_paths()['include'], '-I' + get_pystencils_include_path()] + compile_flags
extra_flags = ['-I' + sysconfig.get_paths()['include'], '-I' + get_pystencils_include_path()] + compile_flags
if compiler_config['os'].lower() == 'windows':
lib_suffix = '.pyd'
......@@ -593,7 +597,6 @@ def compile_module(code, code_hash, base_dir, compile_flags=None):
# Linking
if windows:
import sysconfig
config_vars = sysconfig.get_config_vars()
py_lib = os.path.join(config_vars["installed_base"], "libs",
f"python{config_vars['py_version_nodot']}.lib")
......@@ -627,7 +630,7 @@ def compile_and_load(ast, custom_backend=None):
compile_flags = ast.instruction_set['compile_flags']
if cache_config['object_cache'] is False:
with TemporaryDirectory() as base_dir:
with tempfile.TemporaryDirectory() as base_dir:
lib_file = compile_module(code, code_hash_str, base_dir, compile_flags=compile_flags)
result = load_kernel_from_file(code_hash_str, ast.function_name, lib_file)
else:
......
from pystencils.gpu.gpu_array_handler import GPUArrayHandler, GPUNotAvailableHandler
from pystencils.gpu.cudajit import make_python_function
from pystencils.gpu.gpujit import make_python_function
from pystencils.gpu.kernelcreation import create_cuda_kernel, created_indexed_cuda_kernel
from .indexing import AbstractIndexing, BlockIndexing, LineIndexing
......
......@@ -7,7 +7,7 @@ from pystencils.typing import StructType
from pystencils.field import FieldType
from pystencils.include import get_pystencils_include_path
from pystencils.kernel_wrapper import KernelWrapper
from pystencils.typing.typed_sympy import FieldPointerSymbol
from pystencils.typing import BasicType, FieldPointerSymbol
USE_FAST_MATH = True
......@@ -39,26 +39,29 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
if argument_dict is None:
argument_dict = {}
headers = get_headers(kernel_function_node)
if cp.cuda.runtime.is_hip:
header_list = ['"gpu_defines.h"'] + list(get_headers(kernel_function_node))
headers.add('"gpu_defines.h"')
else:
header_list = ['"gpu_defines.h"', '<cstdint>'] + list(get_headers(kernel_function_node))
headers.update({'"gpu_defines.h"', '<cstdint>'})
for field in kernel_function_node.fields_accessed:
if isinstance(field.dtype, BasicType) and field.dtype.is_half():
headers.add('<cuda_fp16.h>')
header_list = sorted(headers)
includes = "\n".join([f"#include {include_file}" for include_file in header_list])
code = includes + "\n"
code += "#define FUNC_PREFIX __global__\n"
code += "#define RESTRICT __restrict__\n\n"
code += str(generate_cuda(kernel_function_node, custom_backend=custom_backend))
code = 'extern "C" {\n%s\n}\n' % code
code += 'extern "C" {\n%s\n}\n' % str(generate_cuda(kernel_function_node, custom_backend=custom_backend))
options = ["-w", "-std=c++11"]
if USE_FAST_MATH:
options.append("-use_fast_math")
options.append("-I" + get_pystencils_include_path())
mod = cp.RawModule(code=code, options=tuple(options), backend="nvrtc", jitify=True)
func = mod.get_function(kernel_function_node.function_name)
func = cp.RawKernel(code, kernel_function_node.function_name, options=tuple(options), backend="nvrtc", jitify=True)
parameters = kernel_function_node.get_parameters()
cache = {}
......
......@@ -8,7 +8,7 @@ from pystencils.typing import StructType, TypedSymbol
from pystencils.typing.transformations import add_types
from pystencils.field import Field, FieldType
from pystencils.enums import Target, Backend
from pystencils.gpu.cudajit import make_python_function
from pystencils.gpu.gpujit import make_python_function
from pystencils.node_collection import NodeCollection
from pystencils.gpu.indexing import indexing_creator_from_params
from pystencils.simp.assignment_collection import AssignmentCollection
......
/// Half precision support. Experimental. Use carefully.
///
/// This feature is experimental, since it strictly depends on the underlying architecture and compiler support.
/// On x86 architectures, what you can expect is that the data format is supported natively only for storage and
/// interchange. Arithmetic operations will likely involve casting to fp32 (C++ float) and truncation to fp16.
/// Only bandwidth bound code may therefore benefit. None of this is guaranteed, and may change in the future.
/// Clang version must be 15 or higher for x86 half precision support.
/// GCC version must be 12 or higher for x86 half precision support.
/// Also support seems to require SSE, so ensure that respective instruction sets are enabled.
/// See
/// https://clang.llvm.org/docs/LanguageExtensions.html#half-precision-floating-point
/// https://gcc.gnu.org/onlinedocs/gcc/Half-Precision.html
/// for more information.
#pragma once
using half = _Float16;
......@@ -25,6 +25,8 @@ def numpy_name_to_c(name: str) -> str:
return 'double'
elif name == 'float32':
return 'float'
elif name == 'float16' or name == 'half':
return 'half'
elif name.startswith('int'):
width = int(name[len("int"):])
return f"int{width}_t"
......@@ -94,6 +96,9 @@ class BasicType(AbstractType):
def is_float(self):
return issubclass(self.numpy_dtype.type, np.floating)
def is_half(self):
return issubclass(self.numpy_dtype.type, np.half)
def is_int(self):
return issubclass(self.numpy_dtype.type, np.integer)
......
......@@ -38,7 +38,7 @@ def test_custom_backends_cpu():
def test_custom_backends_gpu():
pytest.importorskip('cupy')
import cupy
import pystencils.gpu.cudajit
import pystencils.gpu.gpujit
z, x, y = pystencils.fields("z, y, x: [2d]")
......@@ -48,4 +48,4 @@ def test_custom_backends_gpu():
ast = pystencils.create_kernel(normal_assignments, target=Target.GPU)
pystencils.show_code(ast, ScreamingGpuBackend())
with pytest.raises(cupy.cuda.compiler.JitifyException):
pystencils.gpu.cudajit.make_python_function(ast, custom_backend=ScreamingGpuBackend())
pystencils.gpu.gpujit.make_python_function(ast, custom_backend=ScreamingGpuBackend())
import pytest
import platform
import numpy as np
import pystencils as ps
@pytest.mark.parametrize('target', (ps.Target.CPU, ps.Target.GPU))
def test_half_precison(target):
if target == ps.Target.CPU:
if not platform.machine() in ['arm64', 'aarch64']:
pytest.xfail("skipping half precision test on non arm platform")
if 'clang' not in ps.cpu.cpujit.get_compiler_config()['command']:
pytest.xfail("skipping half precision because clang compiler is not used")
if target == ps.Target.GPU:
pytest.importorskip("cupy")
dh = ps.create_data_handling(domain_size=(10, 10), default_target=target)
f1 = dh.add_array("f1", values_per_cell=1, dtype=np.float16)
dh.fill("f1", 1.0, ghost_layers=True)
f2 = dh.add_array("f2", values_per_cell=1, dtype=np.float16)
dh.fill("f2", 2.0, ghost_layers=True)
f3 = dh.add_array("f3", values_per_cell=1, dtype=np.float16)
dh.fill("f3", 0.0, ghost_layers=True)
up = ps.Assignment(f3.center, f1.center + 2.1 * f2.center)
config = ps.CreateKernelConfig(target=dh.default_target, default_number_float=np.float32)
ast = ps.create_kernel(up, config=config)
kernel = ast.compile()
dh.run_kernel(kernel)
dh.all_to_cpu()
assert np.all(dh.cpu_arrays[f3.name] == 5.2)
assert dh.cpu_arrays[f3.name].dtype == np.float16
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment