Skip to content
Snippets Groups Projects
Commit e0c51476 authored by Michael Kuron's avatar Michael Kuron :mortar_board: Committed by Markus Holzer
Browse files

Add some xfail-ing integer vectorization tests

and clean up some old xfails
parent 6dfcd917
Branches
Tags
1 merge request!48RNG SIMD
......@@ -11,7 +11,7 @@ from pystencils.astnodes import KernelFunction, Node
from pystencils.cpu.vectorization import vec_all, vec_any
from pystencils.data_types import (
PointerType, VectorType, address_of, cast_func, create_type, get_type_of_expression,
reinterpret_cast_func, vector_memory_access)
reinterpret_cast_func, vector_memory_access, BasicType, TypedSymbol)
from pystencils.fast_approximation import fast_division, fast_inv_sqrt, fast_sqrt
from pystencils.integer_functions import (
bit_shift_left, bit_shift_right, bitwise_and, bitwise_or, bitwise_xor,
......@@ -134,7 +134,7 @@ class CustomCodeNode(Node):
self._symbols_defined = set(symbols_defined)
self.headers = []
def get_code(self, dialect, vector_instruction_set):
def get_code(self, dialect, vector_instruction_set, print_arg):
return self._code
@property
......@@ -297,7 +297,7 @@ class CBackend:
return "continue;"
def _print_CustomCodeNode(self, node):
return node.get_code(self._dialect, self._vector_instruction_set)
return node.get_code(self._dialect, self._vector_instruction_set, print_arg=self.sympy_printer._print)
def _print_SourceCodeComment(self, node):
return f"/* {node.text } */"
......@@ -548,12 +548,16 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
if type(data_type) is VectorType:
if isinstance(arg, sp.Tuple):
is_boolean = get_type_of_expression(arg[0]) == create_type("bool")
is_integer = get_type_of_expression(arg[0]) == create_type("int")
printed_args = [self._print(a) for a in arg]
instruction = 'makeVecBool' if is_boolean else 'makeVec'
instruction = 'makeVecBool' if is_boolean else 'makeVecInt' if is_integer else 'makeVec'
return self.instruction_set[instruction].format(*printed_args)
else:
is_boolean = get_type_of_expression(arg) == create_type("bool")
instruction = 'makeVecConstBool' if is_boolean else 'makeVecConst'
is_integer = get_type_of_expression(arg) == create_type("int") or \
(isinstance(arg, TypedSymbol) and arg.dtype.is_int())
instruction = 'makeVecConstBool' if is_boolean else \
'makeVecConstInt' if is_integer else 'makeVecConst'
return self.instruction_set[instruction].format(self._print(arg))
elif expr.func == fast_division:
result = self._scalarFallback('_print_Function', expr)
......@@ -609,12 +613,27 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
return result
def _print_Add(self, expr, order=None):
result = self._scalarFallback('_print_Add', expr)
try:
result = self._scalarFallback('_print_Add', expr)
except Exception:
result = None
if result:
return result
args = expr.args
# special treatment for all-integer args, for loop index arithmetic until we have proper int vectorization
suffix = ""
if all([(type(e) is cast_func and str(e.dtype) == self.instruction_set['int']) or isinstance(e, sp.Integer)
or (type(e) is TypedSymbol and isinstance(e.dtype, BasicType) and e.dtype.is_int()) for e in args]):
dtype = set([e.dtype for e in args if type(e) is cast_func])
assert len(dtype) == 1
dtype = dtype.pop()
args = [cast_func(e, dtype) if (isinstance(e, sp.Integer) or isinstance(e, TypedSymbol)) else e
for e in args]
suffix = "int"
summands = []
for term in expr.args:
for term in args:
if term.func == sp.Mul:
sign, t = self._print_Mul(term, inside_add=True)
else:
......@@ -630,7 +649,7 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
assert len(summands) >= 2
processed = summands[0].term
for summand in summands[1:]:
func = self.instruction_set['-'] if summand.sign == -1 else self.instruction_set['+']
func = self.instruction_set['-' + suffix] if summand.sign == -1 else self.instruction_set['+' + suffix]
processed = func.format(processed, summand.term)
return processed
......
......@@ -18,7 +18,7 @@ def get_supported_instruction_sets():
result = []
required_sse_flags = {'sse', 'sse2', 'ssse3', 'sse4_1', 'sse4_2'}
required_avx_flags = {'avx'}
required_avx_flags = {'avx', 'avx2'}
required_avx512_flags = {'avx512f'}
required_neon_flags = {'neon'}
flags = set(get_cpu_info()['flags'])
......
def get_argument_string(intrinsic_id, width, function_shortcut):
if intrinsic_id == 'makeVecConst':
if intrinsic_id == 'makeVecConst' or intrinsic_id == 'makeVecConstInt':
arg_string = f"({','.join(['{0}'] * width)})"
elif intrinsic_id == 'makeVec':
elif intrinsic_id == 'makeVec' or intrinsic_id == 'makeVecInt':
params = ["{" + str(i) + "}" for i in reversed(range(width))]
arg_string = f"({','.join(params)})"
elif intrinsic_id == 'makeVecBool':
......@@ -49,6 +49,8 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
'makeVec': 'set[]',
'makeVecBool': 'set[]',
'makeVecConstBool': 'set[]',
'makeVecInt': 'set[]',
'makeVecConstInt': 'set[]',
'loadU': 'loadu[0]',
'loadA': 'load[0]',
......@@ -86,6 +88,7 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
suffix = {
'double': 'pd',
'float': 'ps',
'int': 'epi32'
}
prefix = {
'sse': '_mm',
......@@ -96,22 +99,30 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
width = {
("double", "sse"): 2,
("float", "sse"): 4,
("int", "sse"): 4,
("double", "avx"): 4,
("float", "avx"): 8,
("int", "avx"): 8,
("double", "avx512"): 8,
("float", "avx512"): 16,
("int", "avx512"): 16,
}
result = {
'width': width[(data_type, instruction_set)],
'intwidth': width[('int', instruction_set)]
}
pre = prefix[instruction_set]
suf = suffix[data_type]
for intrinsic_id, function_shortcut in base_names.items():
function_shortcut = function_shortcut.strip()
name = function_shortcut[:function_shortcut.index('[')]
arg_string = get_argument_string(intrinsic_id, result['width'], function_shortcut)
if 'Int' in intrinsic_id:
suf = suffix['int']
arg_string = get_argument_string(intrinsic_id, result['intwidth'], function_shortcut)
else:
suf = suffix[data_type]
arg_string = get_argument_string(intrinsic_id, result['width'], function_shortcut)
mask_suffix = '_mask' if instruction_set == 'avx512' and intrinsic_id in comparisons.keys() else ''
result[intrinsic_id] = pre + "_" + name + "_" + suf + mask_suffix + arg_string
......@@ -151,4 +162,6 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
if instruction_set == 'avx' and data_type == 'float':
result['rsqrt'] = f"{pre}_rsqrt_{suf}({{0}})"
result['+int'] = f"{pre}_add_{suffix['int']}({{0}}, {{1}})"
return result
......@@ -73,11 +73,30 @@ def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'avx',
vector_width = vector_is['width']
kernel_ast.instruction_set = vector_is
vectorize_rng(kernel_ast, vector_width)
vectorize_inner_loops_and_adapt_load_stores(kernel_ast, vector_width, assume_aligned,
nontemporal, assume_sufficient_line_padding)
insert_vector_casts(kernel_ast)
def vectorize_rng(kernel_ast, vector_width):
"""Replace scalar result symbols on RNG nodes with vectorial ones"""
from pystencils.rng import RNGBase
subst = {}
def visit_node(node):
for arg in node.args:
if isinstance(arg, RNGBase):
new_result_symbols = [TypedSymbol(s.name, VectorType(s.dtype, width=vector_width))
for s in arg.result_symbols]
subst.update({s[0]: s[1] for s in zip(arg.result_symbols, new_result_symbols)})
arg._symbols_defined = set(new_result_symbols)
else:
visit_node(arg)
visit_node(kernel_ast)
fast_subs(kernel_ast.body, subst, skip=lambda e: isinstance(e, RNGBase))
def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_aligned, nontemporal_fields,
assume_sufficient_line_padding):
"""Goes over all innermost loops, changes increment to vector width and replaces field accesses by vector type."""
......@@ -129,8 +148,10 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_a
loop_node.step = vector_width
loop_node.subs(substitutions)
vector_loop_counter = cast_func(tuple(loop_counter_symbol + i for i in range(vector_width)),
VectorType(loop_counter_symbol.dtype, vector_width))
vector_int_width = ast_node.instruction_set['intwidth']
vector_loop_counter = cast_func((loop_counter_symbol,) * vector_int_width,
VectorType(loop_counter_symbol.dtype, vector_int_width)) + \
cast_func(tuple(range(vector_int_width)), VectorType(loop_counter_symbol.dtype, vector_int_width))
fast_subs(loop_node, {loop_counter_symbol: vector_loop_counter},
skip=lambda e: isinstance(e, ast.ResolvedFieldAccess) or isinstance(e, vector_memory_access))
......@@ -178,7 +199,10 @@ def insert_vector_casts(ast_node):
return expr
elif expr.func is sp.Abs and 'abs' not in ast_node.instruction_set:
new_arg = visit_expr(expr.args[0])
pw = sp.Piecewise((-1 * new_arg, new_arg < 0), (new_arg, True))
base_type = get_type_of_expression(expr.args[0]).base_type if type(expr.args[0]) is vector_memory_access \
else get_type_of_expression(expr.args[0])
pw = sp.Piecewise((base_type.numpy_dtype.type(-1) * new_arg, new_arg < base_type.numpy_dtype.type(0)),
(new_arg, True))
return visit_expr(pw)
elif expr.func in handled_functions or isinstance(expr, sp.Rel) or isinstance(expr, BooleanFunction):
new_args = [visit_expr(a) for a in expr.args]
......
......@@ -119,7 +119,7 @@ class cast_func(sp.Function):
# rhs = cast_func(0, 'int')
# print( sp.Ne(lhs, rhs) ) # would give true if all cast_funcs are booleans
# -> thus a separate class boolean_cast_func is introduced
if isinstance(expr, Boolean):
if isinstance(expr, Boolean) and (not isinstance(expr, TypedSymbol) or expr.dtype == BasicType(bool)):
cls = boolean_cast_func
return sp.Function.__new__(cls, expr, dtype, *other_args, **kwargs)
......@@ -697,7 +697,7 @@ class VectorType(Type):
if self.instruction_set is None:
return "%s[%d]" % (self.base_type, self.width)
else:
if self.base_type == create_type("int64"):
if self.base_type == create_type("int64") or self.base_type == create_type("int32"):
return self.instruction_set['int']
elif self.base_type == create_type("float64"):
return self.instruction_set['double']
......
......@@ -958,8 +958,6 @@ def create_numpy_array_with_layout(shape, layout, alignment=False, byte_offset=0
if not alignment:
res = np.empty(shape, order='c', **kwargs)
else:
if alignment is True:
alignment = 8 * 4
res = aligned_empty(shape, alignment, byte_offset=byte_offset, **kwargs)
for a, b in reversed(swaps):
......
This diff is collapsed.
#pragma once
#if defined(__SSE2__) || defined(_MSC_VER)
QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v)
{
#ifdef __AVX512VL__
return _mm_cvtepu32_ps(v);
#else
__m128i v2 = _mm_srli_epi32(v, 1);
__m128i v1 = _mm_and_si128(v, _mm_set1_epi32(1));
__m128 v2f = _mm_cvtepi32_ps(v2);
__m128 v1f = _mm_cvtepi32_ps(v1);
return _mm_add_ps(_mm_add_ps(v2f, v2f), v1f);
#endif
}
QUALIFIERS void _MY_TRANSPOSE4_EPI32(__m128i & R0, __m128i & R1, __m128i & R2, __m128i & R3)
{
__m128i T0, T1, T2, T3;
T0 = _mm_unpacklo_epi32(R0, R1);
T1 = _mm_unpacklo_epi32(R2, R3);
T2 = _mm_unpackhi_epi32(R0, R1);
T3 = _mm_unpackhi_epi32(R2, R3);
R0 = _mm_unpacklo_epi64(T0, T1);
R1 = _mm_unpackhi_epi64(T0, T1);
R2 = _mm_unpacklo_epi64(T2, T3);
R3 = _mm_unpackhi_epi64(T2, T3);
}
#endif
#if defined(__SSE4_1__) || defined(_MSC_VER)
#if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5 && !defined(__clang__)
__attribute__((optimize("no-associative-math")))
#endif
QUALIFIERS __m128d _my_cvtepu64_pd(const __m128i x)
{
#ifdef __AVX512VL__
return _mm_cvtepu64_pd(x);
#else
__m128i xH = _mm_srli_epi64(x, 32);
xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.))); // 2^84
__m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0xcc); // 2^52
__m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
return _mm_add_pd(f, _mm_castsi128_pd(xL));
#endif
}
#endif
#ifdef __AVX2__
QUALIFIERS __m256i _my256_set_m128i(__m128i hi, __m128i lo)
{
#if (!defined(__GNUC__) || __GNUC__ >= 8) || defined(__clang__)
return _mm256_set_m128i(hi, lo);
#else
return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
#endif
}
QUALIFIERS __m256d _my256_set_m128d(__m128d hi, __m128d lo)
{
#if (!defined(__GNUC__) || __GNUC__ >= 8) || defined(__clang__)
return _mm256_set_m128d(hi, lo);
#else
return _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 1);
#endif
}
QUALIFIERS __m256 _my256_cvtepu32_ps(const __m256i v)
{
#ifdef __AVX512VL__
return _mm256_cvtepu32_ps(v);
#else
__m256i v2 = _mm256_srli_epi32(v, 1);
__m256i v1 = _mm256_and_si256(v, _mm256_set1_epi32(1));
__m256 v2f = _mm256_cvtepi32_ps(v2);
__m256 v1f = _mm256_cvtepi32_ps(v1);
return _mm256_add_ps(_mm256_add_ps(v2f, v2f), v1f);
#endif
}
#if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5 && !defined(__clang__)
__attribute__((optimize("no-associative-math")))
#endif
QUALIFIERS __m256d _my256_cvtepu64_pd(const __m256i x)
{
#ifdef __AVX512VL__
return _mm256_cvtepu64_pd(x);
#else
__m256i xH = _mm256_srli_epi64(x, 32);
xH = _mm256_or_si256(xH, _mm256_castpd_si256(_mm256_set1_pd(19342813113834066795298816.))); // 2^84
__m256i xL = _mm256_blend_epi16(x, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)), 0xcc); // 2^52
__m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
#endif
}
#endif
#ifdef __AVX512F__
QUALIFIERS __m512i _my512_set_m128i(__m128i d, __m128i c, __m128i b, __m128i a)
{
return _mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(a), b, 1), c, 2), d, 3);
}
QUALIFIERS __m512d _my512_set_m256d(__m256d b, __m256d a)
{
return _mm512_insertf64x4(_mm512_castpd256_pd512(a), b, 1);
}
#endif
This diff is collapsed.
import copy
import numpy as np
import sympy as sp
from pystencils import TypedSymbol
from pystencils.data_types import TypedSymbol
from pystencils.astnodes import LoopOverCoordinate
from pystencils.backends.cbackend import CustomCodeNode
def _get_rng_template(name, data_type, num_vars):
if data_type is np.float32:
c_type = "float"
elif data_type is np.float64:
c_type = "double"
template = "\n"
for i in range(num_vars):
template += f"{{result_symbols[{i}].dtype}} {{result_symbols[{i}].name}};\n"
template += ("{}_{}{}({{parameters}}, " + ", ".join(["{{result_symbols[{}].name}}"] * num_vars) + ");\n") \
.format(name, c_type, num_vars, *tuple(range(num_vars)))
return template
def _get_rng_code(template, dialect, vector_instruction_set, args, result_symbols):
if dialect == 'cuda' or (dialect == 'c' and vector_instruction_set is None):
return template.format(parameters=', '.join(str(a) for a in args),
result_symbols=result_symbols)
else:
raise NotImplementedError("Not yet implemented for this backend")
from pystencils.sympyextensions import fast_subs
class RNGBase(CustomCodeNode):
......@@ -50,7 +31,7 @@ class RNGBase(CustomCodeNode):
symbols_read = set.union(*[s.atoms(sp.Symbol) for s in self.args])
super().__init__("", symbols_read=symbols_read, symbols_defined=self.result_symbols)
self.headers = [f'"{self._name}_rand.h"']
self.headers = [f'"{self._name.split("_")[0]}_rand.h"']
RNGBase.id += 1
......@@ -58,9 +39,23 @@ class RNGBase(CustomCodeNode):
def args(self):
return self._args
def get_code(self, dialect, vector_instruction_set):
template = _get_rng_template(self._name, self._data_type, self._num_vars)
return _get_rng_code(template, dialect, vector_instruction_set, self.args, self.result_symbols)
def fast_subs(self, subs_dict, skip):
rng = copy.deepcopy(self)
rng._args = [fast_subs(a, subs_dict, skip) for a in rng._args]
return rng
def get_code(self, dialect, vector_instruction_set, print_arg):
code = "\n"
for r in self.result_symbols:
if vector_instruction_set and isinstance(self.args[1], sp.Integer):
# this vector RNG has become scalar through substitution
code += f"{r.dtype} {r.name};\n"
else:
code += f"{vector_instruction_set[r.dtype.base_name] if vector_instruction_set else r.dtype} " + \
f"{r.name};\n"
code += (self._name + "(" + ", ".join([print_arg(a) for a in self.args]
+ [r.name for r in self.result_symbols]) + ");\n")
return code
def __repr__(self):
return (", ".join(['{}'] * self._num_vars) + " \\leftarrow {}RNG").format(*self.result_symbols,
......@@ -68,37 +63,53 @@ class RNGBase(CustomCodeNode):
class PhiloxTwoDoubles(RNGBase):
_name = "philox"
_name = "philox_double2"
_data_type = np.float64
_num_vars = 2
_num_keys = 2
class PhiloxFourFloats(RNGBase):
_name = "philox"
_name = "philox_float4"
_data_type = np.float32
_num_vars = 4
_num_keys = 2
class AESNITwoDoubles(RNGBase):
_name = "aesni"
_name = "aesni_double2"
_data_type = np.float64
_num_vars = 2
_num_keys = 4
class AESNIFourFloats(RNGBase):
_name = "aesni"
_name = "aesni_float4"
_data_type = np.float32
_num_vars = 4
_num_keys = 4
def random_symbol(assignment_list, seed=TypedSymbol("seed", np.uint32), rng_node=PhiloxTwoDoubles, *args, **kwargs):
def random_symbol(assignment_list, dim, seed=TypedSymbol("seed", np.uint32), rng_node=PhiloxTwoDoubles,
time_step=TypedSymbol("time_step", np.uint32), offsets=None):
"""Return a symbol generator for random numbers
Args:
assignment_list: the subexpressions member of an AssignmentCollection, into which helper variables assignments
will be inserted
dim: 2 or 3 for two or three spatial dimensions
seed: an integer or TypedSymbol(..., np.uint32) to seed the random number generator. If you create multiple
symbol generators, please pass them different seeds so you don't get the same stream of random numbers!
rng_node: which random number generator to use (PhiloxTwoDoubles, PhiloxFourFloats, AESNITwoDoubles,
AESNIFourFloats).
time_step: TypedSymbol(..., np.uint32) that indicates the number of the current time step
offsets: tuple of offsets (constant integers or TypedSymbol(..., np.uint32)) that give the global coordinates
of the local origin
"""
counter = 0
while True:
node = rng_node(*args, keys=(counter, seed), **kwargs)
keys = (counter, seed) + (0,) * (rng_node._num_keys - 2)
node = rng_node(dim, keys=keys, time_step=time_step, offsets=offsets)
inserted = False
for symbol in node.result_symbols:
if not inserted:
......
......@@ -77,14 +77,7 @@ def test_scale_interpolation():
pyconrad.imshow(out, "out " + address_mode)
@pytest.mark.parametrize('address_mode',
['border',
'clamp',
pytest.param('warp', marks=pytest.mark.xfail(
reason="requires interpolation-refactoring branch")),
pytest.param('mirror', marks=pytest.mark.xfail(
reason="requires interpolation-refactoring branch")),
])
@pytest.mark.parametrize('address_mode', ['border', 'clamp'])
def test_rotate_interpolation(address_mode):
"""
'wrap', 'mirror' currently fails on new sympy due to conjugate()
......@@ -144,18 +137,13 @@ def test_rotate_interpolation_gpu(dtype, address_mode, use_textures):
f"out {address_mode} texture:{use_textures} {type_map[dtype]}")
@pytest.mark.parametrize('address_mode', ['border', 'wrap',
pytest.param('warp', marks=pytest.mark.xfail(
reason="% printed as fmod on old sympy")),
pytest.param('mirror', marks=pytest.mark.xfail(
reason="% printed as fmod on old sympy")),
])
@pytest.mark.parametrize('address_mode', ['border', 'wrap', 'mirror'])
@pytest.mark.parametrize('dtype', [np.float64, np.float32, np.int32])
@pytest.mark.parametrize('use_textures', ('use_textures', False,))
def test_shift_interpolation_gpu(address_mode, dtype, use_textures):
sver = sympy.__version__.split(".")
if (int(sver[0]) == 1 and int(sver[1]) < 2) and address_mode in ['mirror', 'warp']:
pytest.skip()
if (int(sver[0]) == 1 and int(sver[1]) < 2) and address_mode == 'mirror':
pytest.skip("% printed as fmod on old sympy")
pytest.importorskip('pycuda')
import pycuda.gpuarray as gpuarray
......
......@@ -4,108 +4,197 @@ import pytest
import pystencils as ps
from pystencils.rng import PhiloxFourFloats, PhiloxTwoDoubles, AESNIFourFloats, AESNITwoDoubles, random_symbol
# curand_Philox4x32_10(make_uint4(124, i, j, 0), make_uint2(0, 0))
philox_reference = np.array([[[3576608082, 1252663339, 1987745383, 348040302],
[1032407765, 970978240, 2217005168, 2424826293]],
[[2958765206, 3725192638, 2623672781, 1373196132],
[ 850605163, 1694561295, 3285694973, 2799652583]]])
@pytest.mark.parametrize('target', ('cpu', 'gpu'))
def test_philox_double(target):
from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets
from pystencils.cpu.cpujit import get_compiler_config
from pystencils.data_types import TypedSymbol
RNGs = {('philox', 'float'): PhiloxFourFloats, ('philox', 'double'): PhiloxTwoDoubles,
('aesni', 'float'): AESNIFourFloats, ('aesni', 'double'): AESNITwoDoubles}
instruction_sets = get_supported_instruction_sets()
if get_compiler_config()['os'] == 'windows':
# skip instruction sets supported by CPU but not the compiler
if 'avx' in instruction_sets and ('/arch:avx2' not in get_compiler_config()['flags'].lower()
or '/arch:avx512' not in get_compiler_config()['flags'].lower()):
instruction_sets.remove('avx')
if 'avx512' in instruction_sets and '/arch:avx512' not in get_compiler_config()['flags'].lower():
instruction_sets.remove('avx512')
@pytest.mark.parametrize('target,rng', (('cpu', 'philox'), ('cpu', 'aesni'), ('gpu', 'philox')))
@pytest.mark.parametrize('precision', ('float', 'double'))
@pytest.mark.parametrize('dtype', ('float', 'double'))
def test_rng(target, rng, precision, dtype, t=124, offsets=(0, 0), keys=(0, 0), offset_values=None):
if target == 'gpu':
pytest.importorskip('pycuda')
if rng == 'aesni' and len(keys) == 2:
keys *= 2
if offset_values is None:
offset_values = offsets
dh = ps.create_data_handling((2, 2), default_ghost_layers=0, default_target=target)
f = dh.add_array("f", values_per_cell=2)
f = dh.add_array("f", values_per_cell=4 if precision == 'float' else 2,
dtype=np.float32 if dtype == 'float' else np.float64)
dh.fill(f.name, 42.0)
dh.fill('f', 42.0)
philox_node = PhiloxTwoDoubles(dh.dim)
assignments = [philox_node,
ps.Assignment(f(0), philox_node.result_symbols[0]),
ps.Assignment(f(1), philox_node.result_symbols[1])]
rng_node = RNGs[(rng, precision)](dh.dim, offsets=offsets, keys=keys)
assignments = [rng_node] + [ps.Assignment(f(i), s) for i, s in enumerate(rng_node.result_symbols)]
kernel = ps.create_kernel(assignments, target=dh.default_target).compile()
dh.all_to_gpu()
dh.run_kernel(kernel, time_step=124)
kwargs = {'time_step': t}
if offset_values != offsets:
kwargs.update({k.name: v for k, v in zip(offsets, offset_values)})
dh.run_kernel(kernel, **kwargs)
dh.all_to_cpu()
arr = dh.gather_array('f')
arr = dh.gather_array(f.name)
assert np.logical_and(arr <= 1.0, arr >= 0).all()
x = philox_reference[:,:,0::2]
y = philox_reference[:,:,1::2]
z = x ^ y << (53 - 32)
double_reference = z * 2.**-53 + 2.**-54
assert(np.allclose(arr, double_reference, rtol=0, atol=np.finfo(np.float64).eps))
@pytest.mark.parametrize('target', ('cpu', 'gpu'))
def test_philox_float(target):
if target == 'gpu':
pytest.importorskip('pycuda')
dh = ps.create_data_handling((2, 2), default_ghost_layers=0, default_target=target)
f = dh.add_array("f", values_per_cell=4)
dh.fill('f', 42.0)
philox_node = PhiloxFourFloats(dh.dim)
assignments = [philox_node] + [ps.Assignment(f(i), philox_node.result_symbols[i]) for i in range(4)]
if rng == 'philox' and t == 124 and offsets == (0, 0) and keys == (0, 0) and dh.shape == (2, 2):
int_reference = np.array([[[3576608082, 1252663339, 1987745383, 348040302],
[1032407765, 970978240, 2217005168, 2424826293]],
[[2958765206, 3725192638, 2623672781, 1373196132],
[850605163, 1694561295, 3285694973, 2799652583]]])
else:
pytest.importorskip('randomgen')
if rng == 'aesni':
from randomgen import AESCounter
int_reference = np.empty(dh.shape + (4,), dtype=int)
for x in range(dh.shape[0]):
for y in range(dh.shape[1]):
r = AESCounter(counter=t + (x + offset_values[0]) * 2 ** 32 + (y + offset_values[1]) * 2 ** 64,
key=keys[0] + keys[1] * 2 ** 32 + keys[2] * 2 ** 64 + keys[3] * 2 ** 96,
mode="sequence")
a, b = r.random_raw(size=2)
int_reference[x, y, :] = [a % 2 ** 32, a // 2 ** 32, b % 2 ** 32, b // 2 ** 32]
else:
from randomgen import Philox
int_reference = np.empty(dh.shape + (4,), dtype=int)
for x in range(dh.shape[0]):
for y in range(dh.shape[1]):
r = Philox(counter=t + (x + offset_values[0]) * 2 ** 32 + (y + offset_values[1]) * 2 ** 64,
key=keys[0] + keys[1] * 2 ** 32, number=4, width=32)
r.advance(-4, counter=False)
int_reference[x, y, :] = r.random_raw(size=4)
if precision == 'float' or dtype == 'float':
eps = np.finfo(np.float32).eps
else:
eps = np.finfo(np.float64).eps
if rng == 'aesni': # precision appears to be slightly worse
eps = max(1e-12, 2 * eps)
if precision == 'float':
reference = int_reference * 2. ** -32 + 2. ** -33
else:
x = int_reference[:, :, 0::2]
y = int_reference[:, :, 1::2]
z = x ^ y << (53 - 32)
reference = z * 2. ** -53 + 2. ** -54
assert np.allclose(arr, reference, rtol=0, atol=eps)
@pytest.mark.parametrize('vectorized', (False, True))
@pytest.mark.parametrize('kind', ('value', 'symbol'))
def test_rng_offsets(kind, vectorized):
if vectorized:
test = test_rng_vectorized
if not instruction_sets:
pytest.skip("cannot detect CPU instruction set")
else:
test = test_rng
if kind == 'value':
test(instruction_sets[0] if vectorized else 'cpu', 'philox', 'float', 'float', t=8,
offsets=(6, 7), keys=(5, 309))
elif kind == 'symbol':
offsets = (TypedSymbol("x0", np.uint32), TypedSymbol("y0", np.uint32))
test(instruction_sets[0] if vectorized else 'cpu', 'philox', 'float', 'float', t=8,
offsets=offsets, offset_values=(6, 7), keys=(5, 309))
@pytest.mark.parametrize('target', instruction_sets)
@pytest.mark.parametrize('rng', ('philox', 'aesni'))
@pytest.mark.parametrize('precision,dtype', (('float', 'float'), ('double', 'double')))
def test_rng_vectorized(target, rng, precision, dtype, t=130, offsets=(1, 3), keys=(0, 0), offset_values=None):
cpu_vectorize_info = {'assume_inner_stride_one': True, 'assume_aligned': True, 'instruction_set': target}
dh = ps.create_data_handling((17, 17), default_ghost_layers=0, default_target='cpu')
f = dh.add_array("f", values_per_cell=4 if precision == 'float' else 2,
dtype=np.float32 if dtype == 'float' else np.float64, alignment=True)
dh.fill(f.name, 42.0)
ref = dh.add_array("ref", values_per_cell=4 if precision == 'float' else 2)
rng_node = RNGs[(rng, precision)](dh.dim, offsets=offsets)
assignments = [rng_node] + [ps.Assignment(ref(i), s) for i, s in enumerate(rng_node.result_symbols)]
kernel = ps.create_kernel(assignments, target=dh.default_target).compile()
dh.all_to_gpu()
dh.run_kernel(kernel, time_step=124)
dh.all_to_cpu()
arr = dh.gather_array('f')
assert np.logical_and(arr <= 1.0, arr >= 0).all()
kwargs = {'time_step': t}
if offset_values is not None:
kwargs.update({k.name: v for k, v in zip(offsets, offset_values)})
dh.run_kernel(kernel, **kwargs)
float_reference = philox_reference * 2.**-32 + 2.**-33
assert(np.allclose(arr, float_reference, rtol=0, atol=np.finfo(np.float32).eps))
rng_node = RNGs[(rng, precision)](dh.dim, offsets=offsets)
assignments = [rng_node] + [ps.Assignment(f(i), s) for i, s in enumerate(rng_node.result_symbols)]
kernel = ps.create_kernel(assignments, target=dh.default_target, cpu_vectorize_info=cpu_vectorize_info).compile()
def test_aesni_double():
dh = ps.create_data_handling((2, 2), default_ghost_layers=0, default_target="cpu")
f = dh.add_array("f", values_per_cell=2)
dh.run_kernel(kernel, **kwargs)
dh.fill('f', 42.0)
ref_data = dh.gather_array(ref.name)
data = dh.gather_array(f.name)
aesni_node = AESNITwoDoubles(dh.dim)
assignments = [aesni_node,
ps.Assignment(f(0), aesni_node.result_symbols[0]),
ps.Assignment(f(1), aesni_node.result_symbols[1])]
kernel = ps.create_kernel(assignments, target=dh.default_target).compile()
assert np.allclose(ref_data, data)
dh.all_to_gpu()
dh.run_kernel(kernel, time_step=124)
dh.all_to_cpu()
arr = dh.gather_array('f')
assert np.logical_and(arr <= 1.0, arr >= 0).all()
def test_aesni_float():
dh = ps.create_data_handling((2, 2), default_ghost_layers=0, default_target="cpu")
f = dh.add_array("f", values_per_cell=4)
dh.fill('f', 42.0)
aesni_node = AESNIFourFloats(dh.dim)
assignments = [aesni_node] + [ps.Assignment(f(i), aesni_node.result_symbols[i]) for i in range(4)]
kernel = ps.create_kernel(assignments, target=dh.default_target).compile()
dh.all_to_gpu()
dh.run_kernel(kernel, time_step=124)
dh.all_to_cpu()
arr = dh.gather_array('f')
assert np.logical_and(arr <= 1.0, arr >= 0).all()
def test_staggered():
@pytest.mark.parametrize('vectorized', (False, True))
def test_rng_symbol(vectorized):
"""Make sure that the RNG symbol generator generates symbols and that the resulting code compiles"""
if vectorized:
if not instruction_sets:
pytest.skip("cannot detect CPU instruction set")
else:
cpu_vectorize_info = {'assume_inner_stride_one': True, 'assume_aligned': True,
'instruction_set': instruction_sets[0]}
else:
cpu_vectorize_info = None
dh = ps.create_data_handling((8, 8), default_ghost_layers=0, default_target="cpu")
f = dh.add_array("f", values_per_cell=2 * dh.dim, alignment=True)
ac = ps.AssignmentCollection([ps.Assignment(f(i), 0) for i in range(f.shape[-1])])
rng_symbol_gen = random_symbol(ac.subexpressions, dim=dh.dim)
for i in range(f.shape[-1]):
ac.main_assignments[i] = ps.Assignment(ac.main_assignments[i].lhs, next(rng_symbol_gen))
symbols = [a.rhs for a in ac.main_assignments]
assert len(symbols) == f.shape[-1] and len(set(symbols)) == f.shape[-1]
ps.create_kernel(ac, target=dh.default_target, cpu_vectorize_info=cpu_vectorize_info).compile()
@pytest.mark.parametrize('vectorized', (False, True))
def test_staggered(vectorized):
"""Make sure that the RNG counter can be substituted during loop cutting"""
dh = ps.create_data_handling((8, 8), default_ghost_layers=0, default_target="cpu")
j = dh.add_array("j", values_per_cell=dh.dim, field_type=ps.FieldType.STAGGERED_FLUX)
a = ps.AssignmentCollection([ps.Assignment(j.staggered_access(n), 0) for n in j.staggered_stencil])
rng_symbol_gen = random_symbol(a.subexpressions, dim=dh.dim)
rng_symbol_gen = random_symbol(a.subexpressions, dim=dh.dim, rng_node=AESNITwoDoubles)
a.main_assignments[0] = ps.Assignment(a.main_assignments[0].lhs, next(rng_symbol_gen))
kernel = ps.create_staggered_kernel(a, target=dh.default_target).compile()
if not vectorized:
return
if not instruction_sets:
pytest.skip("cannot detect CPU instruction set")
pytest.importorskip('islpy')
cpu_vectorize_info = {'assume_inner_stride_one': True, 'assume_aligned': False,
'instruction_set': instruction_sets[0]}
dh.fill(j.name, 867)
dh.run_kernel(kernel, seed=5, time_step=309)
ref_data = dh.gather_array(j.name)
kernel2 = ps.create_staggered_kernel(a, target=dh.default_target, cpu_vectorize_info=cpu_vectorize_info).compile()
dh.fill(j.name, 867)
dh.run_kernel(kernel2, seed=5, time_step=309)
data = dh.gather_array(j.name)
assert np.allclose(ref_data, data)
......@@ -124,7 +124,8 @@ setuptools.setup(name='pystencils',
'flake8',
'nbformat',
'nbconvert',
'ipython'],
'ipython',
'randomgen>=1.18'],
python_requires=">=3.6",
cmdclass={
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment