Add some xfail-ing integer vectorization tests

and clean up some old xfails

Add some xfail-ing integer vectorization tests
e0c51476 · Michael Kuron · Markus Holzer · 6dfcd917 · e0c51476 · e0c51476
Commit e0c51476 authored 4 years ago by Michael Kuron Committed by Markus Holzer 4 years ago
--- a/pystencils/backends/cbackend.py
+++ b/pystencils/backends/cbackend.py
@@ -11,7 +11,7 @@ from pystencils.astnodes import KernelFunction, Node
 from pystencils.cpu.vectorization import vec_all, vec_any
 from pystencils.data_types import (
    PointerType, VectorType, address_of, cast_func, create_type, get_type_of_expression,
-    reinterpret_cast_func, vector_memory_access)
+    reinterpret_cast_func, vector_memory_access, BasicType, TypedSymbol)
 from pystencils.fast_approximation import fast_division, fast_inv_sqrt, fast_sqrt
 from pystencils.integer_functions import (
    bit_shift_left, bit_shift_right, bitwise_and, bitwise_or, bitwise_xor,
@@ -134,7 +134,7 @@ class CustomCodeNode(Node):
        self._symbols_defined = set(symbols_defined)
        self.headers = []

-    def get_code(self, dialect, vector_instruction_set):
+    def get_code(self, dialect, vector_instruction_set, print_arg):
        return self._code

    @property
@@ -297,7 +297,7 @@ class CBackend:
        return "continue;"

    def _print_CustomCodeNode(self, node):
-        return node.get_code(self._dialect, self._vector_instruction_set)
+        return node.get_code(self._dialect, self._vector_instruction_set, print_arg=self.sympy_printer._print)

    def _print_SourceCodeComment(self, node):
        return f"/* {node.text } */"
@@ -548,12 +548,16 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
            if type(data_type) is VectorType:
                if isinstance(arg, sp.Tuple):
                    is_boolean = get_type_of_expression(arg[0]) == create_type("bool")
+                    is_integer = get_type_of_expression(arg[0]) == create_type("int")
                    printed_args = [self._print(a) for a in arg]
-                    instruction = 'makeVecBool' if is_boolean else 'makeVec'
+                    instruction = 'makeVecBool' if is_boolean else 'makeVecInt' if is_integer else 'makeVec'
                    return self.instruction_set[instruction].format(*printed_args)
                else:
                    is_boolean = get_type_of_expression(arg) == create_type("bool")
-                    instruction = 'makeVecConstBool' if is_boolean else 'makeVecConst'
+                    is_integer = get_type_of_expression(arg) == create_type("int") or \
+                        (isinstance(arg, TypedSymbol) and arg.dtype.is_int())
+                    instruction = 'makeVecConstBool' if is_boolean else \
+                                  'makeVecConstInt' if is_integer else 'makeVecConst'
                    return self.instruction_set[instruction].format(self._print(arg))
        elif expr.func == fast_division:
            result = self._scalarFallback('_print_Function', expr)
@@ -609,12 +613,27 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
        return result

    def _print_Add(self, expr, order=None):
-        result = self._scalarFallback('_print_Add', expr)
+        try:
+            result = self._scalarFallback('_print_Add', expr)
+        except Exception:
+            result = None
        if result:
            return result
+        args = expr.args
+
+        # special treatment for all-integer args, for loop index arithmetic until we have proper int vectorization
+        suffix = ""
+        if all([(type(e) is cast_func and str(e.dtype) == self.instruction_set['int']) or isinstance(e, sp.Integer)
+                or (type(e) is TypedSymbol and isinstance(e.dtype, BasicType) and e.dtype.is_int()) for e in args]):
+            dtype = set([e.dtype for e in args if type(e) is cast_func])
+            assert len(dtype) == 1
+            dtype = dtype.pop()
+            args = [cast_func(e, dtype) if (isinstance(e, sp.Integer) or isinstance(e, TypedSymbol)) else e
+                    for e in args]
+            suffix = "int"

        summands = []
-        for term in expr.args:
+        for term in args:
            if term.func == sp.Mul:
                sign, t = self._print_Mul(term, inside_add=True)
            else:
@@ -630,7 +649,7 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
        assert len(summands) >= 2
        processed = summands[0].term
        for summand in summands[1:]:
-            func = self.instruction_set['-'] if summand.sign == -1 else self.instruction_set['+']
+            func = self.instruction_set['-' + suffix] if summand.sign == -1 else self.instruction_set['+' + suffix]
            processed = func.format(processed, summand.term)
        return processed


--- a/pystencils/backends/simd_instruction_sets.py
+++ b/pystencils/backends/simd_instruction_sets.py
@@ -18,7 +18,7 @@ def get_supported_instruction_sets():

    result = []
    required_sse_flags = {'sse', 'sse2', 'ssse3', 'sse4_1', 'sse4_2'}
-    required_avx_flags = {'avx'}
+    required_avx_flags = {'avx', 'avx2'}
    required_avx512_flags = {'avx512f'}
    required_neon_flags = {'neon'}
    flags = set(get_cpu_info()['flags'])

--- a/pystencils/backends/x86_instruction_sets.py
+++ b/pystencils/backends/x86_instruction_sets.py
 def get_argument_string(intrinsic_id, width, function_shortcut):
-    if intrinsic_id == 'makeVecConst':
+    if intrinsic_id == 'makeVecConst' or intrinsic_id == 'makeVecConstInt':
        arg_string = f"({','.join(['{0}'] * width)})"
-    elif intrinsic_id == 'makeVec':
+    elif intrinsic_id == 'makeVec' or intrinsic_id == 'makeVecInt':
        params = ["{" + str(i) + "}" for i in reversed(range(width))]
        arg_string = f"({','.join(params)})"
    elif intrinsic_id == 'makeVecBool':
@@ -49,6 +49,8 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
        'makeVec': 'set[]',
        'makeVecBool': 'set[]',
        'makeVecConstBool': 'set[]',
+        'makeVecInt': 'set[]',
+        'makeVecConstInt': 'set[]',
        
        'loadU': 'loadu[0]',
        'loadA': 'load[0]',
@@ -86,6 +88,7 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
    suffix = {
        'double': 'pd',
        'float': 'ps',
+        'int': 'epi32'
    }
    prefix = {
        'sse': '_mm',
@@ -96,22 +99,30 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
    width = {
        ("double", "sse"): 2,
        ("float", "sse"): 4,
+        ("int", "sse"): 4,
        ("double", "avx"): 4,
        ("float", "avx"): 8,
+        ("int", "avx"): 8,
        ("double", "avx512"): 8,
        ("float", "avx512"): 16,
+        ("int", "avx512"): 16,
    }

    result = {
        'width': width[(data_type, instruction_set)],
+        'intwidth': width[('int', instruction_set)]
    }
    pre = prefix[instruction_set]
-    suf = suffix[data_type]
    for intrinsic_id, function_shortcut in base_names.items():
        function_shortcut = function_shortcut.strip()
        name = function_shortcut[:function_shortcut.index('[')]

-        arg_string = get_argument_string(intrinsic_id, result['width'], function_shortcut)
+        if 'Int' in intrinsic_id:
+            suf = suffix['int']
+            arg_string = get_argument_string(intrinsic_id, result['intwidth'], function_shortcut)
+        else:
+            suf = suffix[data_type]
+            arg_string = get_argument_string(intrinsic_id, result['width'], function_shortcut)

        mask_suffix = '_mask' if instruction_set == 'avx512' and intrinsic_id in comparisons.keys() else ''
        result[intrinsic_id] = pre + "_" + name + "_" + suf + mask_suffix + arg_string
@@ -151,4 +162,6 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
    if instruction_set == 'avx' and data_type == 'float':
        result['rsqrt'] = f"{pre}_rsqrt_{suf}({{0}})"

+    result['+int'] = f"{pre}_add_{suffix['int']}({{0}}, {{1}})"
+
    return result
--- a/pystencils/cpu/vectorization.py
+++ b/pystencils/cpu/vectorization.py
@@ -73,11 +73,30 @@ def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'avx',
    vector_width = vector_is['width']
    kernel_ast.instruction_set = vector_is

+    vectorize_rng(kernel_ast, vector_width)
    vectorize_inner_loops_and_adapt_load_stores(kernel_ast, vector_width, assume_aligned,
                                                nontemporal, assume_sufficient_line_padding)
    insert_vector_casts(kernel_ast)


+def vectorize_rng(kernel_ast, vector_width):
+    """Replace scalar result symbols on RNG nodes with vectorial ones"""
+    from pystencils.rng import RNGBase
+    subst = {}
+
+    def visit_node(node):
+        for arg in node.args:
+            if isinstance(arg, RNGBase):
+                new_result_symbols = [TypedSymbol(s.name, VectorType(s.dtype, width=vector_width))
+                                      for s in arg.result_symbols]
+                subst.update({s[0]: s[1] for s in zip(arg.result_symbols, new_result_symbols)})
+                arg._symbols_defined = set(new_result_symbols)
+            else:
+                visit_node(arg)
+    visit_node(kernel_ast)
+    fast_subs(kernel_ast.body, subst, skip=lambda e: isinstance(e, RNGBase))
+
+
 def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_aligned, nontemporal_fields,
                                                assume_sufficient_line_padding):
    """Goes over all innermost loops, changes increment to vector width and replaces field accesses by vector type."""
@@ -129,8 +148,10 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_a

        loop_node.step = vector_width
        loop_node.subs(substitutions)
-        vector_loop_counter = cast_func(tuple(loop_counter_symbol + i for i in range(vector_width)),
-                                        VectorType(loop_counter_symbol.dtype, vector_width))
+        vector_int_width = ast_node.instruction_set['intwidth']
+        vector_loop_counter = cast_func((loop_counter_symbol,) * vector_int_width,
+                                        VectorType(loop_counter_symbol.dtype, vector_int_width)) + \
+            cast_func(tuple(range(vector_int_width)), VectorType(loop_counter_symbol.dtype, vector_int_width))

        fast_subs(loop_node, {loop_counter_symbol: vector_loop_counter},
                  skip=lambda e: isinstance(e, ast.ResolvedFieldAccess) or isinstance(e, vector_memory_access))
@@ -178,7 +199,10 @@ def insert_vector_casts(ast_node):
            return expr
        elif expr.func is sp.Abs and 'abs' not in ast_node.instruction_set:
            new_arg = visit_expr(expr.args[0])
-            pw = sp.Piecewise((-1 * new_arg, new_arg < 0), (new_arg, True))
+            base_type = get_type_of_expression(expr.args[0]).base_type if type(expr.args[0]) is vector_memory_access \
+                else get_type_of_expression(expr.args[0])
+            pw = sp.Piecewise((base_type.numpy_dtype.type(-1) * new_arg, new_arg < base_type.numpy_dtype.type(0)),
+                              (new_arg, True))
            return visit_expr(pw)
        elif expr.func in handled_functions or isinstance(expr, sp.Rel) or isinstance(expr, BooleanFunction):
            new_args = [visit_expr(a) for a in expr.args]

--- a/pystencils/data_types.py
+++ b/pystencils/data_types.py
@@ -119,7 +119,7 @@ class cast_func(sp.Function):
        # rhs = cast_func(0, 'int')
        # print( sp.Ne(lhs, rhs) ) # would give true if all cast_funcs are booleans
        # -> thus a separate class boolean_cast_func is introduced
-        if isinstance(expr, Boolean):
+        if isinstance(expr, Boolean) and (not isinstance(expr, TypedSymbol) or expr.dtype == BasicType(bool)):
            cls = boolean_cast_func

        return sp.Function.__new__(cls, expr, dtype, *other_args, **kwargs)
@@ -697,7 +697,7 @@ class VectorType(Type):
        if self.instruction_set is None:
            return "%s[%d]" % (self.base_type, self.width)
        else:
-            if self.base_type == create_type("int64"):
+            if self.base_type == create_type("int64") or self.base_type == create_type("int32"):
                return self.instruction_set['int']
            elif self.base_type == create_type("float64"):
                return self.instruction_set['double']

--- a/pystencils/field.py
+++ b/pystencils/field.py
@@ -958,8 +958,6 @@ def create_numpy_array_with_layout(shape, layout, alignment=False, byte_offset=0
    if not alignment:
        res = np.empty(shape, order='c', **kwargs)
    else:
-        if alignment is True:
-            alignment = 8 * 4
        res = aligned_empty(shape, alignment, byte_offset=byte_offset, **kwargs)

    for a, b in reversed(swaps):

--- a/pystencils/include/aesni_rand.h
+++ b/pystencils/include/aesni_rand.h
--- a/pystencils/include/myintrin.h
+++ b/pystencils/include/myintrin.h
+#pragma once
+
+#if defined(__SSE2__) || defined(_MSC_VER)
+QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v)
+{
+#ifdef __AVX512VL__
+    return _mm_cvtepu32_ps(v);
+#else
+    __m128i v2 = _mm_srli_epi32(v, 1);
+    __m128i v1 = _mm_and_si128(v, _mm_set1_epi32(1));
+    __m128 v2f = _mm_cvtepi32_ps(v2);
+    __m128 v1f = _mm_cvtepi32_ps(v1);
+    return _mm_add_ps(_mm_add_ps(v2f, v2f), v1f);
+#endif
+}
+
+QUALIFIERS void _MY_TRANSPOSE4_EPI32(__m128i & R0, __m128i & R1, __m128i & R2, __m128i & R3)
+{
+    __m128i T0, T1, T2, T3;
+    T0  = _mm_unpacklo_epi32(R0, R1);
+    T1  = _mm_unpacklo_epi32(R2, R3);
+    T2  = _mm_unpackhi_epi32(R0, R1);
+    T3  = _mm_unpackhi_epi32(R2, R3);
+    R0  = _mm_unpacklo_epi64(T0, T1);
+    R1  = _mm_unpackhi_epi64(T0, T1);
+    R2  = _mm_unpacklo_epi64(T2, T3);
+    R3  = _mm_unpackhi_epi64(T2, T3);
+}
+#endif
+
+#if defined(__SSE4_1__) || defined(_MSC_VER)
+#if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5 && !defined(__clang__)
+__attribute__((optimize("no-associative-math")))
+#endif
+QUALIFIERS __m128d _my_cvtepu64_pd(const __m128i x)
+{
+#ifdef __AVX512VL__
+    return _mm_cvtepu64_pd(x);
+#else
+    __m128i xH = _mm_srli_epi64(x, 32);
+    xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.)));          //  2^84
+    __m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0xcc);   //  2^52
+    __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.));     //  2^84 + 2^52
+    return _mm_add_pd(f, _mm_castsi128_pd(xL));
+#endif
+}
+#endif
+
+#ifdef __AVX2__
+QUALIFIERS __m256i _my256_set_m128i(__m128i hi, __m128i lo)
+{
+#if (!defined(__GNUC__) || __GNUC__ >= 8) || defined(__clang__)
+    return _mm256_set_m128i(hi, lo);
+#else
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
+#endif
+}
+
+QUALIFIERS __m256d _my256_set_m128d(__m128d hi, __m128d lo)
+{
+#if (!defined(__GNUC__) || __GNUC__ >= 8) || defined(__clang__)
+    return _mm256_set_m128d(hi, lo);
+#else
+    return _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 1);
+#endif
+}
+
+QUALIFIERS __m256 _my256_cvtepu32_ps(const __m256i v)
+{
+#ifdef __AVX512VL__
+    return _mm256_cvtepu32_ps(v);
+#else
+    __m256i v2 = _mm256_srli_epi32(v, 1);
+    __m256i v1 = _mm256_and_si256(v, _mm256_set1_epi32(1));
+    __m256 v2f = _mm256_cvtepi32_ps(v2);
+    __m256 v1f = _mm256_cvtepi32_ps(v1);
+    return _mm256_add_ps(_mm256_add_ps(v2f, v2f), v1f);
+#endif
+}
+
+#if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5 && !defined(__clang__)
+__attribute__((optimize("no-associative-math")))
+#endif
+QUALIFIERS __m256d _my256_cvtepu64_pd(const __m256i x)
+{
+#ifdef __AVX512VL__
+    return _mm256_cvtepu64_pd(x);
+#else
+    __m256i xH = _mm256_srli_epi64(x, 32);
+    xH = _mm256_or_si256(xH, _mm256_castpd_si256(_mm256_set1_pd(19342813113834066795298816.)));          //  2^84
+    __m256i xL = _mm256_blend_epi16(x, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)), 0xcc);   //  2^52
+    __m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(19342813118337666422669312.));     //  2^84 + 2^52
+    return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
+#endif
+}
+#endif
+
+#ifdef __AVX512F__
+QUALIFIERS __m512i _my512_set_m128i(__m128i d, __m128i c, __m128i b, __m128i a)
+{
+    return _mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(a), b, 1), c, 2), d, 3);
+}
+
+QUALIFIERS __m512d _my512_set_m256d(__m256d b, __m256d a)
+{
+    return _mm512_insertf64x4(_mm512_castpd256_pd512(a), b, 1);
+}
+#endif
+
--- a/pystencils/include/philox_rand.h
+++ b/pystencils/include/philox_rand.h
--- a/pystencils/rng.py
+++ b/pystencils/rng.py
+import copy
 import numpy as np
 import sympy as sp

-from pystencils import TypedSymbol
+from pystencils.data_types import TypedSymbol
 from pystencils.astnodes import LoopOverCoordinate
 from pystencils.backends.cbackend import CustomCodeNode
-
-
-def _get_rng_template(name, data_type, num_vars):
-    if data_type is np.float32:
-        c_type = "float"
-    elif data_type is np.float64:
-        c_type = "double"
-    template = "\n"
-    for i in range(num_vars):
-        template += f"{{result_symbols[{i}].dtype}} {{result_symbols[{i}].name}};\n"
-    template += ("{}_{}{}({{parameters}}, " + ", ".join(["{{result_symbols[{}].name}}"] * num_vars) + ");\n") \
-        .format(name, c_type, num_vars, *tuple(range(num_vars)))
-    return template
-
-
-def _get_rng_code(template, dialect, vector_instruction_set, args, result_symbols):
-    if dialect == 'cuda' or (dialect == 'c' and vector_instruction_set is None):
-        return template.format(parameters=', '.join(str(a) for a in args),
-                               result_symbols=result_symbols)
-    else:
-        raise NotImplementedError("Not yet implemented for this backend")
+from pystencils.sympyextensions import fast_subs


 class RNGBase(CustomCodeNode):
@@ -50,7 +31,7 @@ class RNGBase(CustomCodeNode):
        symbols_read = set.union(*[s.atoms(sp.Symbol) for s in self.args])
        super().__init__("", symbols_read=symbols_read, symbols_defined=self.result_symbols)

-        self.headers = [f'"{self._name}_rand.h"']
+        self.headers = [f'"{self._name.split("_")[0]}_rand.h"']

        RNGBase.id += 1

@@ -58,9 +39,23 @@ class RNGBase(CustomCodeNode):
    def args(self):
        return self._args

-    def get_code(self, dialect, vector_instruction_set):
-        template = _get_rng_template(self._name, self._data_type, self._num_vars)
-        return _get_rng_code(template, dialect, vector_instruction_set, self.args, self.result_symbols)
+    def fast_subs(self, subs_dict, skip):
+        rng = copy.deepcopy(self)
+        rng._args = [fast_subs(a, subs_dict, skip) for a in rng._args]
+        return rng
+
+    def get_code(self, dialect, vector_instruction_set, print_arg):
+        code = "\n"
+        for r in self.result_symbols:
+            if vector_instruction_set and isinstance(self.args[1], sp.Integer):
+                # this vector RNG has become scalar through substitution
+                code += f"{r.dtype} {r.name};\n"
+            else:
+                code += f"{vector_instruction_set[r.dtype.base_name] if vector_instruction_set else r.dtype} " + \
+                        f"{r.name};\n"
+        code += (self._name + "(" + ", ".join([print_arg(a) for a in self.args]
+                                              + [r.name for r in self.result_symbols]) + ");\n")
+        return code

    def __repr__(self):
        return (", ".join(['{}'] * self._num_vars) + " \\leftarrow {}RNG").format(*self.result_symbols,
@@ -68,37 +63,53 @@ class RNGBase(CustomCodeNode):


 class PhiloxTwoDoubles(RNGBase):
-    _name = "philox"
+    _name = "philox_double2"
    _data_type = np.float64
    _num_vars = 2
    _num_keys = 2


 class PhiloxFourFloats(RNGBase):
-    _name = "philox"
+    _name = "philox_float4"
    _data_type = np.float32
    _num_vars = 4
    _num_keys = 2


 class AESNITwoDoubles(RNGBase):
-    _name = "aesni"
+    _name = "aesni_double2"
    _data_type = np.float64
    _num_vars = 2
    _num_keys = 4


 class AESNIFourFloats(RNGBase):
-    _name = "aesni"
+    _name = "aesni_float4"
    _data_type = np.float32
    _num_vars = 4
    _num_keys = 4


-def random_symbol(assignment_list, seed=TypedSymbol("seed", np.uint32), rng_node=PhiloxTwoDoubles, *args, **kwargs):
+def random_symbol(assignment_list, dim, seed=TypedSymbol("seed", np.uint32), rng_node=PhiloxTwoDoubles,
+                  time_step=TypedSymbol("time_step", np.uint32), offsets=None):
+    """Return a symbol generator for random numbers
+    
+    Args:
+        assignment_list: the subexpressions member of an AssignmentCollection, into which helper variables assignments
+                         will be inserted
+        dim: 2 or 3 for two or three spatial dimensions
+        seed: an integer or TypedSymbol(..., np.uint32) to seed the random number generator. If you create multiple
+              symbol generators, please pass them different seeds so you don't get the same stream of random numbers!
+        rng_node: which random number generator to use (PhiloxTwoDoubles, PhiloxFourFloats, AESNITwoDoubles,
+                  AESNIFourFloats).
+        time_step: TypedSymbol(..., np.uint32) that indicates the number of the current time step
+        offsets: tuple of offsets (constant integers or TypedSymbol(..., np.uint32)) that give the global coordinates
+                 of the local origin
+    """
    counter = 0
    while True:
-        node = rng_node(*args, keys=(counter, seed), **kwargs)
+        keys = (counter, seed) + (0,) * (rng_node._num_keys - 2)
+        node = rng_node(dim, keys=keys, time_step=time_step, offsets=offsets)
        inserted = False
        for symbol in node.result_symbols:
            if not inserted:

--- a/pystencils_tests/test_interpolation.py
+++ b/pystencils_tests/test_interpolation.py
@@ -77,14 +77,7 @@ def test_scale_interpolation():
        pyconrad.imshow(out, "out " + address_mode)


-@pytest.mark.parametrize('address_mode',
-                         ['border',
-                          'clamp',
-                          pytest.param('warp', marks=pytest.mark.xfail(
-                              reason="requires interpolation-refactoring branch")),
-                          pytest.param('mirror', marks=pytest.mark.xfail(
-                              reason="requires interpolation-refactoring branch")),
-                          ])
+@pytest.mark.parametrize('address_mode', ['border', 'clamp'])
 def test_rotate_interpolation(address_mode):
    """
    'wrap', 'mirror' currently fails on new sympy due to conjugate()
@@ -144,18 +137,13 @@ def test_rotate_interpolation_gpu(dtype, address_mode, use_textures):
                    f"out {address_mode} texture:{use_textures} {type_map[dtype]}")


-@pytest.mark.parametrize('address_mode', ['border', 'wrap',
-                                          pytest.param('warp', marks=pytest.mark.xfail(
-                                              reason="% printed as fmod on old sympy")),
-                                          pytest.param('mirror', marks=pytest.mark.xfail(
-                                              reason="% printed as fmod on old sympy")),
-                                          ])
+@pytest.mark.parametrize('address_mode', ['border', 'wrap', 'mirror'])
 @pytest.mark.parametrize('dtype', [np.float64, np.float32, np.int32])
 @pytest.mark.parametrize('use_textures', ('use_textures', False,))
 def test_shift_interpolation_gpu(address_mode, dtype, use_textures):
    sver = sympy.__version__.split(".")
-    if (int(sver[0]) == 1 and int(sver[1]) < 2) and address_mode in ['mirror', 'warp']:
-        pytest.skip()
+    if (int(sver[0]) == 1 and int(sver[1]) < 2) and address_mode == 'mirror':
+        pytest.skip("% printed as fmod on old sympy")
    pytest.importorskip('pycuda')

    import pycuda.gpuarray as gpuarray

--- a/pystencils_tests/test_random.py
+++ b/pystencils_tests/test_random.py
@@ -4,108 +4,197 @@ import pytest

 import pystencils as ps
 from pystencils.rng import PhiloxFourFloats, PhiloxTwoDoubles, AESNIFourFloats, AESNITwoDoubles, random_symbol
-
-
-# curand_Philox4x32_10(make_uint4(124, i, j, 0), make_uint2(0, 0))
-philox_reference = np.array([[[3576608082, 1252663339, 1987745383,  348040302],
-                              [1032407765,  970978240, 2217005168, 2424826293]],
-                             [[2958765206, 3725192638, 2623672781, 1373196132],
-                              [ 850605163, 1694561295, 3285694973, 2799652583]]])
-
-@pytest.mark.parametrize('target', ('cpu', 'gpu'))
-def test_philox_double(target):
+from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets
+from pystencils.cpu.cpujit import get_compiler_config
+from pystencils.data_types import TypedSymbol
+
+RNGs = {('philox', 'float'): PhiloxFourFloats, ('philox', 'double'): PhiloxTwoDoubles,
+        ('aesni', 'float'): AESNIFourFloats, ('aesni', 'double'): AESNITwoDoubles}
+
+instruction_sets = get_supported_instruction_sets()
+if get_compiler_config()['os'] == 'windows':
+    # skip instruction sets supported by CPU but not the compiler
+    if 'avx' in instruction_sets and ('/arch:avx2' not in get_compiler_config()['flags'].lower()
+                                      or '/arch:avx512' not in get_compiler_config()['flags'].lower()):
+        instruction_sets.remove('avx')
+    if 'avx512' in instruction_sets and '/arch:avx512' not in get_compiler_config()['flags'].lower():
+        instruction_sets.remove('avx512')
+
+
+@pytest.mark.parametrize('target,rng', (('cpu', 'philox'), ('cpu', 'aesni'), ('gpu', 'philox')))
+@pytest.mark.parametrize('precision', ('float', 'double'))
+@pytest.mark.parametrize('dtype', ('float', 'double'))
+def test_rng(target, rng, precision, dtype, t=124, offsets=(0, 0), keys=(0, 0), offset_values=None):
    if target == 'gpu':
        pytest.importorskip('pycuda')
+    if rng == 'aesni' and len(keys) == 2:
+        keys *= 2
+    if offset_values is None:
+        offset_values = offsets

    dh = ps.create_data_handling((2, 2), default_ghost_layers=0, default_target=target)
-    f = dh.add_array("f", values_per_cell=2)
+    f = dh.add_array("f", values_per_cell=4 if precision == 'float' else 2,
+                     dtype=np.float32 if dtype == 'float' else np.float64)
+    dh.fill(f.name, 42.0)

-    dh.fill('f', 42.0)
-
-    philox_node = PhiloxTwoDoubles(dh.dim)
-    assignments = [philox_node,
-                   ps.Assignment(f(0), philox_node.result_symbols[0]),
-                   ps.Assignment(f(1), philox_node.result_symbols[1])]
+    rng_node = RNGs[(rng, precision)](dh.dim, offsets=offsets, keys=keys)
+    assignments = [rng_node] + [ps.Assignment(f(i), s) for i, s in enumerate(rng_node.result_symbols)]
    kernel = ps.create_kernel(assignments, target=dh.default_target).compile()

    dh.all_to_gpu()
-    dh.run_kernel(kernel, time_step=124)
+    kwargs = {'time_step': t}
+    if offset_values != offsets:
+        kwargs.update({k.name: v for k, v in zip(offsets, offset_values)})
+    dh.run_kernel(kernel, **kwargs)
    dh.all_to_cpu()
-
-    arr = dh.gather_array('f')
+    arr = dh.gather_array(f.name)
    assert np.logical_and(arr <= 1.0, arr >= 0).all()

-    x = philox_reference[:,:,0::2]
-    y = philox_reference[:,:,1::2]
-    z = x ^ y << (53 - 32)
-    double_reference = z * 2.**-53 + 2.**-54
-    assert(np.allclose(arr, double_reference, rtol=0, atol=np.finfo(np.float64).eps))
-
-
-@pytest.mark.parametrize('target', ('cpu', 'gpu'))
-def test_philox_float(target):
-    if target == 'gpu':
-        pytest.importorskip('pycuda')
-
-    dh = ps.create_data_handling((2, 2), default_ghost_layers=0, default_target=target)
-    f = dh.add_array("f", values_per_cell=4)
-
-    dh.fill('f', 42.0)
-
-    philox_node = PhiloxFourFloats(dh.dim)
-    assignments = [philox_node] + [ps.Assignment(f(i), philox_node.result_symbols[i]) for i in range(4)]
+    if rng == 'philox' and t == 124 and offsets == (0, 0) and keys == (0, 0) and dh.shape == (2, 2):
+        int_reference = np.array([[[3576608082, 1252663339, 1987745383, 348040302],
+                                   [1032407765, 970978240, 2217005168, 2424826293]],
+                                  [[2958765206, 3725192638, 2623672781, 1373196132],
+                                   [850605163, 1694561295, 3285694973, 2799652583]]])
+    else:
+        pytest.importorskip('randomgen')
+        if rng == 'aesni':
+            from randomgen import AESCounter
+            int_reference = np.empty(dh.shape + (4,), dtype=int)
+            for x in range(dh.shape[0]):
+                for y in range(dh.shape[1]):
+                    r = AESCounter(counter=t + (x + offset_values[0]) * 2 ** 32 + (y + offset_values[1]) * 2 ** 64,
+                                   key=keys[0] + keys[1] * 2 ** 32 + keys[2] * 2 ** 64 + keys[3] * 2 ** 96,
+                                   mode="sequence")
+                    a, b = r.random_raw(size=2)
+                    int_reference[x, y, :] = [a % 2 ** 32, a // 2 ** 32, b % 2 ** 32, b // 2 ** 32]
+        else:
+            from randomgen import Philox
+            int_reference = np.empty(dh.shape + (4,), dtype=int)
+            for x in range(dh.shape[0]):
+                for y in range(dh.shape[1]):
+                    r = Philox(counter=t + (x + offset_values[0]) * 2 ** 32 + (y + offset_values[1]) * 2 ** 64,
+                               key=keys[0] + keys[1] * 2 ** 32, number=4, width=32)
+                    r.advance(-4, counter=False)
+                    int_reference[x, y, :] = r.random_raw(size=4)
+
+    if precision == 'float' or dtype == 'float':
+        eps = np.finfo(np.float32).eps
+    else:
+        eps = np.finfo(np.float64).eps
+    if rng == 'aesni':  # precision appears to be slightly worse
+        eps = max(1e-12, 2 * eps)
+
+    if precision == 'float':
+        reference = int_reference * 2. ** -32 + 2. ** -33
+    else:
+        x = int_reference[:, :, 0::2]
+        y = int_reference[:, :, 1::2]
+        z = x ^ y << (53 - 32)
+        reference = z * 2. ** -53 + 2. ** -54
+    assert np.allclose(arr, reference, rtol=0, atol=eps)
+
+
+@pytest.mark.parametrize('vectorized', (False, True))
+@pytest.mark.parametrize('kind', ('value', 'symbol'))
+def test_rng_offsets(kind, vectorized):
+    if vectorized:
+        test = test_rng_vectorized
+        if not instruction_sets:
+            pytest.skip("cannot detect CPU instruction set")
+    else:
+        test = test_rng
+    if kind == 'value':
+        test(instruction_sets[0] if vectorized else 'cpu', 'philox', 'float', 'float', t=8,
+             offsets=(6, 7), keys=(5, 309))
+    elif kind == 'symbol':
+        offsets = (TypedSymbol("x0", np.uint32), TypedSymbol("y0", np.uint32))
+        test(instruction_sets[0] if vectorized else 'cpu', 'philox', 'float', 'float', t=8,
+             offsets=offsets, offset_values=(6, 7), keys=(5, 309))
+
+
+@pytest.mark.parametrize('target', instruction_sets)
+@pytest.mark.parametrize('rng', ('philox', 'aesni'))
+@pytest.mark.parametrize('precision,dtype', (('float', 'float'), ('double', 'double')))
+def test_rng_vectorized(target, rng, precision, dtype, t=130, offsets=(1, 3), keys=(0, 0), offset_values=None):
+    cpu_vectorize_info = {'assume_inner_stride_one': True, 'assume_aligned': True, 'instruction_set': target}
+
+    dh = ps.create_data_handling((17, 17), default_ghost_layers=0, default_target='cpu')
+    f = dh.add_array("f", values_per_cell=4 if precision == 'float' else 2,
+                     dtype=np.float32 if dtype == 'float' else np.float64, alignment=True)
+    dh.fill(f.name, 42.0)
+    ref = dh.add_array("ref", values_per_cell=4 if precision == 'float' else 2)
+
+    rng_node = RNGs[(rng, precision)](dh.dim, offsets=offsets)
+    assignments = [rng_node] + [ps.Assignment(ref(i), s) for i, s in enumerate(rng_node.result_symbols)]
    kernel = ps.create_kernel(assignments, target=dh.default_target).compile()

-    dh.all_to_gpu()
-    dh.run_kernel(kernel, time_step=124)
-    dh.all_to_cpu()
-    arr = dh.gather_array('f')
-    assert np.logical_and(arr <= 1.0, arr >= 0).all()
+    kwargs = {'time_step': t}
+    if offset_values is not None:
+        kwargs.update({k.name: v for k, v in zip(offsets, offset_values)})
+    dh.run_kernel(kernel, **kwargs)

-    float_reference = philox_reference * 2.**-32 + 2.**-33
-    assert(np.allclose(arr, float_reference, rtol=0, atol=np.finfo(np.float32).eps))
+    rng_node = RNGs[(rng, precision)](dh.dim, offsets=offsets)
+    assignments = [rng_node] + [ps.Assignment(f(i), s) for i, s in enumerate(rng_node.result_symbols)]
+    kernel = ps.create_kernel(assignments, target=dh.default_target, cpu_vectorize_info=cpu_vectorize_info).compile()

-def test_aesni_double():
-    dh = ps.create_data_handling((2, 2), default_ghost_layers=0, default_target="cpu")
-    f = dh.add_array("f", values_per_cell=2)
+    dh.run_kernel(kernel, **kwargs)

-    dh.fill('f', 42.0)
+    ref_data = dh.gather_array(ref.name)
+    data = dh.gather_array(f.name)

-    aesni_node = AESNITwoDoubles(dh.dim)
-    assignments = [aesni_node,
-                   ps.Assignment(f(0), aesni_node.result_symbols[0]),
-                   ps.Assignment(f(1), aesni_node.result_symbols[1])]
-    kernel = ps.create_kernel(assignments, target=dh.default_target).compile()
+    assert np.allclose(ref_data, data)

-    dh.all_to_gpu()
-    dh.run_kernel(kernel, time_step=124)
-    dh.all_to_cpu()
-
-    arr = dh.gather_array('f')
-    assert np.logical_and(arr <= 1.0, arr >= 0).all()
-
-
-def test_aesni_float():
-    dh = ps.create_data_handling((2, 2), default_ghost_layers=0, default_target="cpu")
-    f = dh.add_array("f", values_per_cell=4)
-
-    dh.fill('f', 42.0)
-
-    aesni_node = AESNIFourFloats(dh.dim)
-    assignments = [aesni_node] + [ps.Assignment(f(i), aesni_node.result_symbols[i]) for i in range(4)]
-    kernel = ps.create_kernel(assignments, target=dh.default_target).compile()
-
-    dh.all_to_gpu()
-    dh.run_kernel(kernel, time_step=124)
-    dh.all_to_cpu()
-    arr = dh.gather_array('f')
-    assert np.logical_and(arr <= 1.0, arr >= 0).all()

-def test_staggered():
+@pytest.mark.parametrize('vectorized', (False, True))
+def test_rng_symbol(vectorized):
+    """Make sure that the RNG symbol generator generates symbols and that the resulting code compiles"""
+    if vectorized:
+        if not instruction_sets:
+            pytest.skip("cannot detect CPU instruction set")
+        else:
+            cpu_vectorize_info = {'assume_inner_stride_one': True, 'assume_aligned': True,
+                                  'instruction_set': instruction_sets[0]}
+    else:
+        cpu_vectorize_info = None
+    
+    dh = ps.create_data_handling((8, 8), default_ghost_layers=0, default_target="cpu")
+    f = dh.add_array("f", values_per_cell=2 * dh.dim, alignment=True)
+    ac = ps.AssignmentCollection([ps.Assignment(f(i), 0) for i in range(f.shape[-1])])
+    rng_symbol_gen = random_symbol(ac.subexpressions, dim=dh.dim)
+    for i in range(f.shape[-1]):
+        ac.main_assignments[i] = ps.Assignment(ac.main_assignments[i].lhs, next(rng_symbol_gen))
+    symbols = [a.rhs for a in ac.main_assignments]
+    assert len(symbols) == f.shape[-1] and len(set(symbols)) == f.shape[-1]
+    ps.create_kernel(ac, target=dh.default_target, cpu_vectorize_info=cpu_vectorize_info).compile()
+
+
+@pytest.mark.parametrize('vectorized', (False, True))
+def test_staggered(vectorized):
    """Make sure that the RNG counter can be substituted during loop cutting"""
+
    dh = ps.create_data_handling((8, 8), default_ghost_layers=0, default_target="cpu")
    j = dh.add_array("j", values_per_cell=dh.dim, field_type=ps.FieldType.STAGGERED_FLUX)
    a = ps.AssignmentCollection([ps.Assignment(j.staggered_access(n), 0) for n in j.staggered_stencil])
-    rng_symbol_gen = random_symbol(a.subexpressions, dim=dh.dim)
+    rng_symbol_gen = random_symbol(a.subexpressions, dim=dh.dim, rng_node=AESNITwoDoubles)
    a.main_assignments[0] = ps.Assignment(a.main_assignments[0].lhs, next(rng_symbol_gen))
    kernel = ps.create_staggered_kernel(a, target=dh.default_target).compile()
+
+    if not vectorized:
+        return
+    if not instruction_sets:
+        pytest.skip("cannot detect CPU instruction set")
+    pytest.importorskip('islpy')
+    cpu_vectorize_info = {'assume_inner_stride_one': True, 'assume_aligned': False,
+                          'instruction_set': instruction_sets[0]}
+    
+    dh.fill(j.name, 867)
+    dh.run_kernel(kernel, seed=5, time_step=309)
+    ref_data = dh.gather_array(j.name)
+    
+    kernel2 = ps.create_staggered_kernel(a, target=dh.default_target, cpu_vectorize_info=cpu_vectorize_info).compile()
+
+    dh.fill(j.name, 867)
+    dh.run_kernel(kernel2, seed=5, time_step=309)
+    data = dh.gather_array(j.name)
+    
+    assert np.allclose(ref_data, data)
--- a/setup.py
+++ b/setup.py
@@ -124,7 +124,8 @@ setuptools.setup(name='pystencils',
                                'flake8',
                                'nbformat',
                                'nbconvert',
-                                'ipython'],
+                                'ipython',
+                                'randomgen>=1.18'],

                 python_requires=">=3.6",
                 cmdclass={