Michael Kuron · 2f299689 · 30341d80 · b45339bb · 6562a56c · 2f299689
--- a/pystencils/backends/arm_instruction_sets.py

+ 81

− 34
+++ b/pystencils/backends/arm_instruction_sets.py

+ 81

− 34
-def get_argument_string(function_shortcut):
+def get_argument_string(function_shortcut, first=''):
    args = function_shortcut[function_shortcut.index('[') + 1: -1]
    arg_string = "("
+    if first:
+        arg_string += first + ', '
    for arg in args.split(","):
        arg = arg.strip()
        if not arg:
 @@ -14,8 +16,17 @@ def get_argument_string(function_shortcut):
 @@ -14,8 +16,17 @@ def get_argument_string(function_shortcut):
 def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
-    if instruction_set != 'neon':
+    if instruction_set != 'neon' and not instruction_set.startswith('sve'):
        raise NotImplementedError(instruction_set)
+    if instruction_set == 'sve':
+        raise NotImplementedError("sizeless SVE is not implemented")
+    if instruction_set.startswith('sve'):
+        cmp = 'cmp'
+        bitwidth = int(instruction_set[3:])
+    elif instruction_set == 'neon':
+        cmp = 'c'
+        bitwidth = 128
    base_names = {
        '+': 'add[0, 1]',
 @@ -30,58 +41,94 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
 @@ -30,58 +41,94 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
        'storeA': 'st1[0, 1]',
        'abs': 'abs[0]',
-        '==': 'ceq[0, 1]',
+        '==': f'{cmp}eq[0, 1]',
-        '<=': 'cle[0, 1]',
+        '!=': f'{cmp}eq[0, 1]',
-        '<': 'clt[0, 1]',
+        '<=': f'{cmp}le[0, 1]',
-        '>=': 'cge[0, 1]',
+        '<': f'{cmp}lt[0, 1]',
-        '>': 'cgt[0, 1]',
+        '>=': f'{cmp}ge[0, 1]',
+        '>': f'{cmp}gt[0, 1]',
    }
    bits = {'double': 64,
            'float': 32,
            'int': 32}
-    width = 128 // bits[data_type]
+    width = bitwidth // bits[data_type]
-    intwidth = 128 // bits['int']
+    intwidth = bitwidth // bits['int']
-    suffix = f'q_f{bits[data_type]}'
+    if instruction_set.startswith('sve'):
+        prefix = 'sv'
+        suffix = f'_f{bits[data_type]}' 
+    elif instruction_set == 'neon':
+        prefix = 'v'
+        suffix = f'q_f{bits[data_type]}' 
    result = dict()
-    result['bytes'] = 16
+    result['bytes'] = bitwidth // 8
+    predicate = f'{prefix}whilelt_b{bits[data_type]}(0, {width})'
+    int_predicate = f'{prefix}whilelt_b{bits["int"]}(0, {intwidth})'
    for intrinsic_id, function_shortcut in base_names.items():
        function_shortcut = function_shortcut.strip()
        name = function_shortcut[:function_shortcut.index('[')]
-        arg_string = get_argument_string(function_shortcut)
+        arg_string = get_argument_string(function_shortcut, first=predicate if prefix == 'sv' else '')
+        if prefix == 'sv' and not name.startswith('ld') and not name.startswith('st') and not name.startswith(cmp):
+            undef = '_x'
+        else:
+            undef = ''
-        result[intrinsic_id] = 'v' + name + suffix + arg_string
+        result[intrinsic_id] = prefix + name + suffix + undef + arg_string
-    result['makeVecConst'] = f'vdupq_n_f{bits[data_type]}' + '({0})'
+    result['width'] = width
-    result['makeVec'] = f'makeVec_f{bits[data_type]}' + '(' + ", ".join(['{' + str(i) + '}' for i in range(width)]) + \
+    result['intwidth'] = intwidth
-        ')'
-    result['makeVecConstInt'] = f'vdupq_n_s{bits["int"]}' + '({0})'
-    result['makeVecInt'] = f'makeVec_s{bits["int"]}' + '({0}, {1}, {2}, {3})'
-    result['+int'] = f"vaddq_s{bits['int']}" + "({0}, {1})"
+    if instruction_set.startswith('sve'):
+        result['makeVecConst'] = f'svdup_f{bits[data_type]}' + '({0})'
+        result['makeVecConstInt'] = f'svdup_s{bits["int"]}' + '({0})'
+        result['makeVecIndex'] = f'svindex_s{bits["int"]}' + '({0}, {1})'
-    result['rsqrt'] = None
+        result['+int'] = f"svadd_s{bits['int']}_x({int_predicate}, " + "{0}, {1})"
-    result['width'] = width
+        result[data_type] = f'svfloat{bits[data_type]}_st'
-    result['intwidth'] = intwidth
+        result['int'] = f'svint{bits["int"]}_st'
-    result[data_type] = f'float{bits[data_type]}x{width}_t'
+        result['bool'] = 'svbool_st'
-    result['int'] = f'int{bits["int"]}x{bits[data_type]}_t'
-    result['bool'] = f'uint{bits[data_type]}x{width}_t'
+        result['headers'] = ['<arm_sve.h>', '"arm_neon_helpers.h"']
-    result['headers'] = ['<arm_neon.h>', '"arm_neon_helpers.h"']
+        result['&'] = f'svand_b_z({predicate},' + ' {0}, {1})'
+        result['|'] = f'svorr_b_z({predicate},' + ' {0}, {1})'
+        result['blendv'] = f'svsel_f{bits[data_type]}' + '({2}, {1}, {0})'
+        result['any'] = f'svptest_any({predicate}, {{0}})'
+        result['all'] = f'svcntp_b{bits[data_type]}({predicate}, {{0}}) == {width}'
+        result['compile_flags'] = [f'-msve-vector-bits={bitwidth}']
+    else:
+        result['makeVecConst'] = f'vdupq_n_f{bits[data_type]}' + '({0})'
+        result['makeVec'] = f'makeVec_f{bits[data_type]}' + '(' + ", ".join(['{' + str(i) + '}' for i in
+                                                                             range(width)]) + ')'
+        result['makeVecConstInt'] = f'vdupq_n_s{bits["int"]}' + '({0})'
+        result['makeVecInt'] = f'makeVec_s{bits["int"]}' + '({0}, {1}, {2}, {3})'
+        result['+int'] = f"vaddq_s{bits['int']}" + "({0}, {1})"
+        result[data_type] = f'float{bits[data_type]}x{width}_t'
+        result['int'] = f'int{bits["int"]}x{intwidth}_t'
+        result['bool'] = f'uint{bits[data_type]}x{width}_t'
+        result['headers'] = ['<arm_neon.h>', '"arm_neon_helpers.h"']
-    result['!='] = f'vmvnq_u{bits[data_type]}({result["=="]})'
+        result['!='] = f'vmvnq_u{bits[data_type]}({result["=="]})'
-    result['&'] = f'vandq_u{bits[data_type]}' + '({0}, {1})'
+        result['&'] = f'vandq_u{bits[data_type]}' + '({0}, {1})'
-    result['|'] = f'vorrq_u{bits[data_type]}' + '({0}, {1})'
+        result['|'] = f'vorrq_u{bits[data_type]}' + '({0}, {1})'
-    result['blendv'] = f'vbslq_f{bits[data_type]}' + '({2}, {1}, {0})'
+        result['blendv'] = f'vbslq_f{bits[data_type]}' + '({2}, {1}, {0})'
-    result['any'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) > 0'
+        result['any'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) > 0'
-    result['all'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) == 16*0xff'
+        result['all'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) == 16*0xff'
-    result['cachelineSize'] = 'cachelineSize()'
+    if bitwidth & (bitwidth - 1) == 0:
-    result['cachelineZero'] = 'cachelineZero((void*) {0})'
+        # only power-of-2 vector sizes will evenly divide a cacheline
+        result['cachelineSize'] = 'cachelineSize()'
+        result['cachelineZero'] = 'cachelineZero((void*) {0})'
    return result