diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 04e690dc684a55be2f2c52130d59a9a90b82ef10..f71c6d20346a935749586919c2f135400805728b 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -149,13 +149,19 @@ ubuntu:
     - sed -i 's/--doctest-modules //g' pytest.ini
     - env
     - pip3 list
-    - python3 -m pytest -v -n $NUM_CORES --junitxml=report.xml tests/test_*vec*.py tests/test_random.py tests/test_half_precision.py
+    - python3 -m pytest -v -n $NUM_CORES --cov-report html --cov-report xml --cov=. --junitxml=report.xml tests/test_*vec*.py tests/test_random.py tests/test_half_precision.py
+    - python3 -m coverage xml
   tags:
     - docker
     - AVX
   artifacts:
     when: always
+    paths:
+      - coverage_report
     reports:
+      coverage_report:
+        coverage_format: cobertura
+        path: coverage.xml
       junit: report.xml
 
 arm64v8:
@@ -181,7 +187,7 @@ arm64v9:
   image: i10git.cs.fau.de:5005/pycodegen/pycodegen/arm64
   before_script:
     - *multiarch_before_script
-    - sed -i s/march=native/march=armv8-a+sve/g ~/.config/pystencils/config.json
+    - sed -i s/march=native/march=armv8-a+sve+sme/g ~/.config/pystencils/config.json
     - sed -i s/g\+\+/clang++/g ~/.config/pystencils/config.json
 
 riscv64:
diff --git a/src/pystencils/backends/arm_instruction_sets.py b/src/pystencils/backends/arm_instruction_sets.py
index 7dede78aa8b636e1670ab4ec5ad540950e0b3aff..3e50d5f45374b2ece2aa21487db82a8e8ebb0097 100644
--- a/src/pystencils/backends/arm_instruction_sets.py
+++ b/src/pystencils/backends/arm_instruction_sets.py
@@ -16,9 +16,9 @@ def get_argument_string(function_shortcut, first=''):
 
 
 def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
-    if instruction_set != 'neon' and not instruction_set.startswith('sve'):
+    if instruction_set not in ['neon', 'sme'] and not instruction_set.startswith('sve'):
         raise NotImplementedError(instruction_set)
-    if instruction_set == 'sve':
+    if instruction_set in ['sve', 'sme']:
         cmp = 'cmp'
     elif instruction_set.startswith('sve'):
         cmp = 'cmp'
@@ -52,7 +52,7 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
 
     result = dict()
 
-    if instruction_set == 'sve':
+    if instruction_set in ['sve', 'sme']:
         width = 'svcntd()' if data_type == 'double' else 'svcntw()'
         intwidth = 'svcntw()'
         result['bytes'] = 'svcntb()'
@@ -60,14 +60,14 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
         width = bitwidth // bits[data_type]
         intwidth = bitwidth // bits['int']
         result['bytes'] = bitwidth // 8
-    if instruction_set.startswith('sve'):
+    if instruction_set.startswith('sve') or instruction_set == 'sme':
         prefix = 'sv'
         suffix = f'_f{bits[data_type]}' 
     elif instruction_set == 'neon':
         prefix = 'v'
         suffix = f'q_f{bits[data_type]}' 
 
-    if instruction_set == 'sve':
+    if instruction_set in ['sve', 'sme']:
         predicate = f'{prefix}whilelt_b{bits[data_type]}_u64({{loop_counter}}, {{loop_stop}})'
         int_predicate = f'{prefix}whilelt_b{bits["int"]}_u64({{loop_counter}}, {{loop_stop}})'
     else:
@@ -86,7 +86,7 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
 
         result[intrinsic_id] = prefix + name + suffix + undef + arg_string
 
-    if instruction_set == 'sve':
+    if instruction_set in ['sve', 'sme']:
         from pystencils.backends.cbackend import CFunction
         result['width'] = CFunction(width, "int")
         result['intwidth'] = CFunction(intwidth, "int")
@@ -94,23 +94,24 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
         result['width'] = width
         result['intwidth'] = intwidth
 
-    if instruction_set.startswith('sve'):
+    if instruction_set.startswith('sve') or instruction_set == 'sme':
         result['makeVecConst'] = f'svdup_f{bits[data_type]}' + '({0})'
         result['makeVecConstInt'] = f'svdup_s{bits["int"]}' + '({0})'
         result['makeVecIndex'] = f'svindex_s{bits["int"]}' + '({0}, {1})'
 
-        vindex = f'svindex_u{bits[data_type]}(0, {{0}})'
-        result['storeS'] = f'svst1_scatter_u{bits[data_type]}index_f{bits[data_type]}({predicate}, {{0}}, ' + \
-                           vindex.format("{2}") + ', {1})'
-        result['loadS'] = f'svld1_gather_u{bits[data_type]}index_f{bits[data_type]}({predicate}, {{0}}, ' + \
-                          vindex.format("{1}") + ')'
+        if instruction_set != 'sme':
+            vindex = f'svindex_u{bits[data_type]}(0, {{0}})'
+            result['storeS'] = f'svst1_scatter_u{bits[data_type]}index_f{bits[data_type]}({predicate}, {{0}}, ' + \
+                               vindex.format("{2}") + ', {1})'
+            result['loadS'] = f'svld1_gather_u{bits[data_type]}index_f{bits[data_type]}({predicate}, {{0}}, ' + \
+                              vindex.format("{1}") + ')'
 
         result['+int'] = f"svadd_s{bits['int']}_x({int_predicate}, " + "{0}, {1})"
 
-        result['float'] = f'svfloat{bits["float"]}_{"s" if instruction_set != "sve" else ""}t'
-        result['double'] = f'svfloat{bits["double"]}_{"s" if instruction_set != "sve" else ""}t'
-        result['int'] = f'svint{bits["int"]}_{"s" if instruction_set != "sve" else ""}t'
-        result['bool'] = f'svbool_{"s" if instruction_set != "sve" else ""}t'
+        result['float'] = f'svfloat{bits["float"]}_{"s" if instruction_set not in ["sve", "sme"] else ""}t'
+        result['double'] = f'svfloat{bits["double"]}_{"s" if instruction_set not in ["sve", "sme"] else ""}t'
+        result['int'] = f'svint{bits["int"]}_{"s" if instruction_set not in ["sve", "sme"] else ""}t'
+        result['bool'] = f'svbool_{"s" if instruction_set not in ["sve", "sme"] else ""}t'
 
         result['headers'] = ['<arm_sve.h>', '"arm_neon_helpers.h"']
 
@@ -121,9 +122,12 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
         result['all'] = f'svcntp_b{bits[data_type]}({predicate}, {{0}}) == {width}'
 
         result['maskStoreU'] = result['storeU'].replace(predicate, '{2}')
-        result['maskStoreS'] = result['storeS'].replace(predicate, '{3}')
+        if instruction_set != 'sme':
+            result['maskStoreS'] = result['storeS'].replace(predicate, '{3}')
 
-        if instruction_set != 'sve':
+        if instruction_set == 'sme':
+            result['function_prefix'] = '__attribute__((arm_locally_streaming))'
+        elif instruction_set not in ['sve', 'sme']:
             result['compile_flags'] = [f'-msve-vector-bits={bitwidth}']
     else:
         result['makeVecConst'] = f'vdupq_n_f{bits[data_type]}' + '({0})'
diff --git a/src/pystencils/backends/simd_instruction_sets.py b/src/pystencils/backends/simd_instruction_sets.py
index e9bce873751fb6639cbb77ba2427d6c68c0b3f8f..b94d9f3744d0fa97d901953047583f6719465f41 100644
--- a/src/pystencils/backends/simd_instruction_sets.py
+++ b/src/pystencils/backends/simd_instruction_sets.py
@@ -22,7 +22,7 @@ def get_vector_instruction_set(data_type='double', instruction_set='avx'):
 
     type_name = numpy_name_to_c(np.dtype(data_type).name)
 
-    if instruction_set in ['neon'] or instruction_set.startswith('sve'):
+    if instruction_set in ['neon', 'sme'] or instruction_set.startswith('sve'):
         return get_vector_instruction_set_arm(type_name, instruction_set)
     elif instruction_set in ['vsx']:
         return get_vector_instruction_set_ppc(type_name, instruction_set)
@@ -53,6 +53,9 @@ def get_supported_instruction_sets():
                 result.append(f"sve{length}")
                 length //= 2
             result.append("sve")
+        hwcap2 = libc.getauxval(26)  # AT_HWCAP2
+        if hwcap2 & (1 << 23):  # HWCAP2_SME
+            result.append("sme")
         return result
     elif platform.system() == 'Linux' and platform.machine().startswith('riscv'):
         libc = CDLL('libc.so.6')
diff --git a/src/pystencils/cpu/cpujit.py b/src/pystencils/cpu/cpujit.py
index b839f87cfc06385e0c48080795c339ce4c88556d..8cc10004505dffb6251b61474c2db1321ba7cffd 100644
--- a/src/pystencils/cpu/cpujit.py
+++ b/src/pystencils/cpu/cpujit.py
@@ -617,7 +617,12 @@ def compile_and_load(ast, custom_backend=None):
     cache_config = get_cache_config()
 
     compiler_config = get_compiler_config()
-    function_prefix = '__declspec(dllexport)' if compiler_config['os'].lower() == 'windows' else ''
+    if compiler_config['os'].lower() == 'windows':
+        function_prefix = '__declspec(dllexport)'
+    elif ast.instruction_set and 'function_prefix' in ast.instruction_set:
+        function_prefix = ast.instruction_set['function_prefix']
+    else:
+        function_prefix = ''
 
     code = ExtensionModuleCode(custom_backend=custom_backend)
     code.add_function(ast, ast.function_name)
diff --git a/src/pystencils/include/philox_rand.h b/src/pystencils/include/philox_rand.h
index 4320a8b9396e9f61060c7d50e512c0919eed6c56..8aa6a4e75683cb55384794bc7492121ae920add3 100644
--- a/src/pystencils/include/philox_rand.h
+++ b/src/pystencils/include/philox_rand.h
@@ -18,7 +18,7 @@
 #ifdef __ARM_NEON
 #include <arm_neon.h>
 #endif
-#ifdef __ARM_FEATURE_SVE
+#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_SME)
 #include <arm_sve.h>
 #endif
 
@@ -42,6 +42,8 @@
 #define QUALIFIERS static __forceinline__ __device__
 #elif defined(__OPENCL_VERSION__)
 #define QUALIFIERS static inline
+#elif defined(__ARM_FEATURE_SME)
+#define QUALIFIERS __attribute__((arm_streaming_compatible))
 #else
 #define QUALIFIERS inline
 #include "myintrin.h"
@@ -69,7 +71,7 @@ typedef std::uint64_t uint64;
 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS > 0
 typedef svfloat32_t svfloat32_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
 typedef svfloat64_t svfloat64_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
-#elif defined(__ARM_FEATURE_SVE)
+#elif defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_SME)
 typedef svfloat32_t svfloat32_st;
 typedef svfloat64_t svfloat64_st;
 #endif
@@ -714,7 +716,7 @@ QUALIFIERS void philox_double2(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32
 #endif
 
 
-#if defined(__ARM_FEATURE_SVE)
+#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_SME)
 QUALIFIERS void _philox4x32round(svuint32x4_t & ctr, svuint32x2_t & key)
 {
     svuint32_t lo0 = svmul_u32_x(svptrue_b32(), svget4_u32(ctr, 0), svdup_u32(PHILOX_M4x32_0));
diff --git a/tests/test_conditional_vec.py b/tests/test_conditional_vec.py
index ebb25d50d58b2a1ba0c8a5b7a5e089f1e6a7ea5d..032c8ab7844e0b158c96c1b6809e90fe6477cc95 100644
--- a/tests/test_conditional_vec.py
+++ b/tests/test_conditional_vec.py
@@ -15,7 +15,7 @@ supported_instruction_sets = get_supported_instruction_sets() if get_supported_i
 @pytest.mark.parametrize('instruction_set', supported_instruction_sets)
 @pytest.mark.parametrize('dtype', ('float32', 'float64'))
 def test_vec_any(instruction_set, dtype):
-    if instruction_set in ['sve', 'rvv']:
+    if instruction_set in ['sve', 'sme', 'rvv']:
         width = 4  # we don't know the actual value
     else:
         width = get_vector_instruction_set(dtype, instruction_set)['width']
@@ -34,7 +34,7 @@ def test_vec_any(instruction_set, dtype):
                            cpu_vectorize_info={'instruction_set': instruction_set})
     kernel = ast.compile()
     kernel(data=data_arr)
-    if instruction_set in ['sve', 'rvv']:
+    if instruction_set in ['sve', 'sme', 'rvv']:
         # we only know that the first value has changed
         np.testing.assert_equal(data_arr[3:9, :3 * width - 1], 2.0)
     else:
@@ -44,7 +44,7 @@ def test_vec_any(instruction_set, dtype):
 @pytest.mark.parametrize('instruction_set', supported_instruction_sets)
 @pytest.mark.parametrize('dtype', ('float32', 'float64'))
 def test_vec_all(instruction_set, dtype):
-    if instruction_set in ['sve', 'rvv']:
+    if instruction_set in ['sve', 'sme', 'rvv']:
         width = 1000  # we don't know the actual value, need something guaranteed larger than vector
     else:
         width = get_vector_instruction_set(dtype, instruction_set)['width']
@@ -59,7 +59,7 @@ def test_vec_all(instruction_set, dtype):
                            cpu_vectorize_info={'instruction_set': instruction_set})
     kernel = ast.compile()
     kernel(data=data_arr)
-    if instruction_set in ['sve', 'rvv']:
+    if instruction_set in ['sve', 'sme', 'rvv']:
         # we only know that some values in the middle have been replaced
         assert np.all(data_arr[3:9, :2] <= 1.0)
         assert np.any(data_arr[3:9, 2:] == 2.0)
diff --git a/tests/test_random.py b/tests/test_random.py
index e82bff309e636c3c07f4b1f82b552d67e6ccb3d7..21933e89304a67dd65c484b98a4483bde5ad0a61 100644
--- a/tests/test_random.py
+++ b/tests/test_random.py
@@ -32,7 +32,7 @@ if get_compiler_config()['os'] == 'windows':
 def test_rng(target, rng, precision, dtype, t=124, offsets=(0, 0), keys=(0, 0), offset_values=None):
     if target == Target.GPU:
         pytest.importorskip('cupy')
-    if instruction_sets and {'neon', 'sve', 'vsx', 'rvv'}.intersection(instruction_sets) and rng == 'aesni':
+    if instruction_sets and {'neon', 'sve', 'sme', 'vsx', 'rvv'}.intersection(instruction_sets) and rng == 'aesni':
         pytest.xfail('AES not yet implemented for this architecture')
     if rng == 'aesni' and len(keys) == 2:
         keys *= 2
@@ -122,7 +122,7 @@ def test_rng_offsets(kind, vectorized):
 @pytest.mark.parametrize('rng', ('philox', 'aesni'))
 @pytest.mark.parametrize('precision,dtype', (('float', 'float'), ('double', 'double')))
 def test_rng_vectorized(target, rng, precision, dtype, t=130, offsets=(1, 3), keys=(0, 0), offset_values=None):
-    if (target in ['neon', 'vsx', 'rvv'] or target.startswith('sve')) and rng == 'aesni':
+    if (target in ['neon', 'vsx', 'rvv', 'sme'] or target.startswith('sve')) and rng == 'aesni':
         pytest.xfail('AES not yet implemented for this architecture')
     cpu_vectorize_info = {'assume_inner_stride_one': True, 'assume_aligned': True, 'instruction_set': target}
 
diff --git a/tests/test_vectorization.py b/tests/test_vectorization.py
index d3066e1dfb863d806ce151849b55641e0ba3f4e0..d2350526e256a2913dd46607a0480f4f5279e9ab 100644
--- a/tests/test_vectorization.py
+++ b/tests/test_vectorization.py
@@ -146,7 +146,7 @@ def test_aligned_and_nt_stores(openmp, instruction_set=instruction_set):
     if instruction_set in ['sse'] or instruction_set.startswith('avx'):
         assert 'stream' in ast.instruction_set
         assert 'streamFence' in ast.instruction_set
-    if instruction_set in ['neon', 'vsx', 'rvv'] or instruction_set.startswith('sve'):
+    if instruction_set in ['neon', 'sme', 'vsx', 'rvv'] or instruction_set.startswith('sve'):
         assert 'cachelineZero' in ast.instruction_set
     if instruction_set in ['vsx']:
         assert 'storeAAndFlushCacheline' in ast.instruction_set
@@ -331,7 +331,7 @@ def test_logical_operators(instruction_set=instruction_set):
 
 
 def test_hardware_query():
-    assert {'sse', 'neon', 'sve', 'vsx', 'rvv'}.intersection(supported_instruction_sets)
+    assert {'sse', 'neon', 'sve', 'sme', 'vsx', 'rvv'}.intersection(supported_instruction_sets)
 
 
 def test_vectorised_pow(instruction_set=instruction_set):