diff --git a/src/pystencils/backends/arm_instruction_sets.py b/src/pystencils/backends/arm_instruction_sets.py
index 5e650c035b3bbdc9e6e707eb857aab7403985424..227224f4e65460a291bd3a6cd3309ed3525072fa 100644
--- a/src/pystencils/backends/arm_instruction_sets.py
+++ b/src/pystencils/backends/arm_instruction_sets.py
@@ -110,8 +110,8 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
             result['loadS'] = f'svld1_gather_u{bits[data_type]}index_f{bits[data_type]}({predicate}, {{0}}, ' + \
                               vindex.format("{1}") + ')'
         if instruction_set.startswith('sve2') and instruction_set not in ('sve256', 'sve2048'):
-            result['streamS'] = f'svstnt1_scatter_u{bits[data_type]}index_f{bits[data_type]}({predicate}, {{0}}, ' + \
-                                vindex.format("{2}") + ', {1})'
+            result['streamS'] = f'svstnt1_scatter_u{bits[data_type]}offset_f{bits[data_type]}({predicate}, {{0}}, ' + \
+                                vindex.format(f"{{2}}*{bits[data_type]//8}") + ', {1})'
 
         result['+int'] = f"svadd_s{bits['int']}_x({int_predicate}, " + "{0}, {1})"
 
diff --git a/src/pystencils/backends/cbackend.py b/src/pystencils/backends/cbackend.py
index 8f4d061786f09b9d6d479944944803d178abae11..a3853198b600db5c66dbb1da97d6807782fe3373 100644
--- a/src/pystencils/backends/cbackend.py
+++ b/src/pystencils/backends/cbackend.py
@@ -280,15 +280,22 @@ class CBackend:
             if type(lhs_type) is VectorType and isinstance(node.lhs, CastFunc):
                 arg, data_type, aligned, nontemporal, mask, stride = node.lhs.args
                 instr = 'storeU'
-                if aligned:
+                if nontemporal and 'storeA' not in self._vector_instruction_set and \
+                        'stream' in self._vector_instruction_set:
+                    instr = 'stream'
+                elif aligned:
                     instr = 'stream' if nontemporal and 'stream' in self._vector_instruction_set else 'storeA'
                 if mask != True:  # NOQA
                     instr = 'maskStream' if nontemporal and 'maskStream' in self._vector_instruction_set else \
                             'maskStoreA' if aligned else 'maskStoreU'
                     if instr not in self._vector_instruction_set:
-                        self._vector_instruction_set[instr] = self._vector_instruction_set['store' + instr[-1]].format(
+                        store = 'store' + instr[-1]
+                        store = store if store in self._vector_instruction_set else 'storeU'
+                        load = 'load' + instr[-1]
+                        load = load if load in self._vector_instruction_set else 'loadU'
+                        self._vector_instruction_set[instr] = self._vector_instruction_set[store].format(
                             '{0}', self._vector_instruction_set['blendv'].format(
-                                self._vector_instruction_set['load' + instr[-1]].format('{0}', **self._kwargs),
+                                self._vector_instruction_set[load].format('{0}', **self._kwargs),
                                 '{1}', '{2}', **self._kwargs), **self._kwargs)
                     printed_mask = self.sympy_printer.doprint(mask)
                     if data_type.base_type.c_name == 'double':
@@ -314,12 +321,13 @@ class CBackend:
 
                 if stride != 1:
                     instr = ('maskStreamS' if nontemporal and 'maskStreamS' in self._vector_instruction_set else
-                             'maskStoreS') if mask != True else ('streamS' if nontemporal else 'storeS') # NOQA
+                             'maskStoreS') if mask != True else \
+                            ('streamS' if nontemporal and 'streamS' in self._vector_instruction_set else 'storeS')  # NOQA
                     return self._vector_instruction_set[instr].format(ptr, self.sympy_printer.doprint(rhs),
                                                                       stride, printed_mask, **self._kwargs) + ';'
 
                 pre_code = ''
-                if nontemporal and 'cachelineZero' in self._vector_instruction_set:
+                if nontemporal and 'cachelineZero' in self._vector_instruction_set and mask == True:  # NOQA
                     first_cond = f"((uintptr_t) {ptr} & {CachelineSize.mask_symbol}) == 0"
                     offset = sp.Add(*[sp.Symbol(LoopOverCoordinate.get_loop_counter_name(i))
                                       * node.lhs.args[0].field.spatial_strides[i] for i in
@@ -339,15 +347,20 @@ class CBackend:
                     code2 = self._vector_instruction_set['flushCacheline'].format(
                         ptr, self.sympy_printer.doprint(rhs), **self._kwargs) + ';'
                     code = f"{code}\nif ({flushcond}) {{\n\t{code2}\n}}"
-                elif nontemporal and 'storeAAndFlushCacheline' in self._vector_instruction_set:
+                elif aligned and nontemporal and 'storeAAndFlushCacheline' in self._vector_instruction_set:
                     lhs_hash = hashlib.sha1(self.sympy_printer.doprint(node.lhs).encode('ascii')).hexdigest()[:8]
                     rhs_hash = hashlib.sha1(self.sympy_printer.doprint(rhs).encode('ascii')).hexdigest()[:8]
                     tmpvar = f'_tmp_{lhs_hash}_{rhs_hash}'
                     code = 'const ' + self._print(node.lhs.dtype).replace(' const', '') + ' ' + tmpvar + ' = ' \
                         + self.sympy_printer.doprint(rhs) + ';'
                     code1 = self._vector_instruction_set[instr].format(ptr, tmpvar, printed_mask, **self._kwargs) + ';'
-                    code2 = self._vector_instruction_set['storeAAndFlushCacheline'].format(ptr, tmpvar, printed_mask,
-                                                                                           **self._kwargs) + ';'
+                    instr2 = 'maskStoreAAndFlushCacheline' if mask != True else 'storeAAndFlushCacheline'  # NOQA
+                    if instr2 not in self._vector_instruction_set:
+                        self._vector_instruction_set[instr2] = self._vector_instruction_set['storeAAndFlushCacheline'] \
+                            .format('{0}', self._vector_instruction_set['blendv'].format(
+                                self._vector_instruction_set['loadA'].format('{0}', **self._kwargs),
+                                '{1}', '{2}', **self._kwargs), **self._kwargs)
+                    code2 = self._vector_instruction_set[instr2].format(ptr, tmpvar, printed_mask, **self._kwargs) + ';'
                     code += f"\nif ({flushcond}) {{\n\t{code2}\n}} else {{\n\t{code1}\n}}"
                 return pre_code + code
             else:
diff --git a/src/pystencils/backends/riscv_instruction_sets.py b/src/pystencils/backends/riscv_instruction_sets.py
index e456c2b8c05facda6f422d5061832331979c889f..27f631e7f92d25e366bc767c759697ac898f3308 100644
--- a/src/pystencils/backends/riscv_instruction_sets.py
+++ b/src/pystencils/backends/riscv_instruction_sets.py
@@ -34,7 +34,7 @@ def get_vector_instruction_set_riscv(data_type='double', instruction_set='rvv'):
         'maskStoreU': f'se{bits[data_type]}_v[2, 0, 1]',
         'loadS': f'lse{bits[data_type]}_v[0, 1]',
         'storeS': f'sse{bits[data_type]}_v[0, 2, 1]',
-        'maskStoreS': f'sse{bits[data_type]}_v[2, 0, 3, 1]',
+        'maskStoreS': f'sse{bits[data_type]}_v[3, 0, 2, 1]',
 
         'abs': 'fabs_v[0]',
         '==': 'mfeq_vv[0, 1]',
@@ -89,7 +89,7 @@ def get_vector_instruction_set_riscv(data_type='double', instruction_set='rvv'):
 
     result['storeS'] = result['storeS'].replace('{2}', f'{{2}}*{bits[data_type]//8}')
     result['loadS'] = result['loadS'].replace('{1}', f'{{1}}*{bits[data_type]//8}')
-    result['maskStoreS'] = result['maskStoreS'].replace('{3}', f'{{3}}*{bits[data_type]//8}')
+    result['maskStoreS'] = result['maskStoreS'].replace('{2}', f'{{2}}*{bits[data_type]//8}')
 
     result['+int'] = f"vadd_vv_i{bits['int']}m1({{0}}, {{1}}, {int_vl})"
 
diff --git a/tests/test_conditional_vec.py b/tests/test_conditional_vec.py
index 84d67f0cfe939b2a640889ec23551fd86c857ee3..0118178d51ca5ff5ebbd136a7ac29c3e6ca399c0 100644
--- a/tests/test_conditional_vec.py
+++ b/tests/test_conditional_vec.py
@@ -3,6 +3,7 @@ import sympy as sp
 import pytest
 
 import pystencils as ps
+from pystencils.alignedarray import aligned_zeros
 from pystencils.astnodes import Block, Conditional, SympyAssignment
 from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets, get_vector_instruction_set
 from pystencils.enums import Target
@@ -94,16 +95,60 @@ def test_boolean_before_loop():
 
 @pytest.mark.parametrize('instruction_set', supported_instruction_sets)
 @pytest.mark.parametrize('dtype', ('float32', 'float64'))
-def test_vec_maskstore(instruction_set, dtype):
-    data_arr = np.zeros((16, 16), dtype=dtype)
+@pytest.mark.parametrize('nontemporal', [False, True])
+@pytest.mark.parametrize('aligned', [False, True])
+def test_vec_maskstore(instruction_set, dtype, nontemporal, aligned):
+    data_arr = (aligned_zeros if aligned else np.zeros)((16, 16), dtype=dtype)
     data_arr[3:-3, 3:-3] = 1.0
     data = ps.fields(f"data: {dtype}[2D]", data=data_arr)
 
     c = [Conditional(data.center() < 1.0, Block([SympyAssignment(data.center(), 2.0)]))]
 
     assignmets = NodeCollection(c)
-    config = ps.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set}, default_number_float=dtype)
+    config = ps.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set,
+                                                       'nontemporal': nontemporal,
+                                                       'assume_aligned': aligned},
+                                   default_number_float=dtype)
     ast = ps.create_kernel(assignmets, config=config)
+    if 'maskStore' in ast.instruction_set:
+        instruction = 'maskStream' if nontemporal and 'maskStream' in ast.instruction_set else (
+                      'maskStoreA' if aligned and 'maskStoreA' in ast.instruction_set else 'maskStore')
+        assert ast.instruction_set[instruction].split('{')[0] in ps.get_code_str(ast)
+    print(ps.get_code_str(ast))
+    kernel = ast.compile()
+    kernel(data=data_arr)
+    np.testing.assert_equal(data_arr[:3, :], 2.0)
+    np.testing.assert_equal(data_arr[-3:, :], 2.0)
+    np.testing.assert_equal(data_arr[:, :3], 2.0)
+    np.testing.assert_equal(data_arr[:, -3:], 2.0)
+    np.testing.assert_equal(data_arr[3:-3, 3:-3], 1.0)
+
+
+@pytest.mark.parametrize('instruction_set', supported_instruction_sets)
+@pytest.mark.parametrize('dtype', ('float32', 'float64'))
+@pytest.mark.parametrize('nontemporal', [False, True])
+def test_vec_maskscatter(instruction_set, dtype, nontemporal):
+    data_arr = np.zeros((16, 16), dtype=dtype)
+    data_arr[3:-3, 3:-3] = 1.0
+    data = ps.fields(f"data: {dtype}[2D]")
+
+    c = [Conditional(data.center() < 1.0, Block([SympyAssignment(data.center(), 2.0)]))]
+
+    assignmets = NodeCollection(c)
+    config = ps.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set,
+                                                       'nontemporal': nontemporal},
+                                   default_number_float=dtype)
+    if 'maskStoreS' not in get_vector_instruction_set(dtype, instruction_set) \
+            and not instruction_set.startswith('sve'):
+        with pytest.warns(UserWarning) as warn:
+            ast = ps.create_kernel(assignmets, config=config)
+            assert 'Could not vectorize loop' in warn[0].message.args[0]
+    else:
+        with pytest.warns(None) as warn:
+            ast = ps.create_kernel(assignmets, config=config)
+            assert len(warn) == 0
+        instruction = 'maskStreamS' if nontemporal and 'maskStreamS' in ast.instruction_set else 'maskStoreS'
+        assert ast.instruction_set[instruction].split('{')[0] in ps.get_code_str(ast)
     print(ps.get_code_str(ast))
     kernel = ast.compile()
     kernel(data=data_arr)
diff --git a/tests/test_vectorization.py b/tests/test_vectorization.py
index 7d487062eceac9a48071a355b1b90d8cb95113b5..729da033b9c6bca97af5466310fb2aff8dd5b43f 100644
--- a/tests/test_vectorization.py
+++ b/tests/test_vectorization.py
@@ -143,7 +143,7 @@ def test_aligned_and_nt_stores(openmp, instruction_set=instruction_set):
     # Without the base pointer spec, the inner store is not aligned
     config = pystencils.config.CreateKernelConfig(target=dh.default_target, cpu_vectorize_info=opt, cpu_openmp=openmp)
     ast = ps.create_kernel(update_rule, config=config)
-    if instruction_set in ['sse'] or instruction_set.startswith('avx'):
+    if instruction_set in ['sse'] or instruction_set.startswith('avx') or instruction_set.startswith('sve'):
         assert 'stream' in ast.instruction_set
         assert 'streamFence' in ast.instruction_set
     if instruction_set in ['neon', 'vsx', 'rvv']:
diff --git a/tests/test_vectorization_specific.py b/tests/test_vectorization_specific.py
index 8a49fa7cdf68f9456a55d3610cb9585001e90ed5..84fec99dd549e2e18411e08bb7f8a88d7561b3a9 100644
--- a/tests/test_vectorization_specific.py
+++ b/tests/test_vectorization_specific.py
@@ -94,6 +94,11 @@ def test_strided(instruction_set, dtype, nontemporal):
         with pytest.warns(None) as warn:
             ast = ps.create_kernel(update_rule, config=config)
             assert len(warn) == 0
+        instruction = 'streamS' if nontemporal and 'streamS' in ast.instruction_set else 'storeS'
+        assert ast.instruction_set[instruction].split('{')[0] in ps.get_code_str(ast)
+    instruction = 'cachelineZero'
+    if instruction in ast.instruction_set:
+        assert ast.instruction_set[instruction] not in ps.get_code_str(ast)
 
     # ps.show_code(ast)
     func = ast.compile()