diff --git a/codegen.py b/codegen.py
index 8f8e248dd75ff8274ae18b9d91b8f55463d439b5..fb7fc6e3edd8b47d722825089fa7c282accbf710 100644
--- a/codegen.py
+++ b/codegen.py
@@ -4,6 +4,7 @@ from itertools import product
 from typing import Dict, Sequence, Tuple, Optional
 
 from pystencils import create_staggered_kernel, Field, create_kernel, Assignment, FieldType
+from pystencils.backends.cbackend import get_headers
 from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets
 from pystencils.stencils import offset_to_direction_string, inverse_direction
 from pystencils_walberla.jinja_filters import add_pystencils_filters_to_jinja_env
@@ -44,6 +45,7 @@ def generate_sweep(generation_context, class_name, assignments,
             'namespace': namespace,
             'class_name': class_name,
             'target': create_kernel_params.get("target", "cpu"),
+            'headers': get_headers(ast),
         }
         header = env.get_template("Sweep.tmpl.h").render(**jinja_context)
         source = env.get_template("Sweep.tmpl.cpp").render(**jinja_context)
@@ -58,6 +60,7 @@ def generate_sweep(generation_context, class_name, assignments,
             'class_name': class_name,
             'target': create_kernel_params.get("target", "cpu"),
             'field': representative_field,
+            'headers': get_headers(ast),
         }
         header = env.get_template("SweepInnerOuter.tmpl.h").render(**jinja_context)
         source = env.get_template("SweepInnerOuter.tmpl.cpp").render(**jinja_context)
@@ -193,6 +196,7 @@ def default_create_kernel_parameters(generation_context, params):
 
     vec = params['cpu_vectorize_info']
     vec['instruction_set'] = vec.get('instruction_set', default_vec_is)
+    vec['assume_inner_stride_one'] = True
     vec['assume_aligned'] = vec.get('assume_aligned', False)
     vec['nontemporal'] = vec.get('nontemporal', False)
     return params
diff --git a/jinja_filters.py b/jinja_filters.py
index b96ba4bd15ed06e864a89cfca64030ec0f8eea36..c6b1f459823a8eda175eabf1fdfc3c4d9affe4c6 100644
--- a/jinja_filters.py
+++ b/jinja_filters.py
@@ -8,7 +8,7 @@ from pystencils.kernelparameters import SHAPE_DTYPE
 from pystencils.sympyextensions import prod
 
 temporary_fieldMemberTemplate = """
-std::set< {type} *, field::SwapableCompare< {type} * > > cache_{original_field_name}_;"""
+private: std::set< {type} *, field::SwapableCompare< {type} * > > cache_{original_field_name}_;"""
 
 temporary_fieldTemplate = """
 // Getting temporary field {tmp_field_name}
@@ -53,6 +53,23 @@ def get_field_fsize(field):
     else:
         return prod(field.index_shape)
 
+
+def get_field_stride(param):
+    field = param.fields[0]
+    type_str = get_base_type(param.symbol.dtype).base_name
+    stride_names = ['xStride()', 'yStride()', 'zStride()', 'fStride()']
+    stride_names = ["%s(%s->%s)" % (type_str, param.field_name, e) for e in stride_names]
+    strides = stride_names[:field.spatial_dimensions]
+    if field.index_dimensions > 0:
+        additional_strides = [1]
+        for shape in reversed(field.index_shape[1:]):
+            additional_strides.append(additional_strides[-1] * shape)
+        assert len(additional_strides) == field.index_dimensions
+        f_stride_name = stride_names[-1]
+        strides.extend(["%s(%d * %s)" % (type_str, e, f_stride_name) for e in reversed(additional_strides)])
+    return strides[param.symbol.coordinate]
+
+
 def generate_declaration(kernel_info):
     """Generates the declaration of the kernel function"""
     ast = kernel_info.ast
@@ -222,9 +239,8 @@ def generate_call(ctx, kernel_info, ghost_layers_to_include=0, cell_interval=Non
                 kernel_call_lines.append("%s %s = %s->dataAt(%s, %s, %s, %s);" %
                                          ((param.symbol.dtype, param.symbol.name, param.field_name) + coordinates))
         elif param.is_field_stride:
+            casted_stride = get_field_stride(param)
             type_str = param.symbol.dtype.base_name
-            stride_names = ('xStride()', 'yStride()', 'zStride()', 'fStride()')
-            casted_stride = "%s(%s->%s)" % (type_str, param.field_name, stride_names[param.symbol.coordinate])
             kernel_call_lines.append("const %s %s = %s;" % (type_str, param.symbol.name, casted_stride))
         elif param.is_field_shape:
             coord = param.symbol.coordinate
@@ -327,6 +343,9 @@ def generate_members(ctx, kernel_info, parameters_to_ignore=(), only_fields=Fals
         field_type = make_field_type(get_base_type(f.dtype), f_size, is_gpu)
         result.append(temporary_fieldMemberTemplate.format(type=field_type, original_field_name=original_field_name))
 
+    if hasattr(kernel_info, 'varying_parameters'):
+        result.extend(["%s %s;" % e for e in kernel_info.varying_parameters])
+
     return "\n".join(result)
 
 
diff --git a/templates/Sweep.tmpl.cpp b/templates/Sweep.tmpl.cpp
index 4dc0b53ace664bf94e380e0578ea38046a8a0cd9..63d686e4a041bb9e8201843dc7d7ad13dc077d5a 100644
--- a/templates/Sweep.tmpl.cpp
+++ b/templates/Sweep.tmpl.cpp
@@ -23,6 +23,9 @@
 #include "core/DataTypes.h"
 #include "core/Macros.h"
 #include "{{class_name}}.h"
+{% for header in headers %}
+#include {{header}}
+{% endfor %}
 
 
 {% if target is equalto 'cpu' -%}
@@ -36,6 +39,7 @@
 #   pragma GCC diagnostic ignored "-Wfloat-equal"
 #   pragma GCC diagnostic ignored "-Wshadow"
 #   pragma GCC diagnostic ignored "-Wconversion"
+#   pragma GCC diagnostic ignored "-Wunused-variable"
 #endif
 
 using namespace std;
@@ -43,15 +47,37 @@ using namespace std;
 namespace walberla {
 namespace {{namespace}} {
 
+
 {{kernel|generate_definition}}
 
-void {{class_name}}::operator() ( IBlock * block )
+
+void {{class_name}}::sweep( IBlock * block )
 {
     {{kernel|generate_block_data_to_field_extraction|indent(4)}}
     {{kernel|generate_call(stream='stream_')|indent(4)}}
     {{kernel|generate_swaps|indent(4)}}
 }
 
+
+void {{class_name}}::sweepOnCellInterval( const shared_ptr<StructuredBlockStorage> & blocks,
+                                          const CellInterval & globalCellInterval,
+                                          cell_idx_t ghostLayers,
+                                          IBlock * block )
+{
+    CellInterval ci = globalCellInterval;  // working copy: clipped to this block and transformed below
+    CellInterval blockBB = blocks->getBlockCellBB( *block);
+    blockBB.expand( ghostLayers );  // grow the block's cell bounding box by ghostLayers
+    ci.intersect( blockBB );  // keep only the part of the requested interval inside the grown bounding box
+    blocks->transformGlobalToBlockLocalCellInterval( ci, *block );
+    if( ci.empty() )
+        return;  // requested interval does not overlap this block: skip the kernel call
+
+    {{kernel|generate_block_data_to_field_extraction|indent(4)}}
+    {{kernel|generate_call(stream='stream_', cell_interval='ci')|indent(4)}}
+    {{kernel|generate_swaps|indent(4)}}
+}
+
+
 } // namespace {{namespace}}
 } // namespace walberla
 
diff --git a/templates/Sweep.tmpl.h b/templates/Sweep.tmpl.h
index af3879c12681ca35b16137913e0bdd0f9c3c116d..325b9c57c6dff002c69a73c92c6c0cf1dacf180d 100644
--- a/templates/Sweep.tmpl.h
+++ b/templates/Sweep.tmpl.h
@@ -17,6 +17,7 @@
 //! \\author pystencils
 //======================================================================================================================
 
+#pragma once
 #include "core/DataTypes.h"
 
 {% if target is equalto 'cpu' -%}
@@ -27,7 +28,7 @@
 #include "field/SwapableCompare.h"
 #include "domain_decomposition/BlockDataID.h"
 #include "domain_decomposition/IBlock.h"
-
+#include "domain_decomposition/StructuredBlockStorage.h"
 #include <set>
 
 #ifdef __GNUC__
@@ -56,9 +57,28 @@ public:
 
     {{ kernel| generate_destructor(class_name) |indent(4) }}
 
-    void operator() ( IBlock * block );
+    void operator()(IBlock * b) { sweep(b); }  // call operator: forwards to the private sweep()
+
+    std::function<void (IBlock*)> getSweep() {  // wraps sweep() in a std::function
+        return [this](IBlock * b) { this->sweep(b); };  // captures 'this': this object must outlive the functor
+    }
+
+    std::function<void (IBlock*)> getSweepOnCellInterval(const shared_ptr<StructuredBlockStorage> & blocks,
+                                                         const CellInterval & globalCellInterval,
+                                                         cell_idx_t ghostLayers=1 )
+    {  // wraps sweepOnCellInterval() in a std::function with the given arguments bound
+        return [this, blocks, globalCellInterval, ghostLayers] (IBlock * b) {  // arguments are captured by value
+            this->sweepOnCellInterval(blocks, globalCellInterval, ghostLayers, b);  // 'this' must still be alive
+        };
+    }
+
+    {{ kernel|generate_members|indent(4) }}
+
 private:
-    {{kernel|generate_members|indent(4)}}
+    void sweep( IBlock * block );
+    void sweepOnCellInterval(const shared_ptr<StructuredBlockStorage> & blocks,
+                             const CellInterval & globalCellInterval, cell_idx_t ghostLayers, IBlock * block );
+
     {%if target is equalto 'gpu'%}
     cudaStream_t stream_;
     {% endif %}