diff --git a/sweep.py b/sweep.py
index 87354a2184b74472b9a62ebb9b61de41fc294cc8..5b0e2aa252e579f316fd862cfdd27818b49e5bb1 100644
--- a/sweep.py
+++ b/sweep.py
@@ -61,14 +61,54 @@ class Sweep:
     def generate_from_equations(name, function_returning_assignments, temporary_fields=[], field_swaps=[],
                                 namespace="pystencils", target='cpu', optimization={},
                                 staggered=False, varying_parameters=[], **kwargs):
-        from pystencils_walberla.cmake_integration import codegen
+
         cb = functools.partial(Sweep._generate_header_and_source, function_returning_assignments, name, target,
                                namespace, temporary_fields, field_swaps,
                                optimization=optimization, staggered=staggered,
                                varying_parameters=varying_parameters, **kwargs)
+
         file_names = [name + ".h", name + ('.cpp' if target == 'cpu' else '.cu')]
+        from pystencils_walberla.cmake_integration import codegen
         codegen.register(file_names, cb)
 
+    @staticmethod
+    def generate_inner_outer_kernel(name, function_returning_assignments, temporary_fields=[], field_swaps=[],
+                                    namespace="pystencils", target='cpu', optimization={}, outer_optimization={},
+                                    varying_parameters=[], **kwargs):
+        def generate_callback():
+            eqs = function_returning_assignments(**kwargs)
+
+            ast = create_kernel(eqs, target=target, **optimization)
+            ast.function_name = name
+
+            outer_kernels = {}
+            for dir_str in ('T', 'B', 'N', 'S', 'E', 'W'):
+                outer_kernel = create_kernel(eqs, target=target, **outer_optimization)
+                outer_kernel.function_name = name + "_" + dir_str
+                outer_kernels[dir_str] = KernelInfo(outer_kernel, temporary_fields, field_swaps, varying_parameters)
+
+            env = Environment(loader=PackageLoader('pystencils_walberla'))
+            add_pystencils_filters_to_jinja_env(env)
+
+            representative_field = {p.field_name for p in ast.parameters if p.is_field_argument}.pop()
+
+            context = {
+                'kernel': KernelInfo(ast, temporary_fields, field_swaps, varying_parameters),
+                'namespace': namespace,
+                'class_name': ast.function_name[0].upper() + ast.function_name[1:],
+                'target': target,
+                'field': representative_field,
+                'outer_kernels': outer_kernels,
+            }
+
+            header = env.get_template("SweepInnerOuter.tmpl.h").render(**context)
+            source = env.get_template("SweepInnerOuter.tmpl.cpp").render(**context)
+            return header, source
+
+        file_names = [name + ".h", name + ('.cpp' if target == 'cpu' else '.cu')]
+        from pystencils_walberla.cmake_integration import codegen
+        codegen.register(file_names, generate_callback)
+
     @staticmethod
     def generate_pack_info(name, function_returning_assignments, target='gpu', **kwargs):
         from pystencils_walberla.cmake_integration import codegen
@@ -81,11 +121,12 @@ class Sweep:
         file_names = [name + ".h", name + ('.cpp' if target == 'cpu' else '.cu')]
         codegen.register(file_names, callback)
 
+
     @staticmethod
-    def _generate_header_and_source(function_returning_equations, name, target, namespace,
+    def _generate_header_and_source(function_returning_assignments, name, target, namespace,
                                     temporary_fields, field_swaps, optimization, staggered,
                                     varying_parameters, **kwargs):
-        eqs = function_returning_equations(**kwargs)
+        eqs = function_returning_assignments(**kwargs)
 
         if not staggered:
             ast = create_kernel(eqs, target=target, **optimization)
@@ -96,16 +137,14 @@ class Sweep:
         env = Environment(loader=PackageLoader('pystencils_walberla'))
         add_pystencils_filters_to_jinja_env(env)
 
-        representative_field = {p.field_name for p in ast.parameters if p.is_field_argument}.pop()
-
         context = {
             'kernel': KernelInfo(ast, temporary_fields, field_swaps, varying_parameters),
             'namespace': namespace,
             'class_name': ast.function_name[0].upper() + ast.function_name[1:],
             'target': target,
-            'field': representative_field,
         }
 
         header = env.get_template("Sweep.tmpl.h").render(**context)
         source = env.get_template("Sweep.tmpl.cpp").render(**context)
         return header, source
+
diff --git a/templates/Sweep.tmpl.cpp b/templates/Sweep.tmpl.cpp
index ddb43db7a81d32f51c736d18a96dcf8b989be23d..4dc0b53ace664bf94e380e0578ea38046a8a0cd9 100644
--- a/templates/Sweep.tmpl.cpp
+++ b/templates/Sweep.tmpl.cpp
@@ -45,7 +45,6 @@ namespace {{namespace}} {
 
 {{kernel|generate_definition}}
 
-
 void {{class_name}}::operator() ( IBlock * block )
 {
     {{kernel|generate_block_data_to_field_extraction|indent(4)}}
@@ -53,70 +52,6 @@ void {{class_name}}::operator() ( IBlock * block )
     {{kernel|generate_swaps|indent(4)}}
 }
 
-
-
-void {{class_name}}::inner( IBlock * block )
-{
-    {{kernel|generate_block_data_to_field_extraction|indent(4)}}
-
-    CellInterval inner = {{field}}->xyzSize();
-    inner.expand(-1);
-
-    {{kernel|generate_call(stream='stream_', cell_interval='inner')|indent(4)}}
-}
-
-
-void {{class_name}}::outer( IBlock * block )
-{
-    static std::vector<CellInterval> layers;
-    {%if target is equalto 'gpu'%}
-    static std::vector<cudaStream_t> streams;
-    {% endif %}
-
-    {{kernel|generate_block_data_to_field_extraction|indent(4)}}
-
-    if( layers.size() == 0 )
-    {
-        CellInterval ci;
-
-        {{field}}->getSliceBeforeGhostLayer(stencil::T, ci, 1, false);
-        layers.push_back(ci);
-        {{field}}->getSliceBeforeGhostLayer(stencil::B, ci, 1, false);
-        layers.push_back(ci);
-
-        {{field}}->getSliceBeforeGhostLayer(stencil::N, ci, 1, false);
-        ci.expand(Cell(0, 0, -1));
-        layers.push_back(ci);
-        {{field}}->getSliceBeforeGhostLayer(stencil::S, ci, 1, false);
-        ci.expand(Cell(0, 0, -1));
-        layers.push_back(ci);
-
-        {{field}}->getSliceBeforeGhostLayer(stencil::E, ci, 1, false);
-        ci.expand(Cell(0, -1, -1));
-        layers.push_back(ci);
-        {{field}}->getSliceBeforeGhostLayer(stencil::W, ci, 1, false);
-        ci.expand(Cell(0, -1, -1));
-        layers.push_back(ci);
-
-        {%if target is equalto 'gpu'%}
-        for( int i=0; i < layers.size(); ++i )
-        {
-            streams.push_back(cudaStream_t());
-            WALBERLA_CUDA_CHECK( cudaStreamCreate(&streams.back() ) );
-        }
-        {% endif %}
-    }
-
-    for( int i=0; i < layers.size(); ++i )
-    {
-        {{kernel|generate_call(stream='streams[i]', cell_interval="layers[i]")|indent(8)}}
-        WALBERLA_CUDA_CHECK(cudaStreamSynchronize(streams[i])); // TODO move out when no memcpy is needed to setup call
-    }
-
-    {{kernel|generate_swaps|indent(4)}}
-}
-
-
 } // namespace {{namespace}}
 } // namespace walberla
 
diff --git a/templates/Sweep.tmpl.h b/templates/Sweep.tmpl.h
index bb687f9a4cbd20ea8003e73f7ee9be03ce085e52..8f976215a36075fd9e16bb54019f6a0fc642a8cf 100644
--- a/templates/Sweep.tmpl.h
+++ b/templates/Sweep.tmpl.h
@@ -55,10 +55,6 @@ public:
     {};
 
     void operator() ( IBlock * block );
-
-    void inner( IBlock * block );
-    void outer( IBlock * block );
-
 private:
     {{kernel|generate_members|indent(4)}}
     {%if target is equalto 'gpu'%}
diff --git a/templates/SweepInnerOuter.tmpl.cpp b/templates/SweepInnerOuter.tmpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4f12d3629a25e407630bf8a533423af472d90ac1
--- /dev/null
+++ b/templates/SweepInnerOuter.tmpl.cpp
@@ -0,0 +1,134 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \\file {{className}}.cpp
+//! \\ingroup lbm
+//! \\author lbmpy
+//======================================================================================================================
+
+#include <cmath>
+
+#include "core/DataTypes.h"
+#include "core/Macros.h"
+#include "{{class_name}}.h"
+
+
+{% if target is equalto 'cpu' -%}
+#define FUNC_PREFIX
+{%- elif target is equalto 'gpu' -%}
+#define FUNC_PREFIX __global__
+{%- endif %}
+
+#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG )
+#   pragma GCC diagnostic push
+#   pragma GCC diagnostic ignored "-Wfloat-equal"
+#   pragma GCC diagnostic ignored "-Wshadow"
+#   pragma GCC diagnostic ignored "-Wconversion"
+#endif
+
+using namespace std;
+
+namespace walberla {
+namespace {{namespace}} {
+
+{{kernel|generate_definition}}
+
+{% for outer_kernel in outer_kernels.values() %}
+{{outer_kernel|generate_definition}}
+{% endfor %}
+
+
+void {{class_name}}::operator() ( IBlock * block )
+{
+    {{kernel|generate_block_data_to_field_extraction|indent(4)}}
+    {{kernel|generate_call(stream='stream_')|indent(4)}}
+    {{kernel|generate_swaps|indent(4)}}
+}
+
+
+
+void {{class_name}}::inner( IBlock * block )
+{
+    {{kernel|generate_block_data_to_field_extraction|indent(4)}}
+
+    CellInterval inner = {{field}}->xyzSize();
+    inner.expand(-1);
+
+    {{kernel|generate_call(stream='stream_', cell_interval='inner')|indent(4)}}
+}
+
+
+void {{class_name}}::outer( IBlock * block )
+{
+    static std::vector<CellInterval> layers;
+    {%if target is equalto 'gpu'%}
+    static std::vector<cudaStream_t> streams;
+    {% endif %}
+
+    {{kernel|generate_block_data_to_field_extraction|indent(4)}}
+
+    if( layers.size() == 0 )
+    {
+        CellInterval ci;
+
+        {{field}}->getSliceBeforeGhostLayer(stencil::T, ci, 1, false);
+        layers.push_back(ci);
+        {{field}}->getSliceBeforeGhostLayer(stencil::B, ci, 1, false);
+        layers.push_back(ci);
+
+        {{field}}->getSliceBeforeGhostLayer(stencil::N, ci, 1, false);
+        ci.expand(Cell(0, 0, -1));
+        layers.push_back(ci);
+        {{field}}->getSliceBeforeGhostLayer(stencil::S, ci, 1, false);
+        ci.expand(Cell(0, 0, -1));
+        layers.push_back(ci);
+
+        {{field}}->getSliceBeforeGhostLayer(stencil::E, ci, 1, false);
+        ci.expand(Cell(0, -1, -1));
+        layers.push_back(ci);
+        {{field}}->getSliceBeforeGhostLayer(stencil::W, ci, 1, false);
+        ci.expand(Cell(0, -1, -1));
+        layers.push_back(ci);
+
+        {%if target is equalto 'gpu'%}
+        for( int i=0; i < layers.size(); ++i )
+        {
+            streams.push_back(cudaStream_t());
+            WALBERLA_CUDA_CHECK( cudaStreamCreateWithPriority(&streams.back(), cudaStreamDefault, -1) );
+        }
+        {% endif %}
+    }
+
+    { {{outer_kernels['W']|generate_call(stream='streams[5]', cell_interval="layers[5]")|indent(4)}} }
+    { {{outer_kernels['E']|generate_call(stream='streams[4]', cell_interval="layers[4]")|indent(4)}} }
+    { {{outer_kernels['S']|generate_call(stream='streams[3]', cell_interval="layers[3]")|indent(4)}} }
+    { {{outer_kernels['N']|generate_call(stream='streams[2]', cell_interval="layers[2]")|indent(4)}} }
+    { {{outer_kernels['B']|generate_call(stream='streams[1]', cell_interval="layers[1]")|indent(4)}} }
+    { {{outer_kernels['T']|generate_call(stream='streams[0]', cell_interval="layers[0]")|indent(4)}} }
+
+    for(int i=0; i < layers.size(); ++i )
+        WALBERLA_CUDA_CHECK( cudaStreamSynchronize(streams[i]) );
+
+    {{kernel|generate_swaps|indent(4)}}
+}
+
+
+} // namespace {{namespace}}
+} // namespace walberla
+
+
+#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG )
+#   pragma GCC diagnostic pop
+#endif
diff --git a/templates/SweepInnerOuter.tmpl.h b/templates/SweepInnerOuter.tmpl.h
new file mode 100644
index 0000000000000000000000000000000000000000..bb687f9a4cbd20ea8003e73f7ee9be03ce085e52
--- /dev/null
+++ b/templates/SweepInnerOuter.tmpl.h
@@ -0,0 +1,76 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \\file {{class_name}}.h
+//! \\author pystencils
+//======================================================================================================================
+
+#include "core/DataTypes.h"
+
+{% if target is equalto 'cpu' -%}
+#include "field/GhostLayerField.h"
+{%- elif target is equalto 'gpu' -%}
+#include "cuda/GPUField.h"
+{%- endif %}
+#include "field/SwapableCompare.h"
+#include "domain_decomposition/BlockDataID.h"
+#include "domain_decomposition/IBlock.h"
+
+#include <set>
+
+#ifdef __GNUC__
+#define RESTRICT __restrict__
+#elif _MSC_VER
+#define RESTRICT __restrict
+#else
+#define RESTRICT
+#endif
+
+#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG )
+#   pragma GCC diagnostic push
+#   pragma GCC diagnostic ignored "-Wunused-parameter"
+#endif
+
+namespace walberla {
+namespace {{namespace}} {
+
+
+class {{class_name}}
+{
+public:
+    {{class_name}}( {{kernel|generate_constructor_parameters}}{%if target is equalto 'gpu'%} , cudaStream_t stream = 0{% endif %})
+        : {{ kernel|generate_constructor_initializer_list }}, stream_(stream)
+    {};
+
+    void operator() ( IBlock * block );
+
+    void inner( IBlock * block );
+    void outer( IBlock * block );
+
+private:
+    {{kernel|generate_members|indent(4)}}
+    {%if target is equalto 'gpu'%}
+    cudaStream_t stream_;
+    {% endif %}
+};
+
+
+} // namespace {{namespace}}
+} // namespace walberla
+
+
+#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG )
+#   pragma GCC diagnostic pop
+#endif