diff --git a/templates/SweepInnerOuter.tmpl.cpp b/templates/SweepInnerOuter.tmpl.cpp
index 4f12d3629a25e407630bf8a533423af472d90ac1..73f725420a28eed876950acec1cae472a40279bc 100644
--- a/templates/SweepInnerOuter.tmpl.cpp
+++ b/templates/SweepInnerOuter.tmpl.cpp
@@ -50,32 +50,29 @@ namespace {{namespace}} {
 {% endfor %}
 
 
-void {{class_name}}::operator() ( IBlock * block )
+void {{class_name}}::operator() ( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream{% endif %} )
 {
     {{kernel|generate_block_data_to_field_extraction|indent(4)}}
-    {{kernel|generate_call(stream='stream_')|indent(4)}}
+    {{kernel|generate_call(stream='stream')|indent(4)}}
     {{kernel|generate_swaps|indent(4)}}
 }
 
 
 
-void {{class_name}}::inner( IBlock * block )
+void {{class_name}}::inner( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream{% endif %} )
 {
     {{kernel|generate_block_data_to_field_extraction|indent(4)}}
 
     CellInterval inner = {{field}}->xyzSize();
     inner.expand(-1);
 
-    {{kernel|generate_call(stream='stream_', cell_interval='inner')|indent(4)}}
+    {{kernel|generate_call(stream='stream', cell_interval='inner')|indent(4)}}
 }
 
 
-void {{class_name}}::outer( IBlock * block )
+void {{class_name}}::outer( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream {% endif %} )
 {
     static std::vector<CellInterval> layers;
-    {%if target is equalto 'gpu'%}
-    static std::vector<cudaStream_t> streams;
-    {% endif %}
 
     {{kernel|generate_block_data_to_field_extraction|indent(4)}}
 
@@ -101,25 +98,30 @@ void {{class_name}}::outer( IBlock * block )
         {{field}}->getSliceBeforeGhostLayer(stencil::W, ci, 1, false);
         ci.expand(Cell(0, -1, -1));
         layers.push_back(ci);
-
-        {%if target is equalto 'gpu'%}
-        for( int i=0; i < layers.size(); ++i )
-        {
-            streams.push_back(cudaStream_t());
-            WALBERLA_CUDA_CHECK( cudaStreamCreateWithPriority(&streams.back(), cudaStreamDefault, -1) );
-        }
-        {% endif %}
     }
 
-    { {{outer_kernels['W']|generate_call(stream='streams[5]', cell_interval="layers[5]")|indent(4)}} }
-    { {{outer_kernels['E']|generate_call(stream='streams[4]', cell_interval="layers[4]")|indent(4)}} }
-    { {{outer_kernels['S']|generate_call(stream='streams[3]', cell_interval="layers[3]")|indent(4)}} }
-    { {{outer_kernels['N']|generate_call(stream='streams[2]', cell_interval="layers[2]")|indent(4)}} }
-    { {{outer_kernels['B']|generate_call(stream='streams[1]', cell_interval="layers[1]")|indent(4)}} }
-    { {{outer_kernels['T']|generate_call(stream='streams[0]', cell_interval="layers[0]")|indent(4)}} }
+    {%if target is equalto 'gpu'%}
+    auto s = parallelStreams_.parallelSection( stream );
+    {% endif %}
+
+    {% for dir, ci in [('W', "layers[5]"),
+                       ('E', 'layers[4]'),
+                       ('S', 'layers[3]'),
+                       ('N', 'layers[2]'),
+                       ('B', 'layers[1]'),
+                       ('T', 'layers[0]') ] %}
 
-    for(int i=0; i < layers.size(); ++i )
-        WALBERLA_CUDA_CHECK( cudaStreamSynchronize(streams[i]) );
+    {%if target is equalto 'gpu'%}
+    {
+        {{outer_kernels[dir]|generate_call(stream='s.stream()', cell_interval=ci)|indent(8)}}
+        s.next();
+    }
+    {% else %}
+    {
+        {{outer_kernels[dir]|generate_call(cell_interval=ci)|indent(8)}}
+    }
+    {% endif %}
+    {% endfor %}
 
     {{kernel|generate_swaps|indent(4)}}
 }
diff --git a/templates/SweepInnerOuter.tmpl.h b/templates/SweepInnerOuter.tmpl.h
index bb687f9a4cbd20ea8003e73f7ee9be03ce085e52..2bdde0a994961ab5ebf14417a821ba113a8c0078 100644
--- a/templates/SweepInnerOuter.tmpl.h
+++ b/templates/SweepInnerOuter.tmpl.h
@@ -23,6 +23,7 @@
 #include "field/GhostLayerField.h"
 {%- elif target is equalto 'gpu' -%}
 #include "cuda/GPUField.h"
+#include "cuda/ParallelStreams.h"
 {%- endif %}
 #include "field/SwapableCompare.h"
 #include "domain_decomposition/BlockDataID.h"
@@ -50,19 +51,25 @@ namespace {{namespace}} {
 class {{class_name}}
 {
 public:
-    {{class_name}}( {{kernel|generate_constructor_parameters}}{%if target is equalto 'gpu'%} , cudaStream_t stream = 0{% endif %})
-        : {{ kernel|generate_constructor_initializer_list }}, stream_(stream)
+    {{class_name}}( {{kernel|generate_constructor_parameters}})
+        : {{ kernel|generate_constructor_initializer_list }}
     {};
 
-    void operator() ( IBlock * block );
+    void operator() ( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream = 0{% endif %} );
 
-    void inner( IBlock * block );
-    void outer( IBlock * block );
+    void inner( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream = 0{% endif %} );
+    void outer( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream = 0{% endif %} );
 
+    void setOuterPriority(int priority ) {
+        {%if target is equalto 'gpu'%}
+        parallelStreams_.setStreamPriority(priority);
+        {%endif%}
+    }
 private:
     {{kernel|generate_members|indent(4)}}
+
     {%if target is equalto 'gpu'%}
-    cudaStream_t stream_;
+    cuda::ParallelStreams parallelStreams_;
     {% endif %}
 };