diff --git a/templates/SweepInnerOuter.tmpl.cpp b/templates/SweepInnerOuter.tmpl.cpp index 4f12d3629a25e407630bf8a533423af472d90ac1..73f725420a28eed876950acec1cae472a40279bc 100644 --- a/templates/SweepInnerOuter.tmpl.cpp +++ b/templates/SweepInnerOuter.tmpl.cpp @@ -50,32 +50,29 @@ namespace {{namespace}} { {% endfor %} -void {{class_name}}::operator() ( IBlock * block ) +void {{class_name}}::operator() ( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream{% endif %} ) { {{kernel|generate_block_data_to_field_extraction|indent(4)}} - {{kernel|generate_call(stream='stream_')|indent(4)}} + {{kernel|generate_call(stream='stream')|indent(4)}} {{kernel|generate_swaps|indent(4)}} } -void {{class_name}}::inner( IBlock * block ) +void {{class_name}}::inner( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream{% endif %} ) { {{kernel|generate_block_data_to_field_extraction|indent(4)}} CellInterval inner = {{field}}->xyzSize(); inner.expand(-1); - {{kernel|generate_call(stream='stream_', cell_interval='inner')|indent(4)}} + {{kernel|generate_call(stream='stream', cell_interval='inner')|indent(4)}} } -void {{class_name}}::outer( IBlock * block ) +void {{class_name}}::outer( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream {% endif %} ) { static std::vector<CellInterval> layers; - {%if target is equalto 'gpu'%} - static std::vector<cudaStream_t> streams; - {% endif %} {{kernel|generate_block_data_to_field_extraction|indent(4)}} @@ -101,25 +98,30 @@ void {{class_name}}::outer( IBlock * block ) {{field}}->getSliceBeforeGhostLayer(stencil::W, ci, 1, false); ci.expand(Cell(0, -1, -1)); layers.push_back(ci); - - {%if target is equalto 'gpu'%} - for( int i=0; i < layers.size(); ++i ) - { - streams.push_back(cudaStream_t()); - WALBERLA_CUDA_CHECK( cudaStreamCreateWithPriority(&streams.back(), cudaStreamDefault, -1) ); - } - {% endif %} } - { {{outer_kernels['W']|generate_call(stream='streams[5]', cell_interval="layers[5]")|indent(4)}} } - { {{outer_kernels['E']|generate_call(stream='streams[4]', cell_interval="layers[4]")|indent(4)}} } - { {{outer_kernels['S']|generate_call(stream='streams[3]', cell_interval="layers[3]")|indent(4)}} } - { {{outer_kernels['N']|generate_call(stream='streams[2]', cell_interval="layers[2]")|indent(4)}} } - { {{outer_kernels['B']|generate_call(stream='streams[1]', cell_interval="layers[1]")|indent(4)}} } - { {{outer_kernels['T']|generate_call(stream='streams[0]', cell_interval="layers[0]")|indent(4)}} } + {%if target is equalto 'gpu'%} + auto s = parallelStreams_.parallelSection( stream ); + {% endif %} + + {% for dir, ci in [('W', "layers[5]"), + ('E', 'layers[4]'), + ('S', 'layers[3]'), + ('N', 'layers[2]'), + ('B', 'layers[1]'), + ('T', 'layers[0]') ] %} - for(int i=0; i < layers.size(); ++i ) - WALBERLA_CUDA_CHECK( cudaStreamSynchronize(streams[i]) ); + {%if target is equalto 'gpu'%} + { + {{outer_kernels[dir]|generate_call(stream='s.stream()', cell_interval=ci)|indent(8)}} + s.next(); + } + {% else %} + { + {{outer_kernels[dir]|generate_call(cell_interval=ci)|indent(8)}} + } + {% endif %} + {% endfor %} {{kernel|generate_swaps|indent(4)}} } diff --git a/templates/SweepInnerOuter.tmpl.h b/templates/SweepInnerOuter.tmpl.h index bb687f9a4cbd20ea8003e73f7ee9be03ce085e52..2bdde0a994961ab5ebf14417a821ba113a8c0078 100644 --- a/templates/SweepInnerOuter.tmpl.h +++ b/templates/SweepInnerOuter.tmpl.h @@ -23,6 +23,7 @@ #include "field/GhostLayerField.h" {%- elif target is equalto 'gpu' -%} #include "cuda/GPUField.h" +#include "cuda/ParallelStreams.h" {%- endif %} #include "field/SwapableCompare.h" #include "domain_decomposition/BlockDataID.h" @@ -50,19 +51,25 @@ namespace {{namespace}} { class {{class_name}} { public: - {{class_name}}( {{kernel|generate_constructor_parameters}}{%if target is equalto 'gpu'%} , cudaStream_t stream = 0{% endif %}) - : {{ kernel|generate_constructor_initializer_list }}, stream_(stream) + {{class_name}}( {{kernel|generate_constructor_parameters}}) + : {{ kernel|generate_constructor_initializer_list }} {}; - void operator() ( IBlock * block ); + void operator() ( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream = 0{% endif %} ); - void inner( IBlock * block ); - void outer( IBlock * block ); + void inner( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream = 0{% endif %} ); + void outer( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream = 0{% endif %} ); + void setOuterPriority(int priority ) { + {%if target is equalto 'gpu'%} + parallelStreams_.setStreamPriority(priority); + {%endif%} + } private: {{kernel|generate_members|indent(4)}} + {%if target is equalto 'gpu'%} - cudaStream_t stream_; + cuda::ParallelStreams parallelStreams_; {% endif %} };