Commit 0523675f authored by Martin Bauer

UniformGridGenerated: padding after f

parent 96146de7
Pipeline #20381 failed
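The change: the PDF field allocator can now insert a configurable number of unused elements ("fPadding") after every f-slice of the fzyx-layout field, presumably to shift the start addresses of the 19 PDF components relative to each other and avoid cache-associativity conflicts. A minimal standalone sketch of the stride arithmetic the new allocator implements (plain C++, not waLBerla code; sizes taken from the config below):

#include <cstdint>
#include <iostream>

int main()
{
    // fzyx layout: index(f, z, y, x) = f*fStride + z*zStride + y*yStride + x*xStride
    const std::int64_t xAlloc   = 300, yAlloc = 64, zAlloc = 64; // allocated cells per f-slice
    const std::int64_t fSize    = 19;                            // D3Q19 PDF components
    const std::int64_t fPadding = 3;                             // the new config parameter

    const std::int64_t xStride = 1;
    const std::int64_t yStride = xAlloc;
    const std::int64_t zStride = xAlloc * yAlloc;
    const std::int64_t fStride = xAlloc * yAlloc * zAlloc + fPadding; // padding after each f-slice

    // Total allocation grows by only fSize * fPadding elements, but every
    // f-slice now starts at a shifted offset.
    std::cout << "elements without padding: " << fSize * xAlloc * yAlloc * zAlloc << '\n'
              << "elements with padding:    " << fSize * fStride << '\n';
    // Example: linear index of element (f=5, z=1, y=2, x=3)
    std::cout << "index: " << 5 * fStride + 1 * zStride + 2 * yStride + 3 * xStride << '\n';
    return 0;
}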
 DomainSetup
 {
     blocks        < 1, 1, 1 >;
-    cellsPerBlock < 64, 64, 64 >;
+    cellsPerBlock < 300, 64, 64 >;
     periodic      < 1, 1, 1 >;
 }
 Parameters
 {
-    timeStepMode twoField;
+    timeStepMode aa;
     // twoField: normal src-dst update with two fields [default]
     // twoFieldKernelOnly: same as above but without communication and periodicity
     // aa: AA single-field update pattern
@@ -22,11 +22,13 @@ Parameters
     // manualD3Q19: manual D3Q19
-    timesteps 200;   // time steps of one performance measurement, default 60
+    timesteps 2000;  // time steps of one performance measurement, default 60
     warmupSteps 1;   // number of steps to run before measurement starts
-    outerIterations 4;     // how many measurements to conduct
-    vtkWriteFrequency 0;   // write a VTK file every n'th step; if zero, VTK output is disabled
+    outerIterations 1;     // how many measurements to conduct
+    vtkWriteFrequency 100; // write a VTK file every n'th step; if zero, VTK output is disabled
     remainingTimeLoggerFrequency 6; // interval in seconds to log the estimated remaining time
+    fPadding 3;
     useGui 0;
@@ -43,31 +43,91 @@ using PdfField_T = GhostLayerField< real_t, Stencil_T::Q >;
 using VelocityField_T = GhostLayerField< real_t, 3 >;

-void pinOpenMP(const char * pinString)
+template< typename T >
+class OuterPaddingFieldAllocator : public field::FieldAllocator< T >
+{
+public:
+    OuterPaddingFieldAllocator( uint_t paddingElements )
+        : padding_( paddingElements )
+    {}
+
+    virtual T * allocate( const field::Layout & layout,
+                          uint_t xSize, uint_t ySize, uint_t zSize, uint_t fSize,
+                          uint_t & xAllocSize, uint_t & yAllocSize, uint_t & zAllocSize, uint_t & fAllocSize,
+                          cell_idx_t & xStride, cell_idx_t & yStride, cell_idx_t & zStride, cell_idx_t & fStride )
+    {
+        T * ptr;
+        if( layout == field::fzyx )
+        {
+            ptr = field::FieldAllocator< T >::allocateField( fSize, zSize, ySize, xSize,
+                                                             fAllocSize, zAllocSize, yAllocSize, xAllocSize );
+            WALBERLA_CHECK_LESS_EQUAL( fSize * xAllocSize * yAllocSize * zAllocSize + xSize + ySize * xAllocSize + zSize * xAllocSize * yAllocSize,
+                                       std::numeric_limits< cell_idx_t >::max(),
+                                       "The data type 'cell_idx_t' is too small for your field size! Your field is too large.\nYou may have to set 'cell_idx_t' to an 'int64_t'." );
+            // Padding after each f-slice: successive f-components start padding_ elements apart.
+            fStride = cell_idx_c( xAllocSize * yAllocSize * zAllocSize + padding_ );
+            zStride = cell_idx_c( xAllocSize * yAllocSize );
+            yStride = cell_idx_c( xAllocSize );
+            xStride = 1;
+        }
+        else
+        {
+            WALBERLA_ABORT( "OuterPaddingFieldAllocator works only for fzyx layout" );
+        }
+        return ptr;
+    }
+
+    virtual T * allocateMemory( uint_t size0, uint_t size1, uint_t size2, uint_t size3,
+                                uint_t & allocSize0, uint_t & allocSize1, uint_t & allocSize2, uint_t & allocSize3 )
+    {
+        allocSize0 = size0;
+        allocSize1 = size1;
+        allocSize2 = size2;
+        allocSize3 = size3;
+        // One extra run of padding_ elements per f-slice (size0 == fSize for fzyx fields).
+        return new T[ allocSize0 * allocSize1 * allocSize2 * allocSize3 + padding_ * size0 ];
+    }
+
+    virtual T * allocateMemory( uint_t size )
+    {
+        return new T[ size ];
+    }
+
+    virtual void deallocate( T *& values )
+    {
+        delete[] values;
+        values = 0;
+    }
+
+private:
+    uint_t padding_;
+};
+
+void pinOpenMP( const char *pinString )
 {
 #ifdef WALBERLA_BUILD_WITH_OPENMP
-   if (pinString != NULL) {
-      #pragma omp parallel
+   if ( pinString != NULL )
+   {
+      #pragma omp parallel
       {
          int threadId = omp_get_thread_num();
         int err;
-         err = PinCurrentThreadByCpuList(pinString, threadId);
-         if (err) {
-            WALBERLA_ABORT("Pinning of " << threadId << "failed");
+         err = PinCurrentThreadByCpuList( pinString, threadId );
+         if ( err )
+         {
+            WALBERLA_ABORT( "Pinning of " << threadId << " failed" );
          }
-         const char * cpuList = PinCpuListAsString();
-         WALBERLA_ASSERT(cpuList != NULL);
+         const char *cpuList = PinCpuListAsString();
+         WALBERLA_ASSERT( cpuList != NULL );
         // Not-so-nice hack to print the thread ids in order.
-         #pragma omp for ordered
-         for (int i = 0; i < omp_get_num_threads(); ++i) {
-            #pragma omp ordered
-            WALBERLA_LOG_INFO("Thread " << threadId << " pinned to core(s) " << cpuList);
+         #pragma omp for ordered
+         for ( int i = 0; i < omp_get_num_threads(); ++i )
+         {
+            #pragma omp ordered
+            WALBERLA_LOG_INFO( "Thread " << threadId << " pinned to core(s) " << cpuList );
          }
-         free((void *)cpuList);
+         free( (void *) cpuList );
       }
    }
 #endif
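For orientation, a hedged usage sketch of the new allocator, mirroring the pdfFieldAdder lambda added to main() below. The PdfField_T constructor call is copied from this diff; the fStride() accessor is assumed from waLBerla's field interface:

// Sketch only -- assumes waLBerla headers and the allocator above are in scope.
auto allocator = make_shared< OuterPaddingFieldAllocator< real_t > >( 3 ); // fPadding = 3
auto pdfs = new PdfField_T( 300, 64, 64,   // cells per block, as in the config
                            uint_t( 1 ),   // ghost layers
                            field::fzyx, allocator );
// Successive PDF components are now separated by 3 extra real_t elements, i.e.
// pdfs->fStride() == xAllocSize * yAllocSize * zAllocSize + 3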
@@ -75,194 +135,219 @@ void pinOpenMP(const char * pinString)
 int main( int argc, char **argv )
 {
-   mpi::Environment env( argc, argv );
-   for( auto cfg = python_coupling::configBegin( argc, argv ); cfg != python_coupling::configEnd(); ++cfg )
-   {
-      WALBERLA_MPI_WORLD_BARRIER();
-      auto config = *cfg;
-      logging::configureLogging( config );
-      auto blocks = blockforest::createUniformBlockGridFromConfig( config );
-      Vector3<uint_t> cellsPerBlock = config->getBlock( "DomainSetup" ).getParameter<Vector3<uint_t> >( "cellsPerBlock" );
-      // Reading parameters
-      auto parameters = config->getOneBlock( "Parameters" );
-      const std::string timeStepMode = parameters.getParameter<std::string>( "timeStepMode", "twoField");
-      const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 ));
-      uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 60 ));
-      const real_t shearVelocityMagnitude = parameters.getParameter<real_t>("shearVelocityMagnitude", 0.02);
-      const bool directComm = parameters.getParameter<bool>("directComm", false);
-      const std::string pinning = parameters.getParameter<std::string>("pinning", "");
-      if( !pinning.empty() )
-         pinOpenMP(pinning.c_str());
-      auto pdfFieldAdder = [](IBlock* const block, StructuredBlockStorage * const storage) {
-         return new PdfField_T(storage->getNumberOfXCells(*block),
-                               storage->getNumberOfYCells(*block),
-                               storage->getNumberOfZCells(*block),
-                               uint_t(1),
-                               field::fzyx,
-                               make_shared<field::AllocateAligned<real_t, 64>>());
-      };
-      // Creating fields
-      BlockDataID pdfFieldId = blocks->addStructuredBlockData<PdfField_T>(pdfFieldAdder, "pdfs");
-      BlockDataID velFieldId = field::addToStorage< VelocityField_T >( blocks, "vel", real_t( 0 ), field::fzyx );
-      pystencils::GenMacroSetter setterKernel(pdfFieldId, velFieldId);
-      pystencils::GenMacroGetter getterKernel(pdfFieldId, velFieldId);
-      if( shearVelocityMagnitude > 0 )
-         initShearVelocity(blocks, velFieldId, shearVelocityMagnitude);
-      for( auto & b : *blocks)
-         setterKernel(&b);
-      // Buffered Comm
-      blockforest::communication::UniformBufferedScheme< Stencil_T > twoFieldComm(blocks );
-      twoFieldComm.addPackInfo(make_shared< pystencils::GenPackInfo >(pdfFieldId ) );
-      blockforest::communication::UniformBufferedScheme< Stencil_T > aaPullComm(blocks);
-      aaPullComm.addPackInfo(make_shared< pystencils::GenPackInfoAAPull>(pdfFieldId));
-      blockforest::communication::UniformBufferedScheme< Stencil_T > aaPushComm(blocks);
-      aaPushComm.addPackInfo(make_shared< pystencils::GenPackInfoAAPush>(pdfFieldId));
-      // Direct Comm
-      blockforest::communication::UniformDirectScheme< Stencil_T > twoFieldCommDirect(blocks);
-      twoFieldCommDirect.addDataToCommunicate(make_shared<pystencils::GenMpiDtypeInfo>(pdfFieldId));
-      blockforest::communication::UniformDirectScheme< Stencil_T > aaPullCommDirect(blocks);
-      aaPullCommDirect.addDataToCommunicate(make_shared<pystencils::GenMpiDtypeInfoAAPull>(pdfFieldId));
-      blockforest::communication::UniformDirectScheme< Stencil_T > aaPushCommDirect(blocks);
-      aaPushCommDirect.addDataToCommunicate(make_shared<pystencils::GenMpiDtypeInfoAAPush>(pdfFieldId));
-      const std::string twoFieldKernelType = parameters.getParameter<std::string>( "twoFieldKernelType", "generated");
-      std::function<void(IBlock*)> twoFieldKernel;
-      if( twoFieldKernelType == "generated") {
-         twoFieldKernel = pystencils::GenLbKernel(pdfFieldId, omega);
-      } else if (twoFieldKernelType == "manualGeneric") {
-         using MyLM = lbm::D3Q19<lbm::collision_model::SRT>;
-         BlockDataID tmpPdfFieldId = blocks->addStructuredBlockData<PdfField_T>(pdfFieldAdder, "pdfs");
-         twoFieldKernel = StreamPullCollideGeneric<MyLM>(pdfFieldId, tmpPdfFieldId, omega);
-      } else if (twoFieldKernelType == "manualD3Q19") {
-         using MyLM = lbm::D3Q19<lbm::collision_model::SRT>;
-         BlockDataID tmpPdfFieldId = blocks->addStructuredBlockData<PdfField_T>(pdfFieldAdder, "pdfs");
-         twoFieldKernel = StreamPullCollideD3Q19<MyLM>(pdfFieldId, tmpPdfFieldId, omega);
-      } else {
-         WALBERLA_ABORT_NO_DEBUG_INFO("Invalid option for \"twoFieldKernelType\", "
-                                      "valid options are \"generated\", \"manualGeneric\", \"manualD3Q19\"");
-      }
-      using F = std::function<void()>;
-      SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps / 2 );
-      if( timeStepMode == "twoField")
-      {
-         timeLoop.add() << BeforeFunction(directComm ? F(twoFieldCommDirect) : F(twoFieldComm), "communication" )
-                        << Sweep( twoFieldKernel, "LB stream & collide1" );
-         timeLoop.add() << BeforeFunction(directComm ? F(twoFieldCommDirect) : F(twoFieldComm), "communication" )
-                        << Sweep( twoFieldKernel, "LB stream & collide2" );
-      } else if ( timeStepMode == "twoFieldKernelOnly") {
-         timeLoop.add() << Sweep( pystencils::GenLbKernel(pdfFieldId, omega), "LB stream & collide1" );
-         timeLoop.add() << Sweep( pystencils::GenLbKernel(pdfFieldId, omega), "LB stream & collide2" );
-      } else if ( timeStepMode == "aa") {
-         timeLoop.add() << Sweep( pystencils::GenLbKernelAAEven(pdfFieldId, omega), "AA Even" );
-         timeLoop.add() << BeforeFunction( directComm ? F(aaPullCommDirect) : F(aaPullComm) )
-                        << Sweep( pystencils::GenLbKernelAAOdd(pdfFieldId, omega), "AA Odd")
-                        << AfterFunction( directComm ? F(aaPushCommDirect) : F(aaPushComm) );
-      } else if ( timeStepMode == "aaKernelOnly") {
-         timeLoop.add() << Sweep( pystencils::GenLbKernelAAEven(pdfFieldId, omega), "AA Even" );
-         timeLoop.add() << Sweep( pystencils::GenLbKernelAAOdd(pdfFieldId, omega), "AA Odd");
-      } else {
-         WALBERLA_ABORT("Invalid value for timeStepMode");
-      }
-      int warmupSteps = parameters.getParameter<int>( "warmupSteps", 2 );
-      int outerIterations = parameters.getParameter<int>( "outerIterations", 1 );
-      for(int i=0; i < warmupSteps; ++i )
-         timeLoop.singleStep();
-      auto remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", -1.0 ); // in seconds
-      if (remainingTimeLoggerFrequency > 0) {
-         auto logger = timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps() * outerIterations, remainingTimeLoggerFrequency );
-         timeLoop.addFuncAfterTimeStep( logger, "remaining time logger" );
-      }
-      // VTK
-      uint_t vtkWriteFrequency = parameters.getParameter<uint_t>( "vtkWriteFrequency", 0 );
-      if( vtkWriteFrequency > 0 )
-      {
-         auto vtkOutput = vtk::createVTKOutput_BlockData( *blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
-                                                          "simulation_step", false, true, true, false, 0 );
-         auto velWriter = make_shared< field::VTKWriter< VelocityField_T > >( velFieldId, "vel" );
-         vtkOutput->addCellDataWriter( velWriter );
-         vtkOutput->addBeforeFunction( [&]()
-            { for( auto & b : *blocks)
-                 getterKernel(&b);
-            } );
-         timeLoop.addFuncAfterTimeStep( vtk::writeFiles( vtkOutput ), "VTK Output" );
-      }
-      bool useGui = parameters.getParameter<bool>( "useGui", false );
-      if( useGui )
-      {
-         GUI gui( timeLoop, blocks, argc, argv);
-         gui.run();
-      }
-      else
-      {
-         for ( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration )
-         {
-            timeLoop.setCurrentTimeStepToZero();
-            WcTimer simTimer;
-            auto threads = omp_get_max_threads();
-            simTimer.start();
-            timeLoop.run();
-            simTimer.end();
-            auto time = simTimer.last();
-            auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] );
-            auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
-            using std::setw;
-            WALBERLA_LOG_INFO_ON_ROOT(setw(18) << timeStepMode <<
-                                      " procs: " << setw(6) << MPIManager::instance()->numProcesses() <<
-                                      " threads: " << threads <<
-                                      " direct_comm: " << directComm <<
-                                      " time steps: " << timesteps <<
-                                      setw(15) << " block size: " << cellsPerBlock <<
-                                      " mlups/core: " << int(mlupsPerProcess/ threads) <<
-                                      " mlups: " << int(mlupsPerProcess) * MPIManager::instance()->numProcesses());
-            WALBERLA_ROOT_SECTION()
-            {
-               python_coupling::PythonCallback pythonCallbackResults( "results_callback" );
-               if ( pythonCallbackResults.isCallable())
-               {
-                  pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess );
-                  pythonCallbackResults.data().exposeValue( "stencil", infoStencil );
-                  pythonCallbackResults.data().exposeValue( "configName", infoConfigName );
-                  pythonCallbackResults.data().exposeValue( "timeStepMode", timeStepMode );
-                  pythonCallbackResults.data().exposeValue( "twoFieldKernel", twoFieldKernelType );
-                  pythonCallbackResults.data().exposeValue( "optimizations", optimizationDict );
-                  pythonCallbackResults.data().exposeValue( "githash", core::buildinfo::gitSHA1() );
-                  pythonCallbackResults.data().exposeValue( "compilerFlags", core::buildinfo::compilerFlags() );
-                  pythonCallbackResults.data().exposeValue( "buildMachine", core::buildinfo::buildMachine() );
-                  // Call Python function to report results
-                  pythonCallbackResults();
-               }
-            }
-         }
-      }
-   }
-   return 0;
+   mpi::Environment env( argc, argv );
+   for ( auto cfg = python_coupling::configBegin( argc, argv ); cfg != python_coupling::configEnd(); ++cfg )
+   {
+      WALBERLA_MPI_WORLD_BARRIER();
+      auto config = *cfg;
+      logging::configureLogging( config );
+      auto blocks = blockforest::createUniformBlockGridFromConfig( config );
+      Vector3< uint_t > cellsPerBlock = config->getBlock( "DomainSetup" ).getParameter< Vector3< uint_t > >( "cellsPerBlock" );
+      // Reading parameters
+      auto parameters = config->getOneBlock( "Parameters" );
+      const std::string timeStepMode = parameters.getParameter< std::string >( "timeStepMode", "twoField" );
+      const real_t omega = parameters.getParameter< real_t >( "omega", real_c( 1.4 ));
+      uint_t timesteps = parameters.getParameter< uint_t >( "timesteps", uint_c( 60 ));
+      const real_t shearVelocityMagnitude = parameters.getParameter< real_t >( "shearVelocityMagnitude", 0.02 );
+      const bool directComm = parameters.getParameter< bool >( "directComm", false );
+      const uint_t fPadding = parameters.getParameter< uint_t >( "fPadding", 0 );
+      const std::string pinning = parameters.getParameter< std::string >( "pinning", "" );
+      if ( !pinning.empty())
+         pinOpenMP( pinning.c_str());
+      // Use the padding allocator only when padding is requested; otherwise keep
+      // the previous 64-byte aligned allocator.
+      auto pdfFieldAdder = [fPadding]( IBlock *const block, StructuredBlockStorage *const storage )
+      {
+         shared_ptr< field::FieldAllocator< real_t > > allocator;
+         if ( fPadding > 0 )
+            allocator = make_shared< OuterPaddingFieldAllocator< real_t > >( fPadding );
+         else
+            allocator = make_shared< field::AllocateAligned< real_t, 64 > >();
+         return new PdfField_T( storage->getNumberOfXCells( *block ),
+                                storage->getNumberOfYCells( *block ),
+                                storage->getNumberOfZCells( *block ),
+                                uint_t( 1 ),
+                                field::fzyx,
+                                allocator );
+      };
+      // Creating fields
+      BlockDataID pdfFieldId = blocks->addStructuredBlockData< PdfField_T >( pdfFieldAdder, "pdfs" );
+      BlockDataID velFieldId = field::addToStorage< VelocityField_T >( blocks, "vel", real_t( 0 ), field::fzyx );
+      pystencils::GenMacroSetter setterKernel( pdfFieldId, velFieldId );
+      pystencils::GenMacroGetter getterKernel( pdfFieldId, velFieldId );
+      if ( shearVelocityMagnitude > 0 )
+         initShearVelocity( blocks, velFieldId, shearVelocityMagnitude );
+      for ( auto &b : *blocks )
+         setterKernel( &b );
+      // Buffered Comm
+      blockforest::communication::UniformBufferedScheme< Stencil_T > twoFieldComm( blocks );
+      twoFieldComm.addPackInfo( make_shared< pystencils::GenPackInfo >( pdfFieldId ));
+      blockforest::communication::UniformBufferedScheme< Stencil_T > aaPullComm( blocks );
+      aaPullComm.addPackInfo( make_shared< pystencils::GenPackInfoAAPull >( pdfFieldId ));
+      blockforest::communication::UniformBufferedScheme< Stencil_T > aaPushComm( blocks );
+      aaPushComm.addPackInfo( make_shared< pystencils::GenPackInfoAAPush >( pdfFieldId ));
+      // Direct Comm
+      blockforest::communication::UniformDirectScheme< Stencil_T > twoFieldCommDirect( blocks );
+      twoFieldCommDirect.addDataToCommunicate( make_shared< pystencils::GenMpiDtypeInfo >( pdfFieldId ));
+      blockforest::communication::UniformDirectScheme< Stencil_T > aaPullCommDirect( blocks );
+      aaPullCommDirect.addDataToCommunicate( make_shared< pystencils::GenMpiDtypeInfoAAPull >( pdfFieldId ));
+      blockforest::communication::UniformDirectScheme< Stencil_T > aaPushCommDirect( blocks );
+      aaPushCommDirect.addDataToCommunicate( make_shared< pystencils::GenMpiDtypeInfoAAPush >( pdfFieldId ));
+      const std::string twoFieldKernelType = parameters.getParameter< std::string >( "twoFieldKernelType", "generated" );
+      std::function< void( IBlock * ) > twoFieldKernel;
+      if ( twoFieldKernelType == "generated" )
+      {
+         twoFieldKernel = pystencils::GenLbKernel( pdfFieldId, omega );
+      }
+      else if ( twoFieldKernelType == "manualGeneric" )
+      {
+         using MyLM = lbm::D3Q19< lbm::collision_model::SRT >;
+         BlockDataID tmpPdfFieldId = blocks->addStructuredBlockData< PdfField_T >( pdfFieldAdder, "pdfs" );
+         twoFieldKernel = StreamPullCollideGeneric< MyLM >( pdfFieldId, tmpPdfFieldId, omega );
+      }
+      else if ( twoFieldKernelType == "manualD3Q19" )
+      {
+         using MyLM = lbm::D3Q19< lbm::collision_model::SRT >;
+         BlockDataID tmpPdfFieldId = blocks->addStructuredBlockData< PdfField_T >( pdfFieldAdder, "pdfs" );
+         twoFieldKernel = StreamPullCollideD3Q19< MyLM >( pdfFieldId, tmpPdfFieldId, omega );
+      }
+      else
+      {
+         WALBERLA_ABORT_NO_DEBUG_INFO( "Invalid option for \"twoFieldKernelType\", "
+                                       "valid options are \"generated\", \"manualGeneric\", \"manualD3Q19\"" );
+      }
+      using F = std::function< void() >;
+      SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps / 2 );
+      if ( timeStepMode == "twoField" )
+      {
+         timeLoop.add() << BeforeFunction( directComm ? F( twoFieldCommDirect ) : F( twoFieldComm ), "communication" )
+                        << Sweep( twoFieldKernel, "LB stream & collide1" );
+         timeLoop.add() << BeforeFunction( directComm ? F( twoFieldCommDirect ) : F( twoFieldComm ), "communication" )
+                        << Sweep( twoFieldKernel, "LB stream & collide2" );
+      }
+      else if ( timeStepMode == "twoFieldKernelOnly" )
+      {
+         timeLoop.add() << Sweep( pystencils::GenLbKernel( pdfFieldId, omega ), "LB stream & collide1" );
+         timeLoop.add() << Sweep( pystencils::GenLbKernel( pdfFieldId, omega ), "LB stream & collide2" );
+      }
+      else if ( timeStepMode == "aa" )
+      {
+         timeLoop.add() << Sweep( pystencils::GenLbKernelAAEven( pdfFieldId, omega ), "AA Even" );
+         timeLoop.add() << BeforeFunction( directComm ? F( aaPullCommDirect ) : F( aaPullComm ))
+                        << Sweep( pystencils::GenLbKernelAAOdd( pdfFieldId, omega ), "AA Odd" )
+                        << AfterFunction( directComm ? F( aaPushCommDirect ) : F( aaPushComm ));
+      }
+      else if ( timeStepMode == "aaKernelOnly" )
+      {
+         timeLoop.add() << Sweep( pystencils::GenLbKernelAAEven( pdfFieldId, omega ), "AA Even" );
+         timeLoop.add() << Sweep( pystencils::GenLbKernelAAOdd( pdfFieldId, omega ), "AA Odd" );
+      }
+      else
+      {
+         WALBERLA_ABORT( "Invalid value for timeStepMode" );
+      }
+      int warmupSteps = parameters.getParameter< int >( "warmupSteps", 2 );
+      int outerIterations = parameters.getParameter< int >( "outerIterations", 1 );
+      for ( int i = 0; i < warmupSteps; ++i )
+         timeLoop.singleStep();
+      auto remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", -1.0 ); // in seconds
+      if ( remainingTimeLoggerFrequency > 0 )
+      {
+         auto logger = timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps() * outerIterations, remainingTimeLoggerFrequency );
+         timeLoop.addFuncAfterTimeStep( logger, "remaining time logger" );
+      }
+      // VTK
+      uint_t vtkWriteFrequency = parameters.getParameter< uint_t >( "vtkWriteFrequency", 0 );
+      if ( vtkWriteFrequency > 0 )
+      {
+         auto vtkOutput = vtk::createVTKOutput_BlockData( *blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
+                                                          "simulation_step", false, true, true, false, 0 );
+         auto velWriter = make_shared< field::VTKWriter< VelocityField_T > >( velFieldId, "vel" );
+         vtkOutput->addCellDataWriter( velWriter );
+         vtkOutput->addBeforeFunction( [&]()
+         {
+            for ( auto &b : *blocks )
+               getterKernel( &b );
+         } );
+         timeLoop.addFuncAfterTimeStep( vtk::writeFiles( vtkOutput ), "VTK Output" );
+      }
+      bool useGui = parameters.getParameter< bool >( "useGui", false );
+      if ( useGui )
+      {
+         GUI gui( timeLoop, blocks, argc, argv );
+         gui.run();
+      }
+      else
+      {
+         for ( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration )
+         {
+            timeLoop.setCurrentTimeStepToZero();
+            WcTimer simTimer;
+            auto threads = omp_get_max_threads();
+            simTimer.start();
+            timeLoop.run();
+            simTimer.end();
+            auto time = simTimer.last();
+            auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] );
+            // MLUPS: million lattice-cell updates per second, per process
+            auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
+            using std::setw;
+            WALBERLA_LOG_INFO_ON_ROOT( setw( 18 ) << timeStepMode <<
+                                       " procs: " << setw( 6 ) << MPIManager::instance()->numProcesses() <<
+                                       " threads: " << threads <<
+                                       " direct_comm: " << directComm <<
+                                       " time steps: " << timesteps <<
+                                       setw( 15 ) << " block size: " << cellsPerBlock <<
+                                       " mlups/core: " << int( mlupsPerProcess / threads ) <<
+                                       " mlups: " << int( mlupsPerProcess ) * MPIManager::instance()->numProcesses());
+            WALBERLA_ROOT_SECTION()
+            {
+               python_coupling::PythonCallback pythonCallbackResults( "results_callback" );
+               if ( pythonCallbackResults.isCallable())
+               {
+                  pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess );
+                  pythonCallbackResults.data().exposeValue( "stencil", infoStencil );
+                  pythonCallbackResults.data().exposeValue( "configName", infoConfigName );
+                  pythonCallbackResults.data().exposeValue( "timeStepMode", timeStepMode );
+                  pythonCallbackResults.data().exposeValue( "twoFieldKernel", twoFieldKernelType );
+                  pythonCallbackResults.data().exposeValue( "optimizations", optimizationDict );
+                  pythonCallbackResults.data().exposeValue( "githash", core::buildinfo::gitSHA1());
+                  pythonCallbackResults.data().exposeValue( "compilerFlags", core::buildinfo::compilerFlags());
+                  pythonCallbackResults.data().exposeValue( "buildMachine", core::buildinfo::buildMachine());
+                  // Call Python function to report results
+                  pythonCallbackResults();
+               }
+            }
+         }
+      }
+   }
+   return 0;
 }
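The "mlups" figures logged above are million lattice-cell updates per second. A self-contained sketch of the same arithmetic, using the config values from this commit and a made-up runtime:

#include <iostream>

int main()
{
    const double nrOfCells = 300.0 * 64.0 * 64.0; // cellsPerBlock from the config
    const double timesteps = 2000.0;              // 'timesteps' parameter
    const double time      = 25.0;                // measured seconds (illustrative only)

    // Same formula as in the benchmark: cell updates per second, scaled to millions.
    const double mlupsPerProcess = nrOfCells * timesteps / time * 1e-6;
    std::cout << "mlups/process: " << mlupsPerProcess << '\n'; // ~98.3 for these numbers
    return 0;
}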
@@ -150,16 +150,19 @@ with CodeGeneration() as ctx:
                                                'cse_global': opts['aa_odd_cse_global'],
                                                'cse_pdfs': opts['aa_odd_cse_pdfs']}, **options)

-    vec = { 'assume_aligned': True, 'assume_inner_stride_one': True}
+    vec = {'assume_aligned': True, 'assume_inner_stride_one': True}

     # Sweeps
     vec['nontemporal'] = opts['two_field_nt_stores']
+    vec['assume_aligned'] = opts['two_field_nt_stores']
     generate_sweep(ctx, 'GenLbKernel', update_rule_two_field, field_swaps=[('pdfs', 'pdfs_tmp')],
                    cpu_vectorize_info=vec)

     vec['nontemporal'] = opts['aa_even_nt_stores']
+    vec['assume_aligned'] = opts['aa_even_nt_stores']
     generate_sweep(ctx, 'GenLbKernelAAEven', update_rule_aa_even, cpu_vectorize_info=vec,
                    cpu_openmp=True, ghost_layers=1)

     vec['nontemporal'] = opts['aa_odd_nt_stores']
+    vec['assume_aligned'] = opts['aa_odd_nt_stores']
     generate_sweep(ctx, 'GenLbKernelAAOdd', update_rule_aa_odd, cpu_vectorize_info=vec,
                    cpu_openmp=True, ghost_layers=1)
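The codegen hunk above ties 'assume_aligned' to the nontemporal-store switches, presumably because streaming stores only work on naturally aligned addresses, so the vectorizer may emit them only when alignment is guaranteed. A standalone illustration of that hardware constraint (plain C++ with AVX intrinsics, not generated waLBerla code; compile with AVX enabled):

#include <cstdlib>
#include <immintrin.h>

int main()
{
    const std::size_t n = 1024;
    // Nontemporal (streaming) stores such as _mm256_stream_pd require the
    // destination to be 32-byte aligned, hence the aligned allocation here.
    double *dst = static_cast< double * >( std::aligned_alloc( 32, n * sizeof( double )));
    const __m256d v = _mm256_set1_pd( 1.0 );
    for ( std::size_t i = 0; i < n; i += 4 )
        _mm256_stream_pd( dst + i, v ); // bypasses the cache; faults if dst+i is misaligned
    _mm_sfence();                       // make the streaming stores globally visible
    std::free( dst );
    return 0;
}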
@@ -51,7 +51,7 @@ def domain_decomposition_func_full(processes, threads, block_size):
 class BenchmarkScenario:
     def __init__(self, block_size=(256, 128, 128), direct_comm=True,
                  time_step_mode='aa', two_field_kernel_type='generated',
-                 domain_decomposition_func=domain_decomposition_func_z,
+                 domain_decomposition_func=domain_decomposition_func_z, pinning="", f_padding=0,
                  db_file_name='uniform_grid_gen.sqlite'):
         self.block_size = block_size
         self.direct_comm = direct_comm
@@ -61,6 +61,8 @@ class BenchmarkScenario:
         self.threads = int(os.environ['OMP_NUM_THREADS'])
         self.processes = wlb.mpi.numProcesses()
         self.db_file_name = db_file_name
+        self.pinning = pinning
+        self.f_padding = f_padding

     @wlb.member_callback
     def config(self, **kwargs):
@@ -81,6 +83,8 @@ class BenchmarkScenario:
                 'timeStepMode': self.time_step_mode,
                 'twoFieldKernelType': self.two_field_kernel_type,
                 'directComm': self.direct_comm,
+                'pinning': self.pinning,
+                'fPadding': self.f_padding,
             }
         }
         cfg['DomainSetup'].update(self.domain_decomposition_func(self.processes, self.threads, self.block_size))
@@ -168,4 +172,20 @@ def weak_scaling():
                 continue
             scenarios.add(sc)

-single_node_benchmark()
+
+def padding_test():
+    scenarios = wlb.ScenarioManager()
+    for block_size in [(300, 100, 100), (500, 100, 100)]:
+        for direct_comm in (False,):
+            for time_step_mode in ['aa', 'aaKernelOnly']:
+                for f_padding in range(16):
+                    sc = BenchmarkScenario(block_size=block_size, direct_comm=direct_comm,
+                                           time_step_mode=time_step_mode,
+                                           domain_decomposition_func=domain_decomposition_func_z,
+                                           f_padding=f_padding, pinning="0")
+                    if not block_size_ok(sc):
+                        continue
+                    scenarios.add(sc)
+
+#single_node_benchmark()
+padding_test()