diff --git a/apps/showcases/PorousMediaGPU/DragEvaluation.cpp b/apps/showcases/PorousMediaGPU/DragEvaluation.cpp index 624f466ea87ac857c10c955b54b8669c87630430..0617986e5e8f3264c25b568807eefb922bb1903f 100644 --- a/apps/showcases/PorousMediaGPU/DragEvaluation.cpp +++ b/apps/showcases/PorousMediaGPU/DragEvaluation.cpp @@ -39,12 +39,12 @@ DragEvaluation::DragEvaluation(std::shared_ptr< StructuredBlockForest >& blocks, maxLevel_ = blocks->getDepth(); blocks->getBlocks(finestBlocks_, maxLevel_); - dragFilename_ = parameters.getParameter< std::string >("filename"); checkFrequency_ = parameters.getParameter< uint_t >("evaluationFrequency"); writeCounter_ = 1000 * checkFrequency_; dt_ = setup.dt / real_c( 1 << setup.refinementLevels ); timesteps_ = setup.timesteps * ( 1 << setup.refinementLevels ); totalExecutionCounter_ = 0; + rampUpTime_ = parameters.getParameter< uint_t >("evaluationStart") * ( 1 << setup.refinementLevels ); if(writeCounter_ == uint_c(0)) writeCounter_++; @@ -52,10 +52,16 @@ DragEvaluation::DragEvaluation(std::shared_ptr< StructuredBlockForest >& blocks, const real_t radius = wallParameters.getParameter< real_t >("hemispheresRadius"); const real_t distance = wallParameters.getParameter< real_t >("hemispheresDistance"); - filesystem::path hdfFilePath( dragFilename_.c_str() ); - if( filesystem::exists( hdfFilePath ) ) - std::remove( hdfFilePath.string().c_str() ); + const auto outputFolder = setup_.outputFolder; + const auto fileStem = setup_.fileStem; + const auto fileName = outputFolder + "/" + fileStem + "Drag.h5"; + filesystem::path hdfFilePath( fileName.c_str() ); + WALBERLA_ROOT_SECTION() + { + if( filesystem::exists( hdfFilePath ) ) + std::remove( hdfFilePath.string().c_str() ); + } extendSize[0] = 0; selectStart[0] = 0; @@ -111,11 +117,13 @@ DragEvaluation::DragEvaluation(std::shared_ptr< StructuredBlockForest >& blocks, WALBERLA_CHECK_GREATER_EQUAL(surfaceAreaWall_, real_c(0.0)) meanVelocity_ = setup.latticeVelocity; + totalSurface_ = surfaceAreaHemiSpheres_ + surfaceAreaWall_; WALBERLA_LOG_INFO_ON_ROOT( "Evaluation initialised:" "\n + Total surface area of Hemispheres: " << surfaceAreaHemiSpheres_ << "\n + Surface area of the wall: " << surfaceAreaWall_ + << "\n + Total Surface area: " << totalSurface_ ) } @@ -125,7 +133,7 @@ void DragEvaluation::forceCalculation(const uint_t level) if(level == maxLevel_){ ++totalExecutionCounter_; } - if (checkFrequency_ == uint_c(0) || (totalExecutionCounter_ % checkFrequency_) != uint_c(0)){ + if (checkFrequency_ == uint_c(0) || rampUpTime_ > totalExecutionCounter_ || (totalExecutionCounter_ % checkFrequency_) != uint_c(0)){ return; } @@ -139,8 +147,8 @@ void DragEvaluation::forceCalculation(const uint_t level) const double t = double_c(totalExecutionCounter_ - 1) * dt_; const double meanU2 = double_c(meanVelocity_) * double_c(meanVelocity_); - const double cDRealArea = (double_c(2.0) * force_[0]) / (meanU2 * double_c(surfaceAreaHemiSpheres_)); - const double cLRealArea = (double_c(2.0) * force_[1]) / (meanU2 * double_c(surfaceAreaHemiSpheres_)); + const double cDRealArea = (double_c(2.0) * force_[0]) / (meanU2 * double_c(totalSurface_)); + const double cLRealArea = (double_c(2.0) * force_[1]) / (meanU2 * double_c(totalSurface_)); dragResults_[quantities_["timestep"]*writeCounter_ + executionCounter_] = float_c(t); dragResults_[quantities_["Fx"]*writeCounter_ + executionCounter_] = float_c(force_[0]); @@ -150,10 +158,13 @@ void DragEvaluation::forceCalculation(const uint_t level) dragResults_[quantities_["CLReal"]*writeCounter_ + executionCounter_] = float_c(cLRealArea); } + sumFx_ += force_[0]; + force_[0] = double_c(0.0); force_[1] = double_c(0.0); force_[2] = double_c(0.0); + ++meanQuantitiesCounter_; ++executionCounter_; if(executionCounter_ >= writeCounter_ || totalExecutionCounter_ == timesteps_){ writeResults(); @@ -185,4 +196,170 @@ void DragEvaluation::writeResults() executionCounter_ = 0; } +Plotter::Plotter(std::shared_ptr< StructuredBlockForest >& blocks, const Setup& setup, const IDs& ids) + : blocks_(blocks), setup_(setup), ids_(ids) +{ + + const auto outputFolder = setup_.outputFolder; + const auto fileStem = setup_.fileStem; + const auto fileName = outputFolder + "/" + fileStem + "MeanQuantities.h5"; + filesystem::path hdfFilePath( fileName.c_str() ); + + WALBERLA_ROOT_SECTION() + { + if( filesystem::exists( hdfFilePath ) ) + std::remove( hdfFilePath.string().c_str() ); + } + + H5::FileAccPropList fileAccProps; +#ifdef WALBERLA_BUILD_WITH_MPI + // Enable MPI-IO + H5Pset_fapl_mpio(fileAccProps.getId(), mpi::MPIManager::instance()->comm(), MPI_INFO_NULL); +#endif + h5file_ = std::make_unique<H5::H5File>(hdfFilePath.c_str(), H5F_ACC_CREAT | H5F_ACC_RDWR, H5::FileCreatPropList::DEFAULT, fileAccProps); + H5::Group dataGroup = h5file_->createGroup("Data"); +} + +void Plotter::writeResult(const real_t frictionVelocity, const real_t welfordCounter){ + + auto finalDomain = blocks_->getDomain(); + + std::map<real_t, uint_t> centerToIntex; + uint_t idx = 0; + + Vector3<real_t> p = finalDomain.min(); + IBlock* block = blocks_->getBlock(p); + while(block){ + const uint_t ny = blocks_->getNumberOfYCells(*block); + for(uint_t y = 0; y < ny; ++y){ + const Cell localCell(cell_idx_c(0), cell_idx_c(y), cell_idx_c(0)); + Vector3<real_t> globalCellCenter; + blocks_->getBlockLocalCellCenter(*block, localCell, globalCellCenter); + centerToIntex[globalCellCenter[1]] = idx; + idx++; + } + p[1] = block->getAABB().yMax() + setup_.dxF; + block = blocks_->getBlock(p); + if(block == nullptr){ + WALBERLA_CHECK_GREATER(p[1], finalDomain.yMax(), "This method does is not yet implemented for MPI parallel simulations") + } + } + + const real_t viscosity = setup_.viscosity; + const real_t channelHalfHeight = setup_.referenceLength; + + addAttribute(viscosity, "viscosity"); + addAttribute(channelHalfHeight, "channelHalfHeight"); + addAttribute(frictionVelocity, "frictionVelocity"); + addAttribute(welfordCounter, "welfordCounter"); + + std::vector<real_t> yCellCenter(centerToIntex.size(), real_c(0.0)); + std::vector<real_t> yRel(centerToIntex.size(), real_c(0.0)); + std::vector<real_t> yPlus(centerToIntex.size(), real_c(0.0)); + std::vector<uint_t> cellCounterI(centerToIntex.size(), uint_c(0)); + std::vector<uint_t> cellCounterE(centerToIntex.size(), uint_c(0)); + + std::vector<real_t> meanVelocityI(centerToIntex.size(), real_c(0.0)); + std::vector<real_t> meanVelocityE(centerToIntex.size(), real_c(0.0)); + std::vector<real_t> uPlusI(centerToIntex.size(), real_c(0.0)); + std::vector<real_t> uPlusE(centerToIntex.size(), real_c(0.0)); + std::vector<real_t> reynoldsStressVector(centerToIntex.size() * TensorField_T::F_SIZE, 0_r); + + for (auto const& it : centerToIntex){ + yCellCenter[it.second] = it.first; + yRel[it.second] = real_c(it.second) / channelHalfHeight; + yPlus[it.second] = real_c(it.second) * frictionVelocity / viscosity; + } + + for( auto bIt = blocks_->begin(); bIt != blocks_->end(); ++bIt) { + auto * meanVelocityField = bIt->template getData<VelocityField_T>(ids_.meanVelocityField); + auto * sosField = bIt->template getData<TensorField_T>(ids_.sosField); + auto * flagField = bIt->template getData<FlagField_T>(ids_.flagField); + auto domainFlag = flagField->getFlag(setup_.fluidUID); + const auto ci = meanVelocityField->xyzSize(); + for(auto cellIt = ci.begin(); cellIt != ci.end(); ++cellIt) { + Vector3<real_t> globalCellCenter; + blocks_->getBlockLocalCellCenter(*bIt, *cellIt, globalCellCenter); + + if (!centerToIntex.count(globalCellCenter[1])){ + WALBERLA_ABORT("y coordinate of center to index map was not contained") + } + + idx = centerToIntex[globalCellCenter[1]]; + cellCounterI[idx]++; + + if(isFlagSet(flagField->get(*cellIt), domainFlag)){ + cellCounterE[idx]++; + meanVelocityE[idx] += meanVelocityField->get(*cellIt, 0); + meanVelocityI[idx] += meanVelocityField->get(*cellIt, 0); + + for(uint_t i = 0; i < TensorField_T::F_SIZE; ++i) { + reynoldsStressVector[i * centerToIntex.size() + idx] = sosField->get(*cellIt, i) / welfordCounter; + } + } + } + } + + for(uint_t n = 0; n < centerToIntex.size(); ++n){ + meanVelocityI[n] /= real_c(cellCounterI[n]); + meanVelocityE[n] /= real_c(cellCounterE[n]); + + for(uint_t i = 0; i < TensorField_T::F_SIZE; ++i) { + reynoldsStressVector[i * centerToIntex.size() + n] /= real_c(cellCounterE[n]); + } + + uPlusI[n] = meanVelocityI[n] / frictionVelocity; + uPlusE[n] = meanVelocityE[n] / frictionVelocity; + } + + writeData(yCellCenter.data(), centerToIntex.size(), "y"); + writeData(yRel.data(), centerToIntex.size(), "yRel"); + writeData(yPlus.data(), centerToIntex.size(), "yPlus"); + writeData(meanVelocityI.data(), centerToIntex.size(), "meanVelocityI"); + writeData(meanVelocityE.data(), centerToIntex.size(), "meanVelocityE"); + writeData(uPlusI.data(), centerToIntex.size(), "uPlusI"); + writeData(uPlusE.data(), centerToIntex.size(), "uPlusE"); + writeData(&reynoldsStressVector[0 * centerToIntex.size()], centerToIntex.size(), "uu_rms"); + writeData(&reynoldsStressVector[1 * centerToIntex.size()], centerToIntex.size(), "uv_rms"); + writeData(&reynoldsStressVector[2 * centerToIntex.size()], centerToIntex.size(), "uw_rms"); + writeData(&reynoldsStressVector[3 * centerToIntex.size()], centerToIntex.size(), "vu_rms"); + writeData(&reynoldsStressVector[4 * centerToIntex.size()], centerToIntex.size(), "vv_rms"); + writeData(&reynoldsStressVector[5 * centerToIntex.size()], centerToIntex.size(), "vw_rms"); + writeData(&reynoldsStressVector[6 * centerToIntex.size()], centerToIntex.size(), "wu_rms"); + writeData(&reynoldsStressVector[7 * centerToIntex.size()], centerToIntex.size(), "wv_rms"); + writeData(&reynoldsStressVector[8 * centerToIntex.size()], centerToIntex.size(), "ww_rms"); +} + +void Plotter::writeData(const real_t* data, const uint_t size, const std::string& dataSetName){ + const hsize_t dsetSize[1]{ 1 }; + const hsize_t maxSize[1]{ size }; + const hsize_t chunkSize[1]{ size }; + + H5::DataSpace arrayDataSpace(1, dsetSize, maxSize); + H5::DSetCreatPropList props; + props.setChunk(1, chunkSize); + + const hsize_t selectCount[1] { size }; + const hsize_t selectStart[1] { 0 }; + + auto group = h5file_->openGroup("Data"); + H5::DataSet dataset = group.createDataSet(dataSetName, io::getH5ExportType<real_t>(), arrayDataSpace, props); + dataset.extend( selectCount ); + + H5::DataSpace dataspace = dataset.getSpace(); + dataspace.selectHyperslab(H5S_SELECT_SET, selectCount, selectStart); + + H5::DataSpace memSpace(1, selectCount); + dataset.write(data, io::getH5ExportType<real_t>(), memSpace, dataspace); +} + +void Plotter::addAttribute(const real_t value, const std::string& attributeName){ + hsize_t attributeShape[1]{ 1 }; + H5::DataSpace attributeShapeDspace(1, attributeShape, attributeShape); + auto group = h5file_->openGroup("Data"); + H5::Attribute attr = group.createAttribute(attributeName, io::getH5ExportType<real_t>(), attributeShapeDspace); + real_t valueArray[1]{value}; + attr.write(io::getH5ExportType<real_t>(), valueArray); +} + } \ No newline at end of file diff --git a/apps/showcases/PorousMediaGPU/DragEvaluation.h b/apps/showcases/PorousMediaGPU/DragEvaluation.h index 7e0a4461d685a76781d822306a057d8185e747a0..4d621325b6d1374dd4b87bc3b8e185ed5257b6c1 100644 --- a/apps/showcases/PorousMediaGPU/DragEvaluation.h +++ b/apps/showcases/PorousMediaGPU/DragEvaluation.h @@ -58,6 +58,13 @@ class DragEvaluation return [this](uint_t level) { forceCalculation(level); }; } + double getMeanFrictionVelocity(){ + if(meanQuantitiesCounter_ == 0){ + return double_c(0.0); + } + return (sumFx_ / double_c(meanQuantitiesCounter_)) / totalSurface_; + } + protected: bool initialized_{false}; @@ -72,11 +79,13 @@ class DragEvaluation real_t dt_; uint_t timesteps_; - std::string dragFilename_; uint_t checkFrequency_; uint_t writeCounter_; uint_t executionCounter_{0}; uint_t totalExecutionCounter_{0}; + uint_t rampUpTime_{100000}; + + uint_t meanQuantitiesCounter_{0}; std::unique_ptr<H5::H5File> h5file_; hsize_t extendSize[1]; @@ -86,10 +95,41 @@ class DragEvaluation double surfaceAreaHemiSpheres_; double surfaceAreaWall_; + double totalSurface_; double meanVelocity_; Vector3< double > force_; + double sumFx_{0.0}; + }; // class DragEvaluation +class Plotter +{ + public: + Plotter(std::shared_ptr< StructuredBlockForest >& blocks, const Setup& setup, const IDs& ids); + + // void forceCalculation(const uint_t level); // for calculating the force + // void writeResults(); + + void writeResult(const real_t frictionVelocity, const real_t welfordCounter); + void writeData(const real_t* data, const uint_t size, const std::string& dataSetName); + void addAttribute(const real_t value, const std::string& attributeName); + + protected: + bool initialized_{false}; + + std::shared_ptr< StructuredBlockForest > blocks_; + Setup setup_; + IDs ids_; + + std::unique_ptr<H5::H5File> h5file_; + + uint_t executionCounter_{0}; + uint_t totalExecutionCounter_{0}; + + uint_t rampUpTime_{100000}; + uint_t checkFrequency_{10000}; +}; // class Plotter + } \ No newline at end of file diff --git a/apps/showcases/PorousMediaGPU/InitializerFunctions.cpp b/apps/showcases/PorousMediaGPU/InitializerFunctions.cpp index b0c7c191dd9d82d587fb23de243697da0c8e60ab..fcae3b718a1a6e99b98bc5243bf1e88b79a97009 100644 --- a/apps/showcases/PorousMediaGPU/InitializerFunctions.cpp +++ b/apps/showcases/PorousMediaGPU/InitializerFunctions.cpp @@ -101,15 +101,16 @@ void setVelocityFieldsAsmuth(const shared_ptr< StructuredBlockStorage >& blocks, const real_t kappa = real_c(0.41); auto domainSize = blocks->getDomain().max(); + const real_t delta = domainSize[1] / real_c(2.0); + domainSize /= setup.dxC; const real_t frictionVelocity = setup.frictionVelocity; + const real_t dx = setup.dxC; + const real_t viscosity = setup.viscosity; for( auto block = blocks->begin(); block != blocks->end(); ++block ) { const uint_t level = blocks->getLevel(*block.get()); - const real_t levelScaleFactor = real_c(uint_c(1) << level); - const real_t dx = setup.dxC / (real_c(1.0) / levelScaleFactor); - auto * velocityField = block->template getData<VelocityField_T>(ids.velocityField); const auto ci = velocityField->xyzSizeWithGhostLayer(); @@ -118,29 +119,49 @@ void setVelocityFieldsAsmuth(const shared_ptr< StructuredBlockStorage >& blocks, Cell globalCell(*cellIt); blocks->transformBlockLocalToGlobalCell(globalCell, *block); Vector3<real_t> cellCenter; - blocks->getCellCenter(cellCenter, globalCell); + blocks->getCellCenter(cellCenter, globalCell, level); + const auto y = cellCenter[1]; const auto rel_x = cellCenter[0] / domainSize[0]; - const auto rel_y = cellCenter[1] / domainSize[1]; + // const auto rel_y = cellCenter[1] / domainSize[1]; const auto rel_z = cellCenter[2] / domainSize[2]; + const real_t pos = std::max(delta - std::abs(y - delta - 0_r), 0.05_r); + const auto rel_y = pos / delta; + + auto initialVel = frictionVelocity * (std::log(frictionVelocity * pos / viscosity) / kappa + B); + + Vector3<real_t> vel; + vel[0] = initialVel; + + vel[2] = 2_r * frictionVelocity / kappa * std::sin(math::pi * 16_r/dx * rel_x) * + std::sin(math::pi * 8_r * rel_y) / (std::pow(rel_y, 2_r) + 1_r); + + vel[1] = 8_r * frictionVelocity / kappa * + (std::sin(math::pi * 8_r/dx * rel_z) * std::sin(math::pi * 8_r * rel_y) + + std::sin(math::pi * 8_r/dx * rel_x)) / (std::pow(0.5_r * delta - pos, 2_r) + 1_r); + + +/* + const real_t pos = std::max(delta - std::abs(y - delta), real_c(2.0)); const real_t t1 = std::sin(math::pi * 8.0 / dx * rel_z) * std::sin(math::pi * 8.0 / dx * rel_y) + std::sin(math::pi * 8.0 / dx * rel_x); - const real_t t2 = real_c(1.0); // (np.power(0.5 * delta - pos, 2.0) + 1.0); + const real_t t2 = std::pow(1.0_r * delta - pos, 2_r) + 1_r; const real_t t3 = std::sin(math::pi * 16.0_r / dx * rel_x) * std::sin(math::pi * 8.0_r / dx * rel_y); Vector3<real_t> vel; - - vel[0] = setup.latticeVelocity; - vel[1] = 5.0_r * frictionVelocity / kappa * t1 / t2; + + const real_t intVelX = frictionVelocity * (std::log(frictionVelocity * pos / viscosity) / kappa + B); + vel[0] = setup.latticeVelocity; // intVelX > setup.latticeVelocity ? setup.latticeVelocity : intVelX; + vel[1] = 1.0_r * frictionVelocity / kappa * t1 / t2; vel[2] = 2.0_r * frictionVelocity / kappa * t3 / (std::pow(rel_y, 2.0_r) + 1.0_r); - + +*/ velocityField->get(*cellIt, 0) = vel[0]; velocityField->get(*cellIt, 1) = vel[1]; velocityField->get(*cellIt, 2) = vel[2]; } } - } // function setVelocityFieldsHenrik } // namespace walberla diff --git a/apps/showcases/PorousMediaGPU/InitializerFunctions.h b/apps/showcases/PorousMediaGPU/InitializerFunctions.h index cb69db2eb743ff15e024424b54c5fb68121203c4..de76286fc600dc33fdb7c997783ebbeedb5629cb 100644 --- a/apps/showcases/PorousMediaGPU/InitializerFunctions.h +++ b/apps/showcases/PorousMediaGPU/InitializerFunctions.h @@ -36,6 +36,7 @@ namespace walberla{ void initGrid(const shared_ptr< StructuredBlockStorage >& blocks, const BlockDataID flagFieldID, const Setup& setup); void initHemisphers(const shared_ptr< StructuredBlockStorage >& blocks, const BlockDataID flagFieldID, const Setup& setup, const Config::BlockHandle& wallParameters); +void initGenericPorousMatrix(const shared_ptr< StructuredBlockStorage >& blocks, const BlockDataID flagFieldID, const Setup& setup, const Config::BlockHandle& gpmParameters); bool hemispheresContains(const Vector3<real_t>& point, const Vector3<real_t>& center, const real_t radiusSquared); void setVelocityFieldsAsmuth(const shared_ptr< StructuredBlockStorage >& blocks, const Setup& setup, const IDs& ids); } // namespace walberla diff --git a/apps/showcases/PorousMediaGPU/JinCase.prm b/apps/showcases/PorousMediaGPU/JinCase.prm new file mode 100644 index 0000000000000000000000000000000000000000..8215f12bf9e8fe4ed022bebce95ceab08aac95db --- /dev/null +++ b/apps/showcases/PorousMediaGPU/JinCase.prm @@ -0,0 +1,108 @@ +Parameters +{ + reynoldsNumber 400; + bulkReynoldsNumber 4466; + latticeVelocity 0.05; + initialiseWithlatticeVelocity true; + initialiseWithTurbulence true; + forceUpdateFrequency 10; + + welfordSamplingStart 200000; + welfordSamplingFrequency 1; + + timesteps 10; + + processMemoryLimit 512; // MiB + innerOuterSplit <1, 1, 1>; + + // GPU related Parameters, only used if GPU is enabled + gpuEnabledMPI false; +} + +Turbulence +{ + useGrid false; + turbulentInflow false; + U_rms 0.01; + characteristicLength 0.4; + numberOfFourrierModes 128; + U_c < 2.0, 0.0, 0.0 >; +} + +RoughWall +{ + hemispheresRadius 1; + hemispheresDistance 4; +} + +DomainSetup +{ + domainSize < 48.4, 40, 12 >; + cellsPerBlock < 121, 10, 30 >; + blocks < 1, 10, 1 >; + + // domainSize < 48.4, 40, 12 >; + // cellsPerBlock < 242, 20, 60 >; + // blocks < 1, 10, 1 >; + + periodic < true, false, true >; + refinementLevels 0; + numberProcesses 1; // This is for load balancing, overwritten if more than one proc is used + blockForestFilestem porousMediaBlockForest; +} +//! [domainSetup] + +Boundaries +{ + Border { direction N; walldistance -1; flag Obstacle; } + Border { direction S; walldistance -1; flag Obstacle; } +} + + +StabilityChecker +{ + checkFrequency 0; + streamOutput false; + vtkOutput true; +} + +VTKWriter +{ + vtkWriteFrequency 2000; + vtkGhostLayers 0; + velocity true; + writeVelocityAsMagnitude false; + density false; + meanVelocity true; + flag false; + writeOnlySlice true; + sliceThickness 1; + writeXZSlice false; + amrFileFormat false; + oneFilePerProcess false; + samplingResolution -1; + initialWriteCallsToSkip 0; +} + +Logging +{ + logLevel info; // info progress detail tracing + writeSetupForestAndReturn false; // When only one process is used the decomposition is writen and the program terminates + readSetupFromFile false; + remainingTimeLoggerFrequency 20; // in seconds + writeSurfaceMeshOfTPMS false; + outputFolder output; + filestem Re400; +} + +Probes +{ + evaluationFrequency 1; + writeCounter 1000; // after how many evaluations the results are written to file +} + +DragEvaluation +{ + evaluationStart 200000; + evaluationFrequency 10; +} \ No newline at end of file diff --git a/apps/showcases/PorousMediaGPU/PorousMediaGPU.cpp b/apps/showcases/PorousMediaGPU/PorousMediaGPU.cpp index 387d10542b73a1b0b5e0abe6abd2e208e64bd48e..35af02ce8f28bb5727b7d9a61b89f4360855e181 100644 --- a/apps/showcases/PorousMediaGPU/PorousMediaGPU.cpp +++ b/apps/showcases/PorousMediaGPU/PorousMediaGPU.cpp @@ -92,16 +92,12 @@ #include "PorousMediaGPUStaticDefines.h" using namespace walberla; -using macroFieldType = VelocityField_T::value_type; #if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT) -using GPUPdfField_T = lbm_generated::GPUPdfField< StorageSpecification_T >; using gpu::communication::NonUniformGPUScheme; using lbm_generated::NonuniformGeneratedGPUPdfPackInfo; #else -using PdfField_T = lbm_generated::PdfField< StorageSpecification_T >; using blockforest::communication::UniformBufferedScheme; - using lbm_generated::UniformGeneratedPdfPackInfo; #endif @@ -121,15 +117,17 @@ class HemisphereWallDistance const real_t yMin = domain.yMin(); const real_t yMax = domain.yMax(); + // WALBERLA_CHECK(!hemispheresContains(fluid, center, radius_), "fluid cell is in contained in Hemisphere (" << fluid[0] << ", " << fluid[1] << ", " << fluid[2] << "). The block local cell is " << fluidCell) + // WALBERLA_CHECK(hemispheresContains(boundary, center, radius_), "boundary cell is not in contained in Hemisphere (" << boundary[0] << ", " << boundary[1] << ", " << boundary[2] << "). The block local cell is " << boundaryCell) + Vector3< real_t > boundary = SbF->getBlockLocalCellCenter( block, boundaryCell ); Vector3< real_t > fluid = SbF->getBlockLocalCellCenter( block, fluidCell ); auto center = getCenterOfHemisphere(boundary, hemispheresDistance_, yMin, yMax); - hemispheresContains(boundary, center, radius_); - - WALBERLA_CHECK(!hemispheresContains(fluid, center, radius_), "fluid cell is in contained in Hemisphere (" << fluid[0] << ", " << fluid[1] << ", " << fluid[2] << "). The block local cell is " << fluidCell) - WALBERLA_CHECK(hemispheresContains(boundary, center, radius_), "boundary cell is not in contained in Hemisphere (" << boundary[0] << ", " << boundary[1] << ", " << boundary[2] << "). The block local cell is " << boundaryCell) + if(hemispheresContains(fluid, center, radius_) || !hemispheresContains(boundary, center, radius_)){ + return real_c(0.5); + } return delta(fluid, boundary, center); } @@ -190,6 +188,31 @@ void refineWalls(SetupBlockForest& forest, const real_t hemispheresRadius, const } } +real_t calculateBulkVelocity(const shared_ptr< StructuredBlockStorage >& blocks, const IDs& ids, const FlagUID domainFlagUID) { + real_t bulkVelocity = real_c(0.0); + real_t numCells = real_c(0); + + for( auto block = blocks->begin(); block != blocks->end(); ++block) { + const uint_t level = blocks->getLevel(*block); + const real_t scaleFactor = real_c(1.0) / real_c( 1 << level ); + const real_t cellVolume = scaleFactor * scaleFactor * scaleFactor; + auto * velocityField = block->template getData<VelocityField_T>(ids.velocityField); + auto * flagField = block->template getData<FlagField_T>(ids.flagField); + auto domainFlag = flagField->getFlag(domainFlagUID); + const auto ci = velocityField->xyzSize(); + for(auto cellIt = ci.begin(); cellIt != ci.end(); ++cellIt) { + if(isFlagSet(flagField->get(*cellIt), domainFlag)){ + bulkVelocity += (velocityField->get(*cellIt, 0) * cellVolume); + numCells += cellVolume; + } + } + } + + mpi::allReduceInplace< real_t >(bulkVelocity, mpi::SUM); + bulkVelocity /= real_c(numCells); + return bulkVelocity; +} + int main(int argc, char** argv){ @@ -223,6 +246,7 @@ int main(int argc, char** argv){ "\n + omega: " << setup.omegaF << " (on the finest grid)" << "\n + inflow velocity: " << setup.referenceVelocity << " [m/s]" << "\n + lattice velocity: " << setup.latticeVelocity << " [dx/dt]" << + "\n + friction velocity: " << setup.frictionVelocity << " [dx/dt]" << "\n + Reynolds number: " << setup.reynoldsNumber << "\n + Mach number: " << setup.machNumber << "\n + driving force: " << setup.drivingForce << @@ -231,7 +255,8 @@ int main(int argc, char** argv){ "\n + dt (coarsest grid): " << setup.dt << " [s]" << "\n + conversion Faktor: " << setup.conversionFactor << "\n + #time steps: " << setup.timesteps << " (on the coarsest grid, " << ( real_c(1.0) / setup.dt ) << " for 1s of real time)" << - "\n + simulation time: " << ( real_c( setup.timesteps ) * setup.dt ) << " [s]" ) + "\n + simulation time: " << ( real_c( setup.timesteps ) * setup.dt ) << " [s]" + "\n + flow through cycles: " << ( setup.flowThroughCycles )) bool writeSetupForestAndReturn = loggingParameters.getParameter< bool >("writeSetupForestAndReturn", false); if (uint_c(MPIManager::instance()->numProcesses()) > 1) writeSetupForestAndReturn = false; @@ -261,22 +286,24 @@ int main(int argc, char** argv){ auto allocator = make_shared< gpu::HostFieldAllocator<macroFieldType> >(); // use pinned memory allocator for faster CPU-GPU memory transfers ids.velocityField = field::addToStorage< VelocityField_T >(blocks, "vel", macroFieldType(0.0), field::fzyx, setup.numGhostLayers, allocator); ids.meanVelocityField = field::addToStorage< VelocityField_T >(blocks, "mean velocity", macroFieldType(0.0), field::fzyx, setup.numGhostLayers, allocator); + ids.sosField = field::addToStorage< TensorField_T >(blocks, "sos velocity", macroFieldType(0.0), field::fzyx, setup.numGhostLayers, allocator); ids.densityField = field::addToStorage< ScalarField_T >(blocks, "density", macroFieldType(1.0), field::fzyx, setup.numGhostLayers, allocator); ids.flagField = field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field", setup.numGhostLayers); ids.pdfFieldGPU = lbm_generated::addGPUPdfFieldToStorage< PdfField_T >(blocks, ids.pdfField, StorageSpec, "pdfs on GPU", true); ids.velocityFieldGPU = gpu::addGPUFieldToStorage< VelocityField_T >(blocks, ids.velocityField, "velocity on GPU", true); ids.meanVelocityFieldGPU = gpu::addGPUFieldToStorage< VelocityField_T >(blocks, ids.meanVelocityField, "mean velocity on GPU", true); + ids.sosFieldGPU = gpu::addGPUFieldToStorage< TensorField_T >(blocks, ids.sosField, "sos field on GPU", true); ids.densityFieldGPU = gpu::addGPUFieldToStorage< ScalarField_T >(blocks, ids.densityField, "velocity on GPU", true); + ids.flagFieldGPU = gpu::addGPUFieldToStorage< FlagField_T >(blocks, ids.flagField, "flag on GPU", true); std::function< Vector3< uint_t > (const shared_ptr< StructuredBlockStorage >&, IBlock * const) > getInflowSliceSize = inflowSliceSize; ids.velocityFieldInflowSlice = field::addToStorage< VelocityField_T >(blocks, "velocity Inflow", getInflowSliceSize, macroFieldType(0.0), field::fzyx, setup.numGhostLayers); ids.velocityFieldInflowSliceGPU = gpu::addGPUFieldToStorage< VelocityField_T >(blocks, ids.velocityFieldInflowSlice, "velocity Inflow on GPU", true); const Cell innerOuterSplit = Cell(parameters.getParameter< Vector3<cell_idx_t> >("innerOuterSplit", Vector3<cell_idx_t>(1, 1, 1))); - SweepCollection_T sweepCollection(blocks, ids.pdfFieldGPU, ids.densityFieldGPU, ids.velocityFieldGPU, setup.drivingForce, setup.omega, innerOuterSplit); - WelfordSweep_T welfordSweep(ids.meanVelocityFieldGPU, ids.velocityFieldGPU, real_c(0.0)); - + SweepCollection_T sweepCollection(blocks, ids.densityFieldGPU, ids.pdfFieldGPU, ids.velocityFieldGPU, setup.drivingForce, setup.omega, innerOuterSplit); + WelfordSweep_T welfordSweep(ids.meanVelocityFieldGPU, ids.sosFieldGPU, ids.velocityFieldGPU, real_c(0.0)); WALBERLA_LOG_INFO_ON_ROOT(++step << ". Initialising data") if(setup.initialiseWithlatticeVelocity){ @@ -288,13 +315,13 @@ int main(int argc, char** argv){ } } - setVelocityFieldsAsmuth(blocks, setup, ids); + if(parameters.getParameter< bool >("initialiseWithTurbulence", false)){ + setVelocityFieldsAsmuth(blocks, setup, ids); + } gpu::fieldCpy< gpu::GPUField< VelocityField_T::value_type >, VelocityField_T >(blocks, ids.velocityFieldGPU, ids.velocityField); - for (auto& block : *blocks){ sweepCollection.initialise(&block, cell_idx_c(setup.numGhostLayers - 1)); - auto velocityInflow = block.getData< VelocityField_T >(ids.velocityFieldInflowSlice); WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(velocityInflow, velocityInflow->get(x, y, z, 0) = setup.latticeVelocity; @@ -305,6 +332,8 @@ int main(int argc, char** argv){ auto inflow = std::make_shared<Inflow>(blocks, setup, ids); auto probe = std::make_shared<Probe>(blocks, setup, ids, config->getOneBlock("Probes")); + auto plotter = std::make_shared<Plotter>(blocks, setup, ids); + /* const real_t lenght = 8.0; @@ -316,9 +345,16 @@ int main(int argc, char** argv){ } */ + const uint_t nx = blocks->getXSize() * blocks->getNumberOfXCells(); + for(uint_t i = 0; i < nx; ++i){ + probe->addPoint(Vector3<real_t>(real_c(i) * setup.dxC, finalDomain.center()[1], finalDomain.center()[2] )); + } + + probe->addPoint(finalDomain.center()); probe->initialise(); + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// LB SWEEPS AND BOUNDARY HANDLING /// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -373,6 +409,9 @@ int main(int argc, char** argv){ } } + gpu::fieldCpy< GPUFlagField_T, FlagField_T >(blocks, ids.flagFieldGPU, ids.flagField); + auto bvc = std::make_shared<BulkVelocityCalculator>(blocks, ids, setup); + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// COMMUNICATION SCHEME /// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -405,65 +444,41 @@ int main(int argc, char** argv){ LBMMeshRefinement.addPostBoundaryHandlingBlockFunction(evaluation->forceCalculationFunctor()); } - - const AABB sliceXY(finalDomain.xMin(), finalDomain.yMin(), finalDomain.center()[2] - setup.dxF, - finalDomain.xMax(), finalDomain.yMax(), finalDomain.center()[2] + setup.dxF); - -/* - CellInterval globalCellInterval; - blocks->getCellBBFromAABB( globalCellInterval, sliceXY, uint_c(0) ); - - std::map<IBlock*, CellInterval> sliceCellIntervals; - - for (auto& block : *blocks){ - CellInterval cellBB; - blocks->getCellBBFromAABB( cellBB, sliceXY, blocks->getLevel(block) ); - blocks->transformGlobalToBlockLocalCellInterval( cellBB, block ); - - Cell maxCell(cell_idx_c(0), cell_idx_c(0), cell_idx_c(0)); - Cell minCell(cell_idx_c( blocks->getNumberOfXCells( block ) ), cell_idx_c( blocks->getNumberOfYCells( block ) ), cell_idx_c( blocks->getNumberOfZCells( block ) )); - - for( cell_idx_t z = cell_idx_c(0); z != cell_idx_c( blocks->getNumberOfZCells( block ) ); ++z ){ - for( cell_idx_t y = cell_idx_c(0); y != cell_idx_c( blocks->getNumberOfYCells( block ) ); ++y ){ - for( cell_idx_t x = cell_idx_c(0); x != cell_idx_c( blocks->getNumberOfXCells( block ) ); ++x ){ - if( cellBB.contains(x,y,z) ){ - if(Cell(x, y, z) < minCell){ - minCell = Cell(x, y, z); - } - if(!(Cell(x, y, z) < maxCell)){ - maxCell = Cell(x, y, z); - } - } - } + const uint_t forceUpdateFrequency = parameters.getParameter< uint_t >("forceUpdateFrequency", 0); + if(forceUpdateFrequency > 0){ + auto updateForce = [&](){ + if(timeloop.getCurrentTimeStep() % forceUpdateFrequency == 0) { + const real_t correctedDrivingForce = bvc->getDrivingForce(); + sweepCollection.setFx(correctedDrivingForce); } - } - CellInterval localCellInterval(minCell, maxCell); - if(!localCellInterval.empty()){ - sliceCellIntervals[&block] = localCellInterval; - } + }; + timeloop.addFuncBeforeTimeStep(updateForce, "Update force"); } - timeloop.add() << BeforeFunction([&]() - { - for (auto const& sliceCellInterval : sliceCellIntervals){ - sweepCollection.calculateMacroscopicParametersCellInterval(sliceCellInterval.first, sliceCellInterval.second); - } - - const uint_t velCtr = uint_c(welfordSweep.getCounter()); - welfordSweep.setCounter(welfordSweep.getCounter() + real_c(1.0)); - }) << Sweep(welfordSweep.getSweepOnCellInterval(blocks, globalCellInterval), "welford sweep"); - -*/ + const uint_t welfordSamplingStart = parameters.getParameter< uint_t >("welfordSamplingStart", 0); + const uint_t welfordSamplingFrequency = parameters.getParameter< uint_t >("welfordSamplingFrequency", 0); - /*SweepTimeloop timeloop(blocks->getBlockStorage(), setup.timesteps); - timeloop.add() << BeforeFunction(communication.getCommunicateFunctor(), "communication") - << Sweep(boundaryCollection.getSweep(BoundaryCollection_T::ALL, defaultStream), "Boundary Conditions"); - timeloop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL, defaultStream), "LBM StreamCollide");*/ + if(welfordSamplingFrequency > 0){ + auto welfordUpdate = [&](){ + if(timeloop.getCurrentTimeStep() > welfordSamplingStart && (timeloop.getCurrentTimeStep() % welfordSamplingFrequency) == 0) { + const uint_t velCtr = uint_c(welfordSweep.getCounter()); + welfordSweep.setCounter(welfordSweep.getCounter() + real_c(1.0)); + for (auto& block : *blocks){ + welfordSweep.run(&block); + } + }; + }; + timeloop.addFuncBeforeTimeStep(welfordUpdate, "Welford Sampling"); + } ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// VTK Output /// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + auto copyVelocity = gpu::fieldCpyFunctor< VelocityField_T, gpu::GPUField< VelocityField_T::value_type > >(ids.velocityField, ids.velocityFieldGPU); + auto copyDensity = gpu::fieldCpyFunctor< ScalarField_T, gpu::GPUField< ScalarField_T::value_type > >(ids.densityField, ids.densityFieldGPU); + auto copyMeanVelocity = gpu::fieldCpyFunctor< VelocityField_T, gpu::GPUField< VelocityField_T::value_type > >(ids.meanVelocityField, ids.meanVelocityFieldGPU); + // VTK auto VTKWriter = config->getOneBlock("VTKWriter"); const uint_t vtkWriteFrequency = VTKWriter.getParameter< uint_t >("vtkWriteFrequency", 0); @@ -475,28 +490,30 @@ int main(int argc, char** argv){ const real_t sliceThickness = VTKWriter.getParameter< real_t >("sliceThickness", real_c(1.0)); if (vtkWriteFrequency > 0){ - const std::string vtkFolder = "VTKPorousMediaRE_" + std::to_string(uint64_c(setup.reynoldsNumber)); + const std::string identifier = setup.fileStem; auto vtkOutput = - vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, vtkGhostLayers, false, vtkFolder, + vtk::createVTKOutput_BlockData(*blocks, identifier, vtkWriteFrequency, vtkGhostLayers, false, setup.outputFolder, "simulation_step", false, true, true, false, 0, amrFileFormat, oneFilePerProcess); vtkOutput->setInitialWriteCallsToSkip(initialWriteCallsToSkip); - vtkOutput->addBeforeFunction([&]() { + for (auto& block : *blocks){ - sweepCollection.calculateMacroscopicParameters(&block); + if (VTKWriter.getParameter< bool >("velocity")){copyVelocity(&block);} + if (VTKWriter.getParameter< bool >("density")){copyDensity(&block);} + if (VTKWriter.getParameter< bool >("meanVelocity")){copyMeanVelocity(&block);} } + auto drivingForeceGPU = bvc->getDrivingForce(); + auto initialDrivingForce = setup.drivingForce; - // for (auto const& sliceCellInterval : sliceCellIntervals){ - // sweepCollection.calculateMacroscopicParametersCellInterval(sliceCellInterval.first, sliceCellInterval.second); - // } + auto bulkVelocityCPU = calculateBulkVelocity(blocks, ids, setup.fluidUID); + auto blukVelocityGPU = bvc->bulkVelocity(); + WALBERLA_LOG_DEVEL_VAR(bulkVelocityCPU); + WALBERLA_LOG_DEVEL_VAR(blukVelocityGPU); + WALBERLA_LOG_DEVEL_VAR(blukVelocityGPU); + WALBERLA_LOG_DEVEL_VAR(drivingForeceGPU); -#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT) - gpu::fieldCpy< VelocityField_T, gpu::GPUField< VelocityField_T::value_type > >(blocks, ids.velocityField, ids.velocityFieldGPU); - // gpu::fieldCpy< VelocityField_T, gpu::GPUField< VelocityField_T::value_type > >(blocks, ids.meanVelocityField, ids.meanVelocityFieldGPU); - // gpu::fieldCpy< ScalarField_T, gpu::GPUField< ScalarField_T::value_type > >(blocks, ids.densityField, ids.densityFieldGPU); -#endif }); vtkOutput->setSamplingResolution(samplingResolution); @@ -507,8 +524,8 @@ int main(int argc, char** argv){ if (VTKWriter.getParameter< bool >("writeOnlySlice", true)){ - // const AABB sliceXY(finalDomain.xMin(), finalDomain.yMin(), finalDomain.center()[2] - sliceThickness * setup.dxF, - // finalDomain.xMax(), finalDomain.yMax(), finalDomain.center()[2] + sliceThickness * setup.dxF); + const AABB sliceXY(finalDomain.xMin(), finalDomain.yMin(), finalDomain.center()[2] - sliceThickness * setup.dxF, + finalDomain.xMax(), finalDomain.yMax(), finalDomain.center()[2] + sliceThickness * setup.dxF); vtkOutput->addCellInclusionFilter(vtk::AABBCellFilter(sliceXY)); if (VTKWriter.getParameter< bool >("writeXZSlice", false)){ @@ -589,6 +606,15 @@ int main(int argc, char** argv){ #endif WALBERLA_MPI_BARRIER() simTimer.end(); + + gpu::fieldCpy< TensorField_T, gpu::GPUField< VelocityField_T::value_type > >(blocks, ids.sosField, ids.sosFieldGPU); + + const double ut = evaluation->getMeanFrictionVelocity(); + const double welfordCounter = welfordSweep.getCounter(); + WALBERLA_LOG_DEVEL_VAR(ut) + WALBERLA_LOG_DEVEL_VAR(welfordCounter) + plotter->writeResult(ut, welfordCounter); + WALBERLA_LOG_INFO_ON_ROOT("Simulation finished") double time = double_c(simTimer.max()); WALBERLA_MPI_SECTION() { walberla::mpi::reduceInplace(time, walberla::mpi::MAX); } diff --git a/apps/showcases/PorousMediaGPU/PorousMediaGPU.py b/apps/showcases/PorousMediaGPU/PorousMediaGPU.py index 009f0742b89acbc3163115db810aa1bd2763c7e9..fbd9e7e4f1f3832d8cdffba57b7ecee7b4871100 100644 --- a/apps/showcases/PorousMediaGPU/PorousMediaGPU.py +++ b/apps/showcases/PorousMediaGPU/PorousMediaGPU.py @@ -7,7 +7,7 @@ from pystencils.simp import insert_aliases, insert_constants from lbmpy import LBStencil, LBMConfig, LBMOptimisation from lbmpy.boundaries.boundaryconditions import NoSlip, NoSlipLinearBouzidi, ExtrapolationOutflow, UBB, FixedDensity from lbmpy.creationfunctions import create_lb_collision_rule -from lbmpy.enums import Method, Stencil, ForceModel +from lbmpy.enums import Method, Stencil, ForceModel, SubgridScaleModel from lbmpy.flow_statistics import welford_assignments from pystencils_walberla import CodeGeneration, generate_info_header, generate_sweep @@ -39,15 +39,16 @@ with CodeGeneration() as ctx: dtype = 'float64' if ctx.double_accuracy else 'float32' pdf_dtype = 'float64' - stencil = LBStencil(Stencil.D3Q27) + stencil = LBStencil(Stencil.D3Q19) q = stencil.Q dim = stencil.D - + streaming_pattern = 'esopull' pdfs = fields(f"pdfs({stencil.Q}): {pdf_dtype}[3D]", layout='fzyx') velocity_field, density_field = fields(f"velocity({dim}), density(1) : {dtype}[{dim}D]", layout='fzyx') mean_velocity = fields(f"mean_velocity({dim}): {dtype}[{stencil.D}D]", layout='fzyx') + sum_of_squares_field = fields(f"sum_of_squares_field({stencil.D**2}): {dtype}[{stencil.D}D]", layout='fzyx') macroscopic_fields = {'density': density_field, 'velocity': velocity_field} @@ -59,9 +60,10 @@ with CodeGeneration() as ctx: relaxation_rate=omega, compressible=True, force=force_vector, - # subgrid_scale_model=SubgridScaleModel.QR, - fourth_order_correction=0.01 if method_enum == Method.CUMULANT and stencil.Q == 27 else False, - streaming_pattern=streaming_pattern + force_model=ForceModel.GUO, + # fourth_order_correction=0.1 if method_enum == Method.CUMULANT and stencil.Q == 27 else False, + streaming_pattern=streaming_pattern, + output={'density': density_field, 'velocity': velocity_field} ) lbm_opt = LBMOptimisation(cse_global=False, cse_pdfs=False, field_layout="fzyx") @@ -110,11 +112,12 @@ with CodeGeneration() as ctx: target=target, data_type=dtype, pdfs_data_type=pdf_dtype, max_threads=max_threads) - welford_update = welford_assignments(field=velocity_field, mean_field=mean_velocity) + welford_update = welford_assignments(field=velocity_field, mean_field=mean_velocity, sum_of_squares_field=sum_of_squares_field) generate_sweep(ctx, "Welford", welford_update, target=target) field_typedefs = {'VelocityField_T': velocity_field, - 'ScalarField_T': density_field} + 'ScalarField_T': density_field, + 'TensorField_T': sum_of_squares_field} # Info header containing correct template definitions for stencil and field generate_info_header(ctx, f'{name_prefix}InfoHeader', diff --git a/apps/showcases/PorousMediaGPU/Probe.cpp b/apps/showcases/PorousMediaGPU/Probe.cpp index 160297ac6dd76b180e7f79b9737127ce1155ae7a..37323d336e2e5ff1a8be124faa5fc3c3892f386c 100644 --- a/apps/showcases/PorousMediaGPU/Probe.cpp +++ b/apps/showcases/PorousMediaGPU/Probe.cpp @@ -23,7 +23,6 @@ namespace walberla{ void Probe::initialise(){ - if(probePoints_.empty()){return;} const AABB domain = blocks_->getDomain(); @@ -35,22 +34,26 @@ void Probe::initialise(){ const Cell localCell = blocks_->getBlockLocalCell(*block, p); Vector3<real_t> localCellCenter; blocks_->getCellCenter(localCellCenter, localCell); - pointsPerBlock_[block].emplace_back(localCellCenter); } } - for (auto& block : *blocks_){ - auto * dataVector = block.getData< DataVector > ( probeDataID ); + uint_t totalNumberOfPoints = 0; + for (auto const& it : pointsPerBlock_){ + IBlock* block = it.first; + std::vector<Vector3<real_t>> points = it.second; + + auto * dataVector = block->getData< DataVector > ( probeDataID ); auto & cpuCellVector = dataVector->cpuCellVector(); auto & cpuDataVector = dataVector->cpuDataVector(); - const uint_t n = uint_c(pointsPerBlock_[&block].size()); + const uint_t n = points.size(); + totalNumberOfPoints += n; cpuCellVector.resize(3 * n); uint_t i = 0; - for(auto p: pointsPerBlock_[&block]){ - const Cell localCell = blocks_->getBlockLocalCell(block, p); + for(auto p: points){ + const Cell localCell = blocks_->getBlockLocalCell(*block, p); cpuCellVector[i] = localCell[0]; cpuCellVector[i + n] = localCell[1]; @@ -67,14 +70,15 @@ void Probe::initialise(){ getBlockAndGridSize(s, threadPerBlock, numberBlocks); - gpuBlock_[&block] = threadPerBlock; - gpuGrid_[&block] = numberBlocks; + gpuBlock_[block] = threadPerBlock; + gpuGrid_[block] = numberBlocks; } extendSize[0] = 0; selectStart[0] = 0; writeHeaderFiles(); + WALBERLA_LOG_INFO_ON_ROOT("Added " << totalNumberOfPoints << " probes for postprocessing") } void Probe::writeHeaderFiles(){ @@ -87,10 +91,16 @@ void Probe::writeHeaderFiles(){ H5::DSetCreatPropList props; props.setChunk(1, chunkSize); - filesystem::path hdfFilePath( filename_.c_str() ); - if( filesystem::exists( hdfFilePath ) ) - std::remove( hdfFilePath.string().c_str() ); + const auto outputFolder = setup_.outputFolder; + const auto fileStem = setup_.fileStem; + const auto fileName = outputFolder + "/" + fileStem + "Probe.h5"; + filesystem::path hdfFilePath( fileName.c_str() ); + WALBERLA_ROOT_SECTION() + { + if( filesystem::exists( hdfFilePath ) ) + std::remove( hdfFilePath.string().c_str() ); + } H5::FileAccPropList fileAccProps; #ifdef WALBERLA_BUILD_WITH_MPI @@ -103,8 +113,8 @@ void Probe::writeHeaderFiles(){ timeGroup.createDataSet("t", io::h5Float(), arrayDataSpace, props); uint_t counter = 0; - for (auto blockIterator = blocks_->begin(); blockIterator != blocks_->end(); ++blockIterator){ - auto block = blockIterator.get(); + for (auto const& it : pointsPerBlock_){ + IBlock* block = it.first; auto* dataVector = block->getData< DataVector > ( probeDataID ); auto & cpuCellVector = dataVector->cpuCellVector(); const uint_t n = uint_c(dataVector->size()); @@ -144,8 +154,8 @@ void Probe::writeProbes(){ writeData(tmp, "time", "t"); - for (auto blockIterator = blocks_->begin(); blockIterator != blocks_->end(); ++blockIterator){ - auto block = blockIterator.get(); + for (auto const& it : pointsPerBlock_){ + IBlock* block = it.first; auto* dataVector = block->getData< DataVector > ( probeDataID ); dataVector->syncCPU(); dataVector->clear(); @@ -153,8 +163,8 @@ void Probe::writeProbes(){ const uint_t nDatasets = dataSetNames.size(); - for (auto blockIterator = blocks_->begin(); blockIterator != blocks_->end(); ++blockIterator){ - auto block = blockIterator.get(); + for (auto const& it : pointsPerBlock_){ + IBlock* block = it.first; auto* dataVector = block->getData< DataVector > ( probeDataID ); auto & cpuDataVector = dataVector->cpuDataVector(); const uint_t n = uint_c(dataVector->size()); @@ -193,19 +203,19 @@ void Probe::update(){ ++totalExecutionCounter_; if (probePoints_.empty() || (totalExecutionCounter_ - uint_c(1)) % frequency_ != 0) return; - const real_t velocityScalingFactor = setup_.conversionFactor; - - for (auto& block : *blocks_){ + const float velocityScalingFactor = float_c(setup_.conversionFactor); - auto pdfs = block.getData< gpu::GPUField<double> >(ids_.pdfFieldGPU); + for (auto const& it : pointsPerBlock_){ + IBlock* block = it.first; + auto density = block->getData< gpu::GPUField<double> >(ids_.densityFieldGPU); + auto velocity = block->getData< gpu::GPUField<double> >(ids_.velocityFieldGPU); + auto* dataVector = block->getData< DataVector > ( probeDataID ); - auto* dataVector = block.getData< DataVector > ( probeDataID ); int64_t* gpuCellVector = dataVector->gpuCellVector(); float* gpuDataVector = dataVector->advanceGPUDataVector(); - const int64_t n = dataVector->size(); - updateProbe(pdfs, gpuCellVector, gpuDataVector, n, velocityScalingFactor, gpuBlock_[&block], gpuGrid_[&block]); + updateProbe(density, velocity, gpuCellVector, gpuDataVector, n, velocityScalingFactor, gpuBlock_[block], gpuGrid_[block]); } ++executionCounter_; diff --git a/apps/showcases/PorousMediaGPU/Probe.h b/apps/showcases/PorousMediaGPU/Probe.h index 7dec08cf8ee025362edd42cf6e61dc45db3ae7d1..f787233adfd2e4d310be8d401ba3575b2f0fb729 100644 --- a/apps/showcases/PorousMediaGPU/Probe.h +++ b/apps/showcases/PorousMediaGPU/Probe.h @@ -106,7 +106,6 @@ class Probe { frequency_ = probeParameters.getParameter< uint_t >("evaluationFrequency"); - filename_ = probeParameters.getParameter< std::string >("filename"); writeCounter_ = probeParameters.getParameter< uint_t >("writeCounter"); auto createProbeData = []( IBlock * const , StructuredBlockStorage * const ) { return new DataVector(); }; @@ -130,7 +129,6 @@ class Probe BlockDataID probeDataID; uint_t frequency_; - std::string filename_; uint_t writeCounter_; uint_t executionCounter_; uint_t totalExecutionCounter_; diff --git a/apps/showcases/PorousMediaGPU/Setup.h b/apps/showcases/PorousMediaGPU/Setup.h index 09d0723dbf70d78463efc97b642ec0f10df56bb1..d156bccc23ac1f68288bd794c6a8a05191fc9a24 100644 --- a/apps/showcases/PorousMediaGPU/Setup.h +++ b/apps/showcases/PorousMediaGPU/Setup.h @@ -28,14 +28,12 @@ #include <string> #include "Types.h" +#include "util.h" +#include "core/Filesystem.h" namespace walberla { -inline real_t calculateDrivingForce(const real_t frictionVelocity, const real_t channelHalfHeight){ - return frictionVelocity * frictionVelocity / channelHalfHeight; -} - struct Setup { @@ -75,22 +73,23 @@ struct Setup uc = turbulence.getParameter< Vector3< real_t > >("U_c"); if(config->getNumBlocks("RoughWall") > 0){ - kinViscosity = parameters.getParameter< real_t >("viscosity"); - const real_t bulkReynoldsNumber = parameters.getParameter< real_t >("bulkReynoldsNumber"); + // kinViscosity = parameters.getParameter< real_t >("viscosity"); referenceLength *= real_c(0.5); - - const real_t bulkVelocity = (bulkReynoldsNumber * kinViscosity) / referenceLength; - dt = latticeVelocity / bulkVelocity * coarseMeshSize; - viscosity = (kinViscosity * dt) / (coarseMeshSize * coarseMeshSize); const real_t latticeReferenceLength = referenceLength / coarseMeshSize; + const real_t bulkReynoldsNumber = parameters.getParameter< real_t >("bulkReynoldsNumber"); + + viscosity = latticeReferenceLength * latticeVelocity / bulkReynoldsNumber; + referenceVelocity = latticeVelocity; + dt = latticeVelocity / referenceVelocity * coarseMeshSize; + + kinViscosity = referenceLength * referenceVelocity / bulkReynoldsNumber; - referenceVelocity = (reynoldsNumber * kinViscosity) / referenceLength; const real_t speedOfSound = real_c(real_c(1.0) / std::sqrt( real_c(3.0) )); machNumber = latticeVelocity / speedOfSound; omega = real_c(real_c(1.0) / (real_c(3.0) * viscosity + real_c(0.5))); frictionVelocity = (reynoldsNumber * viscosity) / latticeReferenceLength; - drivingForce = calculateDrivingForce(frictionVelocity, latticeReferenceLength); + drivingForce = calculateDrivingForce(frictionVelocity, latticeReferenceLength, latticeVelocity, latticeVelocity); } else{ referenceVelocity = parameters.getParameter< real_t >("referenceVelocity"); @@ -116,6 +115,8 @@ struct Setup timesteps = parameters.getParameter< uint_t >("timesteps"); numGhostLayers = refinementLevels == 0 ? uint_c(1) : uint_c(2); + flowThroughCycles = (real_c(timesteps) * dt) / (domainSize[0] / (latticeVelocity * conversionFactor)); + valuesPerCell = (Stencil_T::Q + VelocityField_T::F_SIZE + uint_c(1) * ScalarField_T::F_SIZE); memoryPerCell = memory_t(valuesPerCell * sizeof(PdfField_T::value_type) + uint_c(1)); processMemoryLimit = parameters.getParameter< memory_t >( "processMemoryLimit", memory_t( 512 ) ) * memory_t( 1024 * 1024 ); @@ -133,6 +134,14 @@ struct Setup writeSetupForestAndReturn = logging.getParameter< bool >("writeSetupForestAndReturn", false); remainingTimeLoggerFrequency = logging.getParameter< real_t >("remainingTimeLoggerFrequency", 3.0); + outputFolder = logging.getParameter< std::string >("outputFolder", "output"); + fileStem = logging.getParameter< std::string >("filestem", "data"); + + WALBERLA_ROOT_SECTION(){ + if (!filesystem::is_directory(outputFolder) || !filesystem::exists(outputFolder)) { // Check if output folder exists + filesystem::create_directory(outputFolder); // create output folder + } + } } std::string blockForestFilestem; @@ -172,6 +181,7 @@ struct Setup real_t dxC; real_t dxF; real_t dt; + real_t flowThroughCycles; real_t drivingForce; real_t conversionFactor; @@ -201,6 +211,8 @@ struct Setup bool writeSetupForestAndReturn; real_t remainingTimeLoggerFrequency; + std::string outputFolder; + std::string fileStem; }; } \ No newline at end of file diff --git a/apps/showcases/PorousMediaGPU/Types.h b/apps/showcases/PorousMediaGPU/Types.h index 5d2fbe11f65c22752d59d698a3c4dbaa78ca2f3d..d27d1fbf78e93c68935940c08f61c6343e24ffbb 100644 --- a/apps/showcases/PorousMediaGPU/Types.h +++ b/apps/showcases/PorousMediaGPU/Types.h @@ -36,11 +36,21 @@ using BoundaryCollection_T = lbm::PorousMediaGPUBoundaryCollection< FlagField_T using SweepCollection_T = lbm::PorousMediaGPUSweepCollection; using WelfordSweep_T = pystencils::Welford; +using macroFieldType = VelocityField_T::value_type; + +#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT) +#include "lbm_generated/gpu/GPUPdfField.h" +using GPUPdfField_T = lbm_generated::GPUPdfField< StorageSpecification_T >; +using GPUField_T = gpu::GPUField< VelocityField_T::value_type >; +using GPUFlagField_T = gpu::GPUField< FlagField_T::value_type >; +#endif + struct IDs { BlockDataID pdfField; BlockDataID velocityField; BlockDataID meanVelocityField; + BlockDataID sosField; BlockDataID velocityFieldInflowSlice; BlockDataID densityField; BlockDataID flagField; @@ -48,6 +58,8 @@ struct IDs BlockDataID pdfFieldGPU; BlockDataID velocityFieldGPU; BlockDataID meanVelocityFieldGPU; + BlockDataID sosFieldGPU; BlockDataID velocityFieldInflowSliceGPU; BlockDataID densityFieldGPU; + BlockDataID flagFieldGPU; }; \ No newline at end of file diff --git a/apps/showcases/PorousMediaGPU/roughWall.prm b/apps/showcases/PorousMediaGPU/roughWall.prm index a23472152e1ed181709a8ab8aa16f4a624a898fa..fc4b740ff9bc6e4baa03ef0500a3290447982397 100644 --- a/apps/showcases/PorousMediaGPU/roughWall.prm +++ b/apps/showcases/PorousMediaGPU/roughWall.prm @@ -2,11 +2,15 @@ Parameters { reynoldsNumber 400; bulkReynoldsNumber 4466; - viscosity 1.48e-5; latticeVelocity 0.05; initialiseWithlatticeVelocity true; + initialiseWithTurbulence true; + forceUpdateFrequency 10; - timesteps 1; + welfordSamplingStart 200000; + welfordSamplingFrequency 1; + + timesteps 10; processMemoryLimit 512; // MiB innerOuterSplit <1, 1, 1>; @@ -25,20 +29,24 @@ Turbulence U_c < 2.0, 0.0, 0.0 >; } -RoughWall +genericPorousMatrix { - hemispheresRadius 1; - hemispheresDistance 4; + poreSize 2; + barSize 1; } DomainSetup { - domainSize < 34.6, 40, 4 >; - cellsPerBlock < 346, 40, 40 >; - + domainSize < 48.4, 40, 12 >; + cellsPerBlock < 121, 10, 30 >; blocks < 1, 10, 1 >; + + // domainSize < 48.4, 40, 12 >; + // cellsPerBlock < 242, 20, 60 >; + // blocks < 1, 10, 1 >; + periodic < true, false, true >; - refinementLevels 2; + refinementLevels 0; numberProcesses 1; // This is for load balancing, overwritten if more than one proc is used blockForestFilestem porousMediaBlockForest; } @@ -46,8 +54,8 @@ DomainSetup Boundaries { - Border { direction N; walldistance -1; flag NoSlip; } - Border { direction S; walldistance -1; flag NoSlip; } + Border { direction N; walldistance -1; flag Obstacle; } + Border { direction S; walldistance -1; flag Obstacle; } } @@ -60,12 +68,12 @@ StabilityChecker VTKWriter { - vtkWriteFrequency 1000; + vtkWriteFrequency 2000; vtkGhostLayers 0; velocity true; writeVelocityAsMagnitude false; density false; - meanVelocity false; + meanVelocity true; flag false; writeOnlySlice true; sliceThickness 1; @@ -83,17 +91,18 @@ Logging readSetupFromFile false; remainingTimeLoggerFrequency 20; // in seconds writeSurfaceMeshOfTPMS false; + outputFolder output; + filestem Re400; } Probes { evaluationFrequency 1; writeCounter 1000; // after how many evaluations the results are written to file - filename probeData.h5; } DragEvaluation { - evaluationFrequency 40; - filename drag.h5; + evaluationStart 200000; + evaluationFrequency 10; } \ No newline at end of file diff --git a/apps/showcases/PorousMediaGPU/util.cpp b/apps/showcases/PorousMediaGPU/util.cpp index bc7e9ceae4cb5808330ebd68c0fe9440055d73be..c1a374962b11f51ac25d569b2dffd67cebd9e4bc 100644 --- a/apps/showcases/PorousMediaGPU/util.cpp +++ b/apps/showcases/PorousMediaGPU/util.cpp @@ -130,4 +130,8 @@ void getAllHemisphereCenters(std::vector<Vector3<real_t>>& centers, const real_t } } +real_t calculateDrivingForce(const real_t targetFrictionVelocity, const real_t referenceLenght, const real_t targetBulkVelocity, const real_t bulkVelocity){ + return targetFrictionVelocity * targetFrictionVelocity / referenceLenght + (targetBulkVelocity - bulkVelocity) * targetBulkVelocity / referenceLenght; +} + } // namespace walberla \ No newline at end of file diff --git a/apps/showcases/PorousMediaGPU/util.h b/apps/showcases/PorousMediaGPU/util.h index 2e05b671b3b9553c8206712cea633ee7850413e3..f2c76729e684f325ab1ec6e6283cf3c02edf5a34 100644 --- a/apps/showcases/PorousMediaGPU/util.h +++ b/apps/showcases/PorousMediaGPU/util.h @@ -35,6 +35,8 @@ Vector3<real_t> getCenterOfHemisphere(const Vector3<real_t>& p, const real_t hem const real_t domainYMin, const real_t domainYMax); bool hemispheresContains(const Vector3<real_t>& point, const Vector3<real_t>& center, const real_t radius); -void getAllHemisphereCenters(std::vector<Vector3<real_t>>& centers, const real_t hemispheresRadius, const real_t hemispheresDistance, const math::AABB& domain); +void getAllHemisphereCenters(std::vector<Vector3<real_t>>& centers, const real_t hemispheresRadius, const real_t hemispheresDistance, const math::AABB& domain); + +real_t calculateDrivingForce(const real_t targetFrictionVelocity, const real_t referenceLenght, const real_t targetBulkVelocity, const real_t bulkVelocity); } // namespace walberla \ No newline at end of file diff --git a/apps/showcases/PorousMediaGPU/utilGPU.cu b/apps/showcases/PorousMediaGPU/utilGPU.cu index d6b56626cfe5c67ba92bfedc021648c1ff64e3b0..c1c6f47cc95c6240dcd93bac34ea54b4daca10e6 100644 --- a/apps/showcases/PorousMediaGPU/utilGPU.cu +++ b/apps/showcases/PorousMediaGPU/utilGPU.cu @@ -25,50 +25,22 @@ namespace walberla namespace internal{ -static __device__ void densityVelocityEven(double* const _data_pdfs, - double* density, double* uX, double* uY, double* uZ, - int64_t const ctr_0, int64_t const ctr_1, int64_t const ctr_2, - int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +#if __CUDA_ARCH__ < 600 +__device__ double atomicAdd(double* address, double val) { - const double vel0Term = _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 20*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 22*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 24*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 26*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3]; - const double momdensity_0 = vel0Term - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 19*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 21*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 23*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 25*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3]; - const double vel1Term = _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 21*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 25*_stride_pdfs_3]; - const double momdensity_1 = vel1Term - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 19*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 20*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 22*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 23*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 24*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 26*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_3]; - const double vel2Term = _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 23*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 6*_stride_pdfs_3]; - const double delta_rho = vel0Term + vel1Term + vel2Term + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 19*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 5*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2]; - const double momdensity_2 = vel2Term - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 19*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 20*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 21*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 22*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 24*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 25*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 26*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 5*_stride_pdfs_3]; - const double rho = delta_rho + 1.0; - const double u_0 = momdensity_0*((1.0) / (rho)); - const double u_1 = momdensity_1*((1.0) / (rho)); - const double u_2 = momdensity_2*((1.0) / (rho)); - *density = rho; - *uX = u_0; - *uY = u_1; - *uZ = u_2; -} + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; -static __device__ void densityVelocityOdd(double* const _data_pdfs, - double* density, double* uX, double* uY, double* uZ, - int64_t const ctr_0, int64_t const ctr_1, int64_t const ctr_2, - int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) -{ - const double vel0Term = _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 19*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 23*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 14*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 18*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 21*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 25*_stride_pdfs_3]; - const double momdensity_0 = vel0Term - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 20*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 24*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 13*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 17*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 22*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 26*_stride_pdfs_3]; - const double vel1Term = _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 11*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 15*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 20*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 24*_stride_pdfs_3]; - const double momdensity_1 = vel1Term + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 19*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 23*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 21*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 25*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 12*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 16*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 22*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 26*_stride_pdfs_3]; - const double vel2Term = _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 5*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 12*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 13*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 22*_stride_pdfs_3]; - const double delta_rho = vel0Term + vel1Term + vel2Term + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 6*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 16*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 17*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 26*_stride_pdfs_3]; - const double momdensity_2 = vel2Term + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 19*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 23*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 14*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 18*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 21*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 25*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 11*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 15*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 6*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 16*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 20*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 24*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 17*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 26*_stride_pdfs_3]; - const double rho = delta_rho + 1.0; - const double u_0 = momdensity_0*((1.0) / (rho)); - const double u_1 = momdensity_1*((1.0) / (rho)); - const double u_2 = momdensity_2*((1.0) / (rho)); - *density = rho; - *uX = u_0; - *uY = u_1; - *uZ = u_2; -} + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + + return __longlong_as_double(old); +} +#endif static __device__ void singleMode(double* rX, double* rY, double* rZ, const double kappaN, const double ut, const double pX, const double pY, const double pZ, const int64_t linear_index) @@ -195,9 +167,10 @@ static __global__ void summOfFourrierModesKernel(double * velocity, double * con } -static __global__ void evaluateProbePoints(double* const pdfs, int64_t* cellData, float* probeData, - const int64_t n, const uint8_t timestep, const real_t velocityScalingFactor, - const int64_t xStride, const int64_t yStride, const int64_t zStride, const int64_t fStride) +static __global__ void evaluateProbePoints(double* const density, double* const velocity, int64_t* cellData, float* probeData, + const int64_t n, const float velocityScalingFactor, + const int64_t xStrideDensity, const int64_t yStrideDensity, const int64_t zStrideDensity, + const int64_t xStrideVelocity, const int64_t yStrideVelocity, const int64_t zStrideVelocity, const int64_t fStrideVelocity) { if(blockDim.x*blockIdx.x + threadIdx.x < n) { @@ -207,19 +180,34 @@ static __global__ void evaluateProbePoints(double* const pdfs, int64_t* cellData const int64_t y = cellData[ctr + n]; const int64_t z = cellData[ctr + 2*n]; - double rho, uX, uY, uZ; + const float rho = ((float)(density[xStrideDensity*x + yStrideDensity*y + zStrideDensity*z])); + const float uX = ((float)(velocity[xStrideVelocity*x + yStrideVelocity*y + zStrideVelocity*z])); + const float uY = ((float)(velocity[xStrideVelocity*x + yStrideVelocity*y + zStrideVelocity*z + fStrideVelocity])); + const float uZ = ((float)(velocity[xStrideVelocity*x + yStrideVelocity*y + zStrideVelocity*z + 2*fStrideVelocity])); - if(((timestep & 1) ^ 1)) { - densityVelocityEven(pdfs, &rho, &uX, &uY, &uZ, x, y, z, xStride, yStride, zStride, fStride); - } - else{ - densityVelocityOdd(pdfs, &rho, &uX, &uY, &uZ, x, y, z, xStride, yStride, zStride, fStride); - } + probeData[ctr] = rho; + probeData[ctr + n] = velocityScalingFactor * uX; + probeData[ctr + 2*n] = velocityScalingFactor * uY; + probeData[ctr + 3*n] = velocityScalingFactor * uZ; + } +} + +static __global__ void sumVelocity(double* result, double* const velocity, uint8_t* const flagField, const int64_t n, const double cellVolume, const uint8_t domainFlag, + const int64_t xSize, const int64_t ySize, const int64_t zSize, + const int64_t xStrideVelocity, const int64_t yStrideVelocity, const int64_t zStrideVelocity, const int64_t fStrideVelocity, + const int64_t xStrideFlag, const int64_t yStrideFlag, const int64_t zStrideFlag) +{ + if(blockDim.x*blockIdx.x + threadIdx.x < xSize && blockDim.y*blockIdx.y + threadIdx.y < ySize && blockDim.z*blockIdx.z + threadIdx.z < zSize) + { + const int64_t x = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t y = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t z = blockDim.z*blockIdx.z + threadIdx.z; + + const double uX = cellVolume * velocity[xStrideVelocity*x + yStrideVelocity*y + zStrideVelocity*z + 0*fStrideVelocity]; - probeData[ctr] = ((float)(rho)); - probeData[ctr + n] = ((float)(velocityScalingFactor)) * ((float)(uX)); - probeData[ctr + 2*n] = ((float)(velocityScalingFactor)) * ((float)(uY)); - probeData[ctr + 3*n] = ((float)(velocityScalingFactor)) * ((float)(uZ)); + if( ( flagField[xStrideFlag*x + yStrideFlag*y + zStrideFlag*z] & domainFlag) == domainFlag ){ + atomicAdd(&result[n], uX); + } } } @@ -273,22 +261,75 @@ void updateVelocityPlane(gpu::GPUField<real_t>* velocityGPU, real_t* kappa, real } -void updateProbe(gpu::GPUField<real_t>* pdfs, int64_t* cellData, float* probeData, const int64_t n, const real_t velocityScalingFactor, +void updateProbe(gpu::GPUField<real_t>* density, gpu::GPUField<real_t>* velocity, int64_t* cellData, float* probeData, const int64_t n, const float velocityScalingFactor, const Vector3<uint32_t>& block, const Vector3<uint32_t>& grid){ dim3 block_(block[0], block[1], block[2]); dim3 grid_(grid[0], grid[1], grid[2]); - double * data_pdfs = pdfs->dataAt(0, 0, 0, 0); - const uint8_t timestep = pdfs->getTimestep(); + double * data_density = density->dataAt(0, 0, 0, 0); + double * data_velocity = velocity->dataAt(0, 0, 0, 0); - const int64_t xStride = int64_c(pdfs->xStride()); - const int64_t yStride = int64_c(pdfs->yStride()); - const int64_t zStride = int64_c(pdfs->zStride()); - const int64_t fStride = int64_c(pdfs->fStride()); - internal::evaluateProbePoints<<<grid_, block_, 0, nullptr>>>(data_pdfs, cellData, probeData, n, timestep, velocityScalingFactor, - xStride, yStride, zStride, fStride); + const int64_t xStrideDensity = int64_c(density->xStride()); + const int64_t yStrideDensity = int64_c(density->yStride()); + const int64_t zStrideDensity = int64_c(density->zStride()); + + const int64_t xStrideVelocity = int64_c(velocity->xStride()); + const int64_t yStrideVelocity = int64_c(velocity->yStride()); + const int64_t zStrideVelocity = int64_c(velocity->zStride()); + const int64_t fStrideVelocity = int64_c(velocity->fStride()); + internal::evaluateProbePoints<<<grid_, block_, 0, nullptr>>>(data_density, data_velocity, cellData, probeData, n, velocityScalingFactor, + xStrideDensity, yStrideDensity, zStrideDensity, xStrideVelocity, yStrideVelocity, zStrideVelocity, fStrideVelocity); + +} + +void BulkVelocityCalculator::calculateBulkVelocity() { + dim3 b(block_[0], block_[1], block_[2]); + dim3 g(grid_[0], grid_[1], grid_[2]); + + int64_t counter = 0; + for( auto it = blocks_->begin(); it != blocks_->end(); ++it ){ + auto* block = dynamic_cast< Block* >(it.get()); + const uint_t level = block->getLevel(); + const double scaleFactor = double_c(1.0) / double_c( 1 << level ); + const double cellVolume = scaleFactor * scaleFactor * scaleFactor; + GPUField_T* velocityField = block->template getData<GPUField_T>(ids_.velocityFieldGPU); + GPUFlagField_T* flagField = block->template getData<GPUFlagField_T>(ids_.flagFieldGPU); + + double* const data_velocity = velocityField->dataAt(0, 0, 0, 0); + uint8_t* const data_flagField = flagField->dataAt(0, 0, 0, 0); + + const int64_t xSize = int64_c(velocityField->xSize()); + const int64_t ySize = int64_c(velocityField->ySize()); + const int64_t zSize = int64_c(velocityField->zSize()); + + const int64_t xStrideVelocity = int64_c(velocityField->xStride()); + const int64_t yStrideVelocity = int64_c(velocityField->yStride()); + const int64_t zStrideVelocity = int64_c(velocityField->zStride()); + const int64_t fStrideVelocity = int64_c(velocityField->fStride()); + + const int64_t xStrideFlag = int64_c(flagField->xStride()); + const int64_t yStrideFlag = int64_c(flagField->yStride()); + const int64_t zStrideFlag = int64_c(flagField->zStride()); + + internal::sumVelocity<<<g, b, 0, nullptr>>>(resultGPU_, data_velocity, data_flagField, counter, cellVolume, setup_.fluidFlag, + xSize, ySize, zSize, + xStrideVelocity, yStrideVelocity, zStrideVelocity, fStrideVelocity, + xStrideFlag, yStrideFlag, zStrideFlag); + + + } + WALBERLA_GPU_CHECK(gpuMemcpy(&resultCPU_[0], resultGPU_, sizeof(double) * resultCPU_.size(), gpuMemcpyDeviceToHost )); + + double bulkVelocity = double_c(0.0); + for (auto& bulkVelocityPerBlock : resultCPU_){ + bulkVelocity += bulkVelocityPerBlock; + } + + mpi::allReduceInplace< double >(bulkVelocity, mpi::SUM); + bulkVelocity_ = double_c(bulkVelocity) / volume_; + WALBERLA_GPU_CHECK(gpuMemset(resultGPU_, 0.0, sizeof(double) * resultCPU_.size())); } diff --git a/apps/showcases/PorousMediaGPU/utilGPU.h b/apps/showcases/PorousMediaGPU/utilGPU.h index 6b0ac4cf9ce1c3cca0f3303b842bcb50888b0d90..7fff9bffc84046754d3e7cfcd8c52f1fdb2ddc5c 100644 --- a/apps/showcases/PorousMediaGPU/utilGPU.h +++ b/apps/showcases/PorousMediaGPU/utilGPU.h @@ -19,6 +19,8 @@ //====================================================================================================================== #pragma once +#include "blockforest/Block.h" + #include "core/DataTypes.h" #include "core/math/Constants.h" @@ -27,14 +29,87 @@ #include "gpu/FieldCopy.h" #include "gpu/GPUField.h" +#include "Types.h" +#include "Setup.h" + namespace walberla{ + void getBlockAndGridSize(Vector3<uint_t> s, Vector3<uint32_t>& block, Vector3<uint32_t>& grid); void updateVelocityPlane(gpu::GPUField<real_t>* velocityGPU, real_t* kappa, real_t* ut, const Vector3<real_t>& globalMinCorner, const Vector3<real_t>& blockMinCorner, const Vector3<real_t>& uc, const real_t dx, const real_t t, const uint_t n, const real_t latticeVelocity, const int64_t nGhostLayers, const Vector3<uint32_t>& block, const Vector3<uint32_t>& grid); -void updateProbe(gpu::GPUField<real_t>* pdfs, int64_t* cellData, float* probeData, const int64_t n, const real_t velocityScalingFactor, +void updateProbe(gpu::GPUField<real_t>* density, gpu::GPUField<real_t>* velocity, int64_t* cellData, float* probeData, const int64_t n, const float velocityScalingFactor, const Vector3<uint32_t>& block, const Vector3<uint32_t>& grid); + + +class BulkVelocityCalculator { +public: + BulkVelocityCalculator(const std::shared_ptr<StructuredBlockForest> & blocks, const IDs& ids, const Setup& setup) + : blocks_(blocks), setup_(setup), ids_(ids) + { +#if __CUDA_ARCH__ < 600 + WALBERLA_LOG_WARNING_ON_ROOT("Warning atomic add in double precision not supported. Falling back to software implementation") +#endif + + volume_ = 0.0; + uint_t nBlocks = 0; + for( auto it = blocks_->begin(); it != blocks_->end(); ++it ){ + auto* block = dynamic_cast< Block* >(it.get()); + const uint_t level = block->getLevel(); + + const double scaleFactor = double_c(1.0) / double_c( 1 << level ); + const double cellVolume = scaleFactor * scaleFactor * scaleFactor; + auto * flagField = block->template getData<FlagField_T>(ids.flagField); + auto domainFlag = flagField->getFlag(setup.fluidUID); + const auto ci = flagField->xyzSize(); + for(auto cellIt = ci.begin(); cellIt != ci.end(); ++cellIt) { + if(isFlagSet(flagField->get(*cellIt), domainFlag)){ + volume_ += cellVolume; + } + } + nBlocks++; + } + + const Vector3<uint_t> s(blocks_->getNumberOfXCellsPerBlock(), blocks_->getNumberOfYCellsPerBlock(), blocks_->getNumberOfZCellsPerBlock()); + getBlockAndGridSize(s, block_, grid_); + mpi::allReduceInplace< double >(volume_, mpi::SUM); + + + resultCPU_.resize(nBlocks); + + WALBERLA_GPU_CHECK(gpuMalloc((void **)&resultGPU_, sizeof(double) * resultCPU_.size() )); + WALBERLA_GPU_CHECK(gpuMemset(resultGPU_, 0.0, sizeof(double) * resultCPU_.size())); + + } + + double bulkVelocity() const { return bulkVelocity_; } + void setBulkVelocity(const double bulkVelocity) { bulkVelocity_ = bulkVelocity; } + void calculateBulkVelocity(); + + real_t getDrivingForce() { + calculateBulkVelocity(); + + const real_t latticeReferenceLength = setup_.referenceLength / setup_.dxC; + return calculateDrivingForce(setup_.frictionVelocity, latticeReferenceLength, setup_.latticeVelocity, bulkVelocity_); + } + +private: + const std::shared_ptr< StructuredBlockForest > blocks_; + const Setup setup_; + const IDs ids_; + + double volume_; + double bulkVelocity_{0.0}; + + std::vector<double> resultCPU_; + double* resultGPU_; + + Vector3<uint32_t> block_; + Vector3<uint32_t> grid_; +}; + + } // namespace walberla \ No newline at end of file diff --git a/python/pystencils_walberla/jinja_filters.py b/python/pystencils_walberla/jinja_filters.py index b1c61b589aaa585406b554e19861bf2f6c8e193d..d61adf2a5c156c7e610c7f4f5a53e6932bb404be 100644 --- a/python/pystencils_walberla/jinja_filters.py +++ b/python/pystencils_walberla/jinja_filters.py @@ -43,7 +43,7 @@ delete_loop = """ """ standard_parameter_registration = """ -for (uint_t level = 0; level < blocks->getNumberOfLevels(); level++) +for (uint_t level = 0; level < blocks_->getNumberOfLevels(); level++) {{ const {dtype} level_scale_factor = {dtype}(uint_t(1) << level); {function_rules} @@ -760,11 +760,19 @@ def generate_setter(kernel_infos, parameter_registration=None): for kernel_info in kernel_infos: for param in kernel_info.parameters: if not param.is_field_parameter and param.symbol.name not in params_to_skip: - if param.symbol.name in scaling_identifiers: - continue dtype = param.symbol.dtype name = param.symbol.name - code_line = f"inline void set{name.capitalize()}(const {dtype} value) {{ {name}_ = value; }}" + + if param.symbol.name in scaling_identifiers: + idx = scaling_identifiers.index(param.symbol.name) + lambda_expr = parameter_registration.scaling_info[idx][2] + raw_scaling = convert_to_raw_form(name, lambda_expr, variable_name="value", setter=True) + function_rule = raw_scaling.format(dtype=dtype) + code = standard_parameter_registration.format(dtype=dtype, function_rules=function_rule) + code_line = f"inline void set{name.capitalize()}(const {dtype} value) {{ {code}}}" + else: + code_line = f"inline void set{name.capitalize()}(const {dtype} value) {{ {name}_ = value; }}" + result.append(code_line) params_to_skip.append(param.symbol.name) @@ -803,7 +811,9 @@ def generate_parameter_registration(kernel_infos, parameter_registration): return "\n".join(result) -def convert_to_raw_form(name, expression): +def convert_to_raw_form(name, expression, variable_name=None, setter=False): + if variable_name is None: + variable_name = name expr_str = str(expression) # Extract the body of the lambda expression @@ -812,8 +822,12 @@ def convert_to_raw_form(name, expression): # Replace numeric constants with the {dtype}(number) format body = re.sub(r'(?<!\w)(\d+(\.\d*)?|\.\d+)(?!\w)', lambda match: f'{{dtype}}({match.group(1)})', body) + body = body.replace(name, variable_name) - return f"{name}Vector.push_back({{dtype}}({body}));" + if setter: + return f"{name}Vector[level] = {{dtype}}({body});" + else: + return f"{name}Vector.push_back({{dtype}}({body}));" def generate_constructor(kernel_infos, parameter_registration): diff --git a/src/io/hdfWrapper.h b/src/io/hdfWrapper.h index bd090771383ea156dbb2c8612bd7affd8b9d4175..46dcafd56520106df540fdf0ebae67e5c8b22746 100644 --- a/src/io/hdfWrapper.h +++ b/src/io/hdfWrapper.h @@ -36,6 +36,28 @@ H5::FloatType h5Double(); H5::FloatType h5Float(); H5::FloatType h5FileFloat(); +template< typename T > +H5::DataType getH5ExportType(); + +template<> +inline H5::DataType getH5ExportType< uint8_t >() +{ + return H5::FloatType(H5::PredType::STD_U8LE); +} + +template<> +inline H5::DataType getH5ExportType< float >() +{ + return H5::FloatType(H5::PredType::IEEE_F32LE); +} + +template<> +inline H5::DataType getH5ExportType< double >() +{ + return H5::FloatType(H5::PredType::IEEE_F64LE); +} + + } \ No newline at end of file