Compare revisions

Markus Holzer · Markus Holzer · d26560e2 · d26560e2 · d26560e2 · d26560e2
--- a/apps/benchmarks/CMakeLists.txt
+++ b/apps/benchmarks/CMakeLists.txt
@@ -32,5 +32,9 @@ if ( WALBERLA_BUILD_WITH_PYTHON )
 endif()
+# The FlagFieldGPU benchmark is only added when both code generation and CUDA are enabled.
+if ( WALBERLA_BUILD_WITH_CODEGEN AND WALBERLA_BUILD_WITH_CUDA )
+   add_subdirectory( FlagFieldGPU )
+endif()
--- a/apps/benchmarks/FlagFieldGPU/CMakeLists.txt
+++ b/apps/benchmarks/FlagFieldGPU/CMakeLists.txt
+# Make the parameter files of this benchmark available in the build directory.
+waLBerla_link_files_to_builddir( "*.prm" )
+if (WALBERLA_BUILD_WITH_CUDA)
+    # Generate the sweeps, boundaries, pack info and info header from the Python script.
+    waLBerla_generate_target_from_python(NAME FlagFieldGPUGenerated
+            FILE FlagFieldGPUCodeGen.py
+            OUT_FILES FlagFieldGPU_LbSweep.cu FlagFieldGPU_LbSweep.h
+            FlagFieldGPU_MacroSetter.cu FlagFieldGPU_MacroSetter.h
+            FlagFieldGPU_UBB.cu FlagFieldGPU_UBB.h
+            FlagFieldGPU_NoSlip.cu FlagFieldGPU_NoSlip.h
+            FlagFieldGPU_Outflow.cu FlagFieldGPU_Outflow.h
+            FlagFieldGPU_PackInfo.cu FlagFieldGPU_PackInfo.h
+            FlagFieldGPU_InfoHeader.h)
+    # FlagFieldGPU.cpp includes headers of the cuda module and calls into the cuda:: namespace,
+    # so the module is listed explicitly instead of relying on a transitive dependency.
+    waLBerla_add_executable( NAME FlagFieldGPU FILE FlagFieldGPU.cpp
+            DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk FlagFieldGPUGenerated)
+endif()
\ No newline at end of file
--- a/apps/benchmarks/FlagFieldGPU/FlagFieldGPU.cpp
+++ b/apps/benchmarks/FlagFieldGPU/FlagFieldGPU.cpp
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file FlagFieldGPU.cpp
+//! \author Markus Holzer <markus.holzer@fau.de>
+//
+//======================================================================================================================
+#include "blockforest/all.h"
+#include "core/all.h"
+#include "domain_decomposition/all.h"
+#include "field/all.h"
+#include "geometry/all.h"
+#include "timeloop/all.h"
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+#   include "cuda/AddGPUFieldToStorage.h"
+#   include "cuda/DeviceSelectMPI.h"
+#   include "cuda/ErrorChecking.h"
+#   include "cuda/HostFieldAllocator.h"
+#   include "cuda/NVTX.h"
+#   include "cuda/ParallelStreams.h"
+#   include "cuda/communication/GPUPackInfo.h"
+#   include "cuda/communication/UniformGPUScheme.h"
+#endif
+// CodeGen includes
+#include "FlagFieldGPU_InfoHeader.h"
+namespace walberla
+{
+// Type aliases for the generated pack info, the flag field and the GPU fields used below.
+using PackInfo_T   = lbm::FlagFieldGPU_PackInfo;
+using flag_t       = walberla::uint8_t;
+using FlagField_T  = FlagField< flag_t >;
+using GPUField     = cuda::GPUField< real_t >;
+using GPUField_int = cuda::GPUField< uint8_t >;
+// Block data creation callback: allocates the host PDF field of a block with the block's cell
+// counts, one ghost layer, fzyx layout and a 64-byte aligned allocator.
+auto pdfFieldAdder = [](IBlock* const block, StructuredBlockStorage* const storage) {
+   const auto xCells = storage->getNumberOfXCells(*block);
+   const auto yCells = storage->getNumberOfYCells(*block);
+   const auto zCells = storage->getNumberOfZCells(*block);
+   const auto ghostLayers = uint_t(1);
+   auto alignedAllocator  = make_shared< field::AllocateAligned< real_t, 64 > >();
+   return new PdfField_T(xCells, yCells, zCells, ghostLayers, field::fzyx, alignedAllocator);
+};
+/**
+ * Entry point of the FlagFieldGPU benchmark inside the walberla namespace.
+ *
+ * Sets up the block grid and the host/GPU fields from the configuration, initialises the PDFs,
+ * wires communication, boundary sweeps and the LB sweep into a time loop, runs it and logs the
+ * measured performance on the root process.
+ *
+ * \param argc  number of command line arguments, forwarded to walberla::Environment
+ * \param argv  command line arguments, forwarded to walberla::Environment
+ * \return EXIT_SUCCESS after the time loop has finished
+ */
+int main(int argc, char** argv)
+{
+   walberla::Environment walberlaEnv(argc, argv);
+   cuda::selectDeviceBasedOnMpiRank();
+   auto config = walberlaEnv.config();
+   auto blocks = blockforest::createUniformBlockGridFromConfig(config);
+   // read parameters
+   auto parameters              = config->getOneBlock("Parameters");
+   const uint_t timesteps       = parameters.getParameter< uint_t >("timesteps", uint_c(10));
+   const real_t omega           = parameters.getParameter< real_t >("omega", real_t(1.9));
+   // u_max, reynolds_number and diameter_sphere are only used for the log output below
+   const real_t u_max           = parameters.getParameter< real_t >("u_max", real_t(0.05));
+   const real_t reynolds_number = parameters.getParameter< real_t >("reynolds_number", real_t(1000));
+   const uint_t diameter_sphere = parameters.getParameter< uint_t >("diameter_sphere", uint_t(5));
+   const real_t remainingTimeLoggerFrequency =
+      parameters.getParameter< real_t >("remainingTimeLoggerFrequency", real_t(3.0)); // in seconds
+   // create fields: host fields first, then a GPU field for each of them
+   BlockDataID pdfFieldID     = blocks->addStructuredBlockData< PdfField_T >(pdfFieldAdder, "PDFs");
+   BlockDataID velFieldID     = field::addToStorage< VelocityField_T >(blocks, "velocity", real_t(0), field::fzyx);
+   BlockDataID densityFieldID = field::addToStorage< ScalarField_T >(blocks, "density", real_t(0), field::fzyx);
+   BlockDataID pdfFieldIDGPU = cuda::addGPUFieldToStorage< PdfField_T >(blocks, pdfFieldID, "PDFs on GPU", true);
+   BlockDataID velFieldIDGPU =
+      cuda::addGPUFieldToStorage< VelocityField_T >(blocks, velFieldID, "velocity on GPU", true);
+   BlockDataID densityFieldIDGPU =
+      cuda::addGPUFieldToStorage< ScalarField_T >(blocks, densityFieldID, "density on GPU", true);
+   BlockDataID flagFieldId     = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field");
+   BlockDataID flagFieldId_gpu = cuda::addGPUFieldToStorage< FlagField_T >(blocks, flagFieldId, "flag on GPU", true);
+   // initialise all PDFs on the GPU and mirror them into the host PDF field
+   pystencils::FlagFieldGPU_MacroSetter setterSweep(pdfFieldIDGPU, velFieldIDGPU);
+   for (auto& block : *blocks)
+      setterSweep(&block);
+   cuda::fieldCpy< PdfField_T, GPUField >(blocks, pdfFieldID, pdfFieldIDGPU);
+   // Create communication
+   cuda::communication::UniformGPUScheme< Stencil_T > communication(blocks, false);
+   communication.addPackInfo(make_shared< PackInfo_T >(pdfFieldIDGPU));
+   auto comm = std::function< void() >([&]() { communication.communicate(nullptr); });
+   // create and initialize boundary handling
+   const FlagUID fluidFlagUID("Fluid");
+   auto boundariesConfig = config->getOneBlock("Boundaries");
+   lbm::FlagFieldGPU_UBB ubb(blocks, pdfFieldIDGPU);
+   lbm::FlagFieldGPU_NoSlip noSlip(blocks, pdfFieldIDGPU);
+   lbm::FlagFieldGPU_Outflow outflow(blocks, pdfFieldIDGPU, pdfFieldID);
+   geometry::initBoundaryHandling< FlagField_T >(*blocks, flagFieldId, boundariesConfig);
+   geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldId, fluidFlagUID);
+   ubb.fillFromFlagField< FlagField_T >(blocks, flagFieldId, FlagUID("UBB"), fluidFlagUID);
+   noSlip.fillFromFlagField< FlagField_T >(blocks, flagFieldId, FlagUID("NoSlip"), fluidFlagUID);
+   outflow.fillFromFlagField< FlagField_T >(blocks, flagFieldId, FlagUID("Outflow"), fluidFlagUID);
+   // the flag field was filled on the host; transfer it to the GPU where the LB sweep reads it
+   cuda::fieldCpy< GPUField_int, FlagField_T >(blocks, flagFieldId_gpu, flagFieldId);
+   pystencils::FlagFieldGPU_LbSweep lbSweep(densityFieldIDGPU, flagFieldId_gpu, pdfFieldIDGPU, velFieldIDGPU, omega);
+   // create time loop
+   SweepTimeloop timeloop(blocks->getBlockStorage(), timesteps);
+   // add LBM sweep and communication to time loop
+   timeloop.add() << BeforeFunction(comm, "communication") << Sweep(outflow, "outflow boundary");
+   timeloop.add() << Sweep(ubb, "ubb boundary");
+   timeloop.add() << Sweep(noSlip, "noSlip boundary");
+   timeloop.add() << Sweep(lbSweep, "LB update rule");
+   // LBM stability check
+   // NOTE(review): the checker is given the host PDF field (pdfFieldID). Within this function that
+   // field is only written by the fieldCpy right after initialisation, so the check may not reflect
+   // the current GPU state - confirm whether a device-to-host copy is intended here.
+   timeloop.addFuncAfterTimeStep(makeSharedFunctor(field::makeStabilityChecker< PdfField_T, FlagField_T >(
+                                    config, blocks, pdfFieldID, flagFieldId, fluidFlagUID)),
+                                 "LBM stability check");
+   // log remaining time
+   timeloop.addFuncAfterTimeStep(timing::RemainingTimeLogger(timeloop.getNrOfTimeSteps(), remainingTimeLoggerFrequency),
+                                 "remaining time logger");
+   // add VTK output to time loop
+   const uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0);
+   if (vtkWriteFrequency > 0)
+   {
+      auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
+                                                      "simulation_step", false, true, true, false, 0);
+      // the writers below operate on the host fields, so refresh them from the GPU before each write
+      vtkOutput->addBeforeFunction([&]() {
+         cuda::fieldCpy< VelocityField_T, GPUField >(blocks, velFieldID, velFieldIDGPU);
+         cuda::fieldCpy< ScalarField_T, GPUField >(blocks, densityFieldID, densityFieldIDGPU);
+      });
+      auto velWriter     = make_shared< field::VTKWriter< VelocityField_T > >(velFieldID, "velocity");
+      auto densityWriter = make_shared< field::VTKWriter< ScalarField_T > >(densityFieldID, "density");
+      auto flagWriter    = make_shared< field::VTKWriter< FlagField_T > >(flagFieldId, "flagField");
+      vtkOutput->addCellDataWriter(velWriter);
+      vtkOutput->addCellDataWriter(densityWriter);
+      vtkOutput->addCellDataWriter(flagWriter);
+      timeloop.addFuncAfterTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
+   }
+   WcTimer simTimer;
+   WALBERLA_LOG_INFO_ON_ROOT("Simulating flow around sphere:"
+                             "\n timesteps:               "
+                             << timesteps << "\n reynolds number:         " << reynolds_number
+                             << "\n relaxation rate:         " << omega << "\n maximum inflow velocity: " << u_max
+                             << "\n diameter_sphere:         " << diameter_sphere)
+   // GPU work is launched asynchronously with respect to the host. Wait for the device before
+   // starting and before stopping the wall-clock timer so that the measured interval covers
+   // exactly the work of the time loop.
+   WALBERLA_CUDA_CHECK(cudaDeviceSynchronize());
+   simTimer.start();
+   timeloop.run();
+   WALBERLA_CUDA_CHECK(cudaDeviceSynchronize());
+   simTimer.end();
+   WALBERLA_LOG_INFO_ON_ROOT("Simulation finished")
+   auto time = simTimer.last();
+   // count fluid and non-fluid cells on the blocks of this process
+   uint64_t nrOfFluidCells = 0;
+   uint64_t nrOfBoundaryCells = 0;
+   for (auto& block : *blocks)
+   {
+      auto* flagField = block.getData< FlagField_T >(flagFieldId);
+      auto domainFlag = flagField->getFlag(fluidFlagUID);
+      for (auto it = flagField->begin(); it != flagField->end(); ++it)
+      {
+         // every cell is either a fluid cell or counted as a boundary cell
+         if (isFlagSet(it, domainFlag)) { ++nrOfFluidCells; }
+         else { ++nrOfBoundaryCells; }
+      }
+   }
+   auto mlupsPerProcess = real_c(nrOfFluidCells) * real_c(timesteps) / time * 1e-6;
+   // TODO: when going to multiple GPUs the performance should be measured on each GPU. At the moment only performance
+   // on root is considered.
+   WALBERLA_LOG_RESULT_ON_ROOT("Fluid Cells on the block " << nrOfFluidCells)
+   WALBERLA_LOG_RESULT_ON_ROOT("Boundary Cells on the block " << nrOfBoundaryCells)
+   WALBERLA_LOG_RESULT_ON_ROOT("MLUPS per process " << mlupsPerProcess)
+   WALBERLA_LOG_RESULT_ON_ROOT("Time per time step " << time / real_c(timesteps))
+   return EXIT_SUCCESS;
+}
+} // namespace walberla
+// Program entry point: forwards to walberla::main and propagates its return value as exit code.
+int main(int argc, char** argv) { return walberla::main(argc, argv); }
--- a/apps/benchmarks/FlagFieldGPU/FlagFieldGPUCodeGen.py
+++ b/apps/benchmarks/FlagFieldGPU/FlagFieldGPUCodeGen.py
+from pystencils.field import fields
+from pystencils.astnodes import Block, Conditional
+from lbmpy.macroscopic_value_kernels import macroscopic_values_setter
+from lbmpy.stencils import get_stencil
+from lbmpy.creationfunctions import create_lb_update_rule, create_lb_method
+from lbmpy.boundaries import NoSlip, UBB, ExtrapolationOutflow
+from pystencils_walberla import CodeGeneration, generate_sweep, generate_info_header
+from lbmpy_walberla.additional_data_handler import UBBAdditionalDataHandler, OutflowAdditionalDataHandler
+from lbmpy_walberla import generate_boundary, generate_lb_pack_info
+import sympy as sp
+with CodeGeneration() as ctx:
+    dtype = 'float64' if ctx.double_accuracy else 'float32'
+    stencil = get_stencil("D3Q19")
+    q = len(stencil)
+    dim = len(stencil[0])
+    pdfs, pdfs_tmp = fields(f"pdfs({q}), pdfs_tmp({q}) : {dtype}[{dim}D]", layout='fzyx')
+    velocity_field, density_field = fields(f"velocity({dim}), density(1) : {dtype}[{dim}D]", layout='fzyx')
+    flag = fields(f"flag_field: uint8[{dim}D]", layout='fzyx')
+    omega = sp.Symbol("omega")
+    u_max = sp.Symbol("u_max")
+    output = {
+        'density': density_field,
+        'velocity': velocity_field
+    }
+    method = create_lb_method(stencil=stencil, method='srt', relaxation_rate=omega, compressible=True)
+    update_rule = create_lb_update_rule(lb_method=method,
+                                        output=output,
+                                        optimization={"symbolic_field": pdfs,
+                                                      "symbolic_temporary_field": pdfs_tmp,
+                                                      "double_precision": True if ctx.double_accuracy else False},
+                                        kernel_type='stream_pull_collide')
+    update_rule = [Conditional(sp.Eq(flag.center(), 8), Block(update_rule))]
+    # getter & setter
+    setter_assignments = macroscopic_values_setter(method, velocity=velocity_field.center_vector,
+                                                   pdfs=pdfs, density=1.0)
+    stencil_typedefs = {'Stencil_T': stencil}
+    field_typedefs = {'PdfField_T': pdfs,
+                      'VelocityField_T': velocity_field,
+                      'ScalarField_T': density_field}
+    target = 'gpu'
+    # sweeps
+    generate_sweep(ctx, 'FlagFieldGPU_LbSweep', update_rule,
+                   field_swaps=[(pdfs, pdfs_tmp)], target=target)
+    generate_sweep(ctx, 'FlagFieldGPU_MacroSetter', setter_assignments, target=target)
+    # boundaries
+    ubb = UBB((0.05, 0, 0))
+    outflow = ExtrapolationOutflow(stencil[4], method)
+    outflow_data_handler = OutflowAdditionalDataHandler(stencil, outflow, target=target)
+    generate_boundary(ctx, 'FlagFieldGPU_UBB', ubb, method, target=target)
+    generate_boundary(ctx, 'FlagFieldGPU_Outflow', outflow, method,
+                      target=target, additional_data_handler=outflow_data_handler)
+    generate_boundary(ctx, 'FlagFieldGPU_NoSlip', NoSlip(), method, target=target, streaming_pattern='pull')
+    # communication
+    generate_lb_pack_info(ctx, 'FlagFieldGPU_PackInfo', stencil, pdfs, target=target)
+    # Info header containing correct template definitions for stencil and field
+    generate_info_header(ctx, 'FlagFieldGPU_InfoHeader',
+                         stencil_typedefs=stencil_typedefs, field_typedefs=field_typedefs)
--- a/apps/benchmarks/FlagFieldGPU/FlagFieldGPUParameters.prm
+++ b/apps/benchmarks/FlagFieldGPU/FlagFieldGPUParameters.prm
+Parameters 
+{
+	omega           1.8;
+	timesteps       1001;
+	u_max           0.05;
+	vtkWriteFrequency 250;
+	reynolds_number 100;
+	diameter_sphere 32;
+}
+DomainSetup
+{
+   blocks        <  1,    1, 1 >;
+   cellsPerBlock <  128, 64, 64 >;
+   periodic      <  0,    0, 0 >;
+}
+Boundaries 
+{
+	Border { direction W;    walldistance -1;  flag UBB; }
+	Border { direction E;    walldistance -1;  flag Outflow; }
+    Border { direction S;    walldistance -1;  flag NoSlip; }
+    Border { direction N;    walldistance -1;  flag NoSlip; }
+    Border { direction T;    walldistance -1;  flag NoSlip; }
+    Border { direction B;    walldistance -1;  flag NoSlip; }
+    Body
+    {
+       shape Sphere;
+       midpoint <64, 32, 32>;
+       radius 16;
+       flag NoSlip;
+    }
+}
No results found