diff --git a/apps/showcases/PorousMediaGPU/CMakeLists.txt b/apps/showcases/PorousMediaGPU/CMakeLists.txt index 5de0f2ccfad8fddca102f5bc9b08416a2751cf81..c0622b0c32e0e062267ebe977c9e2aec6aecf2bd 100644 --- a/apps/showcases/PorousMediaGPU/CMakeLists.txt +++ b/apps/showcases/PorousMediaGPU/CMakeLists.txt @@ -18,11 +18,11 @@ waLBerla_generate_target_from_python(NAME PorousMediaGPUCodeGen if (WALBERLA_BUILD_WITH_CUDA OR WALBERLA_BUILD_WITH_HIP) waLBerla_add_executable ( NAME PorousMediaGPU - FILES PorousMediaGPU.cpp GridGeneration.h Setup.h Types.h InitializerFunctions.cpp TPMS.cpp Inflow.cpp util.cpp utilGPU.cu + FILES PorousMediaGPU.cpp GridGeneration.h Setup.h Types.h InitializerFunctions.cpp TPMS.cpp Inflow.cpp Probe.cpp util.cpp utilGPU.cu DEPENDS blockforest boundary core field gpu lbm_generated postprocessing stencil timeloop vtk PorousMediaGPUCodeGen ) else() waLBerla_add_executable ( NAME PorousMediaGPU - FILES PorousMediaGPU.cpp GridGeneration.h Setup.h Types.h InitializerFunctions.cpp TPMS.cpp Inflow.cpp util.cpp + FILES PorousMediaGPU.cpp GridGeneration.h Setup.h Types.h InitializerFunctions.cpp TPMS.cpp Inflow.cpp Probe.cpp util.cpp DEPENDS blockforest boundary core field lbm_generated postprocessing stencil timeloop vtk PorousMediaGPUCodeGen ) endif(WALBERLA_BUILD_WITH_CUDA OR WALBERLA_BUILD_WITH_HIP) diff --git a/apps/showcases/PorousMediaGPU/Inflow.cpp b/apps/showcases/PorousMediaGPU/Inflow.cpp index f3b009b41155696fef3201f21e1984dc6033a0a3..03ee9423c5d7d6fa80d95e18b06950c7ea9c8d43 100644 --- a/apps/showcases/PorousMediaGPU/Inflow.cpp +++ b/apps/showcases/PorousMediaGPU/Inflow.cpp @@ -13,7 +13,7 @@ // You should have received a copy of the GNU General Public License along // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // -//! \file InitializerFunctions.h +//! \file Inflow.cpp //! \author Markus Holzer <markus.holzer@fau.de> // //====================================================================================================================== @@ -101,24 +101,23 @@ void Inflow::init(){ epsilonVKP_ = std::pow(urms_, real_c(3.0)) / le_; kappaKolmogorov_ = std::pow(epsilonVKP_ / std::pow(nu_, real_c(3.0)), real_c(1.0)/real_c(4.0)); - firstMode_ = std::log10(kappaVKP_ / real_c(1000.0)); - lastMode_ = std::log10(kappaKolmogorov_ * real_c(1000.0)); + firstMode_ = std::log10(kappaVKP_ / real_c(10.0)); + lastMode_ = std::log10(kappaKolmogorov_ * real_c(10.0)); const real_t linDeltaKappa = (lastMode_ - firstMode_) / (real_c(n_) - real_c(1.0)); - kappa_.resize(n_); - ut_.resize(n_); real_t utSquaredSum = 0.0; for(uint_t i = 0; i < n_; ++i){ const real_t kappaN = std::pow(real_c(10.0), firstMode_ + (real_c(i) * linDeltaKappa)); const real_t kappaN1 = std::pow(real_c(10.0), firstMode_ + (real_c(i + 1) * linDeltaKappa)); const real_t deltaKappa = kappaN1 - kappaN; - - kappa_[i] = kappaN; const real_t tmp = std::sqrt(getKarmanPaoEnergy(kappaN, alpha_, kappaVKP_, kappaKolmogorov_, urms_) * deltaKappa); - ut_[i] = tmp; + + kappa_.emplace_back(kappaN); + ut_.emplace_back(tmp); utSquaredSum += tmp*tmp; } + n_ = uint_c(kappa_.size()); const real_t correctionTKE = std::sqrt(((real_c(3.0) / real_c(2.0)) * (urms_*urms_)) / utSquaredSum); for(uint_t i = 0; i < n_; ++i){ @@ -130,9 +129,6 @@ void Inflow::init(){ const Vector3<uint_t> s(n_, setup_.cellsPerBlock[1], setup_.cellsPerBlock[2]); getBlockAndGridSize(s, block_, grid_); - WALBERLA_LOG_DEVEL_VAR(block_) - WALBERLA_LOG_DEVEL_VAR(grid_) - WALBERLA_GPU_CHECK(gpuMalloc((void **)&kappaGPU_, sizeof(real_t) * kappa_.size() )); WALBERLA_GPU_CHECK(gpuMemcpy( kappaGPU_, &kappa_[0], sizeof(real_t) * kappa_.size(), gpuMemcpyHostToDevice )); diff --git a/apps/showcases/PorousMediaGPU/Inflow.h b/apps/showcases/PorousMediaGPU/Inflow.h index 6950214eb494145672f1315a4c1f43f312ff784e..7b5aa1efd1f24726fa70eacf9abd3e22877a5a38 100644 --- a/apps/showcases/PorousMediaGPU/Inflow.h +++ b/apps/showcases/PorousMediaGPU/Inflow.h @@ -26,8 +26,6 @@ #include "gpu/ErrorChecking.h" #include "gpu/FieldCopy.h" -// #include <curand_kernel.h> - #include "Setup.h" #include "util.h" #include "utilGPU.h" diff --git a/apps/showcases/PorousMediaGPU/PorousMediaGPU.cpp b/apps/showcases/PorousMediaGPU/PorousMediaGPU.cpp index 63354ab574da6de1da6ce6a72370bc803e9a299b..8245b0eb69776e0e66188a3e48217e950239f7b6 100644 --- a/apps/showcases/PorousMediaGPU/PorousMediaGPU.cpp +++ b/apps/showcases/PorousMediaGPU/PorousMediaGPU.cpp @@ -83,6 +83,7 @@ #include "InitializerFunctions.h" #include "GridGeneration.h" #include "Setup.h" +#include "Probe.h" #include "TPMS.h" #include "Types.h" @@ -195,29 +196,18 @@ int main(int argc, char** argv){ auto inflow = std::make_shared<Inflow>(blocks, setup, ids); - const bool cpu = false; - if(cpu){ - WALBERLA_LOG_INFO_ON_ROOT("CPU Version") - inflow->updateCPU(); - } - else{ - WALBERLA_LOG_INFO_ON_ROOT("GPU Version") - inflow->updateGPU(); - gpu::fieldCpy< VelocityField_T, gpu::GPUField< VelocityField_T::value_type > >(blocks, ids.velocityFieldInflowSlice, ids.velocityFieldInflowSliceGPU); - } - + auto probe = std::make_shared<Probe>(blocks, setup, ids); - for (auto& block : *blocks){ - auto velocityFieldInflowSlice = block.getData< VelocityField_T >(ids.velocityFieldInflowSlice); + // Vector3<real_t> test(1.5, 0.5, 0.5); + probe->addPoint(Vector3<real_t>(1.5, 0.5, 0.5 )); + probe->addPoint(Vector3<real_t>(3.51, 0.5, 0.5)); + probe->addPoint(Vector3<real_t>(100.51, 0.5, 0.5)); - WALBERLA_LOG_DEVEL_VAR(velocityFieldInflowSlice->get(0,20,20,0)) - WALBERLA_LOG_DEVEL_VAR(velocityFieldInflowSlice->get(0,20,20,1)) - WALBERLA_LOG_DEVEL_VAR(velocityFieldInflowSlice->get(0,20,20,2)) - } + probe->initialise(); return EXIT_SUCCESS; - + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// LB SWEEPS AND BOUNDARY HANDLING /// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/apps/showcases/PorousMediaGPU/PorousMediaGPU.prm b/apps/showcases/PorousMediaGPU/PorousMediaGPU.prm index f3243cf9faab4a2fb95272cf88f4017f5ba1eb7d..5757f1d68a791c27116774368a630434f8cb8cf4 100644 --- a/apps/showcases/PorousMediaGPU/PorousMediaGPU.prm +++ b/apps/showcases/PorousMediaGPU/PorousMediaGPU.prm @@ -7,7 +7,7 @@ Parameters latticeVelocity 0.05; initialiseWithInletVelocity true; - timesteps 1; + timesteps 10001; processMemoryLimit 512; // MiB innerOuterSplit <1, 1, 1>; @@ -19,10 +19,10 @@ Parameters Turbulence { useGrid false; - U_rms 0.05; + U_rms 0.01; characteristicLength 0.4; - numberOfFourrierModes 100; - U_c < 0.05, 0.0, 0.0 >; + numberOfFourrierModes 128; + U_c < 2.0, 0.0, 0.0 >; } TPMS @@ -30,17 +30,18 @@ TPMS waveNumber 6.2831853072; scaling 10.0; strut 1.0; - start 2.0; - end 5.0; + start 2.5; + end 5.5; type gyroid; } //! [domainSetup] DomainSetup { - domainSize < 7, 1, 1 >; - // cellsPerBlock < 1344, 192, 192 >; - cellsPerBlock < 336, 48, 48 >; + domainSize < 8, 1, 1 >; + cellsPerBlock < 384, 48, 48 >; + // cellsPerBlock < 768, 96, 96 >; + // cellsPerBlock < 1536, 192, 192 >; blocks < 1, 1, 1 >; periodic < false, true, true >; refinementLevels 0; @@ -65,13 +66,13 @@ StabilityChecker VTKWriter { - vtkWriteFrequency 0; + vtkWriteFrequency 10000; vtkGhostLayers 0; velocity true; - writeVelocityAsMagnitude false; + writeVelocityAsMagnitude true; density false; - flag true; - writeOnlySlice false; + flag false; + writeOnlySlice true; sliceThickness 1; writeXZSlice false; amrFileFormat false; diff --git a/apps/showcases/PorousMediaGPU/Probe.cpp b/apps/showcases/PorousMediaGPU/Probe.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6e88e548287f5ed9b74fda4802b5bef4371bddb6 --- /dev/null +++ b/apps/showcases/PorousMediaGPU/Probe.cpp @@ -0,0 +1,68 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file Probe.cpp +//! \author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== +#include "Probe.h" + +namespace walberla{ + +void Probe::initialise(){ + + std::map<IBlock*, std::vector<Vector3<real_t>> > pointsPerBlock; + + const AABB domain = blocks_->getDomain(); + const uint_t timesteps = setup_.timesteps; + + for(auto p: probePoints_){ + if(!domain.contains(p)){continue;} + IBlock* block = blocks_->getBlock(p); + if(block != nullptr){ + const Cell localCell = blocks_->getBlockLocalCell(*block, p); + Vector3<real_t> localCellCenter; + blocks_->getCellCenter(localCellCenter, localCell); + + pointsPerBlock[block].emplace_back(localCellCenter); + } + } + + for (auto& block : *blocks_){ + auto * dataVector = block.getData< DataVector > ( probeDataID ); + auto & cpuCellVector = dataVector->cpuCellVector(); + auto & cpuDataVector = dataVector->cpuDataVector(); + + const uint_t n = uint_c(pointsPerBlock[&block].size()); + cpuCellVector.resize(3 * n); + + uint_t i = 0; + for(auto p: pointsPerBlock[&block]){ + const Cell localCell = blocks_->getBlockLocalCell(block, p); + + cpuCellVector[i] = localCell[0]; + cpuCellVector[i + n] = localCell[1]; + cpuCellVector[i + 2*n] = localCell[2]; + ++i; + } + + cpuDataVector.resize(4 * n * timesteps); + dataVector->syncGPU(); + } +} + + + +} // namespace walberla diff --git a/apps/showcases/PorousMediaGPU/Probe.h b/apps/showcases/PorousMediaGPU/Probe.h new file mode 100644 index 0000000000000000000000000000000000000000..4f6017aaa913781bf38c19a9719544571b99d2df --- /dev/null +++ b/apps/showcases/PorousMediaGPU/Probe.h @@ -0,0 +1,115 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file Probe.h +//! \author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== +#pragma once + +#include "domain_decomposition/IBlock.h" + +#include "core/logging/Initialization.h" +#include "core/math/Constants.h" + +#include "gpu/GPUWrapper.h" +#include "gpu/ErrorChecking.h" +#include "gpu/FieldCopy.h" + +#include "Setup.h" +#include "utilGPU.h" + +namespace walberla{ + +class Probe +{ + public: + class DataVector{ + public: + DataVector() = default; + bool operator==(DataVector const &other) const { return other.cpuCellVector_ == cpuCellVector_ && other.cpuDataVector_ == cpuDataVector_; } + + ~DataVector() { + if(synced){ + if(!cpuCellVector_.empty()){WALBERLA_GPU_CHECK(gpuFree(gpuCellVector_))} + if(!cpuDataVector_.empty()){WALBERLA_GPU_CHECK(gpuFree(gpuDataVector_))} + } + } + + std::vector<int64_t>& cpuCellVector() { return cpuCellVector_; } + std::vector<real_t>& cpuDataVector() { return cpuDataVector_; } + + int64_t* gpuCellVector() {return gpuCellVector_;} + real_t* gpuDataVector() {return gpuDataVector_;} + + void syncGPU() + { + WALBERLA_GPU_CHECK(gpuMalloc((void **)&gpuCellVector_, sizeof(int64_t) * cpuCellVector_.size() )); + WALBERLA_GPU_CHECK(gpuMemcpy( gpuCellVector_, &cpuCellVector_[0], sizeof(int64_t) * cpuCellVector_.size(), gpuMemcpyHostToDevice )); + + WALBERLA_GPU_CHECK(gpuMalloc((void **)&gpuDataVector_, sizeof(real_t) * cpuDataVector_.size() )); + WALBERLA_GPU_CHECK(gpuMemcpy( gpuDataVector_, &cpuDataVector_[0], sizeof(real_t) * cpuDataVector_.size(), gpuMemcpyHostToDevice )); + + cur_ = gpuDataVector_; + + synced = true; + } + + private: + std::vector<int64_t> cpuCellVector_; + std::vector<real_t> cpuDataVector_; + + int64_t *gpuCellVector_; + real_t *gpuDataVector_; + + real_t *cur_ = nullptr; + + bool synced{false}; + }; + + + + Probe(const std::shared_ptr< StructuredBlockForest >& blocks, const Setup& setup, const IDs& ids) + : blocks_(blocks), setup_(setup), ids_(ids), frequency_(uint_c(1)) + { + auto createProbeData = []( IBlock * const , StructuredBlockStorage * const ) { return new DataVector(); }; + probeDataID = blocks_->addStructuredBlockData< DataVector >( createProbeData, "Probe Data"); + + + } + + void addPoint(const Vector3<real_t>& point) { + probePoints_.emplace_back(point); + + } + + void initialise(); + + + private: + const std::shared_ptr< StructuredBlockForest > blocks_; + const Setup setup_; + const IDs ids_; + BlockDataID probeDataID; + + uint_t frequency_; + + std::vector<Vector3<real_t>> probePoints_; + + uint_t executionCounter_; + + +}; // class Probe +} // namespace walberla diff --git a/apps/showcases/PorousMediaGPU/Setup.h b/apps/showcases/PorousMediaGPU/Setup.h index d45dfa2dc9d63fb0c1643c8d1006a0c5e143fdde..5d5b9f6ec6bcd84e1c1141f217fb1ae4ba8e180f 100644 --- a/apps/showcases/PorousMediaGPU/Setup.h +++ b/apps/showcases/PorousMediaGPU/Setup.h @@ -48,7 +48,7 @@ struct Setup cellsPerBlock = domainParameters.getParameter< Vector3< uint_t > >("cellsPerBlock"); rootBlocks = domainParameters.getParameter< Vector3< uint_t > >("blocks"); cells = Vector3<uint_t>(rootBlocks[0] * cellsPerBlock[0], rootBlocks[1] * cellsPerBlock[1], rootBlocks[2] * cellsPerBlock[2]); - periodic = domainParameters.getParameter< Vector3< bool > >("periodic"); + periodic = domainParameters.getParameter< Vector3< bool > >("periodic"); refinementLevels = domainParameters.getParameter< uint_t >("refinementLevels"); const real_t coarseMeshSize = domainSize[0] / real_c(rootBlocks[0] * cellsPerBlock[0]); diff --git a/apps/showcases/PorousMediaGPU/util.cpp b/apps/showcases/PorousMediaGPU/util.cpp index 90d79836a241ba8bf276c7f837ba869e1ae24ff3..cfab8533b63b1bca90d8b2bf11691eb3ed41303e 100644 --- a/apps/showcases/PorousMediaGPU/util.cpp +++ b/apps/showcases/PorousMediaGPU/util.cpp @@ -26,16 +26,16 @@ Vector3<real_t> summOfFourrierModes(const Vector3<real_t>& p, const Vector3<real const std::vector<real_t>& kappa, const std::vector<real_t>& ut, const real_t t){ Vector3<real_t> result(real_c(0.0), real_c(0.0), real_c(0.0)); - const real_t pi2 = real_c(2.0) * math::pi; + const real_t pi2 = real_c(6.28318530717958647693); const uint_t n = uint_c(kappa.size()); for(uint_t i = 0; i < n; ++i){ - const real_t a = 0.1; // math::realRandom(real_c(0.0), real_c(1.0)); - const real_t phi = 0.2; //math::realRandom(real_c(0.0), real_c(1.0)) * pi2; - const real_t psi = 0.3; //math::realRandom(real_c(0.0), real_c(1.0)) * pi2; - const real_t zetaX = -0.4; //math::realRandom(real_c(-1.0), real_c(1.0)); - const real_t zetaY = -0.6; //math::realRandom(real_c(-1.0), real_c(1.0)); - const real_t zetaZ = 0.8; // math::realRandom(real_c(-1.0), real_c(1.0)); + const real_t a = math::realRandom(real_c(0.0), real_c(1.0)); + const real_t phi = math::realRandom(real_c(0.0), real_c(1.0)) * pi2; + const real_t psi = math::realRandom(real_c(0.0), real_c(1.0)) * pi2; + const real_t zetaX = math::realRandom(real_c(-1.0), real_c(1.0)); + const real_t zetaY = math::realRandom(real_c(-1.0), real_c(1.0)); + const real_t zetaZ = math::realRandom(real_c(-1.0), real_c(1.0)); const real_t theta = std::acos(real_c(2.0) * a - real_c(1.0)); @@ -51,10 +51,8 @@ Vector3<real_t> summOfFourrierModes(const Vector3<real_t>& p, const Vector3<real const real_t zetaXNormalized = zetaX / zettaLength; const real_t zetaYNormalized = zetaY / zettaLength; const real_t zetaZNormalized = zetaZ / zettaLength; - // WALBERLA_LOG_INFO_ON_ROOT("zeta: [" << zetaXNormalized << ", " << zetaYNormalized << ", " << zetaZNormalized << "]") - const real_t sigmaX = kappaY * zetaZNormalized - kappaZ * zetaYNormalized; const real_t sigmaY = kappaZ * zetaXNormalized - kappaX * zetaZNormalized; const real_t sigmaZ = kappaX * zetaYNormalized - kappaY * zetaXNormalized; @@ -65,7 +63,6 @@ Vector3<real_t> summOfFourrierModes(const Vector3<real_t>& p, const Vector3<real const real_t sigmaXNormalized = sigmaX / sigmaLength; const real_t sigmaYNormalized = sigmaY / sigmaLength; const real_t sigmaZNormalized = sigmaZ / sigmaLength; - // WALBERLA_LOG_INFO_ON_ROOT("Sigma: [" << sigmaXNormalized << ", " << sigmaYNormalized << ", " << sigmaZNormalized << "]") const real_t pX = p[0] - (t * uc[0]); diff --git a/apps/showcases/PorousMediaGPU/utilGPU.cu b/apps/showcases/PorousMediaGPU/utilGPU.cu index 7cb5912b78e2ccdcbc5fef2af615c5e15b265aab..87ae21283fc2be0c164349019d335933dc736ded 100644 --- a/apps/showcases/PorousMediaGPU/utilGPU.cu +++ b/apps/showcases/PorousMediaGPU/utilGPU.cu @@ -13,39 +13,100 @@ // You should have received a copy of the GNU General Public License along // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // -//! \file util.cu +//! \file utilGPU.cu //! \author Markus Holzer <markus.holzer@fau.de> // //====================================================================================================================== #include "utilGPU.h" +#include <curand_kernel.h> namespace walberla { namespace internal{ +static __device__ void densityVelocityEven(double* const _data_pdfs, + double* density, double* uX, double* uY, double* uZ, + int64_t const ctr_0, int64_t const ctr_1, int64_t const ctr_2, + int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + const double vel0Term = _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 20*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 22*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 24*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 26*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3]; + const double momdensity_0 = vel0Term - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 19*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 21*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 23*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 25*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3]; + const double vel1Term = _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 21*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 25*_stride_pdfs_3]; + const double momdensity_1 = vel1Term - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 19*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 20*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 22*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 23*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 24*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 26*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_3]; + const double vel2Term = _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 23*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 6*_stride_pdfs_3]; + const double delta_rho = vel0Term + vel1Term + vel2Term + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 19*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 5*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2]; + const double momdensity_2 = vel2Term - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 19*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 20*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 21*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 22*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 24*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 25*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 26*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 5*_stride_pdfs_3]; + const double rho = delta_rho + 1.0; + const double u_0 = momdensity_0*((1.0) / (rho)); + const double u_1 = momdensity_1*((1.0) / (rho)); + const double u_2 = momdensity_2*((1.0) / (rho)); + *density = rho; + *uX = u_0; + *uY = u_1; + *uZ = u_2; +} + +static __device__ void densityVelocityOdd(double* const _data_pdfs, + double* density, double* uX, double* uY, double* uZ, + int64_t const ctr_0, int64_t const ctr_1, int64_t const ctr_2, + int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + const double vel0Term = _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 19*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 23*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 14*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 18*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 21*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 25*_stride_pdfs_3]; + const double momdensity_0 = vel0Term - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 20*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 24*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 13*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 17*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 22*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 26*_stride_pdfs_3]; + const double vel1Term = _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 11*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 15*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 20*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 24*_stride_pdfs_3]; + const double momdensity_1 = vel1Term + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 19*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 23*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 21*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 25*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 12*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 16*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 22*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 26*_stride_pdfs_3]; + const double vel2Term = _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 5*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 12*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 13*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 22*_stride_pdfs_3]; + const double delta_rho = vel0Term + vel1Term + vel2Term + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 6*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 16*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 17*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 26*_stride_pdfs_3]; + const double momdensity_2 = vel2Term + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 19*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 23*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 14*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 18*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 21*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 25*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 11*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 15*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 6*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 16*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 20*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 24*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 17*_stride_pdfs_3] - _data_pdfs[_stride_pdfs_0*ctr_0 - _stride_pdfs_0 + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 26*_stride_pdfs_3]; + const double rho = delta_rho + 1.0; + const double u_0 = momdensity_0*((1.0) / (rho)); + const double u_1 = momdensity_1*((1.0) / (rho)); + const double u_2 = momdensity_2*((1.0) / (rho)); + *density = rho; + *uX = u_0; + *uY = u_1; + *uZ = u_2; +} + +__device__ void warpReduce(volatile double *uX, volatile double *uY, volatile double *uZ, unsigned int tid) +{ + uX[tid] += uX[tid+32]; uY[tid] += uY[tid+32]; uZ[tid] += uZ[tid+32]; + uX[tid] += uX[tid+16]; uY[tid] += uY[tid+16]; uZ[tid] += uZ[tid+16]; + uX[tid] += uX[tid+ 8]; uY[tid] += uY[tid+ 8]; uZ[tid] += uZ[tid+ 8]; + uX[tid] += uX[tid+ 4]; uY[tid] += uY[tid+ 4]; uZ[tid] += uZ[tid+ 4]; + uX[tid] += uX[tid+ 2]; uY[tid] += uY[tid+ 2]; uZ[tid] += uZ[tid+ 2]; + uX[tid] += uX[tid+ 1]; uY[tid] += uY[tid+ 1]; uZ[tid] += uZ[tid+ 1]; +} + static __device__ void singleMode(double* rX, double* rY, double* rZ, const double kappaN, const double ut, - const double pX, const double pY, const double pZ) + const double pX, const double pY, const double pZ, const int64_t linear_index) { - /* - curandState state; - curand_init(42, xStride*x + yStride*y + zStride*z, 0, &state); + const real_t pi2 = 6.28318530717958647693; - const real_t a = curand_uniform(&state); - const real_t phi = curand_uniform(&state) * pi2; - const real_t psi = curand_uniform(&state) * pi2; - const real_t zetaX = real_t(2.0) * curand_uniform(&state) - real_t(1.0); - const real_t zetaY = real_t(2.0) * curand_uniform(&state) - real_t(1.0); - const real_t zetaZ = real_t(2.0) * curand_uniform(&state) - real_t(1.0); - */ + curandStatePhilox4_32_10_t state; + curand_init(42, linear_index, 0, &state); + const float4 r1 = curand_uniform4(&state); + const float4 r2 = curand_uniform4(&state); + + const real_t a = real_t(r1.x); + const real_t phi = real_t(r1.y) * pi2; + const real_t psi = real_t(r1.z) * pi2; + const real_t zetaX = real_t(2.0) * real_t(r1.w) - real_t(1.0); + const real_t zetaY = real_t(2.0) * real_t(r2.x) - real_t(1.0); + const real_t zetaZ = real_t(2.0) * real_t(r2.y) - real_t(1.0); + +/* + // For debugging const real_t a = 0.1; // math::realRandom(real_c(0.0), real_c(1.0)); const real_t phi = 0.2; //math::realRandom(real_c(0.0), real_c(1.0)) * pi2; const real_t psi = 0.3; //math::realRandom(real_c(0.0), real_c(1.0)) * pi2; const real_t zetaX = -0.4; //math::realRandom(real_c(-1.0), real_c(1.0)); const real_t zetaY = -0.6; //math::realRandom(real_c(-1.0), real_c(1.0)); const real_t zetaZ = 0.8; // math::realRandom(real_c(-1.0), real_c(1.0)); +*/ const real_t theta = acos(real_t(2.0) * a - real_t(1.0)); @@ -53,21 +114,21 @@ static __device__ void singleMode(double* rX, double* rY, double* rZ, const doub const real_t kappaY = sin(theta) * sin(phi) * kappaN; const real_t kappaZ = cos(theta) * kappaN; - const real_t zettaLength = sqrt(zetaX*zetaX + zetaY*zetaY + zetaZ*zetaZ); + const real_t zettaLength = rsqrt(zetaX*zetaX + zetaY*zetaY + zetaZ*zetaZ); - const real_t zetaXNormalized = zetaX / zettaLength; - const real_t zetaYNormalized = zetaY / zettaLength; - const real_t zetaZNormalized = zetaZ / zettaLength; + const real_t zetaXNormalized = zetaX * zettaLength; + const real_t zetaYNormalized = zetaY * zettaLength; + const real_t zetaZNormalized = zetaZ * zettaLength; const real_t sigmaX = kappaY * zetaZNormalized - kappaZ * zetaYNormalized; const real_t sigmaY = kappaZ * zetaXNormalized - kappaX * zetaZNormalized; const real_t sigmaZ = kappaX * zetaYNormalized - kappaY * zetaXNormalized; - const real_t sigmaLength = sqrt(sigmaX*sigmaX + sigmaY*sigmaY + sigmaZ*sigmaZ); + const real_t sigmaLength = rsqrt(sigmaX*sigmaX + sigmaY*sigmaY + sigmaZ*sigmaZ); - const real_t sigmaXNormalized = sigmaX / sigmaLength; - const real_t sigmaYNormalized = sigmaY / sigmaLength; - const real_t sigmaZNormalized = sigmaZ / sigmaLength; + const real_t sigmaXNormalized = sigmaX * sigmaLength; + const real_t sigmaYNormalized = sigmaY * sigmaLength; + const real_t sigmaZNormalized = sigmaZ * sigmaLength; const real_t arg = real_t(2.0) * ut * cos(kappaX * pX + kappaY * pY + kappaZ * pZ + psi); @@ -82,46 +143,77 @@ static __global__ void summOfFourrierModesKernel(double * velocity, double * con const real_t yMin, const real_t zMin, const real_t minCornerY, const real_t minCornerZ, const real_t dx, const real_t ucX, const real_t ucY, const real_t ucZ, const real_t t, const uint_t n, const real_t latticeVelocity, - const int64_t xSize, const int64_t ySize, const int64_t zSize, - const int64_t xStride, const int64_t yStride, const int64_t zStride, const int64_t fStride) + const int64_t ySize, const int64_t zSize, + const int64_t yStride, const int64_t zStride, const int64_t fStride) { - if(blockDim.x*blockIdx.x + threadIdx.x < xSize && blockDim.y*blockIdx.y + threadIdx.y < ySize && blockDim.z*blockIdx.z + threadIdx.z < zSize) + if(blockDim.x*blockIdx.x + threadIdx.x < n && blockDim.y*blockIdx.y + threadIdx.y < ySize && blockDim.z*blockIdx.z + threadIdx.z < zSize) { - const int64_t x = 0; // blockDim.x*blockIdx.x + threadIdx.x; - const int64_t y = blockDim.y*blockIdx.y + threadIdx.y; - const int64_t z = blockDim.z*blockIdx.z + threadIdx.z; - + extern __shared__ double sharedMemory[]; - real_t resultX = 0.0; - real_t resultY = 0.0; - real_t resultZ = 0.0; + double* uPrimeX = &sharedMemory[0]; + double* uPrimeY = &sharedMemory[n]; + double* uPrimeZ = &sharedMemory[2*n]; - const real_t xCenter = real_t(0.0); - const real_t yCenter = (yMin + ( real_t( y ) + real_t(0.5) ) * dx) + minCornerY; - const real_t zCenter = (zMin + ( real_t( z ) + real_t(0.5) ) * dx) + minCornerZ; - const real_t pi2 = real_t(2.0) * real_t(3.14159265358979323846); + // const int64_t thx = threadIdx.x; + const int64_t x = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t y = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t z = blockDim.z*blockIdx.z + threadIdx.z; - const real_t pX = xCenter - (t * ucX); - const real_t pY = yCenter - (t * ucY); - const real_t pZ = zCenter - (t * ucZ); + const int64_t linear_index = x + (y * n) + (z * n * ySize); + + const real_t xCenter = real_t(0.0); + const real_t yCenter = (yMin + ( real_t( y ) + real_t(0.5) ) * dx) + minCornerY; + const real_t zCenter = (zMin + ( real_t( z ) + real_t(0.5) ) * dx) + minCornerZ; - for(int64_t i = 0; i < n; ++i){ + const real_t pX = xCenter - (t * ucX); + const real_t pY = yCenter - (t * ucY); + const real_t pZ = zCenter - (t * ucZ); double rX, rY, rZ; - singleMode(&rX, &rY, &rZ, kappa[i], ut[i], pX, pY, pZ); - - resultX += rX; - resultY += rY; - resultZ += rZ; - } - - - velocity[xStride*x + yStride*y + zStride*z + 0*fStride] = resultX; - velocity[xStride*x + yStride*y + zStride*z + 1*fStride] = resultY; - velocity[xStride*x + yStride*y + zStride*z + 2*fStride] = resultZ; - + singleMode(&rX, &rY, &rZ, kappa[x], ut[x], pX, pY, pZ, linear_index); + + uPrimeX[x] = rX; + uPrimeY[x] = rY; + uPrimeZ[x] = rZ; + + __syncthreads(); + +/* + for(unsigned int stride = (blockDim.x/2); stride > 32 ; stride /=2){ + if(thx < stride){ + uPrimeX[thx] += uPrimeX[thx + stride]; + uPrimeY[thx] += uPrimeY[thx + stride]; + uPrimeZ[thx] += uPrimeZ[thx + stride]; + } + } + + if(thx < 32){ + warpReduce(uPrimeX, uPrimeY, uPrimeZ, thx); + } + + if(thx == 0){ + velocity[yStride*y + zStride*z + 0*fStride] = uPrimeX[0]; + velocity[yStride*y + zStride*z + 1*fStride] = uPrimeY[0]; + velocity[yStride*y + zStride*z + 2*fStride] = uPrimeZ[0]; + } +*/ + + if(x == 0){ + real_t resultX = 0.0; + real_t resultY = 0.0; + real_t resultZ = 0.0; + + for(int64_t i = 0; i < n; ++i){ + resultX += uPrimeX[i]; + resultY += uPrimeY[i]; + resultZ += uPrimeZ[i]; + } + + velocity[yStride*y + zStride*z + 0*fStride] = latticeVelocity + resultX; + velocity[yStride*y + zStride*z + 1*fStride] = resultY; + velocity[yStride*y + zStride*z + 2*fStride] = resultZ; + } } - } } @@ -150,20 +242,21 @@ void updateVelocityPlane(gpu::GPUField<real_t>* velocityGPU, real_t* kappa, real dim3 block_(block[0], block[1], block[2]); dim3 grid_(grid[0], grid[1], grid[2]); + + // In shared memory we store the result of the u_prime (x, y, z) + const uint_t sharedMemorySize = 3 * n * sizeof(real_t); double * data_velocity = velocityGPU->dataAt(0, 0, 0, 0); - const int64_t size_velocity_0 = int64_c(velocityGPU->xSize()); const int64_t size_velocity_1 = int64_c(velocityGPU->ySize()); const int64_t size_velocity_2 = int64_c(velocityGPU->zSize()); - const int64_t stride_velocity_0 = int64_c(velocityGPU->xStride()); const int64_t stride_velocity_1 = int64_c(velocityGPU->yStride()); const int64_t stride_velocity_2 = int64_c(velocityGPU->zStride()); const int64_t stride_velocity_3 = int64_c(velocityGPU->fStride()); - internal::summOfFourrierModesKernel<<<grid_, block_, 0, nullptr>>>(data_velocity, kappa, ut, + internal::summOfFourrierModesKernel<<<grid_, block_, sharedMemorySize, nullptr>>>(data_velocity, kappa, ut, globalMinCorner[1], globalMinCorner[2], blockMinCorner[1], blockMinCorner[2], dx, uc[0], uc[1], uc[2], t, n, latticeVelocity, - size_velocity_0, size_velocity_1, size_velocity_2, - stride_velocity_0, stride_velocity_1, stride_velocity_2, stride_velocity_3); + size_velocity_1, size_velocity_2, + stride_velocity_1, stride_velocity_2, stride_velocity_3); }