diff --git a/apps/tutorials/sycl/SYCLTest.cpp b/apps/tutorials/sycl/SYCLTest.cpp
index 84799ef2afeb7437c02aed0757b616e5c68e9c1a..5d6d717bb303e904aaf9046d5456da5b9aa8db56 100644
--- a/apps/tutorials/sycl/SYCLTest.cpp
+++ b/apps/tutorials/sycl/SYCLTest.cpp
@@ -142,8 +142,19 @@ int main(int argc, char** argv)
    auto blocks = blockforest::createUniformBlockGridFromConfig(walberlaEnv.config());
 #if defined(WALBERLA_BUILD_WITH_SYCL)
    WALBERLA_LOG_PROGRESS("Create SYCL queue")
-   auto syclQueue = make_shared<sycl::queue> (sycl::default_selector_v);
-   WALBERLA_LOG_INFO("Running SYCL for MPI process " << mpi::MPIManager::instance()->worldRank() << " on " << (*syclQueue).get_device().get_info<cl::sycl::info::device::name>())
+   auto mpiRank = mpi::MPIManager::instance()->worldRank();
+
+   //FOR MULTI GPU
+   std::vector<sycl::device> Devs;
+   for (const auto &plt : sycl::platform::get_platforms()) {
+      if (plt.get_backend() == sycl::backend::ext_oneapi_cuda)
+         Devs.push_back(plt.get_devices()[0]);
+   }
+   auto syclQueue = make_shared<sycl::queue> (Devs[mpiRank]);
+
+   //FOR CPU
+   //auto syclQueue = make_shared<sycl::queue> (sycl::default_selector_v);
+   WALBERLA_LOG_INFO("Running SYCL for MPI process " << mpiRank << " on " << (*syclQueue).get_device().get_info<cl::sycl::info::device::name>())
    //WALBERLA_LOG_INFO("Max hardware threads are " << sycl::default_selector().select_device().get_info<sycl::info::device::max_compute_units>());
    blocks->setSYCLQueue(syclQueue);
 #endif
@@ -239,7 +250,6 @@ int main(int argc, char** argv)
 
    // Timeloop
    if (timestepStrategy == "fullSim") {
-      communication();
       timeloop.add() << BeforeFunction(communication, "communication")
                     << Sweep(deviceSyncWrapper(SYCLTestSweep), "SYCL Sweep");
    } else if (timestepStrategy == "kernelOnly") {
diff --git a/apps/tutorials/sycl/SYCLTest.prm b/apps/tutorials/sycl/SYCLTest.prm
index 73f1352b4723dd7b4ca2ecb813c963416c12dd17..d008429183967191a01aa317e3669bb7d2eaf3a7 100644
--- a/apps/tutorials/sycl/SYCLTest.prm
+++ b/apps/tutorials/sycl/SYCLTest.prm
@@ -22,7 +22,7 @@ ShearFlowSetup
 
 DomainSetup
 {
-   blocks < 1, 1, 1 >;
+   blocks < 2, 1, 1 >;
    cellsPerBlock < 1000, 1000, 1 >;
    periodic < 0, 0, 0 >;
    oneBlockPerProcess true;
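
Note on the device selection in the first hunk: the patch indexes Devs[mpiRank] directly and relies on the .prm change (blocks < 2, 1, 1 > with oneBlockPerProcess) to launch exactly one MPI rank per CUDA device. Below is a minimal standalone sketch of the same rank-to-GPU mapping with a round-robin guard for the case where there are more ranks than devices. The helper name makeQueueForRank, the modulo fallback, and the error check are illustrative assumptions and are not part of the patch.

#include <sycl/sycl.hpp>
#include <memory>
#include <stdexcept>
#include <vector>

// Hypothetical helper (not in the patch): pick one device per CUDA platform,
// as the patch does, then map the MPI rank onto that list.
std::shared_ptr<sycl::queue> makeQueueForRank(int mpiRank)
{
   std::vector<sycl::device> devs;
   for (const auto& plt : sycl::platform::get_platforms())
   {
      // Same selection criterion as the patch: DPC++ CUDA backend only.
      if (plt.get_backend() == sycl::backend::ext_oneapi_cuda)
         devs.push_back(plt.get_devices()[0]);
   }
   if (devs.empty())
      throw std::runtime_error("No CUDA-backend SYCL devices found");

   // Devs[mpiRank] in the patch assumes at least as many GPUs as ranks;
   // the modulo keeps the index in range when that does not hold.
   return std::make_shared<sycl::queue>(devs[static_cast<std::size_t>(mpiRank) % devs.size()]);
}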