diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index d49a1e63bbcc36307d83ce80e1440048b2ed8207..13e7ac49ebbf92c6718f00e275af7077e7e17539 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -24,7 +24,7 @@ add_subdirectory( blockforest )
 add_subdirectory( boundary )
 add_subdirectory( communication )
 add_subdirectory( core )
-add_subdirectory(gpu)
+add_subdirectory( gpu )
 add_subdirectory( domain_decomposition )
 add_subdirectory( executiontree )
 if ( WALBERLA_BUILD_WITH_FFT AND FFTW3_FOUND )
diff --git a/src/boundary/CMakeLists.txt b/src/boundary/CMakeLists.txt
index c122cbddc6a715ce446071a60c22dc11bb849b03..7da3ebfe360dd08ec1c60035e47944a078285139 100644
--- a/src/boundary/CMakeLists.txt
+++ b/src/boundary/CMakeLists.txt
@@ -14,4 +14,5 @@ target_sources( boundary
       BoundaryHandlingCollection.h
       Boundary.cpp
       BoundaryUID.h
+      ShiftedPeriodicity.h
       )
diff --git a/src/boundary/ShiftedPeriodicity.h b/src/boundary/ShiftedPeriodicity.h
new file mode 100644
index 0000000000000000000000000000000000000000..ce2a32e550d5524c2f291bf901323211ec12f099
--- /dev/null
+++ b/src/boundary/ShiftedPeriodicity.h
@@ -0,0 +1,611 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file ShiftedPeriodicity.h
+//! \ingroup boundary
+//! \author Helen Schottenhamml <helen.schottenhamml@fau.de>
+//
+//======================================================================================================================
+
+#include "blockforest/Block.h"
+#include "blockforest/BlockID.h"
+#include "blockforest/StructuredBlockForest.h"
+
+#include "core/DataTypes.h"
+#include "core/cell/CellInterval.h"
+#include "core/debug/CheckFunctions.h"
+#include "core/debug/Debug.h"
+#include "core/logging/Logging.h"
+#include "core/math/AABBFwd.h"
+#include "core/math/Vector3.h"
+#include "core/mpi/MPIWrapper.h"
+#include "core/mpi/MPIManager.h"
+
+#include "domain_decomposition/BlockDataID.h"
+#include "domain_decomposition/IBlock.h"
+#include "domain_decomposition/IBlockID.h"
+#include "domain_decomposition/MapPointToPeriodicDomain.h"
+
+#include <array>
+#include <cstdlib>
+#include <iterator>
+#include <map>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+
+namespace walberla {
+namespace boundary {
+
+//*******************************************************************************************************************
+/*!
+ * A periodicity boundary condition that adds a user-defined spatial shift to the field when applied.
+ * This shift can prevent the locking of large-scale turbulent features in the flow direction, see e.g.,
+ * Munters et al. (https://doi.org/10.1063/1.4941912).
+ *
+ * Periodicity defined in the blockforest must be turned off in the normal-direction. Only uniform grids are supported
+ * for the moment. Normal direction and shift direction must not coincide.
+ * Moreover, this boundary condition is only applied at the minimum and maximum of the domain in a given direction, but
+ * cannot be applied in the interior of the domain for now.
+ *
+ * This base class handles the logistics of splitting the field and communication.
+ *
+ * @tparam Field_T Type of the ghost-layer field that is shifted periodically
+ */
+//*******************************************************************************************************************
+template<typename Derived_T, typename Field_T>
+class ShiftedPeriodicityBase {
+
+ public:
+
+   using FieldType = Field_T;
+   using ValueType = typename FieldType::value_type;
+   using ShiftType = int;
+
+   ShiftedPeriodicityBase( const std::weak_ptr<StructuredBlockForest> & blockForest,
+                           const BlockDataID & fieldID, const uint_t fieldGhostLayers,
+                           const uint_t normalDir, const uint_t shiftDir, const ShiftType shiftValue )
+      : blockForest_(blockForest), normalDir_(normalDir), shiftDir_(shiftDir),
+        fieldID_( fieldID ), fieldGhostLayers_(fieldGhostLayers)
+   {
+      auto sbf = blockForest_.lock();
+
+      WALBERLA_ASSERT_NOT_NULLPTR( sbf )
+
+      WALBERLA_CHECK(sbf->storesUniformBlockGrid(), "Periodic shift is currently only implemented for uniform grids.")
+      WALBERLA_CHECK(sbf->containsGlobalBlockInformation(), "For the periodic shift, the blockforest must be constructed to retain global information.")
+
+      shift_ = Vector3<ShiftType>{};
+      auto adaptedShiftValue = shiftValue % ShiftType(sbf->getNumberOfCells(shiftDir_));
+      shift_[shiftDir_] = ShiftType(adaptedShiftValue >= 0 ? adaptedShiftValue : adaptedShiftValue + ShiftType(sbf->getNumberOfCells(shiftDir_)));
+
+      // sanity checks
+      WALBERLA_CHECK_UNEQUAL( shiftDir_, normalDir_, "Direction of periodic shift and boundary normal must not coincide." )
+
+      WALBERLA_CHECK( sbf->isPeriodic(shiftDir_), "Blockforest must be periodic in direction " << shiftDir_ << "!" )
+      WALBERLA_CHECK( !sbf->isPeriodic(normalDir_), "Blockforest must NOT be periodic in direction " << normalDir_ << "!" )
+
+   }
+
+   Vector3<ShiftType> shift() const { return shift_; }
+   uint_t shiftDirection() const { return shiftDir_; }
+   uint_t normalDirection() const { return normalDir_; }
+
+   void operator()() {
+
+      auto mpiInstance = mpi::MPIManager::instance();
+      const auto currentRank = numeric_cast<mpi::MPIRank>(mpiInstance->rank());
+
+      const auto sbf = blockForest_.lock();
+      WALBERLA_ASSERT_NOT_NULLPTR( sbf )
+
+      // only setup send and receive information once at the beginning
+      if(!setupPeriodicity_){
+         setupPeriodicity();
+         setupPeriodicity_ = true;
+      }
+
+      // set up local to avoid temporary fields; key is unique tag to ensure correct communication
+      // if several messages have the same source and destination
+      // thanks to the unique tag, the same buffer can be used for both local and MPI communication
+      std::map<int, std::vector<ValueType>> buffer;
+      std::map<BlockID, std::map<int, MPI_Request>> sendRequests;
+      std::map<BlockID, std::map<int, MPI_Request>> recvRequests;
+
+      std::vector<IBlock*> localBlocks{};
+      sbf->getBlocks(localBlocks);
+
+      /// packing and schedule sends
+
+      for( auto block : localBlocks ) {
+
+         const auto blockID = dynamic_cast<Block*>(block)->getId();
+
+         WALBERLA_ASSERT_EQUAL(sendAABBs_[blockID].size(), recvAABBs_[blockID].size() )
+
+         for(uint_t i = 0; i < sendAABBs_[blockID].size(); ++i) {
+
+            auto & sendInfo = sendAABBs_[blockID][i];
+
+            const auto sendRank = std::get<0>(sendInfo);
+            const auto sendTag = std::get<3>(sendInfo);
+
+            const auto sendCI = std::get<2>(sendInfo);
+
+            const auto sendSize = sendCI.numCells() * uint_c(fSize_);
+
+            buffer[sendTag].resize(sendSize);
+            static_cast<Derived_T*>(this)->packBuffer(block, sendCI, buffer[sendTag]);
+
+            WALBERLA_MPI_SECTION() {
+               // schedule sends if MPI
+               if (sendRank != currentRank)
+               {
+                  MPI_Isend(buffer[sendTag].data(), mpi::MPISize(buffer[sendTag].size() * sizeof(ValueType)), MPI_BYTE,
+                            sendRank, sendTag, mpiInstance->comm(), &sendRequests[blockID][sendTag]);
+               }
+            }
+
+         }
+
+      }
+
+      /// schedule receives
+
+      for( auto block : localBlocks ) {
+
+         const auto blockID = dynamic_cast<Block*>(block)->getId();
+
+         WALBERLA_ASSERT_EQUAL(sendAABBs_[blockID].size(), recvAABBs_[blockID].size() )
+
+         for(uint_t i = 0; i < recvAABBs_[blockID].size(); ++i) {
+
+            auto & recvInfo = recvAABBs_[blockID][i];
+
+            const auto recvRank = std::get<0>(recvInfo);
+            const auto recvTag   = std::get<3>(recvInfo);
+            const auto recvCI = std::get<2>(recvInfo);
+
+            WALBERLA_MPI_SECTION() {
+               // do not schedule receives for local communication
+               if (recvRank != currentRank) {
+                  const auto recvSize = recvCI.numCells() * uint_c(fSize_);
+                  buffer[recvTag].resize(recvSize);
+
+                  // Schedule receives
+                  MPI_Irecv(buffer[recvTag].data(), mpi::MPISize(buffer[recvTag].size() * sizeof(ValueType)), MPI_BYTE,
+                            recvRank, recvTag, mpiInstance->comm(), &recvRequests[blockID][recvTag]);
+               }
+            }
+
+         }
+
+      }
+
+      /// unpacking
+
+      for( auto block : localBlocks ) {
+
+         const auto blockID = dynamic_cast<Block*>(block)->getId();
+
+         for(uint_t i = 0; i < recvAABBs_[blockID].size(); ++i) {
+
+            auto & recvInfo = recvAABBs_[blockID][i];
+
+            const auto recvRank = std::get<0>(recvInfo);
+            const auto recvTag = std::get<3>(recvInfo);
+
+            const auto recvCI = std::get<2>(recvInfo);
+
+            if(recvRank == currentRank) {
+               WALBERLA_ASSERT_GREATER(buffer.count(recvTag), 0)
+               static_cast<Derived_T*>(this)->unpackBuffer(block, recvCI, buffer[recvTag]);
+            } else {
+               WALBERLA_MPI_SECTION() {
+                  MPI_Status status;
+                  MPI_Wait(&recvRequests[blockID][recvTag], &status);
+
+                  WALBERLA_ASSERT_GREATER(buffer.count(recvTag), 0)
+                  static_cast< Derived_T* >(this)->unpackBuffer(block, recvCI, buffer[recvTag]);
+               }
+            }
+
+         }
+
+      }
+
+      WALBERLA_MPI_SECTION() {
+         // wait for all communication to be finished
+         for (auto& block : localBlocks)
+         {
+            const auto blockID = dynamic_cast< Block* >(block)->getId();
+
+            if (sendRequests[blockID].empty()) return;
+
+            std::vector< MPI_Request > v;
+            std::transform(sendRequests[blockID].begin(), sendRequests[blockID].end(), std::back_inserter(v),
+                           second(sendRequests[blockID]));
+            MPI_Waitall(int_c(sendRequests[blockID].size()), v.data(), MPI_STATUSES_IGNORE);
+         }
+      }
+
+   }
+
+ private:
+
+   /*!
+    * Creates send and receive information (source, target rank, source / target blocks, source and destination CI
+    * (here in form of an AABB), and unique communication tag) per block if the shift does not coincide with the
+    * block size.
+    *
+    * If the shift does not coincide with the block size, each block is split into two send sub-blocks and two receive
+    * sub-blocks whose source / target block will differ (maybe also the rank).
+    * Afterwards, all the information necessary for the communication is determined and saved in the corresponding
+    * data structure.
+    */
+   void processDoubleAABB( const AABB & originalAABB, const std::shared_ptr<StructuredBlockForest> & sbf,
+                          const real_t nGL, const BlockID & blockID, const int normalShift ) {
+
+      WALBERLA_ASSERT(normalShift == -1 || normalShift == 1)
+
+      const auto localShift = normalShift == 1 ? shift_ : -shift_;
+
+      const auto offset = ShiftType(int_c(shift_[shiftDir_]) % int_c(sbf->getNumberOfCellsPerBlock(shiftDir_)));
+      Vector3<ShiftType> normalShiftVector{};
+      normalShiftVector[normalDir_] = ShiftType(normalShift * int_c(sbf->getNumberOfCellsPerBlock(normalDir_)));
+
+      const auto remDir = uint_t(3) - normalDir_ - shiftDir_;
+
+      AABB aabb1 = originalAABB;
+      AABB aabb2 = originalAABB;
+
+      if( normalShift == 1 ) {
+         aabb1.setAxisBounds(shiftDir_, aabb1.min(shiftDir_), aabb1.max(shiftDir_) - real_c(offset));
+         aabb2.setAxisBounds(shiftDir_, aabb2.max(shiftDir_) - real_c(offset), aabb2.max(shiftDir_));
+      } else {
+         aabb1.setAxisBounds(shiftDir_, aabb1.min(shiftDir_), aabb1.min(shiftDir_) + real_c(offset));
+         aabb2.setAxisBounds(shiftDir_, aabb2.min(shiftDir_) + real_c(offset), aabb2.max(shiftDir_));
+      }
+
+      auto center1 = aabb1.center();
+      auto center2 = aabb2.center();
+
+      // account for ghost layers
+      aabb1.setAxisBounds(remDir, aabb1.min(remDir) - nGL, aabb1.max(remDir) + nGL);
+      aabb2.setAxisBounds(remDir, aabb2.min(remDir) - nGL, aabb2.max(remDir) + nGL);
+
+      auto localSendAABB1 = aabb1;
+      auto localRecvAABB1 = aabb1;
+      auto localSendAABB2 = aabb2;
+      auto localRecvAABB2 = aabb2;
+
+      localSendAABB1.setAxisBounds(shiftDir_, localSendAABB1.min(shiftDir_), localSendAABB1.max(shiftDir_) + nGL);
+      localSendAABB2.setAxisBounds(shiftDir_, localSendAABB2.min(shiftDir_) - nGL, localSendAABB2.max(shiftDir_));
+      localRecvAABB1.setAxisBounds(shiftDir_, localRecvAABB1.min(shiftDir_) - nGL, localRecvAABB1.max(shiftDir_));
+      localRecvAABB2.setAxisBounds(shiftDir_, localRecvAABB2.min(shiftDir_), localRecvAABB2.max(shiftDir_) + nGL);
+
+      if(normalShift == 1) { // at maximum of domain -> send inner layers, receive ghost layers
+         localSendAABB1.setAxisBounds(normalDir_, localSendAABB1.max(normalDir_) - nGL, localSendAABB1.max(normalDir_));
+         localSendAABB2.setAxisBounds(normalDir_, localSendAABB2.max(normalDir_) - nGL, localSendAABB2.max(normalDir_));
+         localRecvAABB1.setAxisBounds(normalDir_, localRecvAABB1.max(normalDir_), localRecvAABB1.max(normalDir_) + nGL);
+         localRecvAABB2.setAxisBounds(normalDir_, localRecvAABB2.max(normalDir_), localRecvAABB2.max(normalDir_) + nGL);
+      } else { // at minimum of domain -> send inner layers, receive ghost layers
+         localSendAABB1.setAxisBounds(normalDir_, localSendAABB1.min(normalDir_), localSendAABB1.min(normalDir_) + nGL);
+         localSendAABB2.setAxisBounds(normalDir_, localSendAABB2.min(normalDir_), localSendAABB2.min(normalDir_) + nGL);
+         localRecvAABB1.setAxisBounds(normalDir_, localRecvAABB1.min(normalDir_) - nGL, localRecvAABB1.min(normalDir_));
+         localRecvAABB2.setAxisBounds(normalDir_, localRecvAABB2.min(normalDir_) - nGL, localRecvAABB2.min(normalDir_));
+      }
+
+      WALBERLA_ASSERT_FLOAT_EQUAL(localSendAABB1.volume(), localRecvAABB1.volume())
+      WALBERLA_ASSERT_FLOAT_EQUAL(localSendAABB2.volume(), localRecvAABB2.volume())
+
+      center1 += (localShift + normalShiftVector);
+      center2 += (localShift + normalShiftVector);
+
+      std::array<bool, 3> virtualPeriodicity{false};
+      virtualPeriodicity[normalDir_] = true;
+      virtualPeriodicity[shiftDir_] = true;
+
+      domain_decomposition::mapPointToPeriodicDomain( virtualPeriodicity, sbf->getDomain(), center1 );
+      domain_decomposition::mapPointToPeriodicDomain( virtualPeriodicity, sbf->getDomain(), center2 );
+
+      uint_t rank1{};
+      uint_t rank2{};
+
+      BlockID id1{};
+      BlockID id2{};
+
+      const auto blockInformation = sbf->getBlockInformation();
+      WALBERLA_ASSERT(blockInformation.active())
+
+      blockInformation.getProcess(rank1, center1[0], center1[1], center1[2]);
+      blockInformation.getProcess(rank2, center2[0], center2[1], center2[2]);
+
+      blockInformation.getId(id1, center1[0], center1[1], center1[2]);
+      blockInformation.getId(id2, center2[0], center2[1], center2[2]);
+
+      // define unique send / receive tags for communication
+
+      const int atMaxTagSend = normalShift < 0 ? 0 : 1;
+      const int atMaxTagRecv = normalShift < 0 ? 1 : 0;
+
+      const int sendTag1 = ((int_c(blockID.getID()) + int_c(id1.getID() * numGlobalBlocks_)) * 2 + atMaxTagSend) * 2 + 0;
+      const int sendTag2 = ((int_c(blockID.getID()) + int_c(id2.getID() * numGlobalBlocks_)) * 2 + atMaxTagSend) * 2 + 1;
+      const int recvTag2 = ((int_c(id2.getID()) + int_c(blockID.getID() * numGlobalBlocks_)) * 2 + atMaxTagRecv) * 2 + 0;
+      const int recvTag1 = ((int_c(id1.getID()) + int_c(blockID.getID() * numGlobalBlocks_)) * 2 + atMaxTagRecv) * 2 + 1;
+
+      auto block = sbf->getBlock(blockID);
+
+      sendAABBs_[blockID].emplace_back(mpi::MPIRank(rank1), id1, globalAABBToLocalCI(localSendAABB1, sbf, block), sendTag1);
+      sendAABBs_[blockID].emplace_back(mpi::MPIRank(rank2), id2, globalAABBToLocalCI(localSendAABB2, sbf, block), sendTag2);
+      recvAABBs_[blockID].emplace_back(mpi::MPIRank(rank2), id2, globalAABBToLocalCI(localRecvAABB2, sbf, block), recvTag2);
+      recvAABBs_[blockID].emplace_back(mpi::MPIRank(rank1), id1, globalAABBToLocalCI(localRecvAABB1, sbf, block), recvTag1);
+
+      WALBERLA_LOG_DETAIL("blockID = " << blockID.getID() << ", normalShift = " << normalShift
+                                       << "\n\tsendRank1 = " << rank1 << "\tsendID1 = " << id1.getID() << "\tsendTag1 = " << sendTag1 << "\taabb = " << localSendAABB1
+                                       << "\n\tsendRank2 = " << rank2 << "\tsendID2 = " << id2.getID() << "\tsendTag2 = " << sendTag2 << "\taabb = " << localSendAABB2
+                                       << "\n\trecvRank1 = " << rank1 << "\trecvID1 = " << id1.getID() << "\trecvTag1 = " << recvTag1 << "\taabb = " << localRecvAABB1
+                                       << "\n\trecvRank2 = " << rank2 << "\trecvID2 = " << id2.getID() << "\trecvTag2 = " << recvTag2 << "\taabb = " << localRecvAABB2
+      )
+   }
+
+   /*!
+    * Does the same things as `processDoubleAABB` but assuming that the shift is a multiple of the block size, i.e.,
+    * every field slice is shifted one or several entire blocks. In this case, every block only has ONE send and ONE
+    * receive block whose information must be stored.
+    */
+   void processSingleAABB( const AABB & originalAABB, const std::shared_ptr<StructuredBlockForest> & sbf,
+                          const real_t nGL, const BlockID & blockID, const int normalShift ) {
+
+      WALBERLA_ASSERT(normalShift == -1 || normalShift == 1)
+
+      const auto localShift = normalShift == 1 ? shift_ : -shift_;
+
+      Vector3<ShiftType> normalShiftVector{};
+      normalShiftVector[normalDir_] = ShiftType(normalShift * int_c(sbf->getNumberOfCellsPerBlock(normalDir_)));
+
+      // aabb
+      auto aabb = originalAABB.getExtended(nGL);
+      auto center = aabb.center();
+
+      // receive from the interior of domain in ghost layers
+      auto localSendAABB = aabb;
+      auto localRecvAABB = aabb;
+
+      if(normalShift == 1) { // at maximum of domain -> send inner layers, receive ghost layers
+         localSendAABB.setAxisBounds(normalDir_, localSendAABB.max(normalDir_) - 2 * nGL, localSendAABB.max(normalDir_) - nGL);
+         localRecvAABB.setAxisBounds(normalDir_, localRecvAABB.max(normalDir_) - nGL, localRecvAABB.max(normalDir_));
+      } else { // at minimum of domain -> send inner layers, receive ghost layers
+         localSendAABB.setAxisBounds(normalDir_, localSendAABB.min(normalDir_) + nGL, localSendAABB.min(normalDir_) + 2 * nGL);
+         localRecvAABB.setAxisBounds(normalDir_, localRecvAABB.min(normalDir_), localRecvAABB.min(normalDir_) + nGL);
+      }
+
+      WALBERLA_ASSERT_FLOAT_EQUAL(localSendAABB.volume(), localRecvAABB.volume())
+
+      center += (localShift + normalShiftVector);
+
+      std::array<bool, 3> virtualPeriodicity{false};
+      virtualPeriodicity[normalDir_] = true;
+      virtualPeriodicity[shiftDir_] = true;
+
+      domain_decomposition::mapPointToPeriodicDomain( virtualPeriodicity, sbf->getDomain(), center );
+
+      uint_t rank{};
+
+      BlockID id{};
+
+      const auto blockInformation = sbf->getBlockInformation();
+      WALBERLA_ASSERT(blockInformation.active())
+
+      blockInformation.getProcess(rank, center[0], center[1], center[2]);
+
+      blockInformation.getId(id, center[0], center[1], center[2]);
+
+      // define unique send / receive tags for communication
+
+      const int atMaxTagSend = normalShift < 0 ? 0 : 1;
+      const int atMaxTagRecv = normalShift < 0 ? 1 : 0;
+
+      const int sendTag = ((int_c(blockID.getID()) + int_c(id.getID() * numGlobalBlocks_)) * 2 + atMaxTagSend) * 2 + 0;
+      const int recvTag = ((int_c(id.getID()) + int_c(blockID.getID() * numGlobalBlocks_)) * 2 + atMaxTagRecv) * 2 + 0;
+
+      auto block = sbf->getBlock(blockID);
+
+      sendAABBs_[blockID].emplace_back(mpi::MPIRank(rank), id, globalAABBToLocalCI(localSendAABB, sbf, block), sendTag);
+      recvAABBs_[blockID].emplace_back(mpi::MPIRank(rank), id, globalAABBToLocalCI(localRecvAABB, sbf, block), recvTag);
+
+      WALBERLA_LOG_DETAIL("blockID = " << blockID.getID() << ", normalShift = " << normalShift
+                                       << "\n\tsendRank = " << rank << "\tsendID = " << id.getID() << "\tsendTag = " << sendTag << "\taabb = " << localSendAABB
+                                       << "\n\trecvRank = " << rank << "\trecvID = " << id.getID() << "\trecvTag = " << recvTag << "\taabb = " << localRecvAABB
+      )
+   }
+
+   /*!
+    * Determines which blocks are participating in the boundary condition / communication, and calls the appropriate
+    * functions to extract and save the communication information.
+    */
+   void setupPeriodicity() {
+
+      const auto sbf = blockForest_.lock();
+      WALBERLA_ASSERT_NOT_NULLPTR( sbf )
+
+      auto & blockInformation = sbf->getBlockInformation();
+      WALBERLA_ASSERT(blockInformation.active())
+      std::vector<std::shared_ptr<IBlockID>> globalBlocks;
+      blockInformation.getAllBlocks(globalBlocks);
+      numGlobalBlocks_ = globalBlocks.size();
+
+      // get blocks of current processor
+      std::vector<IBlock*> localBlocks;
+      sbf->getBlocks(localBlocks);
+
+      const auto nGL = real_c(fieldGhostLayers_);
+
+      const bool shiftWholeBlock = (shift_[shiftDir_] % ShiftType(sbf->getNumberOfCells(*localBlocks[0], shiftDir_))) == 0;
+
+      // get f-size
+      {
+         auto* field = localBlocks[0]->getData<FieldType>(fieldID_);
+         WALBERLA_ASSERT_NOT_NULLPTR(field)
+         fSize_ = cell_idx_c(field->fSize());
+      }
+
+      for( auto block : localBlocks ) {
+
+         // get minimal ghost layer region (in normal direction)
+         const auto blockAABB = block->getAABB();
+         const auto blockID = dynamic_cast<Block*>(block)->getId();
+
+         const bool atMin = sbf->atDomainMinBorder(normalDir_, *block);
+         const bool atMax = sbf->atDomainMaxBorder(normalDir_, *block);
+
+         if(atMin) {
+            if(shiftWholeBlock) {
+               processSingleAABB(blockAABB, sbf, nGL, blockID, -1);
+            } else {
+               processDoubleAABB(blockAABB, sbf, nGL, blockID, -1);
+            }
+         }
+         if(atMax)  {
+            if(shiftWholeBlock) {
+               processSingleAABB(blockAABB, sbf, nGL, blockID, +1);
+            } else {
+               processDoubleAABB(blockAABB, sbf, nGL, blockID, +1);
+            }
+         }
+
+      }
+
+   }
+
+   Vector3<cell_idx_t> toCellVector( const Vector3<real_t> & point ) {
+      return Vector3<cell_idx_t >{ cell_idx_c(point[0]), cell_idx_c(point[1]), cell_idx_c(point[2]) };
+   }
+
+   CellInterval globalAABBToLocalCI( const AABB & aabb, const std::shared_ptr<StructuredBlockForest> & sbf, IBlock * const block ) {
+      auto globalCI = CellInterval{toCellVector(aabb.min()), toCellVector(aabb.max()) - Vector3<cell_idx_t>(1, 1, 1)};
+      CellInterval localCI;
+      sbf->transformGlobalToBlockLocalCellInterval(localCI, *block, globalCI);
+
+      return localCI;
+   }
+
+   template< typename tPair >
+   struct second_t {
+      typename tPair::second_type operator()( const tPair& p ) const { return p.second; }
+   };
+   template< typename tMap >
+   second_t< typename tMap::value_type > second( const tMap& ) { return second_t< typename tMap::value_type >(); }
+
+
+   std::weak_ptr<StructuredBlockForest> blockForest_;
+
+   uint_t normalDir_;
+   uint_t shiftDir_;
+   Vector3<ShiftType> shift_;
+
+   // for each local block, stores the ranks where to send / receive, the corresponding block IDs,
+   // the local AABBs that need to be packed / unpacked, and a unique tag for communication
+   std::map<BlockID, std::vector<std::tuple<mpi::MPIRank, BlockID, CellInterval, int>>> sendAABBs_{};
+   std::map<BlockID, std::vector<std::tuple<mpi::MPIRank, BlockID, CellInterval, int>>> recvAABBs_{};
+
+   bool setupPeriodicity_{false};
+   uint_t numGlobalBlocks_{};
+
+ protected:
+
+   const BlockDataID fieldID_;
+
+   const uint_t fieldGhostLayers_;
+   cell_idx_t fSize_;
+
+}; // class ShiftedPeriodicityBase
+
+
+//*******************************************************************************************************************
+/*!
+ * A periodicity boundary condition that adds a user-defined spatial shift to the field when applied.
+ * This shift can prevent the locking of large-scale turbulent features in the flow direction, see e.g.,
+ * Munters et al. (https://doi.org/10.1063/1.4941912).
+ *
+ * Periodicity defined in the blockforest must be turned off in the normal-direction.
+ *
+ * This class handles the CPU-specific packing and unpacking of the communication buffers.
+ *
+ * @tparam GhostLayerField_T Type of the ghost-layer field that is shifted periodically
+ */
+//*******************************************************************************************************************
+template<typename GhostLayerField_T>
+class ShiftedPeriodicity : public ShiftedPeriodicityBase<ShiftedPeriodicity<GhostLayerField_T>, GhostLayerField_T> {
+
+   using Base = ShiftedPeriodicityBase<ShiftedPeriodicity<GhostLayerField_T>, GhostLayerField_T>;
+   friend Base;
+
+ public:
+
+   using ValueType = typename GhostLayerField_T::value_type;
+   using ShiftType = typename Base::ShiftType;
+
+   ShiftedPeriodicity( const std::weak_ptr<StructuredBlockForest> & blockForest,
+                       const BlockDataID & fieldID, const uint_t fieldGhostLayers,
+                      const uint_t normalDir, const uint_t shiftDir, const ShiftType shiftValue )
+      : Base( blockForest, fieldID, fieldGhostLayers, normalDir, shiftDir, shiftValue )
+   {}
+
+ private:
+
+   void packBuffer(IBlock * const block, const CellInterval & ci,
+                   std::vector<ValueType> & buffer) {
+
+      // get field
+      auto field = block->getData<GhostLayerField_T>(this->fieldID_);
+      WALBERLA_ASSERT_NOT_NULLPTR(field)
+
+      auto bufferIt = buffer.begin();
+
+      // forward iterate over ci and add values to value vector
+      for(auto cellIt = ci.begin(); cellIt != ci.end(); ++cellIt) {
+         for(cell_idx_t f = 0; f < this->fSize_; ++f, ++bufferIt) {
+            WALBERLA_ASSERT(field->coordinatesValid(cellIt->x(), cellIt->y(), cellIt->z(), f))
+            *bufferIt = field->get(*cellIt, f);
+         }
+      }
+
+   }
+
+   void unpackBuffer(IBlock* const block, const CellInterval & ci,
+                     const std::vector<ValueType> & buffer) {
+
+      // get field
+      auto field = block->getData<GhostLayerField_T>(this->fieldID_);
+      WALBERLA_ASSERT_NOT_NULLPTR(field)
+
+      auto bufferIt = buffer.begin();
+
+      // forward iterate over ci and add values to value vector
+      for(auto cellIt = ci.begin(); cellIt != ci.end(); ++cellIt) {
+         for(cell_idx_t f = 0; f < this->fSize_; ++f, ++bufferIt) {
+            WALBERLA_ASSERT(field->coordinatesValid(cellIt->x(), cellIt->y(), cellIt->z(), f))
+            field->get(*cellIt, f) = *bufferIt;
+         }
+      }
+
+   }
+
+}; // class ShiftedPeriodicity
+
+} // namespace lbm
+} // namespace walberla
diff --git a/src/gpu/CMakeLists.txt b/src/gpu/CMakeLists.txt
index fb6810d4e9241c476967bd430a9440b7b6eee86f..e870cdb5de4b783d1866fce23297e21f6bedff86 100644
--- a/src/gpu/CMakeLists.txt
+++ b/src/gpu/CMakeLists.txt
@@ -38,6 +38,8 @@ target_sources( gpu
       ParallelStreams.h
       GPURAII.h
       DeviceSelectMPI.cpp
+      ShiftedPeriodicity.cu
+      ShiftedPeriodicity.h
       )
 
 # sources only for CUDA
diff --git a/src/gpu/FieldAccessor.h b/src/gpu/FieldAccessor.h
index d737983d1aa5f289d624cd508eea8a7969a5fe69..e34d9c3c890c2bc316f629fa97f83453bb5c5d68 100644
--- a/src/gpu/FieldAccessor.h
+++ b/src/gpu/FieldAccessor.h
@@ -56,29 +56,29 @@ namespace gpu
               fOffset_(fOffset), indexingScheme_(indexingScheme )
       {}
 
-      __device__ void set( uint3 blockIdx, uint3 threadIdx )
+      __device__ void set( uint3 _blockIdx, uint3 _threadIdx )
       {
          switch ( indexingScheme_)
          {
-            case FZYX: ptr_ += blockIdx.z * fOffset_ + blockIdx.y * zOffset_ + blockIdx.x * yOffset_ + threadIdx.x * xOffset_; break;
-            case FZY : ptr_ +=                         blockIdx.y * fOffset_ + blockIdx.x * zOffset_ + threadIdx.x * yOffset_; break;
-            case FZ  : ptr_ +=                                                 blockIdx.x * fOffset_ + threadIdx.x * zOffset_; break;
-            case F   : ptr_ +=                                                                         threadIdx.x * fOffset_; break;
-
-            case ZYXF: ptr_ += blockIdx.z * zOffset_ + blockIdx.y * yOffset_ + blockIdx.x * xOffset_ + threadIdx.x * fOffset_; break;
-            case ZYX : ptr_ +=                         blockIdx.y * zOffset_ + blockIdx.x * yOffset_ + threadIdx.x * xOffset_; break;
-            case ZY  : ptr_ +=                                                 blockIdx.x * zOffset_ + threadIdx.x * yOffset_; break;
-            case Z   : ptr_ +=                                                                         threadIdx.x * zOffset_; break;
+            case FZYX: ptr_ += _blockIdx.z * fOffset_ + _blockIdx.y * zOffset_ + _blockIdx.x * yOffset_ + _threadIdx.x * xOffset_; break;
+            case FZY : ptr_ +=                          _blockIdx.y * fOffset_ + _blockIdx.x * zOffset_ + _threadIdx.x * yOffset_; break;
+            case FZ  : ptr_ +=                                                   _blockIdx.x * fOffset_ + _threadIdx.x * zOffset_; break;
+            case F   : ptr_ +=                                                                            _threadIdx.x * fOffset_; break;
+
+            case ZYXF: ptr_ += _blockIdx.z * zOffset_ + _blockIdx.y * yOffset_ + _blockIdx.x * xOffset_ + _threadIdx.x * fOffset_; break;
+            case ZYX : ptr_ +=                          _blockIdx.y * zOffset_ + _blockIdx.x * yOffset_ + _threadIdx.x * xOffset_; break;
+            case ZY  : ptr_ +=                                                   _blockIdx.x * zOffset_ + _threadIdx.x * yOffset_; break;
+            case Z   : ptr_ +=                                                                            _threadIdx.x * zOffset_; break;
          }
       }
 
 
-      __device__ uint_t getLinearIndex( uint3 blockIdx, uint3 threadIdx, uint3 gridDim, uint3 blockDim )
+      __device__ uint_t getLinearIndex( uint3 _blockIdx, uint3 _threadIdx, uint3 _gridDim, uint3 _blockDim )
       {
-         return threadIdx.x                              +
-                blockIdx.x * blockDim.x                  +
-                blockIdx.y * blockDim.x * gridDim.x      +
-                blockIdx.z * blockDim.x * gridDim.x * gridDim.y ;
+         return _threadIdx.x                                +
+                _blockIdx.x * _blockDim.x                   +
+                _blockIdx.y * _blockDim.x * _gridDim.x      +
+                _blockIdx.z * _blockDim.x * _gridDim.x * _gridDim.y ;
       }
 
       // This is always true for this specific field indexing class.
diff --git a/src/gpu/FieldAccessor3D.h b/src/gpu/FieldAccessor3D.h
index 66e95f7242ce0c183ab261f7dcd1600a57df10bb..5f32a43c4c110bd0589dc98e65c6cb7b7e8f7bc5 100644
--- a/src/gpu/FieldAccessor3D.h
+++ b/src/gpu/FieldAccessor3D.h
@@ -38,17 +38,17 @@ namespace gpu
                        uint_t yOffset,
                        uint_t zOffset,
                        uint_t fOffset,
-                       const dim3 & idxDim,
-                       const dim3 & blockDim )
+                       const dim3 & _idxDim,
+                       const dim3 & _blockDim )
             : ptr_( ptr ), xOffset_( xOffset ), yOffset_( yOffset ), zOffset_( zOffset ), fOffset_( fOffset ),
-              idxDim_( idxDim ), blockDim_( blockDim )
+              idxDim_( _idxDim ), blockDim_( _blockDim )
       {}
 
-      __device__ __forceinline__ void set( const uint3& blockIdx, const uint3& threadIdx )
+      __device__ __forceinline__ void set( const uint3& _blockIdx, const uint3& _threadIdx )
       {
-         uint_t x = blockIdx.x * blockDim_.x + threadIdx.x;
-         uint_t y = blockIdx.y * blockDim_.y + threadIdx.y;
-         uint_t z = blockIdx.z * blockDim_.z + threadIdx.z;
+         uint_t x = _blockIdx.x * blockDim_.x + _threadIdx.x;
+         uint_t y = _blockIdx.y * blockDim_.y + _threadIdx.y;
+         uint_t z = _blockIdx.z * blockDim_.z + _threadIdx.z;
 
          if ( x < idxDim_.x && y < idxDim_.y && z < idxDim_.z )
          {
diff --git a/src/gpu/FieldAccessorXYZ.h b/src/gpu/FieldAccessorXYZ.h
index d9046a783d3b5c073b03527332cff553edf1e99b..73c7a0a051929b17434b7d79c2a0fd00c3728507 100644
--- a/src/gpu/FieldAccessorXYZ.h
+++ b/src/gpu/FieldAccessorXYZ.h
@@ -37,11 +37,11 @@ namespace gpu
             : ptr_(ptr), xOffset_(xOffset), yOffset_(yOffset), zOffset_(zOffset), fOffset_(fOffset)
       {}
 
-      __device__ void set( uint3 blockIdx, uint3 threadIdx )
+      __device__ void set( uint3 _blockIdx, uint3 _threadIdx )
       {
-         ptr_ += threadIdx.x * xOffset_ +
-                 blockIdx.x  * yOffset_ +
-                 blockIdx.y  * zOffset_ ;
+         ptr_ += _threadIdx.x * xOffset_ +
+                 _blockIdx.x  * yOffset_ +
+                 _blockIdx.y  * zOffset_ ;
       }
 
       __device__ T & get()       { return * (T*)(ptr_);                }
diff --git a/src/gpu/ShiftedPeriodicity.cu b/src/gpu/ShiftedPeriodicity.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d2ae3f70d8aa1336eade55b42cbc32b2695fc4bc
--- /dev/null
+++ b/src/gpu/ShiftedPeriodicity.cu
@@ -0,0 +1,53 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file ShiftedPeriodicity.cu
+//! \ingroup gpu
+//! \author Helen Schottenhamml <helen.schottenhamml@fau.de>
+//
+//======================================================================================================================
+
+#include "gpu/ShiftedPeriodicity.h"
+
+namespace walberla {
+namespace gpu
+{
+
+namespace internal {
+#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
+   __global__ void packBufferGPU( gpu::FieldAccessor<real_t> fa, real_t * const buffer ) {
+      fa.set(blockIdx, threadIdx);
+      if(fa.isValidPosition()) {
+         buffer[fa.getLinearIndex(blockIdx, threadIdx, gridDim, blockDim)] = fa.get();
+      }
+   }
+   __global__ void unpackBufferGPU( gpu::FieldAccessor<real_t> fa, const real_t * const buffer ) {
+      fa.set(blockIdx, threadIdx);
+      if(fa.isValidPosition()) {
+         fa.get() = buffer[fa.getLinearIndex(blockIdx, threadIdx, gridDim, blockDim)];
+      }
+   }
+#else
+   __global__ void packBufferGPU( gpu::FieldAccessor<real_t>, real_t * const ) {
+      WALBERLA_ABORT("gpu/ShiftedPeriodicity only supported when built with GPU support. Please use boundary/ShiftedPeriodicity on CPUs")
+   }
+   __global__ void unpackBufferGPU( gpu::FieldAccessor<real_t>, const real_t * const ) {
+      WALBERLA_ABORT("gpu/ShiftedPeriodicity only supported when built with GPU support. Please use boundary/ShiftedPeriodicity on CPUs")
+   }
+#endif
+}
+
+} // namespace gpu
+} // namespace walberla
diff --git a/src/gpu/ShiftedPeriodicity.h b/src/gpu/ShiftedPeriodicity.h
new file mode 100644
index 0000000000000000000000000000000000000000..cf46577c21e688aaa375f15eddfd969e4eb6f669
--- /dev/null
+++ b/src/gpu/ShiftedPeriodicity.h
@@ -0,0 +1,144 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file ShiftedPeriodicity.h
+//! \ingroup gpu
+//! \author Helen Schottenhamml <helen.schottenhamml@fau.de>
+//
+//======================================================================================================================
+
+#include "blockforest/StructuredBlockForest.h"
+
+#include "boundary/ShiftedPeriodicity.h"
+
+#include "core/DataTypes.h"
+#include "core/cell/CellInterval.h"
+#include "core/debug/Debug.h"
+#include "core/math/Vector3.h"
+
+#include "domain_decomposition/BlockDataID.h"
+#include "domain_decomposition/IBlock.h"
+
+#include "gpu/DeviceWrapper.h"
+#include "gpu/FieldAccessor.h"
+#include "gpu/FieldIndexing.h"
+#include "gpu/GPUField.h"
+#include "gpu/GPUWrapper.h"
+#include "gpu/Kernel.h"
+
+#include <cstdlib>
+#include <device_launch_parameters.h>
+#include <memory>
+#include <vector>
+
+#include "ErrorChecking.h"
+
+namespace walberla {
+namespace gpu
+{
+
+namespace internal {
+   // GPU kernels - can be extended for other data types
+   __global__ void packBufferGPU( gpu::FieldAccessor<real_t> fa, real_t * const buffer );
+   __global__ void unpackBufferGPU( gpu::FieldAccessor<real_t> fa, const real_t * const buffer );
+}
+
+//*******************************************************************************************************************
+/*!
+ * A periodicity boundary condition that adds a user-defined spatial shift to the field when applied.
+ * This shift can prevent the locking of large-scale turbulent features in the flow direction, see e.g.,
+ * Munters et al. (https://doi.org/10.1063/1.4941912).
+ *
+ * Periodicity defined in the blockforest must be turned off in the normal-direction.
+ *
+ * This class handles the GPU-specific packing and unpacking of the communication buffers.
+ *
+ * @tparam GhostLayerField_T Type of the ghost-layer field that is shifted periodically
+ */
+//*******************************************************************************************************************
+template< typename GPUField_T >
+class ShiftedPeriodicityGPU : public boundary::ShiftedPeriodicityBase<ShiftedPeriodicityGPU<GPUField_T>, GPUField_T> {
+
+   using Base = boundary::ShiftedPeriodicityBase<ShiftedPeriodicityGPU<GPUField_T>, GPUField_T>;
+   friend Base;
+
+ public:
+
+   using ValueType = typename GPUField_T::value_type;
+   using ShiftType = typename Base::ShiftType;
+   using FieldIdx_T = gpu::FieldIndexing<ValueType>;
+
+   ShiftedPeriodicityGPU(const std::weak_ptr< StructuredBlockForest >& blockForest,
+                         const BlockDataID& fieldID, const uint_t fieldGhostLayers,
+                         const uint_t normalDir, const uint_t shiftDir, const ShiftType shiftValue)
+      : Base(blockForest, fieldID, fieldGhostLayers, normalDir, shiftDir, shiftValue)
+   {}
+
+
+ private:
+
+   void packBuffer(IBlock* const block, const CellInterval& ci, std::vector< ValueType >& h_buffer) {
+
+      // get field
+      auto d_field = block->getData< GPUField_T >(this->fieldID_);
+      WALBERLA_ASSERT_NOT_NULLPTR(d_field)
+
+      const uint_t nValues = ci.numCells() * uint_c(this->fSize_);
+
+      // create GPU buffer
+      ValueType * d_buffer{};
+      WALBERLA_GPU_CHECK(gpuMalloc(&d_buffer, nValues * sizeof(ValueType)))
+
+      // fill buffer on GPU
+      auto packKernel = gpu::make_kernel( &internal::packBufferGPU );
+      packKernel.addFieldIndexingParam( FieldIdx_T::interval( *d_field, ci, 0, this->fSize_ ) );
+      packKernel.addParam<real_t*>(d_buffer);
+      packKernel();
+
+      // copy from device to host buffer
+      WALBERLA_GPU_CHECK(gpuMemcpy(h_buffer.data(), d_buffer, nValues * sizeof(ValueType), gpuMemcpyDeviceToHost))
+
+      WALBERLA_GPU_CHECK(gpuFree(d_buffer))
+
+   }
+
+   void unpackBuffer(IBlock* const block, const CellInterval& ci, const std::vector< ValueType >& h_buffer) {
+
+      // get field
+      auto d_field = block->getData< GPUField_T >(this->fieldID_);
+      WALBERLA_ASSERT_NOT_NULLPTR(d_field)
+
+      const uint_t nValues = ci.numCells() * uint_c(this->fSize_);
+
+      // create GPU buffer
+      ValueType * d_buffer{};
+      WALBERLA_GPU_CHECK(gpuMalloc(&d_buffer, nValues * sizeof(ValueType)))
+
+      // copy from host to device buffer
+      WALBERLA_GPU_CHECK(gpuMemcpy(d_buffer, h_buffer.data(), nValues * sizeof(ValueType), gpuMemcpyHostToDevice))
+
+      // unpack buffer on GPU
+      auto unpackKernel = gpu::make_kernel( &internal::unpackBufferGPU );
+      unpackKernel.addFieldIndexingParam( FieldIdx_T::interval( *d_field, ci, 0, this->fSize_ ) );
+      unpackKernel.addParam<const real_t*>(d_buffer);
+      unpackKernel();
+
+      WALBERLA_GPU_CHECK(gpuFree(d_buffer))
+   }
+
+}; // class ShiftedPeriodicity
+
+} // namespace gpu
+} // namespace walberla
diff --git a/tests/boundary/CMakeLists.txt b/tests/boundary/CMakeLists.txt
index 82ced321b3efd91cd261ad3567cf0b067a457775..8dd843d974f81d9e7ce4521f2f88604596bd618d 100644
--- a/tests/boundary/CMakeLists.txt
+++ b/tests/boundary/CMakeLists.txt
@@ -7,3 +7,14 @@
 
 waLBerla_compile_test( FILES BoundaryHandling.cpp )
 waLBerla_execute_test( NAME BoundaryHandling )
+
+if (WALBERLA_BUILD_WITH_PYTHON)
+
+    waLBerla_link_files_to_builddir( *.py )
+
+    waLBerla_compile_test( FILES TestShiftedPeriodicity.cpp DEPENDS blockforest field python_coupling )
+    waLBerla_execute_test( NAME TestShiftedPeriodicity1 COMMAND $<TARGET_FILE:TestShiftedPeriodicity> ${CMAKE_CURRENT_SOURCE_DIR}/TestShiftedPeriodicitySetup.py )
+    waLBerla_execute_test( NAME TestShiftedPeriodicity2 COMMAND $<TARGET_FILE:TestShiftedPeriodicity> ${CMAKE_CURRENT_SOURCE_DIR}/TestShiftedPeriodicitySetup.py PROCESSES 2 )
+    waLBerla_execute_test( NAME TestShiftedPeriodicity4 COMMAND $<TARGET_FILE:TestShiftedPeriodicity> ${CMAKE_CURRENT_SOURCE_DIR}/TestShiftedPeriodicitySetup.py PROCESSES 4 )
+
+endif()
\ No newline at end of file
diff --git a/tests/boundary/TestShiftedPeriodicity.cpp b/tests/boundary/TestShiftedPeriodicity.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2351a2f37896cbaaa8d725701b3c33f2dd2396d8
--- /dev/null
+++ b/tests/boundary/TestShiftedPeriodicity.cpp
@@ -0,0 +1,310 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file TestShiftedPeriodicity.cpp
+//! \ingroup boundary
+//! \author Helen Schottenhamml <helen.schottenhamml@fau.de>
+//
+//======================================================================================================================
+
+#include "blockforest/StructuredBlockForest.h"
+
+#include "core/cell/Cell.h"
+#include "core/cell/CellInterval.h"
+#include "core/debug/CheckFunctions.h"
+#include "core/logging/Initialization.h"
+#include "core/math/Vector3.h"
+#include "core/mpi/Environment.h"
+#include "core/mpi/MPIWrapper.h"
+#include "core/mpi/MPIManager.h"
+#include "core/mpi/Operation.h"
+#include "core/mpi/Reduce.h"
+
+#include "domain_decomposition/BlockDataID.h"
+#include "domain_decomposition/IBlock.h"
+
+#include "field/Layout.h"
+//#include "field/vtk/VTKWriter.h"
+
+#include "python_coupling/CreateConfig.h"
+
+#include "stencil/D3Q27.h"
+#include "stencil/Directions.h"
+
+#include <blockforest/Initialization.h>
+#include <boundary/ShiftedPeriodicity.h>
+#include <core/DataTypes.h>
+#include <core/debug/Debug.h>
+#include <core/debug/TestSubsystem.h>
+#include <field/AddToStorage.h>
+#include <field/GhostLayerField.h>
+#include <memory>
+#include <vector>
+
+namespace walberla {
+
+using Stencil_T = stencil::D3Q27;
+
+using ValueType_T = real_t;
+using Field_T = GhostLayerField< ValueType_T, 3 >;
+
+constexpr cell_idx_t fieldGhostLayers = 1;
+
+//////////
+// MAIN //
+//////////
+
+template< typename FieldType_T >
+class FieldInitialiser {
+
+ public:
+   FieldInitialiser( const std::weak_ptr< StructuredBlockForest > & sbf, const BlockDataID fieldId )
+   : sbf_(sbf), fieldId_(fieldId)
+   {}
+
+   void operator()() {
+
+
+      const auto blocks = sbf_.lock();
+      WALBERLA_ASSERT_NOT_NULLPTR(blocks)
+
+      for ( auto & block : *blocks )
+      {
+         // get field
+         auto * field = block.template getData<FieldType_T>(fieldId_);
+         WALBERLA_ASSERT_NOT_NULLPTR(field)
+
+         // get cell interval
+         auto ci = field->xyzSizeWithGhostLayer();
+
+         for (const auto& cell : ci) {
+
+            // get global coordinates
+            Cell globalCell;
+            blocks->transformBlockLocalToGlobalCell(globalCell, block, cell);
+
+            for (uint_t d = 0; d < FieldType_T::F_SIZE; ++d)
+            {
+               field->get(cell, d) = real_c(  (globalCell.x() + 2 * fieldGhostLayers)
+                                            * (globalCell.y() + 2 * fieldGhostLayers)
+                                            * (globalCell.z() + 2 * fieldGhostLayers)
+                                            * int_c(d + 1));
+            }
+         }
+      }
+
+   }
+
+ private:
+   std::weak_ptr< StructuredBlockForest > sbf_{};
+   const BlockDataID fieldId_{};
+
+};
+
+
+int main( int argc, char **argv ) {
+
+   const mpi::Environment env(argc, argv);
+
+   for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg)
+   {
+      auto config = *cfg;
+      logging::configureLogging(config);
+
+      // create the domain, flag field and vector field (non-uniform initialisation)
+      auto blocks = blockforest::createUniformBlockGridFromConfig(config->getBlock("DomainSetup"), nullptr, true);
+
+      const auto fieldID = field::addToStorage< Field_T >(blocks, "test field", real_t(0), field::Layout::fzyx, fieldGhostLayers);
+      FieldInitialiser< Field_T > initialiser(blocks, fieldID);
+
+      // re-initialise fields
+      initialiser();
+
+      // create periodic shift boundary condition
+
+      const auto spConfig = config->getBlock("Boundaries").getBlock("ShiftedPeriodicity");
+      const uint_t shiftDir = spConfig.getParameter<uint_t>("shiftDir");
+      const int shiftValue = spConfig.getParameter<int>("shiftValue");
+      const uint_t normalDir = spConfig.getParameter<uint_t>("normalDir");
+
+      boundary::ShiftedPeriodicity<Field_T> shiftedPeriodicity(
+         blocks, fieldID, fieldGhostLayers, normalDir, shiftDir, shiftValue
+      );
+
+//      auto vtkOutput = field::createVTKOutput<Field_T>(fieldID, *blocks, "test_field", 1, fieldGhostLayers,
+//                                                         false, "vtk_out", "simulation_step", false, false);
+//      vtkOutput();
+
+      // apply boundary condition and standard communication
+      shiftedPeriodicity();
+
+//      vtkOutput();
+
+      /// compare values
+      // create local domain slices to compare values before and after shift
+
+      const uint_t remDir = 3 - shiftDir - normalDir;
+      const auto shift = shiftedPeriodicity.shift();
+
+      const uint_t shiftSize = blocks->getNumberOfCells(shiftDir);
+      const uint_t remSize   = blocks->getNumberOfCells(remDir);
+      const uint_t sizeSlice = shiftSize * remSize * Field_T::F_SIZE;
+
+      std::vector<ValueType_T> innerMin(sizeSlice, ValueType_T(0));
+      std::vector<ValueType_T> innerMax(sizeSlice, ValueType_T(0));
+      std::vector<ValueType_T> glMin(sizeSlice, ValueType_T(0));
+      std::vector<ValueType_T> glMax(sizeSlice, ValueType_T(0));
+
+      auto getIdx = [&remSize](const cell_idx_t shiftIdx, const cell_idx_t remIdx){
+         return shiftIdx * cell_idx_c(Field_T::F_SIZE * remSize)
+                + remIdx * cell_idx_c(Field_T::F_SIZE);
+      };
+
+      // fill slices for comparing values
+      for(auto & block : *blocks) {
+         const bool atMin = blocks->atDomainMinBorder(normalDir, block);
+         const bool atMax = blocks->atDomainMaxBorder(normalDir, block);
+         if(!atMin && !atMax)
+            continue;
+
+         auto * field = block.getData<Field_T>(fieldID);
+         WALBERLA_ASSERT_NOT_NULLPTR(field)
+
+         // fill innerMin and glMin
+         if(atMin) {
+            Vector3<int> dirVector{};
+            dirVector[normalDir] = -1;
+            const auto dir = stencil::vectorToDirection(dirVector);
+
+            CellInterval innerMinCI;
+            CellInterval glMinCI;
+            field->getSliceBeforeGhostLayer(dir, innerMinCI, fieldGhostLayers, false);
+            field->getGhostRegion(dir, glMinCI, fieldGhostLayers, false);
+
+            // fill inner min
+            for(const auto & cell : innerMinCI){
+               Cell globalCell;
+               blocks->transformBlockLocalToGlobalCell(globalCell, block, cell);
+
+               cell_idx_t idxShiftDir = globalCell[shiftDir] - shift[shiftDir];
+               if(idxShiftDir >= cell_idx_c(shiftSize)) idxShiftDir -= cell_idx_c(shiftSize);
+               if(idxShiftDir <= - int_c(fieldGhostLayers)) idxShiftDir += cell_idx_c(shiftSize);
+
+               const cell_idx_t idx = getIdx(idxShiftDir, cell_idx_c(globalCell[remDir]));
+
+               for(cell_idx_t f = 0; f < cell_idx_c(Field_T::F_SIZE); ++f) {
+                  WALBERLA_ASSERT(field->coordinatesValid(cell.x(), cell.y(), cell.z(), f))
+                  WALBERLA_ASSERT_LESS(idx + f, innerMin.size())
+
+                  innerMin[uint_c(idx + f)] = field->get(cell, f);
+               }
+            }
+
+            // fill gl min
+            for(const auto & cell : glMinCI){
+               Cell globalCell;
+               blocks->transformBlockLocalToGlobalCell(globalCell, block, cell);
+
+               const cell_idx_t idx = getIdx(cell_idx_c(globalCell[shiftDir]), globalCell[remDir]);
+
+               for(cell_idx_t f = 0; f < cell_idx_c(Field_T::F_SIZE); ++f) {
+                  WALBERLA_ASSERT(field->coordinatesValid(cell.x(), cell.y(), cell.z(), f))
+                  WALBERLA_ASSERT_LESS(idx + f, glMin.size())
+
+                  glMin[uint_c(idx + f)] = field->get(cell, f);
+               }
+            }
+         }
+
+         // fill innerMax and glMax
+         if(atMax) {
+            Vector3<int> dirVector{};
+            dirVector[normalDir] = 1;
+            const auto dir = stencil::vectorToDirection(dirVector);
+
+            CellInterval innerMaxCI;
+            CellInterval glMaxCI;
+            field->getSliceBeforeGhostLayer(dir, innerMaxCI, fieldGhostLayers, false);
+            field->getGhostRegion(dir, glMaxCI, fieldGhostLayers, false);
+
+            // fill inner max
+            for(const auto & cell : innerMaxCI){
+               Cell globalCell;
+               blocks->transformBlockLocalToGlobalCell(globalCell, block, cell);
+
+               cell_idx_t idxShiftDir = globalCell[shiftDir] + shift[shiftDir];
+               if(idxShiftDir >= cell_idx_c(shiftSize)) idxShiftDir -= cell_idx_c(shiftSize);
+               if(idxShiftDir <= - int_c(fieldGhostLayers)) idxShiftDir += cell_idx_c(shiftSize);
+
+               const cell_idx_t idx = getIdx(idxShiftDir, cell_idx_c(globalCell[remDir]));
+
+               for(cell_idx_t f = 0; f < cell_idx_c(Field_T::F_SIZE); ++f) {
+                  WALBERLA_ASSERT(field->coordinatesValid(cell.x(), cell.y(), cell.z(), f))
+                  WALBERLA_ASSERT_LESS(idx + f, innerMax.size())
+
+                  innerMax[uint_c(idx + f)] = field->get(cell, f);
+               }
+            }
+
+            // fill gl max
+            for(const auto & cell : glMaxCI){
+               Cell globalCell;
+               blocks->transformBlockLocalToGlobalCell(globalCell, block, cell);
+
+               const cell_idx_t idx = getIdx(cell_idx_c(globalCell[shiftDir]), cell_idx_c(globalCell[remDir]));
+
+               for(cell_idx_t f = 0; f < cell_idx_c(Field_T::F_SIZE); ++f) {
+                  WALBERLA_ASSERT(field->coordinatesValid(cell.x(), cell.y(), cell.z(), f))
+                  WALBERLA_ASSERT_LESS(idx + f, glMax.size())
+
+                  glMax[uint_c(idx + f)] = field->get(cell, f);
+               }
+            }
+         }
+
+
+      }
+
+      WALBERLA_MPI_SECTION() {
+
+         mpi::reduceInplace(innerMin, mpi::SUM);
+         mpi::reduceInplace(innerMax, mpi::SUM);
+         mpi::reduceInplace(glMin, mpi::SUM);
+         mpi::reduceInplace(glMax, mpi::SUM);
+
+      }
+
+      WALBERLA_ROOT_SECTION() {
+         for(uint_t i = 0; i < sizeSlice; ++i) {
+            WALBERLA_CHECK_FLOAT_EQUAL(innerMin[i], glMax[i])
+            WALBERLA_CHECK_FLOAT_EQUAL(innerMax[i], glMin[i])
+         }
+      }
+
+   }
+
+   return 0;
+}
+
+} // namespace walberla
+
+
+
+int main(int argc, char **argv) {
+
+   walberla::debug::enterTestMode();
+
+   return walberla::main(argc,argv);
+}
diff --git a/tests/boundary/TestShiftedPeriodicitySetup.py b/tests/boundary/TestShiftedPeriodicitySetup.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ca890c22f56abb1449ad2f55d63c1350fa029b9
--- /dev/null
+++ b/tests/boundary/TestShiftedPeriodicitySetup.py
@@ -0,0 +1,43 @@
+import waLBerla as wlb
+
+
+class Scenario:
+    def __init__(self, normal_dir, shift_dir, shift_value, periodicity):
+        self.normal_dir = normal_dir
+        self.shift_dir = shift_dir
+        self.shift_value = shift_value
+        self.periodicity = tuple(periodicity)
+
+    @wlb.member_callback
+    def config(self):
+
+        return {
+            'DomainSetup': {
+                'blocks': (3, 3, 3),
+                'cellsPerBlock': (4, 4, 4),
+                'cartesianSetup': 0,
+                'periodic': self.periodicity,
+            },
+            'Boundaries': {
+                'ShiftedPeriodicity': {
+                    'shiftDir': self.shift_dir,
+                    'shiftValue': self.shift_value,
+                    'normalDir': self.normal_dir
+                }
+            },
+            'Logging': {
+                'logLevel': "Info"
+            }
+        }
+
+
+scenarios = wlb.ScenarioManager()
+
+for normal_dir in (0, 1, 2):
+    for shift_dir in (0, 1, 2):
+        if normal_dir == shift_dir:
+            continue
+        periodicity = 3 * [0]
+        periodicity[shift_dir] = 1
+        for shift_value in (-3, 0, 2, 5, 8, 15):
+            scenarios.add(Scenario(normal_dir, shift_dir, shift_value, periodicity))
diff --git a/tests/gpu/CMakeLists.txt b/tests/gpu/CMakeLists.txt
index e760cca4db10704cbedfef89a4faca71631ea730..18dab47d95569fd7445f0811b100c94cb5315829 100644
--- a/tests/gpu/CMakeLists.txt
+++ b/tests/gpu/CMakeLists.txt
@@ -56,4 +56,14 @@ waLBerla_generate_target_from_python(NAME CodegenGeneratedGPUFieldPackInfo FILE
 waLBerla_compile_test( FILES codegen/GeneratedFieldPackInfoTestGPU.cpp
         DEPENDS blockforest core field CodegenGeneratedGPUFieldPackInfo )
 waLBerla_execute_test( NAME GeneratedFieldPackInfoTestGPU )
+
+    if (WALBERLA_BUILD_WITH_PYTHON)
+
+        waLBerla_link_files_to_builddir( *.py )
+
+        waLBerla_compile_test( FILES TestShiftedPeriodicityGPU.cpp DEPENDS gpu blockforest field python_coupling )
+        waLBerla_execute_test( NAME TestShiftedPeriodicityGPU COMMAND $<TARGET_FILE:TestShiftedPeriodicityGPU> ${CMAKE_CURRENT_SOURCE_DIR}/TestShiftedPeriodicitySetupGPU.py )
+
+    endif()
+
 endif()
diff --git a/tests/gpu/TestShiftedPeriodicityGPU.cpp b/tests/gpu/TestShiftedPeriodicityGPU.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..02cd9777e9535e214a18303b929ae9289fa3f562
--- /dev/null
+++ b/tests/gpu/TestShiftedPeriodicityGPU.cpp
@@ -0,0 +1,319 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file TestShiftedPeriodicityGPU.cpp
+//! \ingroup gpu
+//! \author Helen Schottenhamml <helen.schottenhamml@fau.de>
+//
+//======================================================================================================================
+
+#include "blockforest/StructuredBlockForest.h"
+
+#include "core/cell/Cell.h"
+#include "core/cell/CellInterval.h"
+#include "core/debug/CheckFunctions.h"
+#include "core/logging/Initialization.h"
+#include "core/math/Vector3.h"
+#include "core/mpi/Environment.h"
+#include "core/mpi/MPIWrapper.h"
+#include "core/mpi/MPIManager.h"
+#include "core/mpi/Operation.h"
+#include "core/mpi/Reduce.h"
+
+#include "domain_decomposition/BlockDataID.h"
+#include "domain_decomposition/IBlock.h"
+
+#include "field/Layout.h"
+#include "field/vtk/VTKWriter.h"
+
+#include "python_coupling/CreateConfig.h"
+
+#include "stencil/D3Q27.h"
+#include "stencil/Directions.h"
+
+#include <blockforest/Initialization.h>
+#include <gpu/ShiftedPeriodicity.h>
+#include <gpu/AddGPUFieldToStorage.h>
+#include <gpu/FieldCopy.h>
+#include <core/DataTypes.h>
+#include <core/debug/Debug.h>
+#include <core/debug/TestSubsystem.h>
+#include <field/AddToStorage.h>
+#include <field/GhostLayerField.h>
+#include <memory>
+#include <vector>
+
+namespace walberla {
+
+using Stencil_T = stencil::D3Q27;
+
+using ValueType_T = real_t;
+using Field_T = GhostLayerField< ValueType_T, 3 >;
+using GPUField_T = gpu::GPUField< ValueType_T >;
+
+constexpr cell_idx_t fieldGhostLayers = 1;
+
+//////////
+// MAIN //
+//////////
+
+template< typename FieldType_T >
+class FieldInitialiser {
+
+ public:
+   FieldInitialiser( const std::weak_ptr< StructuredBlockForest > & sbf, const BlockDataID fieldId )
+   : sbf_(sbf), fieldId_(fieldId)
+   {}
+
+   void operator()() {
+
+
+      const auto blocks = sbf_.lock();
+      WALBERLA_ASSERT_NOT_NULLPTR(blocks)
+
+      for ( auto & block : *blocks )
+      {
+         // get field
+         auto * field = block.template getData<FieldType_T>(fieldId_);
+         WALBERLA_ASSERT_NOT_NULLPTR(field)
+
+         // get cell interval
+         auto ci = field->xyzSizeWithGhostLayer();
+
+         for (const auto& cell : ci) {
+
+            // get global coordinates
+            Cell globalCell;
+            blocks->transformBlockLocalToGlobalCell(globalCell, block, cell);
+
+            for (uint_t d = 0; d < FieldType_T::F_SIZE; ++d)
+            {
+               field->get(cell, d) = real_c(  (globalCell.x() + 2 * fieldGhostLayers)
+                                            * (globalCell.y() + 2 * fieldGhostLayers)
+                                            * (globalCell.z() + 2 * fieldGhostLayers)
+                                            * int_c(d + 1));
+            }
+         }
+      }
+
+   }
+
+ private:
+   std::weak_ptr< StructuredBlockForest > sbf_{};
+   const BlockDataID fieldId_{};
+
+};
+
+
+int main( int argc, char **argv ) {
+
+   const mpi::Environment env(argc, argv);
+
+   for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg)
+   {
+      auto config = *cfg;
+      logging::configureLogging(config);
+
+      // create the domain, flag field and vector field (non-uniform initialisation)
+      auto blocks = blockforest::createUniformBlockGridFromConfig(config->getBlock("DomainSetup"), nullptr, true);
+
+      const auto h_fieldID = field::addToStorage< Field_T >(blocks, "test field CPU", real_t(0), field::Layout::fzyx, fieldGhostLayers);
+      FieldInitialiser< Field_T > initialiser(blocks, h_fieldID);
+
+      const auto d_fieldID = gpu::addGPUFieldToStorage< Field_T >(blocks, h_fieldID, "test field GPU");
+
+      // re-initialise fields
+      initialiser();
+
+      // create periodic shift boundary condition
+
+      const auto spConfig = config->getBlock("Boundaries").getBlock("ShiftedPeriodicity");
+      const uint_t shiftDir = spConfig.getParameter<uint_t>("shiftDir");
+      const int shiftValue = spConfig.getParameter<int>("shiftValue");
+      const uint_t normalDir = spConfig.getParameter<uint_t>("normalDir");
+      ;
+      gpu::ShiftedPeriodicityGPU<GPUField_T> shiftedPeriodicity(
+         blocks, d_fieldID, fieldGhostLayers, normalDir, shiftDir, shiftValue
+      );
+
+//      auto vtkOutput = field::createVTKOutput<Field_T>(fieldID, *blocks, "test_field", 1, fieldGhostLayers,
+//                                                         false, "vtk_out", "simulation_step", false, false);
+//      vtkOutput();
+
+      // apply boundary condition and standard communication
+      shiftedPeriodicity();
+
+//      vtkOutput();
+
+      /// compare values
+
+      // copy resulting GPU to CPU
+      gpu::fieldCpy<Field_T, GPUField_T>(blocks, h_fieldID, d_fieldID);
+
+      // create local domain slices to compare values before and after shift
+
+      const uint_t remDir = 3 - shiftDir - normalDir;
+      const auto shift = shiftedPeriodicity.shift();
+
+      const uint_t shiftSize = blocks->getNumberOfCells(shiftDir);
+      const uint_t remSize   = blocks->getNumberOfCells(remDir);
+      const uint_t sizeSlice = shiftSize * remSize * Field_T::F_SIZE;
+
+      std::vector<ValueType_T> innerMin(sizeSlice, ValueType_T(0));
+      std::vector<ValueType_T> innerMax(sizeSlice, ValueType_T(0));
+      std::vector<ValueType_T> glMin(sizeSlice, ValueType_T(0));
+      std::vector<ValueType_T> glMax(sizeSlice, ValueType_T(0));
+
+      auto getIdx = [&remSize](const cell_idx_t shiftIdx, const cell_idx_t remIdx){
+         return shiftIdx * cell_idx_c(Field_T::F_SIZE * remSize)
+                + remIdx * cell_idx_c(Field_T::F_SIZE);
+      };
+
+      // fill slices for comparing values
+      for(auto & block : *blocks) {
+         const bool atMin = blocks->atDomainMinBorder(normalDir, block);
+         const bool atMax = blocks->atDomainMaxBorder(normalDir, block);
+         if(!atMin && !atMax)
+            continue;
+
+         auto * field = block.getData<Field_T>(h_fieldID);
+         WALBERLA_ASSERT_NOT_NULLPTR(field)
+
+         // fill innerMin and glMin
+         if(atMin) {
+            Vector3<int> dirVector{};
+            dirVector[normalDir] = -1;
+            const auto dir = stencil::vectorToDirection(dirVector);
+
+            CellInterval innerMinCI;
+            CellInterval glMinCI;
+            field->getSliceBeforeGhostLayer(dir, innerMinCI, fieldGhostLayers, false);
+            field->getGhostRegion(dir, glMinCI, fieldGhostLayers, false);
+
+            // fill inner min
+            for(const auto & cell : innerMinCI){
+               Cell globalCell;
+               blocks->transformBlockLocalToGlobalCell(globalCell, block, cell);
+
+               cell_idx_t idxShiftDir = globalCell[shiftDir] - shift[shiftDir];
+               if(idxShiftDir >= cell_idx_c(shiftSize)) idxShiftDir -= cell_idx_c(shiftSize);
+               if(idxShiftDir <= - int_c(fieldGhostLayers)) idxShiftDir += cell_idx_c(shiftSize);
+
+               const cell_idx_t idx = getIdx(idxShiftDir, cell_idx_c(globalCell[remDir]));
+
+               for(cell_idx_t f = 0; f < cell_idx_c(Field_T::F_SIZE); ++f) {
+                  WALBERLA_ASSERT(field->coordinatesValid(cell.x(), cell.y(), cell.z(), f))
+                  WALBERLA_ASSERT_LESS(idx + f, innerMin.size())
+
+                  innerMin[uint_c(idx + f)] = field->get(cell, f);
+               }
+            }
+
+            // fill gl min
+            for(const auto & cell : glMinCI){
+               Cell globalCell;
+               blocks->transformBlockLocalToGlobalCell(globalCell, block, cell);
+
+               const cell_idx_t idx = getIdx(cell_idx_c(globalCell[shiftDir]), globalCell[remDir]);
+
+               for(cell_idx_t f = 0; f < cell_idx_c(Field_T::F_SIZE); ++f) {
+                  WALBERLA_ASSERT(field->coordinatesValid(cell.x(), cell.y(), cell.z(), f))
+                  WALBERLA_ASSERT_LESS(idx + f, glMin.size())
+
+                  glMin[uint_c(idx + f)] = field->get(cell, f);
+               }
+            }
+         }
+
+         // fill innerMax and glMax
+         if(atMax) {
+            Vector3<int> dirVector{};
+            dirVector[normalDir] = 1;
+            const auto dir = stencil::vectorToDirection(dirVector);
+
+            CellInterval innerMaxCI;
+            CellInterval glMaxCI;
+            field->getSliceBeforeGhostLayer(dir, innerMaxCI, fieldGhostLayers, false);
+            field->getGhostRegion(dir, glMaxCI, fieldGhostLayers, false);
+
+            // fill inner max
+            for(const auto & cell : innerMaxCI){
+               Cell globalCell;
+               blocks->transformBlockLocalToGlobalCell(globalCell, block, cell);
+
+               cell_idx_t idxShiftDir = globalCell[shiftDir] + shift[shiftDir];
+               if(idxShiftDir >= cell_idx_c(shiftSize)) idxShiftDir -= cell_idx_c(shiftSize);
+               if(idxShiftDir <= - int_c(fieldGhostLayers)) idxShiftDir += cell_idx_c(shiftSize);
+
+               const cell_idx_t idx = getIdx(idxShiftDir, cell_idx_c(globalCell[remDir]));
+
+               for(cell_idx_t f = 0; f < cell_idx_c(Field_T::F_SIZE); ++f) {
+                  WALBERLA_ASSERT(field->coordinatesValid(cell.x(), cell.y(), cell.z(), f))
+                  WALBERLA_ASSERT_LESS(idx + f, innerMax.size())
+
+                  innerMax[uint_c(idx + f)] = field->get(cell, f);
+               }
+            }
+
+            // fill gl min
+            for(const auto & cell : glMaxCI){
+               Cell globalCell;
+               blocks->transformBlockLocalToGlobalCell(globalCell, block, cell);
+
+               const cell_idx_t idx = getIdx(cell_idx_c(globalCell[shiftDir]), cell_idx_c(globalCell[remDir]));
+
+               for(cell_idx_t f = 0; f < cell_idx_c(Field_T::F_SIZE); ++f) {
+                  WALBERLA_ASSERT(field->coordinatesValid(cell.x(), cell.y(), cell.z(), f))
+                  WALBERLA_ASSERT_LESS(idx + f, glMax.size())
+
+                  glMax[uint_c(idx + f)] = field->get(cell, f);
+               }
+            }
+         }
+
+
+      }
+
+      WALBERLA_MPI_SECTION() {
+
+         mpi::reduceInplace(innerMin, mpi::SUM);
+         mpi::reduceInplace(innerMax, mpi::SUM);
+         mpi::reduceInplace(glMin, mpi::SUM);
+         mpi::reduceInplace(glMax, mpi::SUM);
+
+      }
+
+      WALBERLA_ROOT_SECTION() {
+         for(uint_t i = 0; i < sizeSlice; ++i) {
+            WALBERLA_CHECK_FLOAT_EQUAL(innerMin[i], glMax[i])
+            WALBERLA_CHECK_FLOAT_EQUAL(innerMax[i], glMin[i])
+         }
+      }
+
+   }
+
+   return 0;
+}
+
+} // namespace walberla
+
+
+
+int main(int argc, char **argv) {
+
+   walberla::debug::enterTestMode();
+
+   return walberla::main(argc,argv);
+}
diff --git a/tests/gpu/TestShiftedPeriodicitySetupGPU.py b/tests/gpu/TestShiftedPeriodicitySetupGPU.py
new file mode 100644
index 0000000000000000000000000000000000000000..06443685aa1119fbe6391d89e64ae048e047cd93
--- /dev/null
+++ b/tests/gpu/TestShiftedPeriodicitySetupGPU.py
@@ -0,0 +1,40 @@
+import waLBerla as wlb
+
+
+class Scenario:
+    def __init__(self, normal_dir, shift_dir, shift_value, periodicity):
+        self.normal_dir = normal_dir
+        self.shift_dir = shift_dir
+        self.shift_value = shift_value
+        self.periodicity = tuple(periodicity)
+
+    @wlb.member_callback
+    def config(self):
+
+        return {
+            'DomainSetup': {
+                'blocks': (3, 3, 3),
+                'cellsPerBlock': (4, 4, 4),
+                'cartesianSetup': 0,
+                'periodic': self.periodicity,
+            },
+            'Boundaries': {
+                'ShiftedPeriodicity': {
+                    'shiftDir': self.shift_dir,
+                    'shiftValue': self.shift_value,
+                    'normalDir': self.normal_dir
+                }
+            }
+        }
+
+
+scenarios = wlb.ScenarioManager()
+
+for normal_dir in (0, 1, 2):
+    for shift_dir in (0, 1, 2):
+        if normal_dir == shift_dir:
+            continue
+        periodicity = 3 * [0]
+        periodicity[shift_dir] = 1
+        for shift_value in (-3, 0, 2, 5, 8, 15):
+            scenarios.add(Scenario(normal_dir, shift_dir, shift_value, periodicity))