Shifted Periodicity

eb3f1c0f · Helen Schottenhamml · Girish Kumatagi · 0d2ceae5 · eb3f1c0f · eb3f1c0f
Commit eb3f1c0f authored 7 months ago by Helen Schottenhamml Committed by Girish Kumatagi 7 months ago
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -24,7 +24,7 @@ add_subdirectory( blockforest )
 add_subdirectory( boundary )
 add_subdirectory( communication )
 add_subdirectory( core )
-add_subdirectory(gpu)
+add_subdirectory( gpu )
 add_subdirectory( domain_decomposition )
 add_subdirectory( executiontree )
 if ( WALBERLA_BUILD_WITH_FFT AND FFTW3_FOUND )

--- a/src/boundary/CMakeLists.txt
+++ b/src/boundary/CMakeLists.txt
@@ -14,4 +14,5 @@ target_sources( boundary
      BoundaryHandlingCollection.h
      Boundary.cpp
      BoundaryUID.h
+      ShiftedPeriodicity.h
      )
--- a/src/boundary/ShiftedPeriodicity.h
+++ b/src/boundary/ShiftedPeriodicity.h
--- a/src/gpu/CMakeLists.txt
+++ b/src/gpu/CMakeLists.txt
@@ -38,6 +38,8 @@ target_sources( gpu
      ParallelStreams.h
      GPURAII.h
      DeviceSelectMPI.cpp
+      ShiftedPeriodicity.cu
+      ShiftedPeriodicity.h
      )

 # sources only for CUDA

--- a/src/gpu/FieldAccessor.h
+++ b/src/gpu/FieldAccessor.h
@@ -56,29 +56,29 @@ namespace gpu
              fOffset_(fOffset), indexingScheme_(indexingScheme )
      {}

-      __device__ void set( uint3 blockIdx, uint3 threadIdx )
+      __device__ void set( uint3 _blockIdx, uint3 _threadIdx )
      {
         switch ( indexingScheme_)
         {
-            case FZYX: ptr_ += blockIdx.z * fOffset_ + blockIdx.y * zOffset_ + blockIdx.x * yOffset_ + threadIdx.x * xOffset_; break;
-            case FZY : ptr_ +=                         blockIdx.y * fOffset_ + blockIdx.x * zOffset_ + threadIdx.x * yOffset_; break;
-            case FZ  : ptr_ +=                                                 blockIdx.x * fOffset_ + threadIdx.x * zOffset_; break;
-            case F   : ptr_ +=                                                                         threadIdx.x * fOffset_; break;
-
-            case ZYXF: ptr_ += blockIdx.z * zOffset_ + blockIdx.y * yOffset_ + blockIdx.x * xOffset_ + threadIdx.x * fOffset_; break;
-            case ZYX : ptr_ +=                         blockIdx.y * zOffset_ + blockIdx.x * yOffset_ + threadIdx.x * xOffset_; break;
-            case ZY  : ptr_ +=                                                 blockIdx.x * zOffset_ + threadIdx.x * yOffset_; break;
-            case Z   : ptr_ +=                                                                         threadIdx.x * zOffset_; break;
+            case FZYX: ptr_ += _blockIdx.z * fOffset_ + _blockIdx.y * zOffset_ + _blockIdx.x * yOffset_ + _threadIdx.x * xOffset_; break;
+            case FZY : ptr_ +=                          _blockIdx.y * fOffset_ + _blockIdx.x * zOffset_ + _threadIdx.x * yOffset_; break;
+            case FZ  : ptr_ +=                                                   _blockIdx.x * fOffset_ + _threadIdx.x * zOffset_; break;
+            case F   : ptr_ +=                                                                            _threadIdx.x * fOffset_; break;
+
+            case ZYXF: ptr_ += _blockIdx.z * zOffset_ + _blockIdx.y * yOffset_ + _blockIdx.x * xOffset_ + _threadIdx.x * fOffset_; break;
+            case ZYX : ptr_ +=                          _blockIdx.y * zOffset_ + _blockIdx.x * yOffset_ + _threadIdx.x * xOffset_; break;
+            case ZY  : ptr_ +=                                                   _blockIdx.x * zOffset_ + _threadIdx.x * yOffset_; break;
+            case Z   : ptr_ +=                                                                            _threadIdx.x * zOffset_; break;
         }
      }


-      __device__ uint_t getLinearIndex( uint3 blockIdx, uint3 threadIdx, uint3 gridDim, uint3 blockDim )
+      __device__ uint_t getLinearIndex( uint3 _blockIdx, uint3 _threadIdx, uint3 _gridDim, uint3 _blockDim )
      {
-         return threadIdx.x                              +
-                blockIdx.x * blockDim.x                  +
-                blockIdx.y * blockDim.x * gridDim.x      +
-                blockIdx.z * blockDim.x * gridDim.x * gridDim.y ;
+         return _threadIdx.x                                +
+                _blockIdx.x * _blockDim.x                   +
+                _blockIdx.y * _blockDim.x * _gridDim.x      +
+                _blockIdx.z * _blockDim.x * _gridDim.x * _gridDim.y ;
      }

      // This is always true for this specific field indexing class.

--- a/src/gpu/FieldAccessor3D.h
+++ b/src/gpu/FieldAccessor3D.h
@@ -38,17 +38,17 @@ namespace gpu
                       uint_t yOffset,
                       uint_t zOffset,
                       uint_t fOffset,
-                       const dim3 & idxDim,
-                       const dim3 & blockDim )
+                       const dim3 & _idxDim,
+                       const dim3 & _blockDim )
            : ptr_( ptr ), xOffset_( xOffset ), yOffset_( yOffset ), zOffset_( zOffset ), fOffset_( fOffset ),
-              idxDim_( idxDim ), blockDim_( blockDim )
+              idxDim_( _idxDim ), blockDim_( _blockDim )
      {}

-      __device__ __forceinline__ void set( const uint3& blockIdx, const uint3& threadIdx )
+      __device__ __forceinline__ void set( const uint3& _blockIdx, const uint3& _threadIdx )
      {
-         uint_t x = blockIdx.x * blockDim_.x + threadIdx.x;
-         uint_t y = blockIdx.y * blockDim_.y + threadIdx.y;
-         uint_t z = blockIdx.z * blockDim_.z + threadIdx.z;
+         uint_t x = _blockIdx.x * blockDim_.x + _threadIdx.x;
+         uint_t y = _blockIdx.y * blockDim_.y + _threadIdx.y;
+         uint_t z = _blockIdx.z * blockDim_.z + _threadIdx.z;

         if ( x < idxDim_.x && y < idxDim_.y && z < idxDim_.z )
         {

--- a/src/gpu/FieldAccessorXYZ.h
+++ b/src/gpu/FieldAccessorXYZ.h
@@ -37,11 +37,11 @@ namespace gpu
            : ptr_(ptr), xOffset_(xOffset), yOffset_(yOffset), zOffset_(zOffset), fOffset_(fOffset)
      {}

-      __device__ void set( uint3 blockIdx, uint3 threadIdx )
+      __device__ void set( uint3 _blockIdx, uint3 _threadIdx )
      {
-         ptr_ += threadIdx.x * xOffset_ +
-                 blockIdx.x  * yOffset_ +
-                 blockIdx.y  * zOffset_ ;
+         ptr_ += _threadIdx.x * xOffset_ +
+                 _blockIdx.x  * yOffset_ +
+                 _blockIdx.y  * zOffset_ ;
      }

      __device__ T & get()       { return * (T*)(ptr_);                }

--- a/src/gpu/ShiftedPeriodicity.cu
+++ b/src/gpu/ShiftedPeriodicity.cu
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file ShiftedPeriodicity.cu
+//! \ingroup gpu
+//! \author Helen Schottenhamml <helen.schottenhamml@fau.de>
+//
+//======================================================================================================================
+
+#include "gpu/ShiftedPeriodicity.h"
+
+namespace walberla {
+namespace gpu
+{
+
+namespace internal {
+#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
+   __global__ void packBufferGPU( gpu::FieldAccessor<real_t> fa, real_t * const buffer ) {
+      fa.set(blockIdx, threadIdx);
+      if(fa.isValidPosition()) {
+         buffer[fa.getLinearIndex(blockIdx, threadIdx, gridDim, blockDim)] = fa.get();
+      }
+   }
+   __global__ void unpackBufferGPU( gpu::FieldAccessor<real_t> fa, const real_t * const buffer ) {
+      fa.set(blockIdx, threadIdx);
+      if(fa.isValidPosition()) {
+         fa.get() = buffer[fa.getLinearIndex(blockIdx, threadIdx, gridDim, blockDim)];
+      }
+   }
+#else
+   __global__ void packBufferGPU( gpu::FieldAccessor<real_t>, real_t * const ) {
+      WALBERLA_ABORT("gpu/ShiftedPeriodicity only supported when built with GPU support. Please use boundary/ShiftedPeriodicity on CPUs")
+   }
+   __global__ void unpackBufferGPU( gpu::FieldAccessor<real_t>, const real_t * const ) {
+      WALBERLA_ABORT("gpu/ShiftedPeriodicity only supported when built with GPU support. Please use boundary/ShiftedPeriodicity on CPUs")
+   }
+#endif
+}
+
+} // namespace gpu
+} // namespace walberla
--- a/src/gpu/ShiftedPeriodicity.h
+++ b/src/gpu/ShiftedPeriodicity.h
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file ShiftedPeriodicity.h
+//! \ingroup gpu
+//! \author Helen Schottenhamml <helen.schottenhamml@fau.de>
+//
+//======================================================================================================================
+
+#include "blockforest/StructuredBlockForest.h"
+
+#include "boundary/ShiftedPeriodicity.h"
+
+#include "core/DataTypes.h"
+#include "core/cell/CellInterval.h"
+#include "core/debug/Debug.h"
+#include "core/math/Vector3.h"
+
+#include "domain_decomposition/BlockDataID.h"
+#include "domain_decomposition/IBlock.h"
+
+#include "gpu/DeviceWrapper.h"
+#include "gpu/FieldAccessor.h"
+#include "gpu/FieldIndexing.h"
+#include "gpu/GPUField.h"
+#include "gpu/GPUWrapper.h"
+#include "gpu/Kernel.h"
+
+#include <cstdlib>
+#include <device_launch_parameters.h>
+#include <memory>
+#include <vector>
+
+#include "ErrorChecking.h"
+
+namespace walberla {
+namespace gpu
+{
+
+namespace internal {
+   // GPU kernels - can be extended for other data types
+   __global__ void packBufferGPU( gpu::FieldAccessor<real_t> fa, real_t * const buffer );
+   __global__ void unpackBufferGPU( gpu::FieldAccessor<real_t> fa, const real_t * const buffer );
+}
+
+//*******************************************************************************************************************
+/*!
+ * A periodicity boundary condition that adds a user-defined spatial shift to the field when applied.
+ * This shift can prevent the locking of large-scale turbulent features in the flow direction, see e.g.,
+ * Munters et al. (https://doi.org/10.1063/1.4941912).
+ *
+ * Periodicity defined in the blockforest must be turned off in the normal-direction.
+ *
+ * This class handles the GPU-specific packing and unpacking of the communication buffers.
+ *
+ * @tparam GhostLayerField_T Type of the ghost-layer field that is shifted periodically
+ */
+//*******************************************************************************************************************
+template< typename GPUField_T >
+class ShiftedPeriodicityGPU : public boundary::ShiftedPeriodicityBase<ShiftedPeriodicityGPU<GPUField_T>, GPUField_T> {
+
+   using Base = boundary::ShiftedPeriodicityBase<ShiftedPeriodicityGPU<GPUField_T>, GPUField_T>;
+   friend Base;
+
+ public:
+
+   using ValueType = typename GPUField_T::value_type;
+   using ShiftType = typename Base::ShiftType;
+   using FieldIdx_T = gpu::FieldIndexing<ValueType>;
+
+   ShiftedPeriodicityGPU(const std::weak_ptr< StructuredBlockForest >& blockForest,
+                         const BlockDataID& fieldID, const uint_t fieldGhostLayers,
+                         const uint_t normalDir, const uint_t shiftDir, const ShiftType shiftValue)
+      : Base(blockForest, fieldID, fieldGhostLayers, normalDir, shiftDir, shiftValue)
+   {}
+
+
+ private:
+
+   void packBuffer(IBlock* const block, const CellInterval& ci, std::vector< ValueType >& h_buffer) {
+
+      // get field
+      auto d_field = block->getData< GPUField_T >(this->fieldID_);
+      WALBERLA_ASSERT_NOT_NULLPTR(d_field)
+
+      const uint_t nValues = ci.numCells() * uint_c(this->fSize_);
+
+      // create GPU buffer
+      ValueType * d_buffer{};
+      WALBERLA_GPU_CHECK(gpuMalloc(&d_buffer, nValues * sizeof(ValueType)))
+
+      // fill buffer on GPU
+      auto packKernel = gpu::make_kernel( &internal::packBufferGPU );
+      packKernel.addFieldIndexingParam( FieldIdx_T::interval( *d_field, ci, 0, this->fSize_ ) );
+      packKernel.addParam<real_t*>(d_buffer);
+      packKernel();
+
+      // copy from device to host buffer
+      WALBERLA_GPU_CHECK(gpuMemcpy(h_buffer.data(), d_buffer, nValues * sizeof(ValueType), gpuMemcpyDeviceToHost))
+
+      WALBERLA_GPU_CHECK(gpuFree(d_buffer))
+
+   }
+
+   void unpackBuffer(IBlock* const block, const CellInterval& ci, const std::vector< ValueType >& h_buffer) {
+
+      // get field
+      auto d_field = block->getData< GPUField_T >(this->fieldID_);
+      WALBERLA_ASSERT_NOT_NULLPTR(d_field)
+
+      const uint_t nValues = ci.numCells() * uint_c(this->fSize_);
+
+      // create GPU buffer
+      ValueType * d_buffer{};
+      WALBERLA_GPU_CHECK(gpuMalloc(&d_buffer, nValues * sizeof(ValueType)))
+
+      // copy from host to device buffer
+      WALBERLA_GPU_CHECK(gpuMemcpy(d_buffer, h_buffer.data(), nValues * sizeof(ValueType), gpuMemcpyHostToDevice))
+
+      // unpack buffer on GPU
+      auto unpackKernel = gpu::make_kernel( &internal::unpackBufferGPU );
+      unpackKernel.addFieldIndexingParam( FieldIdx_T::interval( *d_field, ci, 0, this->fSize_ ) );
+      unpackKernel.addParam<const real_t*>(d_buffer);
+      unpackKernel();
+
+      WALBERLA_GPU_CHECK(gpuFree(d_buffer))
+   }
+
+}; // class ShiftedPeriodicity
+
+} // namespace gpu
+} // namespace walberla
--- a/tests/boundary/CMakeLists.txt
+++ b/tests/boundary/CMakeLists.txt
@@ -7,3 +7,14 @@

 waLBerla_compile_test( FILES BoundaryHandling.cpp )
 waLBerla_execute_test( NAME BoundaryHandling )
+
+if (WALBERLA_BUILD_WITH_PYTHON)
+
+    waLBerla_link_files_to_builddir( *.py )
+
+    waLBerla_compile_test( FILES TestShiftedPeriodicity.cpp DEPENDS blockforest field python_coupling )
+    waLBerla_execute_test( NAME TestShiftedPeriodicity1 COMMAND $<TARGET_FILE:TestShiftedPeriodicity> ${CMAKE_CURRENT_SOURCE_DIR}/TestShiftedPeriodicitySetup.py )
+    waLBerla_execute_test( NAME TestShiftedPeriodicity2 COMMAND $<TARGET_FILE:TestShiftedPeriodicity> ${CMAKE_CURRENT_SOURCE_DIR}/TestShiftedPeriodicitySetup.py PROCESSES 2 )
+    waLBerla_execute_test( NAME TestShiftedPeriodicity4 COMMAND $<TARGET_FILE:TestShiftedPeriodicity> ${CMAKE_CURRENT_SOURCE_DIR}/TestShiftedPeriodicitySetup.py PROCESSES 4 )
+
+endif()
\ No newline at end of file
--- a/tests/boundary/TestShiftedPeriodicity.cpp
+++ b/tests/boundary/TestShiftedPeriodicity.cpp
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file TestShiftedPeriodicity.cpp
+//! \ingroup boundary
+//! \author Helen Schottenhamml <helen.schottenhamml@fau.de>
+//
+//======================================================================================================================
+
+#include "blockforest/StructuredBlockForest.h"
+
+#include "core/cell/Cell.h"
+#include "core/cell/CellInterval.h"
+#include "core/debug/CheckFunctions.h"
+#include "core/logging/Initialization.h"
+#include "core/math/Vector3.h"
+#include "core/mpi/Environment.h"
+#include "core/mpi/MPIWrapper.h"
+#include "core/mpi/MPIManager.h"
+#include "core/mpi/Operation.h"
+#include "core/mpi/Reduce.h"
+
+#include "domain_decomposition/BlockDataID.h"
+#include "domain_decomposition/IBlock.h"
+
+#include "field/Layout.h"
+//#include "field/vtk/VTKWriter.h"
+
+#include "python_coupling/CreateConfig.h"
+
+#include "stencil/D3Q27.h"
+#include "stencil/Directions.h"
+
+#include <blockforest/Initialization.h>
+#include <boundary/ShiftedPeriodicity.h>
+#include <core/DataTypes.h>
+#include <core/debug/Debug.h>
+#include <core/debug/TestSubsystem.h>
+#include <field/AddToStorage.h>
+#include <field/GhostLayerField.h>
+#include <memory>
+#include <vector>
+
+namespace walberla {
+
+using Stencil_T = stencil::D3Q27;
+
+using ValueType_T = real_t;
+using Field_T = GhostLayerField< ValueType_T, 3 >;
+
+constexpr cell_idx_t fieldGhostLayers = 1;
+
+//////////
+// MAIN //
+//////////
+
+template< typename FieldType_T >
+class FieldInitialiser {
+
+ public:
+   FieldInitialiser( const std::weak_ptr< StructuredBlockForest > & sbf, const BlockDataID fieldId )
+   : sbf_(sbf), fieldId_(fieldId)
+   {}
+
+   void operator()() {
+
+
+      const auto blocks = sbf_.lock();
+      WALBERLA_ASSERT_NOT_NULLPTR(blocks)
+
+      for ( auto & block : *blocks )
+      {
+         // get field
+         auto * field = block.template getData<FieldType_T>(fieldId_);
+         WALBERLA_ASSERT_NOT_NULLPTR(field)
+
+         // get cell interval
+         auto ci = field->xyzSizeWithGhostLayer();
+
+         for (const auto& cell : ci) {
+
+            // get global coordinates
+            Cell globalCell;
+            blocks->transformBlockLocalToGlobalCell(globalCell, block, cell);
+
+            for (uint_t d = 0; d < FieldType_T::F_SIZE; ++d)
+            {
+               field->get(cell, d) = real_c(  (globalCell.x() + 2 * fieldGhostLayers)
+                                            * (globalCell.y() + 2 * fieldGhostLayers)
+                                            * (globalCell.z() + 2 * fieldGhostLayers)
+                                            * int_c(d + 1));
+            }
+         }
+      }
+
+   }
+
+ private:
+   std::weak_ptr< StructuredBlockForest > sbf_{};
+   const BlockDataID fieldId_{};
+
+};
+
+
+int main( int argc, char **argv ) {
+
+   const mpi::Environment env(argc, argv);
+
+   for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg)
+   {
+      auto config = *cfg;
+      logging::configureLogging(config);
+
+      // create the domain, flag field and vector field (non-uniform initialisation)
+      auto blocks = blockforest::createUniformBlockGridFromConfig(config->getBlock("DomainSetup"), nullptr, true);
+
+      const auto fieldID = field::addToStorage< Field_T >(blocks, "test field", real_t(0), field::Layout::fzyx, fieldGhostLayers);
+      FieldInitialiser< Field_T > initialiser(blocks, fieldID);
+
+      // re-initialise fields
+      initialiser();
+
+      // create periodic shift boundary condition
+
+      const auto spConfig = config->getBlock("Boundaries").getBlock("ShiftedPeriodicity");
+      const uint_t shiftDir = spConfig.getParameter<uint_t>("shiftDir");
+      const int shiftValue = spConfig.getParameter<int>("shiftValue");
+      const uint_t normalDir = spConfig.getParameter<uint_t>("normalDir");
+
+      boundary::ShiftedPeriodicity<Field_T> shiftedPeriodicity(
+         blocks, fieldID, fieldGhostLayers, normalDir, shiftDir, shiftValue
+      );
+
+//      auto vtkOutput = field::createVTKOutput<Field_T>(fieldID, *blocks, "test_field", 1, fieldGhostLayers,
+//                                                         false, "vtk_out", "simulation_step", false, false);
+//      vtkOutput();
+
+      // apply boundary condition and standard communication
+      shiftedPeriodicity();
+
+//      vtkOutput();
+
+      /// compare values
+      // create local domain slices to compare values before and after shift
+
+      const uint_t remDir = 3 - shiftDir - normalDir;
+      const auto shift = shiftedPeriodicity.shift();
+
+      const uint_t shiftSize = blocks->getNumberOfCells(shiftDir);
+      const uint_t remSize   = blocks->getNumberOfCells(remDir);
+      const uint_t sizeSlice = shiftSize * remSize * Field_T::F_SIZE;
+
+      std::vector<ValueType_T> innerMin(sizeSlice, ValueType_T(0));
+      std::vector<ValueType_T> innerMax(sizeSlice, ValueType_T(0));
+      std::vector<ValueType_T> glMin(sizeSlice, ValueType_T(0));
+      std::vector<ValueType_T> glMax(sizeSlice, ValueType_T(0));
+
+      auto getIdx = [&remSize](const cell_idx_t shiftIdx, const cell_idx_t remIdx){
+         return shiftIdx * cell_idx_c(Field_T::F_SIZE * remSize)
+                + remIdx * cell_idx_c(Field_T::F_SIZE);
+      };
+
+      // fill slices for comparing values
+      for(auto & block : *blocks) {
+         const bool atMin = blocks->atDomainMinBorder(normalDir, block);
+         const bool atMax = blocks->atDomainMaxBorder(normalDir, block);
+         if(!atMin && !atMax)
+            continue;
+
+         auto * field = block.getData<Field_T>(fieldID);
+         WALBERLA_ASSERT_NOT_NULLPTR(field)
+
+         // fill innerMin and glMin
+         if(atMin) {
+            Vector3<int> dirVector{};
+            dirVector[normalDir] = -1;
+            const auto dir = stencil::vectorToDirection(dirVector);
+
+            CellInterval innerMinCI;
+            CellInterval glMinCI;
+            field->getSliceBeforeGhostLayer(dir, innerMinCI, fieldGhostLayers, false);
+            field->getGhostRegion(dir, glMinCI, fieldGhostLayers, false);
+
+            // fill inner min
+            for(const auto & cell : innerMinCI){
+               Cell globalCell;
+               blocks->transformBlockLocalToGlobalCell(globalCell, block, cell);
+
+               cell_idx_t idxShiftDir = globalCell[shiftDir] - shift[shiftDir];
+               if(idxShiftDir >= cell_idx_c(shiftSize)) idxShiftDir -= cell_idx_c(shiftSize);
+               if(idxShiftDir <= - int_c(fieldGhostLayers)) idxShiftDir += cell_idx_c(shiftSize);
+
+               const cell_idx_t idx = getIdx(idxShiftDir, cell_idx_c(globalCell[remDir]));
+
+               for(cell_idx_t f = 0; f < cell_idx_c(Field_T::F_SIZE); ++f) {
+                  WALBERLA_ASSERT(field->coordinatesValid(cell.x(), cell.y(), cell.z(), f))
+                  WALBERLA_ASSERT_LESS(idx + f, innerMin.size())
+
+                  innerMin[uint_c(idx + f)] = field->get(cell, f);
+               }
+            }
+
+            // fill gl min
+            for(const auto & cell : glMinCI){
+               Cell globalCell;
+               blocks->transformBlockLocalToGlobalCell(globalCell, block, cell);
+
+               const cell_idx_t idx = getIdx(cell_idx_c(globalCell[shiftDir]), globalCell[remDir]);
+
+               for(cell_idx_t f = 0; f < cell_idx_c(Field_T::F_SIZE); ++f) {
+                  WALBERLA_ASSERT(field->coordinatesValid(cell.x(), cell.y(), cell.z(), f))
+                  WALBERLA_ASSERT_LESS(idx + f, glMin.size())
+
+                  glMin[uint_c(idx + f)] = field->get(cell, f);
+               }
+            }
+         }
+
+         // fill innerMax and glMax
+         if(atMax) {
+            Vector3<int> dirVector{};
+            dirVector[normalDir] = 1;
+            const auto dir = stencil::vectorToDirection(dirVector);
+
+            CellInterval innerMaxCI;
+            CellInterval glMaxCI;
+            field->getSliceBeforeGhostLayer(dir, innerMaxCI, fieldGhostLayers, false);
+            field->getGhostRegion(dir, glMaxCI, fieldGhostLayers, false);
+
+            // fill inner max
+            for(const auto & cell : innerMaxCI){
+               Cell globalCell;
+               blocks->transformBlockLocalToGlobalCell(globalCell, block, cell);
+
+               cell_idx_t idxShiftDir = globalCell[shiftDir] + shift[shiftDir];
+               if(idxShiftDir >= cell_idx_c(shiftSize)) idxShiftDir -= cell_idx_c(shiftSize);
+               if(idxShiftDir <= - int_c(fieldGhostLayers)) idxShiftDir += cell_idx_c(shiftSize);
+
+               const cell_idx_t idx = getIdx(idxShiftDir, cell_idx_c(globalCell[remDir]));
+
+               for(cell_idx_t f = 0; f < cell_idx_c(Field_T::F_SIZE); ++f) {
+                  WALBERLA_ASSERT(field->coordinatesValid(cell.x(), cell.y(), cell.z(), f))
+                  WALBERLA_ASSERT_LESS(idx + f, innerMax.size())
+
+                  innerMax[uint_c(idx + f)] = field->get(cell, f);
+               }
+            }
+
+            // fill gl max
+            for(const auto & cell : glMaxCI){
+               Cell globalCell;
+               blocks->transformBlockLocalToGlobalCell(globalCell, block, cell);
+
+               const cell_idx_t idx = getIdx(cell_idx_c(globalCell[shiftDir]), cell_idx_c(globalCell[remDir]));
+
+               for(cell_idx_t f = 0; f < cell_idx_c(Field_T::F_SIZE); ++f) {
+                  WALBERLA_ASSERT(field->coordinatesValid(cell.x(), cell.y(), cell.z(), f))
+                  WALBERLA_ASSERT_LESS(idx + f, glMax.size())
+
+                  glMax[uint_c(idx + f)] = field->get(cell, f);
+               }
+            }
+         }
+
+
+      }
+
+      WALBERLA_MPI_SECTION() {
+
+         mpi::reduceInplace(innerMin, mpi::SUM);
+         mpi::reduceInplace(innerMax, mpi::SUM);
+         mpi::reduceInplace(glMin, mpi::SUM);
+         mpi::reduceInplace(glMax, mpi::SUM);
+
+      }
+
+      WALBERLA_ROOT_SECTION() {
+         for(uint_t i = 0; i < sizeSlice; ++i) {
+            WALBERLA_CHECK_FLOAT_EQUAL(innerMin[i], glMax[i])
+            WALBERLA_CHECK_FLOAT_EQUAL(innerMax[i], glMin[i])
+         }
+      }
+
+   }
+
+   return 0;
+}
+
+} // namespace walberla
+
+
+
+int main(int argc, char **argv) {
+
+   walberla::debug::enterTestMode();
+
+   return walberla::main(argc,argv);
+}
--- a/tests/boundary/TestShiftedPeriodicitySetup.py
+++ b/tests/boundary/TestShiftedPeriodicitySetup.py
+import waLBerla as wlb
+
+
+class Scenario:
+    def __init__(self, normal_dir, shift_dir, shift_value, periodicity):
+        self.normal_dir = normal_dir
+        self.shift_dir = shift_dir
+        self.shift_value = shift_value
+        self.periodicity = tuple(periodicity)
+
+    @wlb.member_callback
+    def config(self):
+
+        return {
+            'DomainSetup': {
+                'blocks': (3, 3, 3),
+                'cellsPerBlock': (4, 4, 4),
+                'cartesianSetup': 0,
+                'periodic': self.periodicity,
+            },
+            'Boundaries': {
+                'ShiftedPeriodicity': {
+                    'shiftDir': self.shift_dir,
+                    'shiftValue': self.shift_value,
+                    'normalDir': self.normal_dir
+                }
+            },
+            'Logging': {
+                'logLevel': "Info"
+            }
+        }
+
+
+scenarios = wlb.ScenarioManager()
+
+for normal_dir in (0, 1, 2):
+    for shift_dir in (0, 1, 2):
+        if normal_dir == shift_dir:
+            continue
+        periodicity = 3 * [0]
+        periodicity[shift_dir] = 1
+        for shift_value in (-3, 0, 2, 5, 8, 15):
+            scenarios.add(Scenario(normal_dir, shift_dir, shift_value, periodicity))
--- a/tests/gpu/CMakeLists.txt
+++ b/tests/gpu/CMakeLists.txt
@@ -56,4 +56,14 @@ waLBerla_generate_target_from_python(NAME CodegenGeneratedGPUFieldPackInfo FILE
 waLBerla_compile_test( FILES codegen/GeneratedFieldPackInfoTestGPU.cpp
        DEPENDS blockforest core field CodegenGeneratedGPUFieldPackInfo )
 waLBerla_execute_test( NAME GeneratedFieldPackInfoTestGPU )
+
+    if (WALBERLA_BUILD_WITH_PYTHON)
+
+        waLBerla_link_files_to_builddir( *.py )
+
+        waLBerla_compile_test( FILES TestShiftedPeriodicityGPU.cpp DEPENDS gpu blockforest field python_coupling )
+        waLBerla_execute_test( NAME TestShiftedPeriodicityGPU COMMAND $<TARGET_FILE:TestShiftedPeriodicityGPU> ${CMAKE_CURRENT_SOURCE_DIR}/TestShiftedPeriodicitySetupGPU.py )
+
+    endif()
+
 endif()
--- a/tests/gpu/TestShiftedPeriodicityGPU.cpp
+++ b/tests/gpu/TestShiftedPeriodicityGPU.cpp
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file TestShiftedPeriodicityGPU.cpp
+//! \ingroup gpu
+//! \author Helen Schottenhamml <helen.schottenhamml@fau.de>
+//
+//======================================================================================================================
+
+#include "blockforest/StructuredBlockForest.h"
+
+#include "core/cell/Cell.h"
+#include "core/cell/CellInterval.h"
+#include "core/debug/CheckFunctions.h"
+#include "core/logging/Initialization.h"
+#include "core/math/Vector3.h"
+#include "core/mpi/Environment.h"
+#include "core/mpi/MPIWrapper.h"
+#include "core/mpi/MPIManager.h"
+#include "core/mpi/Operation.h"
+#include "core/mpi/Reduce.h"
+
+#include "domain_decomposition/BlockDataID.h"
+#include "domain_decomposition/IBlock.h"
+
+#include "field/Layout.h"
+#include "field/vtk/VTKWriter.h"
+
+#include "python_coupling/CreateConfig.h"
+
+#include "stencil/D3Q27.h"
+#include "stencil/Directions.h"
+
+#include <blockforest/Initialization.h>
+#include <gpu/ShiftedPeriodicity.h>
+#include <gpu/AddGPUFieldToStorage.h>
+#include <gpu/FieldCopy.h>
+#include <core/DataTypes.h>
+#include <core/debug/Debug.h>
+#include <core/debug/TestSubsystem.h>
+#include <field/AddToStorage.h>
+#include <field/GhostLayerField.h>
+#include <memory>
+#include <vector>
+
+namespace walberla {
+
+using Stencil_T = stencil::D3Q27;
+
+using ValueType_T = real_t;
+using Field_T = GhostLayerField< ValueType_T, 3 >;
+using GPUField_T = gpu::GPUField< ValueType_T >;
+
+constexpr cell_idx_t fieldGhostLayers = 1;
+
+//////////
+// MAIN //
+//////////
+
+template< typename FieldType_T >
+class FieldInitialiser {
+
+ public:
+   FieldInitialiser( const std::weak_ptr< StructuredBlockForest > & sbf, const BlockDataID fieldId )
+   : sbf_(sbf), fieldId_(fieldId)
+   {}
+
+   void operator()() {
+
+
+      const auto blocks = sbf_.lock();
+      WALBERLA_ASSERT_NOT_NULLPTR(blocks)
+
+      for ( auto & block : *blocks )
+      {
+         // get field
+         auto * field = block.template getData<FieldType_T>(fieldId_);
+         WALBERLA_ASSERT_NOT_NULLPTR(field)
+
+         // get cell interval
+         auto ci = field->xyzSizeWithGhostLayer();
+
+         for (const auto& cell : ci) {
+
+            // get global coordinates
+            Cell globalCell;
+            blocks->transformBlockLocalToGlobalCell(globalCell, block, cell);
+
+            for (uint_t d = 0; d < FieldType_T::F_SIZE; ++d)
+            {
+               field->get(cell, d) = real_c(  (globalCell.x() + 2 * fieldGhostLayers)
+                                            * (globalCell.y() + 2 * fieldGhostLayers)
+                                            * (globalCell.z() + 2 * fieldGhostLayers)
+                                            * int_c(d + 1));
+            }
+         }
+      }
+
+   }
+
+ private:
+   std::weak_ptr< StructuredBlockForest > sbf_{};
+   const BlockDataID fieldId_{};
+
+};
+
+
+int main( int argc, char **argv ) {
+
+   const mpi::Environment env(argc, argv);
+
+   for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg)
+   {
+      auto config = *cfg;
+      logging::configureLogging(config);
+
+      // create the domain, flag field and vector field (non-uniform initialisation)
+      auto blocks = blockforest::createUniformBlockGridFromConfig(config->getBlock("DomainSetup"), nullptr, true);
+
+      const auto h_fieldID = field::addToStorage< Field_T >(blocks, "test field CPU", real_t(0), field::Layout::fzyx, fieldGhostLayers);
+      FieldInitialiser< Field_T > initialiser(blocks, h_fieldID);
+
+      const auto d_fieldID = gpu::addGPUFieldToStorage< Field_T >(blocks, h_fieldID, "test field GPU");
+
+      // re-initialise fields
+      initialiser();
+
+      // create periodic shift boundary condition
+
+      const auto spConfig = config->getBlock("Boundaries").getBlock("ShiftedPeriodicity");
+      const uint_t shiftDir = spConfig.getParameter<uint_t>("shiftDir");
+      const int shiftValue = spConfig.getParameter<int>("shiftValue");
+      const uint_t normalDir = spConfig.getParameter<uint_t>("normalDir");
+      ;
+      gpu::ShiftedPeriodicityGPU<GPUField_T> shiftedPeriodicity(
+         blocks, d_fieldID, fieldGhostLayers, normalDir, shiftDir, shiftValue
+      );
+
+//      auto vtkOutput = field::createVTKOutput<Field_T>(fieldID, *blocks, "test_field", 1, fieldGhostLayers,
+//                                                         false, "vtk_out", "simulation_step", false, false);
+//      vtkOutput();
+
+      // apply boundary condition and standard communication
+      shiftedPeriodicity();
+
+//      vtkOutput();
+
+      /// compare values
+
+      // copy resulting GPU to CPU
+      gpu::fieldCpy<Field_T, GPUField_T>(blocks, h_fieldID, d_fieldID);
+
+      // create local domain slices to compare values before and after shift
+
+      const uint_t remDir = 3 - shiftDir - normalDir;
+      const auto shift = shiftedPeriodicity.shift();
+
+      const uint_t shiftSize = blocks->getNumberOfCells(shiftDir);
+      const uint_t remSize   = blocks->getNumberOfCells(remDir);
+      const uint_t sizeSlice = shiftSize * remSize * Field_T::F_SIZE;
+
+      std::vector<ValueType_T> innerMin(sizeSlice, ValueType_T(0));
+      std::vector<ValueType_T> innerMax(sizeSlice, ValueType_T(0));
+      std::vector<ValueType_T> glMin(sizeSlice, ValueType_T(0));
+      std::vector<ValueType_T> glMax(sizeSlice, ValueType_T(0));
+
+      auto getIdx = [&remSize](const cell_idx_t shiftIdx, const cell_idx_t remIdx){
+         return shiftIdx * cell_idx_c(Field_T::F_SIZE * remSize)
+                + remIdx * cell_idx_c(Field_T::F_SIZE);
+      };
+
+      // fill slices for comparing values
+      for(auto & block : *blocks) {
+         const bool atMin = blocks->atDomainMinBorder(normalDir, block);
+         const bool atMax = blocks->atDomainMaxBorder(normalDir, block);
+         if(!atMin && !atMax)
+            continue;
+
+         auto * field = block.getData<Field_T>(h_fieldID);
+         WALBERLA_ASSERT_NOT_NULLPTR(field)
+
+         // fill innerMin and glMin
+         if(atMin) {
+            Vector3<int> dirVector{};
+            dirVector[normalDir] = -1;
+            const auto dir = stencil::vectorToDirection(dirVector);
+
+            CellInterval innerMinCI;
+            CellInterval glMinCI;
+            field->getSliceBeforeGhostLayer(dir, innerMinCI, fieldGhostLayers, false);
+            field->getGhostRegion(dir, glMinCI, fieldGhostLayers, false);
+
+            // fill inner min
+            for(const auto & cell : innerMinCI){
+               Cell globalCell;
+               blocks->transformBlockLocalToGlobalCell(globalCell, block, cell);
+
+               cell_idx_t idxShiftDir = globalCell[shiftDir] - shift[shiftDir];
+               if(idxShiftDir >= cell_idx_c(shiftSize)) idxShiftDir -= cell_idx_c(shiftSize);
+               if(idxShiftDir <= - int_c(fieldGhostLayers)) idxShiftDir += cell_idx_c(shiftSize);
+
+               const cell_idx_t idx = getIdx(idxShiftDir, cell_idx_c(globalCell[remDir]));
+
+               for(cell_idx_t f = 0; f < cell_idx_c(Field_T::F_SIZE); ++f) {
+                  WALBERLA_ASSERT(field->coordinatesValid(cell.x(), cell.y(), cell.z(), f))
+                  WALBERLA_ASSERT_LESS(idx + f, innerMin.size())
+
+                  innerMin[uint_c(idx + f)] = field->get(cell, f);
+               }
+            }
+
+            // fill gl min
+            for(const auto & cell : glMinCI){
+               Cell globalCell;
+               blocks->transformBlockLocalToGlobalCell(globalCell, block, cell);
+
+               const cell_idx_t idx = getIdx(cell_idx_c(globalCell[shiftDir]), globalCell[remDir]);
+
+               for(cell_idx_t f = 0; f < cell_idx_c(Field_T::F_SIZE); ++f) {
+                  WALBERLA_ASSERT(field->coordinatesValid(cell.x(), cell.y(), cell.z(), f))
+                  WALBERLA_ASSERT_LESS(idx + f, glMin.size())
+
+                  glMin[uint_c(idx + f)] = field->get(cell, f);
+               }
+            }
+         }
+
+         // fill innerMax and glMax
+         if(atMax) {
+            Vector3<int> dirVector{};
+            dirVector[normalDir] = 1;
+            const auto dir = stencil::vectorToDirection(dirVector);
+
+            CellInterval innerMaxCI;
+            CellInterval glMaxCI;
+            field->getSliceBeforeGhostLayer(dir, innerMaxCI, fieldGhostLayers, false);
+            field->getGhostRegion(dir, glMaxCI, fieldGhostLayers, false);
+
+            // fill inner max
+            for(const auto & cell : innerMaxCI){
+               Cell globalCell;
+               blocks->transformBlockLocalToGlobalCell(globalCell, block, cell);
+
+               cell_idx_t idxShiftDir = globalCell[shiftDir] + shift[shiftDir];
+               if(idxShiftDir >= cell_idx_c(shiftSize)) idxShiftDir -= cell_idx_c(shiftSize);
+               if(idxShiftDir <= - int_c(fieldGhostLayers)) idxShiftDir += cell_idx_c(shiftSize);
+
+               const cell_idx_t idx = getIdx(idxShiftDir, cell_idx_c(globalCell[remDir]));
+
+               for(cell_idx_t f = 0; f < cell_idx_c(Field_T::F_SIZE); ++f) {
+                  WALBERLA_ASSERT(field->coordinatesValid(cell.x(), cell.y(), cell.z(), f))
+                  WALBERLA_ASSERT_LESS(idx + f, innerMax.size())
+
+                  innerMax[uint_c(idx + f)] = field->get(cell, f);
+               }
+            }
+
+            // fill gl min
+            for(const auto & cell : glMaxCI){
+               Cell globalCell;
+               blocks->transformBlockLocalToGlobalCell(globalCell, block, cell);
+
+               const cell_idx_t idx = getIdx(cell_idx_c(globalCell[shiftDir]), cell_idx_c(globalCell[remDir]));
+
+               for(cell_idx_t f = 0; f < cell_idx_c(Field_T::F_SIZE); ++f) {
+                  WALBERLA_ASSERT(field->coordinatesValid(cell.x(), cell.y(), cell.z(), f))
+                  WALBERLA_ASSERT_LESS(idx + f, glMax.size())
+
+                  glMax[uint_c(idx + f)] = field->get(cell, f);
+               }
+            }
+         }
+
+
+      }
+
+      WALBERLA_MPI_SECTION() {
+
+         mpi::reduceInplace(innerMin, mpi::SUM);
+         mpi::reduceInplace(innerMax, mpi::SUM);
+         mpi::reduceInplace(glMin, mpi::SUM);
+         mpi::reduceInplace(glMax, mpi::SUM);
+
+      }
+
+      WALBERLA_ROOT_SECTION() {
+         for(uint_t i = 0; i < sizeSlice; ++i) {
+            WALBERLA_CHECK_FLOAT_EQUAL(innerMin[i], glMax[i])
+            WALBERLA_CHECK_FLOAT_EQUAL(innerMax[i], glMin[i])
+         }
+      }
+
+   }
+
+   return 0;
+}
+
+} // namespace walberla
+
+
+
+int main(int argc, char **argv) {
+
+   walberla::debug::enterTestMode();
+
+   return walberla::main(argc,argv);
+}
--- a/tests/gpu/TestShiftedPeriodicitySetupGPU.py
+++ b/tests/gpu/TestShiftedPeriodicitySetupGPU.py
+import waLBerla as wlb
+
+
+class Scenario:
+    def __init__(self, normal_dir, shift_dir, shift_value, periodicity):
+        self.normal_dir = normal_dir
+        self.shift_dir = shift_dir
+        self.shift_value = shift_value
+        self.periodicity = tuple(periodicity)
+
+    @wlb.member_callback
+    def config(self):
+
+        return {
+            'DomainSetup': {
+                'blocks': (3, 3, 3),
+                'cellsPerBlock': (4, 4, 4),
+                'cartesianSetup': 0,
+                'periodic': self.periodicity,
+            },
+            'Boundaries': {
+                'ShiftedPeriodicity': {
+                    'shiftDir': self.shift_dir,
+                    'shiftValue': self.shift_value,
+                    'normalDir': self.normal_dir
+                }
+            }
+        }
+
+
+scenarios = wlb.ScenarioManager()
+
+for normal_dir in (0, 1, 2):
+    for shift_dir in (0, 1, 2):
+        if normal_dir == shift_dir:
+            continue
+        periodicity = 3 * [0]
+        periodicity[shift_dir] = 1
+        for shift_value in (-3, 0, 2, 5, 8, 15):
+            scenarios.add(Scenario(normal_dir, shift_dir, shift_value, periodicity))