diff --git a/apps/tutorials/basics/01_BlocksAndFields.dox b/apps/tutorials/basics/01_BlocksAndFields.dox
index 944c64cb50331323490d89a6ccb266418fd8de5b..4feaf75a4ed7f104ad0f37514bddbe5b0d0a5c1a 100644
--- a/apps/tutorials/basics/01_BlocksAndFields.dox
+++ b/apps/tutorials/basics/01_BlocksAndFields.dox
@@ -6,7 +6,7 @@ namespace walberla {
 \brief Introduction to block structure and field.
 
 This tutorial walks you through the process of creating a simple waLBerla application. 
-The source file of this tutorial can be found in apps/tutorials/01_BlocksAndFields.cpp.
+The source file of this tutorial can be found in `apps/tutorials/basics/01_BlocksAndFields.cpp`.
 To compile and run this example, go to your build directory into `apps/tutorials` type `make`
 and run the generated executable.
 
@@ -153,7 +153,7 @@ Using this setup mechanism, waLBerla does not enforce that the fields have the s
 
 Remember: For waLBerla, a block is just a container for arbitrary data - and a field is just an "arbitrary" data item stored on each block.
 Block data does not have to be any waLBerla data structure. It is possible to store any type of data on a block, 
-so instead of using the field class, we could, for example, have used a std::vector<std::vector<double> > to store our lattice.
+so instead of using the field class, we could, for example, have used a `std::vector<std::vector<double>>` to store our lattice.
 
 The callback function can now be registered at the block storage with the following piece of code:
 
@@ -171,7 +171,7 @@ dock widget can then be used to display slices of the field.
 
 \image html tutorial_basics01_field.jpeg
 
-The next tutorial contains the writing of algorithms operating on block data: \ref tutorial02
+The next tutorial contains the writing of algorithms operating on block data: \ref tutorial_basics_02
 
 \tableofcontents
 
diff --git a/apps/tutorials/gpu/01_GameOfLife_cuda.dox b/apps/tutorials/gpu/01_GameOfLife_cuda.dox
index 8794e6c520ffb31d2c3653622cb2f4b4ba4b6eda..0c811bea939f122291cb46962f9087f64c4c62f1 100644
--- a/apps/tutorials/gpu/01_GameOfLife_cuda.dox
+++ b/apps/tutorials/gpu/01_GameOfLife_cuda.dox
@@ -37,9 +37,9 @@ auto hostFieldAllocator = make_shared< gpu::HostFieldAllocator<real_t> >();
 BlockDataID const cpuFieldID =field::addToStorage< ScalarField >(blocks, "CPU Field", real_c(0.0), field::fzyx, uint_c(1), hostFieldAllocator);
 \endcode
 
-Now we initialize the CPU field just like in the previous tutorial \ref tutorial_basics03 .
+Now we initialize the CPU field just like in the previous tutorial \ref tutorial_basics_03 .
 Then two GPU fields are created: "source" and "destination" field. The helper function
-gpu::addGPUFieldToStorage() creates a gpu::GPUField field of the same size and layout of the given
+\ref gpu::addGPUFieldToStorage() creates a \ref gpu::GPUField field with the same size and layout as the given
 CPU field:
 \code
 BlockDataID gpuFieldSrcID = gpu::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field Src" );
@@ -118,10 +118,10 @@ Note that copying data is costly and thus we don't want to do this in every time
 
 \section gpu01_comm Communication
 
-For this tutorial we use the gpu::communication::UniformGPUScheme that first collects all data in a buffer and
-sends only one message per communication step and neighbor. For the PackInfo we use the MemcpyPackInfo. It receives
-a buffer located on the GPU and fills it using memcpy operations
-If the GPU library is build with MPI support this buffer can be send to other GPUs without a copy to the CPU.
+For this tutorial we use the \ref gpu::communication::UniformGPUScheme that first collects all data in a buffer and
+sends only one message per communication step and neighbor. For the `PackInfo` we use the \ref gpu::communication::MemcpyPackInfo.
+It receives a buffer located on the GPU and fills it using memcpy operations.
+If the GPU library is built with MPI support this buffer can be sent to other GPUs without a copy to the CPU.
 Otherwise the copying will be done in the back by the communication class.
 
 \code
diff --git a/src/blockforest/communication/NonUniformPackInfo.h b/src/blockforest/communication/NonUniformPackInfo.h
index 73c3f760fbfb54b3af1be35fdd2d633e3495269e..cd9b20725d6816e4ba21c51b49dbf1f3d091e086 100644
--- a/src/blockforest/communication/NonUniformPackInfo.h
+++ b/src/blockforest/communication/NonUniformPackInfo.h
@@ -52,28 +52,28 @@ public:
    /**
     * Should return true if the amount of data that is packed for a given block in direction
     * "dir" is guaranteed to remain constant over time. False otherwise.
-    * If you are not sure what to return, return false! Returning false is always save.
+    * If you are not sure what to return, return false! Returning false is always safe.
     * Falsely return true will lead to errors! However, if the data can be guaranteed to remain
     * constant over time, returning true enables performance optimizations during the communication.
     */
    virtual bool constantDataExchange() const = 0;
 
    /**
-    * Must return false if calling unpackData and/or communicateLocal is not thread-safe.
+    * Must return false if calling `unpackData*()` and/or `communicateLocal*()` methods is not thread-safe.
     * True otherwise.
-    * If you are not sure what to return, return false! Returning false is always save.
-    * Falsely return true will most likely lead to errors! However, if both unpackData AND
-    * communicateLocal are thread-safe, returning true can lead to performance improvements.
+    * If you are not sure what to return, return false! Returning false is always safe.
+    * Falsely returning true will most likely lead to errors! However, if both `unpackData*()` AND
+    * `communicateLocal*()` are thread-safe, returning true can lead to performance improvements.
     */
    virtual bool threadsafeReceiving() const = 0;
 
-   /// Must be thread-safe! Calls packDataImpl.
+   /// Must be thread-safe! Calls \ref packDataEqualLevelImpl.
    inline void packDataEqualLevel( const Block * sender, stencil::Direction dir, mpi::SendBuffer & buffer ) const;
 
-   /// If NOT thread-safe, threadsafeReceiving must return false!
+   /// If NOT thread-safe, \ref threadsafeReceiving must return false!
    virtual void unpackDataEqualLevel( Block * receiver, stencil::Direction dir, mpi::RecvBuffer & buffer ) = 0;
 
-   /// If NOT thread-safe, threadsafeReceiving must return false!
+   /// If NOT thread-safe, \ref threadsafeReceiving must return false!
    virtual void communicateLocalEqualLevel( const Block * sender, Block * receiver, stencil::Direction dir ) = 0;
 
    inline  void packDataCoarseToFine        ( const Block * coarseSender, const BlockID & fineReceiver, stencil::Direction dir, mpi::SendBuffer & buffer ) const;
diff --git a/src/blockforest/communication/UniformToNonUniformPackInfoAdapter.h b/src/blockforest/communication/UniformToNonUniformPackInfoAdapter.h
index c9c6b895c47977aaadd7ae2d5f9640051a5a83e9..f0e5838b4f8879019d2ede96538db39dc6182860 100644
--- a/src/blockforest/communication/UniformToNonUniformPackInfoAdapter.h
+++ b/src/blockforest/communication/UniformToNonUniformPackInfoAdapter.h
@@ -32,7 +32,7 @@ namespace communication {
 
 //*******************************************************************************************************************
 /*! 
- * Adapter to use a UniformPackInfo in a NonUniformBufferedScheme. No communication between coarse <-> fine blocks
+ * Adapter to use a \ref communication::UniformPackInfo in a \ref NonUniformBufferedScheme. No communication between coarse <-> fine blocks
  * happens.
  */
 //*******************************************************************************************************************
@@ -51,25 +51,25 @@ public:
    /**
    * Should return true if the amount of data that is packed for a given block in direction
    * "dir" is guaranteed to remain constant over time. False otherwise.
-   * If you are not sure what to return, return false! Returning false is always save.
+   * If you are not sure what to return, return false! Returning false is always safe.
    * Falsely return true will lead to errors! However, if the data can be guaranteed to remain
    * constant over time, returning true enables performance optimizations during the communication.
    */
    virtual bool constantDataExchange() const { return uniformPackInfo_->constantDataExchange(); }
 
    /**
-   * Must return false if calling unpackData and/or communicateLocal is not thread-safe.
+   * Must return false if calling `unpackData*()` and/or `communicateLocal*()` methods is not thread-safe.
    * True otherwise.
-   * If you are not sure what to return, return false! Returning false is always save.
-   * Falsely return true will most likely lead to errors! However, if both unpackData AND
-   * communicateLocal are thread-safe, returning true can lead to performance improvements.
+   * If you are not sure what to return, return false! Returning false is always safe.
+   * Falsely returning true will most likely lead to errors! However, if both `unpackData*()` AND
+   * `communicateLocal*()` are thread-safe, returning true can lead to performance improvements.
    */
    virtual bool threadsafeReceiving() const { return uniformPackInfo_->threadsafeReceiving(); }
 
-   /// If NOT thread-safe, threadsafeReceiving must return false!
+   /// If NOT thread-safe, \ref threadsafeReceiving must return false!
    virtual void unpackDataEqualLevel( Block * receiver, stencil::Direction dir, mpi::RecvBuffer & buffer ) { uniformPackInfo_->unpackData( receiver, dir, buffer ); }
 
-   /// If NOT thread-safe, threadsafeReceiving must return false!
+   /// If NOT thread-safe, \ref threadsafeReceiving must return false!
    virtual void communicateLocalEqualLevel( const Block * sender, Block * receiver, stencil::Direction dir ) { uniformPackInfo_->communicateLocal( sender, receiver, dir ); }
 
    virtual void unpackDataCoarseToFine( Block * /*fineReceiver*/, const BlockID & /*coarseSender*/, stencil::Direction /*dir*/, mpi::RecvBuffer & /*buffer*/ ) { }
diff --git a/src/communication/UniformPackInfo.h b/src/communication/UniformPackInfo.h
index aa110f9bdf5c51b37a57572cfbc800b004ab37b6..168ce9685473619a897e0ebb85677a7f10f66cee 100644
--- a/src/communication/UniformPackInfo.h
+++ b/src/communication/UniformPackInfo.h
@@ -35,18 +35,21 @@ namespace communication {
 
 
 /**
- * \brief UniformPackInfo encapsulates information on how to extract data from blocks,
- * that should be communicated (see packData() ) to neighboring blocks
- * and how to inject this data in a receiving block (see unpackData() )
+ * \brief Data packing/unpacking for ghost layer based communication of a field.
  *
- * Another special method exists for communication between two blocks,
- * which are allocated on the same
- * process. In this case the data does not have be communicated via a buffer,
+ * Encapsulate information on how to extract data from blocks that should be
+ * communicated to neighboring blocks (see \ref packData())
+ * and how to inject this data in a receiving block (see \ref unpackData()).
+ * This involves a memory buffer and two memory copy operations.
+ *
+ * A special method exists for communication between two blocks which are
+ * allocated on the same process (see \ref communicateLocal()).
+ * In this case the data does not have to be communicated via a buffer,
  * but can be copied directly.
  *
  * Data that is packed in direction "dir" at one block is unpacked in
  * direction "stencil::inverseDir[dir]" at the neighboring block. This
- * behavior must be implemented in "communicateLocal"!
+ * behavior must be implemented in \ref communicateLocal()!
  *
  * \ingroup communication
  */
@@ -65,23 +68,25 @@ public:
    /**
     * Should return true if the amount of data that is packed for a given block in direction
     * "dir" is guaranteed to remain constant over time. False otherwise.
-    * If you are not sure what to return, return false! Returning false is always save.
-    * Falsely return true will lead to errors! However, if the data can be guaranteed to remain
+    * If you are not sure what to return, return false! Returning false is always safe.
+    * Falsely returning true will lead to errors! However, if the data can be guaranteed to remain
     * constant over time, returning true enables performance optimizations during the communication.
     */
    virtual bool constantDataExchange() const = 0;
 
    /**
-    * Must return false if calling unpackData and/or communicateLocal is not thread-safe.
+    * Must return false if calling \ref unpackData and/or \ref communicateLocal is not thread-safe.
     * True otherwise.
-    * If you are not sure what to return, return false! Returning false is always save.
-    * Falsely return true will most likely lead to errors! However, if both unpackData AND
-    * communicateLocal are thread-safe, returning true can lead to performance improvements.
+    * If you are not sure what to return, return false! Returning false is always safe.
+    * Falsely returning true will most likely lead to errors! However, if both \ref unpackData AND
+    * \ref communicateLocal are thread-safe, returning true can lead to performance improvements.
     */
    virtual bool threadsafeReceiving() const = 0;
 
    /**
-    * Packs data from a block into a send buffer. Must be thread-safe! Calls packDataImpl.
+    * \brief Pack data from a block into a send buffer.
+    *
+    * Must be thread-safe! Calls \ref packDataImpl.
     *
     * @param sender     the block whose data should be packed into a buffer
     * @param dir        pack data for neighbor in this direction
@@ -91,19 +96,21 @@ public:
    inline void packData( const IBlock * sender, stencil::Direction dir, mpi::SendBuffer & buffer ) const;
 
    /**
-    * Unpacks received Data.
-    * If NOT thread-safe, threadsafeReceiving must return false!
+    * \brief Unpack received data.
+    *
+    * If NOT thread-safe, \ref threadsafeReceiving must return false!
     *
     * @param receiver the block where the unpacked data should be stored into
     * @param dir      receive data from neighbor in this direction
-    * @param buffer
+    * @param buffer   buffer for reading the data from
     */
    virtual void unpackData( IBlock * receiver, stencil::Direction dir, mpi::RecvBuffer & buffer ) = 0;
 
    /**
-    * Function to copy data from one local block to another local block.
+    * \brief Copy data from one local block to another local block.
+    *
     * Both blocks are allocated on the current process.
-    * If NOT thread-safe, threadsafeReceiving must return false!
+    * If NOT thread-safe, \ref threadsafeReceiving must return false!
     *
     * @param sender    id of block where the data should be copied from
     * @param receiver  id of block where the data should be copied to
@@ -134,7 +141,9 @@ public:
 protected:
 
    /**
-    * Packs data from a block into a send buffer. Must be thread-safe!
+    * \brief Pack data from a block into a send buffer.
+    *
+    * Must be thread-safe!
     *
     * @param sender     the block whose data should be packed into a buffer
     * @param dir        pack data for neighbor in this direction
diff --git a/src/gpu/FieldAccessor.h b/src/gpu/FieldAccessor.h
index fc1214e081c0822fdc1370ad9e69f44c2e986594..d737983d1aa5f289d624cd508eea8a7969a5fe69 100644
--- a/src/gpu/FieldAccessor.h
+++ b/src/gpu/FieldAccessor.h
@@ -31,6 +31,13 @@ namespace gpu
 
 
 
+   /**
+    * \brief Handle to the underlying device data of a \ref GPUField.
+    *
+    * Encapsulate the device memory pointer and offsets necessary
+    * to calculate the address of a cell from a GPU kernel's thread
+    * coordinates in the thread block.
+    */
    template<typename T>
    class FieldAccessor
    {
diff --git a/src/gpu/FieldIndexing.h b/src/gpu/FieldIndexing.h
index 51b337e61237690ddc5163113abeb47ee44691b1..a06c95087898b3c705548311a7d9810e63519e78 100644
--- a/src/gpu/FieldIndexing.h
+++ b/src/gpu/FieldIndexing.h
@@ -44,6 +44,14 @@ namespace gpu
 template< typename T >
 class GPUField;
 
+/**
+ * \brief Utility class to generate handles to the underlying device data of a \ref GPUField.
+ *
+ * Pre-calculate memory offsets of a \ref GPUField for a given slice,
+ * cell interval, or the entire grid with or without the ghost layer,
+ * and store them in a \ref FieldAccessor handle.
+ * That handle is obtained by calling \ref gpuAccess().
+ */
 template< typename T >
 class FieldIndexing
 {
diff --git a/src/gpu/GPUField.h b/src/gpu/GPUField.h
index f8a0242ed3aa5e9de3606d8ff1737b4fe869f42f..7d004c76203060c5fb77c350f306007a091ca0c9 100755
--- a/src/gpu/GPUField.h
+++ b/src/gpu/GPUField.h
@@ -45,16 +45,20 @@ namespace gpu
    *  Basically a wrapper around a CUDA/HIP device pointer together with size information about the field
    *  i.e. sizes in x,y,z,f directions and number of ghost layers.
    *
-   *  Internally represented by a gpuPitchedPtr which is allocated with gpuMalloc3D to take padding of the
-   *  innermost coordinate into account.
+   *  Internally represented by a \c gpuPitchedPtr which is allocated with extra padding for the
+   *  innermost coordinate.
+   *  Pitched memory is a type of non-linear memory where each row is padded
+   *  so that it starts at a suitably aligned address, which improves
+   *  global memory throughput, for example by enabling coalesced accesses.
    *
    *  Supports Array-of-Structures (AoS,zyxf) layout and Structure-of-Arrays (SoA, fzyx) layout, in a similar way
-   *  to field::Field
+   *  to \ref field::Field
    *
-   *  To work with the GPUField look at the gpu::fieldCpy functions to transfer a field::Field to a gpu::GPUField
+   *  To work with the \ref gpu::GPUField look at the \ref gpu::fieldCpy functions to transfer a \ref field::Field to a \ref gpu::GPUField
    *  and vice versa.
-   *  When writing device kernels for GPUFields have a look at the FieldIndexing and FieldAccessor concepts.
-   *  These simplify the "iteration" i.e. indexing of cells in GPUFields.
+   *
+   *  When writing device kernels for a \ref GPUField, have a look at the \ref FieldIndexing and \ref FieldAccessor concepts.
+   *  These simplify the "iteration" i.e. indexing of cells in a \ref GPUField.
    */
    //*******************************************************************************************************************
    template<typename T>
diff --git a/src/gpu/GPURAII.h b/src/gpu/GPURAII.h
index 815b3829114506a8c601669aa4195461bd60151a..6bcfd7811b4e68f591f3dfe2597111139ec9906f 100644
--- a/src/gpu/GPURAII.h
+++ b/src/gpu/GPURAII.h
@@ -13,7 +13,7 @@
 //  You should have received a copy of the GNU General Public License along
 //  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
 //
-//! \file CudaRAII.h
+//! \file GPURAII.h
 //! \ingroup gpu
 //! \author Martin Bauer <martin.bauer@fau.de>
 //
diff --git a/src/gpu/communication/GPUPackInfo.h b/src/gpu/communication/GPUPackInfo.h
index c34600f29b2219088c29b0d5ff2e9fb1dc4a1142..f700c372924a310e3ba816164692b1b6650c13d5 100644
--- a/src/gpu/communication/GPUPackInfo.h
+++ b/src/gpu/communication/GPUPackInfo.h
@@ -47,10 +47,28 @@ namespace walberla::gpu::communication {
 
 
 /**
- * Data packing/unpacking for ghost layer based communication of a gpu::GPUField
+ * \brief Data packing/unpacking for ghost layer based communication of a \ref GPUField.
+ *
+ * Encapsulate information on how to extract data from blocks that should be
+ * communicated to neighboring blocks (see \ref packDataImpl())
+ * and how to inject this data in a receiving block (see \ref unpackData()).
+ * This involves a host memory buffer and two device-to-host memory copy operations.
+ *
+ * A special method exists for communication between two blocks which are
+ * allocated on the same process (see \ref communicateLocal()).
+ * In this case the data does not have to be communicated via a host buffer,
+ * but can be sent directly. This involves a single device-to-device memory
+ * copy operation.
+ *
+ * Data that is packed in direction "dir" at one block is unpacked in
+ * direction "stencil::inverseDir[dir]" at the neighboring block.
+ * This behavior must be implemented in \ref communicateLocal()!
+ *
+ * See \ref MemcpyPackInfo for a more efficient packing/unpacking method
+ * where the buffer is stored in device memory rather than in host memory.
+ *
  * \ingroup gpu
- * Template Parameters:
- *    - GPUField_T   A fully qualified GPUField.
+ * \tparam GPUField_T   A fully qualified \ref GPUField.
  */
 template<typename GPUField_T>
 class GPUPackInfo : public walberla::communication::UniformPackInfo
diff --git a/src/gpu/communication/GeneratedGPUPackInfo.h b/src/gpu/communication/GeneratedGPUPackInfo.h
index f5f6c98b60b529045a1877a435fcacacb9359a95..4b905ad63fa8aca23006d8b5ac2a2a09ab30078a 100644
--- a/src/gpu/communication/GeneratedGPUPackInfo.h
+++ b/src/gpu/communication/GeneratedGPUPackInfo.h
@@ -28,14 +28,59 @@
 
 namespace walberla::gpu {
 
+/**
+ * \brief Data packing/unpacking for ghost layer based communication of a \ref GPUField.
+ *
+ * Encapsulate information on how to extract data from blocks that should be
+ * communicated to neighboring blocks (see \ref pack())
+ * and how to inject this data in a receiving block (see \ref unpack()).
+ * This involves a memory buffer and two memory copy operations.
+ *
+ * A special method exists for communication between two blocks which are
+ * allocated on the same process (see \ref communicateLocal()).
+ * In this case the data does not have to be communicated via a buffer,
+ * but can be copied directly.
+ *
+ * Data that is packed in direction "dir" at one block is unpacked in
+ * direction "stencil::inverseDir[dir]" at the neighboring block. This
+ * behavior must be implemented in \ref communicateLocal()!
+ *
+ * \ingroup gpu
+ */
 class GeneratedGPUPackInfo
 {
 public:
   GeneratedGPUPackInfo() = default;
   virtual ~GeneratedGPUPackInfo() = default;
 
+   /**
+    * \brief Pack data from a block into a send buffer.
+    *
+    * \param dir        pack data for neighbor in this direction
+    * \param buffer     buffer for writing the data into
+    * \param block      the block whose data should be packed into a buffer
+    * \param stream     GPU stream
+    */
    virtual void pack  ( stencil::Direction dir, unsigned char *buffer, IBlock *block, gpuStream_t stream ) = 0;
+   /**
+    * \brief Copy data from one local block to another local block.
+    *
+    * Both blocks are allocated on the same MPI rank.
+    *
+    * \param dir       the direction of the communication (from sender to receiver)
+    * \param sender    the block where the data should be copied from
+    * \param receiver  the block where the data should be copied to
+    * \param stream    GPU stream
+    */
    virtual void communicateLocal  ( stencil::Direction dir, const IBlock *sender, IBlock *receiver, gpuStream_t stream ) = 0;
+   /**
+    * \brief Unpack data from a receive buffer into a block.
+    *
+    * \param dir        receive data from neighbor in this direction
+    * \param buffer     buffer for reading the data from
+    * \param block      the block where the unpacked data should be stored into
+    * \param stream     GPU stream
+    */
    virtual void unpack( stencil::Direction dir, unsigned char *buffer, IBlock *block, gpuStream_t stream ) = 0;
    virtual uint_t size( stencil::Direction dir, IBlock *block ) = 0;
 };
diff --git a/src/gpu/communication/MemcpyPackInfo.h b/src/gpu/communication/MemcpyPackInfo.h
index 6c15988f4f2687275fea7f0f8be36b2e7d99fcf6..c55c3394193afb67244e93843cf9d11d12fca7dd 100644
--- a/src/gpu/communication/MemcpyPackInfo.h
+++ b/src/gpu/communication/MemcpyPackInfo.h
@@ -13,6 +13,27 @@
 
 namespace walberla::gpu::communication {
 
+/**
+ * \brief Data packing/unpacking for ghost layer based communication of a \ref GPUField.
+ *
+ * Encapsulate information on how to extract data from blocks that should be
+ * communicated to neighboring blocks (see \ref pack())
+ * and how to inject this data in a receiving block (see \ref unpack()).
+ * This involves a device memory buffer and two device-to-device memory copy operations.
+ *
+ * A special method exists for communication between two blocks which are
+ * allocated on the same process (see \ref communicateLocal()).
+ * In this case the data does not have to be communicated via a device buffer,
+ * but can be sent directly. This involves a single device-to-device memory
+ * copy operation.
+ *
+ * Data that is packed in direction "dir" at one block is unpacked in
+ * direction "stencil::inverseDir[dir]" at the neighboring block.
+ * This behavior must be implemented in \ref communicateLocal()!
+ *
+ * \ingroup gpu
+ * \tparam GPUFieldType   A fully qualified \ref GPUField.
+ */
 template<typename GPUFieldType>
 class MemcpyPackInfo : public ::walberla::gpu::GeneratedGPUPackInfo
 {
diff --git a/src/gpu/communication/UniformGPUScheme.h b/src/gpu/communication/UniformGPUScheme.h
index bc481d8950c25d4aa5196316c641e8b67e34318a..183df0497a53e11f2260fc3e591d65462800c036 100644
--- a/src/gpu/communication/UniformGPUScheme.h
+++ b/src/gpu/communication/UniformGPUScheme.h
@@ -42,6 +42,37 @@ namespace communication {
 
 
 
+/**
+ * \brief Communication scheme for buffered communication in uniform block grids.
+ *
+ * Synchronize a set of \ref GPUField between GPU devices.
+ * Communication between fields on the same process: use direct copy
+ * via \ref GeneratedGPUPackInfo::communicateLocal.
+ * Communication between different processes: use a buffered communication scheme;
+ * when multiple fields have been changed they can be synchronized at once,
+ * using one MPI message per communication partner.
+ *
+ *   \code
+ *      UniformGPUScheme<stencil::D3Q19> scheme;  // the stencil defines the communication neighbors
+ *      scheme.addPackInfo( make_shared<gpu::communication::MemcpyPackInfo<FieldType> >( idOfFirstField ) );
+ *      scheme.addPackInfo( make_shared<gpu::communication::MemcpyPackInfo<FieldType> >( idOfSecondField ) );
+ *
+ *      // either synchronous communication...
+ *      scheme();
+ *
+ *      // ... or asynchronous:
+ *      scheme.startCommunication();
+ *      functionWhichDoesNotNeedCommunicatedValues();
+ *      scheme.wait();
+ *   \endcode
+ *
+ * This scheme sends one message per communication step and neighbor device.
+ * Therefore all contents that have to be sent are packed into a single buffer.
+ * Multiple \ref GeneratedGPUPackInfo can be registered to send their contents in a single step.
+ *
+ * When running multiple \ref UniformGPUScheme concurrently, different MPI tags
+ * have to be used for the schemes: the tag can be passed in the constructor.
+ */
    template<typename Stencil>
    class UniformGPUScheme
    {
diff --git a/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.h b/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.h
index 96c514fcc4369084273c098ac9bf4ad21310ae29..585d1db348cbf0e1b5572f563b48ff55a717e9ec 100644
--- a/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.h
+++ b/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.h
@@ -291,7 +291,7 @@ class NonuniformGeneratedGPUPdfPackInfo : public walberla::gpu::GeneratedNonUnif
    bool areNeighborsInDirection(const Block* block, const BlockID& neighborID,
                                 Vector3< cell_idx_t > dirVec) const;
 
-   CellInterval intervalHullInDirection(const CellInterval& ci, Vector3< cell_idx_t > tangentialDir,
+   CellInterval intervalHullInDirection(const CellInterval& ci, Vector3< cell_idx_t > dirVec,
                                         cell_idx_t width) const;
    bool skipsThroughCoarseBlock(const Block* block, Direction dir) const;
 
diff --git a/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.impl.h b/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.impl.h
index 987cebe9b2bfd343ed0277ed3faefef4dddaa753..7ff9c7fd3cf5383499f51d9b17bc995f45450ef2 100644
--- a/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.impl.h
+++ b/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.impl.h
@@ -425,7 +425,7 @@ inline Vector3< cell_idx_t >
 }
 
 /**
- * Returns the part of a cell interval's hull of given width in direction dirVec.
+ * Returns the part of a cell interval's hull of given \p width in direction \p dirVec.
  * @param ci        The original cell interval
  * @param dirVec    Direction Vector
  * @param width     Width of the hull