diff --git a/pystencils_walberla/__init__.py b/pystencils_walberla/__init__.py
index 767581a754edcf025f743c15338ef2a4b41c7624..a7e42bfbad77181ed8d76771d60758f4d5284a39 100644
--- a/pystencils_walberla/__init__.py
+++ b/pystencils_walberla/__init__.py
@@ -1,8 +1,11 @@
+from pystencils_walberla.boundary import (
+    generate_boundary, generate_staggered_boundary, generate_staggered_flux_boundary)
+
 from .cmake_integration import CodeGeneration
 from .codegen import (
-    generate_pack_info, generate_pack_info_for_field, generate_pack_info_from_kernel,
-    generate_mpidtype_info_from_kernel, generate_sweep)
+    generate_mpidtype_info_from_kernel, generate_pack_info, generate_pack_info_for_field,
+    generate_pack_info_from_kernel, generate_sweep)
 
 __all__ = ['CodeGeneration',
            'generate_sweep', 'generate_pack_info_from_kernel', 'generate_pack_info_for_field', 'generate_pack_info',
-           'generate_mpidtype_info_from_kernel']
+           'generate_mpidtype_info_from_kernel', 'generate_staggered_boundary', 'generate_staggered_flux_boundary', 'generate_boundary']
diff --git a/pystencils_walberla/boundary.py b/pystencils_walberla/boundary.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0d411af55cd89fda60811680cffda0faf93349b
--- /dev/null
+++ b/pystencils_walberla/boundary.py
@@ -0,0 +1,140 @@
+from functools import partial
+
+import numpy as np
+from jinja2 import Environment, PackageLoader, StrictUndefined
+
+from pystencils import Field, FieldType
+from pystencils.boundaries.boundaryhandling import create_boundary_kernel
+from pystencils.boundaries.createindexlist import (
+    boundary_index_array_coordinate_names, direction_member_name,
+    numpy_data_type_for_boundary_object)
+from pystencils.data_types import TypedSymbol, create_type
+from pystencils_walberla.codegen import KernelInfo, default_create_kernel_parameters
+from pystencils_walberla.jinja_filters import add_pystencils_filters_to_jinja_env
+
+try:
+    from pystencils_autodiff import AdjointField
+    from pystencils_autodiff.lbm.adjoint_boundaryconditions import AdjointBoundaryCondition
+except ImportError:
+    AdjointBoundaryCondition = ()
+    AdjointField = ()
+
+
+def generate_boundary(generation_context,
+                      class_name,
+                      boundary_object,
+                      field_name,
+                      neighbor_stencil,
+                      index_shape=(),
+                      field_type=FieldType.GENERIC,
+                      kernel_creation_function=None,
+                      target='cpu',
+                      namespace='pystencils',
+                      **create_kernel_params):
+    struct_name = "IndexInfo"
+    boundary_object.name = class_name
+    dim = len(neighbor_stencil[0])
+
+    create_kernel_params = default_create_kernel_parameters(generation_context, create_kernel_params)
+    create_kernel_params["target"] = target
+    index_struct_dtype = numpy_data_type_for_boundary_object(boundary_object, dim)
+
+    field = Field.create_generic(field_name, dim,
+                                 np.float64 if generation_context.double_accuracy else np.float32,
+                                 index_dimensions=len(index_shape), layout='fzyx', index_shape=index_shape,
+                                 field_type=field_type)
+    if isinstance(boundary_object, AdjointBoundaryCondition):
+        field = AdjointField(field)
+
+    index_field = Field('indexVector', FieldType.INDEXED, index_struct_dtype, layout=[0],
+                        shape=(TypedSymbol("indexVectorSize", create_type(np.int64)), 1), strides=(1, 1))
+
+    if not kernel_creation_function:
+        kernel_creation_function = create_boundary_kernel
+    kernel = kernel_creation_function(field, index_field, neighbor_stencil,
+                                      boundary_object, openmp=generation_context.openmp, target=target)
+    kernel.function_name = "boundary_" + boundary_object.name
+    kernel.assumed_inner_stride_one = False
+
+    # waLBerla is a 3D framework. Therefore, a zero for the z index has to be added if we work in 2D
+    if dim == 2:
+        stencil = ()
+        for d in neighbor_stencil:
+            d = d + (0,)
+            stencil = stencil + (d,)
+    else:
+        stencil = neighbor_stencil
+
+    stencil_info = [(i, d, ", ".join([str(e) for e in d])) for i, d in enumerate(stencil)]
+    inv_dirs = []
+    for direction in stencil:
+        inverse_dir = tuple([-i for i in direction])
+        inv_dirs.append(stencil.index(inverse_dir))
+
+    context = {
+        'class_name': boundary_object.name,
+        'StructName': struct_name,
+        'StructDeclaration': struct_from_numpy_dtype(struct_name, index_struct_dtype),
+        'kernel': KernelInfo(kernel),
+        'stencil_info': stencil_info,
+        'inverse_directions': inv_dirs,
+        'dim': dim,
+        'target': target,
+        'namespace': namespace,
+        'inner_or_boundary': boundary_object.inner_or_boundary,
+        'additional_data': '',  # list(map(lambda a: a[0], boundary_object.additional_data)),
+        'additional_data_functor': ''  # 'boundaryDataInitializer' if boundary_object.additional_data else ''
+    }
+
+    env = Environment(loader=PackageLoader('pystencils_walberla'), undefined=StrictUndefined)
+    add_pystencils_filters_to_jinja_env(env)
+
+    header = env.get_template('Boundary.tmpl.h').render(**context)
+    source = env.get_template('Boundary.tmpl.cpp').render(**context)
+
+    source_extension = "cpp" if target == "cpu" else "cu"
+    generation_context.write_file("{}.h".format(class_name), header)
+    generation_context.write_file("{}.{}".format(class_name, source_extension), source)
+    return kernel
+
+
+def _generate_boundary(generation_context,
+                       class_name,
+                       boundary_object,
+                       field_name,
+                       dim,
+                       *args,
+                       **kwargs):
+    # drop redundant dim parameter
+    return generate_boundary(generation_context, class_name, boundary_object, field_name, *args, **kwargs)
+
+
+generate_staggered_boundary = partial(_generate_boundary, field_name='field', field_type=FieldType.STAGGERED)
+generate_staggered_flux_boundary = partial(_generate_boundary, field_name='flux', field_type=FieldType.STAGGERED_FLUX)
+
+
+def struct_from_numpy_dtype(struct_name, numpy_dtype):
+    result = "struct %s { \n" % (struct_name,)
+
+    equality_compare = []
+    constructor_params = []
+    constructor_initializer_list = []
+    for name, (sub_type, offset) in numpy_dtype.fields.items():
+        pystencils_type = create_type(sub_type)
+        result += "    %s %s;\n" % (pystencils_type, name)
+        if name in boundary_index_array_coordinate_names or name == direction_member_name:
+            constructor_params.append("%s %s_" % (pystencils_type, name))
+            constructor_initializer_list.append("%s(%s_)" % (name, name))
+        else:
+            constructor_initializer_list.append("%s()" % name)
+        if pystencils_type.is_float():
+            equality_compare.append("floatIsEqual(%s, o.%s)" % (name, name))
+        else:
+            equality_compare.append("%s == o.%s" % (name, name))
+
+    result += "    %s(%s) : %s {}\n" % \
+              (struct_name, ", ".join(constructor_params), ", ".join(constructor_initializer_list))
+    result += "    bool operator==(const %s & o) const {\n        return %s;\n    }\n" % \
+              (struct_name, " && ".join(equality_compare))
+    result += "};\n"
+    return result
diff --git a/pystencils_walberla/templates/Boundary.tmpl.cpp b/pystencils_walberla/templates/Boundary.tmpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d4b3333b14bbd2939964a90723ad60b1ed465f6e
--- /dev/null
+++ b/pystencils_walberla/templates/Boundary.tmpl.cpp
@@ -0,0 +1,104 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \\file {{class_name}}.cpp
+//! \\ingroup lbm
+//! \\author lbmpy
+//======================================================================================================================
+
+#include <cmath>
+
+#include "core/DataTypes.h"
+#include "core/Macros.h"
+#include "{{class_name}}.h"
+{% if target == 'gpu' -%}
+#include "cuda/ErrorChecking.h"
+{%- endif %}
+
+
+{% if target == 'cpu' -%}
+#define FUNC_PREFIX
+{%- elif target == 'gpu' -%}
+#define FUNC_PREFIX __global__
+{%- endif %}
+
+using namespace std;
+
+namespace walberla {
+namespace {{namespace}} {
+
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wunused-variable"
+#pragma GCC diagnostic ignored "-Wconversion"
+#endif
+
+#ifdef __CUDACC__
+#pragma push
+#pragma diag_suppress = declared_but_not_referenced
+#endif
+
+
+{{kernel|generate_definition}}
+
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
+
+#ifdef __CUDACC__
+#pragma pop
+#endif
+
+
+void {{class_name}}::run( IBlock * block, IndexVectors::Type type {% if target == 'gpu'%}, cudaStream_t stream {%endif%})
+{
+    auto * indexVectors = block->getData<IndexVectors>(indexVectorID);
+    int64_t indexVectorSize = int64_c( indexVectors->indexVector(type).size() );
+    if( indexVectorSize == 0)
+        return;
+
+    {% if target == 'gpu' -%}
+    auto pointer = indexVectors->pointerGpu(type);
+    {% else %}
+    auto pointer = indexVectors->pointerCpu(type);
+    {% endif %}
+
+    uint8_t * _data_indexVector = reinterpret_cast<uint8_t*>(pointer);
+
+    {{kernel|generate_block_data_to_field_extraction(['indexVector', 'indexVectorSize'])|indent(4)}}
+    {{kernel|generate_call(spatial_shape_symbols=['indexVectorSize'], stream='stream')|indent(4)}}
+}
+
+void {{class_name}}::operator() ( IBlock * block{% if target == 'gpu'%}, cudaStream_t stream {%endif%} )
+{
+    run( block, IndexVectors::ALL{% if target == 'gpu'%}, stream {%endif%});
+}
+
+void {{class_name}}::inner( IBlock * block{% if target == 'gpu'%}, cudaStream_t stream {%endif%} )
+{
+    run( block, IndexVectors::INNER{% if target == 'gpu'%}, stream {%endif%} );
+}
+
+void {{class_name}}::outer( IBlock * block{% if target == 'gpu'%}, cudaStream_t stream {%endif%} )
+{
+    run( block, IndexVectors::OUTER{% if target == 'gpu'%}, stream {%endif%} );
+}
+
+
+} // namespace {{namespace}}
+} // namespace walberla
+
+
diff --git a/pystencils_walberla/templates/Boundary.tmpl.h b/pystencils_walberla/templates/Boundary.tmpl.h
new file mode 100644
index 0000000000000000000000000000000000000000..95ae0dd30e8dcbec5f5f9990dea9dd87a344e672
--- /dev/null
+++ b/pystencils_walberla/templates/Boundary.tmpl.h
@@ -0,0 +1,196 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \\file {{class_name}}.h
+//! \\author pystencils
+//======================================================================================================================
+
+
+#include "core/DataTypes.h"
+
+{% if target is equalto 'cpu' -%}
+#include "field/GhostLayerField.h"
+{%- elif target is equalto 'gpu' -%}
+#include "cuda/GPUField.h"
+{%- endif %}
+#include "domain_decomposition/BlockDataID.h"
+#include "domain_decomposition/IBlock.h"
+#include "blockforest/StructuredBlockForest.h"
+#include "field/FlagField.h"
+
+#include <set>
+#include <vector>
+
+#ifdef __GNUC__
+#define RESTRICT __restrict__
+#elif _MSC_VER
+#define RESTRICT __restrict
+#else
+#define RESTRICT
+#endif
+
+namespace walberla {
+namespace {{namespace}} {
+
+
+{% if additional_data_functor %}
+template < class AdditionalDataFunctor_T >
+{% endif %}
+class {{class_name}}
+{
+public:
+    {{StructDeclaration|indent(4)}}
+
+
+    class IndexVectors
+    {
+    public:
+        using CpuIndexVector = std::vector<{{StructName}}>;
+
+        enum Type {
+            ALL = 0,
+            INNER = 1,
+            OUTER = 2,
+            NUM_TYPES = 3
+        };
+
+        IndexVectors() : cpuVectors_(NUM_TYPES)  {}
+        bool operator==(IndexVectors & other) { return other.cpuVectors_ == cpuVectors_; }
+
+        {% if target == 'gpu' -%}
+        ~IndexVectors() {
+            for( auto & gpuVec: gpuVectors_)
+                cudaFree( gpuVec );
+        }
+        {% endif %}
+
+        CpuIndexVector & indexVector(Type t) { return cpuVectors_[t]; }
+        {{StructName}} * pointerCpu(Type t)  { return &(cpuVectors_[t][0]); }
+
+        {% if target == 'gpu' -%}
+        {{StructName}} * pointerGpu(Type t)  { return gpuVectors_[t]; }
+        {% endif %}
+
+        void syncGPU()
+        {
+            {% if target == 'gpu' -%}
+            gpuVectors_.resize( cpuVectors_.size() );
+            for(size_t i=0; i < size_t(NUM_TYPES); ++i )
+            {
+                auto & gpuVec = gpuVectors_[i];
+                auto & cpuVec = cpuVectors_[i];
+                cudaFree( gpuVec );
+                cudaMalloc( &gpuVec, sizeof({{StructName}}) * cpuVec.size() );
+                cudaMemcpy( gpuVec, &cpuVec[0], sizeof({{StructName}}) * cpuVec.size(), cudaMemcpyHostToDevice );
+            }
+            {%- endif %}
+        }
+
+    private:
+        std::vector<CpuIndexVector> cpuVectors_;
+
+        {% if target == 'gpu' -%}
+        using GpuIndexVector = {{StructName}} *;
+        std::vector<GpuIndexVector> gpuVectors_;
+        {% endif %}
+    };
+
+
+    {{class_name}}( const shared_ptr<StructuredBlockForest> & blocks,
+                   {{kernel|generate_constructor_parameters(['indexVector', 'indexVectorSize'])}} )
+        : {{ kernel|generate_constructor_initializer_list(['indexVector', 'indexVectorSize']) }}
+    {
+        auto createIdxVector = []( IBlock * const , StructuredBlockStorage * const ) { return new IndexVectors(); };
+        indexVectorID = blocks->addStructuredBlockData< IndexVectors >( createIdxVector, "IndexField_{{class_name}}");
+    };
+
+    void operator() ( IBlock * block {% if target == 'gpu'%}, cudaStream_t stream = nullptr {%endif%});
+    void inner( IBlock * block {% if target == 'gpu'%}, cudaStream_t stream = nullptr {%endif%});
+    void outer( IBlock * block {% if target == 'gpu'%}, cudaStream_t stream = nullptr {%endif%});
+
+
+    template<typename FlagField_T>
+    void fillFromFlagField( const shared_ptr<StructuredBlockForest> & blocks, ConstBlockDataID flagFieldID,
+                            FlagUID boundaryFlagUID, FlagUID domainFlagUID)
+    {
+        for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
+            fillFromFlagField<FlagField_T>( &*blockIt, flagFieldID, boundaryFlagUID, domainFlagUID );
+    }
+
+
+    template<typename FlagField_T>
+    void fillFromFlagField( IBlock * block, ConstBlockDataID flagFieldID,
+                            FlagUID boundaryFlagUID, FlagUID domainFlagUID )
+    {
+        auto * indexVectors = block->getData< IndexVectors > ( indexVectorID );
+        auto & indexVectorAll = indexVectors->indexVector(IndexVectors::ALL);
+        auto & indexVectorInner = indexVectors->indexVector(IndexVectors::INNER);
+        auto & indexVectorOuter = indexVectors->indexVector(IndexVectors::OUTER);
+
+
+        auto * flagField = block->getData< FlagField_T > ( flagFieldID );
+
+        if( !(flagField->flagExists(boundaryFlagUID) && flagField->flagExists(domainFlagUID) ))
+            return;
+
+        auto boundaryFlag = flagField->getFlag(boundaryFlagUID);
+        auto domainFlag = flagField->getFlag(domainFlagUID);
+
+        auto inner = flagField->xyzSize();
+        inner.expand( cell_idx_t(-1) );
+
+
+        indexVectorAll.clear();
+        indexVectorInner.clear();
+        indexVectorOuter.clear();
+
+        for( auto it = flagField->begin(); it != flagField->end(); ++it )
+        {
+            if( ! isFlagSet(it, domainFlag) )
+                continue;
+
+            {%- for dirIdx, dirVec, offset in stencil_info %}
+            if ( isFlagSet( it.neighbor({{offset}} {%if dim == 3%}, 0 {%endif %}), boundaryFlag ) )
+            {
+                {% if inner_or_boundary -%}
+                auto element = {{StructName}}(it.x(), it.y(), {%if dim == 3%} it.z(), {%endif %} {{dirIdx}} );
+                {% else -%}
+                auto element = {{StructName}}(it.x() + cell_idx_c({{dirVec[0]}}), it.y() + cell_idx_c({{dirVec[1]}}), {%if dim == 3%} it.z() + cell_idx_c({{dirVec[2]}}), {%endif %} {{inverse_directions[dirIdx]}} );
+                {% endif -%}
+                indexVectorAll.push_back( element );
+                if( inner.contains( it.x(), it.y(), it.z() ) )
+                    indexVectorInner.push_back( element );
+                else
+                    indexVectorOuter.push_back( element );
+            }
+            {% endfor %}
+        }
+
+        indexVectors->syncGPU();
+    }
+
+private:
+    void run( IBlock * block, IndexVectors::Type type{% if target == 'gpu'%}, cudaStream_t stream = nullptr {%endif%});
+
+public:
+    BlockDataID indexVectorID;
+    {% if additional_data_functor -%}AdditionalDataFunctor_T {{ additional_data_functor }};{%- endif %}
+    {{kernel|generate_members(('indexVector', 'indexVectorSize'))|indent(4)}}
+};
+
+
+
+} // namespace {{namespace}}
+} // namespace walberla
diff --git a/pystencils_walberla/templates/Sweep.tmpl.h b/pystencils_walberla/templates/Sweep.tmpl.h
index ab65351107770245ca02ee418383dc752b8746c4..d6f212b22a9e4e10405d55d02ffbe04f4191368a 100644
--- a/pystencils_walberla/templates/Sweep.tmpl.h
+++ b/pystencils_walberla/templates/Sweep.tmpl.h
@@ -58,7 +58,7 @@ public:
 
     {{ kernel| generate_destructor(class_name) |indent(4) }}
 
-    void operator() ( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream = 0{% endif %} );
+    void operator() ( IBlock * block{%if target is equalto 'gpu'%}, cudaStream_t stream = nullptr{% endif %} );
     void runOnCellInterval(const shared_ptr<StructuredBlockStorage> & blocks,
                            const CellInterval & globalCellInterval, cell_idx_t ghostLayers, IBlock * block
                            {%if target is equalto 'gpu'%} , cudaStream_t stream = 0{% endif %});