diff --git a/generate_all_operators.py b/generate_all_operators.py
index bb20e9a895a33f24d1511102804dc78758b9bcdc..6a4c65f3a18a87fc1a4ec440cb334826085562d5 100644
--- a/generate_all_operators.py
+++ b/generate_all_operators.py
@@ -46,7 +46,6 @@ from hog.forms import (
     shear_heating,
     epsilon,
     full_stokes,
-    nonlinear_diffusion,
     nonlinear_diffusion_newton_galerkin,
     supg_diffusion,
     advection,
@@ -74,7 +73,7 @@ from hog.operator_generation.kernel_types import (
     ApplyWrapper,
     AssembleWrapper,
     AssembleDiagonalWrapper,
-    KernelWrapperType, LumpWrapper, AssembleNonInverseDiagonalWrapper
+    KernelWrapperType, LumpWrapper
 )
 from hog.operator_generation.loop_strategies import (
     LoopStrategy,
@@ -575,10 +574,11 @@ class OperatorInfo:
                 )
 
                 self.kernel_types.append(
-                    AssembleNonInverseDiagonalWrapper(
+                    AssembleDiagonalWrapper(
                         self.trial_space,
                         type_descriptor=self.type_descriptor,
                         dims=dims,
+                        invert=False,
                     )
                 )
 
@@ -634,10 +634,6 @@ def all_operators(
                             form=partial(shear_heating, viscosity_function_space=P2, velocity_function_space=P2),
                             type_descriptor=type_descriptor, geometries=list(geometries), opts=opts, blending=blending))
 
-    ops.append(OperatorInfo("P1", "NonlinearDiffusion", TrialSpace(P1), TestSpace(P1),
-                            form=partial(nonlinear_diffusion, coefficient_function_space=P1),
-                            type_descriptor=type_descriptor, geometries=list(geometries), opts=opts, blending=blending))
-
     ops.append(OperatorInfo("P1", "NonlinearDiffusionNewtonGalerkin", TrialSpace(P1),
                             TestSpace(P1), form=partial(nonlinear_diffusion_newton_galerkin,
                             coefficient_function_space=P1),
@@ -658,7 +654,6 @@ def all_operators(
                             coefficient_function_space=P1),
                             type_descriptor=type_descriptor, geometries=list(geometries), opts=opts, blending=blending))
 
-
     ops.append(OperatorInfo("P1", "NonlinearDiffusionNewtonGalerkinApproximatedMassLumped", TrialSpace(P1),
                             TestSpace(P1), form=partial(nonlinear_diffusion_newton_galerkin_approximated_mass_lumped,
                                                         coefficient_function_space=P1),
diff --git a/hog/forms.py b/hog/forms.py
index 357f0e2987ba5f775a9b2972bf40240cac15aca9..3038b6b4b0dade85a0b115dc7e8622137cd66a39 100644
--- a/hog/forms.py
+++ b/hog/forms.py
@@ -174,72 +174,6 @@ Weak formulation
         docstring=docstring,
     )
 
-
-def nonlinear_diffusion(
-    trial: TrialSpace,
-    test: TestSpace,
-    geometry: ElementGeometry,
-    symbolizer: Symbolizer,
-    coefficient_function_space: FunctionSpace,
-    blending: GeometryMap = IdentityMap(),
-) -> Form:
-    docstring = f"""
-Diffusion operator with coefficient function.
-
-Weak formulation
-
-    u: trial function (space: {trial})
-    v: test function  (space: {test})
-    c: FE coefficient function (space: {coefficient_function_space})
-    
-    ∫ a(c) ∇u · ∇v
-
-Note: :math:`a(c) = 1/8 + u^2` is currently hard-coded and the form is intended for :math:`c = u`,
-      hence, the naming, but will work for other coefficients, too.
-"""
-
-    if blending != IdentityMap():
-        raise HOGException(
-            "The nonlinear-diffusion form does currently not support blending."
-        )
-
-    if trial != test:
-        raise HOGException(
-            "Trial space must be equal to test space to assemble non-linear diffusion matrix."
-        )
-
-    def integrand(
-        *,
-        jac_a_inv,
-        jac_a_abs_det,
-        grad_u,
-        grad_v,
-        k,
-        tabulate,
-        **_,
-    ):
-        a = sp.Rational(1, 8) + k["u"] * k["u"]
-        return a * tabulate(
-            double_contraction(
-                jac_a_inv.T * grad_u,
-                jac_a_inv.T * grad_v,
-            )
-            * jac_a_abs_det
-        )
-
-    return process_integrand(
-        integrand,
-        trial,
-        test,
-        geometry,
-        symbolizer,
-        blending=blending,
-        is_symmetric=trial == test,
-        docstring=docstring,
-        fe_coefficients={"u": coefficient_function_space},
-    )
-
-
 def nonlinear_diffusion_newton_galerkin(
     trial: TrialSpace,
     test: TestSpace,
@@ -250,7 +184,7 @@ def nonlinear_diffusion_newton_galerkin(
 ) -> Form:
     docstring = f"""
 
-Bi-linear form for the solution of the non-linear diffusion equation by a Newton-Galerkin approach
+Bi-linear form for the solution of the non-linear diffusion equation by a Newton-Galerkin approach.
 
 Weak formulation
 
@@ -322,7 +256,7 @@ def nonlinear_diffusion_stiffness(
         use_a_derivative: bool = False,
 ) -> Form:
     docstring = f"""
-todo
+Bi-linear form of the stiffness operator for the solution of the non-linear diffusion equation by a Newton-Galerkin approach.
 
 Weak formulation
 
@@ -393,7 +327,8 @@ def nonlinear_diffusion_mass_lumping_template(
         blending: GeometryMap = IdentityMap(),
 ) -> Form:
     docstring = f"""
-todo
+Bi-linear form for a part of a mass lumped operator for the solution of the non-linear diffusion equation by a Newton-Galerkin approach.
+This has to be additively composed with a stiffness operator to form the actual mass lumped operator.
 
 Weak formulation
 
@@ -401,9 +336,7 @@ Weak formulation
     v: test function  (space: {test})
     k: FE coefficient function (space: {coefficient_function_space})
 
-    ∫ a(k) ∇k · ∇v
-
-    or ∫ a'(k) ∇u · ∇v if use_a_derivative was set
+    ∫ a'(k) ∇k · ∇v
 
 Note: :math:`a(k) = 1/8 + k^2` is currently hard-coded and the form is intended for :math:`k = u`.
 """
@@ -459,9 +392,9 @@ def nonlinear_diffusion_newton_galerkin_approximated_mass_lumped(
 ) -> Form:
     docstring = f"""
 
-Bi-linear form with approximated mass lumping on the asymmetric part for the solution of the non-linear diffusion equation by a Newton-Galerkin approach
+Bi-linear form with approximated mass lumping on the asymmetric part for the solution of the non-linear diffusion equation by a Newton-Galerkin approach.
 This just uses the approximation 9.25 from Larson & Bengzon. This is not the actual mass lumping approach.
-
+Its convergence results are quite bad and it needs an altered interface for its integrand function, so this should probably get removed.
 
 Weak formulation
 
diff --git a/hog/operator_generation/kernel_types.py b/hog/operator_generation/kernel_types.py
index 36d98c77f802e78ef42eaaec02029c28bfbe6e2e..a50795fd51e9fa51741531d20f92307ca9159a95 100644
--- a/hog/operator_generation/kernel_types.py
+++ b/hog/operator_generation/kernel_types.py
@@ -606,14 +606,27 @@ class GEMVWrapper(KernelWrapperType):
 
 
 class AssembleDiagonalWrapper(KernelWrapperType):
+    invert: bool
+
     def __init__(
         self,
         fe_space: FunctionSpace,
         type_descriptor: HOGType,
-        dst_field: str = "invDiag_",
+        dst_field: str | None = None,
         dims: List[int] = [2, 3],
+        invert: bool = True
     ):
-        self.name = "computeInverseDiagonalOperatorValues"
+        if invert:
+            self.name = "computeInverseDiagonalOperatorValues"
+        else:
+            self.name = "computeDiagonalOperatorValues"
+
+        if dst_field is None:
+            if invert:
+                dst_field = "invDiag_"
+            else:
+                dst_field = "diag_"
+
         self.dst: FunctionSpaceImpl = create_impl(
             fe_space, dst_field, type_descriptor, is_pointer=True
         )
@@ -621,6 +634,8 @@ class AssembleDiagonalWrapper(KernelWrapperType):
         self.dst_fields = [self.dst]
         self.dims = dims
 
+        self.invert = invert
+
         def macro_loop(dim: int) -> str:
             Macro = {2: "Face", 3: "Cell"}[dim]
             macro = {2: "face", 3: "cell"}[dim]
@@ -653,9 +668,12 @@ class AssembleDiagonalWrapper(KernelWrapperType):
             else:
                 return 'WALBERLA_ABORT( "Not implemented." );'
 
-        fnc_name = "inverse diagonal entries"
+        if invert:
+            fnc_name = "inverse diagonal entries"
+        else:
+            fnc_name = "diagonal entries"
 
-        self._template = Template(
+        template_str =(
             f'this->startTiming( "{self.name}" );\n'
             f"\n"
             f"if ( {self.dst.name} == nullptr )\n"
@@ -674,8 +692,12 @@ class AssembleDiagonalWrapper(KernelWrapperType):
             f"        $comm_fe_functions_3D\n"
             f'        this->timingTree_->stop( "pre-communication" );\n'
             f"\n"
-            f"{indent(macro_loop(3), 2 * INDENT)}\n"
-            f"{indent(self.dst.invert_elementwise(3), 2 * INDENT)}\n"
+            f"{indent(macro_loop(3), 2 * INDENT)}\n")
+
+        if self.invert:
+            template_str += f"{indent(self.dst.invert_elementwise(3), 2 * INDENT)}\n"
+
+        template_str += (
             f"    }}\n"
             f"    else\n"
             f"    {{\n"
@@ -684,131 +706,12 @@ class AssembleDiagonalWrapper(KernelWrapperType):
             f'        this->timingTree_->stop( "pre-communication" );\n'
             f"\n"
             f"{indent(macro_loop(2), 2 * INDENT)}\n"
-            f"{indent(self.dst.invert_elementwise(2), 2 * INDENT)}\n"
-            f"    }}\n"
-            f"\n"
-            f"}}\n"
-            f"\n"
-            f'this->stopTiming( "{self.name}" );'
         )
 
-    @property
-    def kernel_type(self) -> KernelType:
-        return AssembleDiagonal()
-
-    def includes(self) -> Set[str]:
-        return {"hyteg/solvers/Smoothables.hpp"} | self.dst.includes()
+        if self.invert:
+            template_str += f"{indent(self.dst.invert_elementwise(2), 2 * INDENT)}\n"
 
-    def base_classes(self) -> List[str]:
-        return [
-            f"public OperatorWithInverseDiagonal< {self.dst.func_type_string()} >",
-        ]
-
-    def wrapper_methods(self) -> List[CppMethod]:
-        return [
-            CppMethod(
-                name=self.name,
-                arguments=[],
-                return_type="void",
-                content=self._template.template,
-            ),
-            CppMethod(
-                name="getInverseDiagonalValues",
-                arguments=[],
-                return_type=f"std::shared_ptr< {self.dst.func_type_string()} >",
-                is_const=True,
-                content=f"return {self.dst.name};",
-            ),
-        ]
-
-    def member_variables(self) -> List[CppMemberVariable]:
-        return [
-            CppMemberVariable(
-                variable=CppVariable(
-                    name=self.dst.name,
-                    type=f"std::shared_ptr< {self.dst.func_type_string()} >",
-                )
-            ),
-        ]
-
-class AssembleNonInverseDiagonalWrapper(KernelWrapperType):
-    def __init__(
-        self,
-        fe_space: FunctionSpace,
-        type_descriptor: HOGType,
-        dst_field: str = "diag_",
-        dims: List[int] = [2, 3],
-    ):
-        self.name = "computeDiagonalOperatorValues"
-        self.dst: FunctionSpaceImpl = create_impl(
-            fe_space, dst_field, type_descriptor, is_pointer=True
-        )
-        self.src_fields = []
-        self.dst_fields = [self.dst]
-        self.dims = dims
-
-        def macro_loop(dim: int) -> str:
-            Macro = {2: "Face", 3: "Cell"}[dim]
-            macro = {2: "face", 3: "cell"}[dim]
-
-            if dim in dims:
-                return (
-                    f"for ( auto& it : storage_->get{Macro}s() )\n"
-                    f"{{\n"
-                    f"    {Macro}& {macro} = *it.second;\n"
-                    f"\n"
-                    f"    // get hold of the actual numerical data\n"
-                    f"{indent(self.dst.pointer_retrieval(dim), INDENT)}\n"
-                    f"    $pointer_retrieval_{dim}D\n"
-                    f"\n"
-                    f"    $scalar_parameter_setup_{dim}D\n"
-                    f"\n"
-                    f'    this->timingTree_->start( "kernel" );\n'
-                    f"    $kernel_function_call_{dim}D\n"
-                    f'    this->timingTree_->stop( "kernel" );\n'
-                    f"}}\n"
-                    f"\n"
-                    f"// Push result to lower-dimensional primitives\n"
-                    f"//\n"
-                    f'this->timingTree_->start( "post-communication" );\n'
-                    f"// Note: We could avoid communication here by implementing the apply() also for the respective\n"
-                    f"//       lower dimensional primitives!\n"
-                    f"{self.dst.post_communication(dim, 'level', transform_basis=False)}\n"
-                    f'this->timingTree_->stop( "post-communication" );'
-                )
-            else:
-                return 'WALBERLA_ABORT( "Not implemented." );'
-
-        fnc_name = "diagonal entries"
-
-        self._template = Template(
-            f'this->startTiming( "{self.name}" );\n'
-            f"\n"
-            f"if ( {self.dst.name} == nullptr )\n"
-            f"{{\n"
-            f"    {self.dst.name} =\n"
-            f'        std::make_shared< {self.dst.func_type_string()} >( "{fnc_name}", storage_, minLevel_, maxLevel_ );\n'
-            f"}}\n"
-            f"\n"
-            f"for ( uint_t level = minLevel_; level <= maxLevel_; level++ )\n"
-            f"{{\n"
-            f"    {self.dst.name}->setToZero( level );\n"
-            f"\n"
-            f"    if ( storage_->hasGlobalCells() )\n"
-            f"    {{\n"
-            f'        this->timingTree_->start( "pre-communication" );\n'
-            f"        $comm_fe_functions_3D\n"
-            f'        this->timingTree_->stop( "pre-communication" );\n'
-            f"\n"
-            f"{indent(macro_loop(3), 2 * INDENT)}\n"
-            f"    }}\n"
-            f"    else\n"
-            f"    {{\n"
-            f'        this->timingTree_->start( "pre-communication" );\n'
-            f"        $comm_fe_functions_2D\n"
-            f'        this->timingTree_->stop( "pre-communication" );\n'
-            f"\n"
-            f"{indent(macro_loop(2), 2 * INDENT)}\n"
+        template_str += (
             f"    }}\n"
             f"\n"
             f"}}\n"
@@ -816,6 +719,8 @@ class AssembleNonInverseDiagonalWrapper(KernelWrapperType):
             f'this->stopTiming( "{self.name}" );'
         )
 
+        self._template = Template(template_str)
+
     @property
     def kernel_type(self) -> KernelType:
         return AssembleDiagonal()
@@ -824,11 +729,22 @@ class AssembleNonInverseDiagonalWrapper(KernelWrapperType):
         return {"hyteg/solvers/Smoothables.hpp"} | self.dst.includes()
 
     def base_classes(self) -> List[str]:
+        if self.invert:
+            interface = "OperatorWithInverseDiagonal"
+        else:
+            interface = "OperatorWithDiagonal"
+
         return [
-            f"public OperatorWithDiagonal< {self.dst.func_type_string()} >",
+            f"public {interface}< {self.dst.func_type_string()} >",
         ]
 
+
     def wrapper_methods(self) -> List[CppMethod]:
+        if self.invert:
+            getter_name = "getInverseDiagonalValues"
+        else:
+            getter_name = "getDiagonalValues"
+
         return [
             CppMethod(
                 name=self.name,
@@ -837,7 +753,7 @@ class AssembleNonInverseDiagonalWrapper(KernelWrapperType):
                 content=self._template.template,
             ),
             CppMethod(
-                name="getDiagonalValues",
+                name=getter_name,
                 arguments=[],
                 return_type=f"std::shared_ptr< {self.dst.func_type_string()} >",
                 is_const=True,
@@ -855,7 +771,6 @@ class AssembleNonInverseDiagonalWrapper(KernelWrapperType):
             ),
         ]
 
-
 class AssembleWrapper(KernelWrapperType):
     def __init__(
         self,
@@ -985,94 +900,12 @@ class AssembleWrapper(KernelWrapperType):
     def member_variables(self) -> List[CppMemberVariable]:
         return []
 
-class LumpWrapper(KernelWrapperType):
-    def __init__(
-            self,
-            fe_space: FunctionSpace,
-            type_descriptor: HOGType,
-            dst_field: str = "lumpedDiag_",
-            dims: List[int] = [2, 3],
-    ):
-        self.name = "computeLumpedDiagonalValues"
-        self.dst: FunctionSpaceImpl = create_impl(
-            fe_space, dst_field, type_descriptor, is_pointer=True
-        )
-        self.src_fields = []
-        self.dst_fields = [self.dst]
-        self.dims = dims
-
-        def macro_loop(dim: int) -> str:
-            Macro = {2: "Face", 3: "Cell"}[dim]
-            macro = {2: "face", 3: "cell"}[dim]
+class LumpWrapper(AssembleDiagonalWrapper):
+    def __init__(self, fe_space: FunctionSpace, type_descriptor: HOGType, dst_field: str = "lumpedDiag_",
+                 dims: List[int] = [2, 3]):
 
-            if dim in dims:
-                return (
-                    f"for ( auto& it : storage_->get{Macro}s() )\n"
-                    f"{{\n"
-                    f"    {Macro}& {macro} = *it.second;\n"
-                    f"\n"
-                    f"    // get hold of the actual numerical data\n"
-                    f"{indent(self.dst.pointer_retrieval(dim), INDENT)}\n"
-                    f"    $pointer_retrieval_{dim}D\n"
-                    f"\n"
-                    f"    $scalar_parameter_setup_{dim}D\n"
-                    f"\n"
-                    f'    this->timingTree_->start( "kernel" );\n'
-                    f"    $kernel_function_call_{dim}D\n"
-                    f'    this->timingTree_->stop( "kernel" );\n'
-                    f"}}\n"
-                    f"\n"
-                    f"// Push result to lower-dimensional primitives\n"
-                    f"//\n"
-                    f'this->timingTree_->start( "post-communication" );\n'
-                    f"// Note: We could avoid communication here by implementing the apply() also for the respective\n"
-                    f"//       lower dimensional primitives!\n"
-                    f"{self.dst.post_communication(dim, 'level', transform_basis=False)}\n"
-                    f'this->timingTree_->stop( "post-communication" );'
-                )
-            else:
-                return 'WALBERLA_ABORT( "Not implemented." );'
-
-        fnc_name = "lumped diagonal entries"
-
-        self._template = Template(
-            f'this->startTiming( "{self.name}" );\n'
-            f"\n"
-            f"if ( {self.dst.name} == nullptr )\n"
-            f"{{\n"
-            f"    {self.dst.name} =\n"
-            f'        std::make_shared< {self.dst.func_type_string()} >( "{fnc_name}", storage_, minLevel_, maxLevel_ );\n'
-            f"}}\n"
-            f"\n"
-            f"for ( uint_t level = minLevel_; level <= maxLevel_; level++ )\n"
-            f"{{\n"
-            f"    {self.dst.name}->setToZero( level );\n"
-            f"\n"
-            f"    if ( storage_->hasGlobalCells() )\n"
-            f"    {{\n"
-            f'        this->timingTree_->start( "pre-communication" );\n'
-            f"        $comm_fe_functions_3D\n"
-            f'        this->timingTree_->stop( "pre-communication" );\n'
-            f"\n"
-            f"{indent(macro_loop(3), 2 * INDENT)}\n"
-            # todo maybe keep as an option
-            #f"{indent(self.dst.invert_elementwise(3), 2 * INDENT)}\n"
-            f"    }}\n"
-            f"    else\n"
-            f"    {{\n"
-            f'        this->timingTree_->start( "pre-communication" );\n'
-            f"        $comm_fe_functions_2D\n"
-            f'        this->timingTree_->stop( "pre-communication" );\n'
-            f"\n"
-            f"{indent(macro_loop(2), 2 * INDENT)}\n"
-            # todo maybe keep as an option
-            #f"{indent(self.dst.invert_elementwise(2), 2 * INDENT)}\n"
-            f"    }}\n"
-            f"\n"
-            f"}}\n"
-            f"\n"
-            f'this->stopTiming( "{self.name}" );'
-        )
+        super().__init__(fe_space, type_descriptor, dst_field, dims, False)
+        self.name = "computeLumpedDiagonalValues"
 
     @property
     def kernel_type(self) -> KernelType: