diff --git a/generate_all_operators.py b/generate_all_operators.py
index e8b67fe6c4d0d32b4be143f3ec971280a13d8d4c..e92c28af80f9a8b5dac97ef16451d7430cfe3b79 100644
--- a/generate_all_operators.py
+++ b/generate_all_operators.py
@@ -53,7 +53,7 @@ from hog.forms import (
     supg_advection,
     grad_rho_by_rho_dot_u,
     nonlinear_diffusion_stiffness,
-    nonlinear_diffusion_newton_galerkin_approximated_mass_lumped,
+    nonlinear_diffusion_newton_galerkin_approximated_mass_lumped
 )
 from hog.forms_boundary import (
     mass_boundary,
@@ -74,7 +74,7 @@ from hog.operator_generation.kernel_types import (
     ApplyWrapper,
     AssembleWrapper,
     AssembleDiagonalWrapper,
-    KernelWrapperType, LumpWrapper,
+    KernelWrapperType, LumpWrapper, AssembleNonInverseDiagonalWrapper
 )
 from hog.operator_generation.loop_strategies import (
     LoopStrategy,
@@ -574,13 +574,21 @@ class OperatorInfo:
                     )
                 )
 
-            self.kernel_types.append(
-                LumpWrapper(
-                    self.trial_space,
-                    type_descriptor=self.type_descriptor,
-                    dims=dims,
+                self.kernel_types.append(
+                    AssembleNonInverseDiagonalWrapper(
+                        self.trial_space,
+                        type_descriptor=self.type_descriptor,
+                        dims=dims,
+                    )
+                )
+
+                self.kernel_types.append(
+                    LumpWrapper(
+                        self.trial_space,
+                        type_descriptor=self.type_descriptor,
+                        dims=dims,
+                    )
                 )
-            )
 
 
 def all_operators(
@@ -635,12 +643,12 @@ def all_operators(
                             coefficient_function_space=P1),
                             type_descriptor=type_descriptor, geometries=list(geometries), opts=opts, blending=blending))
 
-    ops.append(OperatorInfo("P1", "NonlinearDiffusionNewtonGalerkinStiffness", TrialSpace(P1),
+    ops.append(OperatorInfo("P1", "NonlinearDiffusionStiffness", TrialSpace(P1),
                             TestSpace(P1), form=partial(nonlinear_diffusion_stiffness,
                             coefficient_function_space=P1),
                             type_descriptor=type_descriptor, geometries=list(geometries), opts=opts, blending=blending))
 
-    ops.append(OperatorInfo("P1", "NonlinearDiffusionNewtonGalerkinStiffnessDifferentiated", TrialSpace(P1),
+    ops.append(OperatorInfo("P1", "NonlinearDiffusionStiffnessDifferentiated", TrialSpace(P1),
                             TestSpace(P1), form=partial(nonlinear_diffusion_stiffness,
                             coefficient_function_space=P1, use_a_derivative=True),
                             type_descriptor=type_descriptor, geometries=list(geometries), opts=opts, blending=blending))
diff --git a/hog/operator_generation/kernel_types.py b/hog/operator_generation/kernel_types.py
index 3b1a42e1a2b37a4d578404750ed01d5643ff0be4..36d98c77f802e78ef42eaaec02029c28bfbe6e2e 100644
--- a/hog/operator_generation/kernel_types.py
+++ b/hog/operator_generation/kernel_types.py
@@ -731,6 +731,130 @@ class AssembleDiagonalWrapper(KernelWrapperType):
             ),
         ]
 
+class AssembleNonInverseDiagonalWrapper(KernelWrapperType):
+    def __init__(
+        self,
+        fe_space: FunctionSpace,
+        type_descriptor: HOGType,
+        dst_field: str = "diag_",
+        dims: List[int] = [2, 3],
+    ):
+        self.name = "computeDiagonalOperatorValues"
+        self.dst: FunctionSpaceImpl = create_impl(
+            fe_space, dst_field, type_descriptor, is_pointer=True
+        )
+        self.src_fields = []
+        self.dst_fields = [self.dst]
+        self.dims = dims
+
+        def macro_loop(dim: int) -> str:
+            Macro = {2: "Face", 3: "Cell"}[dim]
+            macro = {2: "face", 3: "cell"}[dim]
+
+            if dim in dims:
+                return (
+                    f"for ( auto& it : storage_->get{Macro}s() )\n"
+                    f"{{\n"
+                    f"    {Macro}& {macro} = *it.second;\n"
+                    f"\n"
+                    f"    // get hold of the actual numerical data\n"
+                    f"{indent(self.dst.pointer_retrieval(dim), INDENT)}\n"
+                    f"    $pointer_retrieval_{dim}D\n"
+                    f"\n"
+                    f"    $scalar_parameter_setup_{dim}D\n"
+                    f"\n"
+                    f'    this->timingTree_->start( "kernel" );\n'
+                    f"    $kernel_function_call_{dim}D\n"
+                    f'    this->timingTree_->stop( "kernel" );\n'
+                    f"}}\n"
+                    f"\n"
+                    f"// Push result to lower-dimensional primitives\n"
+                    f"//\n"
+                    f'this->timingTree_->start( "post-communication" );\n'
+                    f"// Note: We could avoid communication here by implementing the apply() also for the respective\n"
+                    f"//       lower dimensional primitives!\n"
+                    f"{self.dst.post_communication(dim, 'level', transform_basis=False)}\n"
+                    f'this->timingTree_->stop( "post-communication" );'
+                )
+            else:
+                return 'WALBERLA_ABORT( "Not implemented." );'
+
+        fnc_name = "diagonal entries"
+
+        self._template = Template(
+            f'this->startTiming( "{self.name}" );\n'
+            f"\n"
+            f"if ( {self.dst.name} == nullptr )\n"
+            f"{{\n"
+            f"    {self.dst.name} =\n"
+            f'        std::make_shared< {self.dst.func_type_string()} >( "{fnc_name}", storage_, minLevel_, maxLevel_ );\n'
+            f"}}\n"
+            f"\n"
+            f"for ( uint_t level = minLevel_; level <= maxLevel_; level++ )\n"
+            f"{{\n"
+            f"    {self.dst.name}->setToZero( level );\n"
+            f"\n"
+            f"    if ( storage_->hasGlobalCells() )\n"
+            f"    {{\n"
+            f'        this->timingTree_->start( "pre-communication" );\n'
+            f"        $comm_fe_functions_3D\n"
+            f'        this->timingTree_->stop( "pre-communication" );\n'
+            f"\n"
+            f"{indent(macro_loop(3), 2 * INDENT)}\n"
+            f"    }}\n"
+            f"    else\n"
+            f"    {{\n"
+            f'        this->timingTree_->start( "pre-communication" );\n'
+            f"        $comm_fe_functions_2D\n"
+            f'        this->timingTree_->stop( "pre-communication" );\n'
+            f"\n"
+            f"{indent(macro_loop(2), 2 * INDENT)}\n"
+            f"    }}\n"
+            f"\n"
+            f"}}\n"
+            f"\n"
+            f'this->stopTiming( "{self.name}" );'
+        )
+
+    @property
+    def kernel_type(self) -> KernelType:
+        return AssembleDiagonal()
+
+    def includes(self) -> Set[str]:
+        return {"hyteg/solvers/Smoothables.hpp"} | self.dst.includes()
+
+    def base_classes(self) -> List[str]:
+        return [
+            f"public OperatorWithDiagonal< {self.dst.func_type_string()} >",
+        ]
+
+    def wrapper_methods(self) -> List[CppMethod]:
+        return [
+            CppMethod(
+                name=self.name,
+                arguments=[],
+                return_type="void",
+                content=self._template.template,
+            ),
+            CppMethod(
+                name="getDiagonalValues",
+                arguments=[],
+                return_type=f"std::shared_ptr< {self.dst.func_type_string()} >",
+                is_const=True,
+                content=f"return {self.dst.name};",
+            ),
+        ]
+
+    def member_variables(self) -> List[CppMemberVariable]:
+        return [
+            CppMemberVariable(
+                variable=CppVariable(
+                    name=self.dst.name,
+                    type=f"std::shared_ptr< {self.dst.func_type_string()} >",
+                )
+            ),
+        ]
+
 
 class AssembleWrapper(KernelWrapperType):
     def __init__(