diff --git a/pystencils_benchmark/benchmark.py b/pystencils_benchmark/benchmark.py
index 34bdf6f8704cf4366f9a1e03ef5e4130ae17a302..df6bd9b6d0c754a76ca6a44c38c1ffc2299d0505 100644
--- a/pystencils_benchmark/benchmark.py
+++ b/pystencils_benchmark/benchmark.py
@@ -6,7 +6,7 @@ from jinja2 import Environment, PackageLoader, StrictUndefined
 import numpy as np
 
 from pystencils.backends.cbackend import generate_c, get_headers
-from pystencils.astnodes import KernelFunction
+from pystencils.astnodes import KernelFunction, PragmaBlock
 from pystencils.enums import Backend
 from pystencils.typing import get_base_type
 from pystencils.sympyextensions import prod
@@ -22,7 +22,10 @@ _env = Environment(loader=PackageLoader('pystencils_benchmark'), undefined=Stric
 def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]],
                        path: Path = None,
                        *,
-                       compiler: Compiler = Compiler.GCC) -> None:
+                       compiler: Compiler = Compiler.GCC,
+                       timing: bool = True,
+                       likwid: bool = False
+                       ) -> None:
     if path is None:
         path = Path('.')
     else:
@@ -47,16 +50,17 @@ def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]],
             f.write(source)
 
     with open(src_path / 'main.c', 'w+') as f:
-        f.write(kernel_main(kernel_asts))
+        f.write(kernel_main(kernel_asts, timing=timing, likwid=likwid))
 
     copy_static_files(path)
-    compiler_toolchain(path, compiler)
+    compiler_toolchain(path, compiler, likwid)
 
 
-def compiler_toolchain(path: Path, compiler: Compiler) -> None:
+def compiler_toolchain(path: Path, compiler: Compiler, likwid: bool) -> None:
     name = compiler.name
     jinja_context = {
         'compiler': name,
+        'likwid': likwid,
     }
 
     files = ['Makefile', f'{name}.mk']
@@ -85,17 +89,19 @@ def copy_static_files(path: Path) -> None:
             f.write(template)
 
 
-def kernel_main(kernels_ast: List[KernelFunction], timing: bool = True):
+def kernel_main(kernels_ast: List[KernelFunction], *,
+                timing: bool = True, likwid: bool = False) -> str:
     """
     Return C code of a benchmark program for the given kernel.
 
     Args:
         kernels_ast: A list of the pystencils AST object as returned by create_kernel for benchmarking
         timing: add timing output to the code, prints time per iteration to stdout
+        likwid: add LIKWID marker API calls (init, register, start, stop, close) to the code
     Returns:
         C code as string
     """
-    Kernel = namedtuple('Kernel', ['name', 'constants', 'fields', 'call_parameters', 'call_argument_list'])
+    Kernel = namedtuple('Kernel', ['name', 'constants', 'fields', 'call_parameters', 'call_argument_list', 'openmp'])
     kernels = []
     includes = set()
     for kernel in kernels_ast:
@@ -104,6 +110,8 @@ def kernel_main(kernels_ast: List[KernelFunction], timing: bool = True):
         constants = []
         fields = []
         call_parameters = []
+        # TODO: Find a more robust way to detect OpenMP than checking for a leading PragmaBlock
+        openmp = isinstance(kernel.body.args[0], PragmaBlock)
         for p in kernel.get_parameters():
             if not p.is_field_parameter:
                 constants.append((p.symbol.name, str(p.symbol.dtype)))
@@ -129,15 +137,18 @@ def kernel_main(kernels_ast: List[KernelFunction], timing: bool = True):
                     size = dim1_size * padding_bytes + np.prod(field.shape) * size_data_type
 
                     assert align % np_dtype.itemsize == 0
-                    offset = ((dim0_size + padding_elements + ghost_layers) % kernel.instruction_set['width']) * size_data_type
+                    offset = ((dim0_size + padding_elements + ghost_layers) %
+                              kernel.instruction_set['width']) * size_data_type
                 else:
                     size = elements * size_data_type
                     offset = 0
                     align = 0
                 fields.append((p.field_name, dtype, elements, size, offset, align))
                 call_parameters.append(p.field_name)
+
+        # TODO: Revisit how the `openmp` flag passed to the template is detected
         kernels.append(Kernel(name=name, fields=fields, constants=constants, call_parameters=call_parameters,
-                              call_argument_list=",".join(call_parameters)))
+                              call_argument_list=",".join(call_parameters), openmp=openmp))
 
         includes.add(name)
 
@@ -145,6 +156,7 @@ def kernel_main(kernels_ast: List[KernelFunction], timing: bool = True):
         'kernels': kernels,
         'includes': includes,
         'timing': timing,
+        'likwid': likwid,
     }
 
     main = _env.get_template('main.c').render(**jinja_context)
diff --git a/pystencils_benchmark/templates/Clang.mk b/pystencils_benchmark/templates/Clang.mk
index 96296cc6d0aff6545ee8457aa601c86265b868a4..15bfe63d81ddcb58dd44b054dc8e477c435fb12c 100644
--- a/pystencils_benchmark/templates/Clang.mk
+++ b/pystencils_benchmark/templates/Clang.mk
@@ -12,7 +12,7 @@ CFLAGS   = -O3 -Wno-format -Wall -Werror $(ANSI_CFLAGS) -fopenmp -march=native
 # Maybe too much warnings
 #CFLAGS   += -Wcast-qual -Wswitch-default -Wconversion -Wunreachable-code
 # Specific C flags
-CFLAGS   := $(CFLAGS) -Wstrict-prototypes
+CFLAGS   := $(CFLAGS) -Wstrict-prototypes -Wno-error=strict-prototypes
 LFLAGS   = -fopenmp=libomp
 DEFINES  = -D_GNU_SOURCE -DNDEBUG
 INCLUDES =
diff --git a/pystencils_benchmark/templates/GCC.mk b/pystencils_benchmark/templates/GCC.mk
index a65ec464c663bfe23a90e5756203ede9eeb4d8de..7dccdfc1f6b35e8630e21966b3a35aaf35fa8841 100644
--- a/pystencils_benchmark/templates/GCC.mk
+++ b/pystencils_benchmark/templates/GCC.mk
@@ -12,7 +12,7 @@ CFLAGS   = -O3 -Wno-format -Wall -Werror $(ANSI_CFLAGS) -fopenmp -march=native
 # Maybe too much warnings
 #CFLAGS   += -Wcast-qual -Wswitch-default -Wconversion -Wunreachable-code
 # Specific C flags
-CFLAGS   := $(CFLAGS) -Wstrict-prototypes
+CFLAGS   := $(CFLAGS) -Wstrict-prototypes -Wno-error=strict-prototypes
 LFLAGS   = -fopenmp
 DEFINES  = -D_GNU_SOURCE -DNDEBUG
 INCLUDES =
diff --git a/pystencils_benchmark/templates/Makefile b/pystencils_benchmark/templates/Makefile
index 98fcaaa19d9a753fd346da9480fd4935d112f2e5..66b68b8d009137ffe0a3069d0f9e5e5a5f65d550 100644
--- a/pystencils_benchmark/templates/Makefile
+++ b/pystencils_benchmark/templates/Makefile
@@ -7,10 +7,25 @@ SRC_DIR    = ./src
 MAKE_DIR   = ./
 Q         ?= @
 
+{% if likwid %}
+# LIKWID DEFINES
+LIKWID_DEFINES := -DLIKWID_PERFMON
+LIKWID_PATH = $(shell dirname $(shell which likwid-perfctr))
+LIKWID_LIB := -L$(LIKWID_PATH)/../lib/
+LIKWID_INC := -I$(LIKWID_PATH)/../include/
+{% endif %}
+
 #DO NOT EDIT BELOW
 include $(MAKE_DIR)/$(TAG).mk
 INCLUDES  += -I./include
 
+{% if likwid %}
+INCLUDES += $(LIKWID_INC)
+DEFINES += $(LIKWID_DEFINES)
+LFLAGS += $(LIKWID_LIB)
+LIBS += -llikwid
+{% endif %}
+
 VPATH     = $(SRC_DIR)
 ASM       = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c))
 OBJ       = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.c))
diff --git a/pystencils_benchmark/templates/main.c b/pystencils_benchmark/templates/main.c
index a4c462d598ba4c88ad0f63c0afb66d1962c07b7f..cec0f0eb6c50c360879d72b7fc817d78126adb9f 100644
--- a/pystencils_benchmark/templates/main.c
+++ b/pystencils_benchmark/templates/main.c
@@ -6,6 +6,9 @@
 #include <stdlib.h>
 
 #include "timing.h"
+{% if likwid %}
+#include <likwid-marker.h>
+{% endif %}
 
 //kernels
 {% for include in includes %}
@@ -22,6 +25,10 @@ int main(int argc, char **argv)
         return -1;
     }
     int n_repeat = atoi(argv[1]);
+    {% if likwid %}
+    LIKWID_MARKER_INIT;
+    {%- endif %}
+
     {% for kernel in kernels %}
 
     { // Kernel: {{kernel.name}}
@@ -32,6 +39,9 @@ int main(int argc, char **argv)
         {% else %}
         {{dataType}}* {{field_name}} = ({{dataType}} *) malloc({{size}});
         {% endif %}
+        {% if kernel.openmp %}
+        #pragma omp parallel for schedule(static)
+        {% endif %}
         for (unsigned long long i = 0; i < {{elements}}; ++i)
             {{field_name}}[i] = 0.23;
         {% endfor %}
@@ -42,10 +52,31 @@ int main(int argc, char **argv)
         {{constantName}} = 0.23;
         {% endfor %}
 
+        {% if likwid %}
+        {% if kernel.openmp %}
+        #pragma omp parallel
+        {
+        {% endif %}
+        LIKWID_MARKER_REGISTER("{{kernel.name}}");
+        {% if kernel.openmp %}
+        }
+        {% endif %}
+        {% endif %}
+
         for(int warmup = 1; warmup >= 0; --warmup) {
             int repeat = 2;
             if(warmup == 0) {
                 repeat = n_repeat;
+                {% if likwid %}
+                {% if kernel.openmp %}
+                #pragma omp parallel
+                {
+                {% endif %}
+                LIKWID_MARKER_START("{{kernel.name}}");
+                {% if kernel.openmp %}
+                }
+                {% endif %}
+                {% endif %}
             }
 
             {% if timing %}
@@ -65,9 +96,25 @@ int main(int argc, char **argv)
                 printf("%s\t%e\n", "{{kernel.name}}",(wcEndTime - wcStartTime) / n_repeat );
             {% endif %}
         }
+
+        {% if likwid %}
+        {% if kernel.openmp %}
+        #pragma omp parallel
+        {
+        {% endif %}
+        LIKWID_MARKER_STOP("{{kernel.name}}");
+        {% if kernel.openmp %}
+        }
+        {% endif %}
+        {% endif %}
+
         {% for field_name, dataType, elements, size, offset, alignment in kernel.fields %}
         free({{field_name}});
         {% endfor %}
     }
     {% endfor %}
+
+    {% if likwid %}
+    LIKWID_MARKER_CLOSE;
+    {% endif %}
 }