diff --git a/pystencils/kerncraft_coupling/generate_benchmark.py b/pystencils/kerncraft_coupling/generate_benchmark.py
index f63be049a1fcfb2e6a5867bc8f7dc94123759165..ec89c59d268e09079043286f2fcfa84b07ce546f 100644
--- a/pystencils/kerncraft_coupling/generate_benchmark.py
+++ b/pystencils/kerncraft_coupling/generate_benchmark.py
@@ -1,4 +1,5 @@
 from jinja2 import Template
+import numpy as np
 import os
 import subprocess
 from pystencils.include import get_pystencils_include_path
@@ -8,6 +9,25 @@ from pystencils.sympyextensions import prod
 from pystencils.data_types import get_base_type
 from pystencils.astnodes import PragmaBlock
 
+include_template = Template("""
+#include "kerncraft.h"
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <stdio.h>
+
+{{ includes }}
+{%- if likwid %}
+#include <likwid.h>
+{%- endif %}
+
+#define RESTRICT __restrict__
+#define FUNC_PREFIX
+
+
+""")
+
 benchmark_template = Template("""
 #include "kerncraft.h"
 #include <stdlib.h>
@@ -28,9 +48,42 @@ void dummy(void *);
 void timing(double* wcTime, double* cpuTime);
 extern int var_false;
 
-
 {{kernel_code}}
 
+void *aligned_malloc_with_offset( unsigned long size, unsigned long alignment, unsigned long offset )
+{
+   // With 0 alignment this function makes no sense
+   // use normal malloc instead
+   //WALBERLA_ASSERT_GREATER( alignment, 0 );
+   // Tests if alignment is power of two (assuming alignment>0)
+   //WALBERLA_ASSERT( !(alignment & (alignment - 1)) );
+
+   //WALBERLA_ASSERT_LESS( offset, alignment );
+
+   if( offset == 0 )
+      return aligned_malloc( size, alignment );
+
+
+   void *pa;  // pointer to allocated memory
+   void *ptr; // pointer to usable aligned memory
+
+   pa=malloc( (size+2*alignment-1 )+sizeof(void *));
+   if(!pa)
+      return 0;
+
+   // Find next aligned position, starting at pa+sizeof(void*)-1
+   ptr=(void*)( ((size_t)pa+sizeof(void *)+alignment-1) & ~(alignment-1));
+   ptr=(void*) ( (char*)(ptr) + alignment - offset);
+
+   // Store pointer to real allocated chunk just before usable chunk
+   *((void **)ptr-1)=pa;
+
+   //WALBERLA_ASSERT_EQUAL( ((size_t)ptr+offset) % alignment, 0 );
+
+   return ptr;
+}
+
+
 
 int main(int argc, char **argv)
 {
@@ -41,7 +94,12 @@ int main(int argc, char **argv)
   {%- for field_name, dataType, size in fields %}
 
   // Initialization {{field_name}}
+  {%- if aligned %}
+  double * {{field_name}} = (double *) aligned_malloc_with_offset(sizeof({{dataType}}) * {{size}}, 32, 8);
+  {%- else %}
   double * {{field_name}} = (double *) aligned_malloc(sizeof({{dataType}}) * {{size}}, 64);
+  {%- endif %}
+
   for (unsigned long long i = 0; i < {{size}}; ++i)
     {{field_name}}[i] = 0.23;
 
@@ -69,6 +127,10 @@ int main(int argc, char **argv)
   #pragma omp barrier
   {%- elif likwid %}
   likwid_markerRegisterRegion("loop");
+  {%- elif openmp %}
+  #pragma omp parallel
+  {
+  #pragma omp barrier
   {%- endif %}
 
   for(int warmup = 1; warmup >= 0; --warmup) {
@@ -107,10 +169,10 @@ int main(int argc, char **argv)
 
   {%- if likwid %}
   likwid_markerStopRegion("loop");
+  {%- endif %}
   {%- if openmp %}
   }
   {%- endif %}
-  {%- endif %}
 
   {%- if likwid %}
   likwid_markerClose();
@@ -119,13 +181,12 @@ int main(int argc, char **argv)
 """)
 
 
-def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
+def generate_benchmark(ast, openmp=False, timing=False, main=True, kernel=True, assume_aligned=False):
     """Return C code of a benchmark program for the given kernel.
 
     Args:
         ast: the pystencils AST object as returned by create_kernel
-        likwid: if True likwid markers are added to the code
-        openmp: relevant only if likwid=True, to generated correct likwid initialization code
+        openmp: if True, the generated main wraps the benchmark in an OpenMP parallel region
         timing: add timing output to the code, prints time per iteration to stdout
 
     Returns:
@@ -143,7 +204,13 @@ def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
             assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size"
             field = accessed_fields[p.field_name]
             dtype = str(get_base_type(p.symbol.dtype))
-            fields.append((p.field_name, dtype, prod(field.shape)))
+            if not assume_aligned:
+                fields.append((p.field_name, dtype, prod(field.shape)))
+            else:
+                index = np.where(field.strides == np.amax(field.strides))[0][0]
+                size = field.shape[index] * field.strides[index]
+                fields.append((p.field_name, dtype, int(size)))
+            
             call_parameters.append(p.field_name)
 
     header_list = get_headers(ast)
@@ -151,14 +218,20 @@ def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
 
     # Strip "#pragma omp parallel" from within kernel, because main function takes care of that
     # when likwid and openmp are enabled
-    if likwid and openmp:
-        if len(ast.body.args) > 0 and isinstance(ast.body.args[0], PragmaBlock):
-            ast.body.args[0].pragma_line = ''
-
+    #if openmp:
+    #    if len(ast.body.args) > 0 and isinstance(ast.body.args[0], PragmaBlock):
+    #        ast.body.args[0].pragma_line = ''
+    if not main and kernel:
+        return include_template.render(
+                {
+                    'includes': includes,
+                    'likwid': False
+                }) + generate_c(ast, dialect='c')
     args = {
-        'likwid': likwid,
+        'aligned': assume_aligned,
+        'likwid': False,
         'openmp': openmp,
-        'kernel_code': generate_c(ast, dialect='c'),
+        'kernel_code': generate_c(ast, dialect='c') if kernel else '',
         'kernelName': ast.function_name,
         'fields': fields,
         'constants': constants,
@@ -169,7 +242,7 @@ def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
     return benchmark_template.render(**args)
 
 
-def run_c_benchmark(ast, inner_iterations, outer_iterations=3):
+def run_c_benchmark(ast, inner_iterations, outer_iterations=3, assume_aligned=False, openmp=False):
     """Runs the given kernel with outer loop in C
 
     Args:
@@ -182,7 +255,7 @@ def run_c_benchmark(ast, inner_iterations, outer_iterations=3):
     """
     import kerncraft
 
-    benchmark_code = generate_benchmark(ast, timing=True)
+    benchmark_code = generate_benchmark(ast, timing=True, assume_aligned=assume_aligned, openmp=openmp)
     with open('bench.c', 'w') as f:
         f.write(benchmark_code)
 
@@ -190,17 +263,28 @@ def run_c_benchmark(ast, inner_iterations, outer_iterations=3):
 
     extra_flags = ['-I' + get_pystencils_include_path(),
                    '-I' + os.path.join(kerncraft_path, 'headers')]
+    if False:
+        extra_flags.append(os.environ['LIKWID_INC'])
+        extra_flags.append(os.environ['LIKWID_LIB'])
 
     compiler_config = get_compiler_config()
     compile_cmd = [compiler_config['command']] + compiler_config['flags'].split()
+    compile_asm_cmd = [compiler_config['command']] + compiler_config['flags'].split()
     compile_cmd += [*extra_flags,
                     os.path.join(kerncraft_path, 'headers', 'timing.c'),
                     os.path.join(kerncraft_path, 'headers', 'dummy.c'),
                     'bench.c',
                     '-o', 'bench',
                     ]
-    run_compile_step(compile_cmd)
+    print(" ".join(compile_cmd))
+    compile_asm_cmd += [*extra_flags,
+                    os.path.join(kerncraft_path, 'headers', 'timing.c'),
+                    os.path.join(kerncraft_path, 'headers', 'dummy.c'),
+                    'bench.c', '-S',
+                    ]
 
+    run_compile_step(compile_cmd)
+    run_compile_step(compile_asm_cmd)
     results = []
     for _ in range(outer_iterations):
         benchmark_time = float(subprocess.check_output(['./bench', str(inner_iterations)]))
diff --git a/pystencils/kerncraft_coupling/kerncraft_interface.py b/pystencils/kerncraft_coupling/kerncraft_interface.py
index b58ccb96a28af75994837183b400807fbde9acb1..9184f266c0291e3678b9680029b906e56cd9af97 100644
--- a/pystencils/kerncraft_coupling/kerncraft_interface.py
+++ b/pystencils/kerncraft_coupling/kerncraft_interface.py
@@ -21,10 +21,22 @@ class PyStencilsKerncraftKernel(KernelCode):
     Implementation of kerncraft's kernel interface for pystencils CPU kernels.
     Analyses a list of equations assuming they will be executed on a CPU
     """
-    LIKWID_BASE = '/usr/local/likwid'
+    LIKWID_BASE = '/mnt/opt/likwid-4.3.4'
+
+    @staticmethod
+    def get_number_of_kernels(ast: KernelFunction):
+        loops = filtered_tree_iteration(ast, LoopOverCoordinate, stop_type=SympyAssignment)
+        return sum(1 for loop in loops if loop.is_innermost_loop)  # counts the innermost loops
+
+    @staticmethod
+    def get_steps_for_loops(ast: KernelFunction):
+        """Return ``step * 8`` for every innermost loop of ``ast``, in traversal order."""
+        # NOTE(review): the factor 8 is presumably sizeof(double) in bytes - confirm
+        loops = filtered_tree_iteration(ast, LoopOverCoordinate, stop_type=SympyAssignment)
+        return [loop.step * 8 for loop in loops if loop.is_innermost_loop]
 
     def __init__(self, ast: KernelFunction, machine: Optional[MachineModel] = None,
-                 assumed_layout='SoA', debug_print=False, filename=None):
+                 assumed_layout='SoA', debug_print=False, filename=None, loop_idx=False):
         """Create a kerncraft kernel using a pystencils AST
 
         Args:
@@ -50,11 +62,12 @@ class PyStencilsKerncraftKernel(KernelCode):
         if len(inner_loops) == 0:
             raise ValueError("No loop found in pystencils AST")
         else:
-            if len(inner_loops) > 1:
-                warnings.warn("pystencils AST contains multiple inner loops. "
-                              "Only one can be analyzed - choosing first one")
-            inner_loop = inner_loops[0]
-
+            if len(inner_loops) > 1 and loop_idx is False:
+                warnings.warn(("pystencils AST contains multiple inner loops ({}). "
+                              "Only one can be analyzed - choosing first one").format(
+                                len(inner_loops)))
+                loop_idx = 0
+            inner_loop = inner_loops[loop_idx]
         self._loop_stack = []
         cur_node = inner_loop
         while cur_node is not None:
@@ -95,7 +108,7 @@ class PyStencilsKerncraftKernel(KernelCode):
         for field in fields_accessed:
             layout = get_layout_tuple(field)
             permuted_shape = list(field.shape[i] for i in layout)
-            self.set_variable(field.name, str(field.dtype), tuple(permuted_shape))
+            self.set_variable(field.name, (str(field.dtype),), tuple(permuted_shape))
 
         # Scalars may be safely ignored
         # for param in ast.get_parameters():
@@ -128,6 +141,39 @@ class PyStencilsKerncraftKernel(KernelCode):
             print("-----------------------------  FLOPS -------------------------------")
             pprint(self._flops)
 
+    def get_main_code(self, as_filename=False, kernel_function_name='kernel'):
+        """Generate the benchmark program around the kernel (the kernel code itself is left out).
+
+        If ``as_filename`` is True the code is written to the intermediate file 'main.c'
+        (unless already available) and that file's name is returned instead of the code.
+        """
+        source = generate_benchmark(self.kernel_ast, openmp=True, kernel=False)
+        if not as_filename:
+            return source
+        handle, cached = self._get_intermediate_file('main.c', machine_and_compiler_dependent=False)
+        if not cached:
+            handle.write(source)
+        return handle.name
+
+
+    def get_kernel_code(self, openmp=False, as_filename=False, name='kernel'):
+        """Generate the C source of the kernel function only (include block + kernel, no main).
+
+        Args:
+            openmp: forwarded to generate_benchmark; also selects the '-omp' file name suffix
+            as_filename: if True, write the code to an intermediate file and return its name instead
+            name: not used by this implementation
+        """
+        source = generate_benchmark(self.kernel_ast, openmp=openmp, main=False, kernel=True)
+        if not as_filename:
+            return source
+        suffix = '-omp' if openmp else ''
+        handle, cached = self._get_intermediate_file('kernel_{}.c'.format(suffix),
+                                                     machine_and_compiler_dependent=False)
+        if not cached:
+            handle.write(source)
+        return handle.name
+
     def as_code(self, type_='iaca', openmp=False, as_filename=False):
         """
         Generate and return compilable source code.
@@ -137,6 +183,11 @@ class PyStencilsKerncraftKernel(KernelCode):
             openmp: if true, openmp code will be generated
             as_filename:
         """
+        warnings.warn('as_code is deprecated and works only for kerncraft <= 0.8.0. Please upgrade your '
+                      'kerncraft version and use self.get_kernel_code', FutureWarning)
-        code = generate_benchmark(self.kernel_ast, likwid=type_ == 'likwid', openmp=openmp)
+        # generate_benchmark() no longer has a `likwid` parameter, so forwarding
+        # `likwid=...` raised a TypeError; the templates are rendered with likwid
+        # disabled regardless of `type_`.
+        code = generate_benchmark(self.kernel_ast, openmp=openmp)
         if as_filename:
             fp, already_available = self._get_intermediate_file('kernel_{}.c'.format(type_),
@@ -149,14 +200,14 @@ class PyStencilsKerncraftKernel(KernelCode):
 
 
 class KerncraftParameters(DotDict):
-    def __init__(self, **kwargs):
+    def __init__(self, ptr_inc='auto', **kwargs):
         super(KerncraftParameters, self).__init__(**kwargs)
         self['asm_block'] = 'auto'
         self['asm_increment'] = 0
         self['cores'] = 1
         self['cache_predictor'] = 'SIM'
-        self['verbose'] = 0
-        self['pointer_increment'] = 'auto'
+        self['verbose'] = 1
+        self['pointer_increment'] = ptr_inc
         self['iterations'] = 10
         self['unit'] = 'cy/CL'
         self['ignore_warnings'] = True