diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5de4c9fc21eb83fc1ce9074c9cdfac8ef1af8b4b..4b968da0444d98af8df9f0a1804a5162e3b52cc0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -250,10 +250,12 @@ endif()
 # LIKWID ========================================================================
 #================================================================================
 if(LIKWID_DIR)
+    # Example: -DLIKWID_DIR=/apps/likwid/5.4.0
+
     target_compile_options(${PAIRS_TARGET} PRIVATE -DLIKWID_PERFMON -pthread)
 
-    target_link_libraries(${PAIRS_TARGET} PRIVATE ${LIKWID_DIR}/lib/liblikwid.a)
-    list(APPEND PAIRS_LINK_LIBRARIES ${LIKWID_DIR}/lib/liblikwid.a)
+    target_link_libraries(${PAIRS_TARGET} PRIVATE ${LIKWID_DIR}/lib/liblikwid.so)
+    list(APPEND PAIRS_LINK_LIBRARIES ${LIKWID_DIR}/lib/liblikwid.so)
 
     include_directories(${LIKWID_DIR}/include)
     list(APPEND PAIRS_INCLUDE_DIRS "${LIKWID_DIR}/include")
diff --git a/examples/modular/spring_dashpot.py b/examples/modular/spring_dashpot.py
index d88c8a78cfbfac89b800c3a42715105271f600ae..9b68d548024373dbd73b55d9a766cba7f0cbed1d 100644
--- a/examples/modular/spring_dashpot.py
+++ b/examples/modular/spring_dashpot.py
@@ -98,7 +98,7 @@ psim.build_cell_lists()
 # The order of user-defined functions is not important here since 
 # they are not used by other subroutines and are only callable individually 
 psim.compute(update_mass_and_inertia, symbols={'infinity': math.inf })
-psim.compute(spring_dashpot)
+psim.compute(spring_dashpot, profile=True)
 psim.compute(euler, parameters={'dt': pairs.real()})
 
 gravity_SI = 9.81
diff --git a/runtime/pairs.cpp b/runtime/pairs.cpp
index 351aa36b364ee200b29f06cb9d6cf9827f2dd381..a2382820e441daae0163511d12038919aae329f4 100644
--- a/runtime/pairs.cpp
+++ b/runtime/pairs.cpp
@@ -190,9 +190,12 @@ void PairsRuntime::copyArraySliceToDevice(
                 // PAIRS_DEBUG(
                 //     "Copying array %s to device (offset=%lu, n=%lu)\n",
                 //     array.getName().c_str(), offset, size);
+                this->getTimers()->start(TimerMarkers::DeviceTransfers);
 
                 pairs::copy_slice_to_device(
                     array.getHostPointer(), array.getDevicePointer(), offset, size);
+                
+                this->getTimers()->stop(TimerMarkers::DeviceTransfers);
             }
         }
     }
@@ -209,6 +212,7 @@ void PairsRuntime::copyArrayToDevice(Array &array, action_t action, size_t size)
 
     if(action == Ignore || action == WriteAfterRead || action == ReadOnly) {
         if(action == Ignore || !array_flags->isDeviceFlagSet(array_id)) {
+            this->getTimers()->start(TimerMarkers::DeviceTransfers);
             if(array.isStatic()) {
                 // PAIRS_DEBUG(
                 //     "Copying static array %s to device (n=%lu)\n",
@@ -223,6 +227,7 @@ void PairsRuntime::copyArrayToDevice(Array &array, action_t action, size_t size)
 
                 pairs::copy_to_device(array.getHostPointer(), array.getDevicePointer(), size);
             }
+            this->getTimers()->stop(TimerMarkers::DeviceTransfers);
         }
     }
 
@@ -242,9 +247,12 @@ void PairsRuntime::copyArraySliceToHost(Array &array, action_t action, size_t of
                 // PAIRS_DEBUG(
                 //     "Copying array %s to host (offset=%lu, n=%lu)\n",
                 //     array.getName().c_str(), offset, size);
+                this->getTimers()->start(TimerMarkers::DeviceTransfers);
 
                 pairs::copy_slice_to_host(
                     array.getDevicePointer(), array.getHostPointer(), offset, size);
+                
+                this->getTimers()->stop(TimerMarkers::DeviceTransfers);
             }
         }
     }
@@ -261,6 +269,8 @@ void PairsRuntime::copyArrayToHost(Array &array, action_t action, size_t size) {
 
     if(action == Ignore || action == WriteAfterRead || action == ReadOnly) {
         if(action == Ignore || !array_flags->isHostFlagSet(array_id)) {
+            this->getTimers()->start(TimerMarkers::DeviceTransfers);
+            
             if(array.isStatic()) {
                 // PAIRS_DEBUG(
                 //     "Copying static array %s to host (n=%lu)\n", array.getName().c_str(), size);
@@ -271,6 +281,7 @@ void PairsRuntime::copyArrayToHost(Array &array, action_t action, size_t size) {
                 // PAIRS_DEBUG("Copying array %s to host (n=%lu)\n", array.getName().c_str(), size);
                 pairs::copy_to_host(array.getDevicePointer(), array.getHostPointer(), size);
             }
+            this->getTimers()->stop(TimerMarkers::DeviceTransfers);
         }
     }
 
@@ -287,7 +298,9 @@ void PairsRuntime::copyPropertyToDevice(Property &prop, action_t action, size_t
     if(action == Ignore || action == WriteAfterRead || action == ReadOnly) {
         if(action == Ignore || !prop_flags->isDeviceFlagSet(prop_id)) {
             // PAIRS_DEBUG("Copying property %s to device (n=%lu)\n", prop.getName().c_str(), size);
+            this->getTimers()->start(TimerMarkers::DeviceTransfers);
             pairs::copy_to_device(prop.getHostPointer(), prop.getDevicePointer(), size);
+            this->getTimers()->stop(TimerMarkers::DeviceTransfers);
         }
     }
 
@@ -304,7 +317,9 @@ void PairsRuntime::copyPropertyToHost(Property &prop, action_t action, size_t si
     if(action == Ignore || action == WriteAfterRead || action == ReadOnly) {
         if(action == Ignore || !prop_flags->isHostFlagSet(prop_id)) {
             // PAIRS_DEBUG("Copying property %s to host (n=%lu)\n", prop.getName().c_str(), size);
+            this->getTimers()->start(TimerMarkers::DeviceTransfers);
             pairs::copy_to_host(prop.getDevicePointer(), prop.getHostPointer(), size);
+            this->getTimers()->stop(TimerMarkers::DeviceTransfers);
         }
     }
 
@@ -324,9 +339,12 @@ void PairsRuntime::copyContactPropertyToDevice(
         if(action == Ignore || !contact_prop_flags->isDeviceFlagSet(prop_id)) {
             // PAIRS_DEBUG("Copying contact property %s to device (n=%lu)\n",
             //     contact_prop.getName().c_str(), size);
+            this->getTimers()->start(TimerMarkers::DeviceTransfers);
 
             pairs::copy_to_device(
                 contact_prop.getHostPointer(), contact_prop.getDevicePointer(), size);
+            
+            this->getTimers()->stop(TimerMarkers::DeviceTransfers);
 
             contact_prop_flags->setDeviceFlag(prop_id);
         }
@@ -346,9 +364,10 @@ void PairsRuntime::copyContactPropertyToHost(
         if(!contact_prop_flags->isHostFlagSet(contact_prop.getId())) {
             // PAIRS_DEBUG("Copying contact property %s to host (n=%lu)\n",
             //     contact_prop.getName().c_str(), size);
-
+            this->getTimers()->start(TimerMarkers::DeviceTransfers);
             pairs::copy_to_host(
                 contact_prop.getDevicePointer(), contact_prop.getHostPointer(), size);
+            this->getTimers()->stop(TimerMarkers::DeviceTransfers);
 
             contact_prop_flags->setHostFlag(prop_id);
         }
@@ -364,24 +383,25 @@ void PairsRuntime::copyFeaturePropertyToDevice(FeatureProperty &feature_prop) {
 
     // PAIRS_DEBUG("Copying feature property %s to device (n=%lu)\n",
     //     feature_prop.getName().c_str(), n);
+    this->getTimers()->start(TimerMarkers::DeviceTransfers);
 
     pairs::copy_static_symbol_to_device(
         feature_prop.getHostPointer(), feature_prop.getDevicePointer(), n);
+    
+    this->getTimers()->stop(TimerMarkers::DeviceTransfers);
 }
 
 void PairsRuntime::communicateSizes(int dim, const int *send_sizes, int *recv_sizes) {
     auto nsend_id = getArrayByHostPointer(send_sizes).getId();
     auto nrecv_id = getArrayByHostPointer(recv_sizes).getId();
 
-    this->getTimers()->start(DeviceTransfers);
     copyArrayToHost(nsend_id, ReadOnly);
     array_flags->setHostFlag(nrecv_id);
     array_flags->clearDeviceFlag(nrecv_id);
-    this->getTimers()->stop(DeviceTransfers);
 
-    this->getTimers()->start(Communication);
+    this->getTimers()->start(TimerMarkers::MPI);
     this->getDomainPartitioner()->communicateSizes(dim, send_sizes, recv_sizes);
-    this->getTimers()->stop(Communication);
+    this->getTimers()->stop(TimerMarkers::MPI);
 }
 
 void PairsRuntime::communicateData(
@@ -400,7 +420,6 @@ void PairsRuntime::communicateData(
     auto nsend_id = getArrayByHostPointer(nsend).getId();
     auto nrecv_id = getArrayByHostPointer(nrecv).getId();
 
-    this->getTimers()->start(DeviceTransfers);
     copyArrayToHost(send_offsets_id, ReadOnly);
     copyArrayToHost(recv_offsets_id, ReadOnly);
     copyArrayToHost(nsend_id, ReadOnly);
@@ -433,17 +452,13 @@ void PairsRuntime::communicateData(
     array_flags->clearDeviceFlag(recv_buf_id);
     #endif
 
-    this->getTimers()->stop(DeviceTransfers);
-
-    this->getTimers()->start(Communication);
+    this->getTimers()->start(TimerMarkers::MPI);
     this->getDomainPartitioner()->communicateData(
         dim, elem_size, send_buf_ptr, send_offsets, nsend, recv_buf_ptr, recv_offsets, nrecv);
-    this->getTimers()->stop(Communication);
+    this->getTimers()->stop(TimerMarkers::MPI);
 
     #ifndef ENABLE_CUDA_AWARE_MPI
-    this->getTimers()->start(DeviceTransfers);
     copyArrayToDevice(recv_buf_id, Ignore, nrecv_all * elem_size * sizeof(real_t));
-    this->getTimers()->stop(DeviceTransfers);
     #endif
 }
 
@@ -463,7 +478,6 @@ void PairsRuntime::communicateDataReverse(
     auto nsend_id = getArrayByHostPointer(nsend).getId();
     auto nrecv_id = getArrayByHostPointer(nrecv).getId();
 
-    this->getTimers()->start(DeviceTransfers);
     copyArrayToHost(send_offsets_id, ReadOnly);
     copyArrayToHost(recv_offsets_id, ReadOnly);
     copyArrayToHost(nsend_id, ReadOnly);
@@ -496,17 +510,13 @@ void PairsRuntime::communicateDataReverse(
     array_flags->clearDeviceFlag(recv_buf_id);
     #endif
 
-    this->getTimers()->stop(DeviceTransfers);
-
-    this->getTimers()->start(Communication);
+    this->getTimers()->start(TimerMarkers::MPI);
     this->getDomainPartitioner()->communicateDataReverse(
         dim, elem_size, send_buf_ptr, send_offsets, nsend, recv_buf_ptr, recv_offsets, nrecv);
-    this->getTimers()->stop(Communication);
+    this->getTimers()->stop(TimerMarkers::MPI);
 
     #ifndef ENABLE_CUDA_AWARE_MPI
-    this->getTimers()->start(DeviceTransfers);
     copyArrayToDevice(recv_buf_id, Ignore, nrecv_all * elem_size * sizeof(real_t));
-    this->getTimers()->stop(DeviceTransfers);
     #endif
 }
 
@@ -526,7 +536,6 @@ void PairsRuntime::communicateAllData(
     auto nsend_id = getArrayByHostPointer(nsend).getId();
     auto nrecv_id = getArrayByHostPointer(nrecv).getId();
 
-    this->getTimers()->start(DeviceTransfers);
     copyArrayToHost(send_offsets_id, ReadOnly);
     copyArrayToHost(recv_offsets_id, ReadOnly);
     copyArrayToHost(nsend_id, ReadOnly);
@@ -559,17 +568,13 @@ void PairsRuntime::communicateAllData(
     array_flags->clearDeviceFlag(recv_buf_id);
     #endif
 
-    this->getTimers()->stop(DeviceTransfers);
-
-    this->getTimers()->start(Communication);
+    this->getTimers()->start(TimerMarkers::MPI);
     this->getDomainPartitioner()->communicateAllData(
         ndims, elem_size, send_buf_ptr, send_offsets, nsend, recv_buf_ptr, recv_offsets, nrecv);
-    this->getTimers()->stop(Communication);
+    this->getTimers()->stop(TimerMarkers::MPI);
 
     #ifndef ENABLE_CUDA_AWARE_MPI
-    this->getTimers()->start(DeviceTransfers);
     copyArrayToDevice(recv_buf_id, Ignore, nrecv_all * elem_size * sizeof(real_t));
-    this->getTimers()->stop(DeviceTransfers);
     #endif
 }
 
@@ -589,7 +594,6 @@ void PairsRuntime::communicateContactHistoryData(
     auto nsend_contact_id = getArrayByHostPointer(nsend_contact).getId();
     auto nrecv_contact_id = getArrayByHostPointer(nrecv_contact).getId();
 
-    this->getTimers()->start(DeviceTransfers);
     copyArrayToHost(contact_soffsets_id, ReadOnly);
     copyArrayToHost(nsend_contact_id, ReadOnly);
 
@@ -610,9 +614,7 @@ void PairsRuntime::communicateContactHistoryData(
     array_flags->clearDeviceFlag(recv_buf_id);
     #endif
 
-    this->getTimers()->stop(DeviceTransfers);
-
-    this->getTimers()->start(Communication);
+    this->getTimers()->start(TimerMarkers::MPI);
     this->getDomainPartitioner()->communicateSizes(dim, nsend_contact, nrecv_contact);
 
     contact_roffsets[dim * 2 + 0] = 0;
@@ -629,13 +631,11 @@ void PairsRuntime::communicateContactHistoryData(
         send_buf_ptr, contact_soffsets, nsend_contact,
         recv_buf_ptr, contact_roffsets, nrecv_contact);
 
-    this->getTimers()->stop(Communication);
+    this->getTimers()->stop(TimerMarkers::MPI);
 
     #ifndef ENABLE_CUDA_AWARE_MPI
-    this->getTimers()->start(DeviceTransfers);
     copyArrayToDevice(recv_buf_id, Ignore, nrecv_all * sizeof(real_t));
     copyArrayToDevice(contact_roffsets_id, Ignore);
-    this->getTimers()->stop(DeviceTransfers);
     #endif
 }
 
diff --git a/runtime/pairs.hpp b/runtime/pairs.hpp
index d582d277333ab10995d9973e3ab7791ba977bdba..d77be18ff5d907c3b6ebbc0a1155a6d18734d4c1 100644
--- a/runtime/pairs.hpp
+++ b/runtime/pairs.hpp
@@ -362,11 +362,16 @@ public:
 
     // Timers
     Timers<double> *getTimers() { return timers; }
+    
     void printTimers() {
         if(this->getDomainPartitioner()->getRank() == 0) {
             this->getTimers()->print();
         }
     }
+
+    void logTimers() {
+        this->getTimers()->writeToFile(this->getDomainPartitioner()->getRank());
+    }
 };
 
 template<typename Domain_T>
diff --git a/runtime/pairs_common.hpp b/runtime/pairs_common.hpp
index 03766e1459d3a9a3da8d12c38335a6a52101fb1b..af651893c9bdae1751cf7f962b5049b67578ae97 100644
--- a/runtime/pairs_common.hpp
+++ b/runtime/pairs_common.hpp
@@ -79,10 +79,9 @@ enum Actions {
 };
 
 enum TimerMarkers {
-    All = 0,
-    Communication = 1,
-    DeviceTransfers = 2,
-    Offset = 3
+    MPI = 0,
+    DeviceTransfers = 1,
+    Offset = 2
 };
 
 enum DomainPartitioners {
diff --git a/runtime/timers.hpp b/runtime/timers.hpp
index c4cdc943aa5faeed57b4971684277e0844d3e7da..8e399ddfbfa9a3dee0d1e15b1e0423613e7f10e7 100644
--- a/runtime/timers.hpp
+++ b/runtime/timers.hpp
@@ -2,6 +2,8 @@
 #include <chrono>
 #include <iostream>
 #include <unordered_map>
+#include <sstream>
+#include <iomanip>
 
 #pragma once
 
@@ -17,61 +19,98 @@ public:
 
     void add(size_t id, std::string name) {
         counter_names.resize(id + 1);
-        counters.resize(id + 1);
+        time_counters.resize(id + 1);
+        call_counters.resize(id + 1);
         clocks.resize(id + 1);
         counter_names[id] = name;
     }
 
-    void start(size_t id) { clocks[id] = std::chrono::high_resolution_clock::now(); }
+    void start(size_t id) { clocks[id] = std::chrono::high_resolution_clock::now(); ++call_counters[id];}
 
     void stop(size_t id) {
         auto current_clock = std::chrono::high_resolution_clock::now();
-        counters[id] += static_cast<TimeType>(
+        time_counters[id] += static_cast<TimeType>(
             std::chrono::duration_cast<TimeUnit>(current_clock - clocks[id]).count()) * time_factor;
     }
 
-    void print() {
-        std::unordered_map<std::string, TimeType> categorySums;
-
-        std::cout << "all: " << counters[0] << std::endl;
-        for (size_t i = 1; i < counters.size(); ++i) {
+    void writeToFile(int rank){
+        MPI_File file;
+        MPI_File_open(MPI_COMM_WORLD, "timers.txt", MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL, &file);
+    
+        std::ostringstream ss;
+        ss << "Rank: " << rank << "\n";
+        ss << std::left << std::setw(80) << "Timer"
+           << std::left << std::setw(15) << "Total [ms]"
+           << std::left << std::setw(15) << "Count" << "\n";
+        ss << "--------------------------------------------------------------------------------------------------------\n";
+        
+        // Modules
+        for (size_t i = TimerMarkers::Offset; i < time_counters.size(); ++i) {
             const std::string& counterName = counter_names[i];
-            TimeType counterValue = counters[i];
-
-            if(counterName.find("pack_") == 0 ||
-               counterName.find("unpack_") == 0 ||
-               counterName.find("determine_") == 0 ||
-               counterName.find("set_communication_") == 0 ||
-               counterName.find("remove_exchanged_particles") == 0 ||
-               counterName.find("change_size_after_exchange") == 0) {
-
-                categorySums["communication"] += counterValue;
-
-            } else if(counterName.find("build_cell_lists") == 0 ||
-                      counterName.find("build_cell_lists_stencil") == 0 ||
-                      counterName.find("partition_cell_lists") == 0 ||
-                      counterName.find("build_neighbor_lists") == 0) {
-
-                categorySums["neighbors"] += counterValue;
-
-            } else {
-                if(counterName.length() > 0) {
-                    categorySums[counterName] += counterValue;
-                } else {
-                    categorySums["other"] += counterValue;
-                }
+            if(counterName.length() > 0) {
+                ss << std::left << std::setw(80) << counter_names[i]
+                    << std::left << std::setw(15) << std::fixed << std::setprecision(2) << time_counters[i]
+                    << std::left << std::setw(15) << call_counters[i]
+                    << "\n";
             }
         }
 
-        // Print the accumulated sums for each category
-        for(const auto& category: categorySums) {
-            std::cout << category.first << ": " << category.second << std::endl;
+        // Markers
+        for (size_t i = 0; i < TimerMarkers::Offset; ++i) {
+            ss << std::left << std::setw(80) << counter_names[i]
+                << std::left << std::setw(15) << std::fixed << std::setprecision(2) << time_counters[i]
+                << std::left << std::setw(15) << 1
+                << "\n";
+        }
+
+        computeCategories();
+
+        // Categories
+        for (const auto& cs : categorySums) {;
+            ss << std::left << std::setw(80) << cs.first
+                << std::left << std::setw(15) << std::fixed << std::setprecision(2) << cs.second
+                << std::left << std::setw(15) << 1
+                << "\n";
+        }
+        ss << "\n\n";
+
+        std::string output = ss.str();
+        MPI_File_write_ordered(file, output.c_str(), output.size(), MPI_CHAR, MPI_STATUS_IGNORE);
+        MPI_File_close(&file);
+    }
+
+    void print(){
+    }
+
+    void computeCategories() {
+        for (size_t i = 0; i < time_counters.size(); ++i) {
+            const std::string& counterName = counter_names[i];
+            TimeType counterValue = time_counters[i];
+
+            if(counterName.find("INTERNAL_MODULES::pack_") == 0 ||
+               counterName.find("INTERNAL_MODULES::unpack_") == 0 ||
+               counterName.find("INTERNAL_MODULES::determine_") == 0 ||
+               counterName.find("INTERNAL_MODULES::set_communication_") == 0 ||
+               counterName.find("INTERNAL_MODULES::remove_exchanged_particles") == 0 ||
+               counterName.find("INTERNAL_MODULES::change_size_after_exchange") == 0) {
+
+                categorySums["INTERNAL_CATEGORIES::COMMUNICATION"] += counterValue;
+
+            } else if(counterName.find("INTERNAL_MODULES::build_cell_lists") == 0 ||
+                      counterName.find("INTERNAL_MODULES::build_cell_lists_stencil") == 0 ||
+                      counterName.find("INTERNAL_MODULES::partition_cell_lists") == 0 ||
+                      counterName.find("INTERNAL_MODULES::build_neighbor_lists") == 0) {
+
+                categorySums["INTERNAL_CATEGORIES::NEIGHBORS"] += counterValue;
+            }
         }
     }
 
 private:
     std::vector<std::string> counter_names;
-    std::vector<TimeType> counters;
+    std::vector<TimeType> time_counters;
+    std::vector<int> call_counters;
+    std::unordered_map<std::string, TimeType> categorySums;
     std::vector<std::chrono::high_resolution_clock::time_point> clocks;
     TimeType time_factor;
 };
diff --git a/runtime/timing.cpp b/runtime/timing.cpp
index 0068d8117b89d2529e3f8dc10b24e1d2483672e8..6addb69e12f532cab74aea533f04d41bc6fffade 100644
--- a/runtime/timing.cpp
+++ b/runtime/timing.cpp
@@ -20,4 +20,8 @@ void print_timers(PairsRuntime *ps) {
     ps->printTimers();
 }
 
+void log_timers(PairsRuntime *ps) {
+    ps->logTimers();
+}
+
 }
diff --git a/runtime/timing.hpp b/runtime/timing.hpp
index f7544603549232cce89c00e7071948da32511693..80662e9d886965cd0669f0716a5fbb5e87e61264 100644
--- a/runtime/timing.hpp
+++ b/runtime/timing.hpp
@@ -10,5 +10,6 @@ void register_timer(PairsRuntime *ps, int id, std::string name);
 void start_timer(PairsRuntime *ps, int id);
 void stop_timer(PairsRuntime *ps, int id);
 void print_timers(PairsRuntime *ps);
+void log_timers(PairsRuntime *ps);
 
 }
diff --git a/src/pairs/code_gen/cgen.py b/src/pairs/code_gen/cgen.py
index c8078a873ec21f05423591b30c623508bc479a7a..3258452a6987c9cf91d54e8f35e4ba5286e2a224 100644
--- a/src/pairs/code_gen/cgen.py
+++ b/src/pairs/code_gen/cgen.py
@@ -198,9 +198,8 @@ class CGen:
         self.print("namespace pairs::internal {")
         self.print.add_indent(4)
 
-        # All modules except the interface ones are declared in the pairs::internal scope
-        for module in self.sim.modules() + self.sim.udf_modules():
-            assert not module.interface
+        # All internal modules are declared in the pairs::internal scope
+        for module in self.sim.modules():
             self.generate_module_header(module, definition=False)
         
         self.print.add_indent(-4)
@@ -346,9 +345,8 @@ class CGen:
         for kernel in self.sim.kernels():
             self.generate_kernel(kernel)
 
-        # All modules except the interface ones are defined in the pairs::internal scope
-        for module in self.sim.modules() + self.sim.udf_modules():
-            assert not module.interface
+        # All internal modules are defined in the pairs::internal scope
+        for module in self.sim.modules():
             self.generate_module(module)
 
         self.print.add_indent(-4)
diff --git a/src/pairs/code_gen/interface.py b/src/pairs/code_gen/interface.py
index 78d5a851fc55c61187e7aea84181cbc5a3f9e0a3..228a7ae0800a5044e6bb85722608e780a4b51a56 100644
--- a/src/pairs/code_gen/interface.py
+++ b/src/pairs/code_gen/interface.py
@@ -73,6 +73,9 @@ class InterfaceModules:
             RegisterMarkers(self.sim)
         ])
 
+        if self.sim._enable_profiler:
+            PrintCode(self.sim, "LIKWID_MARKER_INIT;")
+
         self.sim.add_statement(inits)
 
     @pairs_interface_block
@@ -186,63 +189,15 @@ class InterfaceModules:
         self.sim.module_name('size')
         Return(self.sim, ScalarOp.inline(self.sim.nlocal + self.sim.nghost))
 
-    @pairs_interface_block
-    def create_sphere(self):
-        self.sim.module_name('create_sphere')
-        x = Parameter(self.sim, 'x', Types.Real)
-        y = Parameter(self.sim, 'y', Types.Real)
-        z = Parameter(self.sim, 'z', Types.Real)
-        vx = Parameter(self.sim, 'vx', Types.Real)
-        vy = Parameter(self.sim, 'vy', Types.Real)
-        vz = Parameter(self.sim, 'vz', Types.Real)
-        density = Parameter(self.sim, 'density', Types.Real)
-        radius = Parameter(self.sim, 'radius', Types.Real)
-        ptype = Parameter(self.sim, 'type', Types.Real)
-        flag = Parameter(self.sim, 'flag', Types.Real)
-
-        Return(self.sim, Call(self.sim, "pairs::create_sphere", 
-                              [x, y, z, vx, vy, vz, 
-                               density, radius, ptype, flag], Types.UInt64))
-
-    @pairs_interface_block
-    def create_halfspace(self):
-        self.sim.module_name('create_halfspace')
-        x = Parameter(self.sim, 'x', Types.Real)
-        y = Parameter(self.sim, 'y', Types.Real)
-        z = Parameter(self.sim, 'z', Types.Real)
-        nx = Parameter(self.sim, 'nx', Types.Real)
-        ny = Parameter(self.sim, 'ny', Types.Real)
-        nz = Parameter(self.sim, 'nz', Types.Real)
-        ptype = Parameter(self.sim, 'type', Types.Real)
-        flag = Parameter(self.sim, 'flag', Types.Real)
-
-        Return(self.sim, Call(self.sim, "pairs::create_halfspace", 
-                              [x, y, z, nx, ny, nz, ptype, flag], Types.UInt64))
-        
-    @pairs_interface_block
-    def dem_sc_grid(self):
-        self.sim.module_name('dem_sc_grid')
-        xmax = Parameter(self.sim, 'xmax', Types.Real)
-        ymax = Parameter(self.sim, 'ymax', Types.Real)
-        zmax = Parameter(self.sim, 'zmax', Types.Real)
-        spacing = Parameter(self.sim, 'spacing', Types.Real)
-        diameter = Parameter(self.sim, 'diameter', Types.Real)
-        min_diameter = Parameter(self.sim, 'min_diameter', Types.Real)
-        max_diameter = Parameter(self.sim, 'max_diameter', Types.Real)
-        initial_velocity = Parameter(self.sim, 'initial_velocity', Types.Real)
-        particle_density = Parameter(self.sim, 'particle_density', Types.Real)
-        ntypes = Parameter(self.sim, 'ntypes', Types.Int32)
-
-        Assign(self.sim, self.sim.nlocal,
-               Call_Int(self.sim, "pairs::dem_sc_grid",
-                        [xmax, ymax, zmax, spacing, diameter, min_diameter, max_diameter,
-                         initial_velocity, particle_density, ntypes]))
-        Return(self.sim, self.sim.nlocal)
-
     @pairs_interface_block
     def end(self):
         self.sim.module_name('end')
-        # Call_Void(self.sim, "pairs::print_timers", [])
+
+        if self.sim._enable_profiler:
+            PrintCode(self.sim, "LIKWID_MARKER_CLOSE;")
+            
+        Call_Void(self.sim, "pairs::print_timers", [])
+        Call_Void(self.sim, "pairs::log_timers", [])
         Call_Void(self.sim, "pairs::print_stats", [self.sim.nlocal, self.sim.nghost])
         PrintCode(self.sim, "delete pobj;")
         PrintCode(self.sim, "delete pairs_runtime;")
diff --git a/src/pairs/ir/block.py b/src/pairs/ir/block.py
index 2a14ea2776dcf3651a8f9d03b397bf9db1bd1fb6..5883d4157730380ff40dab9b9318f682bebc2cb8 100644
--- a/src/pairs/ir/block.py
+++ b/src/pairs/ir/block.py
@@ -12,9 +12,20 @@ def pairs_inline(func):
     return inner
 
 
+def pairs_block(func):
+    """This decorator needs the owning class of the method being decorated to have a 'run_on_device' member. 
+    If the variable doesn't exist it defaults to False here, hence pairs_host_block gets used."""
+    def wrapper(*args, **kwargs):
+        self = args[0]
+        decorator = pairs_device_block if getattr(self, "run_on_device", False) else pairs_host_block
+        return decorator(func)(*args, **kwargs)
+    return wrapper
+
+
 def pairs_host_block(func):
     def inner(*args, **kwargs):
         sim = args[0].sim # self.sim
+        profile = getattr(args[0], "profile", False)
         sim.init_block()
         func(*args, **kwargs)
         return Module(sim,
@@ -22,7 +33,8 @@ def pairs_host_block(func):
             block=Block(sim, sim._block),
             resizes_to_check=sim._resizes_to_check,
             check_properties_resize=sim._check_properties_resize,
-            run_on_device=False)
+            run_on_device=False,
+            profile=profile)
 
     return inner
 
@@ -30,6 +42,7 @@ def pairs_host_block(func):
 def pairs_device_block(func):
     def inner(*args, **kwargs):
         sim = args[0].sim # self.sim
+        profile = getattr(args[0], "profile", False)
         sim.init_block()
         func(*args, **kwargs)
         return Module(sim,
@@ -37,7 +50,8 @@ def pairs_device_block(func):
             block=Block(sim, sim._block),
             resizes_to_check=sim._resizes_to_check,
             check_properties_resize=sim._check_properties_resize,
-            run_on_device=True)
+            run_on_device=True,
+            profile=profile)
 
     return inner
 
diff --git a/src/pairs/ir/module.py b/src/pairs/ir/module.py
index 3f2df80f47f0d361b5f627b1478d9c87c742ff63..c7937ed9b478f2420a44633af040976884a18c58 100644
--- a/src/pairs/ir/module.py
+++ b/src/pairs/ir/module.py
@@ -16,8 +16,8 @@ class Module(ASTNode):
                  block=None, 
                  resizes_to_check={}, 
                  check_properties_resize=False, 
-                 run_on_device=False, 
-                 user_defined=False, 
+                 run_on_device=False,
+                 profile=False, 
                  interface=False):
         super().__init__(sim)
         self._id = Module.last_module
@@ -34,20 +34,17 @@ class Module(ASTNode):
         self._resizes_to_check = resizes_to_check
         self._check_properties_resize = check_properties_resize
         self._run_on_device = run_on_device
-        self._user_defined = user_defined
         self._interface = interface
         self._return_type = Types.Void
-        self._profile = False
+        self._profile = profile
+        
+        if profile:
+            self.sim.enable_profiler()
 
-        if user_defined:
-            assert not interface, ("User-defined modules can't be part of the interface directly."
-                                "Wrap them inside seperate interface modules.")
-            sim.add_udf_module(self)
+        if interface:
+            sim.add_interface_module(self)
         else:
-            if interface:
-                sim.add_interface_module(self)
-            else:
-                sim.add_module(self)
+            sim.add_module(self)
                 
         Module.last_module += 1
 
@@ -70,10 +67,6 @@ class Module(ASTNode):
     def run_on_device(self):
         return self._run_on_device
     
-    @property
-    def user_defined(self):
-        return self._user_defined
-
     @property
     def interface(self):
         return self._interface
diff --git a/src/pairs/ir/timers.py b/src/pairs/ir/timers.py
index b1cd21c0c80a675491bef0c10bfc98806b81e542..928a668cd80dcb0d180fe1fc8fa4386cd42ff5b6 100644
--- a/src/pairs/ir/timers.py
+++ b/src/pairs/ir/timers.py
@@ -1,12 +1,10 @@
 class Timers:
     Invalid = -1
-    All = 0
-    Communication = 1
-    DeviceTransfers = 2
-    Offset = 3
+    Communication = 0
+    DeviceTransfers = 1
+    Offset = 2
 
     def name(timer):
-        return "all"            if timer == Timers.All else             \
-               "mpi"            if timer == Timers.Communication else   \
-               "transfers"      if timer == Timers.DeviceTransfers else \
-               "invalid"
+        return "MARKERS::MPI"                if timer == Timers.Communication else   \
+               "MARKERS::DEVICE_TRANSFERS"   if timer == Timers.DeviceTransfers else \
+               "INVALID"
diff --git a/src/pairs/mapping/funcs.py b/src/pairs/mapping/funcs.py
index 2523ba4129e5bfaff41c06c174f5188e33b33b42..c84578735b0d3b20550d0827f61d72f2b44da783 100644
--- a/src/pairs/mapping/funcs.py
+++ b/src/pairs/mapping/funcs.py
@@ -15,7 +15,7 @@ from pairs.sim.flags import Flags
 from pairs.sim.interaction import ParticleInteraction
 from pairs.sim.global_interaction import  GlobalLocalInteraction, GlobalGlobalInteraction, GlobalReduction, SortGlobals, PackGlobals, ResetReductionProps, ReduceGlobals, UnpackGlobals
 from pairs.ir.module import Module
-from pairs.ir.block import Block, pairs_device_block
+from pairs.ir.block import Block, pairs_block
 from pairs.sim.lowerable import Lowerable
 
 class UndefinedSymbol():
@@ -288,10 +288,12 @@ class BuildParticleIR(ast.NodeVisitor):
         return op_class(self.sim, operand, None, BuildParticleIR.get_unary_op(node.op))
 
 class OneBodyKernel(Lowerable):
-    def __init__(self, sim, module_name):
+    def __init__(self, sim, module_name, run_on_device, profile):
         super().__init__(sim)
         self.block = Block(sim, [])
         self.module_name = module_name
+        self.run_on_device = run_on_device
+        self.profile = profile
 
     def add_statement(self, stmt):
         self.block.add_statement(stmt)
@@ -303,12 +305,12 @@ class OneBodyKernel(Lowerable):
             yield i
         self.sim.leave()
     
-    @pairs_device_block
+    @pairs_block
     def lower(self):
         self.sim.module_name(self.module_name)
         self.sim.add_statement(self.block)
 
-def compute(sim, func, cutoff_radius=None, symbols={}, parameters={}, compute_globals=False):
+def compute(sim, func, cutoff_radius=None, symbols={}, parameters={}, compute_globals=False, run_on_device=True, profile=False):
     if sim._generate_whole_program:
         assert not parameters, "Compute functions can't take custom parameters when generating whole program."
 
@@ -333,14 +335,14 @@ def compute(sim, func, cutoff_radius=None, symbols={}, parameters={}, compute_gl
     sim.module_name(func.__name__)
 
     if nparams == 1:
-        for i in OneBodyKernel(sim, func.__name__):
+        for i in OneBodyKernel(sim, func.__name__, run_on_device=run_on_device, profile=profile):
             ir = BuildParticleIR(sim, symbols, parameters)
             ir.add_symbols({params[0]: i})
             ir.visit(tree)
 
     else:
         # Compute local-local and local-global interactions
-        particle_interaction = ParticleInteraction(sim, func.__name__, nparams, cutoff_radius)
+        particle_interaction = ParticleInteraction(sim, func.__name__, nparams, cutoff_radius, run_on_device=run_on_device, profile=profile)
         for interaction_data in particle_interaction:
             # Start building IR
             ir = BuildParticleIR(sim, symbols, parameters)
@@ -409,5 +411,5 @@ def compute(sim, func, cutoff_radius=None, symbols={}, parameters={}, compute_gl
             
     # User defined functions are wrapped inside seperate interface modules here.
     # The udf's have the same name as their interface module but they get implemented in the pairs::internal scope.
-    sim.build_interface_module()  
+    sim.build_interface_module_with_statements(run_on_device, profile)  
     
diff --git a/src/pairs/sim/global_interaction.py b/src/pairs/sim/global_interaction.py
index 00f213ec7aebf9174ea8fa11748f422152830fa7..dca2a775ee301def37a7d334fd9232d878aac42b 100644
--- a/src/pairs/sim/global_interaction.py
+++ b/src/pairs/sim/global_interaction.py
@@ -104,19 +104,6 @@ class GlobalReduction:
                     for app in self.particle_interaction.apply_list[ishape*self.sim.max_shapes() + jshape]:
                         self.red_props.add(app.prop())
 
-        # self.sim.add_statement(self)
-
-    # @pairs_inline
-    # def lower(self):
-    #     SortGlobals(self)
-    #     PackGlobals(self, self.intermediate_buffer)
-    #     ResetReductionProps(self)
-    #     GlobalLocalInteraction(self)
-    #     PackGlobals(self, self.red_buffer)
-    #     ReduceGlobals(self)
-    #     UnpackGlobals(self)
-    #     GlobalGlobalInteraction(self)
-
     def global_particles(self):
         for p in For(self.sim, 0, self.sim.cell_lists.cell_sizes[0]):
             i = self.sim.cell_lists.cell_particles[0][p]
diff --git a/src/pairs/sim/instrumentation.py b/src/pairs/sim/instrumentation.py
index 7281f13fc9f848038c4df34248d65db7e8e13c8c..6e901986733b92d76aa56f40af8682cf6107685a 100644
--- a/src/pairs/sim/instrumentation.py
+++ b/src/pairs/sim/instrumentation.py
@@ -1,6 +1,7 @@
 from pairs.ir.block import pairs_inline
 from pairs.ir.functions import Call_Void
 from pairs.ir.timers import Timers
+from pairs.ir.types import Types
 from pairs.sim.lowerable import FinalLowerable
 
 class RegisterTimers(FinalLowerable):
@@ -12,9 +13,14 @@ class RegisterTimers(FinalLowerable):
         for t in range(Timers.Offset):
             Call_Void(self.sim, "pairs::register_timer", [t, Timers.name(t)])
 
-        for m in self.sim.module_list:
-            if m.name != 'main' and m.name != 'initialize':
-                Call_Void(self.sim, "pairs::register_timer", [m.module_id + Timers.Offset, m.name])
+        # Interface modules
+        for m in self.sim.interface_modules():
+            if m.name != 'initialize' and m.name != 'end' and m.return_type==Types.Void:
+                Call_Void(self.sim, "pairs::register_timer", [m.module_id + Timers.Offset, "INTERFACE_MODULES::" + m.name])
+        
+        # Internal modules
+        for m in self.sim.modules():
+            Call_Void(self.sim, "pairs::register_timer", [m.module_id + Timers.Offset, "INTERNAL_MODULES::" + m.name])
 
 
 class RegisterMarkers(FinalLowerable):
@@ -24,6 +30,7 @@ class RegisterMarkers(FinalLowerable):
     @pairs_inline
     def lower(self):
         if self.sim._enable_profiler:
-            for m in self.sim.module_list:
-                if m.name != 'main' and m.name != 'initialize' and m.must_profile():
+            # Only internal modules are profiled
+            for m in self.sim.modules():
+                if m.must_profile():
                     Call_Void(self.sim, "LIKWID_MARKER_REGISTER", [m.name])
diff --git a/src/pairs/sim/interaction.py b/src/pairs/sim/interaction.py
index f19289eb71e809dc531a88fd473cf8e0c1bfce0d..901c2d4b7f1930b9841b25e0f57e5e143fea77e0 100644
--- a/src/pairs/sim/interaction.py
+++ b/src/pairs/sim/interaction.py
@@ -1,7 +1,7 @@
 from pairs.ir.assign import Assign
 from pairs.ir.ast_term import ASTTerm
 from pairs.ir.scalars import ScalarOp
-from pairs.ir.block import Block, pairs_device_block
+from pairs.ir.block import Block, pairs_block
 from pairs.ir.branches import Filter, Branch
 from pairs.ir.loops import For, ParticleFor
 from pairs.ir.math import Sqrt, Abs, Min, Sign
@@ -307,8 +307,10 @@ class InteractionData:
         self.contact_normal().assign(contact_normal if s_relative else -contact_normal)
 
 class ParticleInteraction(Lowerable):
-    def __init__(self, sim, module_name,  nbody, cutoff_radius=None, use_cell_lists=False):
+    def __init__(self, sim, module_name,  nbody, cutoff_radius=None, use_cell_lists=False, run_on_device=True, profile=False):
         super().__init__(sim)
+        self.run_on_device = run_on_device
+        self.profile = profile
         self.module_name = module_name
         self.nbody = nbody
         self.contact_threshold = 0.0
@@ -316,6 +318,7 @@ class ParticleInteraction(Lowerable):
         self.maxs = self.sim.max_shapes()
         self.interactions_data = {}
         self.cutoff_radius = cutoff_radius
+        
         if any(self.sim.get_shape_id(s)==Shapes.PointMass for s in range(self.maxs)): 
             assert cutoff_radius is not None
 
@@ -415,7 +418,7 @@ class ParticleInteraction(Lowerable):
         self.sim.add_statement(Filter(self.sim, interaction_data.cutoff_condition, self.blocks[ishape*self.maxs + jshape]))
         self.apply_reductions(i, ishape, jshape)
 
-    @pairs_device_block
+    @pairs_block
     def lower(self):
         self.sim.module_name(f"{self.module_name}_local_interactions")
         if self.nbody == 2:
diff --git a/src/pairs/sim/simulation.py b/src/pairs/sim/simulation.py
index 9ac99771b38b07f860657e2b9bc9f8555526fff4..76e3cdb889ff4d0c6f9ef398c6e685ea7c312806 100644
--- a/src/pairs/sim/simulation.py
+++ b/src/pairs/sim/simulation.py
@@ -1,37 +1,23 @@
 from pairs.ir.arrays import Arrays
 from pairs.ir.block import Block
-from pairs.ir.branches import Filter
 from pairs.ir.features import Features, FeatureProperties
 from pairs.ir.kernel import Kernel
 from pairs.ir.layouts import Layouts
-from pairs.ir.module import Module, ModuleCall
+from pairs.ir.module import Module
 from pairs.ir.properties import Properties, ContactProperties
 from pairs.ir.symbols import Symbol
 from pairs.ir.types import Types
 from pairs.ir.variables import Variables
 #from pairs.graph.graphviz import ASTGraph
 from pairs.mapping.funcs import compute
-from pairs.sim.arrays import DeclareArrays
-from pairs.sim.cell_lists import CellLists, BuildCellLists, BuildCellListsStencil, PartitionCellLists, BuildCellNeighborLists
-from pairs.sim.comm import Comm, Synchronize, Borders, Exchange, ReverseComm
-from pairs.sim.contact_history import ContactHistory, BuildContactHistory, ClearUnusedContactHistory, ResetContactHistoryUsageStatus
-from pairs.sim.copper_fcc_lattice import CopperFCCLattice
-from pairs.sim.dem_sc_grid import DEMSCGrid
-from pairs.sim.domain import UpdateDomain
+from pairs.sim.cell_lists import CellLists, BuildCellLists, PartitionCellLists, BuildCellNeighborLists
+from pairs.sim.comm import Comm
+from pairs.sim.contact_history import ContactHistory
 from pairs.sim.domain_partitioners import DomainPartitioners
 from pairs.sim.domain_partitioning import BlockForest, DimensionRanges
 from pairs.sim.load_balancing_algorithms import LoadBalancingAlgorithms
-from pairs.sim.features import AllocateFeatureProperties
-from pairs.sim.grid import Grid2D, Grid3D, MutableGrid
-from pairs.sim.instrumentation import RegisterMarkers, RegisterTimers
-from pairs.sim.lattice import ParticleLattice
+from pairs.sim.grid import Grid3D
 from pairs.sim.neighbor_lists import NeighborLists, BuildNeighborLists
-from pairs.sim.properties import AllocateProperties, AllocateContactProperties, ResetVolatileProperties
-from pairs.sim.read_from_file import ReadParticleData
-from pairs.sim.thermo import ComputeThermo
-from pairs.sim.timestep import Timestep
-from pairs.sim.variables import DeclareVariables 
-from pairs.sim.vtk import VTKWrite
 from pairs.transformations import Transformations
 from pairs.code_gen.interface import InterfaceModules
 
@@ -176,45 +162,25 @@ class Simulation:
     def max_shapes(self):
         return len(self._shapes)
 
-    def add_udf_module(self, module):
-        assert isinstance(module, Module), "add_udf_module(): Given parameter is not of type Module!"
-        assert module.user_defined and not module.interface
-        if module.name not in [m.name for m in self.udf_module_list]:
-            self.udf_module_list.append(module)
-
     def add_interface_module(self, module):
         assert isinstance(module, Module), "add_interface_module(): Given parameter is not of type Module!"
-        assert module.interface and not module.user_defined
+        assert module.interface
         if module.name not in [m.name for m in self.interface_module_list]:
             self.interface_module_list.append(module)
 
     def add_module(self, module):
         assert isinstance(module, Module), "add_module(): Given parameter is not of type Module!"
-        assert not module.interface and not module.user_defined
+        assert not module.interface
         if module.name not in [m.name for m in self.module_list]:
             self.module_list.append(module)
 
     def interface_modules(self):
+        """List interface modules"""
         return self.interface_module_list
-    
-    def udf_modules(self):
-        return self.udf_module_list
-    
+        
     def modules(self):
-        """List simulation modules, with main always in the last position"""
-
-        sorted_mods = []
-        main_mod = None
-        for m in self.module_list:
-            if m.name != 'main':
-                sorted_mods.append(m)
-            else:
-                main_mod = m
-
-        if main_mod is not None:
-            sorted_mods += [main_mod]
-
-        return sorted_mods
+        """List internal modules"""
+        return self.module_list
 
     def add_kernel(self, kernel):
         assert isinstance(kernel, Kernel), "add_kernel(): Given parameter is not of type Kernel!"
@@ -330,8 +296,8 @@ class Simulation:
         self.neighbor_lists = NeighborLists(self, self.cell_lists)
         return self.neighbor_lists
 
-    def compute(self, func, cutoff_radius=None, symbols={}, parameters={}, compute_globals=False):
-        return compute(self, func, cutoff_radius, symbols, parameters, compute_globals)
+    def compute(self, func, cutoff_radius=None, symbols={}, parameters={}, compute_globals=False, run_on_device=True, profile=False):
+        return compute(self, func, cutoff_radius, symbols, parameters, compute_globals, run_on_device, profile)
 
 
     def init_block(self):
@@ -356,68 +322,13 @@ class Simulation:
         else:
             raise Exception("Two sizes assigned to same capacity!")
 
-    def build_setup_module_with_statements(self):
-        """Build a Module in the setup part of the program using the last initialized block"""
-
-        self.setup_functions.append(
-            Module(self,
-                name=self._module_name,
-                block=Block(self, self._block),
-                resizes_to_check=self._resizes_to_check,
-                check_properties_resize=self._check_properties_resize,
-                run_on_device=True))
-
-    def build_pre_step_module_with_statements(self, run_on_device=True, skip_first=False, profile=False):
-        """Build a Module in the pre-step part of the program using the last initialized block"""
-        module = Module(self, name=self._module_name,
-                              block=Block(self, self._block),
-                              resizes_to_check=self._resizes_to_check,
-                              check_properties_resize=self._check_properties_resize,
-                              run_on_device=run_on_device)
-
-        if profile:
-            module.profile()
-
-        if skip_first:
-            self.pre_step_functions.append((module, {'skip_first': True}))
-
-        else:
-            self.pre_step_functions.append(module)
-
-    def build_module_with_statements(self, run_on_device=True, skip_first=False, profile=False):
-        """Build a Module in the compute part of the program using the last initialized block"""
-        module = Module(self, name=self._module_name,
-                              block=Block(self, self._block),
-                              resizes_to_check=self._resizes_to_check,
-                              check_properties_resize=self._check_properties_resize,
-                              run_on_device=run_on_device)
-        if profile:
-            module.profile()
-
-        if skip_first:
-            self.functions.append((module, {'skip_first': True}))
-
-        else:
-            self.functions.append(module)
-
-    def build_user_defined_function(self, run_on_device=True):
-        """Build a user-defined Module that will be callable seperately as part of the interface"""
-        Module(self, name=self._module_name,
-                block=Block(self, self._block),
-                resizes_to_check=self._resizes_to_check,
-                check_properties_resize=self._check_properties_resize,
-                run_on_device=run_on_device,
-                user_defined=True)
-        
-
-    def build_interface_module(self, run_on_device=False):
+    def build_interface_module_with_statements(self, run_on_device=False, profile=False):
         """Build a user-defined Module that will be callable seperately as part of the interface"""
         Module(self, name=self._module_name,
                 block=Block(self, self._block),
                 resizes_to_check=self._resizes_to_check,
                 check_properties_resize=self._check_properties_resize,
                 run_on_device=run_on_device,
-                user_defined=False,
                 interface=True)
         
     def capture_statements(self, capture=True):
diff --git a/src/pairs/sim/timestep.py b/src/pairs/sim/timestep.py
deleted file mode 100644
index abef09a055507f431af554bf7163603e35f8e3cc..0000000000000000000000000000000000000000
--- a/src/pairs/sim/timestep.py
+++ /dev/null
@@ -1,72 +0,0 @@
-from pairs.ir.scalars import ScalarOp
-from pairs.ir.block import Block
-from pairs.ir.branches import Branch, Filter
-from pairs.ir.functions import Call_Void
-from pairs.ir.loops import For
-from pairs.ir.timers import Timers
-
-
-class Timestep:
-    def __init__(self, sim, nsteps, item_list=None):
-        self.sim = sim
-        self.block = Block(sim, [])
-        self.timestep_loop = For(sim, 0, nsteps + 1, self.block) if self.sim._generate_whole_program else None
-
-        if item_list is not None:
-            for item in item_list:
-                if isinstance(item, tuple):
-                    stmt_else = None
-
-                    if len(item) == 2:
-                        stmt, params = item
-
-                    if len(item) == 3:
-                        stmt, stmt_else, params = item
-
-                    exec_every = 0 if 'every' not in params else params['every']
-                    skip_first = False if 'skip_first' not in params else params['skip_first']
-                    self.add(stmt, exec_every, stmt_else, skip_first)
-
-                else:
-                    self.add(item)
-
-    def timestep(self):
-        return self.timestep_loop.iter() if self.sim._generate_whole_program else self.sim.sim_timestep
-
-    def add(self, item, exec_every=0, item_else=None, skip_first=False):
-        assert exec_every >= 0, "exec_every parameter must be higher or equal than zero!"
-        stmts = item if not isinstance(item, Block) else item.statements()
-        stmts_else = None
-        ts = self.timestep() 
-        self.sim.enter(self.block)
-
-        if item_else is not None:
-            stmts_else = item_else if not isinstance(item_else, Block) else item_else.statements()
-
-        if exec_every > 0:
-            cond = ScalarOp.or_op(ScalarOp.cmp((ts + 1) % exec_every, 0), ScalarOp.cmp(ts, 0))
-            one_way = True if stmts_else is None else False
-
-            self.block.add_statement(
-                Branch(self.sim, ScalarOp.inline(cond), one_way,
-                    Block(self.sim, stmts),
-                    Block(self.sim, stmts_else) if not one_way else None))
-
-        elif skip_first:
-            self.block.add_statement(Filter(self.sim, ScalarOp.inline(ts > 0), Block(self.sim, stmts)))
-
-        else:
-            self.block.add_statement(stmts)
-
-        self.sim.leave()
-
-    def as_block(self):
-        _capture = self.sim._capture_statements
-        self.sim.capture_statements(False)
-
-        block = Block(self.sim, [Call_Void(self.sim, "pairs::start_timer", [Timers.All]),
-                                 self.timestep_loop if self.sim._generate_whole_program else self.block,
-                                 Call_Void(self.sim, "pairs::stop_timer", [Timers.All])])
-
-        self.sim.capture_statements(_capture)
-        return block
diff --git a/src/pairs/transformations/__init__.py b/src/pairs/transformations/__init__.py
index 837286506f288576b8ad82e456e7a4b09cc935a1..314fd176652708d4286f7f17dc3e7e64ffebfa5f 100644
--- a/src/pairs/transformations/__init__.py
+++ b/src/pairs/transformations/__init__.py
@@ -111,9 +111,5 @@ class Transformations:
         self.add_expression_declarations()
         # self.add_host_references_to_modules()
         self.add_device_references_to_modules()
-        
-        # TODO: Place stop timers before the function returns
-        # or simply don't instrument modules that have a non-void return type
-        # to avoid having to deal with returns within conditional blocks 
-        # self.add_instrumentation()
+        self.add_instrumentation()
 
diff --git a/src/pairs/transformations/devices.py b/src/pairs/transformations/devices.py
index 7a01fee52a1fbdb362d843dc508b26155bfd1d24..1eb6f1d36a7d97036de2336257508f0879dcd825 100644
--- a/src/pairs/transformations/devices.py
+++ b/src/pairs/transformations/devices.py
@@ -36,9 +36,6 @@ class AddDeviceCopies(Mutator):
                 if isinstance(s, ModuleCall):
                     copy_context = Contexts.Device if s.module.run_on_device else Contexts.Host
                     clear_context = Contexts.Host if s.module.run_on_device else Contexts.Device
-                    # new_stmts += [
-                    #     Call_Void(ast_node.sim, "pairs::start_timer", [Timers.DeviceTransfers])
-                    # ]
 
                     for array, action in s.module.arrays().items():
                         # TODO: Add device copies only if they are not mannualy taken care of inside the module
@@ -69,27 +66,17 @@ class AddDeviceCopies(Mutator):
                             # if var not in s.module.device_copies() and var in s.module.host_references():
                             #     new_stmts += [CopyVar(s.sim, var, Contexts.Host, action)]
 
-                    # new_stmts += [
-                    #     Call_Void(ast_node.sim, "pairs::stop_timer", [Timers.DeviceTransfers])
-                    # ]
 
                 new_stmts.append(s)
 
                 if isinstance(s, ModuleCall):
                     if s.module.run_on_device:
-                        # new_stmts += [
-                        #     Call_Void(ast_node.sim, "pairs::start_timer", [Timers.DeviceTransfers])
-                        # ]
-
                         for var, action in s.module.variables().items():
                             if action != Actions.ReadOnly and var.device_flag:
                                 new_stmts += [CopyVar(s.sim, var, Contexts.Host, action)]
 
                         if self.module_resizes[s.module]:
                             new_stmts += [CopyArray(s.sim, s.sim.resizes, Contexts.Host, Actions.Ignore)]
-                        # new_stmts += [
-                        #     Call_Void(ast_node.sim, "pairs::stop_timer", [Timers.DeviceTransfers])
-                        # ]
 
         ast_node.stmts = new_stmts
         return ast_node
diff --git a/src/pairs/transformations/instrumentation.py b/src/pairs/transformations/instrumentation.py
index 88b73c0d8406b97392d267ace3b1e1bbbb3ca068..5d039b8cb3ed8ee628b098a6126491b49402e107 100644
--- a/src/pairs/transformations/instrumentation.py
+++ b/src/pairs/transformations/instrumentation.py
@@ -1,8 +1,8 @@
 from pairs.ir.block import Block
 from pairs.ir.functions import Call_Void
-from pairs.ir.module import ModuleCall
 from pairs.ir.mutator import Mutator
 from pairs.ir.timers import Timers
+from pairs.ir.types import Types
 
 
 class AddModulesInstrumentation(Mutator):
@@ -12,7 +12,8 @@ class AddModulesInstrumentation(Mutator):
     def mutate_ModuleCall(self, ast_node):
         ast_node._module = self.mutate(ast_node._module)
         module = ast_node._module
-        if module.name == 'main' or module.name == 'initialize':
+
+        if module.name == 'initialize' or module.name == 'end' or module.return_type!=Types.Void:
             return ast_node
 
         if module.must_profile():