diff --git a/cbutil/ncu_parser.py b/cbutil/ncu_parser.py
index 820ae26645979cceb04805ba6b7ed635a20c8472..9af5f27c88e2f7a36791de14cc79ba9fe99164c3 100644
--- a/cbutil/ncu_parser.py
+++ b/cbutil/ncu_parser.py
@@ -1,5 +1,6 @@
 import numpy as np
 import pandas as pd
+import re
 from cbutil.ncu_keys import (
     memory_write_data_key,
     memory_read_data_key,
@@ -27,17 +28,17 @@ def get_unit(col):
 
 
 def detect_prefix(unit):
-    if unit[0] == 'G':
+    if unit[0] == "G":
         return 1e9
-    elif unit[0] == 'M':
+    elif unit[0] == "M":
         return 1e6
-    elif unit[0] == 'K':
+    elif unit[0] == "K":
         return 1e3
-    elif unit[0] == 'm':
+    elif unit[0] == "m":
         return 1e-3
-    elif unit[0] == 'u':
+    elif unit[0] == "u":
         return 1e-6
-    elif unit[0] == 'n':
+    elif unit[0] == "n":
         return 1e-9
     else:
         return 1
@@ -61,35 +62,57 @@ def normalize_and_add_prefix(value, prefix: str):
 
 
 def extract_raw_counter(df: pd.DataFrame):
-
     fields = pd.DataFrame()
     tags = pd.DataFrame()
 
     tags["Block Size"] = df["Block Size"]
     tags["Grid Size"] = df["Grid Size"]
-    tags["GPU"] = df["device__attribute_display_name"]
+    tags["GPU"] = df["device__attribute_display_name"].str.replace(" ", "", regex=False)
 
-    fields[memory_write_data_key] = normalize_and_add_prefix(df["dram__bytes_write.sum"], 'G')
-    fields[memory_read_data_key] = normalize_and_add_prefix(df["dram__bytes_read.sum"], 'G')
-    fields[memory_data_key] = fields[memory_write_data_key] + fields[memory_read_data_key]
+    fields[memory_write_data_key] = normalize_and_add_prefix(
+        df["dram__bytes_write.sum"], "G"
+    )
+    fields[memory_read_data_key] = normalize_and_add_prefix(
+        df["dram__bytes_read.sum"], "G"
+    )
+    fields[memory_data_key] = (
+        fields[memory_write_data_key] + fields[memory_read_data_key]
+    )
 
-    fields[memory_write_bandwidth_key] = normalize_and_add_prefix(df["dram__bytes_write.sum.per_second"], 'M')
-    fields[memory_read_bandwidth_key] = normalize_and_add_prefix(df["dram__bytes_read.sum.per_second"], 'M')
-    fields[memory_bandwidth_key] = normalize_and_add_prefix(df["dram__bytes.sum.per_second"], 'M')
+    fields[memory_write_bandwidth_key] = normalize_and_add_prefix(
+        df["dram__bytes_write.sum.per_second"], "M"
+    )
+    fields[memory_read_bandwidth_key] = normalize_and_add_prefix(
+        df["dram__bytes_read.sum.per_second"], "M"
+    )
+    fields[memory_bandwidth_key] = normalize_and_add_prefix(
+        df["dram__bytes.sum.per_second"], "M"
+    )
     fields[runtime_key] = get_normalized(df["gpu__time_duration.sum"])
 
-    fields[smsp_cycles_key] = get_normalized(df["smsp__cycles_elapsed.avg.per_second"])
+    fields[smsp_cycles_key] = get_normalized(
+        df["smsp__cycles_elapsed.avg.per_second"])
     fields[smsp_cycles_total_key] = fields[smsp_cycles_key] * fields[runtime_key]
     fields[fp_inst_per_cycle_key] = (
-        2 * df["smsp__sass_thread_inst_executed_op_dfma_pred_on.sum.per_cycle_elapsed"] +
-        df["smsp__sass_thread_inst_executed_op_dadd_pred_on.sum.per_cycle_elapsed"] +
-        df["smsp__sass_thread_inst_executed_op_dmul_pred_on.sum.per_cycle_elapsed"]
+        2 * df["smsp__sass_thread_inst_executed_op_dfma_pred_on.sum.per_cycle_elapsed"]
+        + df["smsp__sass_thread_inst_executed_op_dadd_pred_on.sum.per_cycle_elapsed"]
+        + df["smsp__sass_thread_inst_executed_op_dmul_pred_on.sum.per_cycle_elapsed"]
+    )
+    fields[total_fp_inst_key] = (
+        fields[fp_inst_per_cycle_key] * fields[smsp_cycles_total_key]
     )
-    fields[total_fp_inst_key] = fields[fp_inst_per_cycle_key] * fields[smsp_cycles_total_key]
 
-    fields[operational_intensity_key] = fields[total_fp_inst_key] / (fields[memory_data_key] * 1e9)
-    fields[p_max_key] = add_unit_prefix(fields[operational_intensity_key] * fields[memory_bandwidth_key] * 1e6, 'M')
-    fields[dp_key] = np.divide(np.asarray(fields[total_fp_inst_key]), fields[runtime_key]) / 1e6
+    fields[operational_intensity_key] = fields[total_fp_inst_key] / (
+        fields[memory_data_key] * 1e9
+    )
+    fields[p_max_key] = add_unit_prefix(
+        fields[operational_intensity_key] *
+        fields[memory_bandwidth_key] * 1e6, "M"
+    )
+    fields[dp_key] = (
+        np.divide(np.asarray(fields[total_fp_inst_key]),
+                  fields[runtime_key]) / 1e6
+    )
     return fields, tags