performance analysis improvements

marcomelloni · marcomelloni · commit 0fb99fc4ee71 · 2025-07-14T11:11:26.000+02:00
diff --git a/performance/performance_analysis.py b/performance/performance_analysis.py
@@ -6,14 +6,15 @@
 import pandas as pd
 
 
-DEFAULT_INPUT  = Path("../performance_log/performance_metrics.csv")
-DEFAULT_OUTPUT = Path("total_overhead_summary.csv")
+# Default paths (override from the CLI with -i / -o)
+DEFAULT_INPUT  = Path("../performance_log/performance_metrics_v5.csv")
+DEFAULT_OUTPUT = Path("overhead_summary.csv")
 
 
 def parse_and_average(cell: str | float | int | pd.NA) -> float | np.float64:
     """
     Convert a semicolon-separated list of numbers (seconds) to the mean
-    in **milliseconds**.
+    in **milliseconds**. If the cell is empty or NaN → np.nan.
     """
     if pd.isna(cell):
         return np.nan
@@ -25,9 +26,9 @@ def parse_and_average(cell: str | float | int | pd.NA) -> float | np.float64:
 
 
 def build_parser() -> argparse.ArgumentParser:
-    """ Builds the command-line argument parser for the script."""
+    """Builds the command-line argument parser for the script."""
     parser = argparse.ArgumentParser(
-        description="Analyse PerformanceMonitor CSV and create a summary.")
+        description="Analyse PerformanceMonitor CSV and create an overhead summary.")
     parser.add_argument(
         "-i", "--input", type=Path, default=DEFAULT_INPUT,
         help=f"Path to the raw PerformanceMonitor CSV (default: {DEFAULT_INPUT})")
@@ -40,23 +41,56 @@ def build_parser() -> argparse.ArgumentParser:
 def main() -> None:
     args = build_parser().parse_args()
     df = pd.read_csv(args.input)
-    df.columns = df.columns.str.strip()  # remove any whitespace in headers
+    df.columns = df.columns.str.strip()          # Strip leading/trailing whitespace from column names
 
-    required_cols = {"Client Protocol", "Simulation Type", "Total Overheads"}
+    # Check that required columns exist
+    required_cols = {
+        "Client Protocol",
+        "Simulation Type",
+        "Input Overhead",
+        "Output Overheads",
+        "Total Overheads",
+    }
     missing = required_cols - set(df.columns)
     if missing:
-        raise KeyError(f"Missing columns in CSV: {', '.join(missing)}")
+        raise KeyError(f"Missing columns in CSV: {', '.join(sorted(missing))}")
 
-    # Compute mean Total Overhead per operation
-    df["Avg Total Overhead"] = df["Total Overheads"].apply(parse_and_average)
+    # Convert the three overhead columns to milliseconds (average per row)
+    df["Avg Input Overhead"]   = df["Input Overhead"].apply(parse_and_average)
+    df["Avg Output Overhead"]  = df["Output Overheads"].apply(parse_and_average)
+    df["Avg Total Overhead"]   = df["Total Overheads"].apply(parse_and_average)
 
-    # Group and aggregate statistics
+    # Group by Client Protocol + Simulation Type and calculate statistics
     groups = df.groupby(["Client Protocol", "Simulation Type"])
-    summary = groups["Avg Total Overhead"].agg(
-        Median="median",
-        StdDev="std",
-        Pct5=lambda x: np.percentile(x.dropna(), 5),
-        Pct95=lambda x: np.percentile(x.dropna(), 95),
+
+    summary = groups.agg(
+        # Input Overhead
+        Input_Median = pd.NamedAgg(
+            column="Avg Input Overhead", aggfunc="median"),
+        Input_StdDev = pd.NamedAgg(
+            column="Avg Input Overhead", aggfunc="std"),
+        Input_Pct5  = pd.NamedAgg(
+            column="Avg Input Overhead", aggfunc=lambda x: np.percentile(x.dropna(), 5)),
+        Input_Pct95 = pd.NamedAgg(
+            column="Avg Input Overhead", aggfunc=lambda x: np.percentile(x.dropna(), 95)),
+        # Output Overhead
+        Output_Median = pd.NamedAgg(
+            column="Avg Output Overhead", aggfunc="median"),
+        Output_StdDev = pd.NamedAgg(
+            column="Avg Output Overhead", aggfunc="std"),
+        Output_Pct5  = pd.NamedAgg(
+            column="Avg Output Overhead", aggfunc=lambda x: np.percentile(x.dropna(), 5)),
+        Output_Pct95 = pd.NamedAgg(
+            column="Avg Output Overhead", aggfunc=lambda x: np.percentile(x.dropna(), 95)),
+        # Total Overhead
+        Total_Median = pd.NamedAgg(
+            column="Avg Total Overhead", aggfunc="median"),
+        Total_StdDev = pd.NamedAgg(
+            column="Avg Total Overhead", aggfunc="std"),
+        Total_Pct5  = pd.NamedAgg(
+            column="Avg Total Overhead", aggfunc=lambda x: np.percentile(x.dropna(), 5)),
+        Total_Pct95 = pd.NamedAgg(
+            column="Avg Total Overhead", aggfunc=lambda x: np.percentile(x.dropna(), 95)),
     ).reset_index()
 
     summary.to_csv(args.output, index=False)