[0.1.1] - 2025-07-07

Jakub-Espandr · Jakub-Espandr · commit 9b28c517fab8 · 2025-07-07T22:05:04.000+02:00
### Added
- Support for CSV files where the `ClassValue` column contains descriptive text after the class code (e.g., `C_1 - nezasazena uroda`).
- Automatic detection and processing of any number of classes (`C_1`, `C_2`, `C_3`, ...).

### Fixed
- Fixed a crash when loading CSV files where `ClassValue` did not exactly match `C_1`, `C_2`, etc.
- Improved error messages for invalid or empty data in CSV files.

### Changed
- Metrics calculation is now robust to various CSV formats and works for any number of classes.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,20 @@
 # Changelog
 
+## [0.1.1] - 2025-07-07
+
+### Added
+- Support for CSV files where the `ClassValue` column contains descriptive text after the class code (e.g., `C_1 - nezasazena uroda`).
+- Automatic detection and processing of any number of classes (`C_1`, `C_2`, `C_3`, ...).
+
+### Fixed
+- Fixed a crash when loading CSV files where `ClassValue` did not exactly match `C_1`, `C_2`, etc.
+- Improved error messages for invalid or empty data in CSV files.
+
+### Changed
+- Metrics calculation is now robust to various CSV formats and works for any number of classes.
+
+---
+
 ## [0.1.0] - 2025-06-22
 
 ### Added
diff --git a/core/metrics.py b/core/metrics.py
@@ -2,25 +2,64 @@
 from pathlib import Path
 from openpyxl import Workbook
 from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, cohen_kappa_score
-from .translations import TRANSLATIONS
+from .translations import TRANSLATIONS, get_class_names
 
 def compute_metrics(df, language='cs'):
     """Compute metrics from confusion matrix data"""
     df = df.copy()
     df.columns = df.columns.astype(str)
 
-    # Filter for C_1 and C_2 rows
-    df_cm = df[df['ClassValue'].isin(['C_1', 'C_2'])]
+    # Find C_* columns (handle cases like "C_1 - nezasazena uroda")
+    c_columns = [col for col in df.columns if col.startswith('C_') and '_' in col]
+    if not c_columns:
+        raise ValueError("No C_* columns found in the CSV file")
+    
+    # Sort columns to ensure C_1, C_2, C_3, etc. order
+    c_columns.sort(key=lambda x: int(x.split('_')[1].split()[0]) if x.split('_')[1].split()[0].isdigit() else 0)
+    
+    # Filter for rows that have ClassValue matching the C_* pattern
+    # Look for ClassValue entries that start with C_ and contain a number
+    class_values = []
+    for col in c_columns:
+        class_num = col.split('_')[1].split()[0]  # Extract number from C_1, C_2, etc.
+        if class_num.isdigit():
+            class_values.append(f'C_{class_num}')
+    
+    if not class_values:
+        raise ValueError("No valid class values found in ClassValue column")
+    
+    df_cm = df[df['ClassValue'].astype(str).str.startswith(tuple(class_values))]
+    
+    if df_cm.empty:
+        raise ValueError("No rows found with matching ClassValue entries")
 
-    # Handle decimal commas
-    df_cm['C_1'] = df_cm['C_1'].astype(str).str.replace(',', '.').astype(float).astype(int)
-    df_cm['C_2'] = df_cm['C_2'].astype(str).str.replace(',', '.').astype(float).astype(int)
+    # Handle decimal commas and convert to numeric
+    for col in c_columns:
+        df_cm[col] = df_cm[col].astype(str).str.replace(',', '.').astype(float).astype(int)
 
-    cm = df_cm[['C_1', 'C_2']].to_numpy()
+    # Create confusion matrix from the C_* columns
+    cm = df_cm[c_columns].to_numpy()
+    
+    if cm.size == 0 or cm.shape[0] == 0:
+        raise ValueError("Confusion matrix is empty")
 
-    y_true = [0] * int(cm[0, :].sum()) + [1] * int(cm[1, :].sum())
-    y_pred = [0] * int(cm[0, 0]) + [1] * int(cm[0, 1]) + [0] * int(cm[1, 0]) + [1] * int(cm[1, 1])
+    # Create y_true and y_pred arrays
+    y_true = []
+    y_pred = []
+    
+    for i, row in enumerate(cm):
+        # Add true labels (class i repeated by the sum of that row)
+        row_sum = int(row.sum())
+        y_true.extend([i] * row_sum)
+        
+        # Add predicted labels
+        for j, count in enumerate(row):
+            y_pred.extend([j] * int(count))
+    
+    if not y_true or not y_pred:
+        raise ValueError("No valid predictions found in the data")
 
+    # Calculate metrics
     precision = precision_score(y_true, y_pred, average=None, zero_division=0)
     recall = recall_score(y_true, y_pred, average=None, zero_division=0)
     f1 = f1_score(y_true, y_pred, average=None, zero_division=0)
@@ -31,12 +70,33 @@ def compute_metrics(df, language='cs'):
     avg_recall = round(recall_score(y_true, y_pred, average='macro', zero_division=0), 3)
     avg_f1 = round(f1_score(y_true, y_pred, average='macro', zero_division=0), 3)
 
-    class_names = TRANSLATIONS[language]['class_names']
-    return [
-        [class_names[0], round(precision[0], 3), round(recall[0], 3), round(f1[0], 3), accuracy, kappa],
-        [class_names[1], round(precision[1], 3), round(recall[1], 3), round(f1[1], 3), accuracy, kappa],
-        [class_names[2], avg_precision, avg_recall, avg_f1, accuracy, kappa]
-    ]
+    # Generate class names based on the number of classes found
+    class_names = get_class_names(len(c_columns), language)
+    
+    # Create results for each class
+    results = []
+    for i in range(len(c_columns)):
+        if i < len(precision):
+            results.append([
+                class_names[i] if i < len(class_names) else f"Class {i+1}", 
+                round(precision[i], 3), 
+                round(recall[i], 3), 
+                round(f1[i], 3), 
+                accuracy, 
+                kappa
+            ])
+    
+    # Add average row
+    results.append([
+        class_names[-1] if len(class_names) > len(c_columns) else "Average", 
+        avg_precision, 
+        avg_recall, 
+        avg_f1, 
+        accuracy, 
+        kappa
+    ])
+    
+    return results
 
 def export_to_excel(input_path, output_path, language='cs'):
     """Export metrics to Excel file"""
diff --git a/core/translations.py b/core/translations.py
@@ -118,4 +118,14 @@
         'excel_metrics_sheet': 'Metrics',
         'excel_data_sheet': 'Data'
     }
-} 
+}
+
+def get_class_names(num_classes, language='cs'):
+    """Return the labels 'C_1'..'C_<num_classes>' plus one trailing average-row label."""
+    if language == 'cs':
+        class_names = [f"C_{i+1}" for i in range(num_classes)]
+        class_names.append("Průměr")  # Czech label for the average row
+    else:  # any language other than 'cs' gets the English label
+        class_names = [f"C_{i+1}" for i in range(num_classes)]
+        class_names.append("Average")
+    return class_names