# V0_V1_kmeans_anomaly_unsupervised_with_comment
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score
import argparse
import os
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.decomposition import PCA  # For dimensionality reduction if needed

# Set up command line arguments
parser = argparse.ArgumentParser(description='Anomaly detection using K-Means clustering (Unsupervised).')
parser.add_argument('--timesteps', type=int, default=20, help='Number of timesteps for sequences.')
parser.add_argument('--n_clusters', type=int, default=5, help='Number of clusters for K-Means.')
parser.add_argument('--transition', action='store_true', help='Use transition data for testing.')
parser.add_argument('--plot_before', action='store_true', help='Plot data before training.')
parser.add_argument('--plot_after', action='store_true', help='Plot data after clustering.')
parser.add_argument('--plot_transition', action='store_true', help='Plot transition data with clusters.')
parser.add_argument('--plot_anomalies', action='store_true', help='Plot potential anomalies based on distance.')
parser.add_argument('--plot_anomaly_clusters', action='store_true', help='Highlight clusters with potential anomalies.')
parser.add_argument('--manual_anomaly_labels', type=str, default=None, help='Path to CSV with manual anomaly labels for evaluation.')

options = parser.parse_args()

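# Example invocations (illustrative only; the script filename is assumed from the
# header comment above, and the flag values and labels.csv path are just suggestions):
#   python V0_V1_kmeans_anomaly_unsupervised_with_comment.py --plot_before --plot_after
#   python V0_V1_kmeans_anomaly_unsupervised_with_comment.py --timesteps 30 --n_clusters 6 --transition --plot_anomaly_clusters
#   python V0_V1_kmeans_anomaly_unsupervised_with_comment.py --plot_anomalies --manual_anomaly_labels labels.csv
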
# Number of clusters and sequence length from command line arguments
n_clusters = options.n_clusters
timesteps = options.timesteps

#####################################################################################################

NumberOfFailures = 4  # So far, we have only data for the first 4 types of failures
datafiles = [[], []]  # 0 for train, 1 for test
for i in range(NumberOfFailures + 1):
    datafiles[0].append([])
    datafiles[1].append([])

# Next set of data corresponds to Freezer, SP=-26
datafiles[0][0] = ['2024-08-07_5_', '2024-08-08_5_', '2025-01-25_5_', '2025-01-26_5_']
datafiles[0][1] = ['2024-12-11_5_', '2024-12-12_5_', '2024-12-13_5_']
datafiles[0][2] = ['2024-12-18_5_', '2024-12-21_5_', '2024-12-22_5_', '2024-12-23_5_', '2024-12-24_5_']
datafiles[0][3] = ['2024-12-28_5_', '2024-12-29_5_', '2024-12-30_5_']
datafiles[0][4] = ['2025-02-13_5_', '2025-02-14_5_']

if options.transition:
    datafiles[1][0] = ['2025-01-27_5_', '2025-01-28_5_']
    datafiles[1][1] = ['2024-12-14_5_', '2024-12-15_5_', '2024-12-16_5_']  # with TRANSITION
    datafiles[1][2] = ['2024-12-17_5_', '2024-12-19_5_', '2024-12-25_5_', '2024-12-26_5_']  # with TRANSITION
    datafiles[1][3] = ['2024-12-27_5_', '2024-12-31_5_', '2025-01-01_5_']  # with TRANSITION
    datafiles[1][4] = ['2025-02-12_5_', '2025-02-15_5_', '2025-02-16_5_']
else:
    datafiles[1][0] = ['2025-01-27_5_', '2025-01-28_5_']
    datafiles[1][1] = ['2024-12-14_5_', '2024-12-15_5_']
    datafiles[1][2] = ['2024-12-19_5_', '2024-12-25_5_', '2024-12-26_5_']
    datafiles[1][3] = ['2024-12-31_5_', '2025-01-01_5_']
    datafiles[1][4] = ['2025-02-15_5_', '2025-02-16_5_']

# Features used
features = ['r1 s1', 'r1 s4', 'r1 s5']

featureNames = {}
featureNames['r1 s1'] = r'$T_{evap}$'
featureNames['r1 s4'] = r'$T_{cond}$'
featureNames['r1 s5'] = r'$T_{air}$'

unitNames = {}
unitNames['r1 s1'] = r'($^o$C)'
unitNames['r1 s4'] = r'($^o$C)'
unitNames['r1 s5'] = r'($^o$C)'

NumFeatures = len(features)

#####################################################################################################

# Load and merge training data
dataTrain = []
for class_files in datafiles[0]:
    class_dfs = []
    for base_filename in class_files:
        script_dir = os.path.dirname(os.path.abspath(__file__))
        data_dir = os.path.join(script_dir, 'data')
        filepath = os.path.join(data_dir, f'{base_filename}.csv')
        try:
            df = pd.read_csv(filepath)
            df['timestamp'] = pd.to_datetime(df['datetime'], format='%m/%d/%Y %H:%M', errors='coerce')
            df['timestamp'] = df['timestamp'].fillna(pd.to_datetime(df['datetime'], format='%d-%m-%Y %H:%M:%S', errors='coerce'))
            for col in features:
                df[col] = pd.to_numeric(df[col], errors='coerce')
            df = df.set_index('timestamp').resample('5Min')[features].mean()  # Resample and calculate mean only for features
            df = df[features].interpolate()  # Estimate missing values using linear interpolation
            class_dfs.append(df)
        except FileNotFoundError:
            print(f"Warning: File {filepath} not found and skipped.")
    if class_dfs:
        dataTrain.append(pd.concat(class_dfs))

# Concatenate all training data into a single DataFrame
combined_train_data = pd.concat(dataTrain)

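# Illustrative sketch of the raw CSV layout the loaders above and below assume
# (the 'datetime' column and the feature names come from the code; the sample values
# are invented). Two datetime formats are tolerated: '%m/%d/%Y %H:%M' and '%d-%m-%Y %H:%M:%S'.
#
#   datetime,r1 s1,r1 s4,r1 s5
#   12/14/2024 00:00,-27.5,35.2,-25.9
#   12/14/2024 00:05,-27.3,35.0,-25.8
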
# Load and merge test data
dataTest = []
for class_files in datafiles[1]:
    class_dfs = []
    for base_filename in class_files:
        script_dir = os.path.dirname(os.path.abspath(__file__))
        data_dir = os.path.join(script_dir, 'data')
        filepath = os.path.join(data_dir, f'{base_filename}.csv')
        try:
            df = pd.read_csv(filepath)
            df['timestamp'] = pd.to_datetime(df['datetime'], format='%m/%d/%Y %H:%M', errors='coerce')
            df['timestamp'] = df['timestamp'].fillna(pd.to_datetime(df['datetime'], format='%d-%m-%Y %H:%M:%S', errors='coerce'))
            for col in features:
                df[col] = pd.to_numeric(df[col], errors='coerce')
            df = df.set_index('timestamp').resample('5Min')[features].mean()  # Resample and calculate mean only for features
            df = df[features].interpolate()  # Estimate missing values using linear interpolation
            class_dfs.append(df)
        except FileNotFoundError:
            print(f"Warning: File {filepath} not found and skipped.")
    if class_dfs:
        dataTest.append(pd.concat(class_dfs))

# Plot data before training
if options.plot_before:
    num_features = len(features)
    fig, axes = plt.subplots(num_features, 1, figsize=(15, 5 * num_features), sharex=True)
    for i, feature in enumerate(features):
        for k, df in enumerate(dataTrain):
            axes[i].plot(df.index, df[feature], label=f'Train Class {k}')
        axes[i].set_ylabel(f'{featureNames[feature]} {unitNames[feature]}')
        axes[i].set_title(f'Train Data - {featureNames[feature]}')
        axes[i].legend()
    plt.tight_layout()
    plt.show()

########################################################################################################

# Normalize the data
scaler = StandardScaler()
scaled_train_data = scaler.fit_transform(combined_train_data[features])  # Normalize only the features

scaled_test_data_list = []
for df in dataTest:
    scaled_test_data_list.append(scaler.transform(df[features]))  # Normalize only the features

# Convert normalized data to DataFrame for easier handling
scaled_train_df = pd.DataFrame(scaled_train_data, columns=features, index=combined_train_data.index)
scaled_test_df_list = [pd.DataFrame(data, columns=features, index=df.index) for data, df in zip(scaled_test_data_list, dataTest)]

############################################################################################################

# Create time sequences
def create_sequences(data, timesteps):
    sequences = []
    for i in range(len(data) - timesteps + 1):
        sequences.append(data[i:i + timesteps])
    return np.array(sequences)

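# Worked example (assuming the defaults above: timesteps=20 and the 3 features):
# a frame with 100 rows yields 100 - 20 + 1 = 81 overlapping sequences of shape (20, 3);
# each sequence is flattened below into a 20 * 3 = 60-dimensional vector for K-Means.
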
# Create time sequences for training data
X_train_sequences = create_sequences(scaled_train_df.values, timesteps)
n_samples, n_timesteps, n_features = X_train_sequences.shape
X_train_reshaped = X_train_sequences.reshape(n_samples, n_timesteps * n_features)

# Create time sequences for test data
X_test_sequences_list = [create_sequences(df.values, timesteps) for df in scaled_test_df_list]
X_test_reshaped_list = [seq.reshape(seq.shape[0], seq.shape[1] * seq.shape[2]) for seq in X_test_sequences_list]

############################################################################################################

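# Optional sketch (an assumption, matching the otherwise unused PCA import above):
# the flattened sequences are timesteps * n_features wide, so a PCA projection could
# shrink them before clustering. Defined but not called, to leave the original
# pipeline untouched; the helper name and n_components value are hypothetical.
def reduce_sequences_with_pca(train_matrix, test_matrices, n_components=10):
    """Fit PCA on the training sequences and apply the same projection to each test set."""
    pca = PCA(n_components=n_components, random_state=42)
    train_reduced = pca.fit_transform(train_matrix)
    test_reduced = [pca.transform(m) for m in test_matrices]
    return pca, train_reduced, test_reduced
# Example use: _, X_train_red, X_test_red_list = reduce_sequences_with_pca(X_train_reshaped, X_test_reshaped_list)
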
# Train the K-Means model
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)  # n_init restarts help avoid poor local optima
kmeans.fit(X_train_reshaped)
train_labels = kmeans.labels_

# Evaluate clustering quality for training data
silhouette_avg_train = silhouette_score(X_train_reshaped, train_labels)
db_index_train = davies_bouldin_score(X_train_reshaped, train_labels)
print("\nK-Means Model Evaluation on Training Data:")
print(f"\nSilhouette Score (Train Data): {silhouette_avg_train:.4f}")
print(f"Davies-Bouldin Index (Train Data): {db_index_train:.4f}")

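# Optional sketch (an assumption, not invoked anywhere in the original flow): sweep
# candidate cluster counts and compare silhouette scores as a rough sanity check for
# the --n_clusters choice; the helper name and k range are hypothetical.
def sweep_cluster_counts(data, k_values=range(2, 9)):
    """Fit K-Means for each candidate k and return a {k: silhouette score} dict (needs k >= 2)."""
    scores = {}
    for k in k_values:
        km = KMeans(n_clusters=k, random_state=42, n_init=10).fit(data)
        scores[k] = silhouette_score(data, km.labels_)
    return scores
# Example use: print(sweep_cluster_counts(X_train_reshaped))
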
# Predict clusters for test data
cluster_labels_test_list = [kmeans.predict(X_test_reshaped) for X_test_reshaped in X_test_reshaped_list]

# Evaluate clustering quality for test data (only when there is more than one sample and more than one predicted cluster)
for i, labels_test in enumerate(cluster_labels_test_list):
    if len(X_test_reshaped_list[i]) > 1 and len(np.unique(labels_test)) > 1:
        silhouette_avg_test = silhouette_score(X_test_reshaped_list[i], labels_test)
        db_index_test = davies_bouldin_score(X_test_reshaped_list[i], labels_test)
        print(f"\nSilhouette Score (Test Data - Set {i+1}): {silhouette_avg_test:.4f}")
        print(f"Davies-Bouldin Index (Test Data - Set {i+1}): {db_index_test:.4f}")
    else:
        print(f"\nCannot calculate Silhouette Score and Davies-Bouldin Index for Test Data - Set {i+1} due to insufficient samples or only one cluster.")

############################################################################################################################

# Function to plot data with assigned clusters
def plot_clustered_data(original_data_list, cluster_labels_list, n_clusters, features, featureNames, unitNames, title_prefix=""):
    num_features = len(features)
    fig, axes = plt.subplots(num_features, 1, figsize=(15, 5 * num_features), sharex=True)
    colors = plt.cm.viridis(np.linspace(0, 1, n_clusters))  # Assign a color to each cluster

    for k, df in enumerate(original_data_list):
        # Use the DataFrame index directly as the time axis
        df_index = df.index
        labels = cluster_labels_list[k]
        for i, feature in enumerate(features):
            for cluster_id in range(n_clusters):
                cluster_indices_kmeans = np.where(labels == cluster_id)[0]
                # Ensure indices are within the range of df_index
                if len(cluster_indices_kmeans) > 0 and len(df_index) > cluster_indices_kmeans[-1]:
                    valid_indices = df_index[cluster_indices_kmeans]
                    axes[i].scatter(valid_indices, df[feature].loc[valid_indices], color=colors[cluster_id],
                                    label=f'Cluster {cluster_id}' if k == 0 else "", s=10)
            axes[i].set_ylabel(f'{featureNames[feature]} {unitNames[feature]}')
            axes[i].set_title(f'{title_prefix} - {featureNames[feature]}')
            axes[i].legend(loc='upper right')

    plt.tight_layout()
    plt.show()

# Plot data after clustering (training)
if options.plot_after:
    # To plot training data, map each sequence back to the timestamp of its last observation
    train_original_indices = combined_train_data.index[timesteps - 1:]
    plot_df_train = pd.DataFrame(scaled_train_df.values[timesteps - 1:], index=train_original_indices, columns=features)

    # Trim train_labels to the length of plot_df_train so labels and rows stay aligned
    trimmed_train_labels = train_labels[:len(plot_df_train)]

    plot_clustered_data([plot_df_train], [trimmed_train_labels], n_clusters, features, featureNames, unitNames, title_prefix="Train Data Clusters")

    # Scatter plot of clusters for training data (using the first two features)
    if len(features) >= 2:
        plt.figure(figsize=(10, 8))
        scatter = plt.scatter(scaled_train_df[features[0]][:len(plot_df_train)],
                              scaled_train_df[features[1]][:len(plot_df_train)],
                              c=trimmed_train_labels, cmap='viridis')
        plt.xlabel(f'{featureNames[features[0]]} {unitNames[features[0]]}')
        plt.ylabel(f'{featureNames[features[1]]} {unitNames[features[1]]}')
        plt.title('Clusters of Train Data (Features 1 vs 2)')
        plt.colorbar(scatter, label='Cluster ID')
        plt.show()

# Plot test data with predicted clusters
if options.plot_transition:
    plot_clustered_data(dataTest, cluster_labels_test_list, n_clusters, features, featureNames, unitNames, title_prefix="Test Data Clusters")

    # Scatter plot of clusters for each test set (using the first two features)
    for i, labels_test in enumerate(cluster_labels_test_list):
        if len(features) >= 2 and len(scaled_test_df_list[i]) > timesteps - 1:
            # Keep the label length consistent with the plotted data length
            test_labels_to_plot = labels_test[:len(scaled_test_df_list[i]) - (timesteps - 1)]
            if len(test_labels_to_plot) > 0:
                plt.figure(figsize=(10, 8))
                scatter = plt.scatter(scaled_test_df_list[i][features[0]][timesteps - 1:],
                                      scaled_test_df_list[i][features[1]][timesteps - 1:],
                                      c=test_labels_to_plot, cmap='viridis')
                plt.xlabel(f'{featureNames[features[0]]} {unitNames[features[0]]}')
                plt.ylabel(f'{featureNames[features[1]]} {unitNames[features[1]]}')
                plt.title(f'Clusters of Test Data - Set {i+1} (Features 1 vs 2)')
                plt.colorbar(scatter, label='Cluster ID')
                plt.show()

#####################################################################################################
# Anomaly detection based on distance from cluster center
#####################################################################################################

def calculate_anomaly_scores(data_reshaped, kmeans_model):
    # Distance of each sequence to its assigned cluster centre; larger distances suggest anomalies.
    # predict() is used (rather than kmeans_model.labels_) so the function also works
    # for data the model was not fitted on.
    assigned_clusters = kmeans_model.predict(data_reshaped)
    distances = []
    for i in range(len(data_reshaped)):
        distance = np.linalg.norm(data_reshaped[i] - kmeans_model.cluster_centers_[assigned_clusters[i]])
        distances.append(distance)
    return np.array(distances)

train_anomaly_scores = calculate_anomaly_scores(X_train_reshaped, kmeans)

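# Hedged alternative (an assumption, not used by the plotting helpers below, which
# recompute a mean + 3*std threshold per test set): freeze a single threshold from
# the training scores and reuse it for every test set.
train_anomaly_threshold = np.mean(train_anomaly_scores) + 3 * np.std(train_anomaly_scores)
# e.g.: flagged = test_scores > train_anomaly_threshold
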
def plot_anomalies(original_data_list, anomaly_scores_list, threshold_factor=3):
    num_features = len(features)
    fig, axes = plt.subplots(num_features, 1, figsize=(15, 5 * num_features), sharex=True)
    for k, df in enumerate(original_data_list):
        time_index = df.index[timesteps - 1:]
        anomaly_scores = anomaly_scores_list[k]
        threshold = np.mean(anomaly_scores) + threshold_factor * np.std(anomaly_scores)
        anomalous_indices = np.where(anomaly_scores > threshold)[0]
        for i, feature in enumerate(features):
            # Plot the full original series as the "normal" background
            axes[i].plot(df.index, df[feature], label='Normal', alpha=0.7)

            if len(time_index) == len(anomaly_scores):  # Check alignment before marking anomalies
                # Mark anomalies on the original data time axis
                anomaly_times = time_index[anomalous_indices]
                axes[i].scatter(anomaly_times, df[feature].loc[anomaly_times], color='red',
                                label='Anomaly' if k == 0 else "", s=20, zorder=5)
            else:
                print(f"Warning: Data length mismatch for plotting anomalies in file {k}. Skipping anomaly plot for this file.")

            axes[i].set_ylabel(f'{featureNames[feature]} {unitNames[feature]}')
            axes[i].set_title(f'Potential Anomalies - {featureNames[feature]}')

    # Add a single de-duplicated legend across all subplots
    handles, labels = [], []
    for ax in axes:
        for h, l in zip(*ax.get_legend_handles_labels()):
            if l not in labels:
                handles.append(h)
                labels.append(l)
    if handles:
        axes[0].legend(handles, labels, loc='upper right')

    plt.tight_layout()
    plt.show()


if options.plot_anomalies:
    test_anomaly_scores_list = [calculate_anomaly_scores(X_test_reshaped, kmeans) for X_test_reshaped in X_test_reshaped_list]
    # Need the original data frames for plotting
    plot_anomalies(dataTest, test_anomaly_scores_list)

#####################################################################################################
# Display clusters that may contain anomalies
#####################################################################################################

def plot_anomaly_clusters(original_data_list, cluster_labels_list, anomaly_scores_list, n_clusters, features, featureNames, unitNames, threshold_factor=3):
    num_features = len(features)
    fig, axes = plt.subplots(num_features, 1, figsize=(15, 5 * num_features), sharex=True)
    colors = plt.cm.viridis(np.linspace(0, 1, n_clusters))

    for k, df in enumerate(original_data_list):
        time_index = df.index[timesteps - 1:]
        cluster_labels = cluster_labels_list[k]
        anomaly_scores = anomaly_scores_list[k]
        threshold = np.mean(anomaly_scores) + threshold_factor * np.std(anomaly_scores)
        anomalous_indices = np.where(anomaly_scores > threshold)[0]
        anomalous_clusters = np.unique(cluster_labels[anomalous_indices])

        for i, feature in enumerate(features):
            # Plot all data points colored by their assigned cluster
            for cluster_id in range(n_clusters):
                cluster_indices = np.where(cluster_labels == cluster_id)[0]
                if len(cluster_indices) > 0:
                    # Map sequence positions to timestamps via the aligned time index
                    current_time_index = time_index[cluster_indices]
                    axes[i].scatter(current_time_index, df[feature].loc[current_time_index], color=colors[cluster_id],
                                    label=f'Cluster {cluster_id}' if k == 0 else "", s=10, alpha=0.5)

            # Highlight anomaly clusters: mark ALL points belonging to any cluster that
            # contains at least one anomalous sequence
            for anomaly_cluster in anomalous_clusters:
                cluster_all_indices = np.where(cluster_labels == anomaly_cluster)[0]
                if len(cluster_all_indices) > 0:
                    current_time_index = time_index[cluster_all_indices]
                    axes[i].scatter(current_time_index, df[feature].loc[current_time_index], color='red',
                                    label=f'Anomaly Cluster {anomaly_cluster}' if k == 0 else "", s=30, zorder=5)

            axes[i].set_ylabel(f'{featureNames[feature]} {unitNames[feature]}')
            axes[i].set_title(f'Anomaly Clusters - {featureNames[feature]}')

    # Add a single de-duplicated legend across all subplots
    handles, labels = [], []
    for ax in axes:
        for h, l in zip(*ax.get_legend_handles_labels()):
            if l not in labels:
                handles.append(h)
                labels.append(l)
    if handles:
        axes[0].legend(handles, labels, loc='upper right')

    plt.tight_layout()
    plt.show()


if options.plot_anomaly_clusters:
    test_anomaly_scores_list = [calculate_anomaly_scores(X_test_reshaped, kmeans) for X_test_reshaped in X_test_reshaped_list]
    plot_anomaly_clusters(dataTest, cluster_labels_test_list, test_anomaly_scores_list, n_clusters, features, featureNames, unitNames)

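# Sketch of the manual-label CSV layout the evaluation below assumes (illustrative;
# only the 'is_anomaly' column name and the index_col=0 convention come from the code,
# the timestamps and values are invented):
#
#   datetime,is_anomaly
#   2024-12-14 00:00:00,False
#   2024-12-14 00:05:00,True
#
# The first column is used as the index and matched to the test timestamps with a
# nearest-neighbour reindex.
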
#####################################################################################################
# Evaluation with manual labels (if provided)
#####################################################################################################

if options.manual_anomaly_labels:
    manual_labels_df = pd.read_csv(options.manual_anomaly_labels, index_col=0)
    # Interpret the label index as timestamps so it can be aligned with the test time index
    manual_labels_df.index = pd.to_datetime(manual_labels_df.index)

    # Anomaly scores may not have been computed yet if no plotting flag was given
    test_anomaly_scores_list = [calculate_anomaly_scores(X_test_reshaped, kmeans) for X_test_reshaped in X_test_reshaped_list]

    all_true_labels = []
    all_predicted_anomalies = []

    for k, df in enumerate(dataTest):
        # Ensure anomaly scores were calculated for this test set
        if k >= len(test_anomaly_scores_list):
            print(f"Warning: Anomaly scores not calculated for test set {k}. Skipping manual evaluation for this set.")
            continue

        time_index = df.index[timesteps - 1:]  # Time index aligned with the sequences
        if not time_index.empty:
            # Assumes the manual labels CSV has a boolean 'is_anomaly' column whose index aligns with the test data
            relevant_labels = manual_labels_df.reindex(time_index, method='nearest')['is_anomaly'].fillna(False).astype(int)

            anomaly_scores = test_anomaly_scores_list[k]
            # Ensure lengths match before applying the threshold
            if len(anomaly_scores) == len(relevant_labels):
                threshold = np.mean(anomaly_scores) + 3 * np.std(anomaly_scores)  # Example threshold
                predicted_anomalies = (anomaly_scores > threshold).astype(int)
                all_true_labels.extend(relevant_labels.values)
                all_predicted_anomalies.extend(predicted_anomalies)
            else:
                print(f"Warning: Mismatch in lengths between anomaly scores ({len(anomaly_scores)}) and manual labels ({len(relevant_labels)}) for test set {k}. Skipping evaluation for this set.")

    if all_true_labels:
        print("\nEvaluation using manual anomaly labels:")
        print(classification_report(all_true_labels, all_predicted_anomalies, target_names=['Normal', 'Anomaly'], zero_division=0))
        cm = confusion_matrix(all_true_labels, all_predicted_anomalies)
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Normal', 'Anomaly'], yticklabels=['Normal', 'Anomaly'])
        plt.title('Confusion Matrix (Manual Labels)')
        plt.xlabel('Predicted Label')
        plt.ylabel('True Label')
        plt.show()
    else:
        print("\nNo manual anomaly labels provided or aligned with test data.")