Ako data: Unsupervised/reinforcement learning for anomaly detection and clustering. Unsupervised, trained, and evaluated on labeled anomaly data.

V8_RobustScaler_kmeans_anomaly_with_clear_comment.py 36KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580
  1. # Import necessary libraries
  2. import pandas as pd
  3. import numpy as np
  4. import matplotlib.pyplot as plt
  5. from sklearn.cluster import KMeans
  6. # Import scalers and metrics from scikit-learn
  7. from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder # RobustScaler is used for scaling
  8. from sklearn.metrics import silhouette_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix, classification_report
  9. import argparse # For parsing command line arguments
  10. import os # For path manipulation
  11. import seaborn as sns # For enhanced data visualization (like confusion matrix)
  12. # Command line arguments setup
  13. # Defines the command line interface for the script, allowing users to specify parameters
  14. parser = argparse.ArgumentParser(description='Anomaly detection using K-Means clustering with visualization.')
  15. # --timesteps: Number of data points in each sequence (time window)
  16. parser.add_argument('--timesteps', type=int, default=20, help='Number of timesteps for sequences.')
  17. # --n_clusters: Number of clusters for K-Means. Expected to be number of failure types + normal state.
  18. parser.add_argument('--n_clusters', type=int, default=5, help='Number of clusters for K-Means (should match the number of failure types + normal).')
  19. # --n_init: Number of times K-Means is run with different initial centroids. The best result is chosen.
  20. parser.add_argument('--n_init', type=int, default=10, help='Number of initializations for K-Means.')
  21. # --transition: Flag to use data files that include transition periods for testing.
  22. parser.add_argument('--transition', action='store_true', help='Use transition data for testing.')
  23. # Plotting flags: control which plots are displayed.
  24. parser.add_argument('--plot_raw', action='store_true', help='Plot raw data.')
  25. parser.add_argument('--plot_clustered', action='store_true', help='Plot clustered data.')
  26. parser.add_argument('--plot_anomalies', action='store_true', help='Plot detected anomalies (based on clusters).')
  27. parser.add_argument('--plot_misclassified', action='store_true', help='Plot misclassified instances (based on clusters).')
  28. # Parse the arguments provided by the user
  29. options = parser.parse_args()
  30. # Assign parsed arguments to variables
  31. n_clusters = options.n_clusters
  32. timesteps = options.timesteps
  33. n_init = options.n_init
  34. #####################################################################################################
  35. # Data File Configuration
  36. #####################################################################################################
  37. # Number of distinct failure types we have data for (excluding the normal state)
  38. NumberOfFailures = 4 # So far, we have only data for the first 4 types of failures
  39. # List to hold file paths for training and testing data
  40. # datafiles[0]: training data files, datafiles[1]: testing data files
  41. # Inner lists correspond to different classes/failure types (0: Normal, 1-4: Failure Types)
  42. datafiles = [[], []] # 0 for train, 1 for test
  43. # Initialize inner lists for each class (Normal + NumberOfFailures)
  44. for i in range(NumberOfFailures + 1):
  45. datafiles[0].append([])
  46. datafiles[1].append([])
  47. # Assign specific filenames to each class for the training set
  48. # datafiles[0][0]: Normal training data
  49. # datafiles[0][1]: Failure Type 1 training data
  50. # ... and so on
  51. datafiles[0][0] = ['2024-08-07_5_', '2024-08-08_5_', '2025-01-25_5_', '2025-01-26_5_']
  52. datafiles[0][1] = ['2024-12-11_5_', '2024-12-12_5_', '2024-12-13_5_']
  53. datafiles[0][2] = ['2024-12-18_5_', '2024-12-21_5_', '2024-12-22_5_', '2024-12-23_5_', '2024-12-24_5_']
  54. datafiles[0][3] = ['2024-12-28_5_', '2024-12-29_5_', '2024-12-30_5_']
  55. datafiles[0][4] = ['2025-02-13_5_', '2025-02-14_5_']
  56. # Assign specific filenames for the testing set
  57. # Uses different files based on whether the --transition flag is set
  58. if options.transition:
  59. # Test files including transition data
  60. datafiles[1][0] = ['2025-01-27_5_', '2025-01-28_5_']
  61. datafiles[1][1] = ['2024-12-14_5_', '2024-12-15_5_', '2024-12-16_5_'] # with TRANSITION
  62. datafiles[1][2] = ['2024-12-17_5_', '2024-12-19_5_', '2024-12-25_5_', '2024-12-26_5_'] # with TRANSITION
  63. datafiles[1][3] = ['2024-12-27_5_', '2024-12-31_5_', '2025-01-01_5_'] # with TRANSITION
  64. datafiles[1][4] = ['2025-02-12_5_', '2025-02-15_5_', '2025-02-16_5_']
  65. else:
  66. # Test files without explicit transition data
  67. datafiles[1][0] = ['2025-01-27_5_', '2025-01-28_5_']
  68. datafiles[1][1] = ['2024-12-14_5_', '2024-12-15_5_']
  69. datafiles[1][2] = ['2024-12-19_5_', '2024-12-25_5_', '2024-12-26_5_']
  70. datafiles[1][3] = ['2024-12-31_5_', '2025-01-01_5_']
  71. datafiles[1][4] = ['2025-02-15_5_', '2025-02-16_5_']
  72. # Features (columns) to be used from the data files
  73. features = ['r1 s1', 'r1 s4', 'r1 s5']
  74. # Store the initial count of features before potentially adding derived features
  75. n_original_features = len(features) # Store the original number of features
  76. # Dictionaries to map feature names to display names (e.g., for plots)
  77. featureNames = {}
  78. featureNames['r1 s1'] = r'<span class="math-inline">T\_\{evap\}</span>' # Evaporator Temperature
  79. featureNames['r1 s4'] = r'<span class="math-inline">T\_\{cond\}</span>' # Condenser Temperature
  80. featureNames['r1 s5'] = r'<span class="math-inline">T\_\{air\}</span>' # Air Temperature
  81. # Dictionaries to map feature names to their units (e.g., for plots)
  82. unitNames = {}
  83. unitNames['r1 s1'] = r'($^o$C)'
  84. unitNames['r1 s4'] = r'($^o$C)'
  85. unitNames['r1 s5'] = r'($^o$C)'
  86. # Redundant variable, but kept from original code
  87. NumFeatures = len(features)
  88. #####################################################################################################
  89. # Data Loading and Preprocessing (Training Data)
  90. #####################################################################################################
  91. # List to hold DataFrames for training data, organized by class
  92. dataTrain = []
  93. # Loop through each list of files for each training class
  94. for class_files in datafiles[0]:
  95. class_dfs = [] # List to hold dataframes for current class
  96. # Loop through each filename in the current class
  97. for base_filename in class_files:
  98. # Construct the full file path
  99. script_dir = os.path.dirname(os.path.abspath(__file__)) # Get directory of the current script
  100. data_dir = os.path.join(script_dir, 'data') # Assume data is in a 'data' subdirectory
  101. filepath = os.path.join(data_dir, f'{base_filename}.csv') # Full path to the CSV file
  102. try:
  103. # Read the CSV file into a pandas DataFrame
  104. df = pd.read_csv(filepath)
  105. # Convert 'datetime' column to datetime objects using two possible formats, coercing errors
  106. df['timestamp'] = pd.to_datetime(df['datetime'], format='%m/%d/%Y %H:%M', errors='coerce')
  107. df['timestamp'] = df['timestamp'].fillna(pd.to_datetime(df['datetime'], format='%d-%m-%Y %H:%M:%S', errors='coerce'))
  108. # Convert feature columns to numeric, coercing errors to NaN
  109. for col in features:
  110. df[col] = pd.to_numeric(df[col], errors='coerce')
  111. # Set the timestamp as index, resample to 5-minute frequency, and calculate the mean for features
  112. df = df.set_index('timestamp').resample('5Min')[features].mean() # Resample and calculate mean only for features
  113. # Estimate missing values (NaN) using linear interpolation
  114. df = df[features].interpolate() # Estimate missing values using linear interpolation
  115. # Append the processed DataFrame to the list for the current class
  116. class_dfs.append(df)
  117. except FileNotFoundError:
  118. # Print a warning if a file is not found and skip it
  119. print(f"Warning: File {filepath} not found and skipped.")
  120. # If any files were successfully loaded for this class, concatenate them
  121. if class_dfs:
  122. dataTrain.append(pd.concat(class_dfs))
  123. # Concatenate all class DataFrames into a single DataFrame for training
  124. combined_train_data = pd.concat(dataTrain)
  125. #####################################################################################################
  126. # Data Loading and Preprocessing (Test Data)
  127. #####################################################################################################
  128. # List to hold DataFrames for test data, organized by class
  129. # Each element in dataTest corresponds to a different class (Normal, Failure Type 1, etc.)
  130. dataTest = []
  131. # Loop through each list of files for each test class
  132. for class_files in datafiles[1]:
  133. class_dfs = [] # List to hold dataframes for current class
  134. # Loop through each filename in the current class
  135. for base_filename in class_files:
  136. # Construct the full file path
  137. script_dir = os.path.dirname(os.path.abspath(__file__)) # Get directory of the current script
  138. data_dir = os.path.join(script_dir, 'data') # Assume data is in a 'data' subdirectory
  139. filepath = os.path.join(data_dir, f'{base_filename}.csv') # Full path to the CSV file
  140. try:
  141. # Read the CSV file into a pandas DataFrame
  142. df = pd.read_csv(filepath)
  143. # Convert 'datetime' column to datetime objects using two possible formats, coercing errors
  144. df['timestamp'] = pd.to_datetime(df['datetime'], format='%m/%d/%Y %H:%M', errors='coerce')
  145. df['timestamp'] = df['timestamp'].fillna(pd.to_datetime(df['datetime'], format='%d-%m-%Y %H:%M:%S', errors='coerce'))
  146. # Convert feature columns to numeric, coercing errors to NaN
  147. for col in features:
  148. df[col] = pd.to_numeric(df[col], errors='coerce')
  149. # Set the timestamp as index, resample to 5-minute frequency, and calculate the mean for features
  150. df = df.set_index('timestamp').resample('5Min')[features].mean() # Resample and calculate mean only for features
  151. # Estimate missing values (NaN) using linear interpolation
  152. df = df[features].interpolate() # Estimate missing values using linear interpolation
  153. # Append the processed DataFrame to the list for the current class
  154. class_dfs.append(df)
  155. except FileNotFoundError:
  156. # Print a warning if a file is not found and skip it
  157. print(f"Warning: File {filepath} not found and skipped.")
  158. # If any files were successfully loaded for this class, concatenate them
  159. if class_dfs:
  160. dataTest.append(pd.concat(class_dfs))
  161. #####################################################################################################
  162. # Raw Data Plotting (Optional)
  163. #####################################################################################################
  164. # Plot raw data if the --plot_raw flag is provided
  165. if options.plot_raw:
  166. num_features = len(features)
  167. # Create a figure and a set of subplots (one for each feature)
  168. fig, axes = plt.subplots(num_features, 1, figsize=(15, 5 * num_features), sharex=True)
  169. # Ensure axes is an array even if there's only one feature
  170. if num_features == 1:
  171. axes = [axes]
  172. # Loop through each feature
  173. for i, feature in enumerate(features):
  174. # Loop through each test data DataFrame (each class)
  175. for k, df in enumerate(dataTest):
  176. # Plot the feature data over time for the current class
  177. axes[i].plot(df.index, df[feature], label=f'Class {k}')
  178. # Set ylabel and title for the subplot
  179. axes[i].set_ylabel(f'{featureNames[feature]} {unitNames[feature]}')
  180. axes[i].set_title(featureNames[feature])
  181. # Add legend to the subplot
  182. axes[i].legend()
  183. # Adjust layout to prevent labels overlapping
  184. plt.tight_layout()
  185. # Display the plot
  186. plt.show()
  187. # exit(0) # Uncomment to exit after plotting raw data
  188. ########################################################################################################
  189. # Data Scaling
  190. ########################################################################################################
  191. # Initialize the scaler (RobustScaler is less affected by outliers than StandardScaler)
  192. # StandardScaler() # Original scaler
  193. scaler = RobustScaler() # Changed from StandardScaler
  194. # Fit the scaler on the training data and transform it
  195. # Only the original features are scaled
  196. scaled_train_data = scaler.fit_transform(combined_train_data[features]) # Normalize only the original features
  197. # Transform the test data using the scaler fitted on the training data
  198. # A list comprehension is used to transform each test DataFrame
  199. scaled_test_data_list = [scaler.transform(df[features]) for df in dataTest] # Normalize only the original features
  200. # Convert normalized data back to pandas DataFrames for easier handling (optional but can be useful)
  201. scaled_train_df = pd.DataFrame(scaled_train_data, columns=features, index=combined_train_data.index)
  202. scaled_test_df_list = [pd.DataFrame(data, columns=features, index=df.index) for data, df in zip(scaled_test_data_list, dataTest)]
  203. ############################################################################################################
  204. # Sequence Creation with Rate of Change Feature Engineering
  205. ############################################################################################################
  206. # Function to create time sequences from data and append the rate of change as new features
  207. def create_sequences_with_rate_of_change(data, timesteps, original_features_count): # Parameter name indicates count
  208. sequences = [] # List to store the created sequences
  209. # Iterate through the data to create overlapping sequences
  210. for i in range(len(data) - timesteps + 1):
  211. # Extract a sequence of 'timesteps' length
  212. sequence = data[i:i + timesteps]
  213. # Calculate the difference between consecutive points along the time axis (axis=0)
  214. # This computes the rate of change for each feature across timesteps
  215. rate_of_change = np.diff(sequence[:timesteps], axis=0)
  216. # Pad the rate of change to have the same number of timesteps as the original sequence
  217. # np.diff reduces the number of timesteps by 1, so we add a row of zeros at the beginning
  218. # Use the count of original features for padding dimension
  219. padding = np.zeros((1, original_features_count)) # Corrected: Use the features count
  220. rate_of_change_padded = np.vstack((padding, rate_of_change)) # Stack the padding on top
  221. # Concatenate the original sequence and the padded rate of change sequence horizontally
  222. # Resulting sequence has 'timesteps' rows and '2 * original_features_count' columns
  223. sequences.append(np.hstack((sequence, rate_of_change_padded))) # Concatenate original and rate of change
  224. # Convert the list of sequences into a NumPy array
  225. return np.array(sequences)
  226. # Create time sequences with rate of change for the scaled training data
  227. # The output shape will be (num_training_sequences, timesteps, 2 * n_original_features)
  228. X_train_sequences = create_sequences_with_rate_of_change(scaled_train_df.values, timesteps, n_original_features) # Pass n_original_features
  229. # Create time sequences with rate of change for each scaled test data DataFrame
  230. # X_test_sequences_list will be a list of arrays, one for each test class
  231. X_test_sequences_list = [create_sequences_with_rate_of_change(df.values, timesteps, n_original_features) for df in scaled_test_df_list] # Pass n_original_features
  232. ############################################################################################################
  233. # K-Means Clustering Model
  234. ############################################################################################################
  235. # Reshape the training sequences for K-Means
  236. # K-Means expects a 2D array (samples, features)
  237. # We flatten each sequence (timesteps * total_features) into a single row
  238. n_samples, n_timesteps, n_total_features = X_train_sequences.shape
  239. X_train_reshaped = X_train_sequences.reshape(n_samples, n_timesteps * n_total_features)
  240. # Train the K-Means model
  241. # n_clusters: Number of clusters (expected to be number of classes)
  242. # random_state=42: Ensures reproducibility of initial centroids for n_init runs
  243. # n_init=10: Runs K-Means 10 times with different centroid seeds and picks the best result (lowest inertia)
  244. kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=n_init) # n_init to avoid convergence to local optima
  245. # Fit the K-Means model on the reshaped training data
  246. kmeans.fit(X_train_reshaped)
  247. ############################################################################################################################
  248. # Predict Clusters for Test Data
  249. ############################################################################################################################
  250. # List to store predicted cluster labels for each test data DataFrame
  251. cluster_labels_test_list = []
  252. # List to store reshaped test data (useful for evaluation metrics later)
  253. X_test_reshaped_list = []
  254. # kmeans_models = [] # To store kmeans model for each test set (This variable is declared but not used subsequently)
  255. # Loop through each test data sequence array (each class)
  256. for i, X_test_seq in enumerate(X_test_sequences_list):
  257. # Get dimensions of the current test sequence array
  258. n_samples_test, n_timesteps_test, n_total_features_test = X_test_seq.shape
  259. # Reshape the test sequences for prediction (flatten each sequence)
  260. X_test_reshaped = X_test_seq.reshape(n_samples_test, n_timesteps_test * n_total_features_test)
  261. # Predict cluster labels for the reshaped test data
  262. labels = kmeans.predict(X_test_reshaped)
  263. # Append the predicted labels and reshaped data to the lists
  264. cluster_labels_test_list.append(labels)
  265. X_test_reshaped_list.append(X_test_reshaped) # Append reshaped data
  266. # kmeans_models.append(kmeans) # Store the trained kmeans model (Variable declared but not used)
  267. ############################################################################################################################
  268. # Plotting Clustered Data (Optional)
  269. ############################################################################################################
  270. # Function to plot the original data points colored by their assigned cluster label
  271. # Plots only the original features
  272. def plot_clustered_data(original_data_list, cluster_labels_list, n_clusters, features, featureNames, unitNames):
  273. num_features = len(features)
  274. # Create subplots, one for each original feature
  275. fig, axes = plt.subplots(num_features, 1, figsize=(15, 5 * num_features), sharex=True)
  276. # Ensure axes is an array even if only one feature
  277. if num_features == 1:
  278. axes = [axes]
  279. # Generate a color map for the clusters
  280. colors = plt.cm.viridis(np.linspace(0, 1, n_clusters)) # Assign colors to each cluster
  281. # Loop through each original test data DataFrame (each class)
  282. for k, df in enumerate(original_data_list):
  283. original_indices = df.index # Get the original time index
  284. # Get the time index corresponding to the start of each sequence (shifted by timesteps-1)
  285. time_index = original_indices[timesteps - 1:]
  286. # Loop through each original feature
  287. for i, feature in enumerate(features):
  288. # Loop through each predicted cluster ID
  289. for cluster_id in range(n_clusters):
  290. # Find the indices in the current test data corresponding to the current cluster ID
  291. cluster_indices_kmeans = np.where(cluster_labels_list[k] == cluster_id)[0]
  292. # If there are data points assigned to this cluster
  293. if len(cluster_indices_kmeans) > 0:
  294. # Scatter plot the data points for this cluster
  295. # x-axis: time_index points corresponding to the sequence end
  296. # y-axis: original feature values at those time_index points
  297. # color: color assigned to the cluster
  298. # label: label for the cluster (only show for the first class (k==0) to avoid redundant legends)
  299. # s=10: size of the scatter points
  300. axes[i].scatter(time_index[cluster_indices_kmeans], df[feature].loc[time_index[cluster_indices_kmeans]], color=colors[cluster_id], label=f'Cluster {cluster_id}' if k == 0 else "", s=10)
  301. # Set ylabel and title for the subplot
  302. axes[i].set_ylabel(f'{featureNames[feature]} {unitNames[feature]}')
  303. axes[i].set_title(featureNames[feature])
  304. # Add legend to the last subplot (or each if desired)
  305. axes[num_features - 1].legend(loc='upper right') # Place legend on the last subplot
  306. # Adjust layout and display the plot
  307. plt.tight_layout()
  308. plt.show()
  309. # Call the plotting function if the --plot_clustered flag is provided
  310. if options.plot_clustered:
  311. plot_clustered_data(dataTest, cluster_labels_test_list, n_clusters, features, featureNames, unitNames)
  312. #####################################################################################################
  313. # Evaluation and plotting of anomalies and misclassified instances (based on cluster labels)
  314. #####################################################################################################
  315. # Function to evaluate clustering results and plot anomalies/misclassified instances
  316. def evaluate_and_plot_anomalies(kmeans_model, scaled_test_data_list, n_clusters, original_test_data_list, true_labels_list, features, featureNames, unitNames, plot_anomalies=False, plot_misclassified=False):
  317. # Lists to store collected data and labels across all test classes
  318. all_y_true_categorical = [] # Stores true labels (0, 1, 2, ...) for each sequence
  319. all_predicted_cluster_labels = [] # Stores predicted cluster ID for each sequence
  320. all_original_test_sequences = [] # Stores the original feature values for each sequence (for plotting)
  321. # Lists to store evaluation metrics per test class (before combining)
  322. inertia_values = [] # Inertia values for each class's data predicted by the model
  323. silhouette_scores = [] # Silhouette scores for each class's data predicted by the model
  324. # Loop through each test class data (scaled, original, and true labels)
  325. for i, (scaled_test_df, original_test_df, y_true_categorical) in enumerate(zip(scaled_test_data_list, original_test_data_list, true_labels_list)):
  326. # Create sequences with rate of change for the current scaled test data
  327. X_test_sequences = create_sequences_with_rate_of_change(scaled_test_df.values, timesteps, n_original_features) # Pass n_original_features
  328. # Skip evaluation for this class if no sequences were generated (data too short)
  329. if X_test_sequences.size == 0:
  330. print(f"Warning: No test sequences generated for class {i}. Skipping evaluation for this class.")
  331. continue
  332. # Reshape the sequences for prediction by the trained K-Means model
  333. n_samples_test = X_test_sequences.shape[0]
  334. X_test_reshaped = X_test_sequences.reshape(n_samples_test, -1)
  335. # Predict cluster labels for the current test class data
  336. cluster_labels_predicted = kmeans_model.predict(X_test_reshaped)
  337. # Calculate and store Inertia for the current class's data (based on the overall model)
  338. # This is different from the model's final inertia on training data
  339. inertia_values.append(kmeans_model.inertia_) # Note: This seems to append the total model inertia, not per-class inertia. It might be intended to be calculated differently here. Keeping original code logic.
  340. # Calculate and store Silhouette score if possible (requires >1 unique labels and >0 samples)
  341. if len(np.unique(cluster_labels_predicted)) > 1 and len(cluster_labels_predicted) > 0:
  342. silhouette_scores.append(silhouette_score(X_test_reshaped, cluster_labels_predicted))
  343. else:
  344. silhouette_scores.append(np.nan) # Append NaN if silhouette cannot be calculated
  345. # Get the time indices corresponding to the end of each sequence in the original data
  346. original_indices = original_test_df.index[timesteps - 1:]
  347. # Collect true labels, predicted labels, and original sequences for evaluation/plotting
  348. # Loop through the sequences generated for the current class
  349. for j, label in enumerate(y_true_categorical[timesteps - 1:]): # Iterate over true labels corresponding to sequence ends
  350. all_y_true_categorical.append(label) # Append the true label
  351. all_predicted_cluster_labels.append(cluster_labels_predicted[j]) # Append the predicted cluster label
  352. # Get the start and end index in the original DataFrame for the current sequence
  353. start_index = original_test_df.index.get_loc(original_indices[j]) - (timesteps - 1)
  354. end_index = start_index + timesteps
  355. # Extract and append the original feature values for the current sequence
  356. all_original_test_sequences.append(original_test_df[features].iloc[start_index:end_index].values) # Append
  357. # Convert collected lists to NumPy arrays for easier handling
  358. all_y_true_categorical = np.array(all_y_true_categorical)
  359. all_predicted_cluster_labels = np.array(all_predicted_cluster_labels)
  360. all_original_test_sequences = np.array(all_original_test_sequences)
  361. # Print evaluation metrics (based on collected values across all test classes)
  362. print("\nEvaluation Metrics:")
  363. # Print mean Inertia (likely the final Inertia of the trained model as per the loop)
  364. print(f"Inertia (final): {np.mean(inertia_values):.4f}") # Check if this is the intended calculation
  365. # Print mean Silhouette score across classes (ignoring NaNs)
  366. print(f"Average Silhouette Score (valid cases): {np.nanmean(silhouette_scores):.4f}")
  367. # Analyze clusters and assign a dominant true label to each cluster ID
  368. # This helps in mapping cluster IDs back to meaningful class labels for evaluation
  369. cluster_dominant_label = {} # Dictionary to store the dominant true label for each cluster ID
  370. for cluster_id in range(n_clusters): # Loop through each cluster ID
  371. # Find indices of all sequences assigned to the current cluster ID
  372. indices_in_cluster = np.where(all_predicted_cluster_labels == cluster_id)[0]
  373. # If there are sequences in this cluster
  374. if len(indices_in_cluster) > 0:
  375. # Get the true labels for all sequences in this cluster
  376. labels_in_cluster = all_y_true_categorical[indices_in_cluster]
  377. # If there are labels (and thus samples) in this cluster
  378. if len(labels_in_cluster) > 0:
  379. # Find the most frequent true label (dominant label) in this cluster
  380. dominant_label = np.argmax(np.bincount(labels_in_cluster))
  381. cluster_dominant_label[cluster_id] = dominant_label # Store the dominant label
  382. else:
  383. cluster_dominant_label[cluster_id] = -1 # Assign -1 if no data points have true labels (shouldn't happen if indices_in_cluster > 0 and all_y_true_categorical is aligned)
  384. else:
  385. cluster_dominant_label[cluster_id] = -1 # Assign -1 if the cluster is empty
  386. # Create predicted labels in numeric form based on the dominant true label of the assigned cluster
  387. # This maps the predicted cluster ID for each sequence to the dominant true label of that cluster
  388. predicted_labels_numeric = np.array([cluster_dominant_label.get(cluster_id, -1) for cluster_id in all_predicted_cluster_labels])
  389. # Evaluate the clustering's ability to separate classes using classification metrics
  390. # Only consider instances where a dominant label could be assigned (predicted_labels_numeric != -1)
  391. valid_indices = predicted_labels_numeric != -1 # Indices where a dominant label mapping exists
  392. # Perform evaluation if there are valid instances and more than one true class represented
  393. if np.sum(valid_indices) > 0 and len(np.unique(all_y_true_categorical[valid_indices])) > 1:
  394. print("\nEvaluation Results (Clusters vs True Labels):")
  395. # Print classification report (Precision, Recall, F1-score per class, and overall metrics)
  396. print(classification_report(all_y_true_categorical[valid_indices], predicted_labels_numeric[valid_indices]))
  397. # Compute the confusion matrix
  398. cm = confusion_matrix(all_y_true_categorical[valid_indices], predicted_labels_numeric[valid_indices])
  399. # Plot the confusion matrix using seaborn heatmap
  400. plt.figure(figsize=(8, 6))
  401. sns.heatmap(cm, annot=True, fmt='d', cmap='Blues') # annot=True shows values, fmt='d' formats as integers
  402. plt.xlabel('Predicted Cluster (Dominant True Label)') # Label for x-axis
  403. plt.ylabel('True Label') # Label for y-axis
  404. plt.title('Confusion Matrix (Clusters vs True Labels)') # Title of the plot
  405. plt.show() # Display the plot
  406. else:
  407. print("\nCould not perform detailed evaluation (not enough data or classes).")
  408. #################################################################################################
  409. # Plotting Anomalies (Optional)
  410. #################################################################################################
  411. # Plot detected anomalies if the --plot_anomalies flag is provided
  412. # Anomalies are defined here as instances assigned to clusters whose dominant true label is > 0 (Failure types)
  413. if plot_anomalies:
  414. print("\nChecking anomaly data:")
  415. # Identify clusters that predominantly contain non-normal true labels (failure types)
  416. anomaly_clusters = [cluster_id for cluster_id, label in cluster_dominant_label.items() if label > 0]
  417. # Find indices of all sequences assigned to these "anomaly" clusters
  418. anomaly_indices = np.where(np.isin(all_predicted_cluster_labels, anomaly_clusters))[0]
  419. # If any anomalies are detected
  420. if len(anomaly_indices) > 0:
  421. # Limit the number of anomaly plots to show
  422. num_anomalies_to_plot = min(5, len(anomaly_indices))
  423. colors = ['red', 'green', 'blue'] # Define different colors for features
  424. # Randomly select and plot a few anomaly sequences
  425. for i in np.random.choice(anomaly_indices, num_anomalies_to_plot, replace=False):
  426. # Print shape and sample values for the sequence being plotted
  427. print(f"Shape of all_original_test_sequences[{i}]: {all_original_test_sequences[i].shape}")
  428. print(f"First few values of all_original_test_sequences[{i}]:\n{all_original_test_sequences[i][:5]}")
  429. # Create a new figure for each anomaly plot
  430. plt.figure(figsize=(12, 6))
  431. # Plot each feature in the sequence over time steps
  432. for j, feature in enumerate(features):
  433. # Plot the feature values (y-axis) against time steps (x-axis)
  434. plt.plot(np.arange(timesteps), all_original_test_sequences[i][:, j], label=feature, color=colors[j % len(colors)])
  435. # Get the true label and predicted cluster for the title
  436. true_label = all_y_true_categorical[i]
  437. predicted_cluster_for_title = all_predicted_cluster_labels[i]
  438. # Set the title for the anomaly plot, including true label and predicted cluster
  439. plt.title(f'Detected Anomaly (True: {true_label}, Cluster: {predicted_cluster_for_title})') # Corrected title format
  440. plt.xlabel('Time Step')
  441. plt.ylabel('Value')
  442. plt.legend() # Add legend to identify features
  443. plt.show() # Display the plot
  444. else:
  445. print("No anomalies detected based on cluster dominance.")
  446. #################################################################################################
  447. # Plotting Misclassified Instances (Optional)
  448. #################################################################################################
  449. # Plot misclassified instances if the --plot_misclassified flag is provided
  450. # Misclassified are defined here as instances where the true label is DIFFERENT from the dominant label of the assigned cluster
  451. if plot_misclassified:
  452. print("\nChecking misclassified data:")
  453. # Find indices where the true label does not match the dominant label of the predicted cluster
  454. misclassified_indices = np.where(all_y_true_categorical != predicted_labels_numeric)[0]
  455. # If any misclassified instances are found
  456. if len(misclassified_indices) > 0:
  457. # Limit the number of misclassified plots to show
  458. num_misclassified_to_plot = min(5, len(misclassified_indices))
  459. colors = ['red', 'green', 'blue'] # Define different colors for features
  460. # Randomly select and plot a few misclassified sequences
  461. for i in np.random.choice(misclassified_indices, num_misclassified_to_plot, replace=False):
  462. # Print shape and sample values for the sequence being plotted
  463. print(f"Shape of all_original_test_sequences[{i}]: {all_original_test_sequences[i].shape}")
  464. print(f"First few values of all_original_test_sequences[{i}]:\n{all_original_test_sequences[i][:5]}")
  465. # Create a new figure for each misclassified plot
  466. plt.figure(figsize=(12, 6))
  467. # Plot each feature in the sequence over time steps
  468. for j, feature in enumerate(features):
  469. # Plot the feature values (y-axis) against time steps (x-axis)
  470. plt.plot(np.arange(timesteps), all_original_test_sequences[i][:, j], label=feature, color=colors[j % len(colors)])
  471. # FIXED: Get labels using index i for plot title
  472. true_label = all_y_true_categorical[i] # Get the true label
  473. predicted_label = predicted_labels_numeric[i] # Get the numeric predicted label (dominant cluster label)
  474. # Set the title for the misclassified plot, including true label and predicted cluster's dominant label
  475. plt.title(f'Misclassified Instance (True: {true_label}, Predicted Cluster Dominant Label: {predicted_label})') # Corrected title format
  476. plt.xlabel('Time Step')
  477. plt.ylabel('Value')
  478. plt.legend() # Add legend to identify features
  479. plt.show() # Display the plot
  480. else:
  481. print("No misclassified instances found based on cluster dominance.")
  482. # Return the true and predicted labels for potential further use
  483. return all_y_true_categorical, predicted_labels_numeric
  484. #####################################################################################################
  485. # Main Execution
  486. #####################################################################################################
  487. # Create the list of true labels for the test data
  488. # Assign a numeric label (0, 1, 2, ...) to each sequence based on its original file class
  489. true_labels_list = []
  490. for i, df in enumerate(dataTest): # Loop through each test DataFrame (each class)
  491. # Create a numpy array of the same length as the DataFrame, filled with the class index (i)
  492. true_labels_list.append(np.full(len(df), i))
  493. # Call the evaluation and plotting function with the necessary data and options
  494. y_true_final, y_pred_final = evaluate_and_plot_anomalies(kmeans, scaled_test_df_list, n_clusters, dataTest, true_labels_list, features, featureNames, unitNames, plot_anomalies=options.plot_anomalies, plot_misclassified=options.plot_misclassified)
  495. #####################################################################################################
  496. # Final Evaluation Metrics (on combined test data)
  497. #####################################################################################################
  498. # Calculate and print final Inertia and Silhouette Score for the combined test data
  499. # Check if there's any reshaped test data available
  500. if X_test_reshaped_list:
  501. # Vertically stack all reshaped test data arrays into a single array
  502. X_test_combined_reshaped = np.vstack(X_test_reshaped_list)
  503. # Concatenate all predicted cluster labels into a single array
  504. all_cluster_labels_test = np.concatenate(cluster_labels_test_list)
  505. # Print K-Means evaluation metrics on the combined test data
  506. print("\nK-Means Model Evaluation on Combined Test Data:")
  507. # Print the final Inertia of the trained K-Means model
  508. print(f"Inertia: {kmeans.inertia_:.4f}")
  509. # Calculate and print Silhouette Score if possible
  510. # Requires more than one unique predicted label and at least one sample
  511. if len(np.unique(all_cluster_labels_test)) > 1 and len(all_cluster_labels_test) > 0:
  512. silhouette = silhouette_score(X_test_combined_reshaped, all_cluster_labels_test)
  513. print(f"Silhouette Score: {silhouette:.4f}")
  514. else:
  515. print("Silhouette Score: Not applicable for single cluster.")
  516. else:
  517. # Print a message if no test data sequences were available for evaluation
  518. print("\nNo test data sequences available to evaluate Inertia and Silhouette Score.")

Powered by TurnKey Linux.