Ako data: Unsupervised/reinforcement learning for anomaly detection and clustering. Unsupervised, trained, and evaluated on labeled anomaly data.

V8_kmeans_anomaly_with_clear_comment.py 47KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646
  1. # Import necessary libraries for data manipulation, numerical operations, plotting, and machine learning
  2. import pandas as pd # For data manipulation and analysis (DataFrames)
  3. import numpy as np # For numerical operations, especially array handling
  4. import matplotlib.pyplot as plt # For creating static, interactive, and animated visualizations
  5. from sklearn.cluster import KMeans # K-Means clustering algorithm
  6. # Import preprocessing tools and metrics from scikit-learn
  7. from sklearn.preprocessing import StandardScaler, LabelEncoder # StandardScaler is used for scaling (RobustScaler was used in another version)
  8. from sklearn.metrics import silhouette_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix, classification_report # Evaluation metrics
  9. import argparse # For parsing command line arguments
  10. import os # For interacting with the operating system, like manipulating file paths
  11. import seaborn as sns # For drawing attractive statistical graphics (used for confusion matrix heatmap)
  12. # --- Command line arguments setup ---
  13. # This section defines and parses command-line arguments to control script behavior
  14. parser = argparse.ArgumentParser(description='Anomaly detection using K-Means clustering with visualization.')
  15. # Define arguments with their expected data type, default value, and a help message
  16. parser.add_argument('--timesteps', type=int, default=30, help='Number of timesteps for sequences.') # Length of time window for each sequence
  17. parser.add_argument('--n_clusters', type=int, default=5, help='Number of clusters for K-Means (should match the number of failure types + normal).') # Number of clusters
  18. parser.add_argument('--n_init', type=int, default=10, help='Number of initializations for K-Means.') # Number of times K-Means algorithm will be run with different centroid seeds
  19. parser.add_argument('--transition', action='store_true', help='Use transition data for testing.') # Flag: if present, use transition test data files
  20. # Plotting flags: action='store_true' means the variable becomes True if the flag is present
  21. parser.add_argument('--plot_raw', action='store_true', help='Plot raw data.')
  22. parser.add_argument('--plot_clustered', action='store_true', help='Plot clustered data.')
  23. parser.add_argument('--plot_anomalies', action='store_true', help='Plot detected anomalies (based on clusters).')
  24. parser.add_argument('--plot_misclassified', action='store_true', help='Plot misclassified instances (based on clusters).')
  25. # Parse the arguments provided by the user when running the script
  26. options = parser.parse_args()
  27. # Assign the parsed arguments to variables for easier access throughout the script
  28. n_clusters = options.n_clusters
  29. timesteps = options.timesteps
  30. n_init = options.n_init
  31. #####################################################################################################
  32. # --- Data File Configuration ---
  33. # This section defines the list of data files to be used for training and testing
  34. #####################################################################################################
  35. # Specify the number of failure types data is available for (excluding the normal state, which is class 0)
  36. NumberOfFailures = 4 # So far, we have only data for the first 4 types of failures
  37. # Initialize a nested list to store filenames for training and testing data
  38. # datafiles[0] will store training files, datafiles[1] will store testing files
  39. # Each inner list datafiles[train/test][i] corresponds to class i (0 for Normal, 1 to NumberOfFailures for failure types)
  40. datafiles = [[], []] # datafiles[0] for train data filenames, datafiles[1] for test data filenames
  41. # Populate the inner lists. We need NumberOfFailures + 1 inner lists for classes (0 to 4).
  42. for i in range(NumberOfFailures + 1):
  43. datafiles[0].append([]) # Add an empty list for the current class's training files
  44. datafiles[1].append([]) # Add an empty list for the current class's testing files
  45. # Manually assign the base filenames for each class for the training set
  46. # These filenames are expected to be in a 'data' subdirectory relative to the script
  47. datafiles[0][0] = ['2024-08-07_5_', '2024-08-08_5_', '2025-01-25_5_', '2025-01-26_5_'] # Normal training data files (Class 0)
  48. datafiles[0][1] = ['2024-12-11_5_', '2024-12-12_5_', '2024-12-13_5_'] # Failure Type 1 training data files (Class 1)
  49. datafiles[0][2] = ['2024-12-18_5_', '2024-12-21_5_', '2024-12-22_5_', '2024-12-23_5_', '2024-12-24_5_'] # Failure Type 2 training data files (Class 2)
  50. datafiles[0][3] = ['2024-12-28_5_', '2024-12-29_5_', '2024-12-30_5_'] # Failure Type 3 training data files (Class 3)
  51. datafiles[0][4] = ['2025-02-13_5_', '2025-02-14_5_'] # Failure Type 4 training data files (Class 4)
  52. # Assign base filenames for the testing set based on the --transition flag
  53. if options.transition:
  54. # Use test files that are specified as including transition periods
  55. datafiles[1][0] = ['2025-01-27_5_', '2025-01-28_5_'] # Normal test data files (Class 0)
  56. datafiles[1][1] = ['2024-12-14_5_', '2024-12-15_5_', '2024-12-16_5_'] # Failure Type 1 test data files (with TRANSITION) (Class 1)
  57. datafiles[1][2] = ['2024-12-17_5_', '2024-12-19_5_', '2024-12-25_5_', '2024-12-26_5_'] # Failure Type 2 test data files (with TRANSITION) (Class 2)
  58. datafiles[1][3] = ['2024-12-27_5_', '2024-12-31_5_', '2025-01-01_5_'] # Failure Type 3 test data files (with TRANSITION) (Class 3)
  59. datafiles[1][4] = ['2025-02-12_5_', '2025-02-15_5_', '2025-02-16_5_'] # Failure Type 4 test data files (Class 4)
  60. else:
  61. # Use test files that are specified as not including transition periods
  62. datafiles[1][0] = ['2025-01-27_5_', '2025-01-28_5_'] # Normal test data files (Class 0)
  63. datafiles[1][1] = ['2024-12-14_5_', '2024-12-15_5_'] # Failure Type 1 test data files (Class 1)
  64. datafiles[1][2] = ['2024-12-19_5_', '2024-12-25_5_', '2024-12-26_5_'] # Failure Type 2 test data files (Class 2)
  65. datafiles[1][3] = ['2024-12-31_5_', '2025-01-01_5_'] # Failure Type 3 test data files (Class 3)
  66. datafiles[1][4] = ['2025-02-15_5_', '2025-02-16_5_'] # Failure Type 4 test data files (Class 4)
  67. # --- Feature Definition ---
  68. # Define the list of features (column names) to be extracted from the data files
  69. features = ['r1 s1', 'r1 s4', 'r1 s5']
  70. # Store the original number of features. This is needed later for sequence creation.
  71. n_original_features = len(features) # Store the original number of features
  72. # Dictionaries to store display names (using LaTeX-like format) and units for features, mainly used for plot labels
  73. featureNames = {}
  74. featureNames['r1 s1'] = r'<span class="math-inline">T\_\{evap\}</span>' # Evaporator Temperature
  75. featureNames['r1 s4'] = r'<span class="math-inline">T\_\{cond\}</span>' # Condenser Temperature
  76. featureNames['r1 s5'] = r'<span class="math-inline">T\_\{air\}</span>' # Air Temperature
  77. unitNames = {}
  78. unitNames['r1 s1'] = r'($^o$C)' # Degrees Celsius unit
  79. unitNames['r1 s4'] = r'($^o$C)'
  80. unitNames['r1 s5'] = r'($^o$C)'
  81. # Redundant variable, but kept from original code
  82. NumFeatures = len(features)
  83. #####################################################################################################
  84. # --- Data Loading and Preprocessing (Training Data) ---
  85. # This section loads, cleans, and preprocesses the training data
  86. #####################################################################################################
  87. # List to hold processed DataFrames for training data, organized by class
  88. # Each element dataTrain[i] is a DataFrame containing concatenated data for class i
  89. dataTrain = []
  90. # Loop through each list of filenames for each training class (Class 0, 1, 2, 3, 4)
  91. for class_files in datafiles[0]:
  92. class_dfs = [] # Temporary list to hold DataFrames loaded for the current class
  93. # Loop through each base filename in the current class's list
  94. for base_filename in class_files:
  95. # Construct the full absolute file path assuming data is in a 'data' subdirectory
  96. script_dir = os.path.dirname(os.path.abspath(__file__)) # Get the directory where the current script is located
  97. data_dir = os.path.join(script_dir, 'data') # Define the path to the 'data' subdirectory
  98. filepath = os.path.join(data_dir, f'{base_filename}.csv') # Combine directory and filename
  99. try:
  100. # Read the CSV file into a pandas DataFrame
  101. df = pd.read_csv(filepath)
  102. # Convert the 'datetime' column to datetime objects. Try two formats and coerce errors (set invalid parsing to NaT).
  103. df['timestamp'] = pd.to_datetime(df['datetime'], format='%m/%d/%Y %H:%M', errors='coerce')
  104. # Fill any NaT values from the first attempt by trying a second format.
  105. df['timestamp'] = df['timestamp'].fillna(pd.to_datetime(df['datetime'], format='%d-%m-%Y %H:%M:%S', errors='coerce'))
  106. # Convert the specified feature columns to numeric type. Coerce errors (set invalid parsing to NaN).
  107. for col in features:
  108. df[col] = pd.to_numeric(df[col], errors='coerce')
  109. # Set the 'timestamp' column as the DataFrame index
  110. # Resample the data to a 5-minute frequency ('5Min').
  111. # Select only the specified 'features' columns.
  112. # Calculate the mean for each 5-minute interval.
  113. df = df.set_index('timestamp').resample('5Min')[features].mean() # Set index, resample, and calculate mean for features
  114. # Estimate any remaining missing values (NaN) within the feature columns using linear interpolation
  115. df = df[features].interpolate() # Estimate missing values using linear interpolation
  116. # Append the processed DataFrame to the list for the current class
  117. class_dfs.append(df)
  118. except FileNotFoundError:
  119. # If a file is not found, print a warning and skip processing it
  120. print(f"Warning: File {filepath} not found and skipped.")
  121. # If the list of DataFrames for the current class is not empty, concatenate them into a single DataFrame
  122. if class_dfs:
  123. dataTrain.append(pd.concat(class_dfs))
  124. # Concatenate all DataFrames from all training classes into a single large DataFrame for training the scaler and the model
  125. combined_train_data = pd.concat(dataTrain)
  126. #####################################################################################################
  127. # --- Data Loading and Preprocessing (Test Data) ---
  128. # This section loads, cleans, and preprocesses the testing data
  129. # The process is identical to the training data loading, but done separately for test files
  130. #####################################################################################################
  131. # List to hold processed DataFrames for test data, organized by class
  132. # Each element dataTest[i] is a DataFrame containing concatenated data for class i
  133. dataTest = []
  134. # Loop through each list of filenames for each test class (Class 0, 1, 2, 3, 4)
  135. for class_files in datafiles[1]:
  136. class_dfs = [] # Temporary list to hold DataFrames loaded for the current class
  137. # Loop through each base filename in the current class's list
  138. for base_filename in class_files:
  139. # Construct the full absolute file path
  140. script_dir = os.path.dirname(os.path.abspath(__file__))
  141. data_dir = os.path.join(script_dir, 'data')
  142. filepath = os.path.join(data_dir, f'{base_filename}.csv')
  143. try:
  144. # Read the CSV file into a pandas DataFrame
  145. df = pd.read_csv(filepath)
  146. # Convert the 'datetime' column to datetime objects
  147. df['timestamp'] = pd.to_datetime(df['datetime'], format='%m/%d/%Y %H:%M', errors='coerce')
  148. df['timestamp'] = df['timestamp'].fillna(pd.to_datetime(df['datetime'], format='%d-%m-%Y %H:%M:%S', errors='coerce'))
  149. # Convert the specified feature columns to numeric type
  150. for col in features:
  151. df[col] = pd.to_numeric(df[col], errors='coerce')
  152. # Set the 'timestamp' column as the index and resample to 5-minute frequency, calculating the mean
  153. df = df.set_index('timestamp').resample('5Min')[features].mean() # Set index, resample, and calculate mean for features
  154. # Estimate any remaining missing values (NaN) using linear interpolation
  155. df = df[features].interpolate() # Estimate missing values using linear interpolation
  156. # Append the processed DataFrame to the list for the current class
  157. class_dfs.append(df)
  158. except FileNotFoundError:
  159. # If a file is not found, print a warning and skip processing it
  160. print(f"Warning: File {filepath} not found and skipped.")
  161. # If the list of DataFrames for the current class is not empty, concatenate them into a single DataFrame
  162. if class_dfs:
  163. dataTest.append(pd.concat(class_dfs))
  164. #####################################################################################################
  165. # --- Raw Data Plotting (Optional) ---
  166. # This section plots the unprocessed data for visualization if the --plot_raw flag is set
  167. #####################################################################################################
  168. # Check if the plot_raw argument was set to True
  169. if options.plot_raw:
  170. num_features = len(features) # Get the number of features to determine plot layout
  171. # Create a figure and a grid of subplots. Share the x-axis among all subplots.
  172. fig, axes = plt.subplots(num_features, 1, figsize=(15, 5 * num_features), sharex=True)
  173. # Ensure 'axes' is always an array, even if there's only one feature (and thus only one subplot)
  174. if num_features == 1:
  175. axes = [axes]
  176. # Loop through each feature by index (i) and name (feature)
  177. for i, feature in enumerate(features):
  178. # Loop through each test data DataFrame (representing a different class)
  179. for k, df in enumerate(dataTest):
  180. # Plot the data for the current feature and class over time
  181. # Use f-string to include the class number in the label
  182. axes[i].plot(df.index, df[feature], label=f'Class {k}')
  183. # Set the label for the y-axis using the feature's display name and unit
  184. axes[i].set_ylabel(f'{featureNames[feature]} {unitNames[feature]}')
  185. # Set the title for the subplot using the feature's display name
  186. axes[i].set_title(featureNames[feature])
  187. # Add a legend to identify the classes being plotted in each subplot
  188. # Placing it on the last subplot is common to avoid repetition
  189. axes[num_features - 1].legend(loc='upper right') # Place legend on the last subplot
  190. # Adjust layout to prevent plot elements (like labels) from overlapping
  191. plt.tight_layout()
  192. # Display the plot window
  193. plt.show()
  194. # exit(0) # Uncomment this line if you want the script to stop after showing raw data plots
  195. ########################################################################################################
  196. # --- Data Scaling ---
  197. # This section scales the data using StandardScaler
  198. ########################################################################################################
  199. # Initialize the StandardScaler. This scaler standardizes features by removing the mean and scaling to unit variance.
  200. scaler = StandardScaler() # Using StandardScaler in this version
  201. # Fit the scaler on the training data and then transform the training data.
  202. # The scaler learns the mean and standard deviation from the combined_train_data[features].
  203. scaled_train_data = scaler.fit_transform(combined_train_data[features]) # Fit on training, transform training
  204. # Transform each test data DataFrame using the scaler fitted on the training data.
  205. # The same scaling parameters (mean, standard deviation) from the training data are applied to the test data.
  206. # A list comprehension efficiently applies the transformation to each DataFrame in dataTest.
  207. scaled_test_data_list = []
  208. for df in dataTest: # Loop through each test DataFrame
  209. scaled_test_data_list.append(scaler.transform(df[features])) # Transform each test DataFrame
  210. # Convert the scaled NumPy arrays back into pandas DataFrames.
  211. # This step is optional but can be helpful for inspection and keeping track of timestamps/column names.
  212. scaled_train_df = pd.DataFrame(scaled_train_data, columns=features, index=combined_train_data.index)
  213. # Create a list of scaled test DataFrames, maintaining original indices and column names.
  214. scaled_test_df_list = [pd.DataFrame(data, columns=features, index=df.index) for data, df in zip(scaled_test_data_list, dataTest)]
  215. ############################################################################################################
  216. # --- Sequence Creation with Rate of Change Feature Engineering ---
  217. # This section defines a function to create time sequences and adds rate of change features
  218. ############################################################################################################
  219. # Function to create time sequences (windows) from data and append rate of change features
  220. # 'data': Input NumPy array (scaled feature values)
  221. # 'timesteps': The length of each sequence (number of time points)
  222. # 'original_features': The number of features in the input data (used for padding) - NOTE: Parameter name is potentially misleading, should be count
  223. def create_sequences_with_rate_of_change(data, timesteps, original_features): # NOTE: original_features parameter likely intended as original_features_count
  224. sequences = [] # List to store the generated sequences
  225. # Iterate through the data to create overlapping sequences.
  226. # The loop runs until the last possible start index for a sequence of length 'timesteps'.
  227. for i in range(len(data) - timesteps + 1):
  228. # Extract a sequence (slice) of 'timesteps' length starting from index 'i'.
  229. sequence = data[i:i + timesteps]
  230. # Calculate the difference between consecutive time points within the sequence.
  231. # np.diff with axis=0 calculates the difference along the rows (time).
  232. # This results in an array with shape (timesteps - 1, number_of_features).
  233. rate_of_change = np.diff(sequence[:timesteps], axis=0)
  234. # Pad the rate of change array to match the original sequence's length ('timesteps').
  235. # np.diff reduces the dimension by 1, so we add a row of zeros at the beginning.
  236. # The padding shape should be (1 row, number of columns equal to original features count).
  237. padding = np.zeros((1, original_features)) # NOTE: This line caused a TypeError in previous debugging if original_features was not the count. It should likely use n_original_features or a correctly passed count.
  238. # Vertically stack the padding row on top of the rate of change array.
  239. # The result has shape (timesteps, number_of_features).
  240. rate_of_change_padded = np.vstack((padding, rate_of_change)) # Stack padding on top of diff
  241. # Horizontally stack the original sequence and the padded rate of change array.
  242. # The resulting combined sequence has shape (timesteps, 2 * number_of_features).
  243. sequences.append(np.hstack((sequence, rate_of_change_padded))) # Concatenate original sequence and padded rate of change
  244. # Convert the list of 3D sequences into a single 3D NumPy array.
  245. return np.array(sequences)
  246. # Create time sequences with rate of change for the scaled training data.
  247. # The output is a 3D array: (number of sequences, timesteps, 2 * n_original_features).
  248. X_train_sequences = create_sequences_with_rate_of_change(scaled_train_df.values, timesteps, n_original_features) # Pass n_original_features (correctly passes count)
  249. # Create time sequences with rate of change for each scaled test data DataFrame.
  250. # This results in a list where each element is a 3D array of sequences for a specific test class.
  251. X_test_sequences_list = [create_sequences_with_rate_of_change(df.values, timesteps, n_original_features) for df in scaled_test_df_list] # Pass n_original_features (correctly passes count)
  252. ############################################################################################################
  253. # --- K-Means Clustering Model ---
  254. # This section initializes, trains, and applies the K-Means model
  255. ############################################################################################################
  256. # Reshape the training sequences into a 2D array for the K-Means algorithm.
  257. # K-Means expects data in the shape (number of samples, number of features).
  258. # Each sequence (timesteps * total_features) is flattened into a single row for clustering.
  259. n_samples, n_timesteps, n_total_features = X_train_sequences.shape # Get dimensions of the sequence array
  260. X_train_reshaped = X_train_sequences.reshape(n_samples, n_timesteps * n_total_features) # Flatten each sequence
  261. # Initialize the KMeans model.
  262. # n_clusters: The desired number of clusters (set by command line argument).
  263. # random_state=42: Sets the seed for random number generation for initial centroids, ensuring reproducibility.
  264. # n_init=10: Runs the K-Means algorithm 10 times with different centroid initializations and selects the best result based on inertia.
  265. kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=n_init) # Initialize KMeans model
  266. # Train (fit) the KMeans model on the reshaped training data.
  267. kmeans.fit(X_train_reshaped)
  268. ############################################################################################################################
  269. # --- Predict Clusters for Test Data ---
  270. # This section applies the trained K-Means model to the test data to get cluster assignments
  271. ############################################################################################################################
  272. # List to store the predicted cluster labels for each test data class.
  273. cluster_labels_test_list = []
  274. # List to store the reshaped test data arrays (flattened sequences), needed for later evaluation metrics.
  275. X_test_reshaped_list = []
  276. kmeans_models = [] # This variable was declared but not used in the original code (To store kmeans model for each test set)
  277. # Loop through each test data sequence array (one for each test class).
  278. for i, X_test_seq in enumerate(X_test_sequences_list):
  279. # Get the dimensions of the current test sequence array.
  280. n_samples_test, n_timesteps_test, n_total_features_test = X_test_seq.shape
  281. # Reshape the test sequences for prediction, flattening each sequence into a single data point for KMeans.
  282. X_test_reshaped = X_test_seq.reshape(n_samples_test, n_timesteps_test * n_total_features_test)
  283. # Use the trained K-Means model to predict the cluster label for each reshaped test sequence.
  284. labels = kmeans.predict(X_test_reshaped)
  285. # Append the predicted labels for the current test class to the list.
  286. cluster_labels_test_list.append(labels)
  287. # Append the reshaped test data for the current class to the list (needed for Silhouette score calculation later).
  288. X_test_reshaped_list.append(X_test_reshaped) # Append reshaped data
  289. # kmeans_models.append(kmeans) # Append the trained kmeans model (Variable not used)
  290. ############################################################################################################################
  291. # --- Plotting Clustered Data (Optional) ---
  292. # This function plots the original data points, colored according to their predicted cluster
  293. ############################################################################################################
  294. # Function to plot the original feature data, with points colored based on their assigned cluster ID.
  295. # 'original_data_list': List of original (unscaled) test DataFrames, one per class.
  296. # 'cluster_labels_list': List of predicted cluster label arrays, one for each corresponding test DataFrame.
  297. # 'n_clusters': Total number of clusters used.
  298. # 'features', 'featureNames', 'unitNames': Dictionaries for plotting labels.
  299. def plot_clustered_data(original_data_list, cluster_labels_list, n_clusters, features, featureNames, unitNames):
  300. num_features = len(features) # Number of features to plot (determines number of subplots)
  301. # Create a figure and a set of subplots (one row, num_features columns). Share the x-axis.
  302. fig, axes = plt.subplots(num_features, 1, figsize=(15, 5 * num_features), sharex=True)
  303. # Ensure 'axes' is always an array, even if there's only one feature (and thus only one subplot)
  304. if num_features == 1:
  305. axes = [axes]
  306. # Generate a color map to assign distinct colors to each cluster ID
  307. colors = plt.cm.viridis(np.linspace(0, 1, n_clusters)) # Assign colors to each cluster
  308. # Loop through each original test data DataFrame and its corresponding cluster labels
  309. for k, df in enumerate(original_data_list): # k is the index of the test class/DataFrame
  310. original_indices = df.index # Get the time index from the original DataFrame
  311. # The cluster labels correspond to sequences, which start 'timesteps' points later than the raw data.
  312. # Get the time index corresponding to the end of each sequence for plotting.
  313. time_index = original_indices[timesteps - 1:]
  314. # Loop through each original feature to plot it
  315. for i, feature in enumerate(features): # i is the index of the feature
  316. # Loop through each possible cluster ID
  317. for cluster_id in range(n_clusters):
  318. # Find the indices within the current test data corresponding to the current cluster ID
  319. cluster_indices_kmeans = np.where(cluster_labels_list[k] == cluster_id)[0]
  320. # If there are any data points assigned to this cluster
  321. if len(cluster_indices_kmeans) > 0:
  322. # Plot the original feature data points for this specific cluster ID.
  323. # x-axis: time_index points corresponding to the sequence end
  324. # y-axis: original feature values at those time_index points
  325. # color: color assigned to the cluster
  326. # label: label for the cluster (only show for the first class (k==0) to avoid redundant legends)
  327. # s=10: size of the scatter markers
  328. axes[i].scatter(time_index[cluster_indices_kmeans], df[feature].loc[time_index[cluster_indices_kmeans]], color=colors[cluster_id], label=f'Cluster {cluster_id}' if k == 0 else "", s=10)
  329. # Set the y-axis label and title for the current feature's subplot
  330. axes[i].set_ylabel(f'{featureNames[feature]} {unitNames[feature]}')
  331. axes[i].set_title(featureNames[feature])
  332. # Add a legend to the plot. Place it on the last subplot.
  333. axes[num_features - 1].legend(loc='upper right') # Place legend on the last subplot
  334. # Adjust layout to prevent plot elements (like labels) from overlapping
  335. plt.tight_layout()
  336. # Display the plot window
  337. plt.show()
  338. # Call the plotting function if the --plot_clustered command line flag is provided
  339. if options.plot_clustered:
  340. plot_clustered_data(dataTest, cluster_labels_test_list, n_clusters, features, featureNames, unitNames)
  341. #####################################################################################################
  342. # --- Evaluation and plotting of anomalies and misclassified instances (based on cluster labels) ---
  343. # This section evaluates clustering performance using classification metrics and plots specific instances
  344. #####################################################################################################
  345. # Function to evaluate clustering results and plot anomalies/misclassified instances
  346. # 'kmeans_model': The trained K-Means model.
  347. # 'scaled_test_data_list': List of scaled test data DataFrames.
  348. # 'n_clusters': Number of clusters.
  349. # 'original_test_data_list': List of original (unscaled) test data DataFrames.
  350. # 'true_labels_list': List of arrays containing true class labels for test data.
  351. # 'features', 'featureNames', 'unitNames': Feature information for plotting.
  352. # 'plot_anomalies', 'plot_misclassified': Boolean flags from command line arguments.
  353. def evaluate_and_plot_anomalies(kmeans_model, scaled_test_data_list, n_clusters, original_test_data_list, true_labels_list, features, featureNames, unitNames, plot_anomalies=False, plot_misclassified=False):
  354. # Lists to accumulate true labels, predicted cluster labels, and original sequences for ALL test data
  355. all_y_true_categorical = [] # Stores the true class label (0, 1, etc.) for each sequence across all test data
  356. all_predicted_cluster_labels = [] # Stores the predicted cluster ID for each sequence across all test data
  357. all_original_test_sequences = [] # Stores the original feature values for each sequence (window) across all test data, used for plotting
  358. # Lists to store evaluation metrics calculated per test class DataFrame
  359. inertia_values = [] # Inertia score when the model predicts on each class's data (Note: this likely appends the model's overall inertia repeatedly)
  360. silhouette_scores = [] # Silhouette score calculated for each class's data based on predicted clusters
  361. # Loop through each test data class (scaled data, original data, and true labels)
  362. for i, (scaled_test_df, original_test_df, y_true_categorical) in enumerate(zip(scaled_test_data_list, original_test_data_list, true_labels_list)): # i is the class index
  363. # Create sequences with rate of change for the current scaled test data DataFrame
  364. X_test_sequences = create_sequences_with_rate_of_change(scaled_test_df.values, timesteps, n_original_features) # Create sequences for the current class
  365. # If no sequences could be generated for this class (e.g., data too short), print warning and skip
  366. if X_test_sequences.size == 0:
  367. print(f"Warning: No test sequences generated for class {i}. Skipping evaluation for this class.")
  368. continue # Skip to the next iteration (next class)
  369. # Get the number of sequences generated for the current class
  370. n_samples_test = X_test_sequences.shape[0]
  371. # Reshape these sequences for prediction by the K-Means model (flatten each sequence)
  372. X_test_reshaped = X_test_sequences.reshape(n_samples_test, -1)
  373. # Predict the cluster label for each reshaped sequence using the trained model
  374. cluster_labels_predicted = kmeans_model.predict(X_test_reshaped)
  375. # Append the model's inertia (this seems like a repeated value of the final training inertia)
  376. inertia_values.append(kmeans_model.inertia_) # Check if this is the intended calculation for per-class evaluation
  377. # Calculate the Silhouette score for the current class's data based on its predicted clusters
  378. # Only calculate if there's more than one unique predicted label and at least one sample
  379. if len(np.unique(cluster_labels_predicted)) > 1 and len(cluster_labels_predicted) > 0:
  380. silhouette_scores.append(silhouette_score(X_test_reshaped, cluster_labels_predicted))
  381. else:
  382. silhouette_scores.append(np.nan) # Append NaN if silhouette cannot be calculated
  383. # Get the original time indices corresponding to the *end* of each sequence for this class
  384. original_indices = original_test_df.index[timesteps - 1:]
  385. # Collect true labels, predicted cluster labels, and the actual original sequences for this class
  386. # Loop through the generated sequences (represented by index j) and their corresponding true labels
  387. for j, label in enumerate(y_true_categorical[timesteps - 1:]): # Iterate over true labels aligned with sequence ends
  388. all_y_true_categorical.append(label) # Add the true label
  389. all_predicted_cluster_labels.append(cluster_labels_predicted[j]) # Add the predicted cluster label
  390. # Determine the start and end indices in the original DataFrame for the current sequence (j)
  391. # The sequence at index j in the sequences array corresponds to data starting at index 'start_index' in the original DataFrame
  392. start_index = original_test_df.index.get_loc(original_indices[j]) - (timesteps - 1)
  393. end_index = start_index + timesteps
  394. # Extract and add the original (unscaled) feature values for this specific sequence
  395. all_original_test_sequences.append(original_test_df[features].iloc[start_index:end_index].values) # Append original sequence data
  396. # Convert the accumulated lists across all test classes into NumPy arrays
  397. all_y_true_categorical = np.array(all_y_true_categorical) # Array of true labels for all sequences
  398. all_predicted_cluster_labels = np.array(all_predicted_cluster_labels) # Array of predicted cluster IDs for all sequences
  399. all_original_test_sequences = np.array(all_original_test_sequences) # 3D array of original sequence data for all sequences
  400. # Print overall evaluation metrics based on the accumulated data
  401. print("\nEvaluation Metrics:")
  402. # Print the mean of the recorded inertia values (Note: Check if this average of the model's final inertia is the desired metric here)
  403. print(f"Inertia (final): {np.mean(inertia_values):.4f}")
  404. # Print the mean of the calculated silhouette scores per class (ignoring NaNs)
  405. print(f"Average Silhouette Score (valid cases): {np.nanmean(silhouette_scores):.4f}")
  406. # --- Cluster Analysis: Map Cluster IDs to Dominant True Labels ---
  407. # This maps each cluster ID to the true class label that appears most frequently among the sequences assigned to that cluster.
  408. cluster_dominant_label = {} # Dictionary: {cluster_id: dominant_true_label}
  409. for cluster_id in range(n_clusters): # Loop through each possible cluster ID
  410. # Find the indices of all sequences that were predicted to belong to the current cluster_id
  411. indices_in_cluster = np.where(all_predicted_cluster_labels == cluster_id)[0]
  412. # If the cluster is not empty (contains sequences)
  413. if len(indices_in_cluster) > 0:
  414. # Get the true labels of all sequences that fall into this cluster
  415. labels_in_cluster = all_y_true_categorical[indices_in_cluster]
  416. # If there are actual labels in this subset (should be true if indices_in_cluster > 0)
  417. if len(labels_in_cluster) > 0:
  418. # Count the occurrences of each true label in this cluster and find the index of the most frequent one
  419. dominant_label = np.argmax(np.bincount(labels_in_cluster))
  420. cluster_dominant_label[cluster_id] = dominant_label # Assign the dominant true label to the cluster ID
  421. else:
  422. cluster_dominant_label[cluster_id] = -1 # If for some reason no labels were found, mark as -1
  423. else:
  424. cluster_dominant_label[cluster_id] = -1 # If the cluster is empty, mark as -1
  425. # --- Generate Predicted Labels for Classification Evaluation ---
  426. # Create a new array of predicted labels, where each sequence's predicted cluster ID is mapped to the cluster's dominant true label.
  427. # This allows treating the clustering result as a classification output for evaluation.
  428. # Use .get(cluster_id, -1) to handle cases where a cluster_id might not be in cluster_dominant_label (e.g., if cluster was empty).
  429. predicted_labels_numeric = np.array([cluster_dominant_label.get(cluster_id, -1) for cluster_id in all_predicted_cluster_labels])
  430. # --- Classification Evaluation (using mapped labels) ---
  431. # Evaluate the performance by comparing the true labels with the predicted labels derived from cluster dominance.
  432. # Only include instances where a dominant label was successfully assigned (not -1).
  433. valid_indices = predicted_labels_numeric != -1 # Indices of sequences that have a valid predicted numeric label
  434. # Proceed with evaluation metrics and confusion matrix if there are valid instances and more than one true class is present in these instances.
  435. if np.sum(valid_indices) > 0 and len(np.unique(all_y_true_categorical[valid_indices])) > 1:
  436. print("\nEvaluation Results (Clusters vs True Labels):")
  437. # Print a detailed classification report (Precision, Recall, F1-score, Support for each class, and overall metrics).
  438. print(classification_report(all_y_true_categorical[valid_indices], predicted_labels_numeric[valid_indices]))
  439. # Compute the Confusion Matrix: rows are true labels, columns are predicted dominant labels.
  440. cm = confusion_matrix(all_y_true_categorical[valid_indices], predicted_labels_numeric[valid_indices])
  441. # Create a figure for the confusion matrix plot.
  442. plt.figure(figsize=(8, 6))
  443. # Use seaborn heatmap to visualize the confusion matrix.
  444. # annot=True displays the values in the cells.
  445. # fmt='d' formats the values as integers.
  446. # cmap='Blues' sets the color map.
  447. sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
  448. plt.xlabel('Predicted Cluster (Dominant True Label)') # Label for the x-axis
  449. plt.ylabel('True Label') # Label for the y-axis
  450. plt.title('Confusion Matrix (Clusters vs True Labels)') # Title of the plot
  451. plt.show() # Display the plot
  452. else:
  453. print("\nCould not perform detailed evaluation (not enough data or classes with assigned dominant labels).")
  454. #################################################################################################
  455. # --- Plotting Anomalies (Optional) ---
  456. # This section plots time series data for sequences identified as anomalies based on clustering
  457. #################################################################################################
  458. # Check if the --plot_anomalies command line flag is provided
  459. if plot_anomalies:
  460. print("\nChecking anomaly data:")
  461. # Define "anomaly clusters" as those whose dominant true label is greater than 0 (i.e., corresponds to any failure type).
  462. anomaly_clusters = [cluster_id for cluster_id, label in cluster_dominant_label.items() if label > 0]
  463. # Find the indices of all sequences that were assigned to any of the "anomaly clusters".
  464. anomaly_indices = np.where(np.isin(all_predicted_cluster_labels, anomaly_clusters))[0]
  465. # If any anomaly sequences were found
  466. if len(anomaly_indices) > 0:
  467. # Determine how many anomaly sequences to plot (up to 5, or fewer if less were found).
  468. num_anomalies_to_plot = min(5, len(anomaly_indices))
  469. colors = ['red', 'green', 'blue'] # Define a simple list of colors to cycle through for features
  470. # Randomly select a few anomaly sequences and plot their original data.
  471. for i in np.random.choice(anomaly_indices, num_anomalies_to_plot, replace=False): # Select random indices without replacement
  472. # Print shape and first few values of the original sequence being plotted for debugging/information
  473. print(f"Shape of all_original_test_sequences[{i}]: {all_original_test_sequences[i].shape}")
  474. print(f"First few values of all_original_test_sequences[{i}]:\n{all_original_test_sequences[i][:5]}")
  475. # Create a new figure for each individual anomaly plot.
  476. plt.figure(figsize=(12, 6))
  477. # Plot each original feature within the selected sequence over the timesteps.
  478. for j, feature in enumerate(features): # j is feature index, feature is feature name
  479. # Plot the feature values (y-axis) against the sequence timestep index (x-axis from 0 to timesteps-1).
  480. # Use colors cycling through the defined list.
  481. plt.plot(np.arange(timesteps), all_original_test_sequences[i][:, j], label=feature, color=colors[j % len(colors)])
  482. # NOTE: true_label and predicted_cluster_for_title variables are not defined in this block before use in the title. This will cause a NameError.
  483. # You need to add lines like:
  484. # true_label = all_y_true_categorical[i]
  485. # predicted_cluster_for_title = all_predicted_cluster_labels[i]
  486. plt.title('Detected Anomalies (based on cluster dominance)') # Title of the plot
  487. plt.xlabel('Time Step') # Label for the x-axis (timestep within the sequence)
  488. plt.ylabel('Value') # Label for the y-axis (feature value)
  489. plt.legend() # Add a legend to identify which line corresponds to which feature
  490. plt.show() # Display the plot window
  491. else:
  492. # If no sequences were assigned to anomaly clusters, print a message.
  493. print("No anomalies detected based on cluster dominance.")
  494. #################################################################################################
  495. # --- Plotting Misclassified Instances (Optional) ---
  496. # This section plots time series data for sequences that were "misclassified" based on cluster dominance
  497. #################################################################################################
  498. # Check if the --plot_misclassified command line flag is provided
  499. if plot_misclassified:
  500. print("\nChecking misclassified data:")
  501. # Find the indices of all sequences where the true label does NOT match the dominant label of their assigned cluster.
  502. misclassified_indices = np.where(all_y_true_categorical != predicted_labels_numeric)[0]
  503. # If any misclassified instances are found
  504. if len(misclassified_indices) > 0:
  505. # Determine how many misclassified sequences to plot (up to 5, or fewer if less were found).
  506. num_misclassified_to_plot = min(5, len(misclassified_indices))
  507. colors = ['red', 'green', 'blue'] # Define a simple list of colors to cycle through for features
  508. # Randomly select a few misclassified sequences and plot their original data.
  509. for i in np.random.choice(misclassified_indices, num_misclassified_to_plot, replace=False): # Select random indices without replacement
  510. # Print shape and first few values of the original sequence being plotted for debugging/information
  511. print(f"Shape of all_original_test_sequences[{i}]: {all_original_test_sequences[i].shape}")
  512. print(f"First few values of all_original_test_sequences[{i}]:\n{all_original_test_sequences[i][:5]}")
  513. # Create a new figure for each individual misclassified plot.
  514. plt.figure(figsize=(12, 6))
  515. # Plot each original feature within the selected sequence over the timesteps.
  516. for j, feature in enumerate(features): # j is feature index, feature is feature name
  517. # Plot the feature values (y-axis) against the sequence timestep index (x-axis from 0 to timesteps-1).
  518. # Use colors cycling through the defined list.
  519. plt.plot(np.arange(timesteps), all_original_test_sequences[i][:, j], label=feature, color=colors[j % len(colors)])
  520. # NOTE: true_label and predicted_label are defined here, which fixes the NameError in this block.
  521. true_label = all_y_true_categorical[i] # Get the true label for the current sequence
  522. predicted_label = predicted_labels_numeric[i] # Get the predicted numeric label (dominant cluster label) for the current sequence
  523. # Set the title for the misclassified plot, indicating true label and the dominant label of the predicted cluster.
  524. plt.title(f'Misclassified Instance (True: {true_label}, Predicted Cluster: {predicted_label})') # Title including true label and dominant predicted label
  525. plt.xlabel('Time Step') # Label for the x-axis
  526. plt.ylabel('Value') # Label for the y-axis
  527. plt.legend() # Add a legend to identify features
  528. plt.show() # Display the plot window
  529. else:
  530. # If no misclassified sequences were found, print a message.
  531. print("No misclassified instances found based on cluster dominance.")
  532. # Return the arrays of true labels and predicted numeric labels (based on cluster dominance)
  533. # These can be used for further analysis or saving results
  534. return all_y_true_categorical, predicted_labels_numeric
  535. #####################################################################################################
  536. # --- Main Execution Flow ---
  537. # This is the main part of the script that calls the functions to run the analysis
  538. #####################################################################################################
  539. # Create a list of true class labels for the test data.
  540. # This list will contain arrays, where each array corresponds to a test class and contains the true label for every data point in that class.
  541. # The labels are the index of the class (0 for Normal, 1 for Failure 1, etc.).
  542. true_labels_list = []
  543. for i, df in enumerate(dataTest): # Loop through each test DataFrame (each class)
  544. # Create a numpy array filled with the class index 'i', with a length equal to the number of rows in the DataFrame.
  545. true_labels_list.append(np.full(len(df), i))
  546. # Call the main evaluation and plotting function.
  547. # Pass the trained model, scaled/original test data, number of clusters, true labels, feature info, and plotting options.
  548. # This function performs the prediction on test data, calculates evaluation metrics, and handles plotting based on flags.
  549. y_true_final, y_pred_final = evaluate_and_plot_anomalies(kmeans, scaled_test_df_list, n_clusters, dataTest, true_labels_list, features, featureNames, unitNames, plot_anomalies=options.plot_anomalies, plot_misclassified=options.plot_misclassified)
  550. #####################################################################################################
  551. # --- Final Evaluation Metrics (on combined test data) ---
  552. # This section calculates and prints overall evaluation metrics after processing all test data
  553. #####################################################################################################
  554. # Calculate and print the final Inertia and Silhouette Score for the combined test data.
  555. # Check if there is any reshaped test data available (i.e., if any test data files were processed).
  556. if X_test_reshaped_list:
  557. # Vertically stack all the reshaped test data arrays from different classes into a single array.
  558. # This array contains all flattened sequences from all test data.
  559. X_test_combined_reshaped = np.vstack(X_test_reshaped_list)
  560. # Concatenate all the predicted cluster labels from different classes into a single array.
  561. all_cluster_labels_test = np.concatenate(cluster_labels_test_list)
  562. # Print a header for the final evaluation metrics.
  563. print("\nK-Means Model Evaluation on Combined Test Data:")
  564. # Print the final Inertia of the trained K-Means model on the training data.
  565. # Note: This inertia value is from the model fitting process, not a specific calculation on the combined test data.
  566. print(f"Inertia: {kmeans.inertia_:.4f}")
  567. # Calculate and print the Silhouette Score for the combined test data based on the predicted cluster labels.
  568. # This metric evaluates how well-separated the clusters are based on the data points within them.
  569. # Only calculate if there is more than one unique predicted cluster label and at least one data point.
  570. if len(np.unique(all_cluster_labels_test)) > 1 and len(all_cluster_labels_test) > 0:
  571. silhouette = silhouette_score(X_test_combined_reshaped, all_cluster_labels_test)
  572. print(f"Silhouette Score: {silhouette:.4f}")
  573. else:
  574. # If Silhouette score cannot be calculated, print a message.
  575. print("Silhouette Score: Not applicable for single cluster.")
  576. else:
  577. # If no test data sequences were available at all, print a message.
  578. print("\nNo test data sequences available to evaluate Inertia and Silhouette Score.")

Powered by TurnKey Linux.