@@ -0,0 +1,646 @@
+# Import the libraries used for data handling, numerical work, plotting, and machine learning
+import pandas as pd # For data manipulation and analysis (DataFrames)
+import numpy as np # For numerical operations, especially array handling
+import matplotlib.pyplot as plt # For creating static, interactive, and animated visualizations
+from sklearn.cluster import KMeans # K-Means clustering algorithm
+# Import preprocessing tools and metrics from scikit-learn
+from sklearn.preprocessing import StandardScaler # Feature scaling (an earlier version used RobustScaler)
+from sklearn.metrics import silhouette_score, confusion_matrix, classification_report # Evaluation metrics actually used below
+import argparse # For parsing command-line arguments
+import os # For interacting with the operating system, like manipulating file paths
+import seaborn as sns # For drawing statistical graphics (used for the confusion-matrix heatmap)
+
+# --- Command-line arguments setup ---
+# This section defines and parses command-line arguments to control script behavior
+parser = argparse.ArgumentParser(description='Anomaly detection using K-Means clustering with visualization.')
+
+# Define arguments with their expected data type, default value, and a help message
+parser.add_argument('--timesteps', type=int, default=30, help='Number of timesteps for sequences.') # Length of time window for each sequence
+parser.add_argument('--n_clusters', type=int, default=5, help='Number of clusters for K-Means (should match the number of failure types + normal).') # Number of clusters
+parser.add_argument('--n_init', type=int, default=10, help='Number of initializations for K-Means.') # Number of times K-Means will be run with different centroid seeds
+parser.add_argument('--transition', action='store_true', help='Use transition data for testing.') # Flag: if present, use transition test data files
+# Plotting flags: action='store_true' means the variable becomes True if the flag is present
+parser.add_argument('--plot_raw', action='store_true', help='Plot raw data.')
+parser.add_argument('--plot_clustered', action='store_true', help='Plot clustered data.')
+parser.add_argument('--plot_anomalies', action='store_true', help='Plot detected anomalies (based on clusters).')
+parser.add_argument('--plot_misclassified', action='store_true', help='Plot misclassified instances (based on clusters).')
+
+# Parse the arguments provided by the user when running the script
+options = parser.parse_args()
+
+# Assign the parsed arguments to variables for easier access throughout the script
+n_clusters = options.n_clusters
+timesteps = options.timesteps
+n_init = options.n_init
+
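+# Example invocation (a sketch; the script filename is hypothetical):
+#   python kmeans_anomaly.py --timesteps 30 --n_clusters 5 --n_init 10 --plot_clustered
+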
+#####################################################################################################
+# --- Data File Configuration ---
+# This section defines the list of data files to be used for training and testing
+#####################################################################################################
+
+# Specify the number of failure types data is available for (excluding the normal state, which is class 0)
+NumberOfFailures = 4 # So far, we only have data for the first 4 types of failures
+
+# Initialize a nested list to store filenames for training and testing data
+# datafiles[0] will store training files, datafiles[1] will store testing files
+# Each inner list datafiles[train/test][i] corresponds to class i (0 for Normal, 1 to NumberOfFailures for failure types)
+datafiles = [[], []] # datafiles[0] for train data filenames, datafiles[1] for test data filenames
+# Populate the inner lists. We need NumberOfFailures + 1 inner lists for classes (0 to 4).
+for i in range(NumberOfFailures + 1):
+    datafiles[0].append([]) # Add an empty list for the current class's training files
+    datafiles[1].append([]) # Add an empty list for the current class's testing files
+
+# Manually assign the base filenames for each class for the training set
+# These filenames are expected to be in a 'data' subdirectory relative to the script
+datafiles[0][0] = ['2024-08-07_5_', '2024-08-08_5_', '2025-01-25_5_', '2025-01-26_5_'] # Normal training data files (Class 0)
+datafiles[0][1] = ['2024-12-11_5_', '2024-12-12_5_', '2024-12-13_5_'] # Failure Type 1 training data files (Class 1)
+datafiles[0][2] = ['2024-12-18_5_', '2024-12-21_5_', '2024-12-22_5_', '2024-12-23_5_', '2024-12-24_5_'] # Failure Type 2 training data files (Class 2)
+datafiles[0][3] = ['2024-12-28_5_', '2024-12-29_5_', '2024-12-30_5_'] # Failure Type 3 training data files (Class 3)
+datafiles[0][4] = ['2025-02-13_5_', '2025-02-14_5_'] # Failure Type 4 training data files (Class 4)
+
+# Assign base filenames for the testing set based on the --transition flag
+if options.transition:
+    # Use test files that are specified as including transition periods
+    datafiles[1][0] = ['2025-01-27_5_', '2025-01-28_5_'] # Normal test data files (Class 0)
+    datafiles[1][1] = ['2024-12-14_5_', '2024-12-15_5_', '2024-12-16_5_'] # Failure Type 1 test data files (with TRANSITION) (Class 1)
+    datafiles[1][2] = ['2024-12-17_5_', '2024-12-19_5_', '2024-12-25_5_', '2024-12-26_5_'] # Failure Type 2 test data files (with TRANSITION) (Class 2)
+    datafiles[1][3] = ['2024-12-27_5_', '2024-12-31_5_', '2025-01-01_5_'] # Failure Type 3 test data files (with TRANSITION) (Class 3)
+    datafiles[1][4] = ['2025-02-12_5_', '2025-02-15_5_', '2025-02-16_5_'] # Failure Type 4 test data files (Class 4)
+else:
+    # Use test files that are specified as not including transition periods
+    datafiles[1][0] = ['2025-01-27_5_', '2025-01-28_5_'] # Normal test data files (Class 0)
+    datafiles[1][1] = ['2024-12-14_5_', '2024-12-15_5_'] # Failure Type 1 test data files (Class 1)
+    datafiles[1][2] = ['2024-12-19_5_', '2024-12-25_5_', '2024-12-26_5_'] # Failure Type 2 test data files (Class 2)
+    datafiles[1][3] = ['2024-12-31_5_', '2025-01-01_5_'] # Failure Type 3 test data files (Class 3)
+    datafiles[1][4] = ['2025-02-15_5_', '2025-02-16_5_'] # Failure Type 4 test data files (Class 4)
+
+# --- Feature Definition ---
+# Define the list of features (column names) to be extracted from the data files
+features = ['r1 s1', 'r1 s4', 'r1 s5']
+# Store the original number of features. This is needed later for sequence creation.
+n_original_features = len(features)
+
+# Dictionaries mapping features to display names (LaTeX format) and units, used for plot labels
+featureNames = {}
+featureNames['r1 s1'] = r'$T_{evap}$' # Evaporator Temperature
+featureNames['r1 s4'] = r'$T_{cond}$' # Condenser Temperature
+featureNames['r1 s5'] = r'$T_{air}$' # Air Temperature
+
+unitNames = {}
+unitNames['r1 s1'] = r'($^o$C)' # Degrees Celsius unit
+unitNames['r1 s4'] = r'($^o$C)'
+unitNames['r1 s5'] = r'($^o$C)'
+
+#####################################################################################################
+# --- Data Loading and Preprocessing (Training Data) ---
+# This section loads, cleans, and preprocesses the training data
+#####################################################################################################
+
+# List to hold processed DataFrames for training data, organized by class
+# Each element dataTrain[i] is a DataFrame containing concatenated data for class i
+dataTrain = []
+# Loop through each list of filenames for each training class (Class 0, 1, 2, 3, 4)
+for class_files in datafiles[0]:
+    class_dfs = [] # Temporary list to hold DataFrames loaded for the current class
+    # Loop through each base filename in the current class's list
+    for base_filename in class_files:
+        # Construct the full absolute file path, assuming data is in a 'data' subdirectory
+        script_dir = os.path.dirname(os.path.abspath(__file__)) # Directory where the current script is located
+        data_dir = os.path.join(script_dir, 'data') # Path to the 'data' subdirectory
+        filepath = os.path.join(data_dir, f'{base_filename}.csv') # Combine directory and filename
+
+        try:
+            # Read the CSV file into a pandas DataFrame
+            df = pd.read_csv(filepath)
+            # Convert the 'datetime' column to datetime objects. Try two formats and coerce errors (set invalid parsing to NaT).
+            df['timestamp'] = pd.to_datetime(df['datetime'], format='%m/%d/%Y %H:%M', errors='coerce')
+            # Fill any NaT values from the first attempt by trying a second format.
+            df['timestamp'] = df['timestamp'].fillna(pd.to_datetime(df['datetime'], format='%d-%m-%Y %H:%M:%S', errors='coerce'))
+            # Convert the specified feature columns to numeric type. Coerce errors (set invalid parsing to NaN).
+            for col in features:
+                df[col] = pd.to_numeric(df[col], errors='coerce')
+            # Set 'timestamp' as the index, resample to a 5-minute frequency ('5Min'),
+            # and take the mean of the feature columns within each interval
+            df = df.set_index('timestamp').resample('5Min')[features].mean()
+            # Estimate any remaining missing values (NaN) using linear interpolation
+            df = df[features].interpolate()
+            # Append the processed DataFrame to the list for the current class
+            class_dfs.append(df)
+        except FileNotFoundError:
+            # If a file is not found, print a warning and skip it
+            print(f"Warning: File {filepath} not found and skipped.")
+    # If any DataFrames were loaded for the current class, concatenate them into a single DataFrame
+    if class_dfs:
+        dataTrain.append(pd.concat(class_dfs))
+
+# Concatenate all training classes into one large DataFrame for fitting the scaler and the model
+combined_train_data = pd.concat(dataTrain)
+
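+# Expected CSV layout (a sketch inferred from the parsing above; the numeric values are illustrative only):
+#   datetime,r1 s1,r1 s4,r1 s5
+#   12/11/2024 00:00,-3.2,41.5,7.8
+#   12/11/2024 00:01,-3.1,41.7,7.8
+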
+#####################################################################################################
+# --- Data Loading and Preprocessing (Test Data) ---
+# This section loads, cleans, and preprocesses the testing data
+# The process is identical to the training data loading, but applied to the test files
+#####################################################################################################
+
+# List to hold processed DataFrames for test data, organized by class
+# Each element dataTest[i] is a DataFrame containing concatenated data for class i
+dataTest = []
+# Loop through each list of filenames for each test class (Class 0, 1, 2, 3, 4)
+for class_files in datafiles[1]:
+    class_dfs = [] # Temporary list to hold DataFrames loaded for the current class
+    # Loop through each base filename in the current class's list
+    for base_filename in class_files:
+        # Construct the full absolute file path
+        script_dir = os.path.dirname(os.path.abspath(__file__))
+        data_dir = os.path.join(script_dir, 'data')
+        filepath = os.path.join(data_dir, f'{base_filename}.csv')
+
+        try:
+            # Read the CSV file into a pandas DataFrame
+            df = pd.read_csv(filepath)
+            # Convert the 'datetime' column to datetime objects, trying both known formats
+            df['timestamp'] = pd.to_datetime(df['datetime'], format='%m/%d/%Y %H:%M', errors='coerce')
+            df['timestamp'] = df['timestamp'].fillna(pd.to_datetime(df['datetime'], format='%d-%m-%Y %H:%M:%S', errors='coerce'))
+            # Convert the specified feature columns to numeric type
+            for col in features:
+                df[col] = pd.to_numeric(df[col], errors='coerce')
+            # Set 'timestamp' as the index, resample to 5-minute frequency, and take the mean
+            df = df.set_index('timestamp').resample('5Min')[features].mean()
+            # Estimate any remaining missing values (NaN) using linear interpolation
+            df = df[features].interpolate()
+            # Append the processed DataFrame to the list for the current class
+            class_dfs.append(df)
+        except FileNotFoundError:
+            # If a file is not found, print a warning and skip it
+            print(f"Warning: File {filepath} not found and skipped.")
+    # If any DataFrames were loaded for the current class, concatenate them into a single DataFrame
+    if class_dfs:
+        dataTest.append(pd.concat(class_dfs))
+
+#####################################################################################################
+# --- Raw Data Plotting (Optional) ---
+# This section plots the unprocessed data for visualization if the --plot_raw flag is set
+#####################################################################################################
+
+# Check if the plot_raw argument was set to True
+if options.plot_raw:
+    num_features = len(features) # Number of features, which determines the plot layout
+    # Create a figure with one subplot per feature, sharing the x-axis among all subplots
+    fig, axes = plt.subplots(num_features, 1, figsize=(15, 5 * num_features), sharex=True)
+    # Ensure 'axes' is always a list, even when there is only one feature (and thus one subplot)
+    if num_features == 1:
+        axes = [axes]
+    # Loop through each feature by index (i) and name (feature)
+    for i, feature in enumerate(features):
+        # Loop through each test data DataFrame (one per class)
+        for k, df in enumerate(dataTest):
+            # Plot the data for the current feature and class over time,
+            # including the class number in the label
+            axes[i].plot(df.index, df[feature], label=f'Class {k}')
+        # Set the y-axis label using the feature's display name and unit
+        axes[i].set_ylabel(f'{featureNames[feature]} {unitNames[feature]}')
+        # Set the subplot title using the feature's display name
+        axes[i].set_title(featureNames[feature])
+    # Add a legend identifying the classes; placing it on the last subplot avoids repetition
+    axes[num_features - 1].legend(loc='upper right')
+
+    # Adjust layout to prevent plot elements (like labels) from overlapping
+    plt.tight_layout()
+    # Display the plot window
+    plt.show()
+    # exit(0) # Uncomment to stop the script after showing the raw data plots
+
+########################################################################################################
+# --- Data Scaling ---
+# This section scales the data using StandardScaler
+########################################################################################################
+
+# Initialize the StandardScaler, which standardizes features by removing the mean and scaling to unit variance
+scaler = StandardScaler()
+
+# Fit the scaler on the training data and transform it.
+# The scaler learns the mean and standard deviation from combined_train_data[features].
+scaled_train_data = scaler.fit_transform(combined_train_data[features])
+
+# Transform each test DataFrame with the scaler fitted on the training data,
+# so the same scaling parameters (mean, standard deviation) are applied to the test data.
+scaled_test_data_list = []
+for df in dataTest: # Loop through each test DataFrame
+    scaled_test_data_list.append(scaler.transform(df[features]))
+
+# Convert the scaled NumPy arrays back into pandas DataFrames.
+# This step is optional but helps with inspection and keeps timestamps and column names.
+scaled_train_df = pd.DataFrame(scaled_train_data, columns=features, index=combined_train_data.index)
+# Create a list of scaled test DataFrames, preserving the original indices and column names
+scaled_test_df_list = [pd.DataFrame(data, columns=features, index=df.index) for data, df in zip(scaled_test_data_list, dataTest)]
+
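+# Sanity check (a minimal sketch, safe to uncomment): the scaler statistics come from the
+# training data only, so the scaled training features should have ~zero mean and unit variance.
+#   assert np.allclose(scaled_train_data.mean(axis=0), 0, atol=1e-6)
+#   assert np.allclose(scaled_train_data.std(axis=0), 1, atol=1e-6)
+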
+############################################################################################################
+# --- Sequence Creation with Rate of Change Feature Engineering ---
+# This section defines a function that creates time sequences and appends rate-of-change features
+############################################################################################################
+
+# Create overlapping time sequences (windows) from the data and append rate-of-change features.
+# 'data': input NumPy array of scaled feature values, shape (n_points, n_original_features)
+# 'timesteps': the length of each sequence (number of time points)
+# 'n_original_features': the number of feature columns in 'data' (used to size the padding row)
+def create_sequences_with_rate_of_change(data, timesteps, n_original_features):
+    sequences = [] # List to store the generated sequences
+    # Iterate up to the last possible start index for a sequence of length 'timesteps'
+    for i in range(len(data) - timesteps + 1):
+        # Extract a sequence (slice) of 'timesteps' length starting at index 'i'
+        sequence = data[i:i + timesteps]
+        # Difference between consecutive time points within the sequence.
+        # np.diff with axis=0 differences along the rows (time),
+        # giving an array of shape (timesteps - 1, n_original_features).
+        rate_of_change = np.diff(sequence, axis=0)
+        # np.diff shortens the time axis by 1, so prepend a row of zeros
+        # to restore the original sequence length of 'timesteps'.
+        padding = np.zeros((1, n_original_features))
+        # Stack the padding row on top of the rate-of-change array,
+        # giving shape (timesteps, n_original_features).
+        rate_of_change_padded = np.vstack((padding, rate_of_change))
+        # Concatenate the original sequence and the padded rate of change horizontally,
+        # giving a combined sequence of shape (timesteps, 2 * n_original_features).
+        sequences.append(np.hstack((sequence, rate_of_change_padded)))
+    # Convert the list of sequences into a single 3D NumPy array
+    return np.array(sequences)
+
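+# Shape sketch (assuming the 3 features above and the default timesteps=30):
+#   input:  (N, 3) scaled values  ->  output: (N - 29, 30, 6)
+# where each window carries the 3 features plus their 3 padded rate-of-change columns.
+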
+# Create time sequences with rate of change for the scaled training data.
+# The output is a 3D array: (number of sequences, timesteps, 2 * n_original_features).
+X_train_sequences = create_sequences_with_rate_of_change(scaled_train_df.values, timesteps, n_original_features)
+
+# Create time sequences with rate of change for each scaled test DataFrame.
+# The result is a list where each element is a 3D array of sequences for one test class.
+X_test_sequences_list = [create_sequences_with_rate_of_change(df.values, timesteps, n_original_features) for df in scaled_test_df_list]
+
+############################################################################################################
+# --- K-Means Clustering Model ---
+# This section initializes, trains, and applies the K-Means model
+############################################################################################################
+
+# Reshape the training sequences into a 2D array, since K-Means expects
+# data of shape (number of samples, number of features).
+# Each sequence is flattened into a single row of length timesteps * total_features.
+n_samples, n_timesteps, n_total_features = X_train_sequences.shape # Dimensions of the sequence array
+X_train_reshaped = X_train_sequences.reshape(n_samples, n_timesteps * n_total_features) # Flatten each sequence
+
+# Initialize the KMeans model.
+# n_clusters: the desired number of clusters (set by command-line argument).
+# random_state=42: seeds the centroid initialization for reproducibility.
+# n_init: runs K-Means that many times with different centroid seeds and keeps the best result by inertia.
+kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=n_init)
+# Train (fit) the KMeans model on the reshaped training data
+kmeans.fit(X_train_reshaped)
+
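+# Optional sanity check (a sketch, safe to uncomment): sweep k and inspect inertia to validate
+# the --n_clusters choice with the elbow heuristic; this is not part of the pipeline above.
+#   for k in range(2, 10):
+#       print(k, KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_train_reshaped).inertia_)
+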
+############################################################################################################################
+# --- Predict Clusters for Test Data ---
+# This section applies the trained K-Means model to the test data to get cluster assignments
+############################################################################################################################
+
+# List to store the predicted cluster labels for each test class
+cluster_labels_test_list = []
+# List to store the reshaped (flattened) test sequences, needed for later evaluation metrics
+X_test_reshaped_list = []
+
+# Loop through each test sequence array (one per test class)
+for i, X_test_seq in enumerate(X_test_sequences_list):
+    # Get the dimensions of the current test sequence array
+    n_samples_test, n_timesteps_test, n_total_features_test = X_test_seq.shape
+    # Flatten each sequence into a single data point for K-Means prediction
+    X_test_reshaped = X_test_seq.reshape(n_samples_test, n_timesteps_test * n_total_features_test)
+    # Predict the cluster label for each reshaped test sequence with the trained model
+    labels = kmeans.predict(X_test_reshaped)
+    # Store the predicted labels for the current test class
+    cluster_labels_test_list.append(labels)
+    # Store the reshaped data (needed for the silhouette score calculation later)
+    X_test_reshaped_list.append(X_test_reshaped)
+
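+# Quick inspection (a sketch, safe to uncomment): how each test class spreads across the clusters.
+#   for i, labels in enumerate(cluster_labels_test_list):
+#       print(f"Class {i} cluster counts:", np.bincount(labels, minlength=n_clusters))
+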
+############################################################################################################################
+# --- Plotting Clustered Data (Optional) ---
+# This function plots the original data points, colored according to their predicted cluster
+############################################################################################################################
+
+# Plot the original feature data, with points colored by their assigned cluster ID.
+# 'original_data_list': list of original (unscaled) test DataFrames, one per class.
+# 'cluster_labels_list': list of predicted cluster label arrays, one per corresponding test DataFrame.
+# 'n_clusters': total number of clusters used.
+# 'features', 'featureNames', 'unitNames': feature list and dictionaries for plot labels.
+def plot_clustered_data(original_data_list, cluster_labels_list, n_clusters, features, featureNames, unitNames):
+    num_features = len(features) # Number of features to plot (one subplot per feature)
+    # Create a figure with num_features rows and one column of subplots, sharing the x-axis
+    fig, axes = plt.subplots(num_features, 1, figsize=(15, 5 * num_features), sharex=True)
+    # Ensure 'axes' is always a list, even when there is only one feature (and thus one subplot)
+    if num_features == 1:
+        axes = [axes]
+    # Generate a color map assigning a distinct color to each cluster ID
+    colors = plt.cm.viridis(np.linspace(0, 1, n_clusters))
+
+    # Loop through each original test DataFrame and its corresponding cluster labels
+    for k, df in enumerate(original_data_list): # k is the index of the test class/DataFrame
+        original_indices = df.index # Time index from the original DataFrame
+        # The cluster labels correspond to sequences, which end 'timesteps - 1' points after the raw data starts.
+        # Use the time index of each sequence's end point for plotting.
+        time_index = original_indices[timesteps - 1:]
+
+        # Loop through each original feature to plot it
+        for i, feature in enumerate(features): # i is the index of the feature
+            # Loop through each possible cluster ID
+            for cluster_id in range(n_clusters):
+                # Find the positions within the current test data assigned to the current cluster ID
+                cluster_indices_kmeans = np.where(cluster_labels_list[k] == cluster_id)[0]
+                # If any data points were assigned to this cluster
+                if len(cluster_indices_kmeans) > 0:
+                    # Scatter the original feature values at the sequence-end timestamps,
+                    # colored by cluster; label clusters only for the first class (k == 0)
+                    # to avoid redundant legend entries. s=10 sets the marker size.
+                    axes[i].scatter(time_index[cluster_indices_kmeans], df[feature].loc[time_index[cluster_indices_kmeans]], color=colors[cluster_id], label=f'Cluster {cluster_id}' if k == 0 else "", s=10)
+            # Set the y-axis label and title for the current feature's subplot
+            axes[i].set_ylabel(f'{featureNames[feature]} {unitNames[feature]}')
+            axes[i].set_title(featureNames[feature])
+    # Add a legend, placed on the last subplot
+    axes[num_features - 1].legend(loc='upper right')
+
+    # Adjust layout to prevent plot elements (like labels) from overlapping
+    plt.tight_layout()
+    # Display the plot window
+    plt.show()
+
+# Call the plotting function if the --plot_clustered command-line flag is provided
+if options.plot_clustered:
+    plot_clustered_data(dataTest, cluster_labels_test_list, n_clusters, features, featureNames, unitNames)
+
+#####################################################################################################
+# --- Evaluation and plotting of anomalies and misclassified instances (based on cluster labels) ---
+# This section evaluates clustering performance using classification metrics and plots specific instances
+#####################################################################################################
+
+# Evaluate clustering results and plot anomalies/misclassified instances.
+# 'kmeans_model': the trained K-Means model.
+# 'scaled_test_data_list': list of scaled test DataFrames.
+# 'n_clusters': number of clusters.
+# 'original_test_data_list': list of original (unscaled) test DataFrames.
+# 'true_labels_list': list of arrays containing the true class labels for the test data.
+# 'features', 'featureNames', 'unitNames': feature information for plotting.
+# 'plot_anomalies', 'plot_misclassified': boolean flags from the command-line arguments.
+def evaluate_and_plot_anomalies(kmeans_model, scaled_test_data_list, n_clusters, original_test_data_list, true_labels_list, features, featureNames, unitNames, plot_anomalies=False, plot_misclassified=False):
+    # Accumulators across ALL test data
+    all_y_true_categorical = [] # True class label (0, 1, ...) for each sequence
+    all_predicted_cluster_labels = [] # Predicted cluster ID for each sequence
+    all_original_test_sequences = [] # Original feature values for each sequence (window), used for plotting
+
+    # Per-class evaluation metrics
+    inertia_values = [] # Note: kmeans_model.inertia_ is fixed after fitting, so this collects the same training inertia once per class
+    silhouette_scores = [] # Silhouette score per class, based on the predicted clusters
+
+    # Loop through each test class (scaled data, original data, and true labels together)
+    for i, (scaled_test_df, original_test_df, y_true_categorical) in enumerate(zip(scaled_test_data_list, original_test_data_list, true_labels_list)): # i is the class index
+        # Create sequences with rate of change for the current scaled test DataFrame
+        X_test_sequences = create_sequences_with_rate_of_change(scaled_test_df.values, timesteps, n_original_features)
+        # If no sequences could be generated for this class (e.g., data too short), warn and skip it
+        if X_test_sequences.size == 0:
+            print(f"Warning: No test sequences generated for class {i}. Skipping evaluation for this class.")
+            continue # Skip to the next class
+        # Number of sequences generated for the current class
+        n_samples_test = X_test_sequences.shape[0]
+        # Flatten each sequence for prediction by the K-Means model
+        X_test_reshaped = X_test_sequences.reshape(n_samples_test, -1)
+        # Predict the cluster label for each flattened sequence with the trained model
+        cluster_labels_predicted = kmeans_model.predict(X_test_reshaped)
+
+        # Record the model's inertia (the fixed training inertia, repeated once per class; see note above)
+        inertia_values.append(kmeans_model.inertia_)
+        # Compute the silhouette score for this class's data from its predicted clusters;
+        # this requires more than one unique predicted label and at least one sample
+        if len(np.unique(cluster_labels_predicted)) > 1 and len(cluster_labels_predicted) > 0:
+            silhouette_scores.append(silhouette_score(X_test_reshaped, cluster_labels_predicted))
+        else:
+            silhouette_scores.append(np.nan) # NaN when the silhouette score cannot be calculated
+
+        # Time indices corresponding to the *end* of each sequence for this class
+        original_indices = original_test_df.index[timesteps - 1:]
+
+        # Collect the true label, predicted cluster label, and original (unscaled) window for each sequence.
+        # The true labels are aligned with the sequence end points.
+        for j, label in enumerate(y_true_categorical[timesteps - 1:]):
+            all_y_true_categorical.append(label) # Add the true label
+            all_predicted_cluster_labels.append(cluster_labels_predicted[j]) # Add the predicted cluster label
+            # The sequence at index j corresponds to the original rows
+            # [start_index, start_index + timesteps) in the original DataFrame
+            start_index = original_test_df.index.get_loc(original_indices[j]) - (timesteps - 1)
+            end_index = start_index + timesteps
+            # Extract and store the original (unscaled) feature values for this sequence
+            all_original_test_sequences.append(original_test_df[features].iloc[start_index:end_index].values)
+
+    # Convert the accumulated lists across all test classes into NumPy arrays
+    all_y_true_categorical = np.array(all_y_true_categorical) # True labels for all sequences
+    all_predicted_cluster_labels = np.array(all_predicted_cluster_labels) # Predicted cluster IDs for all sequences
+    all_original_test_sequences = np.array(all_original_test_sequences) # 3D array of original sequence data
+
+    # Print overall evaluation metrics based on the accumulated data
+    print("\nEvaluation Metrics:")
+    # Mean of the recorded inertia values (effectively the model's final training inertia, as noted above)
+    print(f"Inertia (final): {np.mean(inertia_values):.4f}")
+    # Mean of the per-class silhouette scores, ignoring NaNs
+    print(f"Average Silhouette Score (valid cases): {np.nanmean(silhouette_scores):.4f}")
+
+    # --- Cluster Analysis: Map Cluster IDs to Dominant True Labels ---
+    # Map each cluster ID to the true class label that appears most frequently among the sequences assigned to it
+    cluster_dominant_label = {} # Dictionary: {cluster_id: dominant_true_label}
+    for cluster_id in range(n_clusters): # Loop through each possible cluster ID
+        # Find the indices of all sequences predicted to belong to the current cluster
+        indices_in_cluster = np.where(all_predicted_cluster_labels == cluster_id)[0]
+        # If the cluster contains any sequences
+        if len(indices_in_cluster) > 0:
+            # Get the true labels of all sequences that fall into this cluster
+            labels_in_cluster = all_y_true_categorical[indices_in_cluster]
+            if len(labels_in_cluster) > 0:
+                # Count the occurrences of each true label and take the most frequent one
+                dominant_label = np.argmax(np.bincount(labels_in_cluster))
+                cluster_dominant_label[cluster_id] = dominant_label # Assign the dominant true label to this cluster
+            else:
+                cluster_dominant_label[cluster_id] = -1 # No labels found; mark as -1
+        else:
+            cluster_dominant_label[cluster_id] = -1 # Empty cluster; mark as -1
+
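+    # Worked example (hypothetical numbers): if cluster 3 holds sequences with true labels
+    # [2, 2, 0, 2], then np.bincount gives [1, 0, 3] and np.argmax gives 2, so cluster 3 maps to class 2.
+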
+    # --- Generate Predicted Labels for Classification Evaluation ---
+    # Map each sequence's predicted cluster ID to that cluster's dominant true label,
+    # which lets the clustering result be evaluated as a classification output.
+    # .get(cluster_id, -1) covers any cluster ID missing from cluster_dominant_label (e.g., an empty cluster).
+    predicted_labels_numeric = np.array([cluster_dominant_label.get(cluster_id, -1) for cluster_id in all_predicted_cluster_labels])
+
+    # --- Classification Evaluation (using mapped labels) ---
+    # Compare the true labels with the predicted labels derived from cluster dominance,
+    # including only instances with a validly assigned dominant label (not -1).
+    valid_indices = predicted_labels_numeric != -1 # Sequences with a valid predicted numeric label
+    # Proceed only if there are valid instances and more than one true class among them
+    if np.sum(valid_indices) > 0 and len(np.unique(all_y_true_categorical[valid_indices])) > 1:
+        print("\nEvaluation Results (Clusters vs True Labels):")
+        # Detailed classification report: per-class precision, recall, F1-score, support, and overall metrics
+        print(classification_report(all_y_true_categorical[valid_indices], predicted_labels_numeric[valid_indices]))
+        # Confusion matrix: rows are true labels, columns are predicted dominant labels
+        cm = confusion_matrix(all_y_true_categorical[valid_indices], predicted_labels_numeric[valid_indices])
+        # Visualize the confusion matrix as a heatmap:
+        # annot=True writes the counts in the cells, fmt='d' formats them as integers,
+        # and cmap='Blues' sets the color map.
+        plt.figure(figsize=(8, 6))
+        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
+        plt.xlabel('Predicted Cluster (Dominant True Label)') # Label for the x-axis
+        plt.ylabel('True Label') # Label for the y-axis
+        plt.title('Confusion Matrix (Clusters vs True Labels)') # Title of the plot
+        plt.show() # Display the plot
+    else:
+        print("\nCould not perform detailed evaluation (not enough data or classes with assigned dominant labels).")
+
+    #################################################################################################
+    # --- Plotting Anomalies (Optional) ---
+    # This section plots time series data for sequences identified as anomalies based on clustering
+    #################################################################################################
+
+    # Check if the --plot_anomalies command-line flag was provided
+    if plot_anomalies:
+        print("\nChecking anomaly data:")
+        # "Anomaly clusters" are those whose dominant true label is greater than 0 (i.e., any failure type)
+        anomaly_clusters = [cluster_id for cluster_id, label in cluster_dominant_label.items() if label > 0]
+        # Find the indices of all sequences assigned to any of the anomaly clusters
+        anomaly_indices = np.where(np.isin(all_predicted_cluster_labels, anomaly_clusters))[0]
+        # If any anomaly sequences were found
+        if len(anomaly_indices) > 0:
+            # Plot up to 5 anomaly sequences (fewer if fewer were found)
+            num_anomalies_to_plot = min(5, len(anomaly_indices))
+            colors = ['red', 'green', 'blue'] # Colors to cycle through for the features
+            # Randomly select anomaly sequences (without replacement) and plot their original data
+            for i in np.random.choice(anomaly_indices, num_anomalies_to_plot, replace=False):
+                # Print the shape and first few values of the sequence being plotted, for inspection
+                print(f"Shape of all_original_test_sequences[{i}]: {all_original_test_sequences[i].shape}")
+                print(f"First few values of all_original_test_sequences[{i}]:\n{all_original_test_sequences[i][:5]}")
+                # Create a new figure for each individual anomaly plot
+                plt.figure(figsize=(12, 6))
+                # Plot each original feature in the selected sequence over its timesteps,
+                # cycling through the color list
+                for j, feature in enumerate(features): # j is feature index, feature is feature name
+                    plt.plot(np.arange(timesteps), all_original_test_sequences[i][:, j], label=feature, color=colors[j % len(colors)])
+                # Look up the labels used in the title (these were previously missing, leaving the title uninformative)
+                true_label = all_y_true_categorical[i] # True class label for this sequence
+                predicted_cluster_for_title = all_predicted_cluster_labels[i] # Cluster ID assigned by K-Means
+                plt.title(f'Detected Anomaly (True: {true_label}, Cluster: {predicted_cluster_for_title})')
+                plt.xlabel('Time Step') # Timestep within the sequence
+                plt.ylabel('Value') # Feature value
+                plt.legend() # Identify which line corresponds to which feature
+                plt.show() # Display the plot window
+        else:
+            # No sequences were assigned to anomaly clusters
+            print("No anomalies detected based on cluster dominance.")
+
+    #################################################################################################
+    # --- Plotting Misclassified Instances (Optional) ---
+    # This section plots time series data for sequences that were "misclassified" based on cluster dominance
+    #################################################################################################
+
+    # Check if the --plot_misclassified command-line flag was provided
+    if plot_misclassified:
+        print("\nChecking misclassified data:")
+        # Find the indices of sequences whose true label does NOT match the dominant label of their assigned cluster
+        misclassified_indices = np.where(all_y_true_categorical != predicted_labels_numeric)[0]
+        # If any misclassified instances were found
+        if len(misclassified_indices) > 0:
+            # Plot up to 5 misclassified sequences (fewer if fewer were found)
+            num_misclassified_to_plot = min(5, len(misclassified_indices))
+            colors = ['red', 'green', 'blue'] # Colors to cycle through for the features
+            # Randomly select misclassified sequences (without replacement) and plot their original data
+            for i in np.random.choice(misclassified_indices, num_misclassified_to_plot, replace=False):
+                # Print the shape and first few values of the sequence being plotted, for inspection
+                print(f"Shape of all_original_test_sequences[{i}]: {all_original_test_sequences[i].shape}")
+                print(f"First few values of all_original_test_sequences[{i}]:\n{all_original_test_sequences[i][:5]}")
+                # Create a new figure for each individual misclassified plot
+                plt.figure(figsize=(12, 6))
+                # Plot each original feature in the selected sequence over its timesteps,
+                # cycling through the color list
+                for j, feature in enumerate(features): # j is feature index, feature is feature name
+                    plt.plot(np.arange(timesteps), all_original_test_sequences[i][:, j], label=feature, color=colors[j % len(colors)])
+                # Look up the labels used in the title
+                true_label = all_y_true_categorical[i] # True class label for this sequence
+                predicted_label = predicted_labels_numeric[i] # Dominant label of the predicted cluster
+                # Title indicating the true label and the dominant label of the predicted cluster
+                plt.title(f'Misclassified Instance (True: {true_label}, Predicted Cluster: {predicted_label})')
+                plt.xlabel('Time Step') # Label for the x-axis
+                plt.ylabel('Value') # Label for the y-axis
+                plt.legend() # Identify features
+                plt.show() # Display the plot window
+        else:
+            # No misclassified sequences were found
+            print("No misclassified instances found based on cluster dominance.")
+
+    # Return the true labels and the predicted numeric labels (based on cluster dominance)
+    # for further analysis or saving of results
+    return all_y_true_categorical, predicted_labels_numeric
+
+#####################################################################################################
+# --- Main Execution Flow ---
+# This is the main part of the script that calls the functions to run the analysis
+#####################################################################################################
+
+# Build the list of true class labels for the test data: one array per test class,
+# holding the true label for every data point in that class
+# (0 for Normal, 1 for Failure 1, and so on).
+true_labels_list = []
+for i, df in enumerate(dataTest): # Loop through each test DataFrame (each class)
+    # An array filled with the class index 'i', one entry per row of the DataFrame
+    true_labels_list.append(np.full(len(df), i))
+
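+# For example (illustrative lengths), with test classes of 4 and 3 rows respectively:
+#   true_labels_list == [array([0, 0, 0, 0]), array([1, 1, 1])]
+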
+# Call the main evaluation and plotting function with the trained model, scaled/original test data,
+# number of clusters, true labels, feature info, and plotting options.
+# It predicts on the test data, computes evaluation metrics, and plots according to the flags.
+y_true_final, y_pred_final = evaluate_and_plot_anomalies(kmeans, scaled_test_df_list, n_clusters, dataTest, true_labels_list, features, featureNames, unitNames, plot_anomalies=options.plot_anomalies, plot_misclassified=options.plot_misclassified)
+
+#####################################################################################################
+# --- Final Evaluation Metrics (on combined test data) ---
+# This section calculates and prints overall evaluation metrics after processing all test data
+#####################################################################################################
+
+# Report the final Inertia and Silhouette Score for the combined test data,
+# provided any reshaped test data is available (i.e., some test files were processed).
+if X_test_reshaped_list:
+    # Stack all reshaped test arrays into one array holding every flattened test sequence
+    X_test_combined_reshaped = np.vstack(X_test_reshaped_list)
+    # Concatenate the predicted cluster labels from all classes into a single array
+    all_cluster_labels_test = np.concatenate(cluster_labels_test_list)
+
+    print("\nK-Means Model Evaluation on Combined Test Data:")
+    # Inertia comes from fitting on the training data; it is not recomputed on the combined test data
+    print(f"Inertia: {kmeans.inertia_:.4f}")
+
+    # Silhouette score for the combined test data based on the predicted cluster labels,
+    # measuring how well separated the clusters are. It requires more than one unique
+    # predicted label and at least one data point.
+    if len(np.unique(all_cluster_labels_test)) > 1 and len(all_cluster_labels_test) > 0:
+        silhouette = silhouette_score(X_test_combined_reshaped, all_cluster_labels_test)
+        print(f"Silhouette Score: {silhouette:.4f}")
+    else:
+        print("Silhouette Score: Not applicable for single cluster.")
+else:
+    # No test sequences were available at all
+    print("\nNo test data sequences available to evaluate Inertia and Silhouette Score.")