소스 검색

Adding kmeans_anomaly folder and its contents_StandardScaler and RobustScaler

Masoud Hosseini 6 일 전
부모
커밋
56c2e2bca8
2개의 변경된 파일1226개의 추가작업 그리고 0개의 파일을 삭제
  1. 580
    0
      kmeans_anomaly/V8_RobustScaler_kmeans_anomaly_with_clear_comment.py
  2. 646
    0
      kmeans_anomaly/V8_kmeans_anomaly_with_clear_comment.py

+ 580
- 0
kmeans_anomaly/V8_RobustScaler_kmeans_anomaly_with_clear_comment.py 파일 보기

@@ -0,0 +1,580 @@
1
+# Import necessary libraries
2
+import pandas as pd
3
+import numpy as np
4
+import matplotlib.pyplot as plt
5
+from sklearn.cluster import KMeans
6
+# Import scalers and metrics from scikit-learn
7
+from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder # RobustScaler is used for scaling
8
+from sklearn.metrics import silhouette_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix, classification_report
9
+import argparse # For parsing command line arguments
10
+import os # For path manipulation
11
+import seaborn as sns # For enhanced data visualization (like confusion matrix)
12
+
13
+# Command line arguments setup
14
+# Defines the command line interface for the script, allowing users to specify parameters
15
+parser = argparse.ArgumentParser(description='Anomaly detection using K-Means clustering with visualization.')
16
+# --timesteps: Number of data points in each sequence (time window)
17
+parser.add_argument('--timesteps', type=int, default=20, help='Number of timesteps for sequences.')
18
+# --n_clusters: Number of clusters for K-Means. Expected to be number of failure types + normal state.
19
+parser.add_argument('--n_clusters', type=int, default=5, help='Number of clusters for K-Means (should match the number of failure types + normal).')
20
+# --n_init: Number of times K-Means is run with different initial centroids. The best result is chosen.
21
+parser.add_argument('--n_init', type=int, default=10, help='Number of initializations for K-Means.')
22
+# --transition: Flag to use data files that include transition periods for testing.
23
+parser.add_argument('--transition', action='store_true', help='Use transition data for testing.')
24
+# Plotting flags: control which plots are displayed.
25
+parser.add_argument('--plot_raw', action='store_true', help='Plot raw data.')
26
+parser.add_argument('--plot_clustered', action='store_true', help='Plot clustered data.')
27
+parser.add_argument('--plot_anomalies', action='store_true', help='Plot detected anomalies (based on clusters).')
28
+parser.add_argument('--plot_misclassified', action='store_true', help='Plot misclassified instances (based on clusters).')
29
+# Parse the arguments provided by the user
30
+options = parser.parse_args()
31
+
32
+# Assign parsed arguments to variables
33
+n_clusters = options.n_clusters
34
+timesteps = options.timesteps
35
+n_init = options.n_init
36
+
37
+#####################################################################################################
38
+# Data File Configuration
39
+#####################################################################################################
40
+
41
+# Number of distinct failure types we have data for (excluding the normal state)
42
+NumberOfFailures = 4 # So far, we have only data for the first 4 types of failures
43
+# List to hold file paths for training and testing data
44
+# datafiles[0]: training data files, datafiles[1]: testing data files
45
+# Inner lists correspond to different classes/failure types (0: Normal, 1-4: Failure Types)
46
+datafiles = [[], []] # 0 for train, 1 for test
47
+# Initialize inner lists for each class (Normal + NumberOfFailures)
48
+for i in range(NumberOfFailures + 1):
49
+    datafiles[0].append([])
50
+    datafiles[1].append([])
51
+
52
+# Assign specific filenames to each class for the training set
53
+# datafiles[0][0]: Normal training data
54
+# datafiles[0][1]: Failure Type 1 training data
55
+# ... and so on
56
+datafiles[0][0] = ['2024-08-07_5_', '2024-08-08_5_', '2025-01-25_5_', '2025-01-26_5_']
57
+datafiles[0][1] = ['2024-12-11_5_', '2024-12-12_5_', '2024-12-13_5_']
58
+datafiles[0][2] = ['2024-12-18_5_', '2024-12-21_5_', '2024-12-22_5_', '2024-12-23_5_', '2024-12-24_5_']
59
+datafiles[0][3] = ['2024-12-28_5_', '2024-12-29_5_', '2024-12-30_5_']
60
+datafiles[0][4] = ['2025-02-13_5_', '2025-02-14_5_']
61
+
62
+# Assign specific filenames for the testing set
63
+# Uses different files based on whether the --transition flag is set
64
+if options.transition:
65
+    # Test files including transition data
66
+    datafiles[1][0] = ['2025-01-27_5_', '2025-01-28_5_']
67
+    datafiles[1][1] = ['2024-12-14_5_', '2024-12-15_5_', '2024-12-16_5_'] # with TRANSITION
68
+    datafiles[1][2] = ['2024-12-17_5_', '2024-12-19_5_', '2024-12-25_5_', '2024-12-26_5_'] # with TRANSITION
69
+    datafiles[1][3] = ['2024-12-27_5_', '2024-12-31_5_', '2025-01-01_5_'] # with TRANSITION
70
+    datafiles[1][4] = ['2025-02-12_5_', '2025-02-15_5_', '2025-02-16_5_']
71
+else:
72
+    # Test files without explicit transition data
73
+    datafiles[1][0] = ['2025-01-27_5_', '2025-01-28_5_']
74
+    datafiles[1][1] = ['2024-12-14_5_', '2024-12-15_5_']
75
+    datafiles[1][2] = ['2024-12-19_5_', '2024-12-25_5_', '2024-12-26_5_']
76
+    datafiles[1][3] = ['2024-12-31_5_', '2025-01-01_5_']
77
+    datafiles[1][4] = ['2025-02-15_5_', '2025-02-16_5_']
78
+
79
+# Features (columns) to be used from the data files
80
+features = ['r1 s1', 'r1 s4', 'r1 s5']
81
+# Store the initial count of features before potentially adding derived features
82
+n_original_features = len(features) # Store the original number of features
83
+
84
+# Dictionaries to map feature names to display names (e.g., for plots)
85
+featureNames = {}
86
+featureNames['r1 s1'] = r'$T_{evap}$' # Evaporator Temperature
87
+featureNames['r1 s4'] = r'$T_{cond}$' # Condenser Temperature
88
+featureNames['r1 s5'] = r'$T_{air}$' # Air Temperature
89
+
90
+# Dictionaries to map feature names to their units (e.g., for plots)
91
+unitNames = {}
92
+unitNames['r1 s1'] = r'($^o$C)'
93
+unitNames['r1 s4'] = r'($^o$C)'
94
+unitNames['r1 s5'] = r'($^o$C)'
95
+
96
+# Redundant variable, but kept from original code
97
+NumFeatures = len(features)
98
+
99
+#####################################################################################################
100
+# Data Loading and Preprocessing (Training Data)
101
+#####################################################################################################
102
+
103
+# List to hold DataFrames for training data, organized by class
104
+dataTrain = []
105
+# Loop through each list of files for each training class
106
+for class_files in datafiles[0]:
107
+    class_dfs = [] # List to hold dataframes for current class
108
+    # Loop through each filename in the current class
109
+    for base_filename in class_files:
110
+        # Construct the full file path
111
+        script_dir = os.path.dirname(os.path.abspath(__file__)) # Get directory of the current script
112
+        data_dir = os.path.join(script_dir, 'data') # Assume data is in a 'data' subdirectory
113
+        filepath = os.path.join(data_dir, f'{base_filename}.csv') # Full path to the CSV file
114
+        try:
115
+            # Read the CSV file into a pandas DataFrame
116
+            df = pd.read_csv(filepath)
117
+            # Convert 'datetime' column to datetime objects using two possible formats, coercing errors
118
+            df['timestamp'] = pd.to_datetime(df['datetime'], format='%m/%d/%Y %H:%M', errors='coerce')
119
+            df['timestamp'] = df['timestamp'].fillna(pd.to_datetime(df['datetime'], format='%d-%m-%Y %H:%M:%S', errors='coerce'))
120
+            # Convert feature columns to numeric, coercing errors to NaN
121
+            for col in features:
122
+                df[col] = pd.to_numeric(df[col], errors='coerce')
123
+            # Set the timestamp as index, resample to 5-minute frequency, and calculate the mean for features
124
+            df = df.set_index('timestamp').resample('5Min')[features].mean() # Resample and calculate mean only for features
125
+            # Estimate missing values (NaN) using linear interpolation
126
+            df = df[features].interpolate() # Estimate missing values using linear interpolation
127
+            # Append the processed DataFrame to the list for the current class
128
+            class_dfs.append(df)
129
+        except FileNotFoundError:
130
+            # Print a warning if a file is not found and skip it
131
+            print(f"Warning: File {filepath} not found and skipped.")
132
+    # If any files were successfully loaded for this class, concatenate them
133
+    if class_dfs:
134
+        dataTrain.append(pd.concat(class_dfs))
135
+
136
+# Concatenate all class DataFrames into a single DataFrame for training
137
+combined_train_data = pd.concat(dataTrain)
138
+
139
+#####################################################################################################
140
+# Data Loading and Preprocessing (Test Data)
141
+#####################################################################################################
142
+
143
+# List to hold DataFrames for test data, organized by class
144
+# Each element in dataTest corresponds to a different class (Normal, Failure Type 1, etc.)
145
+dataTest = []
146
+# Loop through each list of files for each test class
147
+for class_files in datafiles[1]:
148
+    class_dfs = [] # List to hold dataframes for current class
149
+    # Loop through each filename in the current class
150
+    for base_filename in class_files:
151
+        # Construct the full file path
152
+        script_dir = os.path.dirname(os.path.abspath(__file__)) # Get directory of the current script
153
+        data_dir = os.path.join(script_dir, 'data') # Assume data is in a 'data' subdirectory
154
+        filepath = os.path.join(data_dir, f'{base_filename}.csv') # Full path to the CSV file
155
+        try:
156
+            # Read the CSV file into a pandas DataFrame
157
+            df = pd.read_csv(filepath)
158
+            # Convert 'datetime' column to datetime objects using two possible formats, coercing errors
159
+            df['timestamp'] = pd.to_datetime(df['datetime'], format='%m/%d/%Y %H:%M', errors='coerce')
160
+            df['timestamp'] = df['timestamp'].fillna(pd.to_datetime(df['datetime'], format='%d-%m-%Y %H:%M:%S', errors='coerce'))
161
+            # Convert feature columns to numeric, coercing errors to NaN
162
+            for col in features:
163
+                df[col] = pd.to_numeric(df[col], errors='coerce')
164
+            # Set the timestamp as index, resample to 5-minute frequency, and calculate the mean for features
165
+            df = df.set_index('timestamp').resample('5Min')[features].mean() # Resample and calculate mean only for features
166
+            # Estimate missing values (NaN) using linear interpolation
167
+            df = df[features].interpolate() # Estimate missing values using linear interpolation
168
+            # Append the processed DataFrame to the list for the current class
169
+            class_dfs.append(df)
170
+        except FileNotFoundError:
171
+            # Print a warning if a file is not found and skip it
172
+            print(f"Warning: File {filepath} not found and skipped.")
173
+    # If any files were successfully loaded for this class, concatenate them
174
+    if class_dfs:
175
+        dataTest.append(pd.concat(class_dfs))
176
+
177
+#####################################################################################################
178
+# Raw Data Plotting (Optional)
179
+#####################################################################################################
180
+
181
+# Plot raw data if the --plot_raw flag is provided
182
+if options.plot_raw:
183
+    num_features = len(features)
184
+    # Create a figure and a set of subplots (one for each feature)
185
+    fig, axes = plt.subplots(num_features, 1, figsize=(15, 5 * num_features), sharex=True)
186
+    # Ensure axes is an array even if there's only one feature
187
+    if num_features == 1:
188
+        axes = [axes]
189
+    # Loop through each feature
190
+    for i, feature in enumerate(features):
191
+        # Loop through each test data DataFrame (each class)
192
+        for k, df in enumerate(dataTest):
193
+            # Plot the feature data over time for the current class
194
+            axes[i].plot(df.index, df[feature], label=f'Class {k}')
195
+        # Set ylabel and title for the subplot
196
+        axes[i].set_ylabel(f'{featureNames[feature]} {unitNames[feature]}')
197
+        axes[i].set_title(featureNames[feature])
198
+        # Add legend to the subplot
199
+        axes[i].legend()
200
+    # Adjust layout to prevent labels overlapping
201
+    plt.tight_layout()
202
+    # Display the plot
203
+    plt.show()
204
+    # exit(0) # Uncomment to exit after plotting raw data
205
+
206
+########################################################################################################
207
+# Data Scaling
208
+########################################################################################################
209
+
210
+# Initialize the scaler (RobustScaler is less affected by outliers than StandardScaler)
211
+# StandardScaler() # Original scaler
212
+scaler = RobustScaler() # Changed from StandardScaler
213
+
214
+# Fit the scaler on the training data and transform it
215
+# Only the original features are scaled
216
+scaled_train_data = scaler.fit_transform(combined_train_data[features]) # Normalize only the original features
217
+
218
+# Transform the test data using the scaler fitted on the training data
219
+# A list comprehension is used to transform each test DataFrame
220
+scaled_test_data_list = [scaler.transform(df[features]) for df in dataTest] # Normalize only the original features
221
+
222
+# Convert normalized data back to pandas DataFrames for easier handling (optional but can be useful)
223
+scaled_train_df = pd.DataFrame(scaled_train_data, columns=features, index=combined_train_data.index)
224
+scaled_test_df_list = [pd.DataFrame(data, columns=features, index=df.index) for data, df in zip(scaled_test_data_list, dataTest)]
225
+
226
+############################################################################################################
227
+# Sequence Creation with Rate of Change Feature Engineering
228
+############################################################################################################
229
+
230
+# Function to create time sequences from data and append the rate of change as new features
231
+def create_sequences_with_rate_of_change(data, timesteps, original_features_count): # Parameter name indicates count
232
+    sequences = [] # List to store the created sequences
233
+    # Iterate through the data to create overlapping sequences
234
+    for i in range(len(data) - timesteps + 1):
235
+        # Extract a sequence of 'timesteps' length
236
+        sequence = data[i:i + timesteps]
237
+        # Calculate the difference between consecutive points along the time axis (axis=0)
238
+        # This computes the rate of change for each feature across timesteps
239
+        rate_of_change = np.diff(sequence[:timesteps], axis=0)
240
+        # Pad the rate of change to have the same number of timesteps as the original sequence
241
+        # np.diff reduces the number of timesteps by 1, so we add a row of zeros at the beginning
242
+        # Use the count of original features for padding dimension
243
+        padding = np.zeros((1, original_features_count)) # Corrected: Use the features count
244
+        rate_of_change_padded = np.vstack((padding, rate_of_change)) # Stack the padding on top
245
+        # Concatenate the original sequence and the padded rate of change sequence horizontally
246
+        # Resulting sequence has 'timesteps' rows and '2 * original_features_count' columns
247
+        sequences.append(np.hstack((sequence, rate_of_change_padded))) # Concatenate original and rate of change
248
+    # Convert the list of sequences into a NumPy array
249
+    return np.array(sequences)
250
+
251
+# Create time sequences with rate of change for the scaled training data
252
+# The output shape will be (num_training_sequences, timesteps, 2 * n_original_features)
253
+X_train_sequences = create_sequences_with_rate_of_change(scaled_train_df.values, timesteps, n_original_features) # Pass n_original_features
254
+
255
+# Create time sequences with rate of change for each scaled test data DataFrame
256
+# X_test_sequences_list will be a list of arrays, one for each test class
257
+X_test_sequences_list = [create_sequences_with_rate_of_change(df.values, timesteps, n_original_features) for df in scaled_test_df_list] # Pass n_original_features
258
+
259
+############################################################################################################
260
+# K-Means Clustering Model
261
+############################################################################################################
262
+
263
+# Reshape the training sequences for K-Means
264
+# K-Means expects a 2D array (samples, features)
265
+# We flatten each sequence (timesteps * total_features) into a single row
266
+n_samples, n_timesteps, n_total_features = X_train_sequences.shape
267
+X_train_reshaped = X_train_sequences.reshape(n_samples, n_timesteps * n_total_features)
268
+
269
+# Train the K-Means model
270
+# n_clusters: Number of clusters (expected to be number of classes)
271
+# random_state=42: Ensures reproducibility of initial centroids for n_init runs
272
+# n_init=n_init: Runs K-Means n_init times (--n_init, default 10) with different centroid seeds and picks the best result (lowest inertia)
273
+kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=n_init) # n_init to avoid convergence to local optima
274
+# Fit the K-Means model on the reshaped training data
275
+kmeans.fit(X_train_reshaped)
276
+
277
+############################################################################################################################
278
+# Predict Clusters for Test Data
279
+############################################################################################################################
280
+
281
+# List to store predicted cluster labels for each test data DataFrame
282
+cluster_labels_test_list = []
283
+# List to store reshaped test data (useful for evaluation metrics later)
284
+X_test_reshaped_list = []
285
+# kmeans_models = [] # To store kmeans model for each test set (This variable is declared but not used subsequently)
286
+
287
+# Loop through each test data sequence array (each class)
288
+for i, X_test_seq in enumerate(X_test_sequences_list):
289
+    # Get dimensions of the current test sequence array
290
+    n_samples_test, n_timesteps_test, n_total_features_test = X_test_seq.shape
291
+    # Reshape the test sequences for prediction (flatten each sequence)
292
+    X_test_reshaped = X_test_seq.reshape(n_samples_test, n_timesteps_test * n_total_features_test)
293
+    # Predict cluster labels for the reshaped test data
294
+    labels = kmeans.predict(X_test_reshaped)
295
+    # Append the predicted labels and reshaped data to the lists
296
+    cluster_labels_test_list.append(labels)
297
+    X_test_reshaped_list.append(X_test_reshaped) # Append reshaped data
298
+    # kmeans_models.append(kmeans) # Store the trained kmeans model (Variable declared but not used)
299
+
300
+############################################################################################################################
301
+# Plotting Clustered Data (Optional)
302
+############################################################################################################
303
+
304
+# Function to plot the original data points colored by their assigned cluster label
305
+# Plots only the original features
306
+def plot_clustered_data(original_data_list, cluster_labels_list, n_clusters, features, featureNames, unitNames):
307
+    num_features = len(features)
308
+    # Create subplots, one for each original feature
309
+    fig, axes = plt.subplots(num_features, 1, figsize=(15, 5 * num_features), sharex=True)
310
+    # Ensure axes is an array even if only one feature
311
+    if num_features == 1:
312
+        axes = [axes]
313
+    # Generate a color map for the clusters
314
+    colors = plt.cm.viridis(np.linspace(0, 1, n_clusters)) # Assign colors to each cluster
315
+
316
+    # Loop through each original test data DataFrame (each class)
317
+    for k, df in enumerate(original_data_list):
318
+        original_indices = df.index # Get the original time index
319
+        # Get the time index corresponding to the end (last timestep) of each sequence; the first full window ends at position timesteps-1
320
+        time_index = original_indices[timesteps - 1:]
321
+
322
+        # Loop through each original feature
323
+        for i, feature in enumerate(features):
324
+            # Loop through each predicted cluster ID
325
+            for cluster_id in range(n_clusters):
326
+                # Find the indices in the current test data corresponding to the current cluster ID
327
+                cluster_indices_kmeans = np.where(cluster_labels_list[k] == cluster_id)[0]
328
+                # If there are data points assigned to this cluster
329
+                if len(cluster_indices_kmeans) > 0:
330
+                    # Scatter plot the data points for this cluster
331
+                    # x-axis: time_index points corresponding to the sequence end
332
+                    # y-axis: original feature values at those time_index points
333
+                    # color: color assigned to the cluster
334
+                    # label: label for the cluster (only show for the first class (k==0) to avoid redundant legends)
335
+                    # s=10: size of the scatter points
336
+                    axes[i].scatter(time_index[cluster_indices_kmeans], df[feature].loc[time_index[cluster_indices_kmeans]], color=colors[cluster_id], label=f'Cluster {cluster_id}' if k == 0 else "", s=10)
337
+            # Set ylabel and title for the subplot
338
+            axes[i].set_ylabel(f'{featureNames[feature]} {unitNames[feature]}')
339
+            axes[i].set_title(featureNames[feature])
340
+        # Add legend to the last subplot (or each if desired)
341
+        axes[num_features - 1].legend(loc='upper right') # Place legend on the last subplot
342
+
343
+    # Adjust layout and display the plot
344
+    plt.tight_layout()
345
+    plt.show()
346
+
347
+# Call the plotting function if the --plot_clustered flag is provided
348
+if options.plot_clustered:
349
+    plot_clustered_data(dataTest, cluster_labels_test_list, n_clusters, features, featureNames, unitNames)
350
+
351
+#####################################################################################################
352
+# Evaluation and plotting of anomalies and misclassified instances (based on cluster labels)
353
+#####################################################################################################
354
+
355
+# Function to evaluate clustering results and plot anomalies/misclassified instances
356
+def evaluate_and_plot_anomalies(kmeans_model, scaled_test_data_list, n_clusters, original_test_data_list, true_labels_list, features, featureNames, unitNames, plot_anomalies=False, plot_misclassified=False):
357
+    # Lists to store collected data and labels across all test classes
358
+    all_y_true_categorical = [] # Stores true labels (0, 1, 2, ...) for each sequence
359
+    all_predicted_cluster_labels = [] # Stores predicted cluster ID for each sequence
360
+    all_original_test_sequences = [] # Stores the original feature values for each sequence (for plotting)
361
+
362
+    # Lists to store evaluation metrics per test class (before combining)
363
+    inertia_values = [] # Inertia values for each class's data predicted by the model
364
+    silhouette_scores = [] # Silhouette scores for each class's data predicted by the model
365
+
366
+    # Loop through each test class data (scaled, original, and true labels)
367
+    for i, (scaled_test_df, original_test_df, y_true_categorical) in enumerate(zip(scaled_test_data_list, original_test_data_list, true_labels_list)):
368
+        # Create sequences with rate of change for the current scaled test data
369
+        X_test_sequences = create_sequences_with_rate_of_change(scaled_test_df.values, timesteps, n_original_features) # Pass n_original_features
370
+        # Skip evaluation for this class if no sequences were generated (data too short)
371
+        if X_test_sequences.size == 0:
372
+            print(f"Warning: No test sequences generated for class {i}. Skipping evaluation for this class.")
373
+            continue
374
+        # Reshape the sequences for prediction by the trained K-Means model
375
+        n_samples_test = X_test_sequences.shape[0]
376
+        X_test_reshaped = X_test_sequences.reshape(n_samples_test, -1)
377
+        # Predict cluster labels for the current test class data
378
+        cluster_labels_predicted = kmeans_model.predict(X_test_reshaped)
379
+
380
+        # Store the fitted model's inertia_ attribute once per test class; no per-class inertia is computed from the test data
381
+        # Note: inertia_ is the model's final inertia on the training data, so the same value is appended on every iteration
382
+        inertia_values.append(kmeans_model.inertia_) # Note: This seems to append the total model inertia, not per-class inertia. It might be intended to be calculated differently here. Keeping original code logic.
383
+        # Calculate and store Silhouette score if possible (requires >1 unique labels and >0 samples)
384
+        if len(np.unique(cluster_labels_predicted)) > 1 and len(cluster_labels_predicted) > 0:
385
+            silhouette_scores.append(silhouette_score(X_test_reshaped, cluster_labels_predicted))
386
+        else:
387
+            silhouette_scores.append(np.nan) # Append NaN if silhouette cannot be calculated
388
+
389
+        # Get the time indices corresponding to the end of each sequence in the original data
390
+        original_indices = original_test_df.index[timesteps - 1:]
391
+
392
+        # Collect true labels, predicted labels, and original sequences for evaluation/plotting
393
+        # Loop through the sequences generated for the current class
394
+        for j, label in enumerate(y_true_categorical[timesteps - 1:]): # Iterate over true labels corresponding to sequence ends
395
+            all_y_true_categorical.append(label) # Append the true label
396
+            all_predicted_cluster_labels.append(cluster_labels_predicted[j]) # Append the predicted cluster label
397
+            # Get the start and end index in the original DataFrame for the current sequence
398
+            start_index = original_test_df.index.get_loc(original_indices[j]) - (timesteps - 1)
399
+            end_index = start_index + timesteps
400
+            # Extract and append the original feature values for the current sequence
401
+            all_original_test_sequences.append(original_test_df[features].iloc[start_index:end_index].values) # Append
402
+
403
+    # Convert collected lists to NumPy arrays for easier handling
404
+    all_y_true_categorical = np.array(all_y_true_categorical)
405
+    all_predicted_cluster_labels = np.array(all_predicted_cluster_labels)
406
+    all_original_test_sequences = np.array(all_original_test_sequences)
407
+
408
+    # Print evaluation metrics (based on collected values across all test classes)
409
+    print("\nEvaluation Metrics:")
410
+    # Print mean Inertia (likely the final Inertia of the trained model as per the loop)
411
+    print(f"Inertia (final): {np.mean(inertia_values):.4f}") # Check if this is the intended calculation
412
+    # Print mean Silhouette score across classes (ignoring NaNs)
413
+    print(f"Average Silhouette Score (valid cases): {np.nanmean(silhouette_scores):.4f}")
414
+
415
+    # Analyze clusters and assign a dominant true label to each cluster ID
416
+    # This helps in mapping cluster IDs back to meaningful class labels for evaluation
417
+    cluster_dominant_label = {} # Dictionary to store the dominant true label for each cluster ID
418
+    for cluster_id in range(n_clusters): # Loop through each cluster ID
419
+        # Find indices of all sequences assigned to the current cluster ID
420
+        indices_in_cluster = np.where(all_predicted_cluster_labels == cluster_id)[0]
421
+        # If there are sequences in this cluster
422
+        if len(indices_in_cluster) > 0:
423
+            # Get the true labels for all sequences in this cluster
424
+            labels_in_cluster = all_y_true_categorical[indices_in_cluster]
425
+            # If there are labels (and thus samples) in this cluster
426
+            if len(labels_in_cluster) > 0:
427
+                # Find the most frequent true label (dominant label) in this cluster
428
+                dominant_label = np.argmax(np.bincount(labels_in_cluster))
429
+                cluster_dominant_label[cluster_id] = dominant_label # Store the dominant label
430
+            else:
431
+                cluster_dominant_label[cluster_id] = -1 # Assign -1 if no data points have true labels (shouldn't happen if indices_in_cluster > 0 and all_y_true_categorical is aligned)
432
+        else:
433
+            cluster_dominant_label[cluster_id] = -1 # Assign -1 if the cluster is empty
434
+
435
+    # Create predicted labels in numeric form based on the dominant true label of the assigned cluster
436
+    # This maps the predicted cluster ID for each sequence to the dominant true label of that cluster
437
+    predicted_labels_numeric = np.array([cluster_dominant_label.get(cluster_id, -1) for cluster_id in all_predicted_cluster_labels])
438
+
439
+    # Evaluate the clustering's ability to separate classes using classification metrics
440
+    # Only consider instances where a dominant label could be assigned (predicted_labels_numeric != -1)
441
+    valid_indices = predicted_labels_numeric != -1 # Indices where a dominant label mapping exists
442
+    # Perform evaluation if there are valid instances and more than one true class represented
443
+    if np.sum(valid_indices) > 0 and len(np.unique(all_y_true_categorical[valid_indices])) > 1:
444
+        print("\nEvaluation Results (Clusters vs True Labels):")
445
+        # Print classification report (Precision, Recall, F1-score per class, and overall metrics)
446
+        print(classification_report(all_y_true_categorical[valid_indices], predicted_labels_numeric[valid_indices]))
447
+        # Compute the confusion matrix
448
+        cm = confusion_matrix(all_y_true_categorical[valid_indices], predicted_labels_numeric[valid_indices])
449
+        # Plot the confusion matrix using seaborn heatmap
450
+        plt.figure(figsize=(8, 6))
451
+        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues') # annot=True shows values, fmt='d' formats as integers
452
+        plt.xlabel('Predicted Cluster (Dominant True Label)') # Label for x-axis
453
+        plt.ylabel('True Label') # Label for y-axis
454
+        plt.title('Confusion Matrix (Clusters vs True Labels)') # Title of the plot
455
+        plt.show() # Display the plot
456
+    else:
457
+        print("\nCould not perform detailed evaluation (not enough data or classes).")
458
+
459
+    #################################################################################################
460
+    # Plotting Anomalies (Optional)
461
+    #################################################################################################
462
+
463
+    # Plot detected anomalies if the --plot_anomalies flag is provided
464
+    # Anomalies are defined here as instances assigned to clusters whose dominant true label is > 0 (Failure types)
465
+    if plot_anomalies:
466
+        print("\nChecking anomaly data:")
467
+        # Identify clusters that predominantly contain non-normal true labels (failure types)
468
+        anomaly_clusters = [cluster_id for cluster_id, label in cluster_dominant_label.items() if label > 0]
469
+        # Find indices of all sequences assigned to these "anomaly" clusters
470
+        anomaly_indices = np.where(np.isin(all_predicted_cluster_labels, anomaly_clusters))[0]
471
+        # If any anomalies are detected
472
+        if len(anomaly_indices) > 0:
473
+            # Limit the number of anomaly plots to show
474
+            num_anomalies_to_plot = min(5, len(anomaly_indices))
475
+            colors = ['red', 'green', 'blue'] # Define different colors for features
476
+            # Randomly select and plot a few anomaly sequences
477
+            for i in np.random.choice(anomaly_indices, num_anomalies_to_plot, replace=False):
478
+                # Print shape and sample values for the sequence being plotted
479
+                print(f"Shape of all_original_test_sequences[{i}]: {all_original_test_sequences[i].shape}")
480
+                print(f"First few values of all_original_test_sequences[{i}]:\n{all_original_test_sequences[i][:5]}")
481
+                # Create a new figure for each anomaly plot
482
+                plt.figure(figsize=(12, 6))
483
+                # Plot each feature in the sequence over time steps
484
+                for j, feature in enumerate(features):
485
+                    # Plot the feature values (y-axis) against time steps (x-axis)
486
+                    plt.plot(np.arange(timesteps), all_original_test_sequences[i][:, j], label=feature, color=colors[j % len(colors)])
487
+                # Get the true label and predicted cluster for the title
488
+                true_label = all_y_true_categorical[i]
489
+                predicted_cluster_for_title = all_predicted_cluster_labels[i]
490
+                # Set the title for the anomaly plot, including true label and predicted cluster
491
+                plt.title(f'Detected Anomaly (True: {true_label}, Cluster: {predicted_cluster_for_title})') # Corrected title format
492
+                plt.xlabel('Time Step')
493
+                plt.ylabel('Value')
494
+                plt.legend() # Add legend to identify features
495
+                plt.show() # Display the plot
496
+        else:
497
+            print("No anomalies detected based on cluster dominance.")
498
+
499
+    #################################################################################################
500
+    # Plotting Misclassified Instances (Optional)
501
+    #################################################################################################
502
+
503
+    # Plot misclassified instances if the --plot_misclassified flag is provided
504
+    # Misclassified are defined here as instances where the true label is DIFFERENT from the dominant label of the assigned cluster
505
+    if plot_misclassified:
506
+        print("\nChecking misclassified data:")
507
+        # Find indices where the true label does not match the dominant label of the predicted cluster
508
+        misclassified_indices = np.where(all_y_true_categorical != predicted_labels_numeric)[0]
509
+        # If any misclassified instances are found
510
+        if len(misclassified_indices) > 0:
511
+            # Limit the number of misclassified plots to show
512
+            num_misclassified_to_plot = min(5, len(misclassified_indices))
513
+            colors = ['red', 'green', 'blue'] # Define different colors for features
514
+            # Randomly select and plot a few misclassified sequences
515
+            for i in np.random.choice(misclassified_indices, num_misclassified_to_plot, replace=False):
516
+                # Print shape and sample values for the sequence being plotted
517
+                print(f"Shape of all_original_test_sequences[{i}]: {all_original_test_sequences[i].shape}")
518
+                print(f"First few values of all_original_test_sequences[{i}]:\n{all_original_test_sequences[i][:5]}")
519
+                # Create a new figure for each misclassified plot
520
+                plt.figure(figsize=(12, 6))
521
+                # Plot each feature in the sequence over time steps
522
+                for j, feature in enumerate(features):
523
+                    # Plot the feature values (y-axis) against time steps (x-axis)
524
+                    plt.plot(np.arange(timesteps), all_original_test_sequences[i][:, j], label=feature, color=colors[j % len(colors)])
525
+                # FIXED: Get labels using index i for plot title
526
+                true_label = all_y_true_categorical[i] # Get the true label
527
+                predicted_label = predicted_labels_numeric[i] # Get the numeric predicted label (dominant cluster label)
528
+                # Set the title for the misclassified plot, including true label and predicted cluster's dominant label
529
+                plt.title(f'Misclassified Instance (True: {true_label}, Predicted Cluster Dominant Label: {predicted_label})') # Corrected title format
530
+                plt.xlabel('Time Step')
531
+                plt.ylabel('Value')
532
+                plt.legend() # Add legend to identify features
533
+                plt.show() # Display the plot
534
+        else:
535
+            print("No misclassified instances found based on cluster dominance.")
536
+
537
+    # Return the true and predicted labels for potential further use
538
+    return all_y_true_categorical, predicted_labels_numeric
539
+
540
+#####################################################################################################
541
+# Main Execution
542
+#####################################################################################################
543
+
544
+# Create the list of true labels for the test data
545
+# Assign a numeric label (0, 1, 2, ...) to each sequence based on its original file class
546
+true_labels_list = []
547
+for i, df in enumerate(dataTest): # Loop through each test DataFrame (each class)
548
+    # Create a numpy array of the same length as the DataFrame, filled with the class index (i)
549
+    true_labels_list.append(np.full(len(df), i))
550
+
551
+# Call the evaluation and plotting function with the necessary data and options
552
+y_true_final, y_pred_final = evaluate_and_plot_anomalies(kmeans, scaled_test_df_list, n_clusters, dataTest, true_labels_list, features, featureNames, unitNames, plot_anomalies=options.plot_anomalies, plot_misclassified=options.plot_misclassified)
553
+
554
+#####################################################################################################
555
+# Final Evaluation Metrics (on combined test data)
556
+#####################################################################################################
557
+
558
+# Calculate and print final Inertia and Silhouette Score for the combined test data
559
+# Check if there's any reshaped test data available
560
+if X_test_reshaped_list:
561
+    # Vertically stack all reshaped test data arrays into a single array
562
+    X_test_combined_reshaped = np.vstack(X_test_reshaped_list)
563
+    # Concatenate all predicted cluster labels into a single array
564
+    all_cluster_labels_test = np.concatenate(cluster_labels_test_list)
565
+
566
+    # Print K-Means evaluation metrics on the combined test data
567
+    print("\nK-Means Model Evaluation on Combined Test Data:")
568
+    # Print the inertia stored on the fitted K-Means model (kmeans.inertia_ is set during fit; it is not recomputed on the test data)
569
+    print(f"Inertia: {kmeans.inertia_:.4f}")
570
+
571
+    # Calculate and print Silhouette Score if possible
572
+    # Requires more than one unique predicted label and at least one sample
573
+    if len(np.unique(all_cluster_labels_test)) > 1 and len(all_cluster_labels_test) > 0:
574
+        silhouette = silhouette_score(X_test_combined_reshaped, all_cluster_labels_test)
575
+        print(f"Silhouette Score: {silhouette:.4f}")
576
+    else:
577
+        print("Silhouette Score: Not applicable for single cluster.")
578
+else:
579
+    # Print a message if no test data sequences were available for evaluation
580
+    print("\nNo test data sequences available to evaluate Inertia and Silhouette Score.")

+ 646
- 0
kmeans_anomaly/V8_kmeans_anomaly_with_clear_comment.py 파일 보기

@@ -0,0 +1,646 @@
1
+# Import necessary libraries for data manipulation, numerical operations, plotting, and machine learning
2
+import pandas as pd # For data manipulation and analysis (DataFrames)
3
+import numpy as np # For numerical operations, especially array handling
4
+import matplotlib.pyplot as plt # For creating static, interactive, and animated visualizations
5
+from sklearn.cluster import KMeans # K-Means clustering algorithm
6
+# Import preprocessing tools and metrics from scikit-learn
7
+from sklearn.preprocessing import StandardScaler, LabelEncoder # StandardScaler is used for scaling (RobustScaler was used in another version)
8
+from sklearn.metrics import silhouette_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix, classification_report # Evaluation metrics
9
+import argparse # For parsing command line arguments
10
+import os # For interacting with the operating system, like manipulating file paths
11
+import seaborn as sns # For drawing attractive statistical graphics (used for confusion matrix heatmap)
12
+
13
+# --- Command line arguments setup ---
14
+# This section defines and parses command-line arguments to control script behavior
15
+parser = argparse.ArgumentParser(description='Anomaly detection using K-Means clustering with visualization.')
16
+
17
+# Define arguments with their expected data type, default value, and a help message
18
+parser.add_argument('--timesteps', type=int, default=30, help='Number of timesteps for sequences.') # Length of time window for each sequence
19
+parser.add_argument('--n_clusters', type=int, default=5, help='Number of clusters for K-Means (should match the number of failure types + normal).') # Number of clusters
20
+parser.add_argument('--n_init', type=int, default=10, help='Number of initializations for K-Means.') # Number of times K-Means algorithm will be run with different centroid seeds
21
+parser.add_argument('--transition', action='store_true', help='Use transition data for testing.') # Flag: if present, use transition test data files
22
+# Plotting flags: action='store_true' means the variable becomes True if the flag is present
23
+parser.add_argument('--plot_raw', action='store_true', help='Plot raw data.')
24
+parser.add_argument('--plot_clustered', action='store_true', help='Plot clustered data.')
25
+parser.add_argument('--plot_anomalies', action='store_true', help='Plot detected anomalies (based on clusters).')
26
+parser.add_argument('--plot_misclassified', action='store_true', help='Plot misclassified instances (based on clusters).')
27
+
28
+# Parse the arguments provided by the user when running the script
29
+options = parser.parse_args()
30
+
31
+# Assign the parsed arguments to variables for easier access throughout the script
32
+n_clusters = options.n_clusters
33
+timesteps = options.timesteps
34
+n_init = options.n_init
35
+
36
+#####################################################################################################
37
+# --- Data File Configuration ---
38
+# This section defines the list of data files to be used for training and testing
39
+#####################################################################################################
40
+
41
+# Specify the number of failure types data is available for (excluding the normal state, which is class 0)
42
+NumberOfFailures = 4 # So far, we have only data for the first 4 types of failures
43
+
44
+# Initialize a nested list to store filenames for training and testing data
45
+# datafiles[0] will store training files, datafiles[1] will store testing files
46
+# Each inner list datafiles[train/test][i] corresponds to class i (0 for Normal, 1 to NumberOfFailures for failure types)
47
+datafiles = [[], []] # datafiles[0] for train data filenames, datafiles[1] for test data filenames
48
+# Populate the inner lists. We need NumberOfFailures + 1 inner lists for classes (0 to 4).
49
+for i in range(NumberOfFailures + 1):
50
+    datafiles[0].append([]) # Add an empty list for the current class's training files
51
+    datafiles[1].append([]) # Add an empty list for the current class's testing files
52
+
53
+# Manually assign the base filenames for each class for the training set
54
+# These filenames are expected to be in a 'data' subdirectory relative to the script
55
+datafiles[0][0] = ['2024-08-07_5_', '2024-08-08_5_', '2025-01-25_5_', '2025-01-26_5_'] # Normal training data files (Class 0)
56
+datafiles[0][1] = ['2024-12-11_5_', '2024-12-12_5_', '2024-12-13_5_'] # Failure Type 1 training data files (Class 1)
57
+datafiles[0][2] = ['2024-12-18_5_', '2024-12-21_5_', '2024-12-22_5_', '2024-12-23_5_', '2024-12-24_5_'] # Failure Type 2 training data files (Class 2)
58
+datafiles[0][3] = ['2024-12-28_5_', '2024-12-29_5_', '2024-12-30_5_'] # Failure Type 3 training data files (Class 3)
59
+datafiles[0][4] = ['2025-02-13_5_', '2025-02-14_5_'] # Failure Type 4 training data files (Class 4)
60
+
61
+# Assign base filenames for the testing set based on the --transition flag
62
+if options.transition:
63
+    # Use test files that are specified as including transition periods
64
+    datafiles[1][0] = ['2025-01-27_5_', '2025-01-28_5_'] # Normal test data files (Class 0)
65
+    datafiles[1][1] = ['2024-12-14_5_', '2024-12-15_5_', '2024-12-16_5_'] # Failure Type 1 test data files (with TRANSITION) (Class 1)
66
+    datafiles[1][2] = ['2024-12-17_5_', '2024-12-19_5_', '2024-12-25_5_', '2024-12-26_5_'] # Failure Type 2 test data files (with TRANSITION) (Class 2)
67
+    datafiles[1][3] = ['2024-12-27_5_', '2024-12-31_5_', '2025-01-01_5_'] # Failure Type 3 test data files (with TRANSITION) (Class 3)
68
+    datafiles[1][4] = ['2025-02-12_5_', '2025-02-15_5_', '2025-02-16_5_'] # Failure Type 4 test data files (Class 4)
69
+else:
70
+    # Use test files that are specified as not including transition periods
71
+    datafiles[1][0] = ['2025-01-27_5_', '2025-01-28_5_'] # Normal test data files (Class 0)
72
+    datafiles[1][1] = ['2024-12-14_5_', '2024-12-15_5_'] # Failure Type 1 test data files (Class 1)
73
+    datafiles[1][2] = ['2024-12-19_5_', '2024-12-25_5_', '2024-12-26_5_'] # Failure Type 2 test data files (Class 2)
74
+    datafiles[1][3] = ['2024-12-31_5_', '2025-01-01_5_'] # Failure Type 3 test data files (Class 3)
75
+    datafiles[1][4] = ['2025-02-15_5_', '2025-02-16_5_'] # Failure Type 4 test data files (Class 4)
76
+
77
+# --- Feature Definition ---
78
+# Define the list of features (column names) to be extracted from the data files
79
# Column names of the features extracted from every data file.
features = ['r1 s1', 'r1 s4', 'r1 s5']
# Number of original features; needed later when building sequences.
n_original_features = len(features)

# Display names for plot labels, written as matplotlib mathtext ($...$).
# The previous values were escaped HTML (<span class="math-inline">...</span>),
# which matplotlib would render literally instead of as a subscripted symbol.
featureNames = {
    'r1 s1': r'$T_{evap}$',  # Evaporator temperature
    'r1 s4': r'$T_{cond}$',  # Condenser temperature
    'r1 s5': r'$T_{air}$',   # Air temperature
}

# Units shown next to each feature name on plot axes (degrees Celsius).
unitNames = {
    'r1 s1': r'($^o$C)',
    'r1 s4': r'($^o$C)',
    'r1 s5': r'($^o$C)',
}

# Redundant variable, but kept from original code
NumFeatures = len(features)
96
+
97
+#####################################################################################################
98
+# --- Data Loading and Preprocessing (Training Data) ---
99
+# This section loads, cleans, and preprocesses the training data
100
+#####################################################################################################
101
+
102
+# List to hold processed DataFrames for training data, organized by class
103
+# Each element dataTrain[i] is a DataFrame containing concatenated data for class i
104
+dataTrain = []
105
+# Loop through each list of filenames for each training class (Class 0, 1, 2, 3, 4)
106
+for class_files in datafiles[0]:
107
+    class_dfs = [] # Temporary list to hold DataFrames loaded for the current class
108
+    # Loop through each base filename in the current class's list
109
+    for base_filename in class_files:
110
+        # Construct the full absolute file path assuming data is in a 'data' subdirectory
111
+        script_dir = os.path.dirname(os.path.abspath(__file__)) # Get the directory where the current script is located
112
+        data_dir = os.path.join(script_dir, 'data') # Define the path to the 'data' subdirectory
113
+        filepath = os.path.join(data_dir, f'{base_filename}.csv') # Combine directory and filename
114
+
115
+        try:
116
+            # Read the CSV file into a pandas DataFrame
117
+            df = pd.read_csv(filepath)
118
+            # Convert the 'datetime' column to datetime objects. Try two formats and coerce errors (set invalid parsing to NaT).
119
+            df['timestamp'] = pd.to_datetime(df['datetime'], format='%m/%d/%Y %H:%M', errors='coerce')
120
+            # Fill any NaT values from the first attempt by trying a second format.
121
+            df['timestamp'] = df['timestamp'].fillna(pd.to_datetime(df['datetime'], format='%d-%m-%Y %H:%M:%S', errors='coerce'))
122
+            # Convert the specified feature columns to numeric type. Coerce errors (set invalid parsing to NaN).
123
+            for col in features:
124
+                df[col] = pd.to_numeric(df[col], errors='coerce')
125
+            # Set the 'timestamp' column as the DataFrame index
126
+            # Resample the data to a 5-minute frequency ('5Min').
127
+            # Select only the specified 'features' columns.
128
+            # Calculate the mean for each 5-minute interval.
129
+            df = df.set_index('timestamp').resample('5Min')[features].mean() # Set index, resample, and calculate mean for features
130
+            # Estimate any remaining missing values (NaN) within the feature columns using linear interpolation
131
+            df = df[features].interpolate() # Estimate missing values using linear interpolation
132
+            # Append the processed DataFrame to the list for the current class
133
+            class_dfs.append(df)
134
+        except FileNotFoundError:
135
+            # If a file is not found, print a warning and skip processing it
136
+            print(f"Warning: File {filepath} not found and skipped.")
137
+    # If the list of DataFrames for the current class is not empty, concatenate them into a single DataFrame
138
+    if class_dfs:
139
+        dataTrain.append(pd.concat(class_dfs))
140
+
141
+# Concatenate all DataFrames from all training classes into a single large DataFrame for training the scaler and the model
142
+combined_train_data = pd.concat(dataTrain)
143
+
144
+#####################################################################################################
145
+# --- Data Loading and Preprocessing (Test Data) ---
146
+# This section loads, cleans, and preprocesses the testing data
147
+# The process is identical to the training data loading, but done separately for test files
148
+#####################################################################################################
149
+
150
+# List to hold processed DataFrames for test data, organized by class
151
+# Each element dataTest[i] is a DataFrame containing concatenated data for class i
152
+dataTest = []
153
+# Loop through each list of filenames for each test class (Class 0, 1, 2, 3, 4)
154
+for class_files in datafiles[1]:
155
+    class_dfs = [] # Temporary list to hold DataFrames loaded for the current class
156
+    # Loop through each base filename in the current class's list
157
+    for base_filename in class_files:
158
+        # Construct the full absolute file path
159
+        script_dir = os.path.dirname(os.path.abspath(__file__))
160
+        data_dir = os.path.join(script_dir, 'data')
161
+        filepath = os.path.join(data_dir, f'{base_filename}.csv')
162
+
163
+        try:
164
+            # Read the CSV file into a pandas DataFrame
165
+            df = pd.read_csv(filepath)
166
+            # Convert the 'datetime' column to datetime objects
167
+            df['timestamp'] = pd.to_datetime(df['datetime'], format='%m/%d/%Y %H:%M', errors='coerce')
168
+            df['timestamp'] = df['timestamp'].fillna(pd.to_datetime(df['datetime'], format='%d-%m-%Y %H:%M:%S', errors='coerce'))
169
+            # Convert the specified feature columns to numeric type
170
+            for col in features:
171
+                df[col] = pd.to_numeric(df[col], errors='coerce')
172
+            # Set the 'timestamp' column as the index and resample to 5-minute frequency, calculating the mean
173
+            df = df.set_index('timestamp').resample('5Min')[features].mean() # Set index, resample, and calculate mean for features
174
+            # Estimate any remaining missing values (NaN) using linear interpolation
175
+            df = df[features].interpolate() # Estimate missing values using linear interpolation
176
+            # Append the processed DataFrame to the list for the current class
177
+            class_dfs.append(df)
178
+        except FileNotFoundError:
179
+            # If a file is not found, print a warning and skip processing it
180
+            print(f"Warning: File {filepath} not found and skipped.")
181
+    # If the list of DataFrames for the current class is not empty, concatenate them into a single DataFrame
182
+    if class_dfs:
183
+        dataTest.append(pd.concat(class_dfs))
184
+
185
+#####################################################################################################
186
+# --- Raw Data Plotting (Optional) ---
187
+# This section plots the unprocessed data for visualization if the --plot_raw flag is set
188
+#####################################################################################################
189
+
190
+# Check if the plot_raw argument was set to True
191
+if options.plot_raw:
192
+    num_features = len(features) # Get the number of features to determine plot layout
193
+    # Create a figure and a grid of subplots. Share the x-axis among all subplots.
194
+    fig, axes = plt.subplots(num_features, 1, figsize=(15, 5 * num_features), sharex=True)
195
+    # Ensure 'axes' is always an array, even if there's only one feature (and thus only one subplot)
196
+    if num_features == 1:
197
+        axes = [axes]
198
+    # Loop through each feature by index (i) and name (feature)
199
+    for i, feature in enumerate(features):
200
+        # Loop through each test data DataFrame (representing a different class)
201
+        for k, df in enumerate(dataTest):
202
+            # Plot the data for the current feature and class over time
203
+            # Use f-string to include the class number in the label
204
+            axes[i].plot(df.index, df[feature], label=f'Class {k}')
205
+        # Set the label for the y-axis using the feature's display name and unit
206
+        axes[i].set_ylabel(f'{featureNames[feature]} {unitNames[feature]}')
207
+        # Set the title for the subplot using the feature's display name
208
+        axes[i].set_title(featureNames[feature])
209
+        # Add a legend to identify the classes being plotted in each subplot
210
+        # Placing it on the last subplot is common to avoid repetition
211
+        axes[num_features - 1].legend(loc='upper right') # Place legend on the last subplot
212
+
213
+    # Adjust layout to prevent plot elements (like labels) from overlapping
214
+    plt.tight_layout()
215
+    # Display the plot window
216
+    plt.show()
217
+    # exit(0) # Uncomment this line if you want the script to stop after showing raw data plots
218
+
219
+########################################################################################################
220
+# --- Data Scaling ---
221
+# This section scales the data using StandardScaler
222
+########################################################################################################
223
+
224
+# Initialize the StandardScaler. This scaler standardizes features by removing the mean and scaling to unit variance.
225
+scaler = StandardScaler() # Using StandardScaler in this version
226
+
227
+# Fit the scaler on the training data and then transform the training data.
228
+# The scaler learns the mean and standard deviation from the combined_train_data[features].
229
+scaled_train_data = scaler.fit_transform(combined_train_data[features]) # Fit on training, transform training
230
+
231
+# Transform each test data DataFrame using the scaler fitted on the training data.
232
+# The same scaling parameters (mean, standard deviation) from the training data are applied to the test data.
233
+# The loop below applies the transformation to each DataFrame in dataTest in turn.
234
+scaled_test_data_list = []
235
+for df in dataTest: # Loop through each test DataFrame
236
+    scaled_test_data_list.append(scaler.transform(df[features])) # Transform each test DataFrame
237
+
238
+# Convert the scaled NumPy arrays back into pandas DataFrames.
239
+# This step is optional but can be helpful for inspection and keeping track of timestamps/column names.
240
+scaled_train_df = pd.DataFrame(scaled_train_data, columns=features, index=combined_train_data.index)
241
+# Create a list of scaled test DataFrames, maintaining original indices and column names.
242
+scaled_test_df_list = [pd.DataFrame(data, columns=features, index=df.index) for data, df in zip(scaled_test_data_list, dataTest)]
243
+
244
+############################################################################################################
245
+# --- Sequence Creation with Rate of Change Feature Engineering ---
246
+# This section defines a function to create time sequences and adds rate of change features
247
+############################################################################################################
248
+
249
+# Function to create time sequences (windows) from data and append rate of change features
250
+# 'data': Input NumPy array (scaled feature values)
251
+# 'timesteps': The length of each sequence (number of time points)
252
+# 'original_features': The number of features in the input data (used for padding) - NOTE: Parameter name is potentially misleading, should be count
253
def create_sequences_with_rate_of_change(data, timesteps, original_features):
    """Build overlapping windows of ``data`` with rate-of-change columns appended.

    For every start index a window of ``timesteps`` consecutive rows is taken.
    The first difference between consecutive rows of that window is computed,
    a zero row is put in front so it has ``timesteps`` rows again, and it is
    stacked to the right of the window.

    Args:
        data: 2D array-like of shape (n_rows, n_features).
        timesteps: Number of rows in each window.
        original_features: Kept for backward compatibility with existing
            callers and no longer used. The padding width is taken from
            ``data`` itself, because it has to equal the number of columns
            of ``data`` for the stacking to work at all.

    Returns:
        Array of shape (n_rows - timesteps + 1, timesteps, 2 * n_features).
        When ``data`` has fewer than ``timesteps`` rows the result is an empty
        array of shape (0, timesteps, 2 * n_features), so callers can still
        unpack three dimensions from ``.shape``.

    Raises:
        ValueError: If ``data`` is not two-dimensional.
    """
    data = np.asarray(data)
    if data.ndim != 2:
        raise ValueError(
            f"data must be 2D (rows, features); got {data.ndim} dimension(s)"
        )

    n_features = data.shape[1]
    n_sequences = len(data) - timesteps + 1
    if n_sequences <= 0:
        # Not enough rows for a single window: keep the 3D layout instead of
        # returning a shape-(0,) array.
        return np.empty((0, timesteps, 2 * n_features))

    # np.diff drops one row, so one zero row restores the window length.
    padding = np.zeros((1, n_features))

    sequences = []
    for start in range(n_sequences):
        window = data[start:start + timesteps]
        # Difference between consecutive time points: (timesteps - 1, n_features).
        rate_of_change = np.diff(window, axis=0)
        rate_of_change_padded = np.vstack((padding, rate_of_change))
        # Each combined window has shape (timesteps, 2 * n_features).
        sequences.append(np.hstack((window, rate_of_change_padded)))

    # Stack the 2D windows into one 3D array.
    return np.array(sequences)
276
+
277
+# Create time sequences with rate of change for the scaled training data.
278
+# The output is a 3D array: (number of sequences, timesteps, 2 * n_original_features).
279
+X_train_sequences = create_sequences_with_rate_of_change(scaled_train_df.values, timesteps, n_original_features) # Pass n_original_features (correctly passes count)
280
+
281
+# Create time sequences with rate of change for each scaled test data DataFrame.
282
+# This results in a list where each element is a 3D array of sequences for a specific test class.
283
+X_test_sequences_list = [create_sequences_with_rate_of_change(df.values, timesteps, n_original_features) for df in scaled_test_df_list] # Pass n_original_features (correctly passes count)
284
+
285
+############################################################################################################
286
+# --- K-Means Clustering Model ---
287
+# This section initializes, trains, and applies the K-Means model
288
+############################################################################################################
289
+
290
+# Reshape the training sequences into a 2D array for the K-Means algorithm.
291
+# K-Means expects data in the shape (number of samples, number of features).
292
+# Each sequence (timesteps * total_features) is flattened into a single row for clustering.
293
+n_samples, n_timesteps, n_total_features = X_train_sequences.shape # Get dimensions of the sequence array
294
+X_train_reshaped = X_train_sequences.reshape(n_samples, n_timesteps * n_total_features) # Flatten each sequence
295
+
296
+# Initialize the KMeans model.
297
+# n_clusters: The desired number of clusters (set by command line argument).
298
+# random_state=42: Sets the seed for random number generation for initial centroids, ensuring reproducibility.
299
+# n_init: Number of times the K-Means algorithm is run with different centroid initializations (set by command line argument, default 10); the best result based on inertia is selected.
300
+kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=n_init) # Initialize KMeans model
301
+# Train (fit) the KMeans model on the reshaped training data.
302
+kmeans.fit(X_train_reshaped)
303
+
304
############################################################################################################################
# --- Predict Clusters for Test Data ---
# Assign every test window to one of the clusters learned from the training data.
############################################################################################################################

# Both lists are index-aligned with X_test_sequences_list (one entry per test class).
X_test_reshaped_list = []      # flattened windows, reused later for the silhouette score
cluster_labels_test_list = []  # predicted cluster id of every window
kmeans_models = []             # never populated; kept only so the module-level name still exists

for X_test_seq in X_test_sequences_list:
    # Flatten (samples, timesteps, features) -> (samples, timesteps * features),
    # the same layout the model was fitted on.
    n_samples_test, n_timesteps_test, n_total_features_test = X_test_seq.shape
    flat_width = n_timesteps_test * n_total_features_test
    X_test_reshaped = X_test_seq.reshape(n_samples_test, flat_width)

    # Nearest-centroid assignment with the already-fitted model.
    labels = kmeans.predict(X_test_reshaped)

    X_test_reshaped_list.append(X_test_reshaped)
    cluster_labels_test_list.append(labels)
328
+
329
############################################################################################################################
# --- Plotting Clustered Data (Optional) ---
# Scatter-plots the original test data, colored according to the predicted cluster.
############################################################################################################################

def plot_clustered_data(original_data_list, cluster_labels_list, n_clusters, features, featureNames, unitNames):
    """Scatter-plot each feature of the test data, colored by predicted cluster.

    One subplot is created per feature (stacked vertically, sharing the
    x-axis). Every predicted label is drawn at the index value of the row
    that closes its window, i.e. label ``p`` of a DataFrame is plotted at row
    ``p + timesteps - 1``.

    Reads the module-level ``timesteps``.

    Args:
        original_data_list: Test DataFrames, one per class. The caller passes
            the original (unscaled) data.
        cluster_labels_list: Predicted cluster-id arrays, index-aligned with
            ``original_data_list``.
        n_clusters: Total number of clusters; fixes the color palette.
        features: Column names to plot (one subplot each).
        featureNames: Mapping from column name to display name.
        unitNames: Mapping from column name to unit string.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    num_features = len(features)
    # num_features rows, one column.
    _, axes = plt.subplots(num_features, 1, figsize=(15, 5 * num_features), sharex=True)
    if num_features == 1:
        # plt.subplots returns a bare Axes for a 1x1 grid; normalise to a list.
        axes = [axes]
    colors = plt.cm.viridis(np.linspace(0, 1, n_clusters))

    # Axis titles/labels depend only on the feature, so set them once.
    for ax, feature in zip(axes, features):
        ax.set_ylabel(f'{featureNames[feature]} {unitNames[feature]}')
        ax.set_title(featureNames[feature])

    window_offset = timesteps - 1
    # Clusters that already own a legend entry. A cluster is labelled the
    # first time it is drawn, whichever test class that happens in; labelling
    # only the first class would hide clusters that never occur in it.
    labelled_clusters = set()

    for k, df in enumerate(original_data_list):
        time_index = df.index[window_offset:]
        class_labels = np.asarray(cluster_labels_list[k])

        # Membership is identical for every feature, so compute it once per class.
        members = {}
        for cluster_id in range(n_clusters):
            positions = np.where(class_labels == cluster_id)[0]
            if len(positions) > 0:
                members[cluster_id] = positions

        for i, feature in enumerate(features):
            # Positional access keeps this correct even if the index has duplicates.
            values = df[feature].to_numpy()
            for cluster_id, positions in members.items():
                legend_label = "" if cluster_id in labelled_clusters else f'Cluster {cluster_id}'
                axes[i].scatter(
                    time_index[positions],
                    values[positions + window_offset],
                    color=colors[cluster_id],
                    label=legend_label,
                    s=10,
                )

        labelled_clusters.update(members)

    # A single legend on the last subplot; skipped when nothing was drawn.
    if labelled_clusters:
        axes[-1].legend(loc='upper right')

    plt.tight_layout()
    plt.show()
381
+
382
# Draw the cluster-colored scatter plots only when the plot_clustered option is set.
if options.plot_clustered:
    plot_clustered_data(
        original_data_list=dataTest,
        cluster_labels_list=cluster_labels_test_list,
        n_clusters=n_clusters,
        features=features,
        featureNames=featureNames,
        unitNames=unitNames,
    )
385
+
386
#####################################################################################################
# --- Evaluation and plotting of anomalies and misclassified instances (based on cluster labels) ---
# Treats the clustering as a classifier (cluster -> dominant true label) and reports/plots the result.
#####################################################################################################

def _safe_silhouette(samples, cluster_labels):
    """Return the silhouette score, or NaN when scikit-learn cannot compute it."""
    n_distinct = len(np.unique(cluster_labels))
    # silhouette_score requires 2 <= n_labels <= n_samples - 1 and raises otherwise.
    if 2 <= n_distinct < len(cluster_labels):
        return silhouette_score(samples, cluster_labels)
    return np.nan


def _plot_sequence_samples(sequences, candidate_indices, features, title_for, max_plots=5):
    """Plot up to ``max_plots`` randomly chosen windows, one figure per window."""
    colors = ['red', 'green', 'blue']  # cycled over the features
    n_to_plot = min(max_plots, len(candidate_indices))
    for idx in np.random.choice(candidate_indices, n_to_plot, replace=False):
        window = sequences[idx]
        print(f"Shape of all_original_test_sequences[{idx}]: {window.shape}")
        print(f"First few values of all_original_test_sequences[{idx}]:\n{window[:5]}")
        plt.figure(figsize=(12, 6))
        steps = np.arange(len(window))
        for col, feature in enumerate(features):
            plt.plot(steps, window[:, col], label=feature, color=colors[col % len(colors)])
        plt.title(title_for(idx))
        plt.xlabel('Time Step')
        plt.ylabel('Value')
        plt.legend()
        plt.show()


def evaluate_and_plot_anomalies(kmeans_model, scaled_test_data_list, n_clusters, original_test_data_list, true_labels_list, features, featureNames, unitNames, plot_anomalies=False, plot_misclassified=False):
    """Evaluate the clustering against true class labels and optionally plot samples.

    For every test class the scaled data is turned into windows, the windows
    are assigned to clusters, and each cluster is then mapped to the true
    label that occurs most often inside it. Those mapped labels are compared
    with the true labels (classification report + confusion-matrix heatmap).

    Reads the module-level ``timesteps``, ``n_original_features`` and
    ``create_sequences_with_rate_of_change``.
    NOTE(review): the label alignment assumes the sequence builder returns one
    window per row from row ``timesteps - 1`` onwards - confirm against its
    definition.

    Args:
        kmeans_model: Fitted K-Means estimator.
        scaled_test_data_list: Scaled test DataFrames, one per class.
        n_clusters: Number of clusters of ``kmeans_model``.
        original_test_data_list: Original test DataFrames, index-aligned with
            ``scaled_test_data_list``; used only for plotting raw values.
        true_labels_list: One array of non-negative integer class labels per
            test DataFrame (one label per row).
        features: Column names of the original features.
        featureNames: Unused; accepted for signature compatibility.
        unitNames: Unused; accepted for signature compatibility.
        plot_anomalies: Plot up to five random windows that fell into a
            cluster whose dominant true label is greater than 0.
        plot_misclassified: Plot up to five random windows whose true label
            differs from the dominant label of their cluster.

    Returns:
        Tuple ``(all_y_true_categorical, predicted_labels_numeric)``: the true
        label and the cluster-dominant label of every evaluated window.
    """
    all_y_true_categorical = []        # true class label of every window
    all_predicted_cluster_labels = []  # predicted cluster id of every window
    all_original_test_sequences = []   # raw feature values of every window (for plotting)
    silhouette_scores = []             # one score (or NaN) per evaluated class

    window_offset = timesteps - 1

    for i, (scaled_test_df, original_test_df, y_true_categorical) in enumerate(zip(scaled_test_data_list, original_test_data_list, true_labels_list)):
        X_test_sequences = create_sequences_with_rate_of_change(scaled_test_df.values, timesteps, n_original_features)
        if X_test_sequences.size == 0:
            print(f"Warning: No test sequences generated for class {i}. Skipping evaluation for this class.")
            continue

        # Flatten each window into one row, as expected by the model.
        X_test_reshaped = X_test_sequences.reshape(X_test_sequences.shape[0], -1)
        cluster_labels_predicted = kmeans_model.predict(X_test_reshaped)
        silhouette_scores.append(_safe_silhouette(X_test_reshaped, cluster_labels_predicted))

        # Extract the raw feature matrix once; window j covers rows j .. j + timesteps - 1,
        # so no per-window index lookup is needed (and duplicate index values are harmless).
        original_feature_values = original_test_df[features].values
        for j, label in enumerate(y_true_categorical[window_offset:]):
            all_y_true_categorical.append(label)
            all_predicted_cluster_labels.append(cluster_labels_predicted[j])
            all_original_test_sequences.append(original_feature_values[j:j + timesteps])

    all_y_true_categorical = np.array(all_y_true_categorical)
    all_predicted_cluster_labels = np.array(all_predicted_cluster_labels)
    all_original_test_sequences = np.array(all_original_test_sequences)

    print("\nEvaluation Metrics:")
    # inertia_ is a property of the fitted model (training data), not of the test data.
    print(f"Inertia (final): {kmeans_model.inertia_:.4f}")
    valid_scores = [score for score in silhouette_scores if not np.isnan(score)]
    mean_silhouette = np.mean(valid_scores) if valid_scores else np.nan
    print(f"Average Silhouette Score (valid cases): {mean_silhouette:.4f}")

    # --- Map every cluster id to its most frequent true label (-1 for empty clusters) ---
    cluster_dominant_label = {}
    for cluster_id in range(n_clusters):
        labels_in_cluster = all_y_true_categorical[all_predicted_cluster_labels == cluster_id]
        if labels_in_cluster.size > 0:
            cluster_dominant_label[cluster_id] = np.argmax(np.bincount(labels_in_cluster))
        else:
            cluster_dominant_label[cluster_id] = -1

    # --- Turn cluster ids into class predictions via the dominant label ---
    predicted_labels_numeric = np.array([cluster_dominant_label.get(cluster_id, -1) for cluster_id in all_predicted_cluster_labels])

    # --- Classification-style evaluation on windows that received a real label ---
    valid_indices = predicted_labels_numeric != -1
    y_true_valid = all_y_true_categorical[valid_indices]
    y_pred_valid = predicted_labels_numeric[valid_indices]
    if y_true_valid.size > 0 and len(np.unique(y_true_valid)) > 1:
        print("\nEvaluation Results (Clusters vs True Labels):")
        print(classification_report(y_true_valid, y_pred_valid))
        # Same label set confusion_matrix would infer; passed explicitly so the
        # heatmap ticks show real class ids instead of positions.
        class_ids = np.unique(np.concatenate([y_true_valid, y_pred_valid]))
        cm = confusion_matrix(y_true_valid, y_pred_valid, labels=class_ids)
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_ids, yticklabels=class_ids)
        plt.xlabel('Predicted Cluster (Dominant True Label)')
        plt.ylabel('True Label')
        plt.title('Confusion Matrix (Clusters vs True Labels)')
        plt.show()
    else:
        print("\nCould not perform detailed evaluation (not enough data or classes with assigned dominant labels).")

    #################################################################################################
    # --- Plotting Anomalies (Optional) ---
    #################################################################################################
    if plot_anomalies:
        print("\nChecking anomaly data:")
        # A cluster counts as anomalous when its dominant true label is greater than 0.
        anomaly_clusters = [cluster_id for cluster_id, label in cluster_dominant_label.items() if label > 0]
        anomaly_indices = np.where(np.isin(all_predicted_cluster_labels, anomaly_clusters))[0]
        if len(anomaly_indices) > 0:
            def anomaly_title(_idx):
                return 'Detected Anomalies (based on cluster dominance)'

            _plot_sequence_samples(all_original_test_sequences, anomaly_indices, features, anomaly_title)
        else:
            print("No anomalies detected based on cluster dominance.")

    #################################################################################################
    # --- Plotting Misclassified Instances (Optional) ---
    #################################################################################################
    if plot_misclassified:
        print("\nChecking misclassified data:")
        misclassified_indices = np.where(all_y_true_categorical != predicted_labels_numeric)[0]
        if len(misclassified_indices) > 0:
            def misclassified_title(idx):
                true_label = all_y_true_categorical[idx]
                predicted_label = predicted_labels_numeric[idx]
                return f'Misclassified Instance (True: {true_label}, Predicted Cluster: {predicted_label})'

            _plot_sequence_samples(all_original_test_sequences, misclassified_indices, features, misclassified_title)
        else:
            print("No misclassified instances found based on cluster dominance.")

    return all_y_true_categorical, predicted_labels_numeric
596
+
597
#####################################################################################################
# --- Main Execution Flow ---
# Builds the ground-truth labels and runs the evaluation.
#####################################################################################################

# Ground truth: every row of the i-th test DataFrame is tagged with i, its
# position in dataTest (presumably 0 = normal, 1.. = failure types; verify
# against the order in which dataTest is loaded).
true_labels_list = [np.full(len(df), i) for i, df in enumerate(dataTest)]

# Predict on the test data, print the metrics and draw the optional plots.
y_true_final, y_pred_final = evaluate_and_plot_anomalies(
    kmeans,
    scaled_test_df_list,
    n_clusters,
    dataTest,
    true_labels_list,
    features,
    featureNames,
    unitNames,
    plot_anomalies=options.plot_anomalies,
    plot_misclassified=options.plot_misclassified,
)
614
+
615
#####################################################################################################
# --- Final Evaluation Metrics (on combined test data) ---
# Prints the model inertia and the silhouette score over all test windows together.
#####################################################################################################

if X_test_reshaped_list:
    # Stack the flattened windows and the predicted labels of all test classes.
    X_test_combined_reshaped = np.vstack(X_test_reshaped_list)
    all_cluster_labels_test = np.concatenate(cluster_labels_test_list)

    print("\nK-Means Model Evaluation on Combined Test Data:")
    # inertia_ comes from fitting on the training data; it is not recomputed for the test data.
    print(f"Inertia: {kmeans.inertia_:.4f}")

    # silhouette_score is only defined for 2 <= n_labels <= n_samples - 1 and
    # raises ValueError outside that range, so both bounds are checked.
    n_distinct_clusters = len(np.unique(all_cluster_labels_test))
    if n_distinct_clusters < 2:
        print("Silhouette Score: Not applicable for single cluster.")
    elif n_distinct_clusters >= len(all_cluster_labels_test):
        print("Silhouette Score: Not applicable (every sequence is alone in its cluster).")
    else:
        silhouette = silhouette_score(X_test_combined_reshaped, all_cluster_labels_test)
        print(f"Silhouette Score: {silhouette:.4f}")
else:
    print("\nNo test data sequences available to evaluate Inertia and Silhouette Score.")

Powered by TurnKey Linux.