# V0_V1_kmeans_anomaly_unsupervised_with_comment
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score
import argparse
import os
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.decomposition import PCA  # For dimensionality reduction if needed

# Set up command line arguments
parser = argparse.ArgumentParser(description='Anomaly detection using K-Means clustering (Unsupervised).')
parser.add_argument('--timesteps', type=int, default=20, help='Number of timesteps for sequences.')
parser.add_argument('--n_clusters', type=int, default=5, help='Number of clusters for K-Means.')
parser.add_argument('--transition', action='store_true', help='Use transition data for testing.')
parser.add_argument('--plot_before', action='store_true', help='Plot data before training.')
parser.add_argument('--plot_after', action='store_true', help='Plot data after clustering.')
parser.add_argument('--plot_transition', action='store_true', help='Plot transition data with clusters.')
parser.add_argument('--plot_anomalies', action='store_true', help='Plot potential anomalies based on distance.')
parser.add_argument('--plot_anomaly_clusters', action='store_true', help='Highlight clusters with potential anomalies.')
parser.add_argument('--manual_anomaly_labels', type=str, default=None, help='Path to CSV with manual anomaly labels for evaluation.')

options = parser.parse_args()

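# Example invocations (illustrative only; the script filename is assumed from the
# header comment above, and the flag values and labels.csv path are just suggestions):
#   python V0_V1_kmeans_anomaly_unsupervised_with_comment.py --plot_before --plot_after
#   python V0_V1_kmeans_anomaly_unsupervised_with_comment.py --timesteps 30 --n_clusters 6 --transition --plot_anomaly_clusters
#   python V0_V1_kmeans_anomaly_unsupervised_with_comment.py --plot_anomalies --manual_anomaly_labels labels.csv
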
# Number of clusters and sequence length from command line arguments
n_clusters = options.n_clusters
timesteps = options.timesteps

#####################################################################################################

NumberOfFailures = 4  # So far, we have only data for the first 4 types of failures
datafiles = [[], []]  # 0 for train, 1 for test
for i in range(NumberOfFailures + 1):
    datafiles[0].append([])
    datafiles[1].append([])

# Next set of data corresponds to Freezer, SP=-26
datafiles[0][0] = ['2024-08-07_5_', '2024-08-08_5_', '2025-01-25_5_', '2025-01-26_5_']
datafiles[0][1] = ['2024-12-11_5_', '2024-12-12_5_', '2024-12-13_5_']
datafiles[0][2] = ['2024-12-18_5_', '2024-12-21_5_', '2024-12-22_5_', '2024-12-23_5_', '2024-12-24_5_']
datafiles[0][3] = ['2024-12-28_5_', '2024-12-29_5_', '2024-12-30_5_']
datafiles[0][4] = ['2025-02-13_5_', '2025-02-14_5_']

if options.transition:
    datafiles[1][0] = ['2025-01-27_5_', '2025-01-28_5_']
    datafiles[1][1] = ['2024-12-14_5_', '2024-12-15_5_', '2024-12-16_5_']  # with TRANSITION
    datafiles[1][2] = ['2024-12-17_5_', '2024-12-19_5_', '2024-12-25_5_', '2024-12-26_5_']  # with TRANSITION
    datafiles[1][3] = ['2024-12-27_5_', '2024-12-31_5_', '2025-01-01_5_']  # with TRANSITION
    datafiles[1][4] = ['2025-02-12_5_', '2025-02-15_5_', '2025-02-16_5_']
else:
    datafiles[1][0] = ['2025-01-27_5_', '2025-01-28_5_']
    datafiles[1][1] = ['2024-12-14_5_', '2024-12-15_5_']
    datafiles[1][2] = ['2024-12-19_5_', '2024-12-25_5_', '2024-12-26_5_']
    datafiles[1][3] = ['2024-12-31_5_', '2025-01-01_5_']
    datafiles[1][4] = ['2025-02-15_5_', '2025-02-16_5_']

# Features used
features = ['r1 s1', 'r1 s4', 'r1 s5']

featureNames = {}
featureNames['r1 s1'] = r'$T_{evap}$'
featureNames['r1 s4'] = r'$T_{cond}$'
featureNames['r1 s5'] = r'$T_{air}$'

unitNames = {}
unitNames['r1 s1'] = r'($^o$C)'
unitNames['r1 s4'] = r'($^o$C)'
unitNames['r1 s5'] = r'($^o$C)'

NumFeatures = len(features)

#####################################################################################################

# Load and merge training data
dataTrain = []
for class_files in datafiles[0]:
    class_dfs = []
    for base_filename in class_files:
        script_dir = os.path.dirname(os.path.abspath(__file__))
        data_dir = os.path.join(script_dir, 'data')
        filepath = os.path.join(data_dir, f'{base_filename}.csv')
        try:
            df = pd.read_csv(filepath)
            df['timestamp'] = pd.to_datetime(df['datetime'], format='%m/%d/%Y %H:%M', errors='coerce')
            df['timestamp'] = df['timestamp'].fillna(pd.to_datetime(df['datetime'], format='%d-%m-%Y %H:%M:%S', errors='coerce'))
            for col in features:
                df[col] = pd.to_numeric(df[col], errors='coerce')
            df = df.set_index('timestamp').resample('5Min')[features].mean()  # Resample and calculate mean only for features
            df = df[features].interpolate()  # Estimate missing values using linear interpolation
            class_dfs.append(df)
        except FileNotFoundError:
            print(f"Warning: File {filepath} not found and skipped.")
    if class_dfs:
        dataTrain.append(pd.concat(class_dfs))

# Concatenate all training data into a single DataFrame
combined_train_data = pd.concat(dataTrain)

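# Illustrative sketch of the raw CSV layout the loaders above and below assume
# (the 'datetime' column and the feature names come from the code; the sample values
# are invented). Two datetime formats are tolerated: '%m/%d/%Y %H:%M' and '%d-%m-%Y %H:%M:%S'.
#
#   datetime,r1 s1,r1 s4,r1 s5
#   12/14/2024 00:00,-27.5,35.2,-25.9
#   12/14/2024 00:05,-27.3,35.0,-25.8
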
# Load and merge test data
dataTest = []
for class_files in datafiles[1]:
    class_dfs = []
    for base_filename in class_files:
        script_dir = os.path.dirname(os.path.abspath(__file__))
        data_dir = os.path.join(script_dir, 'data')
        filepath = os.path.join(data_dir, f'{base_filename}.csv')
        try:
            df = pd.read_csv(filepath)
            df['timestamp'] = pd.to_datetime(df['datetime'], format='%m/%d/%Y %H:%M', errors='coerce')
            df['timestamp'] = df['timestamp'].fillna(pd.to_datetime(df['datetime'], format='%d-%m-%Y %H:%M:%S', errors='coerce'))
            for col in features:
                df[col] = pd.to_numeric(df[col], errors='coerce')
            df = df.set_index('timestamp').resample('5Min')[features].mean()  # Resample and calculate mean only for features
            df = df[features].interpolate()  # Estimate missing values using linear interpolation
            class_dfs.append(df)
        except FileNotFoundError:
            print(f"Warning: File {filepath} not found and skipped.")
    if class_dfs:
        dataTest.append(pd.concat(class_dfs))

# Plot data before training
if options.plot_before:
    num_features = len(features)
    fig, axes = plt.subplots(num_features, 1, figsize=(15, 5 * num_features), sharex=True)
    for i, feature in enumerate(features):
        for k, df in enumerate(dataTrain):
            axes[i].plot(df.index, df[feature], label=f'Train Class {k}')
        axes[i].set_ylabel(f'{featureNames[feature]} {unitNames[feature]}')
        axes[i].set_title(f'Train Data - {featureNames[feature]}')
        axes[i].legend()
    plt.tight_layout()
    plt.show()

########################################################################################################

# Normalize the data
scaler = StandardScaler()
scaled_train_data = scaler.fit_transform(combined_train_data[features])  # Normalize only the features

scaled_test_data_list = []
for df in dataTest:
    scaled_test_data_list.append(scaler.transform(df[features]))  # Normalize only the features

# Convert normalized data to DataFrame for easier handling
scaled_train_df = pd.DataFrame(scaled_train_data, columns=features, index=combined_train_data.index)
scaled_test_df_list = [pd.DataFrame(data, columns=features, index=df.index) for data, df in zip(scaled_test_data_list, dataTest)]

############################################################################################################

# Create time sequences
def create_sequences(data, timesteps):
    sequences = []
    for i in range(len(data) - timesteps + 1):
        sequences.append(data[i:i + timesteps])
    return np.array(sequences)

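# Worked example (assuming the defaults above: timesteps=20 and the 3 features):
# a frame with 100 rows yields 100 - 20 + 1 = 81 overlapping sequences of shape (20, 3);
# each sequence is flattened below into a 20 * 3 = 60-dimensional vector for K-Means.
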
# Create time sequences for training data
X_train_sequences = create_sequences(scaled_train_df.values, timesteps)
n_samples, n_timesteps, n_features = X_train_sequences.shape
X_train_reshaped = X_train_sequences.reshape(n_samples, n_timesteps * n_features)

# Create time sequences for test data
X_test_sequences_list = [create_sequences(df.values, timesteps) for df in scaled_test_df_list]
X_test_reshaped_list = [seq.reshape(seq.shape[0], seq.shape[1] * seq.shape[2]) for seq in X_test_sequences_list]

############################################################################################################

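# Optional sketch (an assumption, matching the otherwise unused PCA import above):
# the flattened sequences are timesteps * n_features wide, so a PCA projection could
# shrink them before clustering. Defined but not called, to leave the original
# pipeline untouched; the helper name and n_components value are hypothetical.
def reduce_sequences_with_pca(train_matrix, test_matrices, n_components=10):
    """Fit PCA on the training sequences and apply the same projection to each test set."""
    pca = PCA(n_components=n_components, random_state=42)
    train_reduced = pca.fit_transform(train_matrix)
    test_reduced = [pca.transform(m) for m in test_matrices]
    return pca, train_reduced, test_reduced
# Example use: _, X_train_red, X_test_red_list = reduce_sequences_with_pca(X_train_reshaped, X_test_reshaped_list)
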
# Train the K-Means model
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)  # n_init restarts help avoid poor local optima
kmeans.fit(X_train_reshaped)
train_labels = kmeans.labels_

# Evaluate clustering quality for training data
silhouette_avg_train = silhouette_score(X_train_reshaped, train_labels)
db_index_train = davies_bouldin_score(X_train_reshaped, train_labels)
print("\nK-Means Model Evaluation on Training Data:")
print(f"\nSilhouette Score (Train Data): {silhouette_avg_train:.4f}")
print(f"Davies-Bouldin Index (Train Data): {db_index_train:.4f}")

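# Optional sketch (an assumption, not invoked anywhere in the original flow): sweep
# candidate cluster counts and compare silhouette scores as a rough sanity check for
# the --n_clusters choice; the helper name and k range are hypothetical.
def sweep_cluster_counts(data, k_values=range(2, 9)):
    """Fit K-Means for each candidate k and return a {k: silhouette score} dict (needs k >= 2)."""
    scores = {}
    for k in k_values:
        km = KMeans(n_clusters=k, random_state=42, n_init=10).fit(data)
        scores[k] = silhouette_score(data, km.labels_)
    return scores
# Example use: print(sweep_cluster_counts(X_train_reshaped))
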
# Predict clusters for test data
cluster_labels_test_list = [kmeans.predict(X_test_reshaped) for X_test_reshaped in X_test_reshaped_list]

# Evaluate clustering quality for test data (only when there is more than one sample and more than one predicted cluster)
for i, labels_test in enumerate(cluster_labels_test_list):
    if len(X_test_reshaped_list[i]) > 1 and len(np.unique(labels_test)) > 1:
        silhouette_avg_test = silhouette_score(X_test_reshaped_list[i], labels_test)
        db_index_test = davies_bouldin_score(X_test_reshaped_list[i], labels_test)
        print(f"\nSilhouette Score (Test Data - Set {i+1}): {silhouette_avg_test:.4f}")
        print(f"Davies-Bouldin Index (Test Data - Set {i+1}): {db_index_test:.4f}")
    else:
        print(f"\nCannot calculate Silhouette Score and Davies-Bouldin Index for Test Data - Set {i+1} due to insufficient samples or only one cluster.")

############################################################################################################################

# Function to plot data with assigned clusters
def plot_clustered_data(original_data_list, cluster_labels_list, n_clusters, features, featureNames, unitNames, title_prefix=""):
    num_features = len(features)
    fig, axes = plt.subplots(num_features, 1, figsize=(15, 5 * num_features), sharex=True)
    colors = plt.cm.viridis(np.linspace(0, 1, n_clusters))  # Assign a color to each cluster

    for k, df in enumerate(original_data_list):
        # Use the DataFrame index directly as the time axis
        df_index = df.index
        labels = cluster_labels_list[k]
        for i, feature in enumerate(features):
            for cluster_id in range(n_clusters):
                cluster_indices_kmeans = np.where(labels == cluster_id)[0]
                # Ensure indices are within the range of df_index
                if len(cluster_indices_kmeans) > 0 and len(df_index) > cluster_indices_kmeans[-1]:
                    valid_indices = df_index[cluster_indices_kmeans]
                    axes[i].scatter(valid_indices, df[feature].loc[valid_indices], color=colors[cluster_id],
                                    label=f'Cluster {cluster_id}' if k == 0 else "", s=10)
            axes[i].set_ylabel(f'{featureNames[feature]} {unitNames[feature]}')
            axes[i].set_title(f'{title_prefix} - {featureNames[feature]}')
            axes[i].legend(loc='upper right')

    plt.tight_layout()
    plt.show()

# Plot data after clustering (training)
if options.plot_after:
    # To plot training data, map each sequence back to the timestamp of its last observation
    train_original_indices = combined_train_data.index[timesteps - 1:]
    plot_df_train = pd.DataFrame(scaled_train_df.values[timesteps - 1:], index=train_original_indices, columns=features)

    # Trim train_labels to the length of plot_df_train so labels and rows stay aligned
    trimmed_train_labels = train_labels[:len(plot_df_train)]

    plot_clustered_data([plot_df_train], [trimmed_train_labels], n_clusters, features, featureNames, unitNames, title_prefix="Train Data Clusters")

    # Scatter plot of clusters for training data (using the first two features)
    if len(features) >= 2:
        plt.figure(figsize=(10, 8))
        scatter = plt.scatter(scaled_train_df[features[0]][:len(plot_df_train)],
                              scaled_train_df[features[1]][:len(plot_df_train)],
                              c=trimmed_train_labels, cmap='viridis')
        plt.xlabel(f'{featureNames[features[0]]} {unitNames[features[0]]}')
        plt.ylabel(f'{featureNames[features[1]]} {unitNames[features[1]]}')
        plt.title('Clusters of Train Data (Features 1 vs 2)')
        plt.colorbar(scatter, label='Cluster ID')
        plt.show()

# Plot test data with predicted clusters
if options.plot_transition:
    plot_clustered_data(dataTest, cluster_labels_test_list, n_clusters, features, featureNames, unitNames, title_prefix="Test Data Clusters")

    # Scatter plot of clusters for each test set (using the first two features)
    for i, labels_test in enumerate(cluster_labels_test_list):
        if len(features) >= 2 and len(scaled_test_df_list[i]) > timesteps - 1:
            # Keep the label length consistent with the plotted data length
            test_labels_to_plot = labels_test[:len(scaled_test_df_list[i]) - (timesteps - 1)]
            if len(test_labels_to_plot) > 0:
                plt.figure(figsize=(10, 8))
                scatter = plt.scatter(scaled_test_df_list[i][features[0]][timesteps - 1:],
                                      scaled_test_df_list[i][features[1]][timesteps - 1:],
                                      c=test_labels_to_plot, cmap='viridis')
                plt.xlabel(f'{featureNames[features[0]]} {unitNames[features[0]]}')
                plt.ylabel(f'{featureNames[features[1]]} {unitNames[features[1]]}')
                plt.title(f'Clusters of Test Data - Set {i+1} (Features 1 vs 2)')
                plt.colorbar(scatter, label='Cluster ID')
                plt.show()

#####################################################################################################
# Anomaly detection based on distance from cluster center
#####################################################################################################

def calculate_anomaly_scores(data_reshaped, kmeans_model):
    # Distance of each sequence to its assigned cluster centre; larger distances suggest anomalies.
    # predict() is used (rather than kmeans_model.labels_) so the function also works
    # for data the model was not fitted on.
    assigned_clusters = kmeans_model.predict(data_reshaped)
    distances = []
    for i in range(len(data_reshaped)):
        distance = np.linalg.norm(data_reshaped[i] - kmeans_model.cluster_centers_[assigned_clusters[i]])
        distances.append(distance)
    return np.array(distances)

train_anomaly_scores = calculate_anomaly_scores(X_train_reshaped, kmeans)

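# Hedged alternative (an assumption, not used by the plotting helpers below, which
# recompute a mean + 3*std threshold per test set): freeze a single threshold from
# the training scores and reuse it for every test set.
train_anomaly_threshold = np.mean(train_anomaly_scores) + 3 * np.std(train_anomaly_scores)
# e.g.: flagged = test_scores > train_anomaly_threshold
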
def plot_anomalies(original_data_list, anomaly_scores_list, threshold_factor=3):
    num_features = len(features)
    fig, axes = plt.subplots(num_features, 1, figsize=(15, 5 * num_features), sharex=True)
    for k, df in enumerate(original_data_list):
        time_index = df.index[timesteps - 1:]
        anomaly_scores = anomaly_scores_list[k]
        threshold = np.mean(anomaly_scores) + threshold_factor * np.std(anomaly_scores)
        anomalous_indices = np.where(anomaly_scores > threshold)[0]
        for i, feature in enumerate(features):
            # Plot the full original series as the "normal" background
            axes[i].plot(df.index, df[feature], label='Normal', alpha=0.7)

            if len(time_index) == len(anomaly_scores):  # Check alignment before marking anomalies
                # Mark anomalies on the original data time axis
                anomaly_times = time_index[anomalous_indices]
                axes[i].scatter(anomaly_times, df[feature].loc[anomaly_times], color='red',
                                label='Anomaly' if k == 0 else "", s=20, zorder=5)
            else:
                print(f"Warning: Data length mismatch for plotting anomalies in file {k}. Skipping anomaly plot for this file.")

            axes[i].set_ylabel(f'{featureNames[feature]} {unitNames[feature]}')
            axes[i].set_title(f'Potential Anomalies - {featureNames[feature]}')

    # Add a single de-duplicated legend across all subplots
    handles, labels = [], []
    for ax in axes:
        for h, l in zip(*ax.get_legend_handles_labels()):
            if l not in labels:
                handles.append(h)
                labels.append(l)
    if handles:
        axes[0].legend(handles, labels, loc='upper right')

    plt.tight_layout()
    plt.show()


if options.plot_anomalies:
    test_anomaly_scores_list = [calculate_anomaly_scores(X_test_reshaped, kmeans) for X_test_reshaped in X_test_reshaped_list]
    # Need the original data frames for plotting
    plot_anomalies(dataTest, test_anomaly_scores_list)

#####################################################################################################
# Display clusters that may contain anomalies
#####################################################################################################

def plot_anomaly_clusters(original_data_list, cluster_labels_list, anomaly_scores_list, n_clusters, features, featureNames, unitNames, threshold_factor=3):
    num_features = len(features)
    fig, axes = plt.subplots(num_features, 1, figsize=(15, 5 * num_features), sharex=True)
    colors = plt.cm.viridis(np.linspace(0, 1, n_clusters))

    for k, df in enumerate(original_data_list):
        time_index = df.index[timesteps - 1:]
        cluster_labels = cluster_labels_list[k]
        anomaly_scores = anomaly_scores_list[k]
        threshold = np.mean(anomaly_scores) + threshold_factor * np.std(anomaly_scores)
        anomalous_indices = np.where(anomaly_scores > threshold)[0]
        anomalous_clusters = np.unique(cluster_labels[anomalous_indices])

        for i, feature in enumerate(features):
            # Plot all data points colored by their assigned cluster
            for cluster_id in range(n_clusters):
                cluster_indices = np.where(cluster_labels == cluster_id)[0]
                if len(cluster_indices) > 0:
                    # Map sequence positions to timestamps via the aligned time index
                    current_time_index = time_index[cluster_indices]
                    axes[i].scatter(current_time_index, df[feature].loc[current_time_index], color=colors[cluster_id],
                                    label=f'Cluster {cluster_id}' if k == 0 else "", s=10, alpha=0.5)

            # Highlight anomaly clusters: mark ALL points belonging to any cluster that
            # contains at least one anomalous sequence
            for anomaly_cluster in anomalous_clusters:
                cluster_all_indices = np.where(cluster_labels == anomaly_cluster)[0]
                if len(cluster_all_indices) > 0:
                    current_time_index = time_index[cluster_all_indices]
                    axes[i].scatter(current_time_index, df[feature].loc[current_time_index], color='red',
                                    label=f'Anomaly Cluster {anomaly_cluster}' if k == 0 else "", s=30, zorder=5)

            axes[i].set_ylabel(f'{featureNames[feature]} {unitNames[feature]}')
            axes[i].set_title(f'Anomaly Clusters - {featureNames[feature]}')

    # Add a single de-duplicated legend across all subplots
    handles, labels = [], []
    for ax in axes:
        for h, l in zip(*ax.get_legend_handles_labels()):
            if l not in labels:
                handles.append(h)
                labels.append(l)
    if handles:
        axes[0].legend(handles, labels, loc='upper right')

    plt.tight_layout()
    plt.show()


if options.plot_anomaly_clusters:
    test_anomaly_scores_list = [calculate_anomaly_scores(X_test_reshaped, kmeans) for X_test_reshaped in X_test_reshaped_list]
    plot_anomaly_clusters(dataTest, cluster_labels_test_list, test_anomaly_scores_list, n_clusters, features, featureNames, unitNames)

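# Sketch of the manual-label CSV layout the evaluation below assumes (illustrative;
# only the 'is_anomaly' column name and the index_col=0 convention come from the code,
# the timestamps and values are invented):
#
#   datetime,is_anomaly
#   2024-12-14 00:00:00,False
#   2024-12-14 00:05:00,True
#
# The first column is used as the index and matched to the test timestamps with a
# nearest-neighbour reindex.
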
#####################################################################################################
# Evaluation with manual labels (if provided)
#####################################################################################################

if options.manual_anomaly_labels:
    manual_labels_df = pd.read_csv(options.manual_anomaly_labels, index_col=0)
    # Interpret the label index as timestamps so it can be aligned with the test time index
    manual_labels_df.index = pd.to_datetime(manual_labels_df.index)

    # Anomaly scores may not have been computed yet if no plotting flag was given
    test_anomaly_scores_list = [calculate_anomaly_scores(X_test_reshaped, kmeans) for X_test_reshaped in X_test_reshaped_list]

    all_true_labels = []
    all_predicted_anomalies = []

    for k, df in enumerate(dataTest):
        # Ensure anomaly scores were calculated for this test set
        if k >= len(test_anomaly_scores_list):
            print(f"Warning: Anomaly scores not calculated for test set {k}. Skipping manual evaluation for this set.")
            continue

        time_index = df.index[timesteps - 1:]  # Time index aligned with the sequences
        if not time_index.empty:
            # Assumes the manual labels CSV has a boolean 'is_anomaly' column whose index aligns with the test data
            relevant_labels = manual_labels_df.reindex(time_index, method='nearest')['is_anomaly'].fillna(False).astype(int)

            anomaly_scores = test_anomaly_scores_list[k]
            # Ensure lengths match before applying the threshold
            if len(anomaly_scores) == len(relevant_labels):
                threshold = np.mean(anomaly_scores) + 3 * np.std(anomaly_scores)  # Example threshold
                predicted_anomalies = (anomaly_scores > threshold).astype(int)
                all_true_labels.extend(relevant_labels.values)
                all_predicted_anomalies.extend(predicted_anomalies)
            else:
                print(f"Warning: Mismatch in lengths between anomaly scores ({len(anomaly_scores)}) and manual labels ({len(relevant_labels)}) for test set {k}. Skipping evaluation for this set.")

    if all_true_labels:
        print("\nEvaluation using manual anomaly labels:")
        print(classification_report(all_true_labels, all_predicted_anomalies, target_names=['Normal', 'Anomaly'], zero_division=0))
        cm = confusion_matrix(all_true_labels, all_predicted_anomalies)
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Normal', 'Anomaly'], yticklabels=['Normal', 'Anomaly'])
        plt.title('Confusion Matrix (Manual Labels)')
        plt.xlabel('Predicted Label')
        plt.ylabel('True Label')
        plt.show()
    else:
        print("\nNo manual anomaly labels provided or aligned with test data.")