
Adding kmeans_gmm_anomaly folder and its contents: K-Means and GMM anomaly detection algorithms

Masoud Hosseini 3 weeks ago
parent commit
f75165f1de

+ 412
- 0
kmeans_gmm_anomaly/V0_V1_gmm_anomaly_unsupervised_with_comment.py View file

@@ -0,0 +1,412 @@
1
+# V0_V1_gmm_anomaly_unsupervised
2
+import pandas as pd
3
+import numpy as np
4
+import matplotlib.pyplot as plt
5
+from sklearn.mixture import GaussianMixture
6
+from sklearn.preprocessing import StandardScaler
7
+from sklearn.metrics import silhouette_score, davies_bouldin_score
8
+import argparse
9
+import os
10
+import seaborn as sns
11
+from sklearn.metrics import confusion_matrix, classification_report
12
+from sklearn.decomposition import PCA  # For dimensionality reduction if needed
13
+
14
+# Set up command line arguments
15
+parser = argparse.ArgumentParser(description='Anomaly detection using Gaussian Mixture Models (Unsupervised).')
16
+parser.add_argument('--timesteps', type=int, default=20, help='Number of timesteps for sequences.')
17
+parser.add_argument('--n_components', type=int, default=5, help='Number of components for GMM.')
18
+parser.add_argument('--transition', action='store_true', help='Use transition data for testing.')
19
+parser.add_argument('--plot_before', action='store_true', help='Plot data before training.')
20
+parser.add_argument('--plot_after', action='store_true', help='Plot data after clustering (components).')
21
+parser.add_argument('--plot_transition', action='store_true', help='Plot transition data with components.')
22
+parser.add_argument('--plot_anomalies', action='store_true', help='Plot potential anomalies based on likelihood.')
23
+parser.add_argument('--plot_anomaly_clusters', action='store_true', help='Highlight components with potential anomalies.')
24
+parser.add_argument('--manual_anomaly_labels', type=str, default=None, help='Path to CSV with manual anomaly labels for evaluation.')
25
+
26
+options = parser.parse_args()
27
+
28
+# Number of components and sequence length from command line arguments
29
+n_components = options.n_components
30
+timesteps = options.timesteps
31
+
32
+#####################################################################################################
33
+
34
+NumberOfFailures = 4  # So far, we have only data for the first 4 types of failures
35
+datafiles = [[], []]  # 0 for train,  1 for test
36
+for i in range(NumberOfFailures + 1):
37
+  datafiles[0].append([])
38
+  datafiles[1].append([])
39
+
40
+# Next set of data corresponds to Freezer, SP=-26
41
+datafiles[0][0] = ['2024-08-07_5_', '2024-08-08_5_', '2025-01-25_5_', '2025-01-26_5_']
42
+datafiles[0][1] = ['2024-12-11_5_', '2024-12-12_5_', '2024-12-13_5_']
43
+datafiles[0][2] = ['2024-12-18_5_', '2024-12-21_5_', '2024-12-22_5_', '2024-12-23_5_', '2024-12-24_5_']
44
+datafiles[0][3] = ['2024-12-28_5_', '2024-12-29_5_', '2024-12-30_5_']
45
+datafiles[0][4] = ['2025-02-13_5_', '2025-02-14_5_']
46
+
47
+if options.transition:
48
+  datafiles[1][0] = ['2025-01-27_5_', '2025-01-28_5_']
49
+  datafiles[1][1] = ['2024-12-14_5_', '2024-12-15_5_', '2024-12-16_5_']  # with TRANSITION
50
+  datafiles[1][2] = ['2024-12-17_5_', '2024-12-19_5_', '2024-12-25_5_', '2024-12-26_5_']  # with TRANSITION
51
+  datafiles[1][3] = ['2024-12-27_5_', '2024-12-31_5_', '2025-01-01_5_']  # with TRANSITION
52
+  datafiles[1][4] = ['2025-02-12_5_', '2025-02-15_5_', '2025-02-16_5_']
53
+
54
+else:
55
+  datafiles[1][0] = ['2025-01-27_5_', '2025-01-28_5_']
56
+  datafiles[1][1] = ['2024-12-14_5_', '2024-12-15_5_']
57
+  datafiles[1][2] = ['2024-12-19_5_', '2024-12-25_5_', '2024-12-26_5_']
58
+  datafiles[1][3] = ['2024-12-31_5_', '2025-01-01_5_']
59
+  datafiles[1][4] = ['2025-02-15_5_', '2025-02-16_5_']
60
+
61
+# Features used
62
+features = ['r1 s1', 'r1 s4', 'r1 s5']
63
+
64
+featureNames = {}
65
+featureNames['r1 s1'] = r'$T_{evap}$'
66
+featureNames['r1 s4'] = r'$T_{cond}$'
67
+featureNames['r1 s5'] = r'$T_{air}$'
68
+
69
+unitNames = {}
70
+unitNames['r1 s1'] = r'($^o$C)'
71
+unitNames['r1 s4'] = r'($^o$C)'
72
+unitNames['r1 s5'] = r'($^o$C)'
73
+
74
+NumFeatures = len(features)
75
+
76
+#####################################################################################################
77
+
78
+# Load and merge training data
79
+dataTrain = []
80
+for class_files in datafiles[0]:
81
+  class_dfs = []
82
+  for base_filename in class_files:
83
+    script_dir = os.path.dirname(os.path.abspath(__file__))
84
+    data_dir = os.path.join(script_dir, 'data')
85
+    filepath = os.path.join(data_dir, f'{base_filename}.csv')
86
+    try:
87
+      df = pd.read_csv(filepath)
88
+      df['timestamp'] = pd.to_datetime(df['datetime'], format='%m/%d/%Y %H:%M', errors='coerce')
89
+      df['timestamp'] = df['timestamp'].fillna(pd.to_datetime(df['datetime'], format='%d-%m-%Y %H:%M:%S', errors='coerce'))
90
+      for col in features:
91
+        df[col] = pd.to_numeric(df[col], errors='coerce')
92
+      df = df.set_index('timestamp').resample('5Min')[features].mean()  # Resample and calculate mean only for features
93
+      df = df[features].interpolate()  # Estimate missing values using linear interpolation
94
+      class_dfs.append(df)
95
+    except FileNotFoundError:
96
+      print(f"Warning: File {filepath} not found and skipped.") 
97
+  if class_dfs:
98
+    dataTrain.append(pd.concat(class_dfs))
99
+
100
+# Concatenate all training data into a single DataFrame
101
+combined_train_data = pd.concat(dataTrain)
102
+
103
+# Load and merge test data
104
+dataTest = []
105
+for class_files in datafiles[1]:
106
+  class_dfs = []
107
+  for base_filename in class_files:
108
+    script_dir = os.path.dirname(os.path.abspath(__file__))
109
+    data_dir = os.path.join(script_dir, 'data')
110
+    filepath = os.path.join(data_dir, f'{base_filename}.csv')
111
+    try:
112
+      df = pd.read_csv(filepath)
113
+      df['timestamp'] = pd.to_datetime(df['datetime'], format='%m/%d/%Y %H:%M', errors='coerce')
114
+      df['timestamp'] = df['timestamp'].fillna(pd.to_datetime(df['datetime'], format='%d-%m-%Y %H:%M:%S', errors='coerce'))
115
+      for col in features:
116
+        df[col] = pd.to_numeric(df[col], errors='coerce')
117
+      df = df.set_index('timestamp').resample('5Min')[features].mean()  # Resample and calculate mean only for features
118
+      df = df[features].interpolate()  # Estimate missing values using linear interpolation
119
+      class_dfs.append(df)
120
+    except FileNotFoundError:
121
+      print(f"Warning: File {filepath} not found and skipped.") 
122
+  if class_dfs:
123
+    dataTest.append(pd.concat(class_dfs))
124
+
125
+# Plot data before training
126
+if options.plot_before:
127
+  num_features = len(features)
128
+  fig, axes = plt.subplots(num_features, 1, figsize=(15, 5 * num_features), sharex=True)
129
+  for i, feature in enumerate(features):
130
+    for k, df in enumerate(dataTrain):
131
+      axes[i].plot(df.index, df[feature], label=f'Train Class {k}')
132
+    axes[i].set_ylabel(f'{featureNames[feature]} {unitNames[feature]}')
133
+    axes[i].set_title(f'Train Data - {featureNames[feature]}')
134
+    axes[i].legend()
135
+  plt.tight_layout()
136
+  plt.show()
137
+
138
+########################################################################################################
139
+
140
+# Normalize the data
141
+scaler = StandardScaler()
142
+scaled_train_data = scaler.fit_transform(combined_train_data[features])  # Normalize only the features
143
+
144
+scaled_test_data_list = []
145
+for df in dataTest:
146
+  scaled_test_data_list.append(scaler.transform(df[features]))  # Normalize only the features
147
+
148
+# Convert normalized data to DataFrame for easier handling
149
+scaled_train_df = pd.DataFrame(scaled_train_data, columns=features, index=combined_train_data.index)
150
+scaled_test_df_list = [pd.DataFrame(data, columns=features, index=df.index) for data, df in zip(scaled_test_data_list, dataTest)]
151
+
152
+############################################################################################################
153
+
154
+# Create time sequences
155
+def create_sequences(data, timesteps):
156
+  sequences = []
157
+  for i in range(len(data) - timesteps + 1):
158
+    sequences.append(data[i:i + timesteps])
159
+  return np.array(sequences)
160
+
161
+# Create time sequences for training data
162
+X_train_sequences = create_sequences(scaled_train_df.values, timesteps)
163
+n_samples, n_timesteps, n_features = X_train_sequences.shape
164
+X_train_reshaped = X_train_sequences.reshape(n_samples, n_timesteps * n_features)
165
+
166
+# Create time sequences for test data
167
+X_test_sequences_list = [create_sequences(df.values, timesteps) for df in scaled_test_df_list]
168
+X_test_reshaped_list = [seq.reshape(seq.shape[0], seq.shape[1] * seq.shape[2]) for seq in X_test_sequences_list]
169
+
170
+############################################################################################################
171
+
172
+# Train the GMM model
173
+gmm = GaussianMixture(n_components=n_components, random_state=42, n_init=10, init_params='kmeans')
174
+gmm.fit(X_train_reshaped)
175
+train_labels_gmm = gmm.predict(X_train_reshaped)
176
+
177
+# Predict components for test data
178
+cluster_labels_test_gmm_list = [gmm.predict(X_test_reshaped) for X_test_reshaped in X_test_reshaped_list]
179
+
180
+
181
+# Evaluate clustering quality for test data (GMM)
182
+for i, labels_test_gmm in enumerate(cluster_labels_test_gmm_list):
183
+    if len(X_test_reshaped_list[i]) > 1 and len(np.unique(labels_test_gmm)) > 1:
184
+        silhouette_avg_test_gmm = silhouette_score(X_test_reshaped_list[i], labels_test_gmm)
185
+        db_index_test_gmm = davies_bouldin_score(X_test_reshaped_list[i], labels_test_gmm)
186
+        print(f"\nSilhouette Score (Test Data - Set {i+1} - GMM): {silhouette_avg_test_gmm:.4f}")
187
+        print(f"Davies-Bouldin Index (Test Data - Set {i+1} - GMM): {db_index_test_gmm:.4f}")
188
+    else:
189
+        print(f"\nCannot calculate Silhouette Score and Davies-Bouldin Index for Test Data - Set {i+1} (GMM) due to insufficient samples or only one component.")
190
+
191
+############################################################################################################################
192
+
193
+# Function to plot data with assigned components (for GMM)
194
+def plot_clustered_data_gmm(original_data_list, cluster_labels_list, n_components, features, featureNames, unitNames, title_prefix=""):
195
+  num_features = len(features)
196
+  fig, axes = plt.subplots(num_features, 1, figsize=(15, 5 * num_features), sharex=True)
197
+  colors = plt.cm.viridis(np.linspace(0, 1, n_components))  # Assign colors to each component
198
+
199
+  for k, df in enumerate(original_data_list):
200
+    df_index = df.index
201
+    labels = cluster_labels_list[k]
202
+    for i, feature in enumerate(features):
203
+      for component_id in range(n_components):
204
+        component_indices = np.where(labels == component_id)[0]
205
+        if len(component_indices) > 0 and len(df_index) > component_indices[-1]:
206
+          valid_indices = df_index[component_indices]
207
+          axes[i].scatter(valid_indices, df[feature].loc[valid_indices], color=colors[component_id], label=f'Component {component_id}' if k == 0 else "", s=10)
208
+      axes[i].set_ylabel(f'{featureNames[feature]} {unitNames[feature]}')
209
+      axes[i].set_title(f'{title_prefix} - {featureNames[feature]}')
210
+      axes[i].legend(loc='upper right')
211
+
212
+  plt.tight_layout()
213
+  plt.show()
214
+
215
+# Plot data after clustering (training) with GMM
216
+if options.plot_after:
217
+  train_original_indices = combined_train_data.index[timesteps - 1:]
218
+  plot_df_train = pd.DataFrame(scaled_train_df.values[timesteps - 1:], index=train_original_indices, columns=features)
219
+  trimmed_train_labels_gmm = train_labels_gmm[:len(plot_df_train)]
220
+  plot_clustered_data_gmm([plot_df_train], [trimmed_train_labels_gmm], n_components, features, featureNames, unitNames, title_prefix="Train Data Components")
221
+
222
+  
223
+  # Plot scatter plot of components for training data (using the first two features)
224
+  if len(features) >= 2:
225
+    plt.figure(figsize=(10, 8))
226
+    scatter = plt.scatter(scaled_train_df[features[0]][:len(plot_df_train)], scaled_train_df[features[1]][:len(plot_df_train)], c=trimmed_train_labels_gmm, cmap='viridis')
227
+    plt.xlabel(f'{featureNames[features[0]]} {unitNames[features[0]]}')
228
+    plt.ylabel(f'{featureNames[features[1]]} {unitNames[features[1]]}')
229
+    plt.title('Components of Train Data (Features 1 vs 2) - GMM')
230
+    plt.colorbar(scatter, label='Component ID')
231
+    plt.show()
232
+
233
+# Plot test data with predicted components (GMM)
234
+if options.plot_transition:
235
+  plot_clustered_data_gmm(dataTest, cluster_labels_test_gmm_list, n_components, features, featureNames, unitNames, title_prefix="Test Data Components")
236
+
237
+  # Evaluate clustering quality for test data (GMM)
238
+  for i, labels_test_gmm in enumerate(cluster_labels_test_gmm_list):
239
+    if len(X_test_reshaped_list[i]) > 1 and len(np.unique(labels_test_gmm)) > 1:
240
+      silhouette_avg_test_gmm = silhouette_score(X_test_reshaped_list[i], labels_test_gmm)
241
+      db_index_test_gmm = davies_bouldin_score(X_test_reshaped_list[i], labels_test_gmm)
242
+      print(f"\nSilhouette Score (Test Data - Set {i+1} - GMM): {silhouette_avg_test_gmm:.4f}")
243
+      print(f"Davies-Bouldin Index (Test Data - Set {i+1} - GMM): {db_index_test_gmm:.4f}")
244
+    else:
245
+      print(f"\nCannot calculate Silhouette Score and Davies-Bouldin Index for Test Data - Set {i+1} (GMM) due to insufficient samples or only one component.")
246
+
247
+    # Plot scatter plot of components for test data (using the first two features)
248
+    if len(features) >= 2 and len(scaled_test_df_list[i]) > timesteps - 1:
249
+      test_labels_to_plot_gmm = labels_test_gmm[:len(scaled_test_df_list[i]) - (timesteps - 1)]
250
+      if len(test_labels_to_plot_gmm) > 0:
251
+        plt.figure(figsize=(10, 8))
252
+        scatter = plt.scatter(scaled_test_df_list[i][features[0]][timesteps-1:], scaled_test_df_list[i][features[1]][timesteps-1:], c=test_labels_to_plot_gmm, cmap='viridis')
253
+        plt.xlabel(f'{featureNames[features[0]]} {unitNames[features[0]]}')
254
+        plt.ylabel(f'{featureNames[features[1]]} {unitNames[features[1]]}')
255
+        plt.title(f'Components of Test Data - Set {i+1} (Features 1 vs 2) - GMM')
256
+        plt.colorbar(scatter, label='Component ID')
257
+        plt.show()
258
+
259
+#####################################################################################################
260
+# Anomaly detection based on likelihood in GMM
261
+#####################################################################################################
262
+
263
+def calculate_anomaly_scores_gmm(data_reshaped, gmm_model):
264
+  # Lower likelihood indicates higher probability of anomaly. We use the negative log-likelihood as the anomaly score.
265
+  anomaly_scores = -gmm_model.score_samples(data_reshaped)
266
+  return anomaly_scores
267
+
268
+train_anomaly_scores_gmm = calculate_anomaly_scores_gmm(X_train_reshaped, gmm)
269
+
270
+def plot_anomalies_gmm(original_data_list, anomaly_scores_list, threshold_factor=3):
271
+  num_features = len(features)
272
+  fig, axes = plt.subplots(num_features, 1, figsize=(15, 5 * num_features), sharex=True)
273
+  for k, df in enumerate(original_data_list):
274
+    time_index = df.index[timesteps - 1:]
275
+    anomaly_scores = anomaly_scores_list[k]
276
+    threshold = np.mean(anomaly_scores) + threshold_factor * np.std(anomaly_scores)
277
+    anomalous_indices = np.where(anomaly_scores > threshold)[0]
278
+    for i, feature in enumerate(features):
279
+      # Ensure the lengths match for plotting - time_index length should match anomaly_scores length
280
+      # Slicing original_df by time_index might not be needed if df is already aligned
281
+      # Let's plot the full original data and mark anomalies based on time_index
282
+      axes[i].plot(df.index, df[feature], label=f'Normal', alpha=0.7) # Plot full data dimly
283
+
284
+      if len(time_index) == len(anomaly_scores): # Check alignment
285
+        # Now mark anomalies on the original data time axis
286
+        anomaly_times = time_index[anomalous_indices]
287
+        axes[i].scatter(anomaly_times, df[feature].loc[anomaly_times], color='red', label=f'Anomaly' if k == 0 else "", s=20, zorder=5) # Use zorder to ensure points are visible
288
+      else:
289
+        print(f"Warning: Data length mismatch for plotting anomalies in file {k}. Skipping anomaly plot for this file.")
290
+     
291
+      axes[i].set_ylabel(f'{featureNames[feature]} {unitNames[feature]}')
292
+      axes[i].set_title(f'Potential Anomalies (GMM) - {featureNames[feature]}')
293
+      # Only add a single legend across all subplots
294
+  handles, labels = [], []
295
+  for ax in axes:
296
+   for h, l in zip(*ax.get_legend_handles_labels()):
297
+    if l not in labels:
298
+     handles.append(h)
299
+     labels.append(l)
300
+  if handles:
301
+   axes[0].legend(handles, labels, loc='upper right') # Add legend to the first subplot
302
+
303
+
304
+  plt.tight_layout()
305
+  plt.show()
306
+
307
+if options.plot_anomalies:
308
+  test_anomaly_scores_gmm_list = [calculate_anomaly_scores_gmm(X_test_reshaped, gmm) for X_test_reshaped in X_test_reshaped_list]
309
+  # Need original data frames for plotting
310
+  plot_anomalies_gmm(dataTest, test_anomaly_scores_gmm_list)
311
+
312
+#####################################################################################################
313
+# Display components that may contain anomalies (GMM)
314
+#####################################################################################################
315
+
316
+def plot_anomaly_clusters_gmm(original_data_list, cluster_labels_list, anomaly_scores_list, n_components, features, featureNames, unitNames, threshold_factor=3):
317
+  num_features = len(features)
318
+  fig, axes = plt.subplots(num_features, 1, figsize=(15, 5 * num_features), sharex=True)
319
+  colors = plt.cm.viridis(np.linspace(0, 1, n_components))
320
+
321
+  for k, df in enumerate(original_data_list):
322
+    time_index = df.index[timesteps - 1:]
323
+    cluster_labels = cluster_labels_list[k]
324
+    anomaly_scores = anomaly_scores_list[k]
325
+    threshold = np.mean(anomaly_scores) + threshold_factor * np.std(anomaly_scores)
326
+    anomalous_indices = np.where(anomaly_scores > threshold)[0]
327
+    anomalous_components = np.unique(cluster_labels[anomalous_indices])
328
+
329
+    for i, feature in enumerate(features):
330
+      # Plot all data points colored by their component
331
+      for component_id in range(n_components):
332
+        component_indices = np.where(cluster_labels == component_id)[0]
333
+        # Ensure component_indices are valid for time_index
334
+        if len(component_indices) > 0:
335
+          current_time_index = time_index[component_indices]
336
+          # Need to get corresponding values from original_df using time index
337
+          axes[i].scatter(current_time_index, df[feature].loc[current_time_index], color=colors[component_id], label=f'Component {component_id}' if k == 0 else "", s=10, alpha=0.5)
338
+
339
+      # Highlight anomaly components - Plot ALL points belonging to the identified anomaly components
340
+      # Loop through components that contain at least one anomalous point
341
+      for anomaly_component in anomalous_components:
342
+        # Find ALL indices that belong to this anomaly component
343
+        component_all_indices = np.where(cluster_labels == anomaly_component)[0]
344
+        if len(component_all_indices) > 0:
345
+          current_time_index = time_index[component_all_indices]
346
+          # Plot these points with a distinct color/marker
347
+          axes[i].scatter(current_time_index, df[feature].loc[current_time_index], color='red', label=f'Anomaly Comp. {anomaly_component}' if k == 0 else "", s=30, zorder=5)
348
+
349
+
350
+      axes[i].set_ylabel(f'{featureNames[feature]} {unitNames[feature]}')
351
+      axes[i].set_title(f'Anomaly Components (GMM) - {featureNames[feature]}')
352
+      # Only add a single legend across all subplots
353
+  handles, labels = [], []
354
+  for ax in axes:
355
+   for h, l in zip(*ax.get_legend_handles_labels()):
356
+    if l not in labels:
357
+     handles.append(h)
358
+     labels.append(l)
359
+  if handles:
360
+   axes[0].legend(handles, labels, loc='upper right') # Add legend to the first subplot
361
+
362
+
363
+  plt.tight_layout()
364
+  plt.show()
365
+
366
+if options.plot_anomaly_clusters:
367
+  test_anomaly_scores_gmm_list = [calculate_anomaly_scores_gmm(X_test_reshaped, gmm) for X_test_reshaped in X_test_reshaped_list]
368
+  plot_anomaly_clusters_gmm(dataTest, cluster_labels_test_gmm_list, test_anomaly_scores_gmm_list, n_components, features, featureNames, unitNames)
369
+
370
+#####################################################################################################
371
+# Evaluation with manual labels (if provided) - similar for GMM
372
+#####################################################################################################
373
+
374
+if options.manual_anomaly_labels:
375
+  # Compute anomaly scores here so evaluation also works when no anomaly plotting flag was passed
+  test_anomaly_scores_gmm_list = [calculate_anomaly_scores_gmm(X_test_reshaped, gmm) for X_test_reshaped in X_test_reshaped_list]
+  manual_labels_df = pd.read_csv(options.manual_anomaly_labels, index_col=0)
376
+  all_true_labels = []
377
+  all_predicted_anomalies = []
378
+
379
+  for k, df in enumerate(dataTest):
380
+    # Ensure anomaly scores were calculated for this test set
381
+    if k >= len(test_anomaly_scores_gmm_list):
382
+     print(f"Warning: Anomaly scores not calculated for test set {k}. Skipping manual evaluation for this set.")
383
+     continue
384
+
385
+    time_index = df.index[timesteps - 1:] # Time index aligned with sequences
386
+    if not time_index.empty:
387
+      # Reindex manual labels to match the time_index of sequences
388
+      relevant_labels = manual_labels_df.reindex(time_index, method='nearest')['is_anomaly'].fillna(False).astype(int)
389
+
390
+      anomaly_scores = test_anomaly_scores_gmm_list[k]
391
+      # Ensure lengths match before applying threshold
392
+      if len(anomaly_scores) == len(relevant_labels):
393
+        threshold = np.mean(anomaly_scores) + 3 * np.std(anomaly_scores) # Example threshold
394
+        predicted_anomalies = (anomaly_scores > threshold).astype(int)
395
+        all_true_labels.extend(relevant_labels.values)
396
+        all_predicted_anomalies.extend(predicted_anomalies)
397
+      else:
398
+       print(f"Warning: Mismatch in lengths between anomaly scores ({len(anomaly_scores)}) and manual labels ({len(relevant_labels)}) for test set {k}. Skipping evaluation for this set.")
399
+
400
+
401
+  if all_true_labels:
402
+    print("\nEvaluation using manual anomaly labels (GMM):")
403
+    print(classification_report(all_true_labels, all_predicted_anomalies, target_names=['Normal', 'Anomaly'], zero_division=0))
404
+    cm = confusion_matrix(all_true_labels, all_predicted_anomalies)
405
+    plt.figure(figsize=(8, 6))
406
+    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Normal', 'Anomaly'], yticklabels=['Normal', 'Anomaly'])
407
+    plt.title('Confusion Matrix (Manual Labels - GMM)')
408
+    plt.xlabel('Predicted Label')
409
+    plt.ylabel('True Label')
410
+    plt.show()
411
+  else:
412
+    print("\nNo manual anomaly labels provided or aligned with test data for GMM evaluation.")

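A small shape sketch of what create_sequences and the subsequent reshape produce before clustering may also help; the toy array and timesteps = 4 below are assumed values for illustration only.

import numpy as np

data = np.arange(30, dtype=float).reshape(10, 3)     # N = 10 rows, F = 3 features
timesteps = 4
sequences = np.array([data[i:i + timesteps] for i in range(len(data) - timesteps + 1)])
print(sequences.shape)                               # (7, 4, 3): N - timesteps + 1 overlapping windows
flattened = sequences.reshape(sequences.shape[0], -1)
print(flattened.shape)                               # (7, 12): the flat vectors the GMM / K-Means models receive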
+ 418
- 0
kmeans_gmm_anomaly/V0_V1_kmeans_anomaly_unsupervised_with_comment.py View file

@@ -0,0 +1,418 @@
1
+# V0_V1_kmeans_anomaly_unsupervised_with_comment
2
+import pandas as pd
3
+import numpy as np
4
+import matplotlib.pyplot as plt
5
+from sklearn.cluster import KMeans
6
+from sklearn.preprocessing import StandardScaler
7
+from sklearn.metrics import silhouette_score, davies_bouldin_score
8
+import argparse
9
+import os
10
+import seaborn as sns
11
+from sklearn.metrics import confusion_matrix, classification_report
12
+from sklearn.decomposition import PCA  # For dimensionality reduction if needed
13
+
14
+# Set up command line arguments
15
+parser = argparse.ArgumentParser(description='Anomaly detection using K-Means clustering (Unsupervised).')
16
+parser.add_argument('--timesteps', type=int, default=20, help='Number of timesteps for sequences.')
17
+parser.add_argument('--n_clusters', type=int, default=5, help='Number of clusters for K-Means.')
18
+parser.add_argument('--transition', action='store_true', help='Use transition data for testing.')
19
+parser.add_argument('--plot_before', action='store_true', help='Plot data before training.')
20
+parser.add_argument('--plot_after', action='store_true', help='Plot data after clustering.')
21
+parser.add_argument('--plot_transition', action='store_true', help='Plot transition data with clusters.')
22
+parser.add_argument('--plot_anomalies', action='store_true', help='Plot potential anomalies based on distance.')
23
+parser.add_argument('--plot_anomaly_clusters', action='store_true', help='Highlight clusters with potential anomalies.')
24
+parser.add_argument('--manual_anomaly_labels', type=str, default=None, help='Path to CSV with manual anomaly labels for evaluation.')
25
+
26
+options = parser.parse_args()
27
+
28
+# Number of clusters and sequence length from command line arguments
29
+n_clusters = options.n_clusters
30
+timesteps = options.timesteps
31
+
32
+#####################################################################################################
33
+
34
+NumberOfFailures = 4  # So far, we have only data for the first 4 types of failures
35
+datafiles = [[], []]  # 0 for train,  1 for test
36
+for i in range(NumberOfFailures + 1):
37
+  datafiles[0].append([])
38
+  datafiles[1].append([])
39
+
40
+# Next set of data corresponds to Freezer, SP=-26
41
+datafiles[0][0] = ['2024-08-07_5_', '2024-08-08_5_', '2025-01-25_5_', '2025-01-26_5_']
42
+datafiles[0][1] = ['2024-12-11_5_', '2024-12-12_5_', '2024-12-13_5_']
43
+datafiles[0][2] = ['2024-12-18_5_', '2024-12-21_5_', '2024-12-22_5_', '2024-12-23_5_', '2024-12-24_5_']
44
+datafiles[0][3] = ['2024-12-28_5_', '2024-12-29_5_', '2024-12-30_5_']
45
+datafiles[0][4] = ['2025-02-13_5_', '2025-02-14_5_']
46
+
47
+if options.transition:
48
+  datafiles[1][0] = ['2025-01-27_5_', '2025-01-28_5_']
49
+  datafiles[1][1] = ['2024-12-14_5_', '2024-12-15_5_', '2024-12-16_5_']  # with TRANSITION
50
+  datafiles[1][2] = ['2024-12-17_5_', '2024-12-19_5_', '2024-12-25_5_', '2024-12-26_5_']  # with TRANSITION
51
+  datafiles[1][3] = ['2024-12-27_5_', '2024-12-31_5_', '2025-01-01_5_']  # with TRANSITION
52
+  datafiles[1][4] = ['2025-02-12_5_', '2025-02-15_5_', '2025-02-16_5_']
53
+
54
+else:
55
+  datafiles[1][0] = ['2025-01-27_5_', '2025-01-28_5_']
56
+  datafiles[1][1] = ['2024-12-14_5_', '2024-12-15_5_']
57
+  datafiles[1][2] = ['2024-12-19_5_', '2024-12-25_5_', '2024-12-26_5_']
58
+  datafiles[1][3] = ['2024-12-31_5_', '2025-01-01_5_']
59
+  datafiles[1][4] = ['2025-02-15_5_', '2025-02-16_5_']
60
+
61
+# Features used
62
+features = ['r1 s1', 'r1 s4', 'r1 s5']
63
+
64
+featureNames = {}
65
+featureNames['r1 s1'] = r'$T_{evap}$'
66
+featureNames['r1 s4'] = r'$T_{cond}$'
67
+featureNames['r1 s5'] = r'$T_{air}$'
68
+
69
+unitNames = {}
70
+unitNames['r1 s1'] = r'($^o$C)'
71
+unitNames['r1 s4'] = r'($^o$C)'
72
+unitNames['r1 s5'] = r'($^o$C)'
73
+
74
+NumFeatures = len(features)
75
+
76
+#####################################################################################################
77
+
78
+# Load and merge training data
79
+dataTrain = []
80
+for class_files in datafiles[0]:
81
+  class_dfs = []
82
+  for base_filename in class_files:
83
+    script_dir = os.path.dirname(os.path.abspath(__file__))
84
+    data_dir = os.path.join(script_dir, 'data')
85
+    filepath = os.path.join(data_dir, f'{base_filename}.csv')
86
+    try:
87
+      df = pd.read_csv(filepath)
88
+      df['timestamp'] = pd.to_datetime(df['datetime'], format='%m/%d/%Y %H:%M', errors='coerce')
89
+      df['timestamp'] = df['timestamp'].fillna(pd.to_datetime(df['datetime'], format='%d-%m-%Y %H:%M:%S', errors='coerce'))
90
+      for col in features:
91
+        df[col] = pd.to_numeric(df[col], errors='coerce')
92
+      df = df.set_index('timestamp').resample('5Min')[features].mean()  # Resample and calculate mean only for features
93
+      df = df[features].interpolate()  # Estimate missing values using linear interpolation
94
+      class_dfs.append(df)
95
+    except FileNotFoundError:
96
+      print(f"Warning: File {filepath} not found and skipped.") 
97
+  if class_dfs:
98
+    dataTrain.append(pd.concat(class_dfs))
99
+
100
+# Concatenate all training data into a single DataFrame
101
+combined_train_data = pd.concat(dataTrain)
102
+
103
+# Load and merge test data
104
+dataTest = []
105
+for class_files in datafiles[1]:
106
+  class_dfs = []
107
+  for base_filename in class_files:
108
+    script_dir = os.path.dirname(os.path.abspath(__file__))
109
+    data_dir = os.path.join(script_dir, 'data')
110
+    filepath = os.path.join(data_dir, f'{base_filename}.csv')
111
+    try:
112
+      df = pd.read_csv(filepath)
113
+      df['timestamp'] = pd.to_datetime(df['datetime'], format='%m/%d/%Y %H:%M', errors='coerce')
114
+      df['timestamp'] = df['timestamp'].fillna(pd.to_datetime(df['datetime'], format='%d-%m-%Y %H:%M:%S', errors='coerce'))
115
+      for col in features:
116
+        df[col] = pd.to_numeric(df[col], errors='coerce')
117
+      df = df.set_index('timestamp').resample('5Min')[features].mean()  # Resample and calculate mean only for features
118
+      df = df[features].interpolate()  # Estimate missing values using linear interpolation
119
+      class_dfs.append(df)
120
+    except FileNotFoundError:
121
+      print(f"Warning: File {filepath} not found and skipped.")
122
+  if class_dfs:
123
+    dataTest.append(pd.concat(class_dfs))
124
+
125
+# Plot data before training
126
+if options.plot_before:
127
+  num_features = len(features)
128
+  fig, axes = plt.subplots(num_features, 1, figsize=(15, 5 * num_features), sharex=True)
129
+  for i, feature in enumerate(features):
130
+    for k, df in enumerate(dataTrain):
131
+      axes[i].plot(df.index, df[feature], label=f'Train Class {k}')
132
+    axes[i].set_ylabel(f'{featureNames[feature]} {unitNames[feature]}')
133
+    axes[i].set_title(f'Train Data - {featureNames[feature]}')
134
+    axes[i].legend()
135
+  plt.tight_layout()
136
+  plt.show()
137
+
138
+########################################################################################################
139
+
140
+# Normalize the data
141
+scaler = StandardScaler()
142
+scaled_train_data = scaler.fit_transform(combined_train_data[features])  # Normalize only the features
143
+
144
+scaled_test_data_list = []
145
+for df in dataTest:
146
+  scaled_test_data_list.append(scaler.transform(df[features]))  # Normalize only the features
147
+
148
+# Convert normalized data to DataFrame for easier handling
149
+scaled_train_df = pd.DataFrame(scaled_train_data, columns=features, index=combined_train_data.index)
150
+scaled_test_df_list = [pd.DataFrame(data, columns=features, index=df.index) for data, df in zip(scaled_test_data_list, dataTest)]
151
+
152
+############################################################################################################
153
+
154
+# Create time sequences
155
+def create_sequences(data, timesteps):
156
+  sequences = []
157
+  for i in range(len(data) - timesteps + 1):
158
+    sequences.append(data[i:i + timesteps])
159
+  return np.array(sequences)
160
+
161
+# Create time sequences for training data
162
+X_train_sequences = create_sequences(scaled_train_df.values, timesteps)
163
+n_samples, n_timesteps, n_features = X_train_sequences.shape
164
+X_train_reshaped = X_train_sequences.reshape(n_samples, n_timesteps * n_features)
165
+
166
+# Create time sequences for test data
167
+X_test_sequences_list = [create_sequences(df.values, timesteps) for df in scaled_test_df_list]
168
+X_test_reshaped_list = [seq.reshape(seq.shape[0], seq.shape[1] * seq.shape[2]) for seq in X_test_sequences_list]
169
+
170
+############################################################################################################
171
+
172
+# Train the K-Means model
173
+kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)  # multiple initializations reduce the risk of a poor local optimum
174
+kmeans.fit(X_train_reshaped)
175
+train_labels = kmeans.labels_
176
+
177
+# Evaluate clustering quality for training data
178
+silhouette_avg_train = silhouette_score(X_train_reshaped, train_labels)
179
+db_index_train = davies_bouldin_score(X_train_reshaped, train_labels)
180
+print("\nK-Means Model Evaluation on Training Data:")
181
+print(f"\nSilhouette Score (Train Data): {silhouette_avg_train:.4f}")
182
+print(f"Davies-Bouldin Index (Train Data): {db_index_train:.4f}")
183
+
184
+# Predict clusters for test data
185
+cluster_labels_test_list = [kmeans.predict(X_test_reshaped) for X_test_reshaped in X_test_reshaped_list]
186
+
187
+# Evaluate clustering quality for test data (if there is more than one sample)
188
+for i, labels_test in enumerate(cluster_labels_test_list):
189
+  if len(X_test_reshaped_list[i]) > 1 and len(np.unique(labels_test)) > 1:
190
+    silhouette_avg_test = silhouette_score(X_test_reshaped_list[i], labels_test)
191
+    db_index_test = davies_bouldin_score(X_test_reshaped_list[i], labels_test)
192
+    print(f"\nSilhouette Score (Test Data - Set {i+1}): {silhouette_avg_test:.4f}")
193
+    print(f"Davies-Bouldin Index (Test Data - Set {i+1}): {db_index_test:.4f}")
194
+  else:
195
+    print(f"\nCannot calculate Silhouette Score and Davies-Bouldin Index for Test Data - Set {i+1} due to insufficient samples or only one cluster.")
196
+############################################################################################################################
197
+
198
+# Function to plot data with assigned clusters
199
+def plot_clustered_data(original_data_list, cluster_labels_list, n_clusters, features, featureNames, unitNames, title_prefix=""):
200
+  num_features = len(features)
201
+  fig, axes = plt.subplots(num_features, 1, figsize=(15, 5 * num_features), sharex=True)
202
+  colors = plt.cm.viridis(np.linspace(0, 1, n_clusters))  # Assign colors to each cluster
203
+
204
+  for k, df in enumerate(original_data_list):
205
+    # Change: Directly use DataFrame index
206
+    df_index = df.index
207
+    labels = cluster_labels_list[k]
208
+    for i, feature in enumerate(features):
209
+      for cluster_id in range(n_clusters):
210
+        cluster_indices_kmeans = np.where(labels == cluster_id)[0]
211
+        # Ensure indices are within the range of df_index
212
+        if len(cluster_indices_kmeans) > 0 and len(df_index) > cluster_indices_kmeans[-1]:
213
+          valid_indices = df_index[cluster_indices_kmeans]
214
+          axes[i].scatter(valid_indices, df[feature].loc[valid_indices], color=colors[cluster_id], label=f'Cluster {cluster_id}' if k == 0 else "", s=10)
215
+      axes[i].set_ylabel(f'{featureNames[feature]} {unitNames[feature]}')
216
+      axes[i].set_title(f'{title_prefix} - {featureNames[feature]}')
217
+      axes[i].legend(loc='upper right')
218
+
219
+  plt.tight_layout()
220
+  plt.show()
221
+
222
+# Plot data after clustering (training)
223
+if options.plot_after:
224
+  # To plot training data, we need to map sequences back to original data
225
+  train_original_indices = combined_train_data.index[timesteps - 1:]
226
+  plot_df_train = pd.DataFrame(scaled_train_df.values[timesteps - 1:], index=train_original_indices, columns=features)
227
+
228
+  # Change: Slice train_labels to the length of plot_df_train
229
+  trimmed_train_labels = train_labels[:len(plot_df_train)]
230
+
231
+  plot_clustered_data([plot_df_train], [trimmed_train_labels], n_clusters, features, featureNames, unitNames, title_prefix="Train Data Clusters")
232
+
233
+    # Plot scatter plot of clusters for training data (using the first two features)
234
+  if len(features) >= 2:
235
+    plt.figure(figsize=(10, 8))
236
+    scatter = plt.scatter(scaled_train_df[features[0]][:len(plot_df_train)], scaled_train_df[features[1]][:len(plot_df_train)], c=trimmed_train_labels, cmap='viridis')
237
+    plt.xlabel(f'{featureNames[features[0]]} {unitNames[features[0]]}')
238
+    plt.ylabel(f'{featureNames[features[1]]} {unitNames[features[1]]}')
239
+    plt.title('Clusters of Train Data (Features 1 vs 2)')
240
+    plt.colorbar(scatter, label='Cluster ID')
241
+    plt.show()
242
+
243
+# Plot test data with predicted clusters
244
+if options.plot_transition:
245
+  plot_clustered_data(dataTest, cluster_labels_test_list, n_clusters, features, featureNames, unitNames, title_prefix="Test Data Clusters")
246
+
247
+    # Plot scatter plot of clusters for test data (using the first two features)
248
+  for i, labels_test in enumerate(cluster_labels_test_list):  
249
+    if len(features) >= 2 and len(scaled_test_df_list[i]) > timesteps - 1:
250
+      # Change: Ensure label length matches data length
251
+      test_labels_to_plot = labels_test[:len(scaled_test_df_list[i]) - (timesteps - 1)]
252
+      if len(test_labels_to_plot) > 0:
253
+        plt.figure(figsize=(10, 8))
254
+        scatter = plt.scatter(scaled_test_df_list[i][features[0]][timesteps-1:], scaled_test_df_list[i][features[1]][timesteps-1:], c=test_labels_to_plot, cmap='viridis')
255
+        plt.xlabel(f'{featureNames[features[0]]} {unitNames[features[0]]}')
256
+        plt.ylabel(f'{featureNames[features[1]]} {unitNames[features[1]]}')
257
+        plt.title(f'Clusters of Test Data - Set {i+1} (Features 1 vs 2)')
258
+        plt.colorbar(scatter, label='Cluster ID')
259
+        plt.show()
260
+
261
+#####################################################################################################
262
+# Anomaly detection based on distance from cluster center
263
+#####################################################################################################
264
+
265
+def calculate_anomaly_scores(data_reshaped, kmeans_model):
266
+  # Distance of each sample to its assigned (nearest) cluster centre.
+  # Labels are re-predicted here because kmeans_model.labels_ only covers the
+  # training set, so indexing it with test-sample positions would be wrong.
+  labels = kmeans_model.predict(data_reshaped)
267
+  distances = []
268
+  for i in range(len(data_reshaped)):
269
+    distances.append(np.linalg.norm(data_reshaped[i] - kmeans_model.cluster_centers_[labels[i]]))
270
+  return np.array(distances)
271
+
272
+train_anomaly_scores = calculate_anomaly_scores(X_train_reshaped, kmeans)
273
+
274
+def plot_anomalies(original_data_list, anomaly_scores_list, threshold_factor=3):
275
+  num_features = len(features)
276
+  fig, axes = plt.subplots(num_features, 1, figsize=(15, 5 * num_features), sharex=True)
277
+  for k, df in enumerate(original_data_list):
278
+    time_index = df.index[timesteps - 1:]
279
+    anomaly_scores = anomaly_scores_list[k]
280
+    threshold = np.mean(anomaly_scores) + threshold_factor * np.std(anomaly_scores)
281
+    anomalous_indices = np.where(anomaly_scores > threshold)[0]
282
+    for i, feature in enumerate(features):
283
+      # Ensure lengths match for plotting
284
+      # Plot full original data dimly
285
+      axes[i].plot(df.index, df[feature], label=f'Normal', alpha=0.7)
286
+
287
+      if len(time_index) == len(anomaly_scores): # Check alignment
288
+        # Now mark anomalies on the original data time axis
289
+        anomaly_times = time_index[anomalous_indices]
290
+        axes[i].scatter(anomaly_times, df[feature].loc[anomaly_times], color='red', label=f'Anomaly' if k == 0 else "", s=20, zorder=5) # Use zorder
291
+      else:
292
+        print(f"Warning: Data length mismatch for plotting anomalies in file {k}. Skipping anomaly plot for this file.")
293
+
294
+      axes[i].set_ylabel(f'{featureNames[feature]} {unitNames[feature]}')
295
+      axes[i].set_title(f'Potential Anomalies - {featureNames[feature]}')
296
+      # Only add a single legend across all subplots
297
+  handles, labels = [], []
298
+  for ax in axes:
299
+   for h, l in zip(*ax.get_legend_handles_labels()):
300
+    if l not in labels:
301
+     handles.append(h)
302
+     labels.append(l)
303
+  if handles:
304
+   axes[0].legend(handles, labels, loc='upper right') # Add legend to the first subplot
305
+
306
+
307
+  plt.tight_layout()
308
+  plt.show()
309
+
310
+
311
+if options.plot_anomalies:
312
+  test_anomaly_scores_list = [calculate_anomaly_scores(X_test_reshaped, kmeans) for X_test_reshaped in X_test_reshaped_list]
313
+  # Need original data frames for plotting
314
+  plot_anomalies(dataTest, test_anomaly_scores_list)
315
+
316
+#####################################################################################################
317
+# Display clusters that may contain anomalies
318
+#####################################################################################################
319
+
320
+def plot_anomaly_clusters(original_data_list, cluster_labels_list, anomaly_scores_list, n_clusters, features, featureNames, unitNames, threshold_factor=3):
321
+  num_features = len(features)
322
+  fig, axes = plt.subplots(num_features, 1, figsize=(15, 5 * num_features), sharex=True)
323
+  colors = plt.cm.viridis(np.linspace(0, 1, n_clusters))
324
+
325
+  for k, df in enumerate(original_data_list):
326
+    time_index = df.index[timesteps - 1:]
327
+    cluster_labels = cluster_labels_list[k]
328
+    anomaly_scores = anomaly_scores_list[k]
329
+    threshold = np.mean(anomaly_scores) + threshold_factor * np.std(anomaly_scores)
330
+    anomalous_indices = np.where(anomaly_scores > threshold)[0]
331
+    anomalous_clusters = np.unique(cluster_labels[anomalous_indices])
332
+
333
+    for i, feature in enumerate(features):
334
+      # Plot all data points colored by their cluster
335
+      for cluster_id in range(n_clusters):
336
+        cluster_indices = np.where(cluster_labels == cluster_id)[0]
337
+        # Ensure cluster_indices are valid for time_index
338
+        if len(cluster_indices) > 0:
339
+          current_time_index = time_index[cluster_indices]
340
+          # Need to get corresponding values from original_df using time index
341
+          axes[i].scatter(current_time_index, df[feature].loc[current_time_index], color=colors[cluster_id], label=f'Cluster {cluster_id}' if k == 0 else "", s=10, alpha=0.5)
342
+
343
+      # Highlight anomaly clusters - Plot ALL points belonging to the identified anomaly clusters
344
+      # Loop through clusters that contain at least one anomalous point
345
+      for anomaly_cluster in anomalous_clusters:
346
+        # Find ALL indices that belong to this anomaly cluster
347
+        cluster_all_indices = np.where(cluster_labels == anomaly_cluster)[0]
348
+        if len(cluster_all_indices) > 0:
349
+          current_time_index = time_index[cluster_all_indices]
350
+          # Plot these points with a distinct color/marker
351
+          axes[i].scatter(current_time_index, df[feature].loc[current_time_index], color='red', label=f'Anomaly Cluster {anomaly_cluster}' if k == 0 else "", s=30, zorder=5)
352
+
353
+
354
+      axes[i].set_ylabel(f'{featureNames[feature]} {unitNames[feature]}')
355
+      axes[i].set_title(f'Anomaly Clusters - {featureNames[feature]}')
356
+  # Only add a single legend across all subplots
357
+  handles, labels = [], []
358
+  for ax in axes:
359
+   for h, l in zip(*ax.get_legend_handles_labels()):
360
+    if l not in labels:
361
+     handles.append(h)
362
+     labels.append(l)
363
+  if handles:
364
+   axes[0].legend(handles, labels, loc='upper right') # Add legend to the first subplot
365
+
366
+
367
+  plt.tight_layout()
368
+  plt.show()
369
+
370
+
371
+if options.plot_anomaly_clusters:
372
+  test_anomaly_scores_list = [calculate_anomaly_scores(X_test_reshaped, kmeans) for X_test_reshaped in X_test_reshaped_list]
373
+  plot_anomaly_clusters(dataTest, cluster_labels_test_list, test_anomaly_scores_list, n_clusters, features, featureNames, unitNames)
374
+
375
+
376
+#####################################################################################################
377
+# Evaluation with manual labels (if provided)
378
+#####################################################################################################
379
+
380
+if options.manual_anomaly_labels:
381
+  # Compute anomaly scores here so evaluation also works when no anomaly plotting flag was passed
+  test_anomaly_scores_list = [calculate_anomaly_scores(X_test_reshaped, kmeans) for X_test_reshaped in X_test_reshaped_list]
+  manual_labels_df = pd.read_csv(options.manual_anomaly_labels, index_col=0)
382
+  all_true_labels = []
383
+  all_predicted_anomalies = []
384
+
385
+  for k, df in enumerate(dataTest):
386
+    # Ensure anomaly scores were calculated for this test set
387
+    if k >= len(test_anomaly_scores_list):
388
+     print(f"Warning: Anomaly scores not calculated for test set {k}. Skipping manual evaluation for this set.")
389
+     continue
390
+
391
+    time_index = df.index[timesteps - 1:] # Time index aligned with sequences
392
+    if not time_index.empty:
393
+      # Assuming the manual labels CSV has a boolean 'is_anomaly' column and its index aligns with the test data
394
+      relevant_labels = manual_labels_df.reindex(time_index, method='nearest')['is_anomaly'].fillna(False).astype(int)
395
+
396
+      anomaly_scores = test_anomaly_scores_list[k]
397
+      # Ensure lengths match before applying threshold
398
+      if len(anomaly_scores) == len(relevant_labels):
399
+        threshold = np.mean(anomaly_scores) + 3 * np.std(anomaly_scores) # Example threshold
400
+        predicted_anomalies = (anomaly_scores > threshold).astype(int)
401
+        all_true_labels.extend(relevant_labels.values)
402
+        all_predicted_anomalies.extend(predicted_anomalies)
403
+      else:
404
+       print(f"Warning: Mismatch in lengths between anomaly scores ({len(anomaly_scores)}) and manual labels ({len(relevant_labels)}) for test set {k}. Skipping evaluation for this set.")
405
+
406
+
407
+  if all_true_labels:
408
+    print("\nEvaluation using manual anomaly labels:")
409
+    print(classification_report(all_true_labels, all_predicted_anomalies, target_names=['Normal', 'Anomaly'], zero_division=0))
410
+    cm = confusion_matrix(all_true_labels, all_predicted_anomalies)
411
+    plt.figure(figsize=(8, 6))
412
+    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Normal', 'Anomaly'], yticklabels=['Normal', 'Anomaly'])
413
+    plt.title('Confusion Matrix (Manual Labels)')
414
+    plt.xlabel('Predicted Label')
415
+    plt.ylabel('True Label')
416
+    plt.show()
417
+  else:
418
+    print("\nNo manual anomaly labels provided or aligned with test data.")
