import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Import both scalers
from sklearn.preprocessing import StandardScaler, RobustScaler
# LabelEncoder is not used in this version
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report, confusion_matrix, silhouette_score
import argparse
import os
import seaborn as sns

# Command line arguments setup
parser = argparse.ArgumentParser(description='Anomaly detection using K-Means clustering with Rate of Change, Change Point Detection, and Delayed Evaluation.')
parser.add_argument('--timesteps', type=int, default=20, help='Number of timesteps for sequences.')
parser.add_argument('--n_clusters', type=int, default=5, help='Number of clusters for K-Means.')
parser.add_argument('--n_init', type=int, default=10, help='Number of initializations for K-Means.')
parser.add_argument('--transition', action='store_true', help='Use transition data for testing.')
parser.add_argument('--plot_raw', action='store_true', help='Plot raw data.')
parser.add_argument('--plot_clustered', action='store_true', help='Plot clustered data.')
# plot_anomalies and plot_misclassified flags are not implemented in this version
parser.add_argument('--delay', type=int, default=10, help='Number of timesteps to delay evaluation after a change point.')
parser.add_argument('--show_change_points', action='store_true', help='Show change points on clustered plots.')
parser.add_argument('--use_standard_scaler', action='store_true', help='Use StandardScaler instead of RobustScaler.')
options = parser.parse_args()
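
# Example invocation (illustrative; the flag values shown are the defaults,
# and the script name is a placeholder for this file's actual name):
#   python kmeans_roc_cpd.py --timesteps 20 --n_clusters 5 --n_init 10 \
#       --delay 10 --plot_clustered --show_change_points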

# Parameters
n_clusters = options.n_clusters
timesteps = options.timesteps
n_init = options.n_init
delay_steps = options.delay
show_change_points = options.show_change_points
use_standard_scaler = options.use_standard_scaler

# Data loading: one list of file-name prefixes per class, for train [0] and test [1]
NumberOfFailures = 4
datafiles = [[], []]
for i in range(NumberOfFailures + 1):
    datafiles[0].append([])
    datafiles[1].append([])

datafiles[0][0] = ['2024-08-07_5_', '2024-08-08_5_', '2025-01-25_5_', '2025-01-26_5_']
datafiles[0][1] = ['2024-12-11_5_', '2024-12-12_5_', '2024-12-13_5_']
datafiles[0][2] = ['2024-12-18_5_', '2024-12-21_5_', '2024-12-22_5_', '2024-12-23_5_', '2024-12-24_5_']
datafiles[0][3] = ['2024-12-28_5_', '2024-12-29_5_', '2024-12-30_5_']
datafiles[0][4] = ['2025-02-13_5_', '2025-02-14_5_']

if options.transition:
    datafiles[1][0] = ['2025-01-27_5_', '2025-01-28_5_']
    datafiles[1][1] = ['2024-12-14_5_', '2024-12-15_5_', '2024-12-16_5_']
    datafiles[1][2] = ['2024-12-17_5_', '2024-12-19_5_', '2024-12-25_5_', '2024-12-26_5_']
    datafiles[1][3] = ['2024-12-27_5_', '2024-12-31_5_', '2025-01-01_5_']
    datafiles[1][4] = ['2025-02-12_5_', '2025-02-15_5_', '2025-02-16_5_']
else:
    datafiles[1][0] = ['2025-01-27_5_', '2025-01-28_5_']
    datafiles[1][1] = ['2024-12-14_5_', '2024-12-15_5_']
    datafiles[1][2] = ['2024-12-19_5_', '2024-12-25_5_', '2024-12-26_5_']
    datafiles[1][3] = ['2024-12-31_5_', '2025-01-01_5_']
    datafiles[1][4] = ['2025-02-15_5_', '2025-02-16_5_']

features = ['r1 s1', 'r1 s4', 'r1 s5']
n_original_features = len(features)  # Original feature count, needed once rate-of-change columns are appended
# Standard LaTeX formatting for display names
featureNames = {'r1 s1': r'$T_{evap}$', 'r1 s4': r'$T_{cond}$', 'r1 s5': r'$T_{air}$'}
unitNames = {'r1 s1': r'($^o$C)', 'r1 s4': r'($^o$C)', 'r1 s5': r'($^o$C)'}
NumFeatures = len(features)  # Used for indexing features[:NumFeatures]


# Load and preprocess data: parse timestamps (two formats occur in the files),
# coerce features to numeric, resample to 5-minute means, interpolate gaps.
script_dir = os.path.dirname(os.path.abspath(__file__))
data_dir = os.path.join(script_dir, 'data')

def load_class_dataframes(class_files):
    """Load and preprocess the CSV files belonging to one class."""
    class_dfs = []
    for base_filename in class_files:
        filepath = os.path.join(data_dir, f'{base_filename}.csv')
        try:
            df = pd.read_csv(filepath)
            df['timestamp'] = pd.to_datetime(df['datetime'], format='%m/%d/%Y %H:%M', errors='coerce')
            df['timestamp'] = df['timestamp'].fillna(pd.to_datetime(df['datetime'], format='%d-%m-%Y %H:%M:%S', errors='coerce'))
            for col in features:
                df[col] = pd.to_numeric(df[col], errors='coerce')
            df = df.set_index('timestamp').resample('5Min')[features].mean()
            df = df.interpolate()
            class_dfs.append(df)
        except FileNotFoundError:
            print(f"Warning: File {filepath} not found and skipped.")
    return class_dfs

dataTrain = []
for class_files in datafiles[0]:
    class_dfs = load_class_dataframes(class_files)
    if class_dfs:
        dataTrain.append(pd.concat(class_dfs))
combined_train_data = pd.concat(dataTrain)

dataTest = []
for class_files in datafiles[1]:
    class_dfs = load_class_dataframes(class_files)
    if class_dfs:
        dataTest.append(pd.concat(class_dfs))

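# Expected CSV layout (illustrative values; the column names are inferred from
# the parsing above: a 'datetime' column plus the raw sensor columns):
#   datetime,r1 s1,r1 s4,r1 s5
#   12/18/2024 00:00,-20.3,35.1,3.2
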
# Normalize data (RobustScaler by default; StandardScaler if --use_standard_scaler is set).
# RobustScaler centres on the median and scales by the IQR, so brief sensor
# spikes influence the scaling less than with StandardScaler (mean/std).
scaler = StandardScaler() if use_standard_scaler else RobustScaler()
scaled_train_data = scaler.fit_transform(combined_train_data[features])
scaled_test_data_list = []  # NumPy arrays, one per test class
for df in dataTest:
    scaled_test_data_list.append(scaler.transform(df[features]))

# Convert scaled data to DataFrames for easier handling and plotting
scaled_train_df = pd.DataFrame(scaled_train_data, columns=features, index=combined_train_data.index)
scaled_test_df_list = [pd.DataFrame(data, columns=features, index=df.index)
                       for data, df in zip(scaled_test_data_list, dataTest)]


# Create time sequences WITH rate of change appended as extra features
def create_sequences_with_rate_of_change(data, timesteps, original_features_count):
    """Return overlapping windows over `data` with first differences stacked on.

    `original_features_count` is kept for interface symmetry; each timestep of
    the output carries 2 * original_features_count values.
    """
    sequences = []
    # First differences over the whole array; prepending NaN keeps the shape
    # (len(data), original_features_count), with NaN in the first row
    rate_of_change_full = np.diff(data, axis=0, prepend=np.nan)

    # Horizontally stack original data and rate of change:
    # shape (len(data), 2 * original_features_count)
    combined_data = np.hstack((data, rate_of_change_full))

    # Start at i=1 so no window contains the NaN first-difference row
    for i in range(1, len(combined_data) - timesteps + 1):
        sequence = combined_data[i:i + timesteps]  # (timesteps, 2 * original_features_count)
        sequences.append(sequence)

    return np.array(sequences)
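
# Shape check (illustrative): with 100 scaled rows, 3 original features and
# timesteps=20, combined_data is (100, 6) and the i=1 start yields 80 windows:
#   >>> create_sequences_with_rate_of_change(np.zeros((100, 3)), 20, 3).shape
#   (80, 20, 6)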

X_train_sequences = create_sequences_with_rate_of_change(scaled_train_df.values, timesteps, n_original_features)

# Train K-Means model on all training data. Note: sequences are built over the
# concatenated training frame, so a few windows span file/class boundaries.
n_samples_train, n_timesteps_train, n_total_features_train = X_train_sequences.shape  # 2 * original features per timestep
X_train_reshaped = X_train_sequences.reshape(n_samples_train, n_timesteps_train * n_total_features_train)  # Flatten sequences
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=n_init)
kmeans.fit(X_train_reshaped)
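
# Dimensional note: each flattened sequence has timesteps * 2 * n_original_features
# components (20 * 6 = 120 with the defaults), so K-Means clusters points in
# R^120 with Euclidean distance.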

# Detect change points: sharp jumps or drops between consecutive samples.
# Works on a single 2D array (samples, features); it is applied to the
# original scaled features, BEFORE the rate-of-change columns are added.
def detect_change_points(data, threshold=0.8):
    change_points = []
    # Compare each point with its predecessor, starting from the second one
    for i in range(1, len(data)):
        difference = np.abs(data[i] - data[i - 1])
        # If the difference for ANY feature exceeds the threshold, mark a change point
        if np.any(difference > threshold):
            change_points.append(i)
    return np.array(change_points)
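
# Example (illustrative), with the threshold in scaled units:
#   >>> detect_change_points(np.array([[0.0], [0.1], [1.5], [1.6]]), threshold=0.8)
#   array([2])
# Only the 0.1 -> 1.5 jump exceeds 0.8, so index 2 is flagged.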

# Plot clustered data; accepts sequence time indices and can overlay change points
def plot_clustered_data(df, predicted_clusters, time_index, n_clusters, features, featureNames, unitNames, show_cp=False, change_point_indices=None):
    # 'features' here is the list of original feature names
    num_original_features = len(features)
    fig, axes = plt.subplots(num_original_features, 1, figsize=(15, 5 * num_original_features), sharex=True)
    if num_original_features == 1:
        axes = [axes]  # Ensure axes is always indexable
    colors = plt.cm.viridis(np.linspace(0, 1, n_clusters))

    for i, feature in enumerate(features):  # Only the original features are plotted
        # Plot data points colored by their assigned cluster
        for cluster_id in range(n_clusters):
            cluster_indices_kmeans = np.where(predicted_clusters == cluster_id)[0]
            if len(cluster_indices_kmeans) > 0:
                # time_index gives the x-axis; the original df gives the y values
                axes[i].scatter(time_index[cluster_indices_kmeans], df[feature].loc[time_index[cluster_indices_kmeans]],
                                color=colors[cluster_id], label=f'Cluster {cluster_id}', s=10, alpha=0.6)
        axes[i].set_ylabel(f'{featureNames[feature]} {unitNames[feature]}')
        axes[i].set_title(featureNames[feature])
        axes[i].grid(True, linestyle='--', alpha=0.6)

        # Overlay change points if enabled (label only on the first axis)
        if show_cp and change_point_indices is not None:
            for cp_time in change_point_indices:
                axes[i].axvline(x=cp_time, color='red', linestyle='--', linewidth=1.5,
                                label='Change Point' if i == 0 else '', alpha=0.8)

    # Build a combined legend without duplicate labels
    handles, labels = [], []
    for ax in axes:
        for handle, label in zip(*ax.get_legend_handles_labels()):
            if label not in labels:
                handles.append(handle)
                labels.append(label)
    if handles:
        axes[-1].legend(handles, labels, loc='upper right')

    plt.tight_layout()
    plt.show()


# Combined evaluation function (full and delayed)
def evaluate_and_report(kmeans_model, scaled_test_df_list, original_test_data_list, true_labels_list, timesteps, delay_steps, features, options, n_original_features):
    all_y_true_full = []  # True labels for all test sequences
    all_predicted_cluster_labels_full = []  # Predicted clusters for all test sequences
    all_change_points_detected_list = []  # Detected change point times, one list per test file

    # --- 1. Collect data and predict clusters for ALL test sequences ---
    for k, (scaled_df, original_df, y_true_categorical) in enumerate(zip(scaled_test_df_list, original_test_data_list, true_labels_list)):
        original_indices = original_df.index
        # Each sequence is labelled and plotted at its LAST timestamp. Windows
        # start at i=1, so sequence j covers rows j+1 .. j+timesteps and ends at
        # row j+timesteps; the aligned time axis is original_indices[timesteps:].
        time_index = original_indices[timesteps:]

        # Create sequences (this version appends rate-of-change features)
        sequences = create_sequences_with_rate_of_change(scaled_df.values, timesteps, n_original_features)

        if sequences.size == 0:
            print(f"Warning: No sequences generated for test file {k}. Skipping.")
            all_change_points_detected_list.append([])  # Keep one entry per file
            continue

        n_sequences = sequences.shape[0]
        reshaped_sequences = sequences.reshape(n_sequences, -1)
        predicted_clusters = kmeans_model.predict(reshaped_sequences)

        # True labels aligned with the sequences that were actually created:
        # the first window ends at row `timesteps`, so slice from there.
        all_y_true_full.extend(y_true_categorical[timesteps:])
        all_predicted_cluster_labels_full.extend(predicted_clusters)

        # Detect change points for this test file on the ORIGINAL scaled
        # features (scaled_df.values carries no rate-of-change columns)
        change_points = detect_change_points(scaled_df.values, threshold=0.8)  # Adjust threshold as needed

        # Map original-data change point indices to sequence indices. A change
        # at original row `cp` belongs to the sequence ENDING at that row;
        # since sequence j ends at row j + timesteps, that sequence index is
        # cp - timesteps. Keep only indices inside [0, n_sequences).
        change_point_sequence_indices = change_points - timesteps
        valid_change_point_sequence_indices = change_point_sequence_indices[(change_point_sequence_indices >= 0) & (change_point_sequence_indices < n_sequences)]
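
        # Worked example (illustrative): with timesteps=20, a jump at original
        # row 57 maps to sequence index 57 - 20 = 37, the window covering rows
        # 38..57, whose time_index entry is the timestamp of row 57 itself.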

        # Store the change point times for this file ([] when none survive the
        # mapping, so the name is always defined for the plotting call below)
        cp_time_indices = time_index[valid_change_point_sequence_indices].tolist() if valid_change_point_sequence_indices.size > 0 else []
        all_change_points_detected_list.append(cp_time_indices)

        # Plot clustered data for the current test file if requested and
        # transition=False (the transition case is plotted in the main block)
        if options.plot_clustered and not options.transition:
            print(f"\nClustered Data for Test File {k}:")
            plot_clustered_data(original_df, predicted_clusters, time_index, kmeans_model.n_clusters, features, featureNames, unitNames, show_cp=show_change_points, change_point_indices=cp_time_indices if show_change_points else None)

    # Convert collected lists to numpy arrays for evaluation
    all_y_true_full = np.array(all_y_true_full)
    all_predicted_cluster_labels_full = np.array(all_predicted_cluster_labels_full)

    # --- 2. FULL evaluation (all test sequences) ---
    print("\n--- Full Evaluation Results (All Test Sequences) ---")

    # Assign each cluster the dominant true label among the test sequences it received
    cluster_dominant_label_full = {}
    for cluster_id in range(kmeans_model.n_clusters):
        indices_in_cluster = np.where(all_predicted_cluster_labels_full == cluster_id)[0]
        if len(indices_in_cluster) > 0:
            labels_in_cluster = all_y_true_full[indices_in_cluster]
            cluster_dominant_label_full[cluster_id] = np.argmax(np.bincount(labels_in_cluster))
        else:
            cluster_dominant_label_full[cluster_id] = -1  # Empty cluster

    # Predicted label of a sequence = dominant label of its assigned cluster
    predicted_labels_numeric_full = np.array([cluster_dominant_label_full.get(cluster_id, -1) for cluster_id in all_predicted_cluster_labels_full])
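
    # Illustrative mapping: if cluster 3 received sequences with true labels
    # [2, 2, 2, 0], then np.bincount gives [1, 0, 3] and np.argmax picks label
    # 2, so every sequence assigned to cluster 3 is predicted as class 2.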

    # Evaluate on sequences whose cluster has a known dominant label
    valid_indices_full = predicted_labels_numeric_full != -1
    if np.sum(valid_indices_full) > 0 and len(np.unique(all_y_true_full[valid_indices_full])) > 1:
        print("Classification Report (Full Test Set):")
        print(classification_report(all_y_true_full[valid_indices_full], predicted_labels_numeric_full[valid_indices_full]))
        cm_full = confusion_matrix(all_y_true_full[valid_indices_full], predicted_labels_numeric_full[valid_indices_full])
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm_full, annot=True, fmt='d', cmap='Blues')
        plt.xlabel('Predicted Label (Dominant True Label of Cluster)')
        plt.ylabel('True Label')
        plt.title('Confusion Matrix (Full Test Set)')
        plt.show()
    else:
        print("Could not perform full evaluation (not enough data or classes after mapping).")

    # --- 3. DELAYED evaluation (skip a window after each change point) ---
    print("\n--- Delayed Evaluation Results (Subset after Delay) ---")

    all_y_true_delayed = []
    all_predicted_cluster_labels_delayed = []

    # Apply the delay logic per file, reusing the full-set predictions
    sequence_count_so_far = 0
    for k, (scaled_df, original_df, y_true_categorical) in enumerate(zip(scaled_test_df_list, original_test_data_list, true_labels_list)):
        sequences = create_sequences_with_rate_of_change(scaled_df.values, timesteps, n_original_features)
        if sequences.size == 0:
            continue  # This file contributed no sequences in pass 1 either

        n_sequences_file = sequences.shape[0]

        # Detect change points for THIS file (original scaled features)
        change_points = detect_change_points(scaled_df.values, threshold=0.8)

        # Mask out sequences inside the delay window after each change point
        evaluation_allowed_file = np.ones(n_sequences_file, dtype=bool)
        change_point_sequence_indices_file = change_points - timesteps
        valid_change_point_sequence_indices_file = change_point_sequence_indices_file[(change_point_sequence_indices_file >= 0) & (change_point_sequence_indices_file < n_sequences_file)]

        for cp_seq_index in valid_change_point_sequence_indices_file:
            end_delay = min(n_sequences_file, cp_seq_index + delay_steps)
            evaluation_allowed_file[cp_seq_index:end_delay] = False
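
        # Example (illustrative): with delay_steps=10 and a change point mapped
        # to sequence index 25, indices 25..34 are masked out, i.e.
        # evaluation_allowed_file[25:35] = False.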

        # Collect data for the delayed evaluation (only where allowed), reusing
        # the dominant-label mapping computed on the FULL test set. Slice this
        # file's span out of the full prediction and label arrays.
        predicted_clusters_file = all_predicted_cluster_labels_full[sequence_count_so_far : sequence_count_so_far + n_sequences_file]
        predicted_labels_numeric_file = np.array([cluster_dominant_label_full.get(cluster, -1) for cluster in predicted_clusters_file])
        true_labels_file = all_y_true_full[sequence_count_so_far : sequence_count_so_far + n_sequences_file]

        all_y_true_delayed.extend(true_labels_file[evaluation_allowed_file])
        all_predicted_cluster_labels_delayed.extend(predicted_labels_numeric_file[evaluation_allowed_file])

        sequence_count_so_far += n_sequences_file  # Advance the slice offset

    all_y_true_delayed = np.array(all_y_true_delayed)
    all_predicted_cluster_labels_delayed = np.array(all_predicted_cluster_labels_delayed)

    # Perform the delayed evaluation
    valid_indices_delayed = all_predicted_cluster_labels_delayed != -1
    if np.sum(valid_indices_delayed) > 0 and len(np.unique(all_y_true_delayed[valid_indices_delayed])) > 1:
        print("Classification Report (Subset after Delay):")
        print(classification_report(all_y_true_delayed[valid_indices_delayed], all_predicted_cluster_labels_delayed[valid_indices_delayed]))
        cm_delayed = confusion_matrix(all_y_true_delayed[valid_indices_delayed], all_predicted_cluster_labels_delayed[valid_indices_delayed])
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm_delayed, annot=True, fmt='d', cmap='Blues')
        plt.xlabel('Predicted Label (Delayed)')
        plt.ylabel('True Label')
        plt.title('Confusion Matrix (Subset after Delay)')
        plt.show()
    else:
        print("Could not perform delayed evaluation (not enough data after delay or too few classes).")

    # --- 4. Report detected change points ---
    print("\nDetected Change Points (time of the first sample after each jump):")
    for i, cp_list in enumerate(all_change_points_detected_list):
        print(f"File {i}: {cp_list}")

    # Note: anomaly and misclassified-instance plotting is not implemented in
    # this version because of the extra bookkeeping the delayed subset requires.

    # Return collected data for further processing (e.g., plotting)
    return all_y_true_full, all_predicted_cluster_labels_full, all_change_points_detected_list

# Main execution
if __name__ == "__main__":
    # Training and test data were loaded and preprocessed at module level above

    # True labels for the test data: file group i carries class label i
    true_labels_list = []
    for i, df in enumerate(dataTest):
        true_labels_list.append(np.full(len(df), i))

    # Plot raw data if requested
    if options.plot_raw:
        print("\nPlotting Raw Data:")
        num_features = len(features)
        fig, axes = plt.subplots(num_features, 1, figsize=(15, 5 * num_features), sharex=True)
        if num_features == 1:
            axes = [axes]
        for i, feature in enumerate(features):
            for k, df in enumerate(dataTest):
                axes[i].plot(df.index, df[feature], label=f'Class {k}', alpha=0.7)
            axes[i].set_ylabel(f'{featureNames[feature]} {unitNames[feature]}')
            axes[i].set_title(featureNames[feature])
        axes[-1].legend(loc='upper right')  # Legend on the last subplot
        plt.tight_layout()
        plt.show()

    # Cluster the training set for plotting
    train_sequences = create_sequences_with_rate_of_change(scaled_train_df.values, timesteps, n_original_features)
    train_reshaped_sequences = train_sequences.reshape(train_sequences.shape[0], -1)
    train_predicted_clusters = kmeans.predict(train_reshaped_sequences)
    train_time_index = combined_train_data.index[timesteps:]  # Sequence j ends at row j + timesteps
    if options.plot_clustered:
        print("\nClustered Data for Training Set:")
        # Optional change point detection on the original scaled training features
        train_change_points = detect_change_points(scaled_train_df.values, threshold=0.8)
        train_change_point_sequence_indices = train_change_points - timesteps
        valid_train_change_point_sequence_indices = train_change_point_sequence_indices[(train_change_point_sequence_indices >= 0) & (train_change_point_sequence_indices < train_sequences.shape[0])]
        train_cp_time_indices = train_time_index[valid_train_change_point_sequence_indices].tolist() if valid_train_change_point_sequence_indices.size > 0 else None

        # Use combined_train_data for plotting the original (unscaled) values
        plot_clustered_data(combined_train_data.loc[train_time_index], train_predicted_clusters, train_time_index, n_clusters, features, featureNames, unitNames, show_cp=show_change_points, change_point_indices=train_cp_time_indices)

    # Plot clustered data for the TEST set (per file) if requested, including
    # per-file change points
    if options.plot_clustered:
        print("\nClustered Data for Test Sets (per file):")
        for k, (scaled_df, original_df) in enumerate(zip(scaled_test_df_list, dataTest)):
            original_indices = original_df.index
            time_index = original_indices[timesteps:]
            sequences = create_sequences_with_rate_of_change(scaled_df.values, timesteps, n_original_features)
            if sequences.size == 0:
                continue
            reshaped_sequences = sequences.reshape(sequences.shape[0], -1)
            predicted_clusters = kmeans.predict(reshaped_sequences)

            # Change point detection for this test file (original scaled features)
            change_points = detect_change_points(scaled_df.values, threshold=0.8)
            change_point_sequence_indices = change_points - timesteps
            valid_change_point_sequence_indices = change_point_sequence_indices[(change_point_sequence_indices >= 0) & (change_point_sequence_indices < sequences.shape[0])]
            cp_time_indices = time_index[valid_change_point_sequence_indices].tolist() if valid_change_point_sequence_indices.size > 0 else None

            print(f"  Plotting Test File {k}")
            # Use original_df for plotting the original (unscaled) values
            plot_clustered_data(original_df, predicted_clusters, time_index, n_clusters, features, featureNames, unitNames, show_cp=show_change_points, change_point_indices=cp_time_indices)

    # Perform evaluation (full and delayed); this handles all reporting and
    # confusion matrix plotting
    all_y_true_full, all_predicted_cluster_labels_full, all_change_points_detected_list = evaluate_and_report(kmeans, scaled_test_df_list, dataTest, true_labels_list, timesteps, delay_steps, features, options, n_original_features)

    # Overall metrics on the combined test data. Build the combined test
    # sequences once and guard the all-empty case before stacking.
    test_sequence_arrays = [create_sequences_with_rate_of_change(df.values, timesteps, n_original_features) for df in scaled_test_df_list]
    test_sequence_arrays = [s for s in test_sequence_arrays if s.size > 0]
    X_test_sequences_combined = np.vstack(test_sequence_arrays) if test_sequence_arrays else np.empty((0, 0, 0))

    if X_test_sequences_combined.size > 0 and all_predicted_cluster_labels_full.size > 0:
        X_test_combined_reshaped = X_test_sequences_combined.reshape(X_test_sequences_combined.shape[0], -1)

        print("\n--- K-Means Model Evaluation (Overall Metrics on Combined Test Data) ---")
        print(f"Inertia: {kmeans.inertia_:.4f}")  # Inertia comes from the training fit

        # Silhouette score on the combined test data, using the cluster labels
        # predicted for the full test set
        if len(np.unique(all_predicted_cluster_labels_full)) > 1:
            try:
                silhouette = silhouette_score(X_test_combined_reshaped, all_predicted_cluster_labels_full)
                print(f"Silhouette Score: {silhouette:.4f}")
            except ValueError as e:
                print(f"Silhouette Score: Could not calculate ({e})")
        else:
            print("Silhouette Score: not applicable with a single cluster on the combined test data.")
    else:
        print("\n--- K-Means Model Evaluation (Overall Metrics) ---")
        print("No test data sequences available or processed for overall evaluation metrics.")

    # Note: anomaly and misclassified-instance plotting is not implemented in this version.
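
# Interpretation note (general fact, not a result from this dataset): the
# silhouette score lies in [-1, 1], and values near 0 indicate overlapping
# clusters. Here it is computed with Euclidean distance on the flattened
# sequence vectors (120-dimensional with the default flags), so it measures
# separation in that space, not in raw temperatures.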