
Add kmeans_anomaly_change points folder and its contents: robust_scaler_roc_cp_delayed.py and standard_scaler_cp_delayed.py

Masoud Hosseini, 2 weeks ago
commit 90b3c380c6

+502 -0  kmeans_anomaly_change points/robust_scaler_roc_cp_delayed.py

@@ -0,0 +1,502 @@
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+# Import both scalers
+from sklearn.preprocessing import StandardScaler, RobustScaler
+from sklearn.cluster import KMeans
+from sklearn.metrics import classification_report, confusion_matrix, silhouette_score
+import argparse
+import os
+import seaborn as sns
+
+# Command line arguments setup
+parser = argparse.ArgumentParser(description='Anomaly detection using K-Means clustering with Rate of Change, Change Point Detection, and Delayed Evaluation.')
+parser.add_argument('--timesteps', type=int, default=20, help='Number of timesteps for sequences.')
+parser.add_argument('--n_clusters', type=int, default=5, help='Number of clusters for K-Means.')
+parser.add_argument('--n_init', type=int, default=10, help='Number of initializations for K-Means.')
+parser.add_argument('--transition', action='store_true', help='Use transition data for testing.')
+parser.add_argument('--plot_raw', action='store_true', help='Plot raw data.')
+parser.add_argument('--plot_clustered', action='store_true', help='Plot clustered data.')
+# Note: anomaly and misclassified-instance plotting flags are not implemented in this version
+parser.add_argument('--delay', type=int, default=10, help='Number of timesteps to delay evaluation after a change point.')
+parser.add_argument('--show_change_points', action='store_true', help='Show change points on clustered plots.')
+parser.add_argument('--use_standard_scaler', action='store_true', help='Use StandardScaler instead of RobustScaler.')
+options = parser.parse_args()
+
+# Parameters
+n_clusters = options.n_clusters
+timesteps = options.timesteps
+n_init = options.n_init
+delay_steps = options.delay
+show_change_points = options.show_change_points
+use_standard_scaler = options.use_standard_scaler
+
+# Data loading (same as previous code)
+NumberOfFailures = 4
+datafiles = [[], []]
+for i in range(NumberOfFailures + 1):
+  datafiles[0].append([])
+  datafiles[1].append([])
+
+datafiles[0][0] = ['2024-08-07_5_', '2024-08-08_5_', '2025-01-25_5_', '2025-01-26_5_']
+datafiles[0][1] = ['2024-12-11_5_', '2024-12-12_5_', '2024-12-13_5_']
+datafiles[0][2] = ['2024-12-18_5_', '2024-12-21_5_', '2024-12-22_5_', '2024-12-23_5_', '2024-12-24_5_']
+datafiles[0][3] = ['2024-12-28_5_', '2024-12-29_5_', '2024-12-30_5_']
+datafiles[0][4] = ['2025-02-13_5_', '2025-02-14_5_']
+
+if options.transition:
+  datafiles[1][0] = ['2025-01-27_5_', '2025-01-28_5_']
+  datafiles[1][1] = ['2024-12-14_5_', '2024-12-15_5_', '2024-12-16_5_']
+  datafiles[1][2] = ['2024-12-17_5_', '2024-12-19_5_', '2024-12-25_5_', '2024-12-26_5_']
+  datafiles[1][3] = ['2024-12-27_5_', '2024-12-31_5_', '2025-01-01_5_']
+  datafiles[1][4] = ['2025-02-12_5_', '2025-02-15_5_', '2025-02-16_5_']
+else:
+  datafiles[1][0] = ['2025-01-27_5_', '2025-01-28_5_']
+  datafiles[1][1] = ['2024-12-14_5_', '2024-12-15_5_']
+  datafiles[1][2] = ['2024-12-19_5_', '2024-12-25_5_', '2024-12-26_5_']
+  datafiles[1][3] = ['2024-12-31_5_', '2025-01-01_5_']
+  datafiles[1][4] = ['2025-02-15_5_', '2025-02-16_5_']
+
+features = ['r1 s1', 'r1 s4', 'r1 s5']
+n_original_features = len(features) # Store original number of features for sequence creation with ROC
+# Using standard LaTeX formatting for display names
+featureNames = {'r1 s1': r'$T_{evap}$', 'r1 s4': r'$T_{cond}$', 'r1 s5': r'$T_{air}$'}
+unitNames = {'r1 s1': r'($^o$C)', 'r1 s4': r'($^o$C)', 'r1 s5': r'($^o$C)'}
+NumFeatures = len(features)
+
+
+# Load and preprocess data (same as previous code)
+dataTrain = []
+for class_files in datafiles[0]:
+  script_dir = os.path.dirname(os.path.abspath(__file__))
+  data_dir = os.path.join(script_dir, 'data')
+  class_dfs = []
+  for base_filename in class_files:
+    filepath = os.path.join(data_dir, f'{base_filename}.csv')
+    try:
+      df = pd.read_csv(filepath)
+      df['timestamp'] = pd.to_datetime(df['datetime'], format='%m/%d/%Y %H:%M', errors='coerce')
+      df['timestamp'] = df['timestamp'].fillna(pd.to_datetime(df['datetime'], format='%d-%m-%Y %H:%M:%S', errors='coerce'))
+      for col in features:
+        df[col] = pd.to_numeric(df[col], errors='coerce')
+      df = df.set_index('timestamp').resample('5Min')[features].mean()
+      df = df[features].interpolate()
+      class_dfs.append(df)
+    except FileNotFoundError:
+      print(f"Warning: File {filepath} not found and skipped.")
+  if class_dfs:
+    dataTrain.append(pd.concat(class_dfs))
+combined_train_data = pd.concat(dataTrain)
+
+dataTest = []
+for class_files in datafiles[1]:
+  script_dir = os.path.dirname(os.path.abspath(__file__))
+  data_dir = os.path.join(script_dir, 'data')
+  class_dfs = []
+  for base_filename in class_files:
+    filepath = os.path.join(data_dir, f'{base_filename}.csv')
+    try:
+      df = pd.read_csv(filepath)
+      df['timestamp'] = pd.to_datetime(df['datetime'], format='%m/%d/%Y %H:%M', errors='coerce')
+      df['timestamp'] = df['timestamp'].fillna(pd.to_datetime(df['datetime'], format='%d-%m-%Y %H:%M:%S', errors='coerce'))
+      for col in features:
+        df[col] = pd.to_numeric(df[col], errors='coerce')
+      df = df.set_index('timestamp').resample('5Min')[features].mean()
+      df = df[features].interpolate()
+      class_dfs.append(df)
+    except FileNotFoundError:
+      print(f"Warning: File {filepath} not found and skipped.")
+  if class_dfs:
+    dataTest.append(pd.concat(class_dfs))
+
+# Normalize data (RobustScaler by default, StandardScaler if --use_standard_scaler is set)
+scaler = StandardScaler() if use_standard_scaler else RobustScaler()
+scaled_train_data = scaler.fit_transform(combined_train_data[features])
+scaled_test_data_list = [] # This list stores NumPy arrays initially
+for df in dataTest:
+  scaled_test_data_list.append(scaler.transform(df[features]))
+
+# Convert scaled data to DataFrames for easier handling and plotting
+scaled_train_df = pd.DataFrame(scaled_train_data, columns=features, index=combined_train_data.index)
+scaled_test_df_list = [pd.DataFrame(data, columns=features, index=df.index) for data, df in zip(scaled_test_data_list, dataTest)]
+
+
+# Create time sequences WITH Rate of Change (from previous version)
+def create_sequences_with_rate_of_change(data, timesteps, original_features_count):
+  sequences = []
+  # First differences of the full array; the first row is NaN because there is
+  # no previous sample. Shape: (len(data), original_features_count).
+  rate_of_change_full = np.diff(data, axis=0, prepend=np.nan)
+
+  # Horizontally stack original data and rate of change
+  combined_data = np.hstack((data, rate_of_change_full)) # Shape (len(data), 2 * original_features_count)
+
+  # Create sequences from the combined data, starting at i = 1 so that no
+  # sequence contains the NaN first row of the rate-of-change columns.
+  for i in range(1, len(combined_data) - timesteps + 1):
+    sequence = combined_data[i:i + timesteps] # Shape (timesteps, 2 * original_features_count)
+    sequences.append(sequence)
+
+  return np.array(sequences)
+
+
+X_train_sequences = create_sequences_with_rate_of_change(scaled_train_df.values, timesteps, n_original_features)
+
+
+# Train K-Means model on all training data
+n_samples_train, n_timesteps_train, n_total_features_train = X_train_sequences.shape # Total features is now 2 * original_features
+X_train_reshaped = X_train_sequences.reshape(n_samples_train, n_timesteps_train * n_total_features_train) # Flatten sequences
+kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=n_init)
+kmeans.fit(X_train_reshaped)
+
+# Function to detect change points (flags sharp jumps or drops between consecutive samples).
+# It works on a single 2D data array (samples, features); change points are best
+# detected on the original scaled features, BEFORE the ROC columns are added.
+def detect_change_points(data, threshold=0.8):
+  change_points = []
+  # Iterate through the data points starting from the second one
+  for i in range(1, len(data)):
+    # Absolute difference between the current point and the previous point
+    difference = np.abs(data[i] - data[i-1])
+    # If the difference for ANY feature exceeds the threshold, mark this point as a change point
+    if np.any(difference > threshold):
+      change_points.append(i)
+  return np.array(change_points)
+
+# Function to plot clustered data (accepts sequence time indices and optional change points)
+def plot_clustered_data(df, predicted_clusters, time_index, n_clusters, features, featureNames, unitNames, show_cp=False, change_point_indices=None):
+  # 'features' here is the list of original feature names ('r1 s1', 'r1 s4', 'r1 s5');
+  # only the original features are plotted.
+  num_original_features = len(features)
+  fig, axes = plt.subplots(num_original_features, 1, figsize=(15, 5 * num_original_features), sharex=True)
+  if num_original_features == 1: axes = [axes] # Ensure axes is always an array
+  colors = plt.cm.viridis(np.linspace(0, 1, n_clusters))
+
+  for i, feature in enumerate(features):
+    # Plot data points colored by their assigned cluster
+    for cluster_id in range(n_clusters):
+      cluster_indices_kmeans = np.where(predicted_clusters == cluster_id)[0]
+      if len(cluster_indices_kmeans) > 0:
+        # Use time_index for the x-axis and the original df for the y-axis values
+        axes[i].scatter(time_index[cluster_indices_kmeans], df[feature].loc[time_index[cluster_indices_kmeans]],
+               color=colors[cluster_id], label=f'Cluster {cluster_id}', s=10, alpha=0.6) # alpha shows overlaps
+    axes[i].set_ylabel(f'{featureNames[feature]} {unitNames[feature]}')
+    axes[i].set_title(featureNames[feature])
+    axes[i].grid(True, linestyle='--', alpha=0.6)
+
+    # Plot change points if enabled
+    if show_cp and change_point_indices is not None:
+      # change_point_indices must contain datetime objects matching time_index
+      for cp_time in change_point_indices:
+        axes[i].axvline(x=cp_time, color='red', linestyle='--', linewidth=1.5, label='Change Point' if i == 0 else '', alpha=0.8) # label only once
+
+  # Add a combined legend to the last subplot, avoiding duplicate labels
+  handles, labels = [], []
+  for ax in axes:
+    for handle, label in zip(*ax.get_legend_handles_labels()):
+      if label not in labels:
+        handles.append(handle)
+        labels.append(label)
+  if handles:
+    axes[-1].legend(handles, labels, loc='upper right')
+
+  plt.tight_layout()
+  plt.show()
+
+
+# Combined Evaluation Function (Full and Delayed)
+def evaluate_and_report(kmeans_model, scaled_test_df_list, original_test_data_list, true_labels_list, timesteps, delay_steps, features, options, n_original_features):
+  all_y_true_full = [] # True labels for all test sequences
+  all_predicted_cluster_labels_full = [] # Predicted clusters for all test sequences
+  all_change_points_detected_list = [] # Detected change point time indices for each test file
+
+  # --- 1. Collect data and predict clusters for ALL test sequences ---
+  for k, (scaled_df, original_df, y_true_categorical) in enumerate(zip(scaled_test_df_list, original_test_data_list, true_labels_list)):
+    original_indices = original_df.index
+    # Sequences with ROC start at original index 1, so the j-th sequence ends at
+    # original index j + timesteps; its end times are original_indices[timesteps:].
+    time_index = original_indices[timesteps:]
+
+    # Create sequences (with ROC features); scaled_df is a DataFrame, pass its .values
+    sequences = create_sequences_with_rate_of_change(scaled_df.values, timesteps, n_original_features)
+
+    if sequences.size == 0:
+      print(f"Warning: No sequences generated for test file {k}. Skipping.")
+      all_change_points_detected_list.append([]) # Append empty list for consistency
+      continue # Skip to next file
+
+    n_sequences = sequences.shape[0]
+    reshaped_sequences = sequences.reshape(n_sequences, -1)
+    predicted_clusters = kmeans_model.predict(reshaped_sequences)
+
+    # Collect true labels aligned with the sequences that were actually created.
+    # Since the first window is skipped, labels start at original index `timesteps`.
+    all_y_true_full.extend(y_true_categorical[timesteps:])
+    all_predicted_cluster_labels_full.extend(predicted_clusters)
+
+    # Detect change points for this test file (on the scaled ORIGINAL features, before ROC)
+    change_points = detect_change_points(scaled_df.values, threshold=0.8) # Adjust threshold as needed
+
+    # Map original-data change point indices to sequence indices.
+    # A change at original index `cp_original` corresponds to the sequence ENDING there,
+    # i.e. sequence index `cp_original - timesteps`; keep only valid indices.
+    change_point_sequence_indices = change_points - timesteps
+    valid_change_point_sequence_indices = change_point_sequence_indices[(change_point_sequence_indices >= 0) & (change_point_sequence_indices < n_sequences)]
+
+    # Store the time index of the valid change point sequences
+    cp_time_indices = [] # Always defined, so the plotting call below cannot fail
+    if valid_change_point_sequence_indices.size > 0:
+      cp_time_indices = time_index[valid_change_point_sequence_indices].tolist()
+    all_change_points_detected_list.append(cp_time_indices)
+
+    # Plot clustered data for the current test file if requested and transition=False
+    # (the transition case is plotted later to avoid duplicates)
+    if options.plot_clustered and not options.transition:
+      print(f"\nClustered Data for Test File {k}:")
+      plot_clustered_data(original_df, predicted_clusters, time_index, kmeans_model.n_clusters, features, featureNames, unitNames, show_cp=show_change_points, change_point_indices=cp_time_indices if show_change_points else None)
+
+
+  # Convert collected lists to numpy arrays for evaluation
+  all_y_true_full = np.array(all_y_true_full)
+  all_predicted_cluster_labels_full = np.array(all_predicted_cluster_labels_full)
+
+  # --- 2. Perform FULL Evaluation (on all test sequences) ---
+  print("\n--- Full Evaluation Results (All Test Sequences) ---")
+
+  # Analyze clusters and assign a dominant true label to each cluster based on ALL test sequences
+  cluster_dominant_label_full = {}
+  for cluster_id in range(kmeans_model.n_clusters):
+    indices_in_cluster = np.where(all_predicted_cluster_labels_full == cluster_id)[0]
+    if len(indices_in_cluster) > 0:
+      labels_in_cluster = all_y_true_full[indices_in_cluster]
+      if len(labels_in_cluster) > 0:
+        # np.bincount + np.argmax give the most frequent (dominant) label
+        dominant_label = np.argmax(np.bincount(labels_in_cluster))
+        cluster_dominant_label_full[cluster_id] = dominant_label
+      else:
+        cluster_dominant_label_full[cluster_id] = -1 # No data in this cluster with known labels
+    else:
+      cluster_dominant_label_full[cluster_id] = -1 # Empty cluster
+
+  # Create predicted labels for full evaluation based on the dominant label of the assigned cluster
+  predicted_labels_numeric_full = np.array([cluster_dominant_label_full.get(cluster_id, -1) for cluster_id in all_predicted_cluster_labels_full])
+
+  # Evaluate (using numeric labels for the full set)
+  valid_indices_full = predicted_labels_numeric_full != -1
+  if np.sum(valid_indices_full) > 0 and len(np.unique(all_y_true_full[valid_indices_full])) > 1:
+    print("Classification Report (Full Test Set):")
+    print(classification_report(all_y_true_full[valid_indices_full], predicted_labels_numeric_full[valid_indices_full]))
+    cm_full = confusion_matrix(all_y_true_full[valid_indices_full], predicted_labels_numeric_full[valid_indices_full])
+    plt.figure(figsize=(8, 6))
+    sns.heatmap(cm_full, annot=True, fmt='d', cmap='Blues')
+    plt.xlabel('Predicted Cluster (Dominant True Label)')
+    plt.ylabel('True Label')
+    plt.title('Confusion Matrix (Full Test Set)')
+    plt.show()
+  else:
+    print("Could not perform full evaluation (not enough data or classes after mapping).")
+
+
+  # --- 3. Perform DELAYED Evaluation (on subset after delay) ---
+  print("\n--- Delayed Evaluation Results (Subset after Delay) ---")
+
+  all_y_true_delayed = []
+  all_predicted_cluster_labels_delayed = []
+
+  # Apply the delay logic PER FILE's sequence indices, then combine the results
+  sequence_count_so_far = 0
+  for k, (scaled_df, original_df, y_true_categorical) in enumerate(zip(scaled_test_df_list, original_test_data_list, true_labels_list)):
+    # Use the sequence creation function of this version (with ROC)
+    sequences = create_sequences_with_rate_of_change(scaled_df.values, timesteps, n_original_features)
+    if sequences.size == 0:
+      continue # Skip empty files (they contributed no sequences above either)
+
+    n_sequences_file = sequences.shape[0]
+
+    # Detect change points for THIS file (on the scaled original features)
+    change_points = detect_change_points(scaled_df.values, threshold=0.8) # Adjust threshold as needed
+
+    # Apply delay logic on this file's sequence indices
+    evaluation_allowed_file = np.ones(n_sequences_file, dtype=bool)
+    # Map original-data change point indices to sequence indices (same mapping as above)
+    change_point_sequence_indices_file = change_points - timesteps
+    valid_change_point_sequence_indices_file = change_point_sequence_indices_file[(change_point_sequence_indices_file >= 0) & (change_point_sequence_indices_file < n_sequences_file)]
+
+    for cp_seq_index in valid_change_point_sequence_indices_file:
+      start_delay = max(0, cp_seq_index)
+      end_delay = min(n_sequences_file, cp_seq_index + delay_steps)
+      evaluation_allowed_file[start_delay:end_delay] = False
+
+    # Collect data for DELAYED evaluation (only where evaluation_allowed_file is True),
+    # using the dominant-label mapping calculated on the FULL test set for consistency.
+    # Slice this file's predicted clusters out of the full prediction list.
+    predicted_clusters_file = all_predicted_cluster_labels_full[sequence_count_so_far : sequence_count_so_far + n_sequences_file]
+    predicted_labels_numeric_file = np.array([cluster_dominant_label_full.get(cluster, -1) for cluster in predicted_clusters_file])
+
+    # True labels for this file's sequences (aligned with the slice above)
+    true_labels_file = all_y_true_full[sequence_count_so_far : sequence_count_so_far + n_sequences_file]
+
+    all_y_true_delayed.extend(true_labels_file[evaluation_allowed_file])
+    all_predicted_cluster_labels_delayed.extend(predicted_labels_numeric_file[evaluation_allowed_file])
+
+    sequence_count_so_far += n_sequences_file # Update count for slicing
+
+  all_y_true_delayed = np.array(all_y_true_delayed)
+  all_predicted_cluster_labels_delayed = np.array(all_predicted_cluster_labels_delayed)
+
+  # Perform Delayed Evaluation
+  valid_indices_delayed = all_predicted_cluster_labels_delayed != -1
+  if np.sum(valid_indices_delayed) > 0 and len(np.unique(all_y_true_delayed[valid_indices_delayed])) > 1:
+    print("Classification Report (Subset after Delay):")
+    print(classification_report(all_y_true_delayed[valid_indices_delayed], all_predicted_cluster_labels_delayed[valid_indices_delayed]))
+    cm_delayed = confusion_matrix(all_y_true_delayed[valid_indices_delayed], all_predicted_cluster_labels_delayed[valid_indices_delayed])
+    plt.figure(figsize=(8, 6))
+    sns.heatmap(cm_delayed, annot=True, fmt='d', cmap='Blues')
+    plt.xlabel('Predicted Label (Delayed)')
+    plt.ylabel('True Label')
+    plt.title('Confusion Matrix (Subset after Delay)')
+    plt.show()
+  else:
+    print("Could not perform delayed evaluation (not enough data after delay or classes).")
+
+  # --- 4. Report Detected Change Points ---
+  print("\nDetected Change Points (Start Time of Sequence after Change):")
+  for i, cp_list in enumerate(all_change_points_detected_list):
+    print(f"File {i}: {cp_list}")
+
+  # Note: Anomaly and Misclassified plotting is not implemented in this version
+  # due to the complexity of the delayed-evaluation subset.
+
+  # Return collected data for further processing (e.g., plotting)
+  return all_y_true_full, all_predicted_cluster_labels_full, all_change_points_detected_list
+
+# Main execution
+if __name__ == "__main__":
+  # Training and test data were loaded and preprocessed above (module level)
+
+  # Create true labels list for test data
+  true_labels_list = []
+  for i, df in enumerate(dataTest):
+    true_labels_list.append(np.full(len(df), i))
+
+  # Plot raw data if requested
+  if options.plot_raw:
+    print("\nPlotting Raw Data:")
+    num_features = len(features)
+    fig, axes = plt.subplots(num_features, 1, figsize=(15, 5 * num_features), sharex=True)
+    if num_features == 1: axes = [axes]
+    for i, feature in enumerate(features):
+      for k, df in enumerate(dataTest):
+        axes[i].plot(df.index, df[feature], label=f'Class {k}', alpha=0.7)
+      axes[i].set_ylabel(f'{featureNames[feature]} {unitNames[feature]}')
+      axes[i].set_title(featureNames[feature])
+    axes[-1].legend(loc='upper right') # Legend on the last subplot
+    plt.tight_layout()
+    plt.show()
+
+  # Plot clustered data for training set if requested
+  train_sequences = create_sequences_with_rate_of_change(scaled_train_df.values, timesteps, n_original_features)
+  train_reshaped_sequences = train_sequences.reshape(train_sequences.shape[0], -1)
+  train_predicted_clusters = kmeans.predict(train_reshaped_sequences)
+  # ROC sequences start at index 1, so the sequence end times are index[timesteps:]
+  train_time_index = combined_train_data.index[timesteps:]
+  if options.plot_clustered:
+    print("\nClustered Data for Training Set:")
+    # Change point detection for training data (on the original scaled features)
+    train_change_points = detect_change_points(scaled_train_df.values, threshold=0.8)
+    train_change_point_sequence_indices = train_change_points - timesteps
+    valid_train_change_point_sequence_indices = train_change_point_sequence_indices[(train_change_point_sequence_indices >= 0) & (train_change_point_sequence_indices < train_sequences.shape[0])]
+    train_cp_time_indices = train_time_index[valid_train_change_point_sequence_indices].tolist() if valid_train_change_point_sequence_indices.size > 0 else None
+
+    # Use combined_train_data for plotting original values
+    plot_clustered_data(combined_train_data.loc[train_time_index], train_predicted_clusters, train_time_index, n_clusters, features, featureNames, unitNames, show_cp=show_change_points, change_point_indices=train_cp_time_indices)
+
+
+  # Plot clustered data for the TEST set (per file) if requested.
+  # Each file's clusters and change points are computed and plotted separately.
+  if options.plot_clustered:
+    print("\nClustered Data for Test Sets (per file):")
+    for k, (scaled_df, original_df) in enumerate(zip(scaled_test_df_list, dataTest)):
+      original_indices = original_df.index
+      time_index = original_indices[timesteps:]
+      sequences = create_sequences_with_rate_of_change(scaled_df.values, timesteps, n_original_features)
+      if sequences.size == 0: continue
+      reshaped_sequences = sequences.reshape(sequences.shape[0], -1)
+      predicted_clusters = kmeans.predict(reshaped_sequences)
+
+      # Change point detection for this test file (on the original scaled features)
+      change_points = detect_change_points(scaled_df.values, threshold=0.8)
+      change_point_sequence_indices = change_points - timesteps
+      valid_change_point_sequence_indices = change_point_sequence_indices[(change_point_sequence_indices >= 0) & (change_point_sequence_indices < sequences.shape[0])]
+      cp_time_indices = time_index[valid_change_point_sequence_indices].tolist() if valid_change_point_sequence_indices.size > 0 else None
+
+      print(f" Plotting Test File {k}")
+      plot_clustered_data(original_df, predicted_clusters, time_index, n_clusters, features, featureNames, unitNames, show_cp=show_change_points, change_point_indices=cp_time_indices)
+
+  # Perform Evaluation (Full and Delayed); this handles all evaluation reporting
+  # and confusion matrix plotting. Pass scaled_test_df_list (list of DataFrames).
+  all_y_true_full, all_predicted_cluster_labels_full, all_change_points_detected_list = evaluate_and_report(kmeans, scaled_test_df_list, dataTest, true_labels_list, timesteps, delay_steps, features, options, n_original_features)
+
+  # Calculate and print Inertia and Silhouette Score for the combined test data.
+  # The evaluation function returned the full label lists; the combined reshaped
+  # sequences are still needed for the Silhouette calculation.
+  test_sequence_arrays = [create_sequences_with_rate_of_change(df.values, timesteps, n_original_features) for df in scaled_test_df_list]
+  test_sequence_arrays = [s for s in test_sequence_arrays if s.size > 0]
+  X_test_sequences_combined = np.vstack(test_sequence_arrays) if test_sequence_arrays else np.array([])
+
+  if X_test_sequences_combined.size > 0 and all_predicted_cluster_labels_full.size > 0:
+    X_test_combined_reshaped = X_test_sequences_combined.reshape(X_test_sequences_combined.shape[0], -1)
+
+    print("\n--- K-Means Model Evaluation (Overall Metrics on Combined Test Data) ---")
+    print(f"Inertia: {kmeans.inertia_:.4f}") # Inertia comes from the training fit
+
+    # Silhouette score on the combined test data, using the clusters predicted for the full test set
+    if len(np.unique(all_predicted_cluster_labels_full)) > 1:
+      try:
+        silhouette = silhouette_score(X_test_combined_reshaped, all_predicted_cluster_labels_full)
+        print(f"Silhouette Score: {silhouette:.4f}")
+      except ValueError as e:
+        print(f"Silhouette Score: Could not calculate ({e})")
+    else:
+      print("Silhouette Score: Not applicable for a single cluster on the combined test data.")
+  else:
+    print("\n--- K-Means Model Evaluation (Overall Metrics) ---")
+    print("No test data sequences available or processed for overall evaluation metrics.")
+
+  # Note: Anomaly and Misclassified plotting is not implemented in this version.
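
For a quick sanity check of the windowing used above, the sequence construction can be exercised in isolation. The sketch below is illustrative only (the helper name sequences_with_roc and the toy array are not part of the commit); it mirrors create_sequences_with_rate_of_change and confirms that the NaN first-difference row never enters a window and that len(data) - timesteps sequences are produced:

import numpy as np

def sequences_with_roc(data, timesteps):
    # Mirror of create_sequences_with_rate_of_change: stack first differences
    # next to the original features, then start windows at i = 1 so the NaN
    # first-difference row is never included.
    roc = np.diff(data, axis=0, prepend=np.nan)
    combined = np.hstack((data, roc))
    return np.array([combined[i:i + timesteps]
                     for i in range(1, len(combined) - timesteps + 1)])

toy = np.arange(20, dtype=float).reshape(10, 2)  # 10 samples, 2 features
seqs = sequences_with_roc(toy, timesteps=4)
print(seqs.shape)                # (6, 4, 4): len(data) - timesteps windows
assert not np.isnan(seqs).any()  # NaN row is excluded from every window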

+449 -0  kmeans_anomaly_change points/standard_scaler_cp_delayed.py

@@ -0,0 +1,449 @@
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.cluster import KMeans
+from sklearn.preprocessing import StandardScaler
+from sklearn.metrics import classification_report, confusion_matrix, silhouette_score
+import argparse
+import os
+import seaborn as sns
+
+# Command line arguments setup (anomaly and misclassified-instance plotting flags are not implemented)
+parser = argparse.ArgumentParser(description='Anomaly detection using K-Means with change point detection and delayed evaluation.')
+parser.add_argument('--timesteps', type=int, default=20, help='Number of timesteps for sequences.')
+parser.add_argument('--n_clusters', type=int, default=5, help='Number of clusters for K-Means.')
+parser.add_argument('--n_init', type=int, default=10, help='Number of initializations for K-Means.')
+parser.add_argument('--transition', action='store_true', help='Use transition data for testing.')
+parser.add_argument('--plot_raw', action='store_true', help='Plot raw data.')
+parser.add_argument('--plot_clustered', action='store_true', help='Plot clustered data.')
+parser.add_argument('--delay', type=int, default=10, help='Number of timesteps to delay evaluation after a change point.')
+parser.add_argument('--show_change_points', action='store_true', help='Show change points on clustered plots.')
+options = parser.parse_args()
+
+# Parameters
+n_clusters = options.n_clusters
+timesteps = options.timesteps
+n_init = options.n_init
+delay_steps = options.delay
+show_change_points = options.show_change_points
+
+# Data loading (same as previous code)
+NumberOfFailures = 4
+datafiles = [[], []]
+for i in range(NumberOfFailures + 1):
+  datafiles[0].append([])
+  datafiles[1].append([])
+
+datafiles[0][0] = ['2024-08-07_5_', '2024-08-08_5_', '2025-01-25_5_', '2025-01-26_5_']
+datafiles[0][1] = ['2024-12-11_5_', '2024-12-12_5_', '2024-12-13_5_']
+datafiles[0][2] = ['2024-12-18_5_', '2024-12-21_5_', '2024-12-22_5_', '2024-12-23_5_', '2024-12-24_5_']
+datafiles[0][3] = ['2024-12-28_5_', '2024-12-29_5_', '2024-12-30_5_']
+datafiles[0][4] = ['2025-02-13_5_', '2025-02-14_5_']
+
+if options.transition:
+  datafiles[1][0] = ['2025-01-27_5_', '2025-01-28_5_']
+  datafiles[1][1] = ['2024-12-14_5_', '2024-12-15_5_', '2024-12-16_5_']
+  datafiles[1][2] = ['2024-12-17_5_', '2024-12-19_5_', '2024-12-25_5_', '2024-12-26_5_']
+  datafiles[1][3] = ['2024-12-27_5_', '2024-12-31_5_', '2025-01-01_5_']
+  datafiles[1][4] = ['2025-02-12_5_', '2025-02-15_5_', '2025-02-16_5_']
+else:
+  datafiles[1][0] = ['2025-01-27_5_', '2025-01-28_5_']
+  datafiles[1][1] = ['2024-12-14_5_', '2024-12-15_5_']
+  datafiles[1][2] = ['2024-12-19_5_', '2024-12-25_5_', '2024-12-26_5_']
+  datafiles[1][3] = ['2024-12-31_5_', '2025-01-01_5_']
+  datafiles[1][4] = ['2025-02-15_5_', '2025-02-16_5_']
+
+features = ['r1 s1', 'r1 s4', 'r1 s5']
+# Using standard LaTeX formatting for display names
+featureNames = {'r1 s1': r'$T_{evap}$', 'r1 s4': r'$T_{cond}$', 'r1 s5': r'$T_{air}$'}
+unitNames = {'r1 s1': r'($^o$C)', 'r1 s4': r'($^o$C)', 'r1 s5': r'($^o$C)'}
+NumFeatures = len(features)
+
+# Load and preprocess data (same as previous code)
+dataTrain = []
+for class_files in datafiles[0]:
+  script_dir = os.path.dirname(os.path.abspath(__file__))
+  data_dir = os.path.join(script_dir, 'data')
+  class_dfs = []
+  for base_filename in class_files:
+    filepath = os.path.join(data_dir, f'{base_filename}.csv')
+    try:
+      df = pd.read_csv(filepath)
+      df['timestamp'] = pd.to_datetime(df['datetime'], format='%m/%d/%Y %H:%M', errors='coerce')
+      df['timestamp'] = df['timestamp'].fillna(pd.to_datetime(df['datetime'], format='%d-%m-%Y %H:%M:%S', errors='coerce'))
+      for col in features:
+        df[col] = pd.to_numeric(df[col], errors='coerce')
+      df = df.set_index('timestamp').resample('5Min')[features].mean()
+      df = df[features].interpolate()
+      class_dfs.append(df)
+    except FileNotFoundError:
+      print(f"Warning: File {filepath} not found and skipped.")
+  if class_dfs:
+    dataTrain.append(pd.concat(class_dfs))
+combined_train_data = pd.concat(dataTrain)
+
+dataTest = []
+for class_files in datafiles[1]:
+  script_dir = os.path.dirname(os.path.abspath(__file__))
+  data_dir = os.path.join(script_dir, 'data')
+  class_dfs = []
+  for base_filename in class_files:
+    filepath = os.path.join(data_dir, f'{base_filename}.csv')
+    try:
+      df = pd.read_csv(filepath)
+      df['timestamp'] = pd.to_datetime(df['datetime'], format='%m/%d/%Y %H:%M', errors='coerce')
+      df['timestamp'] = df['timestamp'].fillna(pd.to_datetime(df['datetime'], format='%d-%m-%Y %H:%M:%S', errors='coerce'))
+      for col in features:
+        df[col] = pd.to_numeric(df[col], errors='coerce')
+      df = df.set_index('timestamp').resample('5Min')[features].mean()
+      df = df[features].interpolate()
+      class_dfs.append(df)
+    except FileNotFoundError:
+      print(f"Warning: File {filepath} not found and skipped.")
+  if class_dfs:
+    dataTest.append(pd.concat(class_dfs))
+
+# Normalize data
+scaler = StandardScaler() # Using StandardScaler
+scaled_train_data = scaler.fit_transform(combined_train_data[features])
+scaled_test_data_list = []
+for df in dataTest:
+  scaled_test_data_list.append(scaler.transform(df[features]))
+
+scaled_train_df = pd.DataFrame(scaled_train_data, columns=features, index=combined_train_data.index)
+scaled_test_df_list = [pd.DataFrame(data, columns=features, index=df.index) for data, df in zip(scaled_test_data_list, dataTest)]
+
+# Create time sequences (NO Rate of Change)
+def create_sequences(data, timesteps):
+  sequences = []
+  for i in range(len(data) - timesteps + 1):
+    sequences.append(data[i:i + timesteps])
+  return np.array(sequences)
+
+X_train_sequences = create_sequences(scaled_train_df.values, timesteps)
+# Test sequences are created later, in the main execution block
+
+# Train K-Means model on all training data
+n_samples_train, n_timesteps_train, n_features_train = X_train_sequences.shape
+X_train_reshaped = X_train_sequences.reshape(n_samples_train, n_timesteps_train * n_features_train)
+kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=n_init)
+kmeans.fit(X_train_reshaped)
+
+# Function to detect change points (flags sharp jumps or drops between consecutive samples).
+# It works on a single 2D data array (samples, features).
+def detect_change_points(data, threshold=0.8):
+  change_points = []
+  # Iterate through the data points starting from the second one
+  for i in range(1, len(data)):
+    # Absolute difference between the current point and the previous point
+    difference = np.abs(data[i] - data[i-1])
+    # If the difference for ANY feature exceeds the threshold, mark this point as a change point
+    if np.any(difference > threshold):
+      change_points.append(i)
+  return np.array(change_points)
+
+# Function to plot clustered data (accepts sequence time indices and optional change points)
+def plot_clustered_data(df, predicted_clusters, time_index, n_clusters, features, featureNames, unitNames, show_cp=False, change_point_indices=None):
+  num_features = len(features)
+  fig, axes = plt.subplots(num_features, 1, figsize=(15, 5 * num_features), sharex=True)
+  if num_features == 1: axes = [axes] # Ensure axes is always an array
+  colors = plt.cm.viridis(np.linspace(0, 1, n_clusters))
+
+  for i, feature in enumerate(features):
+    # Plot data points colored by their assigned cluster
+    for cluster_id in range(n_clusters):
+      cluster_indices_kmeans = np.where(predicted_clusters == cluster_id)[0]
+      if len(cluster_indices_kmeans) > 0:
+        # Use time_index for the x-axis and the original df for the y-axis values
+        axes[i].scatter(time_index[cluster_indices_kmeans], df[feature].loc[time_index[cluster_indices_kmeans]],
+               color=colors[cluster_id], label=f'Cluster {cluster_id}', s=10, alpha=0.6) # alpha shows overlaps
+    axes[i].set_ylabel(f'{featureNames[feature]} {unitNames[feature]}')
+    axes[i].set_title(featureNames[feature])
+    axes[i].grid(True, linestyle='--', alpha=0.6)
+
+    # Plot change points if enabled
+    if show_cp and change_point_indices is not None:
+      # change_point_indices must contain datetime objects matching time_index
+      for cp_time in change_point_indices:
+        axes[i].axvline(x=cp_time, color='red', linestyle='--', linewidth=1.5, label='Change Point' if i == 0 else '', alpha=0.8) # label only once
+
+  # Add a combined legend to the last subplot, avoiding duplicate labels
+  handles, labels = [], []
+  for ax in axes:
+    for handle, label in zip(*ax.get_legend_handles_labels()):
+      if label not in labels:
+        handles.append(handle)
+        labels.append(label)
+  if handles:
+    axes[-1].legend(handles, labels, loc='upper right')
+
+  plt.tight_layout()
+  plt.show()
+
+# Combined Evaluation Function (Full and Delayed)
+def evaluate_and_report(kmeans_model, scaled_test_data_list, original_test_data_list, true_labels_list, timesteps, delay_steps, features, options):
+  all_y_true_full = [] # True labels for all test sequences
+  all_predicted_cluster_labels_full = [] # Predicted clusters for all test sequences
+  all_original_test_sequences_full = [] # Original sequences, kept for potential anomaly/misclassified plotting
+  all_change_points_detected_list = [] # Detected change points for each test file
+
+  # --- 1. Collect data and predict clusters for ALL test sequences ---
+  for k, (scaled_df, original_df, y_true_categorical) in enumerate(zip(scaled_test_data_list, original_test_data_list, true_labels_list)):
+    original_indices = original_df.index
+    # time_index for plotting corresponds to the end of each sequence
+    time_index = original_indices[timesteps - 1:]
+
+    sequences = create_sequences(scaled_df.values, timesteps)
+    if sequences.size == 0:
+      print(f"Warning: No sequences generated for test file {k}. Skipping.")
+      all_change_points_detected_list.append([]) # Append empty list for consistency
+      continue # Skip to next file
+
+    n_sequences = sequences.shape[0]
+    reshaped_sequences = sequences.reshape(n_sequences, -1)
+    predicted_clusters = kmeans_model.predict(reshaped_sequences)
+
+    # Collect true labels and predicted clusters for FULL evaluation
+    all_y_true_full.extend(y_true_categorical[timesteps - 1:])
+    all_predicted_cluster_labels_full.extend(predicted_clusters)
+
+    # Collect original sequences (aligned with sequence ends) for potential plotting
+    for i in range(n_sequences):
+      start_index = original_df.index.get_loc(time_index[i]) - (timesteps - 1)
+      end_index = start_index + timesteps
+      all_original_test_sequences_full.append(original_df[features].iloc[start_index:end_index].values)
+
+    # Detect change points for this test file (on scaled data)
+    change_points = detect_change_points(scaled_df.values, threshold=0.8) # Adjust threshold as needed
+
+    # Map original-data change point indices to sequence indices.
+    # A change at original index `cp_original` corresponds to the sequence ENDING there,
+    # i.e. sequence index `cp_original - (timesteps - 1)`; keep only valid indices.
+    change_point_sequence_indices = change_points - (timesteps - 1)
+    valid_change_point_sequence_indices = change_point_sequence_indices[(change_point_sequence_indices >= 0) & (change_point_sequence_indices < n_sequences)]
+
+    # Store the time index of the valid change point sequences
+    cp_time_indices = [] # Always defined, so the plotting call below cannot fail
+    if valid_change_point_sequence_indices.size > 0:
+      cp_time_indices = time_index[valid_change_point_sequence_indices].tolist()
+    all_change_points_detected_list.append(cp_time_indices)
+
+    # Plot clustered data for the current test file if requested and transition=False
+    # (the transition case is plotted later to avoid duplicates)
+    if options.plot_clustered and not options.transition:
+      print(f"\nClustered Data for Test File {k}:")
+      plot_clustered_data(original_df, predicted_clusters, time_index, kmeans_model.n_clusters, features, featureNames, unitNames, show_cp=show_change_points, change_point_indices=cp_time_indices if show_change_points else None)
+
+
+  all_y_true_full = np.array(all_y_true_full)
+  all_predicted_cluster_labels_full = np.array(all_predicted_cluster_labels_full)
+  all_original_test_sequences_full = np.array(all_original_test_sequences_full)
+
+  # --- 2. Perform FULL Evaluation (on all test sequences) ---
+  print("\n--- Full Evaluation Results (All Test Sequences) ---")
+
+  # Analyze clusters and assign a dominant true label to each cluster based on ALL test sequences
+  cluster_dominant_label_full = {}
+  for cluster_id in range(kmeans_model.n_clusters):
+    indices_in_cluster = np.where(all_predicted_cluster_labels_full == cluster_id)[0]
+    if len(indices_in_cluster) > 0:
+      labels_in_cluster = all_y_true_full[indices_in_cluster]
+      if len(labels_in_cluster) > 0:
+        # np.bincount + np.argmax give the most frequent (dominant) label
+        dominant_label = np.argmax(np.bincount(labels_in_cluster))
+        cluster_dominant_label_full[cluster_id] = dominant_label
+      else:
+        cluster_dominant_label_full[cluster_id] = -1 # No data in this cluster with known labels
+    else:
+      cluster_dominant_label_full[cluster_id] = -1 # Empty cluster
+
+  # Create predicted labels for full evaluation based on the dominant label of the assigned cluster
+  predicted_labels_numeric_full = np.array([cluster_dominant_label_full.get(cluster_id, -1) for cluster_id in all_predicted_cluster_labels_full])
+
+  # Evaluate (using numeric labels for the full set)
+  valid_indices_full = predicted_labels_numeric_full != -1
+  if np.sum(valid_indices_full) > 0 and len(np.unique(all_y_true_full[valid_indices_full])) > 1:
+    print("Classification Report (Full Test Set):")
+    print(classification_report(all_y_true_full[valid_indices_full], predicted_labels_numeric_full[valid_indices_full]))
+    cm_full = confusion_matrix(all_y_true_full[valid_indices_full], predicted_labels_numeric_full[valid_indices_full])
+    plt.figure(figsize=(8, 6))
+    sns.heatmap(cm_full, annot=True, fmt='d', cmap='Blues')
+    plt.xlabel('Predicted Cluster (Dominant True Label)')
+    plt.ylabel('True Label')
+    plt.title('Confusion Matrix (Full Test Set)')
+    plt.show()
+  else:
+    print("Could not perform full evaluation (not enough data or classes after mapping).")
+
+
+  # --- 3. Perform DELAYED Evaluation (on subset after delay) ---
+  print("\n--- Delayed Evaluation Results (Subset after Delay) ---")
+
+  all_y_true_delayed = []
+  all_predicted_cluster_labels_delayed = []
+
+  # Mapping change points back to combined sequence indices would be complex, so the
+  # delay logic is applied PER FILE and the results are then combined (matching the
+  # behavior of the original evaluate_with_delay).
+  for k, (scaled_df, original_df, y_true_categorical) in enumerate(zip(scaled_test_data_list, original_test_data_list, true_labels_list)):
+    original_indices = original_df.index
+    time_index = original_indices[timesteps - 1:]
+    sequences = create_sequences(scaled_df.values, timesteps)
+    if sequences.size == 0: continue # Skip empty files
+
+    n_sequences = sequences.shape[0]
+    reshaped_sequences = sequences.reshape(n_sequences, -1)
+    predicted_clusters = kmeans_model.predict(reshaped_sequences)
+
+    # Detect change points for THIS file (on scaled data)
+    change_points = detect_change_points(scaled_df.values, threshold=0.8) # Adjust threshold as needed
+
+    # Apply delay logic on this file's sequence indices
+    evaluation_allowed_file = np.ones(n_sequences, dtype=bool)
+    change_point_sequence_indices_file = change_points - (timesteps - 1)
+    valid_change_point_sequence_indices_file = change_point_sequence_indices_file[(change_point_sequence_indices_file >= 0) & (change_point_sequence_indices_file < n_sequences)]
+
+    for cp_seq_index in valid_change_point_sequence_indices_file:
+      start_delay = max(0, cp_seq_index)
+      end_delay = min(n_sequences, cp_seq_index + delay_steps)
+      evaluation_allowed_file[start_delay:end_delay] = False
+
+    # Collect data for DELAYED evaluation (only where evaluation_allowed_file is True),
+    # using the dominant-label mapping calculated on the FULL test set for consistency
+    predicted_labels_numeric_file = np.array([cluster_dominant_label_full.get(cluster, -1) for cluster in predicted_clusters])
+    true_labels_file = y_true_categorical[timesteps - 1:] # True labels aligned with sequences
+
+    all_y_true_delayed.extend(true_labels_file[evaluation_allowed_file])
+    all_predicted_cluster_labels_delayed.extend(predicted_labels_numeric_file[evaluation_allowed_file])
+
+  all_y_true_delayed = np.array(all_y_true_delayed)
+  all_predicted_cluster_labels_delayed = np.array(all_predicted_cluster_labels_delayed)
+
+  # Perform Delayed Evaluation
+  valid_indices_delayed = all_predicted_cluster_labels_delayed != -1
+  if np.sum(valid_indices_delayed) > 0 and len(np.unique(all_y_true_delayed[valid_indices_delayed])) > 1:
+    print("Classification Report (Subset after Delay):")
+    print(classification_report(all_y_true_delayed[valid_indices_delayed], all_predicted_cluster_labels_delayed[valid_indices_delayed]))
+    cm_delayed = confusion_matrix(all_y_true_delayed[valid_indices_delayed], all_predicted_cluster_labels_delayed[valid_indices_delayed])
+    plt.figure(figsize=(8, 6))
+    sns.heatmap(cm_delayed, annot=True, fmt='d', cmap='Blues')
+    plt.xlabel('Predicted Label (Delayed)')
+    plt.ylabel('True Label')
+    plt.title('Confusion Matrix (Subset after Delay)')
+    plt.show()
+  else:
+    print("Could not perform delayed evaluation (not enough data after delay or classes).")
+
+  # --- 4. Report Detected Change Points ---
+  print("\nDetected Change Points (Start Time of Sequence after Change):")
+  for i, cp_list in enumerate(all_change_points_detected_list):
+    print(f"File {i}: {cp_list}")
+
+  # Note: Anomaly and Misclassified plotting is not implemented in this version
+  # due to the complexity of the delayed-evaluation subset.
+
+# Main execution
+if __name__ == "__main__":
+  # Training and test data were loaded and preprocessed above (module level)
+
+  # Create true labels list for test data
+  true_labels_list = []
+  for i, df in enumerate(dataTest):
+    true_labels_list.append(np.full(len(df), i))
+
+  # Plot raw data if requested
+  if options.plot_raw:
+    print("\nPlotting Raw Data:")
+    num_features = len(features)
+    fig, axes = plt.subplots(num_features, 1, figsize=(15, 5 * num_features), sharex=True)
+    if num_features == 1: axes = [axes]
+    for i, feature in enumerate(features):
+      for k, df in enumerate(dataTest):
+        axes[i].plot(df.index, df[feature], label=f'Class {k}', alpha=0.7)
+      axes[i].set_ylabel(f'{featureNames[feature]} {unitNames[feature]}')
+      axes[i].set_title(featureNames[feature])
+    axes[-1].legend(loc='upper right') # Legend on the last subplot
+    plt.tight_layout()
+    plt.show()
+
+  # Plot clustered data for training set if requested
+  train_sequences = create_sequences(scaled_train_df.values, timesteps)
+  train_reshaped_sequences = train_sequences.reshape(train_sequences.shape[0], -1)
+  train_predicted_clusters = kmeans.predict(train_reshaped_sequences)
+  train_time_index = combined_train_data.index[timesteps - 1:]
+  if options.plot_clustered:
+    print("\nClustered Data for Training Set:")
+    # Change point detection for training data (optional)
+    train_change_points = detect_change_points(scaled_train_df.values, threshold=0.8)
+    train_change_point_sequence_indices = train_change_points - (timesteps - 1)
+    valid_train_change_point_sequence_indices = train_change_point_sequence_indices[(train_change_point_sequence_indices >= 0) & (train_change_point_sequence_indices < train_sequences.shape[0])]
+    train_cp_time_indices = train_time_index[valid_train_change_point_sequence_indices].tolist() if valid_train_change_point_sequence_indices.size > 0 else None
+
+    plot_clustered_data(combined_train_data.loc[train_time_index], train_predicted_clusters, train_time_index, n_clusters, features, featureNames, unitNames, show_cp=show_change_points, change_point_indices=train_cp_time_indices)
+
+
+  # Plot clustered data for the TEST set (per file) if requested.
+  # Each file's clusters and change points are computed and plotted separately.
+  if options.plot_clustered:
+    print("\nClustered Data for Test Sets (per file):")
+    for k, (scaled_df, original_df) in enumerate(zip(scaled_test_df_list, dataTest)):
+      original_indices = original_df.index
+      time_index = original_indices[timesteps - 1:]
+      sequences = create_sequences(scaled_df.values, timesteps)
+      if sequences.size == 0: continue
+      reshaped_sequences = sequences.reshape(sequences.shape[0], -1)
+      predicted_clusters = kmeans.predict(reshaped_sequences)
+
+      # Change point detection for this test file
+      change_points = detect_change_points(scaled_df.values, threshold=0.8)
+      change_point_sequence_indices = change_points - (timesteps - 1)
+      valid_change_point_sequence_indices = change_point_sequence_indices[(change_point_sequence_indices >= 0) & (change_point_sequence_indices < sequences.shape[0])]
+      cp_time_indices = time_index[valid_change_point_sequence_indices].tolist() if valid_change_point_sequence_indices.size > 0 else None
+
+      print(f" Plotting Test File {k}")
+      plot_clustered_data(original_df, predicted_clusters, time_index, n_clusters, features, featureNames, unitNames, show_cp=show_change_points, change_point_indices=cp_time_indices)
+
+  # Perform Evaluation (Full and Delayed); this handles all evaluation reporting
+  # and confusion matrix plotting. Pass scaled_test_df_list (list of DataFrames).
+  evaluate_and_report(kmeans, scaled_test_df_list, dataTest, true_labels_list, timesteps, delay_steps, features, options)
+
+  # Calculate and print Inertia and Silhouette Score for the combined test data
+  test_sequence_arrays = [create_sequences(df.values, timesteps) for df in scaled_test_df_list]
+  test_sequence_arrays = [s for s in test_sequence_arrays if s.size > 0]
+  X_test_sequences_combined = np.vstack(test_sequence_arrays) if test_sequence_arrays else np.array([])
+  if X_test_sequences_combined.size > 0:
+    X_test_combined_reshaped = X_test_sequences_combined.reshape(X_test_sequences_combined.shape[0], -1)
+    all_cluster_labels_test_combined = kmeans.predict(X_test_combined_reshaped)
+
+    print("\n--- K-Means Model Evaluation (Overall Metrics on Combined Test Data) ---")
+    print(f"Inertia: {kmeans.inertia_:.4f}") # Inertia comes from the training fit
+
+    # Silhouette score on the combined test data
+    if len(np.unique(all_cluster_labels_test_combined)) > 1:
+      silhouette = silhouette_score(X_test_combined_reshaped, all_cluster_labels_test_combined)
+      print(f"Silhouette Score: {silhouette:.4f}")
+    else:
+      print("Silhouette Score: Not applicable for a single cluster on the combined test data.")
+  else:
+    print("\n--- K-Means Model Evaluation (Overall Metrics) ---")
+    print("No test data sequences available to evaluate overall Inertia and Silhouette Score.")
+
+  # Note: Anomaly and Misclassified plotting is not implemented in this version.
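
Both scripts share the same change-point and delay mechanics, which can also be checked on synthetic data. The sketch below is illustrative only: detect_change_points here is a vectorized equivalent of the loop version committed above, and delay_mask mirrors the per-file evaluation_allowed_file logic; the names and toy signal are not part of the commit:

import numpy as np

def detect_change_points(data, threshold=0.8):
    # Vectorized equivalent of the committed loop: flag original index i when
    # |data[i] - data[i-1]| exceeds the threshold for ANY feature.
    jumps = np.abs(np.diff(data, axis=0))
    return np.where((jumps > threshold).any(axis=1))[0] + 1

def delay_mask(n_sequences, cp_seq_indices, delay_steps):
    # Mirror of evaluation_allowed_file: suppress evaluation for delay_steps
    # sequence positions starting at each change point's sequence index.
    allowed = np.ones(n_sequences, dtype=bool)
    for cp in cp_seq_indices:
        allowed[max(0, cp):min(n_sequences, cp + delay_steps)] = False
    return allowed

signal = np.zeros((30, 1))
signal[15:] = 5.0                           # one sharp jump at index 15
cps = detect_change_points(signal)          # -> array([15])
print(delay_mask(30, cps, delay_steps=10))  # positions 15..24 are False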
