Ako data: unsupervised/reinforcement learning for anomaly detection and clustering. This script is the unsupervised part: it trains a K-Means model without labels and evaluates the resulting clusters against labeled anomaly data.
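A minimal example invocation (flag values are just the script's own defaults; the flags are defined in the argument parser below, and the script expects its CSV files in a `data/` directory next to it):

    python standard_scaler_cp_delayed.py --timesteps 20 --n_clusters 5 --plot_clustered --show_change_points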

standard_scaler_cp_delayed.py 24KB

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
# Removed LabelEncoder as it's not used in this version
from sklearn.metrics import classification_report, confusion_matrix, silhouette_score  # Added silhouette_score back
import argparse
import os
import seaborn as sns
# Command line arguments setup (plot_anomalies and plot_misclassified flags removed; not implemented)
parser = argparse.ArgumentParser(description='Anomaly detection using K-Means with change point detection and delayed evaluation.')
parser.add_argument('--timesteps', type=int, default=20, help='Number of timesteps for sequences.')
parser.add_argument('--n_clusters', type=int, default=5, help='Number of clusters for K-Means.')
parser.add_argument('--n_init', type=int, default=10, help='Number of initializations for K-Means.')
parser.add_argument('--transition', action='store_true', help='Use transition data for testing.')
parser.add_argument('--plot_raw', action='store_true', help='Plot raw data.')
parser.add_argument('--plot_clustered', action='store_true', help='Plot clustered data.')
# parser.add_argument('--plot_anomalies', action='store_true', help='Plot detected anomalies.')  # Removed
# parser.add_argument('--plot_misclassified', action='store_true', help='Plot misclassified instances.')  # Removed
parser.add_argument('--delay', type=int, default=10, help='Number of timesteps to delay evaluation after a change point.')
parser.add_argument('--show_change_points', action='store_true', help='Show change points on clustered plots.')
options = parser.parse_args()

# Parameters
n_clusters = options.n_clusters
timesteps = options.timesteps
n_init = options.n_init
delay_steps = options.delay
show_change_points = options.show_change_points
# Data loading (same as previous code)
NumberOfFailures = 4
datafiles = [[], []]
for i in range(NumberOfFailures + 1):
    datafiles[0].append([])
    datafiles[1].append([])
datafiles[0][0] = ['2024-08-07_5_', '2024-08-08_5_', '2025-01-25_5_', '2025-01-26_5_']
datafiles[0][1] = ['2024-12-11_5_', '2024-12-12_5_', '2024-12-13_5_']
datafiles[0][2] = ['2024-12-18_5_', '2024-12-21_5_', '2024-12-22_5_', '2024-12-23_5_', '2024-12-24_5_']
datafiles[0][3] = ['2024-12-28_5_', '2024-12-29_5_', '2024-12-30_5_']
datafiles[0][4] = ['2025-02-13_5_', '2025-02-14_5_']
if options.transition:
    datafiles[1][0] = ['2025-01-27_5_', '2025-01-28_5_']
    datafiles[1][1] = ['2024-12-14_5_', '2024-12-15_5_', '2024-12-16_5_']
    datafiles[1][2] = ['2024-12-17_5_', '2024-12-19_5_', '2024-12-25_5_', '2024-12-26_5_']
    datafiles[1][3] = ['2024-12-27_5_', '2024-12-31_5_', '2025-01-01_5_']
    datafiles[1][4] = ['2025-02-12_5_', '2025-02-15_5_', '2025-02-16_5_']
else:
    datafiles[1][0] = ['2025-01-27_5_', '2025-01-28_5_']
    datafiles[1][1] = ['2024-12-14_5_', '2024-12-15_5_']
    datafiles[1][2] = ['2024-12-19_5_', '2024-12-25_5_', '2024-12-26_5_']
    datafiles[1][3] = ['2024-12-31_5_', '2025-01-01_5_']
    datafiles[1][4] = ['2025-02-15_5_', '2025-02-16_5_']
features = ['r1 s1', 'r1 s4', 'r1 s5']
# Using standard LaTeX formatting without span tags
featureNames = {'r1 s1': r'$T_{evap}$', 'r1 s4': r'$T_{cond}$', 'r1 s5': r'$T_{air}$'}
unitNames = {'r1 s1': r'($^o$C)', 'r1 s4': r'($^o$C)', 'r1 s5': r'($^o$C)'}
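# Per the featureNames mapping above, channels r1 s1, r1 s4 and r1 s5 carry the
# evaporator, condenser and air temperatures respectively, all in degrees Celsius.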
NumFeatures = len(features)

# Load and preprocess data (same as previous code)
script_dir = os.path.dirname(os.path.abspath(__file__))
data_dir = os.path.join(script_dir, 'data')
dataTrain = []
for class_files in datafiles[0]:
    class_dfs = []
    for base_filename in class_files:
        filepath = os.path.join(data_dir, f'{base_filename}.csv')
        try:
            df = pd.read_csv(filepath)
            df['timestamp'] = pd.to_datetime(df['datetime'], format='%m/%d/%Y %H:%M', errors='coerce')
            df['timestamp'] = df['timestamp'].fillna(pd.to_datetime(df['datetime'], format='%d-%m-%Y %H:%M:%S', errors='coerce'))
            for col in features:
                df[col] = pd.to_numeric(df[col], errors='coerce')
            df = df.set_index('timestamp').resample('5Min')[features].mean()
            df = df[features].interpolate()
            class_dfs.append(df)
        except FileNotFoundError:
            print(f"Warning: File {filepath} not found and skipped.")
    if class_dfs:
        dataTrain.append(pd.concat(class_dfs))
combined_train_data = pd.concat(dataTrain)

dataTest = []
for class_files in datafiles[1]:
    class_dfs = []
    for base_filename in class_files:
        filepath = os.path.join(data_dir, f'{base_filename}.csv')
        try:
            df = pd.read_csv(filepath)
            df['timestamp'] = pd.to_datetime(df['datetime'], format='%m/%d/%Y %H:%M', errors='coerce')
            df['timestamp'] = df['timestamp'].fillna(pd.to_datetime(df['datetime'], format='%d-%m-%Y %H:%M:%S', errors='coerce'))
            for col in features:
                df[col] = pd.to_numeric(df[col], errors='coerce')
            df = df.set_index('timestamp').resample('5Min')[features].mean()
            df = df[features].interpolate()
            class_dfs.append(df)
        except FileNotFoundError:
            print(f"Warning: File {filepath} not found and skipped.")
    if class_dfs:
        dataTest.append(pd.concat(class_dfs))
# Normalize data
scaler = StandardScaler()
scaled_train_data = scaler.fit_transform(combined_train_data[features])
scaled_test_data_list = []
for df in dataTest:
    scaled_test_data_list.append(scaler.transform(df[features]))
scaled_train_df = pd.DataFrame(scaled_train_data, columns=features, index=combined_train_data.index)
scaled_test_df_list = [pd.DataFrame(data, columns=features, index=df.index) for data, df in zip(scaled_test_data_list, dataTest)]
# Create time sequences (no rate-of-change features)
def create_sequences(data, timesteps):
    sequences = []
    for i in range(len(data) - timesteps + 1):
        sequences.append(data[i:i + timesteps])
    return np.array(sequences)
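# Example: with timesteps = 3, a series of length 5 yields 3 overlapping
# windows, data[0:3], data[1:4], data[2:5], i.e. shape (3, 3, n_features).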
X_train_sequences = create_sequences(scaled_train_df.values, timesteps)
# X_test_sequences_list is computed later in the main execution block

# Train K-Means model on all training data
n_samples_train, n_timesteps_train, n_features_train = X_train_sequences.shape
X_train_reshaped = X_train_sequences.reshape(n_samples_train, n_timesteps_train * n_features_train)
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=n_init)  # n_init from options
kmeans.fit(X_train_reshaped)
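# Design note: each (timesteps, n_features) window is flattened into a single
# vector of length timesteps * n_features, so K-Means clusters whole windows
# by Euclidean distance in the flattened space rather than individual samples.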
# Function to detect change points (sharp jumps or drops between consecutive samples).
# Works on a single 2D data array of shape (samples, features).
def detect_change_points(data, threshold=0.8):
    change_points = []
    # Iterate through the data points starting from the second one
    for i in range(1, len(data)):
        # Absolute difference between the current point and the previous one
        difference = np.abs(data[i] - data[i - 1])
        # If the difference for ANY feature exceeds the threshold, mark a change point
        if np.any(difference > threshold):
            change_points.append(i)
    return np.array(change_points)
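# Because this is applied to StandardScaler output below, the default threshold
# of 0.8 means "a jump of 0.8 training-set standard deviations in any feature
# between consecutive 5-minute samples".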
# Function to plot clustered data (accepts sequence indices and optionally shows change points)
def plot_clustered_data(df, predicted_clusters, time_index, n_clusters, features, featureNames, unitNames, show_cp=False, change_point_indices=None):
    num_features = len(features)
    fig, axes = plt.subplots(num_features, 1, figsize=(15, 5 * num_features), sharex=True)
    if num_features == 1:
        axes = [axes]  # Ensure axes is always an array
    colors = plt.cm.viridis(np.linspace(0, 1, n_clusters))
    for i, feature in enumerate(features):
        # Plot data points colored by their assigned cluster
        for cluster_id in range(n_clusters):
            cluster_indices_kmeans = np.where(predicted_clusters == cluster_id)[0]
            if len(cluster_indices_kmeans) > 0:
                # Use time_index for the x-axis and the original df for y-axis values;
                # alpha shows overlapping points more clearly
                axes[i].scatter(time_index[cluster_indices_kmeans], df[feature].loc[time_index[cluster_indices_kmeans]],
                                color=colors[cluster_id], label=f'Cluster {cluster_id}', s=10, alpha=0.6)
        axes[i].set_ylabel(f'{featureNames[feature]} {unitNames[feature]}')
        axes[i].set_title(featureNames[feature])
        axes[i].grid(True, linestyle='--', alpha=0.6)  # Grid for readability
        # Plot change points if enabled
        if show_cp and change_point_indices is not None:
            # change_point_indices must contain datetime objects matching time_index
            for cp_time in change_point_indices:
                axes[i].axvline(x=cp_time, color='red', linestyle='--', linewidth=1.5,
                                label='Change Point' if i == 0 else '', alpha=0.8)  # Label change points only once
    # Build a deduplicated legend on the last subplot, including change points if plotted
    handles, labels = [], []
    for ax in axes:
        for handle, label in zip(*ax.get_legend_handles_labels()):
            if label not in labels:
                handles.append(handle)
                labels.append(label)
    if handles:
        axes[-1].legend(handles, labels, loc='upper right')
    plt.tight_layout()
    plt.show()
# Combined evaluation function (full and delayed)
def evaluate_and_report(kmeans_model, scaled_test_data_list, original_test_data_list, true_labels_list, timesteps, delay_steps, features, options):
    all_y_true_full = []  # True labels for all test sequences
    all_predicted_cluster_labels_full = []  # Predicted clusters for all test sequences
    all_original_test_sequences_full = []  # Original sequences, kept for potential anomaly/misclassification plots
    all_change_points_detected_list = []  # Detected change points for each test file

    # --- 1. Collect data and predict clusters for ALL test sequences ---
    for k, (scaled_df, original_df, y_true_categorical) in enumerate(zip(scaled_test_data_list, original_test_data_list, true_labels_list)):
        original_indices = original_df.index
        # time_index for plotting corresponds to the end of each sequence
        time_index = original_indices[timesteps - 1:]
        sequences = create_sequences(scaled_df.values, timesteps)
        if sequences.size == 0:
            print(f"Warning: No sequences generated for test file {k}. Skipping.")
            all_change_points_detected_list.append([])  # Append empty list for consistency
            continue  # Skip to next file
        n_sequences = sequences.shape[0]
        reshaped_sequences = sequences.reshape(n_sequences, -1)
        predicted_clusters = kmeans_model.predict(reshaped_sequences)

        # Collect true labels and predicted clusters for the FULL evaluation
        all_y_true_full.extend(y_true_categorical[timesteps - 1:])
        all_predicted_cluster_labels_full.extend(predicted_clusters)
        # Collect original sequences (aligned with sequence ends) for potential plotting
        for i in range(n_sequences):
            start_index = original_df.index.get_loc(time_index[i]) - (timesteps - 1)
            end_index = start_index + timesteps
            all_original_test_sequences_full.append(original_df[features].iloc[start_index:end_index].values)

        # Detect change points for this test file (on scaled data)
        change_points = detect_change_points(scaled_df.values, threshold=0.8)  # Adjust threshold as needed
        # Map change point indices to the time_index of sequences: a change point at
        # original index cp affects the sequence ending at cp, i.e. sequence index
        # cp - (timesteps - 1), which must lie in [0, n_sequences) to be valid.
        change_point_sequence_indices = change_points - (timesteps - 1)
        valid_change_point_sequence_indices = change_point_sequence_indices[
            (change_point_sequence_indices >= 0) & (change_point_sequence_indices < n_sequences)]
        # Store the time index of the valid change point sequences; an empty list keeps
        # cp_time_indices defined for the plotting call below even when none are found
        cp_time_indices = time_index[valid_change_point_sequence_indices].tolist() if valid_change_point_sequence_indices.size > 0 else []
        all_change_points_detected_list.append(cp_time_indices)
        # Plot clustered data for the current test file if requested and transition=False
        # (to avoid duplicate plots handled later)
        if options.plot_clustered and not options.transition:
            print(f"\nClustered Data for Test File {k}:")
            plot_clustered_data(original_df, predicted_clusters, time_index, kmeans_model.n_clusters, features, featureNames, unitNames, show_cp=show_change_points, change_point_indices=cp_time_indices if show_change_points else None)
    all_y_true_full = np.array(all_y_true_full)
    all_predicted_cluster_labels_full = np.array(all_predicted_cluster_labels_full)
    all_original_test_sequences_full = np.array(all_original_test_sequences_full)

    # --- 2. Perform FULL evaluation (on all test sequences) ---
    print("\n--- Full Evaluation Results (All Test Sequences) ---")
    # Analyze clusters and assign a dominant true label to each cluster, based on ALL test sequences
    cluster_dominant_label_full = {}
    for cluster_id in range(kmeans_model.n_clusters):
        indices_in_cluster = np.where(all_predicted_cluster_labels_full == cluster_id)[0]
        if len(indices_in_cluster) > 0:
            labels_in_cluster = all_y_true_full[indices_in_cluster]
            if len(labels_in_cluster) > 0:
                # np.bincount + np.argmax pick the most frequent (dominant) label
                dominant_label = np.argmax(np.bincount(labels_in_cluster))
                cluster_dominant_label_full[cluster_id] = dominant_label
            else:
                cluster_dominant_label_full[cluster_id] = -1  # No labeled data in this cluster
        else:
            cluster_dominant_label_full[cluster_id] = -1  # Empty cluster
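    # Example of the majority-vote mapping: if cluster 3 contains 40 sequences
    # with true label 1 and 10 with true label 2, every sequence assigned to
    # cluster 3 is predicted as label 1 below.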
    # Create predicted labels for the full evaluation from the dominant label of the assigned cluster
    predicted_labels_numeric_full = np.array([cluster_dominant_label_full.get(cluster_id, -1) for cluster_id in all_predicted_cluster_labels_full])
    # Evaluate (using numeric labels for the full set)
    valid_indices_full = predicted_labels_numeric_full != -1
    if np.sum(valid_indices_full) > 0 and len(np.unique(all_y_true_full[valid_indices_full])) > 1:
        print("Classification Report (Full Test Set):")
        print(classification_report(all_y_true_full[valid_indices_full], predicted_labels_numeric_full[valid_indices_full]))
        cm_full = confusion_matrix(all_y_true_full[valid_indices_full], predicted_labels_numeric_full[valid_indices_full])
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm_full, annot=True, fmt='d', cmap='Blues')
        plt.xlabel('Predicted Cluster (Dominant True Label)')
        plt.ylabel('True Label')
        plt.title('Confusion Matrix (Full Test Set)')
        plt.show()
    else:
        print("Could not perform full evaluation (not enough data or classes after mapping).")
    # --- 3. Perform DELAYED evaluation (on the subset left after the delay) ---
    print("\n--- Delayed Evaluation Results (Subset after Delay) ---")
    all_y_true_delayed = []
    all_predicted_cluster_labels_delayed = []
    # Mapping change points onto the combined sequence index would be error-prone,
    # so the delay logic is applied PER FILE and the surviving sequences are then
    # combined, matching what the original evaluate_with_delay did.
    for k, (scaled_df, original_df, y_true_categorical) in enumerate(zip(scaled_test_data_list, original_test_data_list, true_labels_list)):
        original_indices = original_df.index
        time_index = original_indices[timesteps - 1:]
        sequences = create_sequences(scaled_df.values, timesteps)
        if sequences.size == 0:
            continue  # Skip empty files
        n_sequences = sequences.shape[0]
        reshaped_sequences = sequences.reshape(n_sequences, -1)
        predicted_clusters = kmeans_model.predict(reshaped_sequences)
        # Detect change points for THIS file (on scaled data)
        change_points = detect_change_points(scaled_df.values, threshold=0.8)  # Adjust threshold as needed
        # Apply the delay logic per file
        evaluation_allowed_file = np.ones(n_sequences, dtype=bool)
        # Map original-data change point indices to sequence indices for the delay logic
        change_point_sequence_indices_file = change_points - (timesteps - 1)
        valid_change_point_sequence_indices_file = change_point_sequence_indices_file[
            (change_point_sequence_indices_file >= 0) & (change_point_sequence_indices_file < n_sequences)]
        for cp_seq_index in valid_change_point_sequence_indices_file:
            start_delay = max(0, cp_seq_index)
            end_delay = min(n_sequences, cp_seq_index + delay_steps)
            evaluation_allowed_file[start_delay:end_delay] = False
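        # Example: with delay_steps = 10, a change point at sequence index 50
        # masks sequences 50-59, so they are excluded from the delayed report.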
        # Collect data for the DELAYED evaluation (only where evaluation_allowed_file is True),
        # using the dominant-label mapping computed on the FULL test set for consistency
        predicted_labels_numeric_file = np.array([cluster_dominant_label_full.get(cluster, -1) for cluster in predicted_clusters])
        true_labels_file = y_true_categorical[timesteps - 1:]  # True labels aligned with sequence ends
        all_y_true_delayed.extend(true_labels_file[evaluation_allowed_file])
        all_predicted_cluster_labels_delayed.extend(predicted_labels_numeric_file[evaluation_allowed_file])

    all_y_true_delayed = np.array(all_y_true_delayed)
    all_predicted_cluster_labels_delayed = np.array(all_predicted_cluster_labels_delayed)
    # Perform the delayed evaluation
    valid_indices_delayed = all_predicted_cluster_labels_delayed != -1
    if np.sum(valid_indices_delayed) > 0 and len(np.unique(all_y_true_delayed[valid_indices_delayed])) > 1:
        print("Classification Report (Subset after Delay):")
        print(classification_report(all_y_true_delayed[valid_indices_delayed], all_predicted_cluster_labels_delayed[valid_indices_delayed]))
        cm_delayed = confusion_matrix(all_y_true_delayed[valid_indices_delayed], all_predicted_cluster_labels_delayed[valid_indices_delayed])
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm_delayed, annot=True, fmt='d', cmap='Blues')
        plt.xlabel('Predicted Label (Delayed)')
        plt.ylabel('True Label')
        plt.title('Confusion Matrix (Subset after Delay)')
        plt.show()
    else:
        print("Could not perform delayed evaluation (not enough data after delay or classes).")
    # --- 4. Report detected change points ---
    print("\nDetected Change Points (Start Time of Sequence after Change):")
    # Print the collected list of change point time indices per file
    for i, cp_list in enumerate(all_change_points_detected_list):
        print(f"File {i}: {cp_list}")
    # Note: anomaly and misclassification plotting is not implemented in this version
    # because of the complexity of aligning plots with the delayed-evaluation subset.
# Main execution
if __name__ == "__main__":
    # Training and test data are loaded and preprocessed at module level above.
    # Create the true labels list for the test data
    true_labels_list = []
    for i, df in enumerate(dataTest):
        true_labels_list.append(np.full(len(df), i))
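    # Each file group's position in dataTest doubles as its numeric class label,
    # giving NumberOfFailures + 1 classes in total.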
    # Plot raw data if requested
    if options.plot_raw:
        print("\nPlotting Raw Data:")
        num_features = len(features)
        fig, axes = plt.subplots(num_features, 1, figsize=(15, 5 * num_features), sharex=True)
        if num_features == 1:
            axes = [axes]
        for i, feature in enumerate(features):
            for k, df in enumerate(dataTest):
                axes[i].plot(df.index, df[feature], label=f'Class {k}', alpha=0.7)
            axes[i].set_ylabel(f'{featureNames[feature]} {unitNames[feature]}')
            axes[i].set_title(featureNames[feature])
        axes[-1].legend(loc='upper right')  # Legend on the last subplot
        plt.tight_layout()
        plt.show()
    # Predict clusters for the training sequences (used for the training-set plot)
    train_sequences = create_sequences(scaled_train_df.values, timesteps)
    train_reshaped_sequences = train_sequences.reshape(train_sequences.shape[0], -1)
    train_predicted_clusters = kmeans.predict(train_reshaped_sequences)
    train_time_index = combined_train_data.index[timesteps - 1:]
    # Plot clustered data for the training set if requested
    if options.plot_clustered:
        print("\nClustered Data for Training Set:")
        # Change point detection for the training data (optional)
        train_change_points = detect_change_points(scaled_train_df.values, threshold=0.8)
        train_change_point_sequence_indices = train_change_points - (timesteps - 1)
        valid_train_change_point_sequence_indices = train_change_point_sequence_indices[
            (train_change_point_sequence_indices >= 0) & (train_change_point_sequence_indices < train_sequences.shape[0])]
        train_cp_time_indices = train_time_index[valid_train_change_point_sequence_indices].tolist() if valid_train_change_point_sequence_indices.size > 0 else None
        plot_clustered_data(combined_train_data.loc[train_time_index], train_predicted_clusters, train_time_index, n_clusters, features, featureNames, unitNames, show_cp=show_change_points, change_point_indices=train_cp_time_indices)
    # Plot clustered data for the TEST set (per file) if requested;
    # change points are also detected and plotted for each file
    if options.plot_clustered:
        print("\nClustered Data for Test Sets (per file):")
        for k, (scaled_df, original_df) in enumerate(zip(scaled_test_df_list, dataTest)):
            original_indices = original_df.index
            time_index = original_indices[timesteps - 1:]
            sequences = create_sequences(scaled_df.values, timesteps)
            if sequences.size == 0:
                continue
            reshaped_sequences = sequences.reshape(sequences.shape[0], -1)
            predicted_clusters = kmeans.predict(reshaped_sequences)
            # Change point detection for this test file
            change_points = detect_change_points(scaled_df.values, threshold=0.8)
            change_point_sequence_indices = change_points - (timesteps - 1)
            valid_change_point_sequence_indices = change_point_sequence_indices[
                (change_point_sequence_indices >= 0) & (change_point_sequence_indices < sequences.shape[0])]
            cp_time_indices = time_index[valid_change_point_sequence_indices].tolist() if valid_change_point_sequence_indices.size > 0 else None
            print(f" Plotting Test File {k}")
            plot_clustered_data(original_df, predicted_clusters, time_index, n_clusters, features, featureNames, unitNames, show_cp=show_change_points, change_point_indices=cp_time_indices)
    # Perform evaluation (full and delayed); this function handles all evaluation
    # reporting and confusion matrix plotting
    evaluate_and_report(kmeans, scaled_test_df_list, dataTest, true_labels_list, timesteps, delay_steps, features, options)

    # Calculate and print Inertia and Silhouette Score for the combined test data.
    # Build each file's sequences once and keep only non-empty arrays, so np.vstack
    # is never called on an empty list.
    test_sequence_arrays = [s for s in (create_sequences(df.values, timesteps) for df in scaled_test_df_list) if s.size > 0]
    if test_sequence_arrays:
        X_test_sequences_combined = np.vstack(test_sequence_arrays)
        X_test_combined_reshaped = X_test_sequences_combined.reshape(X_test_sequences_combined.shape[0], -1)
        all_cluster_labels_test_combined = kmeans.predict(X_test_combined_reshaped)
        print("\n--- K-Means Model Evaluation (Overall Metrics on Combined Test Data) ---")
        print(f"Inertia: {kmeans.inertia_:.4f}")  # Inertia comes from the training fit
        # Silhouette score on the combined test data
        if len(np.unique(all_cluster_labels_test_combined)) > 1:
            silhouette = silhouette_score(X_test_combined_reshaped, all_cluster_labels_test_combined)
            print(f"Silhouette Score: {silhouette:.4f}")
        else:
            print("Silhouette Score: Not applicable for a single cluster on combined test data.")
    else:
        print("\n--- K-Means Model Evaluation (Overall Metrics) ---")
        print("No test data sequences available to evaluate overall Inertia and Silhouette Score.")
    # Note: anomaly and misclassification plotting is not implemented in this version.
