Ako data: unsupervised/reinforcement learning for anomaly detection and clustering. Models are trained without labels and evaluated against labeled anomaly data.

robust_scaler_roc_cp_delayed.py (28 KB)
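
A minimal usage sketch (assuming the data/ directory of per-day CSV files the script expects; all flags are defined in the argparse block below):

    python robust_scaler_roc_cp_delayed.py --timesteps 20 --n_clusters 5 --plot_clustered --show_change_points

Add --transition to evaluate on the transition file lists, and --use_standard_scaler to swap the default RobustScaler for StandardScaler.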

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, RobustScaler  # both scalers; the choice is made via a CLI flag
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report, confusion_matrix, silhouette_score
import argparse
import os
import seaborn as sns

# Command-line arguments
parser = argparse.ArgumentParser(description='Anomaly detection using K-Means clustering with Rate of Change, Change Point Detection, and Delayed Evaluation.')
parser.add_argument('--timesteps', type=int, default=20, help='Number of timesteps for sequences.')
parser.add_argument('--n_clusters', type=int, default=5, help='Number of clusters for K-Means.')
parser.add_argument('--n_init', type=int, default=10, help='Number of initializations for K-Means.')
parser.add_argument('--transition', action='store_true', help='Use transition data for testing.')
parser.add_argument('--plot_raw', action='store_true', help='Plot raw data.')
parser.add_argument('--plot_clustered', action='store_true', help='Plot clustered data.')
parser.add_argument('--delay', type=int, default=10, help='Number of timesteps to delay evaluation after a change point.')
parser.add_argument('--show_change_points', action='store_true', help='Show change points on clustered plots.')
parser.add_argument('--use_standard_scaler', action='store_true', help='Use StandardScaler instead of RobustScaler.')
# Anomaly/misclassified plotting flags are not implemented in this version.
options = parser.parse_args()

# Parameters
n_clusters = options.n_clusters
timesteps = options.timesteps
n_init = options.n_init
delay_steps = options.delay
show_change_points = options.show_change_points
use_standard_scaler = options.use_standard_scaler

# Data loading
NumberOfFailures = 4
datafiles = [[], []]
for i in range(NumberOfFailures + 1):
    datafiles[0].append([])
    datafiles[1].append([])
datafiles[0][0] = ['2024-08-07_5_', '2024-08-08_5_', '2025-01-25_5_', '2025-01-26_5_']
datafiles[0][1] = ['2024-12-11_5_', '2024-12-12_5_', '2024-12-13_5_']
datafiles[0][2] = ['2024-12-18_5_', '2024-12-21_5_', '2024-12-22_5_', '2024-12-23_5_', '2024-12-24_5_']
datafiles[0][3] = ['2024-12-28_5_', '2024-12-29_5_', '2024-12-30_5_']
datafiles[0][4] = ['2025-02-13_5_', '2025-02-14_5_']
if options.transition:
    datafiles[1][0] = ['2025-01-27_5_', '2025-01-28_5_']
    datafiles[1][1] = ['2024-12-14_5_', '2024-12-15_5_', '2024-12-16_5_']
    datafiles[1][2] = ['2024-12-17_5_', '2024-12-19_5_', '2024-12-25_5_', '2024-12-26_5_']
    datafiles[1][3] = ['2024-12-27_5_', '2024-12-31_5_', '2025-01-01_5_']
    datafiles[1][4] = ['2025-02-12_5_', '2025-02-15_5_', '2025-02-16_5_']
else:
    datafiles[1][0] = ['2025-01-27_5_', '2025-01-28_5_']
    datafiles[1][1] = ['2024-12-14_5_', '2024-12-15_5_']
    datafiles[1][2] = ['2024-12-19_5_', '2024-12-25_5_', '2024-12-26_5_']
    datafiles[1][3] = ['2024-12-31_5_', '2025-01-01_5_']
    datafiles[1][4] = ['2025-02-15_5_', '2025-02-16_5_']

features = ['r1 s1', 'r1 s4', 'r1 s5']
n_original_features = len(features)  # original feature count; sequences add rate-of-change columns later
# LaTeX display names and units for plots
featureNames = {'r1 s1': r'$T_{evap}$', 'r1 s4': r'$T_{cond}$', 'r1 s5': r'$T_{air}$'}
unitNames = {'r1 s1': r'($^o$C)', 'r1 s4': r'($^o$C)', 'r1 s5': r'($^o$C)'}
NumFeatures = len(features)

# Load and preprocess data
script_dir = os.path.dirname(os.path.abspath(__file__))
data_dir = os.path.join(script_dir, 'data')

def load_class_data(class_files):
    """Load, parse, resample and interpolate all CSV files of one class."""
    class_dfs = []
    for base_filename in class_files:
        filepath = os.path.join(data_dir, f'{base_filename}.csv')
        try:
            df = pd.read_csv(filepath)
            # Two timestamp formats occur in the data; try both.
            df['timestamp'] = pd.to_datetime(df['datetime'], format='%m/%d/%Y %H:%M', errors='coerce')
            df['timestamp'] = df['timestamp'].fillna(pd.to_datetime(df['datetime'], format='%d-%m-%Y %H:%M:%S', errors='coerce'))
            for col in features:
                df[col] = pd.to_numeric(df[col], errors='coerce')
            df = df.set_index('timestamp').resample('5Min')[features].mean()
            df = df[features].interpolate()
            class_dfs.append(df)
        except FileNotFoundError:
            print(f"Warning: File {filepath} not found and skipped.")
    return class_dfs

dataTrain = []
for class_files in datafiles[0]:
    class_dfs = load_class_data(class_files)
    if class_dfs:
        dataTrain.append(pd.concat(class_dfs))
combined_train_data = pd.concat(dataTrain)

dataTest = []
for class_files in datafiles[1]:
    class_dfs = load_class_data(class_files)
    if class_dfs:
        dataTest.append(pd.concat(class_dfs))

# Normalize data (RobustScaler by default; StandardScaler if --use_standard_scaler is set)
scaler = StandardScaler() if use_standard_scaler else RobustScaler()
scaled_train_data = scaler.fit_transform(combined_train_data[features])
scaled_test_data_list = [scaler.transform(df[features]) for df in dataTest]  # NumPy arrays
# Convert scaled arrays back to DataFrames to keep the time index for plotting and alignment
scaled_train_df = pd.DataFrame(scaled_train_data, columns=features, index=combined_train_data.index)
scaled_test_df_list = [pd.DataFrame(data, columns=features, index=df.index)
                       for data, df in zip(scaled_test_data_list, dataTest)]

# Create time sequences augmented with rate of change
def create_sequences_with_rate_of_change(data, timesteps, original_features_count):
    """Build overlapping windows over [features, rate of change]; returns shape
    (n_sequences, timesteps, 2 * original_features_count)."""
    # First-order difference along time; the scalar prepend keeps the shape, leaving NaN only in row 0
    rate_of_change_full = np.diff(data, axis=0, prepend=np.nan)
    # Horizontally stack original data and rate of change: (len(data), 2 * original_features_count)
    combined_data = np.hstack((data, rate_of_change_full))
    sequences = []
    # Start at 1 so the first window, whose rate-of-change row is NaN, is skipped
    for i in range(1, len(combined_data) - timesteps + 1):
        sequences.append(combined_data[i:i + timesteps])
    return np.array(sequences)
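
# Illustrative shape check (assumed example values, not from the dataset): with
# timesteps=20 and 3 original features, an input of shape (1000, 3) becomes a
# combined array of shape (1000, 6) and yields 1000 - 20 = 980 windows of shape
# (20, 6), because the NaN-bearing first window is dropped:
#   assert create_sequences_with_rate_of_change(np.zeros((1000, 3)), 20, 3).shape == (980, 20, 6)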

X_train_sequences = create_sequences_with_rate_of_change(scaled_train_df.values, timesteps, n_original_features)

# Train K-Means on all training sequences
n_samples_train, n_timesteps_train, n_total_features_train = X_train_sequences.shape  # features now include rate of change
X_train_reshaped = X_train_sequences.reshape(n_samples_train, n_timesteps_train * n_total_features_train)  # flatten each window
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=n_init)
kmeans.fit(X_train_reshaped)
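
# Dimensionality note (a sketch with the defaults above, not a requirement): each
# window is flattened to a vector of length timesteps * 2 * n_original_features
# = 20 * 6 = 120, and K-Means clusters these 120-dimensional vectors with plain
# Euclidean distance, so temperatures and their rates of change contribute on
# whatever scales the chosen scaler produces.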

# Change point detection: flag sharp jumps or drops between consecutive samples.
# Works on a single 2D array (samples, features); it is typically better applied to
# the original scaled features BEFORE the rate-of-change columns are added.
def detect_change_points(data, threshold=0.8):
    change_points = []
    for i in range(1, len(data)):
        # Absolute step between the current and previous sample, per feature
        difference = np.abs(data[i] - data[i - 1])
        # Mark a change point if ANY feature jumps by more than the threshold
        if np.any(difference > threshold):
            change_points.append(i)
    return np.array(change_points)
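
# The 0.8 default threshold is in scaled units and is a tuning assumption rather
# than a derived value: under the default RobustScaler a unit step is roughly one
# interquartile range of the training data per 5-minute sample, while with
# --use_standard_scaler it corresponds to one standard deviation.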

# Plot clustered data (accepts sequence-aligned time indices and optional change points)
def plot_clustered_data(df, predicted_clusters, time_index, n_clusters, features, featureNames, unitNames, show_cp=False, change_point_indices=None):
    # 'features' here is the list of original feature names ('r1 s1', 'r1 s4', 'r1 s5')
    num_original_features = len(features)
    fig, axes = plt.subplots(num_original_features, 1, figsize=(15, 5 * num_original_features), sharex=True)
    if num_original_features == 1:
        axes = [axes]  # ensure axes is always indexable
    colors = plt.cm.viridis(np.linspace(0, 1, n_clusters))
    for i, feature in enumerate(features):  # plot original features only
        # Data points colored by their assigned cluster
        for cluster_id in range(n_clusters):
            cluster_indices_kmeans = np.where(predicted_clusters == cluster_id)[0]
            if len(cluster_indices_kmeans) > 0:
                # time_index supplies the x-axis; the original df supplies y-axis values
                axes[i].scatter(time_index[cluster_indices_kmeans], df[feature].loc[time_index[cluster_indices_kmeans]],
                                color=colors[cluster_id], label=f'Cluster {cluster_id}', s=10, alpha=0.6)
        axes[i].set_ylabel(f'{featureNames[feature]} {unitNames[feature]}')
        axes[i].set_title(featureNames[feature])
        axes[i].grid(True, linestyle='--', alpha=0.6)
        # Change points, if enabled (change_point_indices must hold datetimes matching time_index)
        if show_cp and change_point_indices is not None:
            for cp_time in change_point_indices:
                axes[i].axvline(x=cp_time, color='red', linestyle='--', linewidth=1.5,
                                label='Change Point' if i == 0 else '', alpha=0.8)
    # Build a de-duplicated legend from all axes and attach it to the last subplot
    handles, labels = [], []
    for ax in axes:
        for handle, label in zip(*ax.get_legend_handles_labels()):
            if label not in labels:
                handles.append(handle)
                labels.append(label)
    if handles:
        axes[-1].legend(handles, labels, loc='upper right')
    plt.tight_layout()
    plt.show()

# Combined evaluation: full test set and delayed subset
def evaluate_and_report(kmeans_model, scaled_test_df_list, original_test_data_list, true_labels_list, timesteps, delay_steps, features, options, n_original_features):
    all_y_true_full = []                    # true labels for all test sequences
    all_predicted_cluster_labels_full = []  # predicted clusters for all test sequences
    all_change_points_detected_list = []    # detected change point timestamps, per test file

    # --- 1. Collect data and predict clusters for ALL test sequences ---
    for k, (scaled_df, original_df, y_true_categorical) in enumerate(zip(scaled_test_df_list, original_test_data_list, true_labels_list)):
        original_indices = original_df.index
        # Sequences start at row 1, so sequence j ends at original index j + timesteps;
        # time_index therefore aligns labels and plots with sequence end times.
        time_index = original_indices[timesteps:]
        sequences = create_sequences_with_rate_of_change(scaled_df.values, timesteps, n_original_features)
        if sequences.size == 0:
            print(f"Warning: No sequences generated for test file {k}. Skipping.")
            all_change_points_detected_list.append([])  # keep the list aligned with files
            continue
        n_sequences = sequences.shape[0]
        reshaped_sequences = sequences.reshape(n_sequences, -1)
        predicted_clusters = kmeans_model.predict(reshaped_sequences)
        # True labels aligned with the sequences actually created: the first sequence
        # ends at original index `timesteps`, so slice from there.
        all_y_true_full.extend(y_true_categorical[timesteps:])
        all_predicted_cluster_labels_full.extend(predicted_clusters)

        # Detect change points on the original scaled features (before rate of change is added)
        change_points = detect_change_points(scaled_df.values, threshold=0.8)  # tune threshold as needed
        # A change at original index cp belongs to the sequence ENDING at cp, i.e.
        # sequence array index cp - timesteps; keep only indices inside [0, n_sequences).
        change_point_sequence_indices = change_points - timesteps
        valid_change_point_sequence_indices = change_point_sequence_indices[(change_point_sequence_indices >= 0) & (change_point_sequence_indices < n_sequences)]
        cp_time_indices = time_index[valid_change_point_sequence_indices].tolist() if valid_change_point_sequence_indices.size > 0 else []
        all_change_points_detected_list.append(cp_time_indices)

        # Plot clustered data for this file if requested (transition plots are handled in main)
        if options.plot_clustered and not options.transition:
            print(f"\nClustered Data for Test File {k}:")
            plot_clustered_data(original_df, predicted_clusters, time_index, kmeans_model.n_clusters, features, featureNames, unitNames,
                                show_cp=show_change_points, change_point_indices=cp_time_indices if show_change_points else None)

    # Convert collected lists to numpy arrays for evaluation
    all_y_true_full = np.array(all_y_true_full)
    all_predicted_cluster_labels_full = np.array(all_predicted_cluster_labels_full)

    # --- 2. FULL evaluation (all test sequences) ---
    print("\n--- Full Evaluation Results (All Test Sequences) ---")
    # Assign each cluster the dominant true label among ALL test sequences it contains
    cluster_dominant_label_full = {}
    for cluster_id in range(kmeans_model.n_clusters):
        indices_in_cluster = np.where(all_predicted_cluster_labels_full == cluster_id)[0]
        if len(indices_in_cluster) > 0:
            labels_in_cluster = all_y_true_full[indices_in_cluster]
            # np.bincount + argmax picks the most frequent (dominant) label
            cluster_dominant_label_full[cluster_id] = np.argmax(np.bincount(labels_in_cluster))
        else:
            cluster_dominant_label_full[cluster_id] = -1  # empty cluster
    # Predicted label of a sequence = dominant label of its assigned cluster
    predicted_labels_numeric_full = np.array([cluster_dominant_label_full.get(cluster_id, -1) for cluster_id in all_predicted_cluster_labels_full])
    # Evaluate on sequences whose cluster received a valid mapping
    valid_indices_full = predicted_labels_numeric_full != -1
    if np.sum(valid_indices_full) > 0 and len(np.unique(all_y_true_full[valid_indices_full])) > 1:
        print("Classification Report (Full Test Set):")
        print(classification_report(all_y_true_full[valid_indices_full], predicted_labels_numeric_full[valid_indices_full]))
        cm_full = confusion_matrix(all_y_true_full[valid_indices_full], predicted_labels_numeric_full[valid_indices_full])
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm_full, annot=True, fmt='d', cmap='Blues')
        plt.xlabel('Predicted Label (Dominant Label of Assigned Cluster)')
        plt.ylabel('True Label')
        plt.title('Confusion Matrix (Full Test Set)')
        plt.show()
    else:
        print("Could not perform full evaluation (not enough data or classes after mapping).")
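
    # Worked example of the mapping (hypothetical counts, for illustration only):
    # if cluster 3 holds 120 sequences with true label 2 and 15 with true label 0,
    # np.bincount gives [15, 0, 120, ...], argmax picks label 2, and every sequence
    # assigned to cluster 3 is scored as a prediction of class 2.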

    # --- 3. DELAYED evaluation (skip a window of sequences after each change point) ---
    print("\n--- Delayed Evaluation Results (Subset after Delay) ---")
    all_y_true_delayed = []
    all_predicted_cluster_labels_delayed = []
    sequence_count_so_far = 0  # offset of this file's sequences within the full arrays
    for k, (scaled_df, original_df, y_true_categorical) in enumerate(zip(scaled_test_df_list, original_test_data_list, true_labels_list)):
        sequences = create_sequences_with_rate_of_change(scaled_df.values, timesteps, n_original_features)
        if sequences.size == 0:
            continue  # empty files contributed no sequences above either
        n_sequences_file = sequences.shape[0]
        # Change points for THIS file, again on the original scaled features
        change_points = detect_change_points(scaled_df.values, threshold=0.8)
        # Mask out the delay window after each change point, per file
        evaluation_allowed_file = np.ones(n_sequences_file, dtype=bool)
        change_point_sequence_indices_file = change_points - timesteps
        valid_change_point_sequence_indices_file = change_point_sequence_indices_file[(change_point_sequence_indices_file >= 0) & (change_point_sequence_indices_file < n_sequences_file)]
        for cp_seq_index in valid_change_point_sequence_indices_file:
            end_delay = min(n_sequences_file, cp_seq_index + delay_steps)
            evaluation_allowed_file[cp_seq_index:end_delay] = False
        # Collect the allowed subset, reusing the dominant-label mapping from the FULL evaluation
        predicted_clusters_file = all_predicted_cluster_labels_full[sequence_count_so_far : sequence_count_so_far + n_sequences_file]
        predicted_labels_numeric_file = np.array([cluster_dominant_label_full.get(cluster, -1) for cluster in predicted_clusters_file])
        true_labels_file = all_y_true_full[sequence_count_so_far : sequence_count_so_far + n_sequences_file]
        all_y_true_delayed.extend(true_labels_file[evaluation_allowed_file])
        all_predicted_cluster_labels_delayed.extend(predicted_labels_numeric_file[evaluation_allowed_file])
        sequence_count_so_far += n_sequences_file  # advance the slicing offset
    all_y_true_delayed = np.array(all_y_true_delayed)
    all_predicted_cluster_labels_delayed = np.array(all_predicted_cluster_labels_delayed)
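
    # Worked example of the delay mask (illustrative, assuming the default
    # delay_steps=10): a change point mapped to sequence index 80 clears
    # evaluation_allowed_file[80:90], so the ten windows straddling the regime
    # change are excluded here while still counting in the full evaluation above.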

    # Delayed evaluation report
    valid_indices_delayed = all_predicted_cluster_labels_delayed != -1
    if np.sum(valid_indices_delayed) > 0 and len(np.unique(all_y_true_delayed[valid_indices_delayed])) > 1:
        print("Classification Report (Subset after Delay):")
        print(classification_report(all_y_true_delayed[valid_indices_delayed], all_predicted_cluster_labels_delayed[valid_indices_delayed]))
        cm_delayed = confusion_matrix(all_y_true_delayed[valid_indices_delayed], all_predicted_cluster_labels_delayed[valid_indices_delayed])
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm_delayed, annot=True, fmt='d', cmap='Blues')
        plt.xlabel('Predicted Label (Delayed)')
        plt.ylabel('True Label')
        plt.title('Confusion Matrix (Subset after Delay)')
        plt.show()
    else:
        print("Could not perform delayed evaluation (not enough data or classes after delay).")

    # --- 4. Report detected change points ---
    print("\nDetected Change Points (Start Time of Sequence after Change):")
    for i, cp_list in enumerate(all_change_points_detected_list):
        print(f"File {i}: {cp_list}")
    # Note: anomaly and misclassified plotting is not implemented in this version.
    return all_y_true_full, all_predicted_cluster_labels_full, all_change_points_detected_list

# Main execution
if __name__ == "__main__":
    # Training and test data are loaded and preprocessed at module level above.
    # Build the true-label list for the test data: file list k gets label k
    true_labels_list = []
    for i, df in enumerate(dataTest):
        true_labels_list.append(np.full(len(df), i))
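
    # Label convention: with NumberOfFailures = 4 this yields integer labels 0-4,
    # one per datafiles group. Whether label 0 denotes normal operation and 1-4 the
    # four failure types is an assumption about the dataset, not something the code
    # enforces.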

    # Plot raw data if requested
    if options.plot_raw:
        print("\nPlotting Raw Data:")
        num_features = len(features)
        fig, axes = plt.subplots(num_features, 1, figsize=(15, 5 * num_features), sharex=True)
        if num_features == 1:
            axes = [axes]
        for i, feature in enumerate(features):
            for k, df in enumerate(dataTest):
                axes[i].plot(df.index, df[feature], label=f'Class {k}', alpha=0.7)
            axes[i].set_ylabel(f'{featureNames[feature]} {unitNames[feature]}')
            axes[i].set_title(featureNames[feature])
        axes[-1].legend(loc='upper right')  # legend on the last subplot
        plt.tight_layout()
        plt.show()

    # Sequences and cluster predictions for the training set (used for optional plotting)
    train_sequences = create_sequences_with_rate_of_change(scaled_train_df.values, timesteps, n_original_features)
    train_reshaped_sequences = train_sequences.reshape(train_sequences.shape[0], -1)
    train_predicted_clusters = kmeans.predict(train_reshaped_sequences)
    train_time_index = combined_train_data.index[timesteps:]  # sequence j ends at original index j + timesteps

    if options.plot_clustered:
        print("\nClustered Data for Training Set:")
        # Optional change point detection on the original scaled training features
        train_change_points = detect_change_points(scaled_train_df.values, threshold=0.8)
        train_change_point_sequence_indices = train_change_points - timesteps
        valid_train_change_point_sequence_indices = train_change_point_sequence_indices[(train_change_point_sequence_indices >= 0) & (train_change_point_sequence_indices < train_sequences.shape[0])]
        train_cp_time_indices = train_time_index[valid_train_change_point_sequence_indices].tolist() if valid_train_change_point_sequence_indices.size > 0 else None
        # Use combined_train_data for the original (unscaled) values
        plot_clustered_data(combined_train_data.loc[train_time_index], train_predicted_clusters, train_time_index, n_clusters, features, featureNames, unitNames,
                            show_cp=show_change_points, change_point_indices=train_cp_time_indices)

    # Plot clustered data for each TEST file if requested (also computes per-file change points)
    if options.plot_clustered:
        print("\nClustered Data for Test Sets (per file):")
        for k, (scaled_df, original_df) in enumerate(zip(scaled_test_df_list, dataTest)):
            time_index = original_df.index[timesteps:]
            sequences = create_sequences_with_rate_of_change(scaled_df.values, timesteps, n_original_features)
            if sequences.size == 0:
                continue
            reshaped_sequences = sequences.reshape(sequences.shape[0], -1)
            predicted_clusters = kmeans.predict(reshaped_sequences)
            # Change points for this file, on the original scaled features
            change_points = detect_change_points(scaled_df.values, threshold=0.8)
            change_point_sequence_indices = change_points - timesteps
            valid_change_point_sequence_indices = change_point_sequence_indices[(change_point_sequence_indices >= 0) & (change_point_sequence_indices < sequences.shape[0])]
            cp_time_indices = time_index[valid_change_point_sequence_indices].tolist() if valid_change_point_sequence_indices.size > 0 else None
            print(f" Plotting Test File {k}")
            # Use original_df for the original (unscaled) values
            plot_clustered_data(original_df, predicted_clusters, time_index, n_clusters, features, featureNames, unitNames,
                                show_cp=show_change_points, change_point_indices=cp_time_indices)

    # Full and delayed evaluation (prints reports and plots confusion matrices)
    all_y_true_full, all_predicted_cluster_labels_full, all_change_points_detected_list = evaluate_and_report(
        kmeans, scaled_test_df_list, dataTest, true_labels_list, timesteps, delay_steps, features, options, n_original_features)

    # Overall K-Means metrics (inertia and silhouette) on the combined test data
    test_sequence_arrays = [create_sequences_with_rate_of_change(df.values, timesteps, n_original_features) for df in scaled_test_df_list]
    test_sequence_arrays = [arr for arr in test_sequence_arrays if arr.size > 0]
    if test_sequence_arrays and all_predicted_cluster_labels_full.size > 0:
        X_test_sequences_combined = np.vstack(test_sequence_arrays)
        X_test_combined_reshaped = X_test_sequences_combined.reshape(X_test_sequences_combined.shape[0], -1)
        print("\n--- K-Means Model Evaluation (Overall Metrics on Combined Test Data) ---")
        print(f"Inertia: {kmeans.inertia_:.4f}")  # inertia comes from the training fit
        # Silhouette score on the combined test data, using the predicted cluster labels
        if len(np.unique(all_predicted_cluster_labels_full)) > 1:
            try:
                silhouette = silhouette_score(X_test_combined_reshaped, all_predicted_cluster_labels_full)
                print(f"Silhouette Score: {silhouette:.4f}")
            except ValueError as e:
                print(f"Silhouette Score: Could not calculate ({e})")
        else:
            print("Silhouette Score: Not applicable (fewer than two clusters present in test predictions).")
    else:
        print("\n--- K-Means Model Evaluation (Overall Metrics) ---")
        print("No test data sequences available for overall evaluation metrics.")
    # Note: anomaly and misclassified plotting is not implemented in this version.
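
    # Interpretation note (general silhouette semantics, not dataset-specific):
    # scores lie in [-1, 1]; values near 1 indicate compact, well-separated
    # clusters, values near 0 indicate overlapping clusters, and negative values
    # suggest windows assigned to a closer competing cluster.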
