import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
# Removed LabelEncoder as it's not used in this version
from sklearn.metrics import classification_report, confusion_matrix, silhouette_score # Added silhouette_score back
import argparse
import os
import seaborn as sns

# Command line arguments setup (Removed plot_anomalies, plot_misclassified flags as they are not implemented)
parser = argparse.ArgumentParser(description='Anomaly detection using K-Means with change point detection and delayed evaluation.')
parser.add_argument('--timesteps', type=int, default=20, help='Number of timesteps for sequences.')
parser.add_argument('--n_clusters', type=int, default=5, help='Number of clusters for K-Means.')
parser.add_argument('--n_init', type=int, default=10, help='Number of initializations for K-Means.') # Added n_init back
parser.add_argument('--transition', action='store_true', help='Use transition data for testing.')
parser.add_argument('--plot_raw', action='store_true', help='Plot raw data.')
parser.add_argument('--plot_clustered', action='store_true', help='Plot clustered data.')
# parser.add_argument('--plot_anomalies', action='store_true', help='Plot detected anomalies.') # Removed
# parser.add_argument('--plot_misclassified', action='store_true', help='Plot misclassified instances.') # Removed
parser.add_argument('--delay', type=int, default=10, help='Number of timesteps to delay evaluation after a change point.')
parser.add_argument('--show_change_points', action='store_true', help='Show change points on clustered plots.')
options = parser.parse_args()

# Parameters
n_clusters = options.n_clusters
timesteps = options.timesteps
n_init = options.n_init # Used n_init from options (FIXED)
delay_steps = options.delay
show_change_points = options.show_change_points

# Data loading (same as previous code)
NumberOfFailures = 4
datafiles = [[], []]
for i in range(NumberOfFailures + 1):
  datafiles[0].append([])
  datafiles[1].append([])

datafiles[0][0] = ['2024-08-07_5_', '2024-08-08_5_', '2025-01-25_5_', '2025-01-26_5_']
datafiles[0][1] = ['2024-12-11_5_', '2024-12-12_5_', '2024-12-13_5_']
datafiles[0][2] = ['2024-12-18_5_', '2024-12-21_5_', '2024-12-22_5_', '2024-12-23_5_', '2024-12-24_5_']
datafiles[0][3] = ['2024-12-28_5_', '2024-12-29_5_', '2024-12-30_5_']
datafiles[0][4] = ['2025-02-13_5_', '2025-02-14_5_']

if options.transition:
  datafiles[1][0] = ['2025-01-27_5_', '2025-01-28_5_']
  datafiles[1][1] = ['2024-12-14_5_', '2024-12-15_5_', '2024-12-16_5_']
  datafiles[1][2] = ['2024-12-17_5_', '2024-12-19_5_', '2024-12-25_5_', '2024-12-26_5_']
  datafiles[1][3] = ['2024-12-27_5_', '2024-12-31_5_', '2025-01-01_5_']
  datafiles[1][4] = ['2025-02-12_5_', '2025-02-15_5_', '2025-02-16_5_']
else:
  datafiles[1][0] = ['2025-01-27_5_', '2025-01-28_5_']
  datafiles[1][1] = ['2024-12-14_5_', '2024-12-15_5_']
  datafiles[1][2] = ['2024-12-19_5_', '2024-12-25_5_', '2024-12-26_5_']
  datafiles[1][3] = ['2024-12-31_5_', '2025-01-01_5_']
  datafiles[1][4] = ['2025-02-15_5_', '2025-02-16_5_']

features = ['r1 s1', 'r1 s4', 'r1 s5']
# Using standard LaTeX formatting without span tags
featureNames = {'r1 s1': r'$T_{evap}$', 'r1 s4': r'$T_{cond}$', 'r1 s5': r'$T_{air}$'}
unitNames = {'r1 s1': r'($^o$C)', 'r1 s4': r'($^o$C)', 'r1 s5': r'($^o$C)'}
NumFeatures = len(features)

# Load and preprocess data (same as previous code)
dataTrain = []
for class_files in datafiles[0]:
  script_dir = os.path.dirname(os.path.abspath(__file__))
  data_dir = os.path.join(script_dir, 'data')
  class_dfs = []
  for base_filename in class_files:
    filepath = os.path.join(data_dir, f'{base_filename}.csv')
    try:
      df = pd.read_csv(filepath)
      df['timestamp'] = pd.to_datetime(df['datetime'], format='%m/%d/%Y %H:%M', errors='coerce')
      df['timestamp'] = df['timestamp'].fillna(pd.to_datetime(df['datetime'], format='%d-%m-%Y %H:%M:%S', errors='coerce'))
      for col in features:
        df[col] = pd.to_numeric(df[col], errors='coerce')
      df = df.set_index('timestamp').resample('5Min')[features].mean()
      df = df[features].interpolate()
      class_dfs.append(df)
    except FileNotFoundError:
      print(f"Warning: File {filepath} not found and skipped.")
  if class_dfs:
    dataTrain.append(pd.concat(class_dfs))
combined_train_data = pd.concat(dataTrain)

dataTest = []
for class_files in datafiles[1]:
  script_dir = os.path.dirname(os.path.abspath(__file__))
  data_dir = os.path.join(script_dir, 'data')
  class_dfs = []
  for base_filename in class_files:
    filepath = os.path.join(data_dir, f'{base_filename}.csv')
    try:
      df = pd.read_csv(filepath)
      df['timestamp'] = pd.to_datetime(df['datetime'], format='%m/%d/%Y %H:%M', errors='coerce')
      df['timestamp'] = df['timestamp'].fillna(pd.to_datetime(df['datetime'], format='%d-%m-%Y %H:%M:%S', errors='coerce'))
      for col in features:
        df[col] = pd.to_numeric(df[col], errors='coerce')
      df = df.set_index('timestamp').resample('5Min')[features].mean()
      df = df[features].interpolate()
      class_dfs.append(df)
    except FileNotFoundError:
      print(f"Warning: File {filepath} not found and skipped.")
  if class_dfs:
    dataTest.append(pd.concat(class_dfs))

# Normalize data
scaler = StandardScaler() # Using StandardScaler
scaled_train_data = scaler.fit_transform(combined_train_data[features])
scaled_test_data_list = []
for df in dataTest:
  scaled_test_data_list.append(scaler.transform(df[features]))

scaled_train_df = pd.DataFrame(scaled_train_data, columns=features, index=combined_train_data.index)
scaled_test_df_list = [pd.DataFrame(data, columns=features, index=df.index) for data, df in zip(scaled_test_data_list, dataTest)]

# Create time sequences (NO Rate of Change)
def create_sequences(data, timesteps):
  sequences = []
  for i in range(len(data) - timesteps + 1):
    sequences.append(data[i:i + timesteps])
  return np.array(sequences)

X_train_sequences = create_sequences(scaled_train_df.values, timesteps)
# X_test_sequences_list calculated later in main execution

# Train K-Means model on all training data
n_samples_train, n_timesteps_train, n_features_train = X_train_sequences.shape
X_train_reshaped = X_train_sequences.reshape(n_samples_train, n_timesteps_train * n_features_train)
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=n_init) # Using n_init from options (FIXED)
kmeans.fit(X_train_reshaped)

# Function to detect change points (Function following sharp jumps or drops)
# This function works on a single 2D data array (samples, features)
def detect_change_points(data, threshold=0.8):
  change_points = []
  # Iterate through the data points starting from the second one
  for i in range(1, len(data)):
    # Calculate the absolute difference between the current point and the previous point
    difference = np.abs(data[i] - data[i-1])
    # If the difference for ANY feature is greater than the threshold, mark this point as a change point
    if np.any(difference > threshold):
      change_points.append(i)
  return np.array(change_points)

# Function to plot clustered data (adapted to accept sequence indices and show change points)
def plot_clustered_data(df, predicted_clusters, time_index, n_clusters, features, featureNames, unitNames, show_cp=False, change_point_indices=None):
  num_features = len(features)
  fig, axes = plt.subplots(num_features, 1, figsize=(15, 5 * num_features), sharex=True)
  if num_features == 1: axes = [axes] # Ensure axes is always an array
  colors = plt.cm.viridis(np.linspace(0, 1, n_clusters))

  for i, feature in enumerate(features):
    # Plot data points colored by their assigned cluster
    for cluster_id in range(n_clusters):
      cluster_indices_kmeans = np.where(predicted_clusters == cluster_id)[0]
      if len(cluster_indices_kmeans) > 0:
        # Use time_index for x-axis and original df for y-axis values
        axes[i].scatter(time_index[cluster_indices_kmeans], df[feature].loc[time_index[cluster_indices_kmeans]],
               color=colors[cluster_id], label=f'Cluster {cluster_id}', s=10, alpha=0.6) # Added alpha for better visualization of overlaps
    axes[i].set_ylabel(f'{featureNames[feature]} {unitNames[feature]}')
    axes[i].set_title(featureNames[feature])
    axes[i].grid(True, linestyle='--', alpha=0.6) # Added grid for readability

    # Plot change points if enabled
    if show_cp and change_point_indices is not None:
      # Ensure change_point_indices contains datetime objects matching time_index
      for cp_time in change_point_indices:
        axes[i].axvline(x=cp_time, color='red', linestyle='--', linewidth=1.5, label='Change Point' if i == 0 else '', alpha=0.8) # Only one label for change point

  # Add legend to the last subplot, including change point if plotted
  handles, labels = [], []
  for ax in axes:
    for handle, label in zip(*ax.get_legend_handles_labels()):
      if label not in labels:
        handles.append(handle)
        labels.append(label)
  if handles:
    axes[-1].legend(handles, labels, loc='upper right')

  plt.tight_layout()
  plt.show()

# Combined Evaluation Function (Full and Delayed)
def evaluate_and_report(kmeans_model, scaled_test_data_list, original_test_data_list, true_labels_list, timesteps, delay_steps, features, options):
  all_y_true_full = [] # True labels for all test sequences
  all_predicted_cluster_labels_full = [] # Predicted clusters for all test sequences
  all_original_test_sequences_full = [] # Original sequences for plotting anomalies/misclassified later if needed
  all_change_points_detected_list = [] # Detected change points for each test file

  # --- 1. Collect data and predict clusters for ALL test sequences ---
  for k, (scaled_df, original_df, y_true_categorical) in enumerate(zip(scaled_test_data_list, original_test_data_list, true_labels_list)):
    original_indices = original_df.index
    # time_index for plotting corresponds to the end of each sequence
    time_index = original_indices[timesteps - 1:]

    sequences = create_sequences(scaled_df.values, timesteps)
    if sequences.size == 0:
      print(f"Warning: No sequences generated for test file {k}. Skipping.")
      all_change_points_detected_list.append([]) # Append empty list for consistency
      continue # Skip to next file

    n_sequences = sequences.shape[0]
    reshaped_sequences = sequences.reshape(n_sequences, -1)
    predicted_clusters = kmeans_model.predict(reshaped_sequences)

    # Collect true labels and predicted clusters for FULL evaluation
    all_y_true_full.extend(y_true_categorical[timesteps - 1:])
    all_predicted_cluster_labels_full.extend(predicted_clusters)

    # Collect original sequences (aligned with sequence ends) for potential plotting
    for i in range(n_sequences):
      start_index = original_df.index.get_loc(time_index[i]) - (timesteps - 1)
      end_index = start_index + timesteps
      all_original_test_sequences_full.append(original_df[features].iloc[start_index:end_index].values)


    # Detect change points for this test file (on scaled data)
    change_points = detect_change_points(scaled_df.values, threshold=0.8) # Adjust threshold as needed
    # Map change point indices to the time_index of sequences
    # A change at index `i` in the original data affects the sequence ending at `i`.
    # So, a change point at original index `cp_original` corresponds to the sequence ending at `cp_original`.
    # The index in the `sequences` array corresponding to original index `cp_original` is `cp_original - (timesteps - 1)`.
    # We need to make sure this sequence index is valid (>= 0 and < n_sequences).
    change_point_sequence_indices = change_points - (timesteps - 1)
    # Filter for valid sequence indices
    valid_change_point_sequence_indices = change_point_sequence_indices[(change_point_sequence_indices >= 0) & (change_point_sequence_indices < n_sequences)]

    # Store the time index of the valid change point sequences
    if valid_change_point_sequence_indices.size > 0:
      cp_time_indices = time_index[valid_change_point_sequence_indices].tolist()
      all_change_points_detected_list.append(cp_time_indices)
    else:
      all_change_points_detected_list.append([])

    # Plot clustered data for the current test file if requested and transition=False (to avoid duplicate plots handled later)
    if options.plot_clustered and not options.transition:
      print(f"\nClustered Data for Test File {k}:")
      plot_clustered_data(original_df, predicted_clusters, time_index, kmeans_model.n_clusters, features, featureNames, unitNames, show_cp=show_change_points, change_point_indices=cp_time_indices if show_change_points else None)


  all_y_true_full = np.array(all_y_true_full)
  all_predicted_cluster_labels_full = np.array(all_predicted_cluster_labels_full)
  all_original_test_sequences_full = np.array(all_original_test_sequences_full)


  # --- 2. Perform FULL Evaluation (on all test sequences) ---
  print("\n--- Full Evaluation Results (All Test Sequences) ---")

  # Analyze clusters and assign a dominant true label to each cluster based on ALL test sequences
  cluster_dominant_label_full = {}
  for cluster_id in range(kmeans_model.n_clusters):
    indices_in_cluster = np.where(all_predicted_cluster_labels_full == cluster_id)[0]
    if len(indices_in_cluster) > 0:
      labels_in_cluster = all_y_true_full[indices_in_cluster]
      if len(labels_in_cluster) > 0:
        # Use np.argmax to find the index of the max count (dominant label)
        dominant_label = np.argmax(np.bincount(labels_in_cluster))
        cluster_dominant_label_full[cluster_id] = dominant_label
      else:
        cluster_dominant_label_full[cluster_id] = -1 # No data in this cluster with known labels
    else:
      cluster_dominant_label_full[cluster_id] = -1 # Empty cluster

  # Create predicted labels for full evaluation based on the dominant label of the assigned cluster
  predicted_labels_numeric_full = np.array([cluster_dominant_label_full.get(cluster_id, -1) for cluster_id in all_predicted_cluster_labels_full])

  # Evaluate (using numeric labels for the full set)
  valid_indices_full = predicted_labels_numeric_full != -1
  if np.sum(valid_indices_full) > 0 and len(np.unique(all_y_true_full[valid_indices_full])) > 1:
    print("Classification Report (Full Test Set):")
    print(classification_report(all_y_true_full[valid_indices_full], predicted_labels_numeric_full[valid_indices_full]))
    cm_full = confusion_matrix(all_y_true_full[valid_indices_full], predicted_labels_numeric_full[valid_indices_full])
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm_full, annot=True, fmt='d', cmap='Blues')
    plt.xlabel('Predicted Cluster (Dominant True Label)')
    plt.ylabel('True Label')
    plt.title('Confusion Matrix (Full Test Set)')
    plt.show()
  else:
    print("Could not perform full evaluation (not enough data or classes after mapping).")


  # --- 3. Perform DELAYED Evaluation (on subset after delay) ---
  print("\n--- Delayed Evaluation Results (Subset after Delay) ---")

  all_y_true_delayed = []
  all_predicted_cluster_labels_delayed = []

  # Recalculate evaluation_allowed based on combined sequence indices and change points across all test data
  # This requires mapping the original change point indices back to the combined sequence indices.
  # This is complex. Let's simplify: Apply delay logic PER FILE, then combine. This is what the original evaluate_with_delay did.
  # Let's revert the data collection for delayed eval back to per-file processing for simplicity and match original intent.

  # Re-process test data per file for delayed evaluation
  for k, (scaled_df, original_df, y_true_categorical) in enumerate(zip(scaled_test_data_list, original_test_data_list, true_labels_list)):
    original_indices = original_df.index
    time_index = original_indices[timesteps - 1:]
    sequences = create_sequences(scaled_df.values, timesteps)
    if sequences.size == 0: continue # Skip empty files

    n_sequences = sequences.shape[0]
    reshaped_sequences = sequences.reshape(n_sequences, -1)
    predicted_clusters = kmeans_model.predict(reshaped_sequences)

    # Detect change points for THIS file (on scaled data)
    change_points = detect_change_points(scaled_df.values, threshold=0.8) # Adjust threshold as needed

    # Apply delay logic PER FILE
    evaluation_allowed_file = np.ones(n_sequences, dtype=bool)
    # Map original data change point indices to sequence indices for delay logic
    change_point_sequence_indices_file = change_points - (timesteps - 1)
    # Filter for valid sequence indices for delay
    valid_change_point_sequence_indices_file = change_point_sequence_indices_file[(change_point_sequence_indices_file >= 0) & (change_point_sequence_indices_file < n_sequences)]


    for cp_seq_index in valid_change_point_sequence_indices_file:
      start_delay = max(0, cp_seq_index)
      end_delay = min(n_sequences, cp_seq_index + delay_steps)
      evaluation_allowed_file[start_delay:end_delay] = False


    # Collect data for DELAYED evaluation (only where evaluation_allowed_file is True)
    # Use the dominant label mapping calculated on the FULL test set for consistency
    predicted_labels_numeric_file = np.array([cluster_dominant_label_full.get(cluster, -1) for cluster in predicted_clusters]) # Use full mapping
    true_labels_file = y_true_categorical[timesteps - 1:] # True labels aligned with sequences

    all_y_true_delayed.extend(true_labels_file[evaluation_allowed_file])
    all_predicted_cluster_labels_delayed.extend(predicted_labels_numeric_file[evaluation_allowed_file])


  all_y_true_delayed = np.array(all_y_true_delayed)
  all_predicted_cluster_labels_delayed = np.array(all_predicted_cluster_labels_delayed)


  # Perform Delayed Evaluation
  valid_indices_delayed = all_predicted_cluster_labels_delayed != -1
  if np.sum(valid_indices_delayed) > 0 and len(np.unique(all_y_true_delayed[valid_indices_delayed])) > 1:
    print("Classification Report (Subset after Delay):")
    print(classification_report(all_y_true_delayed[valid_indices_delayed], all_predicted_cluster_labels_delayed[valid_indices_delayed]))
    cm_delayed = confusion_matrix(all_y_true_delayed[valid_indices_delayed], all_predicted_cluster_labels_delayed[valid_indices_delayed])
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm_delayed, annot=True, fmt='d', cmap='Blues')
    plt.xlabel('Predicted Label (Delayed)')
    plt.ylabel('True Label')
    plt.title('Confusion Matrix (Subset after Delay)')
    plt.show()
  else:
    print("Could not perform delayed evaluation (not enough data after delay or classes).")

  # --- 4. Report Detected Change Points ---
  print("\nDetected Change Points (Start Time of Sequence after Change):")
  # Print the collected list of change point time indices per file
  for i, cp_list in enumerate(all_change_points_detected_list):
    print(f"File {i}: {cp_list}")

  # Note: Anomaly and Misclassified plotting is not implemented in this version due to complexity with delayed evaluation subset.

# Main execution
if __name__ == "__main__":
  # Load and preprocess training data (already done outside __name__ == "__main__")
  # Load and preprocess test data (already done outside __name__ == "__main__")

  # Create true labels list for test data
  true_labels_list = []
  for i, df in enumerate(dataTest):
    true_labels_list.append(np.full(len(df), i))

  # Plot raw data if requested
  if options.plot_raw:
    print("\nPlotting Raw Data:")
    num_features = len(features)
    fig, axes = plt.subplots(num_features, 1, figsize=(15, 5 * num_features), sharex=True)
    if num_features == 1: axes = [axes]
    for i, feature in enumerate(features):
      for k, df in enumerate(dataTest):
        axes[i].plot(df.index, df[feature], label=f'Class {k}', alpha=0.7) # Added alpha
      axes[i].set_ylabel(f'{featureNames[feature]} {unitNames[feature]}')
      axes[i].set_title(featureNames[feature])
    axes[-1].legend(loc='upper right') # Legend on the last subplot
    plt.tight_layout()
    plt.show()

  # Plot clustered data for training set if requested
  train_sequences = create_sequences(scaled_train_df.values, timesteps)
  train_reshaped_sequences = train_sequences.reshape(train_sequences.shape[0], -1)
  train_predicted_clusters = kmeans.predict(train_reshaped_sequences)
  train_time_index = combined_train_data.index[timesteps - 1:]
  if options.plot_clustered:
    print("\nClustered Data for Training Set:")
    # Change points detection for training data (optional)
    train_change_points = detect_change_points(scaled_train_df.values, threshold=0.8)
    train_change_point_sequence_indices = train_change_points - (timesteps - 1)
    valid_train_change_point_sequence_indices = train_change_point_sequence_indices[(train_change_point_sequence_indices >= 0) & (train_change_point_sequence_indices < train_sequences.shape[0])]
    train_cp_time_indices = train_time_index[valid_train_change_point_sequence_indices].tolist() if valid_train_change_point_sequence_indices.size > 0 else None

    plot_clustered_data(combined_train_data.loc[train_time_index], train_predicted_clusters, train_time_index, n_clusters, features, featureNames, unitNames, show_cp=show_change_points, change_point_indices=train_cp_time_indices)


  # Plot clustered data for TEST set (per file) if requested
  # Note: This iterates through test files and plots each one's clustered data.
  # It also calculates and plots change points for each file.
  if options.plot_clustered:
    print("\nClustered Data for Test Sets (per file):")
    for k, (scaled_df, original_df) in enumerate(zip(scaled_test_df_list, dataTest)):
      original_indices = original_df.index
      time_index = original_indices[timesteps - 1:]
      sequences = create_sequences(scaled_df.values, timesteps)
      if sequences.size == 0: continue
      reshaped_sequences = sequences.reshape(sequences.shape[0], -1)
      predicted_clusters = kmeans.predict(reshaped_sequences)

      # Change points detection for this test file
      change_points = detect_change_points(scaled_df.values, threshold=0.8)
      change_point_sequence_indices = change_points - (timesteps - 1)
      valid_change_point_sequence_indices = change_point_sequence_indices[(change_point_sequence_indices >= 0) & (change_point_sequence_indices < sequences.shape[0])]
      cp_time_indices = time_index[valid_change_point_sequence_indices].tolist() if valid_change_point_sequence_indices.size > 0 else None

      print(f" Plotting Test File {k}")
      plot_clustered_data(original_df, predicted_clusters, time_index, n_clusters, features, featureNames, unitNames, show_cp=show_change_points, change_point_indices=cp_time_indices)


  # Perform Evaluation (Full and Delayed)
  # This function handles all evaluation reporting and confusion matrix plotting
  evaluate_and_report(kmeans, scaled_test_df_list, dataTest, true_labels_list, timesteps, delay_steps, features, options) # Corrected: Pass scaled_test_df_list

  # Calculate and print Inertia and Silhouette Score for combined test data
  # Need to combine all test sequences and cluster labels first, if not already done
  X_test_sequences_combined = np.vstack([create_sequences(df.values, timesteps) for df in scaled_test_df_list if create_sequences(df.values, timesteps).size > 0])
  if X_test_sequences_combined.size > 0:
    X_test_combined_reshaped = X_test_sequences_combined.reshape(X_test_sequences_combined.shape[0], -1)
    all_cluster_labels_test_combined = kmeans.predict(X_test_combined_reshaped)

    print("\n--- K-Means Model Evaluation (Overall Metrics on Combined Test Data) ---")
    print(f"Inertia: {kmeans.inertia_:.4f}") # Inertia is from training fit

    # Silhouette score on the combined test data
    if len(np.unique(all_cluster_labels_test_combined)) > 1 and len(all_cluster_labels_test_combined) > 0:
      silhouette = silhouette_score(X_test_combined_reshaped, all_cluster_labels_test_combined)
      print(f"Silhouette Score: {silhouette:.4f}")
    else:
      print("Silhouette Score: Not applicable for single cluster on combined test data.")
  else:
    print("\n--- K-Means Model Evaluation (Overall Metrics) ---")
    print("No test data sequences available to evaluate overall Inertia and Silhouette Score.")

  # Note: Anomaly and Misclassified plotting is not implemented in this version.