# Csar Fdez, UdL, 2025 # Unsupervised classification. Uses tslearn # https://tslearn.readthedocs.io/en/stable/index.html # Be careful with v0_unsupervised and all versions for supervised. # because dataTrain is not stacke before create_sequences, so, # the sets are not aligned in time import pandas as pd import matplotlib.pyplot as plt import datetime import numpy as np import os.path from optparse import OptionParser import copy import pickle from tslearn.clustering import TimeSeriesKMeans from tslearn.neighbors import KNeighborsTimeSeries from collections import Counter parser = OptionParser() parser.add_option("-t", "--train", dest="train", help="Trains the models (false)", default=False, action="store_true") parser.add_option("-n", "--timesteps", dest="timesteps", help="TIME STEPS ", default=12) (options, args) = parser.parse_args() # data files arrays. Index: # 0. No failure # 1. Blocked evaporator # 2. Full Blocked condenser # 3. Partial Blocked condenser # 4 Fan condenser not working # 5. Open door NumberOfFailures=4 # So far, we have only data for the first 4 types of failures datafiles=[] for i in range(NumberOfFailures+1): datafiles.append([]) # Next set of ddata corresponds to Freezer, SP=-26 datafiles[0]=['2024-08-07_5_','2024-08-08_5_','2025-01-25_5_','2025-01-26_5_','2025-01-27_5_','2025-01-28_5_'] datafiles[1]=['2024-12-11_5_', '2024-12-12_5_','2024-12-13_5_','2024-12-14_5_','2024-12-15_5_'] #datafiles[1]=['2024-12-17_5_','2024-12-16_5_','2024-12-11_5_', '2024-12-12_5_','2024-12-13_5_','2024-12-14_5_','2024-12-15_5_'] # This have transitions datafiles[2]=['2024-12-18_5_','2024-12-19_5_'] datafiles[3]=['2024-12-21_5_','2024-12-22_5_','2024-12-23_5_','2024-12-24_5_','2024-12-25_5_','2024-12-26_5_'] datafiles[4]=['2024-12-28_5_','2024-12-29_5_','2024-12-30_5_','2024-12-31_5_','2025-01-01_5_'] #datafiles[4]=['2024-12-27_5_','2024-12-28_5_','2024-12-29_5_','2024-12-30_5_','2024-12-31_5_','2025-01-01_5_'] # This have transitions #datafiles[4]=[] # Features suggested by Xavier # Care with 'tc s3' because on datafiles[0] is always nulll # Seems to be incoropored in new tests #r1s5 supply air flow temperature #r1s1 inlet evaporator temperature #r1s4 condenser outlet # VAriables r1s4 and pa1 apiii may not exists in cloud controlers features=['r1 s1','r1 s4','r1 s5','pa1 apiii'] features=['r1 s1','r1 s4','r1 s5'] featureNames={} featureNames['r1 s1']='$T_{evap}$' featureNames['r1 s4']='$T_{cond}$' featureNames['r1 s5']='$T_{air}$' featureNames['pa1 apiii']='$P_{elec}$' unitNames={} unitNames['r1 s1']='$(^{o}C)$' unitNames['r1 s4']='$(^{o}C)$' unitNames['r1 s5']='$(^{o}C)$' unitNames['pa1 apiii']='$(W)$' #features=['r1 s1','r1 s2','r1 s3','r1 s4','r1 s5','r1 s6','r1 s7','r1 s8','r1 s9','r1 s10','r2 s1','r2 s2','r2 s3','r2 s4','r2 s5','r2 s6','r2 s7','r2 s8','r2 s9','pa1 apiii','tc s1','tc s2'] #features=['r2 s2', 'tc s1','r1 s10','r1 s6','r2 s8'] NumFeatures=len(features) df_list=[] for i in range(NumberOfFailures+1): df_list.append([]) for i in range(NumberOfFailures+1): dftemp=[] for f in datafiles[i]: print(" ", f) #df1 = pd.read_csv('./data/'+f+'.csv', parse_dates=['datetime'], dayfirst=True, index_col='datetime') df1 = pd.read_csv('./data/'+f+'.csv') dftemp.append(df1) df_list[i]=pd.concat(dftemp) # subsampled to 5' = 30 * 10" # We consider smaples every 5' because in production, we will only have data at this frequency subsamplingrate=30 dataframe=[] for i in range(NumberOfFailures+1): dataframe.append([]) for i in range(NumberOfFailures+1): datalength=df_list[i].shape[0] dataframe[i]=df_list[i].iloc[range(0,datalength,subsamplingrate)][features] dataframe[i].reset_index(inplace=True,drop=True) dataframe[i].dropna(inplace=True) # Train data is first 2/3 of data # Test data is: last 1/3 of data dataTrain=[] dataTest=[] NumberOfSamplesForTest=300 for i in range(NumberOfFailures+1): dataTrain.append(dataframe[i].values[0:int(dataframe[i].shape[0]*2/3),:]) if NumberOfSamplesForTest==0: # Take all dataTest.append(dataframe[i].values[int(dataframe[i].shape[0]*2/3):,:]) else: dataTest.append(dataframe[i].values[int(dataframe[i].shape[0]*2/3):int(dataframe[i].shape[0]*2/3)+NumberOfSamplesForTest,:]) # Calculate means and stdev a=dataTrain[0] for i in range(1,NumberOfFailures+1): a=np.vstack((a,dataTrain[i])) means=a.mean(axis=0) stdevs=a.std(axis=0) def normalize2(train,test): return( (train-means)/stdevs, (test-means)/stdevs ) dataTrainNorm=[] dataTestNorm=[] for i in range(NumberOfFailures+1): dataTrainNorm.append([]) dataTestNorm.append([]) for i in range(NumberOfFailures+1): (dataTrainNorm[i],dataTestNorm[i])=normalize2(dataTrain[i],dataTest[i]) def plotData(): fig, axes = plt.subplots( nrows=NumberOfFailures+1, ncols=2, figsize=(15, 20), dpi=80, facecolor="w", edgecolor="k",sharex=True ) for i in range(NumberOfFailures+1): axes[i][0].plot(np.concatenate((dataTrainNorm[i][:,0],dataTestNorm[i][:,0])),label="Fail "+str(i)+", feature 0") axes[i][1].plot(np.concatenate((dataTrainNorm[i][:,1],dataTestNorm[i][:,1])),label="Fail "+str(i)+", feature 1") #axes[1].legend() #axes[0].set_ylabel(features[0]) #axes[1].set_ylabel(features[1]) plt.show() #plotData() def create_sequences(values, time_steps): output = [] for i in range(len(values) - time_steps ): output.append(values[i : (i + time_steps)]) return np.stack(output) def listToString(l): r='' for i in l: r+=str(i) return(r.replace(' ','')) timesteps=int(options.timesteps) X=dataTrainNorm[0] for i in range(1,NumberOfFailures+1): X=np.vstack((X,dataTrainNorm[i])) xtrain=create_sequences(X,timesteps) km = TimeSeriesKMeans(n_clusters=NumberOfFailures+1, metric="dtw", random_state=0) modelpath="model_v1_unsupervised_"+str(timesteps)+listToString(features)+".pk" if options.train: km.fit(xtrain) km.to_pickle(modelpath) else: km.from_pickle(modelpath) #km.fit_predict(xtrain) colorline=['violet','lightcoral','cyan','lime','grey'] colordot=['darkviolet','red','blue','green','black'] featuresToPlot=features indexesToPlot=[] for i in featuresToPlot: indexesToPlot.append(features.index(i)) def plot(data,ranges): km.fit_predict(data) # Expand data to plot with the timesteps samples of last sample datatoplot=data[:,0,:] datatoplot=np.vstack((datatoplot,data[ranges[len(ranges)-1][1],:,:])) labels=[] # Labels are assigned randomly by classifer for i in range(len(ranges)): b=Counter(km.labels_[ranges[i][0]:ranges[i][1]]) labels.append(b.most_common(1)[0][0]) print("\n\n\n\LABELS: ",labels,"\n\n") NumFeaturesToPlot=len(indexesToPlot) plt.rcParams.update({'font.size': 16}) fig, axes = plt.subplots( nrows=NumFeaturesToPlot, ncols=1, figsize=(15, 10), dpi=80, facecolor="w", edgecolor="k",sharex=True ) for i in range(NumFeaturesToPlot): init=0 end=ranges[0][1] labelsplotted=[] for j in range(len(ranges)): if j==(len(ranges)-1): # Plot the last timesteps classtype=labels.index(labels[j]) if classtype in labelsplotted: axes[i].plot(range(init,end+timesteps),datatoplot[ranges[j][0]:ranges[j][1]+timesteps,indexesToPlot[i]]*stdevs[i]+means[i], color=colorline[classtype],linewidth=1) else: axes[i].plot(range(init,end+timesteps),datatoplot[ranges[j][0]:ranges[j][1]+timesteps,indexesToPlot[i]]*stdevs[i]+means[i], label="Class: "+str(classtype), color=colorline[classtype],linewidth=1) labelsplotted.append(classtype) else: classtype=labels.index(labels[j]) if classtype in labelsplotted: axes[i].plot(range(init,end),datatoplot[ranges[j][0]:ranges[j][1],indexesToPlot[i]]*stdevs[i]+means[i], color=colorline[classtype],linewidth=1) else: axes[i].plot(range(init,end),datatoplot[ranges[j][0]:ranges[j][1],indexesToPlot[i]]*stdevs[i]+means[i], label="Class: "+str(classtype), color=colorline[classtype],linewidth=1) labelsplotted.append(classtype) init=end if j<(len(ranges)-1): end+=(ranges[j+1][1]-ranges[j+1][0]) x=[] y=[] for j in range(len(ranges)): x.append([]) y.append([]) for j in range(len(ranges)): for k in range(ranges[j][0],ranges[j][1]): try: # Idont know why sometimes fails index !!!! x[labels.index(km.labels_[k])].append(k+timesteps) y[labels.index(km.labels_[k])].append(datatoplot[k+timesteps,indexesToPlot[i]]*stdevs[i]+means[i]) except: x[0].append(k+timesteps) y[0].append(datatoplot[k+timesteps,indexesToPlot[i]]*stdevs[i]+means[i]) labelsplotted=[] for j in range(len(ranges)): classtype=labels.index(labels[j]) if classtype in labelsplotted: axes[i].plot(x[j],y[j] ,color=colordot[labels.index(labels[j])],marker='.',linewidth=0) else: axes[i].plot(x[j],y[j] ,color=colordot[labels.index(labels[j])],marker='.',linewidth=0,label="Class type "+str(j) ) labelsplotted.append(classtype) if i==(NumFeatures-1): axes[i].legend(loc='right') s='' s+=featureNames[features[indexesToPlot[i]]] s+=' '+unitNames[features[indexesToPlot[i]]] axes[i].set_ylabel(s) axes[i].grid() axes[NumFeaturesToPlot-1].set_xlabel("Sample number") plt.show() ''' Ranges=[] r=0 for i in range(NumberOfFailures+1): Ranges.append([r,r+dataTrainNorm[i].shape[0]]) r+=dataTrainNorm[i].shape[0] # Drop the last TIME_STEPS for plotting Ranges[NumberOfFailures][1]=Ranges[NumberOfFailures][1]-timesteps-1 plot(xtrain,Ranges) ''' # Try with test data X=dataTestNorm[0] Ranges=[[0,dataTestNorm[0].shape[0]]] r=dataTestNorm[0].shape[0] for i in range(1,NumberOfFailures+1): X=np.vstack((X,dataTestNorm[i])) Ranges.append([r,r+dataTestNorm[i].shape[0]]) r+=dataTestNorm[i].shape[0] X=np.vstack((X,dataTestNorm[0])) # We add a last segment of no fail data Ranges.append([r,r+dataTestNorm[0].shape[0]]) Ranges[len(Ranges)-1][1]=Ranges[len(Ranges)-1][1]-timesteps-1 xtest=create_sequences(X,timesteps) def anomalyMetric(labels,ranges): # Takes ONLY the firsts segments of Ranges # FP, TP: false/true positive # TN, FN: true/false negative # Sensitivity (recall): probab failure detection if data is fail: TP/(TP+FN) # Precision: Rate of positive results: TP/(TP+FP) # F1-score: predictive performance measure: 2*Precision*Sensitity/(Precision+Sensitity) lab=[] # Labels are assigned randomly by classifer TP=[] FN=[] TPFP=[] for i in range(NumberOfFailures+1): TP.append([]) FN.append([]) TPFP.append([]) b=Counter(labels[ranges[i][0]:ranges[i][1]]) lab.append(b.most_common(1)[0][0]) for i in range(NumberOfFailures+1): counttp=0 countfn=0 for j in range(ranges[i][0],ranges[i][1]-timesteps): if lab.index(labels[j])==i: counttp+=1 else: countfn+=1 TP[i]=counttp FN[i]=countfn for i in range(NumberOfFailures+1): count=0 for ii in range(NumberOfFailures+1): for j in range(ranges[ii][0],ranges[ii][1]-timesteps): if lab.index(labels[j])==i: count+=1 TPFP[i]=count segmentLength=[] for i in range(NumberOfFailures+1): segmentLength.append(ranges[i][1]-timesteps-ranges[i][0]) totalSegmentLength=0 for i in range(NumberOfFailures+1): totalSegmentLength+=segmentLength[i] Sensitivity=0 Precision=0 for i in range(NumberOfFailures+1): Sensitivity+=TP[i]/(TP[i]+FN[i])*segmentLength[i]/totalSegmentLength Precision+=TP[i]/(TPFP[i])*segmentLength[i]/totalSegmentLength print(lab) print("TP: ",TP) print("FN: ",FN) print("TPFP: ",TPFP) print("Sensitivity: ",Sensitivity) print("Precision: ",Precision) print("F1-Score: ",2*Precision*Sensitivity/(Sensitivity+Precision)) km.fit_predict(xtest) anomalyMetric(km.labels_,Ranges) plot(xtest,Ranges)