We will use Google Colab as the deep learning environment. Download the data from the link below, then upload it to Colab.
from google.colab import files

uploaded = files.upload()
for fn in uploaded.keys():
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(uploaded[fn])))
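If a direct download is preferred over the upload widget, the NYC taxi series is also distributed with the Numenta Anomaly Benchmark; a minimal sketch, assuming the NAB GitHub path below is still current:

import urllib.request

# assumed NAB location of the dataset; verify the path before use
url = 'https://raw.githubusercontent.com/numenta/NAB/master/data/realKnownCause/nyc_taxi.csv'
urllib.request.urlretrieve(url, 'nyc_taxi.csv')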
1. Create a data frame with pandas.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tqdm
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_log_error, mean_squared_error
import re
import datetime
from keras.models import *
from keras.layers import *
from keras.layers.core import Lambda
from keras import backend as K
### READ DATA AND CREATE FEATURES FOR year, month, day, hour ###
df = pd.read_csv('nyc_taxi.csv')
date = pd.DataFrame(np.vstack(df.timestamp.apply(lambda x: re.sub(r':|-' ,' ', x).split()))[:,:4].astype(int))
df = pd.concat([df, date], axis=1)
df.columns = ['timestamp','value','yr','mt','d','H']
print(df.shape)
df.head()
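To see what the regex split above produces, here is the transformation on a single timestamp (the sample value is illustrative):

# example of the timestamp split used above
ts = '2014-07-01 00:00:00'
print(re.sub(r':|-', ' ', ts).split()[:4])  # ['2014', '07', '01', '00'] -> yr, mt, d, H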
### PLOT SAMPLE OF DATA ###
#df.iloc[4000:4000+7*48,:].plot(y='value',x='timestamp',figsize=(8,6))
df.iloc[8400:8400+7*48,:].plot(y='value',x='timestamp',figsize=(8,6))
plt.xticks(rotation=70)
2. Create the training data for the LSTM.
### WEEKLY AUTOCORR PLOT (10 WEEKS DEPTH) ###
timeLags = np.arange(1,10*48*7)
autoCorr = [df.value.autocorr(lag=dt) for dt in timeLags]
plt.figure(figsize=(19,8))
plt.plot(1.0/(48*7)*timeLags, autoCorr);
plt.xlabel('time lag [weeks]'); plt.ylabel('correlation coeff', fontsize=12);
### CREATE WEEKDAY FEATURE AND COMPUTE THE MEAN FOR EACH WEEKDAY AT EVERY HOUR ###
weekday = df[['yr', 'mt', 'd']].apply(lambda x: datetime.datetime(x['yr'], x['mt'], x['d']).weekday(),axis=1).values
# weekend = weekday.copy()
# weekend[np.logical_and(weekend != 5, weekend != 6)] = 0
# weekend[weekend != 0] = 1
df['weekday'] = weekday
df['weekday_hour'] = df.weekday.astype(str) +' '+ df.H.astype(str)
df['m_weekday'] = df.weekday_hour.replace(df[:5000].groupby('weekday_hour')['value'].mean().to_dict())
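The replace call maps each weekday-hour pair to its mean demand over the first 5000 rows; a quick way to inspect that mapping:

# peek at the (weekday, hour) -> mean demand dictionary built from the first 5000 rows
mapping = df[:5000].groupby('weekday_hour')['value'].mean().to_dict()
print(list(mapping.items())[:3])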
### CREATE GENERATOR FOR LSTM ###
sequence_length = 48

def gen_index(id_df, seq_length, seq_cols):
    data_matrix = id_df[seq_cols]
    num_elements = data_matrix.shape[0]
    for start, stop in zip(range(0, num_elements-seq_length, 1), range(seq_length, num_elements, 1)):
        yield data_matrix[stop-seq_length:stop].values.reshape((-1, len(seq_cols)))
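A quick sanity check on the generator output (each window should be sequence_length rows by one column):

# each yielded window should have shape (48, 1)
first_window = next(gen_index(df, sequence_length, ['value']))
print(first_window.shape)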
### CREATE AND STANDARDIZE DATA FOR LSTM ###
cnt, mean = [], []
for sequence in gen_index(df, sequence_length, ['value']):
    cnt.append(sequence)
for sequence in gen_index(df, sequence_length, ['m_weekday']):
    mean.append(sequence)
cnt, mean = np.log(cnt), np.log(mean)
cnt = cnt - mean
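# cnt now holds log demand minus the log of the weekday-hour mean, one row per 48-step window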
cnt.shape
### CREATE AND STANDARDIZE LABEL FOR LSTM ###
init = df.m_weekday[sequence_length:].apply(np.log).values
label = df.value[sequence_length:].apply(np.log).values - init
label.shape
### DEFINE QUANTILE LOSS ###
def q_loss(q, y, f):
    e = (y - f)
    return K.mean(K.maximum(q*e, (q-1)*e), axis=-1)
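This is the standard pinball (quantile) loss: for q = 0.9 it penalizes under-prediction nine times more heavily than over-prediction. A small NumPy illustration (not part of the original code):

# NumPy version of q_loss, for illustration only
def q_loss_np(q, y, f):
    e = y - f
    return np.mean(np.maximum(q*e, (q-1)*e))

print(q_loss_np(0.9, 1.0, 0.0))  # 0.9: under-prediction is costly
print(q_loss_np(0.9, 0.0, 1.0))  # 0.1: over-prediction is cheap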
### TRAIN TEST SPLIT ###
X_train, X_test = cnt[:5000], cnt[5000:]
y_train, y_test = label[:5000], label[5000:]
3. Build the LSTM model. ### CREATE MODEL ###
losses = [lambda y,f: q_loss(0.1,y,f), lambda y,f: q_loss(0.5,y,f), lambda y,f: q_loss(0.9,y,f)]
inputs = Input(shape=(X_train.shape[1], X_train.shape[2]))
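# training=True keeps dropout active at prediction time as well (Monte Carlo dropout),
# which is what makes the bootstrapped quantile predictions below stochastic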
lstm = Bidirectional(LSTM(64, return_sequences=True, dropout=0.3))(inputs, training = True)
lstm = Bidirectional(LSTM(16, return_sequences=False, dropout=0.3))(lstm, training = True)
dense = Dense(50)(lstm)
out10 = Dense(1)(dense)
out50 = Dense(1)(dense)
out90 = Dense(1)(dense)
model = Model(inputs, [out10,out50,out90])
model.compile(loss=losses, optimizer='adam', loss_weights = [0.3,0.3,0.3])
4. Train the model.
history = model.fit(X_train, [y_train,y_train,y_train], epochs=50, batch_size=128, verbose=2, shuffle=True)
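It is worth confirming that training converged; a minimal sketch plotting the combined loss recorded by Keras:

# inspect the training loss curve recorded in the History object
plt.figure(figsize=(8,4))
plt.plot(history.history['loss'])
plt.xlabel('epoch'); plt.ylabel('weighted quantile loss')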
5. Run predictions with the trained model. ### QUANTILE BOOTSTRAPPING ###
pred_10, pred_50, pred_90 = [], [], []
NN = K.function([model.layers[0].input, K.learning_phase()],
                [model.layers[-3].output, model.layers[-2].output, model.layers[-1].output])

for i in tqdm.tqdm(range(0, 100)):
    predd = NN([X_test, 0.5])
    pred_10.append(predd[0])
    pred_50.append(predd[1])
    pred_90.append(predd[2])
pred_10 = np.asarray(pred_10)[:,:,0]
pred_50 = np.asarray(pred_50)[:,:,0]
pred_90 = np.asarray(pred_90)[:,:,0]
### REVERSE TRANSFORM PREDICTIONS ###
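# labels were log(value) - log(m_weekday), so invert by adding init back and exponentiating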
pred_90_m = np.exp(np.quantile(pred_90,0.9,axis=0) + init[5000:])
pred_50_m = np.exp(pred_50.mean(axis=0) + init[5000:])
pred_10_m = np.exp(np.quantile(pred_10,0.1,axis=0) + init[5000:])
6. Check for anomalous data. ### EVALUATION METRIC ###
mean_squared_log_error(np.exp(y_test + init[5000:]), pred_50_m)
### PLOT QUANTILE PREDICTIONS ###
plt.figure(figsize=(16,8))
plt.plot(pred_90_m, color='cyan')
plt.plot(pred_50_m, color='blue')
plt.plot(pred_10_m, color='green')
### CROSSOVER CHECK ###
plt.scatter(np.where(np.logical_or(pred_50_m > pred_90_m, pred_50_m < pred_10_m))[0],
            pred_50_m[np.logical_or(pred_50_m > pred_90_m, pred_50_m < pred_10_m)], c='red', s=50)
### PLOT UNCERTAINTY INTERVAL LENGTH WITH REAL DATA ###
plt.figure(figsize=(16,8))
plt.plot(np.exp(y_test + init[5000:]), color='red', alpha=0.4)
plt.scatter(range(len(pred_10_m)), pred_90_m - pred_10_m)
Running the code above yields the following result. In the plot, the blue points mark where anomalies are relatively concentrated.
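To turn the visual check into a concrete rule, one simple criterion (an assumption of this writeup, not spelled out in the original code) flags observations that fall outside the predicted 10-90% band:

# flag points outside the predicted 10-90% band as anomaly candidates (assumed rule)
real = np.exp(y_test + init[5000:])
anomalies = np.where(np.logical_or(real > pred_90_m, real < pred_10_m))[0]
print(len(anomalies), 'anomaly candidates out of', len(real))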