import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from tensorflow.keras import Model
from tensorflow.keras.layers import Input,LSTM,Dropout,Dense,Activation,Conv1D,AveragePooling1D,Bidirectional,Add
from tensorflow.keras.optimizers import RMSprop,Adam
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import ModelCheckpoint,EarlyStopping
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error
# Print DataFrames in full: no row or column truncation.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# Directory that load_weather_data() looks in for weather.csv.
WEATHER_PATH = './datasets'
def load_weather_data(weather_path):
    """Read ``weather.csv`` located in *weather_path* into a DataFrame."""
    return pd.read_csv(os.path.join(weather_path, "weather.csv"))
# Load the raw data and print its columns, dtypes/null counts and summary statistics.
weather = load_weather_data(WEATHER_PATH)
print(weather.columns)
print(weather.info())
print(weather.describe())
def Clean_weather(weather):
    """Add calendar columns derived from ``date`` and blank out sentinel values.

    The frame is modified in place and is also returned.

    Args:
        weather: DataFrame with a ``date`` column whose string form can be
            parsed by ``pd.to_datetime``.

    Returns:
        The same DataFrame with ``year``, ``month``, ``week`` (ISO week),
        ``quarter`` (1-4) and ``day`` (day of year) columns added, and every
        cell equal to 999999 or 999990 replaced by NaN.
    """
    date = pd.to_datetime(weather['date'].astype(str))
    weather['year'] = date.dt.year
    weather['month'] = date.dt.month
    # ``Series.dt.weekofyear`` no longer exists in pandas 2.x; isocalendar()
    # yields the same ISO week number. Cast away the nullable UInt32 dtype so
    # the column behaves like the other integer columns.
    weather['week'] = date.dt.isocalendar().week.astype('int64')
    # Take the quarter straight from the datetime accessor. Building it from
    # the period string and slicing with [:-2] trims rows of the Series (not
    # characters), which leaves the final two rows without a quarter.
    weather['quarter'] = date.dt.quarter
    weather['day'] = date.dt.dayofyear
    # 999999 / 999990 are treated as "missing" markers; ``np.nan`` is the
    # spelling that works on both NumPy 1.x and 2.x.
    for sentinel in (999999, 999990):
        weather[weather == sentinel] = np.nan
    return weather
# Add the calendar columns / NaN out sentinels, then persist the cleaned copy.
weather = Clean_weather(weather)
weather.to_csv('./datasets/clean_weather.csv',index=False)
# The plot titles below are Chinese; SimHei is selected as the font family for them.
plt.rcParams['font.family'] = ['SimHei']
# Draw the minus sign as an ASCII hyphen instead of the Unicode minus glyph
# (presumably because the chosen font lacks that glyph - confirm).
plt.rcParams['axes.unicode_minus'] = False
def plot_of_quarter_city(X, attribe):
    """Return the mean of column *attribe* per quarter (rows) and city (columns).

    Despite the name nothing is drawn here; the caller plots the returned
    table.
    """
    grouping_keys = [X['quarter'], X['city']]
    return X[attribe].groupby(grouping_keys).mean().unstack()
# Summary statistics of the cleaned frame.
print(weather.describe())

# One bar chart per variable: mean value per quarter, one bar per city.
df_vb_group1 = plot_of_quarter_city(weather,'visibility')
df_vb_group1.plot(kind='bar')
plt.title("可见度变化情况")
plt.show()

df_rain20_group = plot_of_quarter_city(weather,'rain20')
# The rain20 table has to be drawn and shown like the others; without these
# two calls its title is attached to an empty figure.
df_rain20_group.plot(kind='bar')
plt.title("20时降雨量")
plt.show()

df_rain08_group = plot_of_quarter_city(weather,'rain08')
df_rain08_group.plot(kind='bar')
plt.title("08时降雨量")
plt.show()

df_tempature_group = plot_of_quarter_city(weather,'temperature')
df_tempature_group.plot(kind='bar')
plt.title("温度变化情况")
plt.show()

df_humidity_group = plot_of_quarter_city(weather,'humidity')
df_humidity_group.plot(kind='bar')
plt.title("湿度变化情况")
plt.show()

df_pressure_group = plot_of_quarter_city(weather,'pressure')
df_pressure_group.plot(kind='bar')
plt.title("气压变化情况")
plt.show()

df_windspeed_group = plot_of_quarter_city(weather,'wind_speed')
df_windspeed_group.plot(kind='bar')
plt.title("风速变化情况")
plt.show()

df_cloud_group = plot_of_quarter_city(weather,'cloud')
df_cloud_group.plot(kind='bar')
plt.title("云量变化情况")
plt.show()

# Pairwise scatter plots of the main continuous variables.
from pandas.plotting import scatter_matrix
attributes = ['wind_speed','temperature', 'humidity', 'rain20', 'cloud', 'visibility']
scatter_matrix(weather[attributes],figsize=(12,8))
plt.show()
def windows_select(data_perpared, feature_size, squence_length, temper_dim):
    """Cut a 2-D array into flattened sliding windows with a scalar target each.

    Window ``i`` is rows ``i .. i+squence_length-1`` flattened to 1-D; its
    label is column *temper_dim* of the row right after the window.
    *feature_size* is accepted but not used here.

    Returns:
        ``(features, labels)`` as NumPy arrays, one entry per window.
    """
    window_starts = range(data_perpared.shape[0] - squence_length)
    features = np.array([
        np.array(data_perpared[start:start + squence_length, :]).flatten()
        for start in window_starts
    ])
    labels = np.array([
        data_perpared[start + squence_length, temper_dim]
        for start in window_starts
    ])
    return features, labels
def shuffle_data(features, labels):
    """Apply one random row permutation to *features* (2-D) and *labels* (1-D)."""
    order = np.random.permutation(features.shape[0])
    return features[order, :], labels[order]
def get_data(data_perpared, is_joint=False, squence_length=20, feature_size=7,
             temper_dim=2, train_ratio=0.8, validate_ratio=0.1):
    """Build shuffled train/validation/test windows for a single target column.

    Args:
        data_perpared: a 2-D array when *is_joint* is False, otherwise a
            sequence of 2-D arrays. Each array is windowed on its own, so no
            window spans two arrays. The argument is not modified.
        is_joint: whether *data_perpared* is a sequence of arrays.
        squence_length: number of consecutive rows per input window.
        feature_size: number of leading columns kept as features.
        temper_dim: index (within the kept columns) of the target column.
        train_ratio: fraction of windows used for training.
        validate_ratio: fraction of windows used for validation; the
            remainder becomes the test set.

    Returns:
        ``(x_train, y_train, x_val, y_val, x_test, y_test)`` where each ``x``
        has shape ``(n, squence_length, feature_size)`` and each ``y`` has
        shape ``(n, 1)``.

    Raises:
        ValueError: if no array is long enough to produce a single window.
    """
    series_list = list(data_perpared) if is_joint else [data_perpared]

    feature_parts = []
    label_parts = []
    for series in series_list:
        # Slice into a local; the caller's list entries are left untouched.
        window_x, window_y = windows_select(
            series[:, :feature_size], feature_size, squence_length, temper_dim)
        if window_x.shape[0] == 0:
            # Too few rows for even one window + target: nothing to add.
            continue
        feature_parts.append(window_x)
        label_parts.append(window_y)
    if not feature_parts:
        raise ValueError(
            "no input series has more rows than squence_length; "
            "cannot build any window")

    # Concatenate once instead of re-stacking the growing array per series.
    features = np.concatenate(feature_parts, axis=0)
    labels = np.concatenate(label_parts, axis=0)
    # NOTE(review): windows overlap and are shuffled before the split, so
    # near-identical windows can land on both sides of the split.
    features, labels = shuffle_data(features, labels)
    print("features.shape",features.shape)
    print("labels.shape",labels.shape)

    total = features.shape[0]
    train_row = round(total * train_ratio)
    validate_num = round(total * validate_ratio)
    val_end = train_row + validate_num
    test_num = total - val_end

    x_train = np.reshape(features[:train_row, :], (train_row, squence_length, feature_size))
    # Explicit (n, 1) rather than (n, -1): the latter fails when a split is empty.
    y_train = np.reshape(labels[:train_row], (train_row, 1))
    x_val = np.reshape(features[train_row:val_end, :], (validate_num, squence_length, feature_size))
    y_val = np.reshape(labels[train_row:val_end], (validate_num, 1))
    x_test = np.reshape(features[val_end:, :], (test_num, squence_length, feature_size))
    y_test = np.reshape(labels[val_end:], (test_num, 1))
    print("train_samples:",x_train.shape,y_train.shape)
    print("validate_samples:",x_val.shape,y_val.shape)
    print("test_samples:",x_test.shape,y_test.shape)
    return (x_train, y_train, x_val, y_val, x_test, y_test)
def windows_select_multi(data_perpared, feature_size, squence_length, temper_dim):
    """Cut a 2-D array into flattened sliding windows with a vector target each.

    Window ``i`` is rows ``i .. i+squence_length-1`` flattened to 1-D; its
    label is the first *temper_dim* columns of the row right after the
    window. *feature_size* is accepted but not used here.

    Returns:
        ``(features, labels)`` as NumPy arrays, one entry per window.
    """
    window_starts = range(data_perpared.shape[0] - squence_length)
    features = np.array([
        np.array(data_perpared[start:start + squence_length, :]).flatten()
        for start in window_starts
    ])
    labels = np.array([
        data_perpared[start + squence_length, :temper_dim]
        for start in window_starts
    ])
    return features, labels
def shuffle_data_multi(features, labels):
    """Apply one random row permutation to *features* and *labels* (both 2-D)."""
    order = np.random.permutation(features.shape[0])
    return features[order, :], labels[order, :]
def get_data_multi(data_perpared, is_joint=False, squence_length=20, feature_size=26,
                   temper_dim=11, train_ratio=0.8, validate_ratio=0.1):
    """Build shuffled train/validation/test windows with multi-column targets.

    Args:
        data_perpared: a 2-D array when *is_joint* is False, otherwise a
            sequence of 2-D arrays. Each array is windowed on its own, so no
            window spans two arrays. The argument is not modified.
        is_joint: whether *data_perpared* is a sequence of arrays.
        squence_length: number of consecutive rows per input window.
        feature_size: number of leading columns kept as features.
        temper_dim: number of leading columns used as the target vector.
        train_ratio: fraction of windows used for training.
        validate_ratio: fraction of windows used for validation; the
            remainder becomes the test set.

    Returns:
        ``(x_train, y_train, x_val, y_val, x_test, y_test)`` where each ``x``
        has shape ``(n, squence_length, feature_size)`` and each ``y`` has
        shape ``(n, temper_dim)``.

    Raises:
        ValueError: if no array is long enough to produce a single window.
    """
    series_list = list(data_perpared) if is_joint else [data_perpared]

    feature_parts = []
    label_parts = []
    for series in series_list:
        # Slice into a local; the caller's list entries are left untouched.
        window_x, window_y = windows_select_multi(
            series[:, :feature_size], feature_size, squence_length, temper_dim)
        if window_x.shape[0] == 0:
            # Too few rows for even one window + target: nothing to add.
            continue
        feature_parts.append(window_x)
        label_parts.append(window_y)
    if not feature_parts:
        raise ValueError(
            "no input series has more rows than squence_length; "
            "cannot build any window")

    # Concatenate once instead of re-stacking the growing array per series.
    features = np.concatenate(feature_parts, axis=0)
    labels = np.concatenate(label_parts, axis=0)
    # NOTE(review): windows overlap and are shuffled before the split, so
    # near-identical windows can land on both sides of the split.
    features, labels = shuffle_data_multi(features, labels)
    print("features.shape",features.shape)
    print("labels.shape",labels.shape)

    total = features.shape[0]
    train_row = round(total * train_ratio)
    validate_num = round(total * validate_ratio)
    val_end = train_row + validate_num
    test_num = total - val_end

    x_train = np.reshape(features[:train_row, :], (train_row, squence_length, feature_size))
    y_train = np.reshape(labels[:train_row, :], (train_row, temper_dim))
    x_val = np.reshape(features[train_row:val_end, :], (validate_num, squence_length, feature_size))
    y_val = np.reshape(labels[train_row:val_end, :], (validate_num, temper_dim))
    x_test = np.reshape(features[val_end:, :], (test_num, squence_length, feature_size))
    y_test = np.reshape(labels[val_end:, :], (test_num, temper_dim))
    print("train_samples:",x_train.shape,y_train.shape)
    print("validate_samples:",x_val.shape,y_val.shape)
    print("test_samples:",x_test.shape,y_test.shape)
    return (x_train, y_train, x_val, y_val, x_test, y_test)
def BLSTM_model(input_shape):
    """Assemble the single-output Conv1D + dual Bidirectional-LSTM network.

    Layout: Conv1D(64, kernel 2) -> AveragePooling1D(2) -> two parallel
    Bidirectional(LSTM(80)) branches (the wrapped LSTM differs only in its
    ``go_backwards`` flag), each followed by Dropout(0.3) -> element-wise Add
    -> Dense 100/50/10 (ReLU, each followed by dropout) -> Dense(1, sigmoid).

    Args:
        input_shape: shape of one sample, passed straight to ``Input``.

    Returns:
        An uncompiled ``Model`` named ``'BLSTM'``.
    """
    inputs = Input(input_shape, name='input')
    features = Conv1D(64, 2, padding='same', name='conv1')(inputs)
    features = AveragePooling1D(2, name='apool1')(features)

    # (go_backwards, recurrent layer name, dropout layer name) per branch.
    branch_specs = (
        (False, 'forward_lstm', 'drop1'),
        (True, 'backward_lstm', 'drop2'),
    )
    branches = []
    for backwards, lstm_name, drop_name in branch_specs:
        branch = Bidirectional(LSTM(80, go_backwards=backwards), name=lstm_name)(features)
        branches.append(Dropout(0.3, name=drop_name)(branch))
    hidden = Add(name='add')(branches)

    # Dense layers are named dense1..dense3; their dropouts drop3..drop5.
    for idx, (units, rate) in enumerate(((100, 0.3), (50, 0.2), (10, 0.1)), start=1):
        hidden = Dense(units, activation='relu', name=f'dense{idx}')(hidden)
        hidden = Dropout(rate, name=f'drop{idx + 2}')(hidden)

    outputs = Dense(1, activation='sigmoid', name='dense4')(hidden)
    return Model(inputs, outputs, name='BLSTM')
def BLSTM_model_multi(input_shape, classes):
    """Assemble the Conv1D + dual Bidirectional-LSTM network with *classes* outputs.

    Same layer stack as the single-output variant, except that the final
    sigmoid ``Dense`` layer has *classes* units.

    Args:
        input_shape: shape of one sample, passed straight to ``Input``.
        classes: number of units in the output layer.

    Returns:
        An uncompiled ``Model`` named ``'BLSTM'``.
    """
    net_in = Input(input_shape, name='input')
    conv_out = Conv1D(64, 2, padding='same', name='conv1')(net_in)
    pooled = AveragePooling1D(2, name='apool1')(conv_out)

    fwd = Bidirectional(LSTM(80, go_backwards=False), name='forward_lstm')(pooled)
    fwd = Dropout(0.3, name='drop1')(fwd)
    bwd = Bidirectional(LSTM(80, go_backwards=True), name='backward_lstm')(pooled)
    bwd = Dropout(0.3, name='drop2')(bwd)
    head = Add(name='add')([fwd, bwd])

    # (dense units, dense name, dropout rate, dropout name) for the MLP head.
    head_layers = (
        (100, 'dense1', 0.3, 'drop3'),
        (50, 'dense2', 0.2, 'drop4'),
        (10, 'dense3', 0.1, 'drop5'),
    )
    for units, dense_name, rate, drop_name in head_layers:
        head = Dense(units, activation='relu', name=dense_name)(head)
        head = Dropout(rate, name=drop_name)(head)

    net_out = Dense(classes, activation='sigmoid', name='dense4')(head)
    return Model(net_in, net_out, name='BLSTM')
def evaluate_metrics(y_test, y_pred):
    """Print MSE, MAE, RMSE and R² of *y_pred* measured against *y_test*.

    Argument order matters: *y_test* is the reference, *y_pred* the estimate.
    Nothing is returned.
    """
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    report = (
        ("mse: ", mse),
        ("mae: ", mae),
        ('rmse: ', np.sqrt(mse)),
        ("r2_score:", r2),
    )
    for label, value in report:
        print(label, value)
# Checkpoint file read by test() through this module-level name; it is
# reassigned before each evaluation run further down.
model_path = './checkpoint/weights-temp_acc=0.97.h5'
def test(model, x_test, y_test):
    """Load the weights at the global ``model_path``, predict and report metrics.

    Returns:
        ``(y_pred, y_test)``, both flattened to 1-D.
    """
    model.load_weights(model_path)
    predictions = model.predict(x_test)
    evaluate_metrics(y_test, predictions)
    return predictions.flatten(), y_test.flatten()
def train_and_predict(x_train, y_train, x_val, y_val, x_test, y_test, is_train=True):
    """Train the single-output BLSTM, or evaluate saved weights, on the test set.

    Args:
        x_train, y_train: training windows and targets.
        x_val, y_val: validation windows and targets (used while fitting).
        x_test, y_test: held-out windows and targets to predict on.
        is_train: when True, fit a fresh model and predict with it; when
            False, load the weights at the global ``model_path`` instead.

    Returns:
        ``(y_pred, y_test)``, both flattened to 1-D, in either mode.
    """
    input_shape = (x_train.shape[1], x_train.shape[2])
    model = BLSTM_model(input_shape)
    model.compile(loss="mse", optimizer=Adam())
    # summary() prints by itself and returns None, so it is not wrapped in print().
    model.summary()

    if not is_train:
        return test(model, x_test, y_test)

    # NOTE(review): `period` is a legacy ModelCheckpoint argument - confirm the
    # installed Keras still accepts it.
    cb_ckpt = ModelCheckpoint('./weights.{epoch:02d}-{val_loss:.2f}.h5', monitor='val_loss',
                              save_best_only=True, mode='min', period=20)
    history = model.fit(x_train, y_train, batch_size=32, epochs=100,
                        validation_data=(x_val, y_val), shuffle=True,
                        callbacks=[cb_ckpt,
                                   EarlyStopping(monitor='val_loss', patience=10)])
    plt.plot(history.history['loss'],label='train')
    plt.plot(history.history['val_loss'],label='test')
    plt.legend()
    plt.title("损失值对比图")
    plt.show()

    # The training path must produce predictions too, otherwise there is
    # nothing to return. Score the freshly trained model on the test set.
    y_pred = model.predict(x_test)
    evaluate_metrics(y_test, y_pred)
    return y_pred.flatten(), y_test.flatten()
# Numeric preprocessing: fill missing values with the column median, then
# rescale each column to [0, 1]. (The step is called 'std_scaler' but holds a
# MinMaxScaler.)
num_pipeline = Pipeline([
('imputer',SimpleImputer(strategy="median")),
('std_scaler',MinMaxScaler()),
])
# Every remaining column after dropping identifiers / non-numeric fields is
# treated as numeric.
num_weather = weather.drop(["date",'city','county','station','year','phenomenon'],axis=1)
num_attribs = list(num_weather.columns)
# Numeric columns go through num_pipeline; 'city' is one-hot encoded and
# appended after them.
full_pipeline = ColumnTransformer([
("num",num_pipeline,num_attribs),
("city",OneHotEncoder(),['city']),
])
data_perpared = full_pipeline.fit_transform(weather)
# NOTE(review): the three one-hot column names are hard-coded; this assumes
# exactly three cities and that the encoder's category order matches
# shanghai/beijing/chongqin - verify against the fitted encoder's categories_.
weather_clean = pd.DataFrame(data_perpared,columns=num_attribs+['shanghai','beijing', 'chongqin'])
print(weather_clean.columns)
# Re-attach the keys needed to sort and group by county (aligned on row index).
weather_clean[['date','county']] = weather[['date','county']]
def generate_features(weather_clean):
    """Split the frame into one date-ordered NumPy array per county.

    Rows are sorted by county (descending) and date (ascending), grouped by
    ``county``, and each group is converted to an array after its ``date``
    and ``county`` columns are dropped.

    Returns:
        A list of 2-D arrays, one per county, in ``groupby`` iteration order.
    """
    ordered = weather_clean.sort_values(by=['county', 'date'], ascending=(False, True))
    return [
        np.array(county_rows.drop(['date', 'county'], axis=1))
        for _, county_rows in ordered.groupby('county')
    ]
# One scaled array per county, then 30-step windows over all 26 columns with
# column index 2 as the target.
# NOTE(review): index 2 is assumed to be the temperature column - confirm
# against the column order printed above.
data_perpared_countys = generate_features(weather_clean)
print(len(data_perpared_countys))
x_train,y_train,x_val,y_val,x_test,y_test = get_data(data_perpared_countys,True,30,26,2)
model_path = './checkpoint/weights-temp_acc=0.97.h5'
print(data_perpared_countys[0].shape)
y_pred,y_test = train_and_predict(x_train,y_train,x_val,y_val,x_test,y_test,False)

# Undo the min-max scaling so errors are expressed in the original units.
temper_max = np.max(weather['temperature'])
temper_min = np.min(weather['temperature'])
y_pred_orignal = np.array(y_pred)*(temper_max-temper_min)+temper_min
y_test_orignal = np.array(y_test)*(temper_max-temper_min)+temper_min
# evaluate_metrics takes (y_test, y_pred); R² is not symmetric, so the
# reference values must come first.
evaluate_metrics(y_test_orignal,y_pred_orignal)

fig = plt.figure(figsize=(12,8))
plt.plot(y_test,label='y_test',alpha = 0.3)
plt.plot(y_pred,label='y_pred',alpha = 0.3)
plt.title("温度值对比")
plt.legend()
plt.show()
fig1 = plt.figure(figsize=(12,8))
plt.plot(y_pred_orignal,label='y_pred',alpha = 0.3)
plt.plot(y_test_orignal,label='y_test',alpha = 0.3)
plt.title("原始温度值对比")
plt.legend()
plt.show()
# Correlation of every numeric column with visibility. Restricting to numeric
# dtypes first keeps this working on pandas versions where corr() no longer
# drops non-numeric columns (e.g. a string 'county') silently.
corr_matrix = weather_clean.select_dtypes(include='number').corr()
print(corr_matrix['visibility'].sort_values(ascending=False))

# Same windowing as before, with column index 7 as the target.
# NOTE(review): index 7 is assumed to be the visibility column - confirm.
x_train,y_train,x_val,y_val,x_test,y_test = get_data(data_perpared_countys,True,30,26,7)
model_path = './checkpoint/weights-visibility_acc=0.78.h5'
y_pred,y_test = train_and_predict(x_train,y_train,x_val,y_val,x_test,y_test,False)

# Undo the min-max scaling so errors are expressed in the original units.
visbi_max = np.max(weather['visibility'])
visbi_min = np.min(weather['visibility'])
y_pred_orignal = np.array(y_pred)*(visbi_max-visbi_min)+visbi_min
y_test_orignal = np.array(y_test)*(visbi_max-visbi_min)+visbi_min
# evaluate_metrics takes (y_test, y_pred); R² is not symmetric, so the
# reference values must come first.
evaluate_metrics(y_test_orignal,y_pred_orignal)

fig = plt.figure(figsize=(12,8))
plt.plot(y_test,label='y_test',alpha = 0.3)
plt.plot(y_pred,label='y_pred',alpha = 0.3)
plt.title("可见度对比")
plt.legend()
plt.show()
fig1 = plt.figure(figsize=(12,8))
plt.plot(y_pred_orignal,label='y_pred',alpha = 0.3)
plt.plot(y_test_orignal,label='y_test',alpha = 0.3)
plt.title("原始可见度对比")
plt.legend()
plt.show()
print(weather_clean.columns)
def train_and_predict_multi(x_train, y_train, x_val, y_val, x_test, y_test, is_train=True):
    """Train the multi-output BLSTM, or evaluate saved weights, on the test set.

    The number of outputs is taken from ``y_train.shape[1]``.

    Args:
        x_train, y_train: training windows and target vectors.
        x_val, y_val: validation windows and target vectors (used while fitting).
        x_test, y_test: held-out windows and target vectors to predict on.
        is_train: when True, fit a fresh model and predict with it; when
            False, load the weights at the global ``model_path`` instead.

    Returns:
        ``(y_pred, y_test)``, both flattened to 1-D, in either mode.
    """
    input_shape = (x_train.shape[1], x_train.shape[2])
    model = BLSTM_model_multi(input_shape, y_train.shape[1])
    model.compile(loss="mse", optimizer=Adam())
    # summary() prints by itself and returns None, so it is not wrapped in print().
    model.summary()

    if not is_train:
        return test(model, x_test, y_test)

    # NOTE(review): `period` is a legacy ModelCheckpoint argument - confirm the
    # installed Keras still accepts it.
    cb_ckpt = ModelCheckpoint('./weights.{epoch:02d}-{val_loss:.2f}.h5', monitor='val_loss',
                              save_best_only=True, mode='min', period=20)
    history = model.fit(x_train, y_train, batch_size=32, epochs=100,
                        validation_data=(x_val, y_val), shuffle=True,
                        callbacks=[cb_ckpt,
                                   EarlyStopping(monitor='val_loss', patience=10)])
    plt.plot(history.history['loss'], label='train')
    plt.plot(history.history['val_loss'], label='test')
    plt.title("损失值对比图")
    plt.legend()
    plt.show()

    # The training path must produce predictions too, otherwise there is
    # nothing to return. Score the freshly trained model on the test set.
    y_pred = model.predict(x_test)
    evaluate_metrics(y_test, y_pred)
    return y_pred.flatten(), y_test.flatten()
# Multi-target regression run: 30-step windows over 26 columns, with the
# first 7 columns of the following row as the target vector.
x_train,y_train,x_val,y_val,x_test,y_test = get_data_multi(data_perpared_countys,True,30,26,7)
model_path = './checkpoint/weights-multi_acc=0.54.h5'
print(data_perpared_countys[0].shape)
y_pred,y_test = train_and_predict_multi(x_train,y_train,x_val,y_val,x_test,y_test,False)
# Reorder the columns so the 11 weather-phenomenon flags come first; the
# classification run below takes its labels from the leading 11 columns.
weather_clean_class = weather_clean[['sunny', 'cloudy', 'rain', 'fog', 'haze', 'dust',
'thunder', 'lightning','snow', 'hail', 'wind','pressure', 'wind_speed', 'temperature', 'humidity', 'rain20', 'rain08',
'cloud', 'visibility','month', 'week','quarter', 'day', 'shanghai', 'beijing', 'chongqin', 'date', 'county']]
# Rebuild the per-county arrays from the reordered frame.
data_perpared_countys = generate_features(weather_clean_class)
# Label names, in the same order as the leading columns above.
classes = ['sunny', 'cloudy', 'rain', 'fog', 'haze', 'dust','thunder', 'lightning','snow', 'hail', 'wind']
# Placeholder; set to the classification checkpoint before that run.
model_path = ''
def test_classify(model, x_test, y_test, classes):
    """Load the weights at the global ``model_path`` and print classification metrics.

    Prints overall ROC-AUC, macro average precision, accuracy and
    sample-averaged F1 (predictions thresholded at 0.5), followed by one line
    of the same four metrics for every entry of *classes*.

    Args:
        model: model exposing ``load_weights`` and ``predict``.
        x_test: inputs to predict on.
        y_test: 2-D indicator array; column ``i`` belongs to ``classes[i]``.
        classes: label names, one per column of *y_test*.

    Returns:
        ``(y_pred, y_test)`` with *y_pred* being the raw predicted scores.
    """
    model.load_weights(model_path)
    y_pred = model.predict(x_test)

    rocauc = metrics.roc_auc_score(y_test, y_pred)
    prauc = metrics.average_precision_score(y_test, y_pred, average='macro')
    print(f'ROC-AUC score={rocauc:.6f}')
    print(f'Prauc score={prauc:.6f}')

    # Hard 0/1 decisions for the threshold-based metrics.
    y_prod = (y_pred > 0.5).astype(np.float32)
    acc = metrics.accuracy_score(y_test, y_prod)
    f1 = metrics.f1_score(y_test, y_prod, average='samples')
    print(f'acc score={acc:.6f}')
    print(f'f1 score={f1:.6f}')

    for i, cls in enumerate(classes):
        cls_rocauc = metrics.roc_auc_score(y_test[:, i], y_pred[:, i])
        cls_prauc = metrics.average_precision_score(y_test[:, i], y_pred[:, i])
        cls_acc = metrics.accuracy_score(y_test[:, i], y_prod[:, i])
        cls_f1 = metrics.f1_score(y_test[:, i], y_prod[:, i])
        # acc uses '.4f' like its neighbours; a bare '4f' would mean width 4
        # with the default six decimals.
        print(f"[{i:2} {cls:10}] rocauc={cls_rocauc:.4f} prauc={cls_prauc:.4f} acc={cls_acc:.4f} f1={cls_f1:.4f}")
    return y_pred, y_test
def train_and_predict_classify(x_train, y_train, x_val, y_val, x_test, y_test, classes, is_train=True):
    """Train the multi-label BLSTM classifier, or evaluate saved weights.

    The number of outputs is taken from ``y_train.shape[1]``.

    Args:
        x_train, y_train: training windows and label vectors.
        x_val, y_val: validation windows and label vectors (used while fitting).
        x_test, y_test: held-out windows and label vectors to predict on.
        classes: label names, forwarded to ``test_classify`` for reporting.
        is_train: when True, fit a fresh model and predict with it; when
            False, load the weights at the global ``model_path`` and print
            the classification report.

    Returns:
        ``(y_pred, y_test)`` where *y_pred* holds the raw predicted scores.
    """
    input_shape = (x_train.shape[1], x_train.shape[2])
    model = BLSTM_model_multi(input_shape, y_train.shape[1])
    model.compile(loss="binary_crossentropy", optimizer="rmsprop", metrics=['accuracy'])
    # summary() prints by itself and returns None, so it is not wrapped in print().
    model.summary()

    if not is_train:
        return test_classify(model, x_test, y_test, classes)

    # NOTE(review): `period` is a legacy ModelCheckpoint argument - confirm the
    # installed Keras still accepts it.
    cb_ckpt = ModelCheckpoint('./weights.{epoch:02d}-{val_loss:.2f}.h5', monitor='val_loss',
                              save_best_only=True, mode='min', period=20)
    history = model.fit(x_train, y_train, batch_size=32, epochs=100,
                        validation_data=(x_val, y_val), shuffle=True,
                        callbacks=[cb_ckpt,
                                   EarlyStopping(monitor='val_loss', patience=10)])
    plt.plot(history.history['loss'],label='train')
    plt.plot(history.history['val_loss'],label='test')
    plt.title("损失值对比图")
    plt.legend()
    plt.show()

    # The training path must produce predictions too, otherwise there is
    # nothing to return. Predict with the freshly trained model.
    y_pred = model.predict(x_test)
    return y_pred, y_test
# Classification run: 30-step windows over 26 columns, with the first 11
# columns of the following row as the label vector.
x_train,y_train,x_val,y_val,x_test,y_test = get_data_multi(data_perpared_countys,True,30,26,11)
model_path = './checkpoint/weights_classify_auc=0.85.h5'
# is_train=False: load the checkpoint above and print the metrics report.
y_pred,y_test = train_and_predict_classify(x_train,y_train,x_val,y_val,x_test,y_test,classes,False)