2019全国高校大数据应用创新竞赛Baseline

网址 : https://ai.futurelab.tv/tournament/2

### 1. 导入需要的工具包并查看数据
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import mean_squared_error as mse
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Activation
from keras.regularizers import l2
import warnings

# NOTE(review): this silences every warning, including the deprecation
# warnings that would have flagged the date-parsing problem fixed below.
warnings.filterwarnings("ignore")

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)


# Read the training data and take a first look at it.
def dateparse(dates):
    """Parse one 'YYYYMMDD' string, or an array-like of them, into datetimes.

    Replaces the former ``pd.datetime.strptime`` lambda: ``pd.datetime`` was
    deprecated in pandas 1.0 and removed in pandas 2.0.
    """
    return pd.to_datetime(dates, format='%Y%m%d')


# The ``date_parser`` argument of read_csv is deprecated since pandas 2.0, so
# the index is loaded as-is and converted explicitly afterwards; this works
# on old and new pandas releases alike.
df = pd.read_csv('bd2019-weather-prediction-training-20190608.csv',
                 index_col='date')
df.index = dateparse(df.index.astype(str))
df.head(5)
### 2. 定义的一些方法
def getMetrics(y_true, y_pred):
    """Plot predictions against the ground truth and print R2 and MSE.

    y_true : observed values
    y_pred : predicted values (drawn in red on top of y_true)

    Shows a matplotlib figure, prints both scores and returns nothing.
    """
    figure_size = (18, 6)
    plt.figure(figsize=figure_size)
    plt.plot(y_true)
    plt.plot(y_pred, color='red')
    plt.show()

    observed = np.array(y_true)
    predicted = np.array(y_pred)
    # R2 = 1 - (residual sum of squares) / (total sum of squares)
    residual_ss = np.sum(np.square(predicted - observed))
    total_ss = np.sum(np.square(observed - np.mean(y_true)))
    r2 = 1 - residual_ss / total_ss
    print('R2 拟合度为: {} , MSE: {}    '.format(r2, mse(y_true, y_pred)))

def getWinDire(wind_dire):
    """Normalise one raw ``wind_direction`` reading.

    * 999999 and 999998 are replaced by the fixed value 165.691
      (presumably a precomputed average direction -- TODO confirm).
    * Any other value above 361 is decoded from its last two digits as
      ``(value % 100 - 1) * 22.5`` (looks like a 16-point compass code
      converted to degrees -- verify against the field definition).
    * Everything else is returned unchanged.
    """
    if wind_dire in (999999, 999998):
        return 165.691
    return (wind_dire % 100 - 1) * 22.5 if wind_dire > 361 else wind_dire
# First-pass handling of missing values (only temperature and humidity are
# treated here): the sentinel codes 999990 / 999999 are replaced by the mean of
# all readings below 888889. Possible improvement: Lagrange interpolation.
valid_temperature = df.temperature[df.temperature < 888889]
temperature_mean = np.mean(valid_temperature.values)
df['temperature'] = df['temperature'].replace([999990, 999999], temperature_mean)

valid_humidity = df.humidity[df.humidity < 888889]
humidity_mean = np.mean(valid_humidity.values)
df['humidity'] = df['humidity'].replace([999990, 999999], humidity_mean)

# Decode the wind direction according to the field definition.
df['wind_direction'] = df['wind_direction'].apply(getWinDire)
df.head(5)
### 4. 数据分析
dta = df[['temperature', 'humidity', 'station']]
station_ids = dta.station.unique()
station_ids
#### 4.1 sta_temp_dict / sta_humi_dict hold each station's temperature / humidity records
# Keys are station ids in order of first appearance; values are
# single-column DataFrames restricted to that station's rows.
sta_temp_dict = {
    station_id: dta[dta['station'] == station_id][['temperature']]
    for station_id in station_ids
}
sta_humi_dict = {
    station_id: dta[dta['station'] == station_id][['humidity']]
    for station_id in station_ids
}
#### 5.1  定义构建滑窗数据集函数
def getMovingWindowData(dict_, window_size=17, th_day=1, train_size=0.7):
    """Build sliding-window samples from every station's series.

    dict_       : mapping station -> object whose ``.values.ravel()`` yields
                  that station's series
    window_size : number of consecutive values used as input features
    th_day      : how many steps after the window the target value lies
    train_size  : fraction of each station's windows (the earliest ones)
                  that goes to the training set; the rest goes to the test set

    Returns (train_X, train_Y, test_X, test_Y) as numpy arrays.
    """
    # A sample spans the input window plus the gap up to the target value.
    span = window_size + th_day
    fit_inputs, fit_targets = [], []
    held_inputs, held_targets = [], []

    for station_frame in dict_.values():
        series = list(station_frame.values.ravel())
        # Number of complete spans that fit into this station's series.
        n_windows = len(series) - span + 1
        cutoff = n_windows * train_size
        for start in range(n_windows):
            chunk = series[start:start + span]
            # Early windows train the model, later ones are held out.
            if start < cutoff:
                inputs, targets = fit_inputs, fit_targets
            else:
                inputs, targets = held_inputs, held_targets
            inputs.append(chunk[:window_size])
            targets.append(chunk[-1])

    return (np.array(fit_inputs), np.array(fit_targets),
            np.array(held_inputs), np.array(held_targets))


#### 5.2 Build the dataset and predict
# Sliding-window samples over every station's temperature series. th_day=1
# puts the target one step after each window; window_size and train_size
# keep the defaults of getMovingWindowData (17 and 0.7).
train_X, train_Y, test_X, test_Y = getMovingWindowData(sta_temp_dict, th_day=1)

###### 构建ANN,并用train进行训练

def make_model(window_size=17):
    """Build and compile the fully connected regression network.

    Architecture: four Dense(60, relu) hidden layers, each with a uniform
    random kernel initialiser and L2(0.2) kernel regularisation; a
    Dropout(0.3) follows the 2nd, 3rd and 4th hidden layer; the output is a
    single linear unit. Compiled with mean-squared-error loss and Adam.

    window_size : number of input features (length of the history window)

    Returns the compiled ``Sequential`` model.
    """
    def hidden_layer(**extra):
        # ``kernel_initializer`` is the Keras 2 spelling of the Keras 1
        # ``init`` argument. The old name only worked through a legacy shim
        # and is rejected by current Keras, while the rest of this file
        # (``kernel_regularizer``) already uses the Keras 2 API.
        # "random_uniform" names the same RandomUniform initialiser that
        # the former "uniform" alias resolved to.
        return Dense(60, kernel_initializer="random_uniform",
                     activation="relu", kernel_regularizer=l2(0.2), **extra)

    model = Sequential()
    # The first hidden layer fixes the input size and has no dropout after it.
    model.add(hidden_layer(input_dim=window_size))
    for _ in range(3):
        model.add(hidden_layer())
        model.add(Dropout(0.3))
    model.add(Dense(1))
    model.add(Activation("linear"))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model
model = make_model(window_size=17)
# Training is left disabled here; section 5.3 loads saved models from disk instead.
# NOTE(review): `nb_epoch` is the Keras 1 spelling (Keras 2 uses `epochs`) --
# confirm before re-enabling the call below.
# model.fit(train_X,train_Y, nb_epoch=66, batch_size=1788, validation_split = .05)
model.summary()
# # Inspect the fit on the training set
# getMetrics(train_Y, model.predict(train_X).reshape(-1))
# # Check the performance on the test set
# getMetrics(test_Y, model.predict(test_X).reshape(-1))
#### 5.3 Prediction
##### Load the models
# Seven saved networks per target variable (files suffixed 1-7), presumably
# one per forecast day -- verify against how the .h5 files were trained.
(model_temp_1, model_temp_2, model_temp_3, model_temp_4,
 model_temp_5, model_temp_6, model_temp_7) = [
    load_model(model_path)
    for model_path in (
        './model/temp_1.h5',
        './model/temp_2.h5',
        './model/temp_3.h5',
        './model/temp_4.h5',
        './model/temp_5.h5',
        './model/temp_6.h5',
        './model/temp_7.h5',
    )
]

(model_humi_1, model_humi_2, model_humi_3, model_humi_4,
 model_humi_5, model_humi_6, model_humi_7) = [
    load_model(model_path)
    for model_path in (
        './model/humi_1.h5',
        './model/humi_2.h5',
        './model/humi_3.h5',
        './model/humi_4.h5',
        './model/humi_5.h5',
        './model/humi_6.h5',
        './model/humi_7.h5',
    )
]

# Predict the temperature for the next 7 days
temp_models = (model_temp_1, model_temp_2, model_temp_3, model_temp_4,
               model_temp_5, model_temp_6, model_temp_7)
ann_temp_predicted = []
for station_frame in sta_temp_dict.values():
    # The station's last 17 readings, shaped as a single input row.
    temp_history = np.reshape(list(station_frame['temperature'].values[-17:]), (1, -1))

    # Every model sees the same history; results are appended in model
    # order, giving 7 consecutive values per station.
    for horizon_model in temp_models:
        ann_temp_predicted.append(horizon_model.predict(temp_history)[0][0])
    

# Predict the humidity for the next 7 days
humi_models = (model_humi_1, model_humi_2, model_humi_3, model_humi_4,
               model_humi_5, model_humi_6, model_humi_7)
ann_humi_predicted = []
for station_frame in sta_humi_dict.values():
    # The station's last 17 readings, shaped as a single input row.
    humi_history = np.reshape(list(station_frame['humidity'].values[-17:]), (1, -1))

    # Every model sees the same history; results are appended in model
    # order, giving 7 consecutive values per station.
    for horizon_model in humi_models:
        ann_humi_predicted.append(horizon_model.predict(humi_history)[0][0])

import csv

# Each station gets 7 consecutive rows, one per forecast date, matching the
# order in which the predictions were appended above.
forecast_days = ['20180101', '20180102', '20180103', '20180104', '20180105', '20180106', '20180107']
station_keys = list(sta_humi_dict.keys())
date = forecast_days * len(station_keys)
# Repeat every station id 7 times in place: s0 x7, s1 x7, ...
station = np.repeat(station_keys, 7)
with open('./np.csv', 'w', newline='') as out_file:
    writer = csv.writer(out_file)
    writer.writerow(('date','station', 'temperature','humidity'))
    writer.writerows(zip(date, station, ann_temp_predicted, ann_humi_predicted))

你可能感兴趣的:(MyCode)