作业要求:根据气象局的数据,通过前9小时的数据预测第10小时的PM2.5
一.数据预处理
首先进行数据的预处理:读取时默认第一行是header;data(数据帧)去掉前三列,只保留从第四列开始的数据;观察train.csv可以发现降雨量那一行的取值为NR,将所有的NR替换为0以方便后面处理,然后将数据帧转换为数组。
train.csv文件中包含了一年的数据量,数据的结构为12个月,每个月20天,每天24小时,每个小时包括18种不同物质的含量。进行数据预处理的数组转化如下图:
二.设置不同的model
model1:只含输入的9小时数据的一次项。
model2:含输入的9小时数据的一次项和二次项。
将整个训练集分成训练集和验证集,用训练集分别对两种model进行训练,然后用验证集分别对两种model进行验证,选择效果较好的model,然后使用完整的训练集再对该model进行一次训练并保存权重文件,最后使用该权重文件对测试集进行预测。
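三.gradient的推导
设损失函数为 $L(w)=\sum_{n}\left(x^{n}\cdot w-\hat{y}^{n}\right)^{2}=\lVert Xw-y\rVert^{2}$,则对参数的梯度为 $\nabla_{w}L=2X^{T}(Xw-y)$;Adagrad的更新式为 $w^{t+1}=w^{t}-\dfrac{\eta}{\sqrt{\sum_{i=0}^{t}\left(g^{i}\right)^{2}+\epsilon}}\,g^{t}$,其中 $g^{t}$ 为第 $t$ 次迭代的梯度,$\eta$ 为学习率(对每个参数分别计算)。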
四.完整代码
1.train.py
import sys
import pandas as pd
import numpy as np
# Load the training table; pandas treats the first CSV row as the header, not data.
data = pd.read_csv('D:/学习笔记/李宏毅/work1/hw1/train.csv', encoding='big5')
# Preprocessing: drop the first three columns and keep everything after them.
# .copy() turns the slice into an independent frame, so the masked assignment
# below modifies `data` itself and does not raise pandas' SettingWithCopyWarning.
data = data.iloc[:, 3:].copy()
# Replace every 'NR' cell with 0 so the table can later be converted to floats.
data[data == 'NR'] = 0
raw_data = data.to_numpy()
# Regroup the flat table by month. raw_data is read as 18 consecutive rows per
# day and 20 days per month; each month becomes one (18, 480) array whose
# columns are the 20 * 24 hours of that month laid end to end.
month_data = {}
for month in range(12):
    sample = np.empty([18, 480])
    for day in range(20):
        # The 18 rows belonging to this (month, day) start here.
        first_row = (month * 20 + day) * 18
        sample[:, day * 24:(day + 1) * 24] = raw_data[first_row:first_row + 18, :]
    month_data[month] = sample
# Build the training samples with a sliding window over each month's 480 hours:
# every run of 10 consecutive hours yields one sample whose features are the
# first 9 hours of all 18 rows and whose target is row 9 at the 10th hour
# (the PM2.5 value, per the assignment description). That gives
# 480 - 9 = 471 samples per month.
x = np.empty([12 * 471, 18 * 9], dtype=float)
y = np.empty([12 * 471, 1], dtype=float)
for month in range(12):
    for day in range(20):
        for hour in range(24):
            # The last window that still has a 10th hour starts at day 19, hour 14.
            if day == 19 and hour > 14:
                continue
            start = day * 24 + hour
            row = month * 471 + start
            # Fix: the window has to begin at `start` (day*24 + hour). The
            # previous slice began at day*24, so all 24 samples of a day were
            # given the same features while their targets kept advancing.
            # reshape(-1) flattens row by row, i.e. 9 hours per measured row.
            x[row, :] = month_data[month][:, start:start + 9].reshape(-1)
            y[row, 0] = month_data[month][9, start + 9]
# Standardize every column of x in place: subtract the column mean and divide
# by the column standard deviation.
mean_x = np.mean(x, axis=0)
std_x = np.std(x, axis=0)
# A constant column has std == 0; dividing by it would fill the column with
# nan/inf and poison the training, so such columns are left unscaled.
scalable = std_x != 0
x[:, scalable] = (x[:, scalable] - mean_x[scalable]) / std_x[scalable]
# Split the samples into a training part (first 80%) and a validation part
# (remaining 20%), keeping the original order.
import math
split_at = math.floor(len(x) * 0.8)
x_train_set, x_validation = x[:split_at, :], x[split_at:, :]
y_train_set, y_validation = y[:split_at, :], y[split_at:, :]
# Adagrad (model1: first-order terms only)
dim = 18 * 9 + 1  # 18 * 9 input features plus one bias weight
w = np.zeros([dim, 1])
# Prepend a column of ones (the bias input) to the training features; axis=1
# joins the two arrays side by side. This rebinds `x` to the training matrix.
x = np.concatenate((np.ones([len(x_train_set), 1]), x_train_set), axis=1).astype(float)
learning_rate = 2
iter_time = 6000
# Running sum of squared gradients, one entry per weight, so that every weight
# gets its own effective step size.
adagrad = np.zeros([dim, 1])
eps = 1e-9  # keeps the denominator non-zero
for t in range(iter_time):
    # Prediction error on the training rows; reused for the loss and the gradient.
    residual = np.dot(x, w) - y_train_set
    # Root-mean-square error. Fix: `x` already holds only the training rows, so
    # the mean is taken over len(x); the previous divisor floor(len(x) * 0.8)
    # applied the 0.8 split factor a second time and inflated the reported loss.
    loss = np.sqrt(np.sum(np.power(residual, 2)) / len(x))
    if t % 100 == 0:
        print(str(t) + ':' + str(loss))
    # Gradient of the summed squared error with respect to w: 2 * X^T (Xw - y).
    gradient = 2 * np.dot(x.transpose(), residual)
    adagrad += gradient ** 2
    w = w - learning_rate * gradient / np.sqrt(adagrad + eps)
np.save('weight_model1.npy', w)
#Adagrad(model2:包含二次项)
# dim = 18*9*2+1
# w = np.zeros([dim,1])
# x_train_set_model2 = np.empty([len(x_train_set),18*9*2],dtype = float)
# for round in range(2):
# for length in range(18*9):
# for number in range(len(x_train_set)):
# if round == 0:
# x_train_set_model2[number,length*2] = x_train_set[number,length]
# else:
# x_train_set_model2[number,length*2+1] = x_train_set[number,length]**2
# x = np.concatenate((np.ones([len(x_train_set_model2),1]),x_train_set_model2),axis = 1).astype(float)
# #new_numpy = np.concatenate((np.ones([1,len(x_train_set_model2)]),np.negative(x_train_set_model2.transpose())),axis = 0).astype(float)#np.neagative()对矩阵中所有值取相反数
# learning_rate = 2
# iter_time = 6000
# adagrad = np.zeros([dim,1])
# eps = 1e-9
# for t in range(iter_time):
# loss = np.sqrt(np.sum(np.power(np.dot(x,w)-y_train_set,2))/len(x_train_set))#np.sqrt()为求平方根的开方运算,np.power(x,y)求x的y次方
# if(t%100 == 0):
# print(str(t)+':'+str(loss))
# gradient = 2*np.dot(x.transpose(),np.dot(x,w)-y_train_set)#x.transpose()为求矩阵转置的运算,np.dot为求矩阵相乘
# adagrad += gradient**2#adagrad中存的是分母,adagrad也是dim*1的矩阵,因为对于不同的参数使用不同的补偿,adagrad也是不同的
# w = w - learning_rate*gradient/np.sqrt(adagrad+eps)
# np.save('weight_model2.npy',w)
#对于model2验证集预处理
# x_validation_model2 = np.empty([len(x_validation),18*9*2],dtype = float)
# for round in range(2):
# for length in range(18*9):
# for number in range(len(x_validation)):
# if round == 0:
# x_validation_model2[number,length*2] = x_validation[number,length]
# else:
# x_validation_model2[number,length*2+1] = x_validation[number,length]**2
# x_validation_model2 = np.concatenate((np.ones([len(x_validation),1]),x_validation_model2),axis = 1).astype(float)
# Validation
# model1: add the bias column to the validation features, then predict with
# the trained weights.
bias_column = np.ones([len(x_validation), 1])
x_validation = np.concatenate((bias_column, x_validation), axis=1).astype(float)
ans_y = x_validation.dot(w)
#model2验证
#ans_y = np.dot(x_validation_model2, w)
# Store model1's validation predictions next to the ground-truth values.
import csv
with open('model1.csv', mode='w', newline='') as submit_file:
    csv_writer = csv.writer(submit_file)
    # Fix: every data row has three fields (id, prediction, ground truth), so
    # the header names all three; with only two names a CSV reader misaligns
    # the columns.
    csv_writer.writerow(['id', 'value', 'truth'])
    # Fix: iterate over the predictions actually produced instead of a
    # hard-coded count of 1131, so a different split size still works.
    for i, (predicted, actual) in enumerate(zip(ans_y, y_validation)):
        csv_writer.writerow(['id_' + str(i), predicted[0], actual[0]])
#对于model2验证的结果进行储存
# import csv
# with open('model2.csv',mode = 'w',newline='') as submit_file:
# csv_writer = csv.writer(submit_file)
# header = ['id','value']
# csv_writer.writerow(header)
# for i in range(1131):
# row = ['id_' + str(i), ans_y[i][0]]
# csv_writer.writerow(row)
2.test.py
import sys
import pandas as pd
import numpy as np
# Load the test table; it has no header row, so header=None keeps the first
# line as data.
test = pd.read_csv('D:/学习笔记/李宏毅/work1/hw1/test.csv', header=None, encoding='big5')
# Drop the first two columns and keep everything after them. .copy() makes the
# slice an independent frame so the masked assignment below does not raise
# pandas' SettingWithCopyWarning.
test_data = test.iloc[:, 2:].copy()
# Replace every 'NR' cell with 0 so the values can be converted to floats.
test_data[test_data == 'NR'] = 0
test_data = test_data.to_numpy()
# Turn the test table into one feature row per sample: each of the 240 samples
# owns 18 consecutive table rows, which are flattened row by row into
# 18 * 9 features.
x_test = np.empty([240, 18 * 9])
for sample_idx in range(240):
    sample_rows = test_data[sample_idx * 18:(sample_idx + 1) * 18, :]
    x_test[sample_idx, :] = sample_rows.reshape(-1)
# Standardize each test column in place with its own mean and standard deviation.
# NOTE(review): the saved weights were fitted on features standardized with the
# training-set statistics in train.py, which generally differ from these
# test-set statistics; consider saving the training mean/std and reusing them.
mean_x = np.mean(x_test, axis=0)
std_x = np.std(x_test, axis=0)
# A constant column has std == 0; leave it unscaled instead of dividing by zero
# and producing nan/inf features.
scalable = std_x != 0
x_test[:, scalable] = (x_test[:, scalable] - mean_x[scalable]) / std_x[scalable]
# Build the model2 input: every feature is followed by its square, so even
# columns hold the first-order terms and odd columns the second-order terms.
# Strided slice assignment replaces the former element-by-element triple loop,
# whose loop variable also shadowed the builtin `round`.
x = np.empty([240, 18 * 9 * 2], dtype=float)
x[:, 0::2] = x_test
x[:, 1::2] = x_test ** 2
# Prepend the bias column of ones.
x = np.concatenate((np.ones([len(x_test), 1]), x), axis=1).astype(float)
# Load the trained model2 weights and predict one value per test sample.
w = np.load('weight_model2.npy')
ans_y = x.dot(w)
# Write the predictions as an id/value table.
import csv
with open('test_model2.csv', mode='w', newline='') as submit_file:
    csv_writer = csv.writer(submit_file)
    csv_writer.writerow(['id', 'value'])
    csv_writer.writerows(['id_' + str(i), ans_y[i][0]] for i in range(240))