待修改
# Load the data
data = pd.read_csv('train.csv')  # a DataFrame
for unused_column in ('datetime', 'item'):
    del data[unused_column]

# Assemble the training set
# The table is consumed in consecutive groups of ItemNum rows (one group per
# day). From each group, 15 sliding windows are cut: 9 consecutive columns of
# every row form the features, and row 9 of the column right after the window
# is the target.
ItemNum = 18
X_Train = []  # feature windows of the training samples
Y_Train = []  # PM2.5 targets of the training samples
day_count = int(len(data) / ItemNum)
for day_index in range(day_count):
    first_row = day_index * ItemNum
    day = data[first_row:first_row + ItemNum]  # one day's observations
    # Window starts 0..14; the last window reads its target from column 23.
    for window_start in range(15):
        window_stop = window_start + 9
        window = day.iloc[:, window_start:window_stop]
        target = int(day.iloc[9, window_stop])
        X_Train.append(window)
        Y_Train.append(target)
通过执行上面的 Python 代码,可将 3600 个样本的特征和目标值分别存入 X_Train、Y_Train 中。
# Mini-batch gradient descent
# Linear model: prediction = b + sum(parameters[k] * feature[k]) over 27
# features taken from 3 rows x 9 columns of a training sample.
# parameters[0:9] weight row 8, parameters[9:18] weight row 9 and
# parameters[18:27] weight row 12, each across columns 0-8.
# Row 9 is the row the PM2.5 target is read from when the samples are built;
# NOTE(review): what rows 8 and 12 hold is not visible here - confirm.
feature_rows = (8, 9, 12)  # a tuple, so the builtin `dict` is no longer shadowed
columns_per_row = 9
batch_size = 100  # samples drawn per iteration (repeats are possible)
iteration_count = 10000  # number of iterations
learning_rate = 0.000001  # learning rate
b = 0.0001  # initial bias term
parameters = [0.001] * 27  # initial values of the 27 weights
loss_history = []  # half the summed squared error of every mini-batch

last_index = len(X_Train) - 1  # hoisted: X_Train is not modified while training
for _ in range(iteration_count):
    loss = 0
    b_grad = 0
    w_grad = [0] * len(parameters)
    batch = [randint(0, last_index) for _ in range(batch_size)]
    # Process the draws from last to first.
    for index in reversed(batch):
        day = X_Train[index]
        # Read every feature once; it feeds both the prediction and the gradient.
        features = [
            day.iloc[row, column]
            for row in feature_rows
            for column in range(columns_per_row)
        ]
        # Signed error of this sample: prediction minus target. Accumulated
        # term by term, left to right, starting from the bias.
        error = b
        for weight, value in zip(parameters, features):
            error = error + weight * value
        error = error - Y_Train[index]

        loss = loss + error * error
        b_grad = b_grad + error
        for k, value in enumerate(features):
            w_grad[k] = w_grad[k] + error * value
    loss_history.append(loss / 2)

    # Update the parameters
    b = b - learning_rate * b_grad
    for k, gradient in enumerate(w_grad):
        parameters[k] = parameters[k] - learning_rate * gradient
# Evaluate the model
data1 = pd.read_csv('test.csv')
for unused_column in ('id', 'item'):
    del data1[unused_column]

# Cut the test table into consecutive groups of ItemNum rows.
ItemNum = 18
group_count = int(len(data1) / ItemNum)
X_Test = [
    data1[n * ItemNum:(n + 1) * ItemNum]  # one day's observations
    for n in range(group_count)
]

# Expected values: the second column of answer.csv, one entry per row.
data2 = pd.read_csv('answer.csv')
Y_Test = [data2.iloc[row, 1] for row in range(len(data2))]

# Hard-coded bias and weights; they replace whatever b and parameters held
# before this point.
# NOTE(review): presumably copied from an earlier training run - confirm.
b = 0.00371301266193
parameters = [
    -0.0024696993501677625, 0.0042664323568029619, -0.0086174899917209787,
    -0.017547874680980298, -0.01836289806786489, -0.0046459546176775678,
    -0.031425910733080147, 0.018037490234208024, 0.17448898242705385,
    0.037982590870111861, 0.025666115101346722, 0.02295437149703404,
    0.014272058968395849, 0.011573452230087483, 0.010984971346586308,
    -0.0061003639742210781, 0.19310213021199321, 0.45973205224805752,
    -0.0034995637680653086, 0.00094072189075279807, 0.00069329550591916357,
    0.002966257320079194, 0.0050690506276038138, 0.007559004246038563,
    0.013296350700555241, 0.027251049329127801, 0.039423988570899793,
]

# Prediction = b + weighted sum of rows 8, 9 and 12 over columns 0-8;
# parameters[0:9] belong to row 8, [9:18] to row 9 and [18:27] to row 12.
feature_rows = (8, 9, 12)
Y_predict = []
for day in X_Test:
    p = b
    for group, row in enumerate(feature_rows):
        for column in range(9):
            p = p + parameters[group * 9 + column] * day.iloc[row, column]
    Y_predict.append(p)
def dev_degree(y_true, y_predict):  # evaluation function
    """Return the mean squared difference between targets and predictions.

    Args:
        y_true: Indexable collection of observed values; element ``i`` is
            compared with the ``i``-th prediction. Entries beyond
            ``len(y_predict)`` are ignored.
        y_predict: Sequence of predicted values. Its length determines both
            how many pairs are compared and the divisor of the mean.

    Returns:
        The sum of squared differences divided by ``len(y_predict)``.

    Raises:
        ZeroDivisionError: If ``y_predict`` is empty.
        IndexError: If ``y_true`` is a list shorter than ``y_predict``.
    """
    # Accumulate explicitly, in input order, under a name that does not
    # shadow the builtin ``sum``.
    squared_error = 0
    for position, predicted in enumerate(y_predict):
        residual = y_true[position] - predicted
        squared_error = squared_error + residual * residual
    return squared_error / len(y_predict)
# Print the mean squared difference between the expected values (Y_Test)
# and the model's predictions (Y_predict).
print(dev_degree(Y_Test,Y_predict))