机器学习hw1

作业要求:根据气象局的数据,通过前9小时的数据预测第10小时的PM2.5

一.数据预处理

        首先进行数据的预处理:读取时默认第一行是header,data(data为数据帧)只保留从第四列(列索引为3)开始的数据;观察train.csv可以发现降雨量那一行的取值为NR,将所有的NR替换为0,方便后面进行数值处理,然后将数据帧转换为数组。

        train.csv文件中包含了一年的数据量,数据的结构为12个月,每个月20天,每天24小时,每个小时包括18种不同物质的含量。进行数据预处理的数组转化如下图:

机器学习hw1_第1张图片

 二.设置不同的model

        model1:只含输入的9小时数据的一次项。

        model2:含输入的9小时数据的一次项和二次项。

        将整个训练集分成训练集和验证集,用训练集分别对两种model进行训练,然后用验证集分别对两种model进行验证,选择效果较好的model,然后使用完整的训练集再对该model进行一次训练并保存权重文件,最后使用该权重文件对测试集进行预测。

三.gradient的推导

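机器学习hw1_第2张图片

(推导图片缺失,以下按代码中的实现给出结论)设 $X$ 为在最前面加入全1偏置列后的输入矩阵,$y$ 为目标向量,$w$ 为权重,取平方误差之和作为损失:

$$L(w)=\sum_{n}\left(x_n^{\top}w-y_n\right)^2=\lVert Xw-y\rVert^2$$

对 $w$ 求梯度:

$$\nabla_w L=2X^{\top}(Xw-y)$$

Adagrad 的更新式($\eta$ 为学习率,$g_i$ 为第 $i$ 次迭代的梯度,平方、开方和除法均逐元素进行):

$$w_{t+1}=w_t-\frac{\eta}{\sqrt{\sum_{i=0}^{t}g_i^{2}+\epsilon}}\,g_t$$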

 四.完整代码

 1.train.py

import sys
import pandas as pd
import numpy as np
# Read the training CSV; pandas treats the first row as the header by default.
data = pd.read_csv('D:/学习笔记/李宏毅/work1/hw1/train.csv', encoding='big5')
# Preprocessing: keep only the columns from positional index 3 onward.
# An explicit copy is taken so the 'NR' replacement below writes to an
# independent frame rather than to a slice of the frame that was read.
data = data.iloc[:, 3:].copy()
# Cells holding the string 'NR' are replaced by 0 so every cell is numeric.
data[data == 'NR'] = 0
raw_data = data.to_numpy()
# Regroup the raw rows by month: month_data[m] is an 18 x 480 matrix
# (18 feature rows, 20 days * 24 hours laid out consecutively as columns).
month_data = {}
for month in range(12):
    sample = np.empty([18, 480])
    for day in range(20):
        # Each day occupies 18 consecutive rows of raw_data (one per feature).
        first_row = (month * 20 + day) * 18
        sample[:, day * 24:(day + 1) * 24] = raw_data[first_row:first_row + 18, :]
    month_data[month] = sample
# Build the training samples. Every month holds 480 consecutive hours, which
# yields 480 - 9 = 471 sliding windows of 9 input hours followed by 1 target hour.
x = np.empty([12 * 471, 18 * 9], dtype=float)
y = np.empty([12 * 471, 1], dtype=float)
for month in range(12):
    for day in range(20):
        for hour in range(24):
            # On the last day only windows starting at hour <= 14 still have
            # a target hour inside the month.
            if day == 19 and hour > 14:
                continue
            start = day * 24 + hour
            row = month * 471 + start
            # The window must start at `start` (day offset PLUS hour offset);
            # using only the day offset would give all 24 samples of a day
            # the same inputs. Flattening row by row puts the 9 hourly
            # readings of feature k into columns 9*k .. 9*k+8.
            x[row, :] = month_data[month][:, start:start + 9].reshape(-1)
            # Target: feature row 9 (presumably PM2.5 - confirm against the
            # CSV row order) at the hour right after the window.
            y[row, 0] = month_data[month][9, start + 9]
# Standardize every column of x: subtract the column mean, divide by the
# column standard deviation.
mean_x = np.mean(x, axis=0)
std_x = np.std(x, axis=0)
# A constant column has std 0; divide it by 1 instead so it becomes all
# zeros rather than NaN/inf, which would poison the later training step.
safe_std = np.where(std_x == 0, 1.0, std_x)
x = (x - mean_x) / safe_std
# Split the samples into a training part (first 80%) and a validation
# part (remaining 20%), keeping the original order.
import math
split_at = math.floor(len(x) * 0.8)
x_train_set, x_validation = x[:split_at, :], x[split_at:, :]
y_train_set, y_validation = y[:split_at, :], y[split_at:, :]

# Adagrad (model1: first-order terms only)

dim = 18 * 9 + 1  # 18 features * 9 hours, plus one bias weight
w = np.zeros([dim, 1])
n_train = len(x_train_set)
# Prepend a column of ones to x_train_set so that w[0] acts as the bias.
x = np.concatenate((np.ones([n_train, 1]), x_train_set), axis=1).astype(float)
learning_rate = 2
iter_time = 6000
# Running sum of squared gradients, one entry per weight (Adagrad denominator).
adagrad = np.zeros([dim, 1])
eps = 1e-9
for t in range(iter_time):
    # Prediction error on the training set, reused for loss and gradient.
    residual = np.dot(x, w) - y_train_set
    # RMSE over the training samples. The divisor is the number of training
    # rows; `x` already IS the training matrix here, so scaling its length
    # by 0.8 again would under-count the samples.
    loss = np.sqrt(np.sum(residual ** 2) / n_train)
    if t % 100 == 0:
        print(str(t) + ':' + str(loss))
    # Gradient of the summed squared error with respect to w: 2 * X^T (Xw - y).
    gradient = 2 * np.dot(x.transpose(), residual)
    adagrad += gradient ** 2
    # Per-weight step size: learning_rate / sqrt(accumulated squared gradients).
    w = w - learning_rate * gradient / np.sqrt(adagrad + eps)
np.save('weight_model1.npy', w)



# Adagrad (model2: first-order and second-order terms)
# NOTE: this whole section is disabled (commented out). As written it would
# train on x_train_set / y_train_set and save the result to 'weight_model2.npy'.
# Feature layout: column 2*k holds feature k, column 2*k+1 holds its square.

# dim = 18*9*2+1
# w = np.zeros([dim,1])
# x_train_set_model2 = np.empty([len(x_train_set),18*9*2],dtype = float)
# for round in range(2):
#     for length in range(18*9):
#         for number in range(len(x_train_set)):
#             if round == 0:
#                 x_train_set_model2[number,length*2] = x_train_set[number,length]
#             else:
#                 x_train_set_model2[number,length*2+1] = x_train_set[number,length]**2
# x = np.concatenate((np.ones([len(x_train_set_model2),1]),x_train_set_model2),axis = 1).astype(float)
# #new_numpy = np.concatenate((np.ones([1,len(x_train_set_model2)]),np.negative(x_train_set_model2.transpose())),axis = 0).astype(float)#np.negative() negates every value in the matrix
# learning_rate = 2
# iter_time = 6000
# adagrad = np.zeros([dim,1])
# eps = 1e-9
# for t in range(iter_time):
#     loss = np.sqrt(np.sum(np.power(np.dot(x,w)-y_train_set,2))/len(x_train_set))#np.sqrt() takes the square root, np.power(x,y) raises x to the power y
#     if(t%100 == 0):
#         print(str(t)+':'+str(loss))
#     gradient = 2*np.dot(x.transpose(),np.dot(x,w)-y_train_set)#x.transpose() is the matrix transpose, np.dot is the matrix product
#     adagrad += gradient**2#adagrad stores the denominator; it is also a dim*1 matrix because every parameter gets its own step size
#     w = w - learning_rate*gradient/np.sqrt(adagrad+eps)
# np.save('weight_model2.npy',w)


# Validation-set preprocessing for model2
# NOTE: disabled (commented out). As written it would expand x_validation to
# the interleaved [feature, feature**2] layout and prepend a column of ones.

# x_validation_model2 = np.empty([len(x_validation),18*9*2],dtype = float)
# for round in range(2):
#     for length in range(18*9):
#         for number in range(len(x_validation)):
#             if round == 0:
#                 x_validation_model2[number,length*2] = x_validation[number,length]
#             else:
#                 x_validation_model2[number,length*2+1] = x_validation[number,length]**2
# x_validation_model2 = np.concatenate((np.ones([len(x_validation),1]),x_validation_model2),axis = 1).astype(float)

# Validation

# model1 validation: prepend the bias column of ones to the validation
# inputs, then predict with the trained weights.
bias_column = np.ones([len(x_validation), 1])
x_validation = np.concatenate((bias_column, x_validation), axis=1).astype(float)
ans_y = x_validation @ w

# model2 validation (disabled)
#ans_y = np.dot(x_validation_model2, w)

# Store the model1 validation results: one row per validation sample with
# the prediction next to the ground-truth value.

import csv
with open('model1.csv', mode='w', newline='') as submit_file:
    csv_writer = csv.writer(submit_file)
    # Every row carries three fields, so the header names all three.
    csv_writer.writerow(['id', 'value', 'label'])
    # Iterate over the actual number of predictions instead of a fixed count.
    for i, (predicted, actual) in enumerate(zip(ans_y, y_validation)):
        csv_writer.writerow(['id_' + str(i), predicted[0], actual[0]])


# Store the model2 validation results
# NOTE: disabled (commented out). As written it would write 1131 rows of
# (id, prediction) to 'model2.csv'.
# import csv
# with open('model2.csv',mode = 'w',newline='') as submit_file:
#     csv_writer = csv.writer(submit_file)
#     header = ['id','value']
#     csv_writer.writerow(header)
#     for i in range(1131):
#         row = ['id_' + str(i), ans_y[i][0]]
#         csv_writer.writerow(row)

2.test.py

import sys
import pandas as pd
import numpy as np
# Read the test CSV; header=None because the file has no header row.
test = pd.read_csv('D:/学习笔记/李宏毅/work1/hw1/test.csv', header=None, encoding='big5')
# Keep the columns from positional index 2 onward. An explicit copy is taken
# so the 'NR' replacement below writes to an independent frame rather than
# to a slice of `test`.
test_data = test.iloc[:, 2:].copy()
test_data[test_data == 'NR'] = 0
test_data = test_data.to_numpy()
# Each of the 240 test samples occupies 18 consecutive rows (one per
# feature) of 9 hourly values; flatten them row by row so that columns
# 9*k .. 9*k+8 hold feature k.
x_test = np.empty([240, 18 * 9])
for total in range(240):
    x_test[total, :] = test_data[18 * total:18 * (total + 1), :].reshape(-1)
# Standardize every column of x_test to zero mean and unit variance.
# NOTE(review): the statistics are computed from the test set itself, while
# train.py standardizes with the training set's mean/std. The learned weights
# therefore see slightly differently scaled inputs here - consider saving the
# training mean/std and reusing them.
mean_x = np.mean(x_test, axis=0)
std_x = np.std(x_test, axis=0)
# A constant column has std 0; divide it by 1 instead so it becomes all
# zeros rather than NaN/inf.
safe_std = np.where(std_x == 0, 1.0, std_x)
x_test = (x_test - mean_x) / safe_std
# Expand to the model2 feature layout: even columns carry the standardized
# inputs, odd columns carry their squares. This is the same interleaving
# used by the (commented-out) model2 training code in train.py.
x = np.empty([len(x_test), 18 * 9 * 2], dtype=float)
x[:, 0::2] = x_test
x[:, 1::2] = x_test ** 2
# Prepend a column of ones so the first weight acts as the bias.
x = np.concatenate((np.ones([len(x_test), 1]), x), axis=1).astype(float)
# Load the model2 weights and predict one value per test sample.
# NOTE: 'weight_model2.npy' is only written by the model2 training section of
# train.py, which is commented out in that script as listed.
w = np.load('weight_model2.npy')
ans_y = x @ w
# Write the predictions as (id, value) rows.
import csv
with open('test_model2.csv', mode='w', newline='') as submit_file:
    csv_writer = csv.writer(submit_file)
    csv_writer.writerow(['id', 'value'])
    csv_writer.writerows(
        ['id_' + str(i), prediction[0]] for i, prediction in enumerate(ans_y)
    )

你可能感兴趣的:(深度学习,python,机器学习)