As a beginner in machine learning who wants to understand things more thoroughly, I have added my own understanding on top of the original author's comments:
import pandas as pd
import numpy as np

# Data preprocessing
def dataProcess(df):
    x_list, y_list = [], []
    # Replace the given value in df: fill 'NR' (no rain) entries with 0
    df = df.replace(['NR'], [0.0])
    # astype() converts the element dtype of the array
    array = np.array(df).astype(float)
    # Split the dataset into data frames
    for i in range(0, 4320, 18):  # 18 features per day, so step 18 rows per iteration, i.e. one day; 20 days * 12 months = 240 days in total
        for j in range(24 - 9):  # 24-9=15, so 15 samples per day; range(15) yields 0, 1, ..., 14
            mat = array[i:i + 18, j:j + 9]  # mat is an 18x9 2-D array
            label = array[i + 9, j + 9]  # row 10 is PM2.5; column j+9 is the target label
            x_list.append(mat)  # append the data frame and its label to x_list and y_list respectively
            y_list.append(label)  # after the loop, x_list holds 240*15=3600 frames and y_list holds 3600 labels
    x = np.array(x_list)  # data frames, 18x9 each
    y = np.array(y_list)  # labels
    '''
    # Scale each feature row to the range [0, 1], which should help gradient descent,
    # but in my experiments it did not work well
    for i in range(18):
        if np.max(x[:, i, :]) != 0:
            x[:, i, :] /= np.max(x[:, i, :])
    '''
    return x, y, array

# Update parameters and train the model
def train(x_train, y_train, epoch):
    bias = 0  # initialize the bias
    weights = np.ones(9)  # initialize the weights, one for each of the 9 PM2.5 values
    learning_rate = 1  # initial learning rate
    reg_rate = 0.001  # regularization coefficient
    bg2_sum = 0  # accumulated squared gradients of the bias
    wg2_sum = np.zeros(9)  # accumulated squared gradients of the weights
    for i in range(epoch):
        b_g = 0
        w_g = np.zeros(9)
        # Compute the gradient of Loss_label over all training data
        for j in range(3200):
            b_g += (y_train[j] - weights.dot(x_train[j, 9, :]) - bias) * (-1)  # x_train[j, 9, :] is all 9 PM2.5 values of the j-th frame
            for k in range(9):  # each of the 9 weights has its own gradient component
                w_g[k] += (y_train[j] - weights.dot(x_train[j, 9, :]) - bias) * (-x_train[j, 9, k])
        # Average the gradients
        b_g /= 3200  # partial derivative of the loss w.r.t. b
        w_g /= 3200  # partial derivative of the loss w.r.t. w
        # Add the gradient of Loss_regularization w.r.t. w
        for m in range(9):
            w_g[m] += reg_rate * weights[m]
        # Adagrad: accumulate the squared gradients
        bg2_sum += b_g ** 2
        wg2_sum += w_g ** 2
        # Update the weights and the bias
        bias -= learning_rate / bg2_sum ** 0.5 * b_g
        weights -= learning_rate / wg2_sum ** 0.5 * w_g
        # Every 200 epochs, print the loss on the training set
        if i % 200 == 0:
            loss = 0
            for j in range(3200):
                loss += (y_train[j] - weights.dot(x_train[j, 9, :]) - bias) ** 2
            print('after {} epochs, the loss on train data is:'.format(i), loss / 3200)
    return weights, bias

# Evaluate the model on the validation set
def validate(x_val, y_val, weights, bias):
    loss = 0
    for i in range(400):
        loss += (y_val[i] - weights.dot(x_val[i, 9, :]) - bias) ** 2
    return loss / 400

def main():
    # Read the useful columns from the csv
    # Depending on where you got the dataset, its encoding may differ;
    # if reading fails, add encoding='gb18030' to the arguments
    df = pd.read_csv('train.csv', usecols=range(3, 27))  # take the last 24 columns, one per hour
    x, y, _ = dataProcess(df)
    # Simple split: the first 3200 frames for training, the last 400 for validation
    x_train, y_train = x[0:3200], y[0:3200]
    x_val, y_val = x[3200:3600], y[3200:3600]
    epoch = 2000  # number of training epochs
    # Start training
    w, b = train(x_train, y_train, epoch)
    # Check the result on the validation set
    loss = validate(x_val, y_val, w, b)
    print('The loss on val data is:', loss)

if __name__ == '__main__':
    main()
"E:\python program\venv\Scripts\python.exe" "E:/python program/venv/pm2.5.py"
after 0 epochs, the loss on train data is: 955.3009375
after 200 epochs, the loss on train data is: 49.86823677027294
after 400 epochs, the loss on train data is: 46.20101423801224
after 600 epochs, the loss on train data is: 44.88913061600439
after 800 epochs, the loss on train data is: 44.26903588227097
after 1000 epochs, the loss on train data is: 43.950109190566856
after 1200 epochs, the loss on train data is: 43.78092633274224
after 1400 epochs, the loss on train data is: 43.68982565130423
after 1600 epochs, the loss on train data is: 43.640314303297686
after 1800 epochs, the loss on train data is: 43.61322589236443
The loss on val data is: 40.35422383809947
Process finished with exit code 0
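The inner loops above compute, for each epoch, the gradient of the averaged squared error with respect to the bias and the 9 weights, then apply an Adagrad update (the learning rate is divided by the square root of the accumulated squared gradients). The same computation can be expressed with NumPy matrix operations, which avoids the per-sample Python loops and runs much faster. A minimal sketch of the idea, not part of the original code (train_vectorized is a hypothetical helper; it assumes x_train has shape (N, 18, 9) with the PM2.5 row at index 9):

import numpy as np

def train_vectorized(x_train, y_train, epoch, learning_rate=1, reg_rate=0.001):
    X = x_train[:, 9, :]          # (N, 9): the 9 PM2.5 values of every frame
    y = y_train                   # (N,) labels
    bias, weights = 0.0, np.ones(9)
    bg2_sum, wg2_sum = 0.0, np.zeros(9)
    for _ in range(epoch):
        err = y - X.dot(weights) - bias                                # (N,) residuals
        b_g = -err.mean()                                              # dLoss/db
        w_g = -(X * err[:, None]).mean(axis=0) + reg_rate * weights    # dLoss/dw plus the regularization term
        bg2_sum += b_g ** 2                                            # Adagrad accumulators
        wg2_sum += w_g ** 2
        bias -= learning_rate / bg2_sum ** 0.5 * b_g
        weights -= learning_rate / wg2_sum ** 0.5 * w_g
    return weights, bias

This mirrors the loop version exactly (including the omitted factor of 2 that is absorbed into the learning rate), so it should produce the same weights and bias given the same data and epoch count.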
Building on this program with my own modifications and optimizations helps deepen my understanding and practice.
Here I use an n-fold cross-validation-like method (it seems to differ from the cross-validation described online, so treat it as my own variation, O(∩_∩)O), with n = 10: the 3600 frames are split into 10 equal parts, each part is used once as the validation set with the remaining 9 parts as the training set, and 10 linear regressions are run in total. I then keep the model (i.e. the w, b pair) with the smallest validation loss among the 10 runs. This does not feel entirely sound, since the selected model may simply be lucky on its fold, so I do not recommend it. For time reasons the number of epochs is reduced to 200.
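For comparison, standard k-fold cross-validation averages the validation losses of the k folds to estimate generalization error, rather than keeping the single best fold. A minimal sketch of that variant, reusing the train and validate functions from the code below (which loop over 3240 training and 360 validation samples); k_fold_cv is a hypothetical helper, not part of the original code:

def k_fold_cv(x, y, k=10, epoch=200):
    fold_size = len(x) // k  # 3600 // 10 = 360 frames per fold
    losses = []
    for i in range(k):
        # fold i is the validation set, the remaining folds form the training set
        x_val, y_val = x[i * fold_size:(i + 1) * fold_size], y[i * fold_size:(i + 1) * fold_size]
        x_train = np.concatenate((x[:i * fold_size], x[(i + 1) * fold_size:]))
        y_train = np.concatenate((y[:i * fold_size], y[(i + 1) * fold_size:]))
        w, b = train(x_train, y_train, epoch)
        losses.append(validate(x_val, y_val, w, b))
    # the averaged validation loss is the cross-validation estimate
    return sum(losses) / k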
import pandas as pd
import numpy as np

# Data preprocessing
def dataProcess(df):
    x_list, y_list = [], []
    # Replace the given value in df: fill 'NR' (no rain) entries with 0
    df = df.replace(['NR'], [0.0])
    # astype() converts the element dtype of the array
    array = np.array(df).astype(float)
    # Split the dataset into data frames
    for i in range(0, 4320, 18):  # 18 features per day, one day per iteration; 20 days * 12 months = 240 days in total
        for j in range(24 - 9):  # 24-9=15, so 15 samples per day; range(15) yields 0, 1, ..., 14
            mat = array[i:i + 18, j:j + 9]  # mat is an 18x9 2-D array
            label = array[i + 9, j + 9]  # row 10 is PM2.5; column j+9 is the target label
            x_list.append(mat)  # append the data frame and its label to x_list and y_list respectively
            y_list.append(label)  # after the loop, x_list holds 240*15=3600 frames and y_list holds 3600 labels
    x = np.array(x_list)  # data frames, 18x9 each
    y = np.array(y_list)  # labels
    '''
    # Scale each feature row to the range [0, 1], which should help gradient descent,
    # but in my experiments it did not work well
    for i in range(18):
        if np.max(x[:, i, :]) != 0:
            x[:, i, :] /= np.max(x[:, i, :])
    '''
    return x, y, array

# Update parameters and train the model
def train(x_train, y_train, epoch):
    bias = 0  # initialize the bias
    weights = np.ones(9)  # initialize the weights, one for each of the 9 PM2.5 values
    learning_rate = 1  # initial learning rate
    reg_rate = 0.001  # regularization coefficient
    bg2_sum = 0  # accumulated squared gradients of the bias
    wg2_sum = np.zeros(9)  # accumulated squared gradients of the weights
    for i in range(epoch):
        b_g = 0
        w_g = np.zeros(9)
        # Compute the gradient of Loss_label over all training data (9 folds * 360 = 3240 samples)
        for j in range(3240):
            b_g += (y_train[j] - weights.dot(x_train[j, 9, :]) - bias) * (-1)  # x_train[j, 9, :] is all 9 PM2.5 values of the j-th frame
            for k in range(9):  # each of the 9 weights has its own gradient component
                w_g[k] += (y_train[j] - weights.dot(x_train[j, 9, :]) - bias) * (-x_train[j, 9, k])
        # Average the gradients
        b_g /= 3240  # partial derivative of the loss w.r.t. b
        w_g /= 3240  # partial derivative of the loss w.r.t. w
        # Add the gradient of Loss_regularization w.r.t. w
        for m in range(9):
            w_g[m] += reg_rate * weights[m]
        # Adagrad: accumulate the squared gradients
        bg2_sum += b_g ** 2
        wg2_sum += w_g ** 2
        # Update the weights and the bias
        bias -= learning_rate / bg2_sum ** 0.5 * b_g
        weights -= learning_rate / wg2_sum ** 0.5 * w_g
        # Every 40 epochs, print the loss on the training set
        loss = 0
        for j in range(3240):
            loss += (y_train[j] - weights.dot(x_train[j, 9, :]) - bias) ** 2
        if i % 40 == 0:
            print('after {} epochs, the loss on train data is:'.format(i), loss / 3240)
    return weights, bias

# Evaluate the model on the validation set
def validate(x_val, y_val, weights, bias):
    loss = 0
    for i in range(360):
        loss += (y_val[i] - weights.dot(x_val[i, 9, :].T) - bias) ** 2
    return loss / 360
def main():
    # Read the useful columns from the csv
    # Depending on where you got the dataset, its encoding may differ;
    # if reading fails, add encoding='gb18030' to the arguments
    df = pd.read_csv('train.csv', usecols=range(3, 27))  # take the last 24 columns, one per hour
    x, y, _ = dataProcess(df)
    ## Simple split: the first 3200 frames for training, the last 400 for validation
    #x_train, y_train = x[0:3200], y[0:3200]
    #x_val, y_val = x[3200:3600], y[3200:3600]
    epoch = 200  # number of training epochs
    # Preallocate arrays for the 10 folds
    x_val = np.empty((10, 360, 18, 9))
    y_val = np.empty((10, 360))
    x_train = np.empty((10, 3240, 18, 9))
    y_train = np.empty((10, 3240))
    w = np.empty((10, 9))
    b = np.empty((10, 1))
    loss = np.empty((10, 1))
    # 10-fold split: each time 1/10 of the data is the validation set, the rest is the training set
    for i in range(10):
        x_val[i] = x[360 * i:360 * i + 360]  # one fold as the validation set
        y_val[i] = y[360 * i:360 * i + 360]
        x_train[i] = np.vstack((x[0:360 * i], x[360 * i + 360:3600]))  # concatenate the remaining data as the training set
        y_train[i] = np.append(y[0:360 * i], y[360 * i + 360:3600])
        # Start training
        w[i], b[i] = train(x_train[i], y_train[i], epoch)
        # Check the result on the validation set
        loss[i] = validate(x_val[i], y_val[i], w[i].reshape(1, -1), b[i])
        print('The loss({}) on val data is:'.format(i), loss[i])
    # Keep the w, b pair of the fold with the smallest validation loss; index marks the best fold
    minloss = loss[0]
    index = 0
    for j in range(10):
        if loss[j] < minloss:
            index = j
            minloss = loss[j]
    print(w[index])
    print(b[index])
    print(loss[index])

if __name__ == '__main__':
    main()
Run output:
...........
after 120 epochs, the loss on train data is: 53.671896972263234
after 160 epochs, the loss on train data is: 51.16912102574005
The loss(8) on val data is: [48.29004959]
after 0 epochs, the loss on train data is: 950.8537037037036
after 40 epochs, the loss on train data is: 69.73128533535424
after 80 epochs, the loss on train data is: 58.80185093243073
after 120 epochs, the loss on train data is: 54.27946461521245
after 160 epochs, the loss on train data is: 51.71604410677347
The loss(9) on val data is: [50.29267139]
[ 0.04986149 0.00164052 0.01347303 0.00289513 0.03311725 0.03960047
-0.20365186 0.17383743 0.87260196]
[0.24664116]
[27.2982955]
Process finished with exit code 0
Based on the first version of the source code, only the main function is modified:
def main():
    # Read the useful columns from the csv
    # Depending on where you got the dataset, its encoding may differ;
    # if reading fails, add encoding='gb18030' to the arguments
    df = pd.read_csv('train.csv', usecols=range(3, 27))  # take the last 24 columns, one per hour
    x, y, _ = dataProcess(df)
    # Randomly split into training and validation sets: 8/9 of the frames for training, the remaining 1/9 for validation
    rand_x = np.arange(x.shape[0])  # indices of the data frames, used for shuffling below
    np.random.shuffle(rand_x)  # shuffle the 3600 frame indices
    x_train, y_train = x[rand_x[0:3200]], y[rand_x[0:3200]]
    x_val, y_val = x[rand_x[3200:3600]], y[rand_x[3200:3600]]
    epoch = 2000  # number of training epochs
    # Start training
    w, b = train(x_train, y_train, epoch)
    # Check the result on the validation set
    loss = validate(x_val, y_val, w, b)
    print('The loss on val data is:', loss)
Run output:
"E:\python program\venv\Scripts\python.exe" "E:/python program/venv/pm2.5.py"
after 0 epochs, the loss on train data is: 923.2053124999999
after 200 epochs, the loss on train data is: 49.370681769781115
after 400 epochs, the loss on train data is: 45.68850238288512
after 600 epochs, the loss on train data is: 44.407304475718874
after 800 epochs, the loss on train data is: 43.81675475600828
after 1000 epochs, the loss on train data is: 43.52074281094471
after 1200 epochs, the loss on train data is: 43.367710810290895
after 1400 epochs, the loss on train data is: 43.28735138463021
after 1600 epochs, the loss on train data is: 43.24473095161952
after 1800 epochs, the loss on train data is: 43.221961917771786
The loss on val data is: 43.01102003734101
Process finished with exit code 0
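Because np.random.shuffle reorders the frames differently on every run, the random split above (and therefore the reported losses) changes from run to run. Fixing the random seed makes the split reproducible; a minimal sketch of the idea (the seed value 0 is an arbitrary choice, not from the original code):

np.random.seed(0)                            # fix the seed so the shuffle is reproducible
rand_x = np.random.permutation(x.shape[0])   # a shuffled copy of the frame indices
x_train, y_train = x[rand_x[:3200]], y[rand_x[:3200]]
x_val, y_val = x[rand_x[3200:]], y[rand_x[3200:]]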
There are 24 hourly readings per day and 20 consecutive days of data per month, so each month has 20 × 24 = 480 consecutive time points. Hours 0-8 form one input frame with hour 9 as its label, ..., and hours 470-478 form the last frame with hour 479 as its label. This yields 480 - 10 + 1 = 471 frames with labels per month, and 12 × 471 = 5652 frames with labels over the 12 months.
Modified data preprocessing part:
# Data preprocessing
def dataProcess(df):
    x_list, y_list = [], []
    # Replace the given value in df: fill 'NR' (no rain) entries with 0
    df = df.replace(['NR'], [0.0])
    # astype() converts the element dtype of the array
    array = np.array(df).astype(float)
    # Split the dataset into data frames
    for i in range(0, 4320, 18 * 20):  # 12 months in total, one month per iteration
        for j in range(480 - 9):  # 24*20=480 hours per month; 480-10+1=471 frames with labels per month
            row = i + 18 * (j // 24)  # start row of the day that contains hour j of this month (the day offset is needed so windows after the first day read the correct rows)
            hour = j % 24  # hour of that day at which the window starts
            if hour <= 14:  # the label still falls on the same day
                mat = array[row:row + 18, hour:hour + 9]
                label = array[row + 9, hour + 9]
            elif hour == 15:  # the label is hour 0 of the next day
                mat = array[row:row + 18, 15:24]
                label = array[row + 27, 0]
            else:  # the label is at hour 1 of the next day or later, so the window itself spans two days
                mat = np.hstack((array[row:row + 18, hour:24], array[row + 18:row + 36, 0:hour - 15]))
                label = array[row + 27, hour - 15]
            x_list.append(mat)
            y_list.append(label)
    x = np.array(x_list)  # data frames, 18x9 each
    y = np.array(y_list)  # labels
    print(x.shape)
    '''
    # Scale each feature row to the range [0, 1], which should help gradient descent,
    # but in my experiments it did not work well
    for i in range(18):
        if np.max(x[:, i, :]) != 0:
            x[:, i, :] /= np.max(x[:, i, :])
    '''
    return x, y, array
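The same 471-windows-per-month framing can be expressed more compactly by first rearranging each month's 20 days into a single 18 × 480 matrix, so every window is a plain column slice and no cross-day branching is needed. A minimal sketch of this alternative (dataProcessMonthly is a hypothetical helper, not the author's code; it assumes the same 4320 × 24 array layout):

import numpy as np

def dataProcessMonthly(array):
    # array: 4320 x 24 floats, 18 feature rows per day, 20 days per month, 12 months
    x_list, y_list = [], []
    for m in range(12):
        month = array[m * 360:(m + 1) * 360]  # 360 rows = 18 features * 20 days
        # place the 20 days side by side -> 18 rows x 480 consecutive hours
        month = np.hstack([month[d * 18:(d + 1) * 18] for d in range(20)])
        for j in range(480 - 9):
            x_list.append(month[:, j:j + 9])  # 18 x 9 window of 9 consecutive hours
            y_list.append(month[9, j + 9])    # PM2.5 (row 9) of the following hour
    return np.array(x_list), np.array(y_list)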
With the remaining places that depend on the dataset size changed accordingly, there are 5652 frames in total: 8/9 of them (5024) form the training set and 628 form the validation set. The complete code is as follows:
import pandas as pd
import numpy as np

# Data preprocessing
def dataProcess(df):
    x_list, y_list = [], []
    # Replace the given value in df: fill 'NR' (no rain) entries with 0
    df = df.replace(['NR'], [0.0])
    # astype() converts the element dtype of the array
    array = np.array(df).astype(float)
    # Split the dataset into data frames
    '''
    # Old per-day version: 18 features per day, one day per iteration; 20 days * 12 months = 240 days,
    # 15 samples per day, so 240*15=3600 frames and labels in total
    for i in range(0, 4320, 18):
        for j in range(24 - 9):
            mat = array[i:i + 18, j:j + 9]
            label = array[i + 9, j + 9]
            x_list.append(mat)
            y_list.append(label)
    '''
    for i in range(0, 4320, 18 * 20):  # 12 months in total, one month per iteration
        for j in range(480 - 9):  # 24*20=480 hours per month; 480-10+1=471 frames with labels per month
            row = i + 18 * (j // 24)  # start row of the day that contains hour j of this month (the day offset is needed so windows after the first day read the correct rows)
            hour = j % 24  # hour of that day at which the window starts
            if hour <= 14:  # the label still falls on the same day
                mat = array[row:row + 18, hour:hour + 9]
                label = array[row + 9, hour + 9]
            elif hour == 15:  # the label is hour 0 of the next day
                mat = array[row:row + 18, 15:24]
                label = array[row + 27, 0]
            else:  # the label is at hour 1 of the next day or later, so the window itself spans two days
                mat = np.hstack((array[row:row + 18, hour:24], array[row + 18:row + 36, 0:hour - 15]))
                label = array[row + 27, hour - 15]
            x_list.append(mat)
            y_list.append(label)
    x = np.array(x_list)  # data frames, 18x9 each
    y = np.array(y_list)  # labels
    print(x.shape)
    '''
    # Scale each feature row to the range [0, 1], which should help gradient descent,
    # but in my experiments it did not work well
    for i in range(18):
        if np.max(x[:, i, :]) != 0:
            x[:, i, :] /= np.max(x[:, i, :])
    '''
    return x, y, array
# Update parameters and train the model
def train(x_train, y_train, epoch):
    bias = 0  # initialize the bias
    weights = np.ones(9)  # initialize the weights, one for each of the 9 PM2.5 values
    learning_rate = 1  # initial learning rate
    reg_rate = 0.001  # regularization coefficient
    bg2_sum = 0  # accumulated squared gradients of the bias
    wg2_sum = np.zeros(9)  # accumulated squared gradients of the weights
    for i in range(epoch):
        b_g = 0
        w_g = np.zeros(9)
        # Compute the gradient of Loss_label over all 5024 training samples
        for j in range(5024):
            b_g += (y_train[j] - weights.dot(x_train[j, 9, :]) - bias) * (-1)  # x_train[j, 9, :] is all 9 PM2.5 values of the j-th frame
            for k in range(9):  # each of the 9 weights has its own gradient component
                w_g[k] += (y_train[j] - weights.dot(x_train[j, 9, :]) - bias) * (-x_train[j, 9, k])
        # Average the gradients
        b_g /= 5024  # partial derivative of the loss w.r.t. b
        w_g /= 5024  # partial derivative of the loss w.r.t. w
        # Add the gradient of Loss_regularization w.r.t. w
        for m in range(9):
            w_g[m] += reg_rate * weights[m]
        # Adagrad: accumulate the squared gradients
        bg2_sum += b_g ** 2
        wg2_sum += w_g ** 2
        # Update the weights and the bias
        bias -= learning_rate / bg2_sum ** 0.5 * b_g
        weights -= learning_rate / wg2_sum ** 0.5 * w_g
        # Every 200 epochs, print the loss on the training set
        if i % 200 == 0:
            loss = 0
            for j in range(5024):
                loss += (y_train[j] - weights.dot(x_train[j, 9, :]) - bias) ** 2
            print('after {} epochs, the loss on train data is:'.format(i), loss / 5024)
    return weights, bias

# Evaluate the model on the validation set
def validate(x_val, y_val, weights, bias):
    loss = 0
    for i in range(628):
        loss += (y_val[i] - weights.dot(x_val[i, 9, :]) - bias) ** 2
    return loss / 628

def main():
    # Read the useful columns from the csv
    # Depending on where you got the dataset, its encoding may differ;
    # if reading fails, add encoding='gb18030' to the arguments
    df = pd.read_csv('train.csv', usecols=range(3, 27))  # take the last 24 columns, one per hour
    x, y, _ = dataProcess(df)
    # Randomly split into training and validation sets: 8/9 of the frames for training, the remaining 1/9 for validation
    rand_x = np.arange(x.shape[0])  # indices of the data frames, used for shuffling below
    np.random.shuffle(rand_x)  # shuffle the 5652 frame indices
    x_train, y_train = x[rand_x[0:5024]], y[rand_x[0:5024]]
    x_val, y_val = x[rand_x[5024:5652]], y[rand_x[5024:5652]]
    epoch = 2000  # number of training epochs
    # Start training
    w, b = train(x_train, y_train, epoch)
    # Check the result on the validation set
    loss = validate(x_val, y_val, w, b)
    print('The loss on val data is:', loss)

if __name__ == '__main__':
    main()
Run output:
"E:\python program\venv\Scripts\python.exe" "E:/python program/venv/pm2.5.py"
(5652, 18, 9)
after 0 epochs, the loss on train data is: 784.5989251592357
after 200 epochs, the loss on train data is: 44.15827935199603
after 400 epochs, the loss on train data is: 39.9619536744645
after 600 epochs, the loss on train data is: 38.04672561423071
after 800 epochs, the loss on train data is: 36.95103705270849
after 1000 epochs, the loss on train data is: 36.28281214840936
after 1200 epochs, the loss on train data is: 35.86501915733608
after 1400 epochs, the loss on train data is: 35.60040889807592
after 1600 epochs, the loss on train data is: 35.43139461827926
after 1800 epochs, the loss on train data is: 35.322732865070435
The loss on val data is: 33.05695762754186
Process finished with exit code 0
np.vstack(): stacks arrays vertically (along the first axis).
np.hstack(): stacks arrays horizontally (side by side along the second axis for 2-D arrays).
For example: np.hstack((array1, array2))
Check carefully whether the argument dimensions are correct. I found some frames with shape (18, 8), which is what caused the error.
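A small sketch of how such a shape mismatch shows up: if the second slice is one column short (for instance because an hour index is off by one), the stacked frame ends up as (18, 8) instead of (18, 9), and the later dot product with the 9 weights fails. The array names below are illustrative only:

import numpy as np

day1 = np.zeros((18, 24))  # one day: 18 features x 24 hours
day2 = np.zeros((18, 24))  # the next day

ok = np.hstack((day1[:, 16:24], day2[:, 0:1]))   # 8 + 1 columns -> (18, 9), correct
bad = np.hstack((day1[:, 16:24], day2[:, 0:0]))  # 8 + 0 columns -> (18, 8), too short
print(ok.shape, bad.shape)  # (18, 9) (18, 8)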