Hung-yi Lee Machine Learning Homework: PM2.5 Prediction

import pandas as pd
import numpy as np
import math
import csv

Data Preprocessing

# Read the training csv file; it will be converted to a matrix below
# read_csv loads the csv file; the file uses Big5 encoding
data = pd.read_csv("/Users/tiger/Desktop/study/机器学习/李宏毅机器学习/李宏毅机器学习资料/数据/hw1/train.csv", encoding="big5")
# print(data)
# Keep all rows, drop the first three columns (they are labels, not measurements)
data = data.iloc[:, 3:]
# Replace the 'NR' entries of RAINFALL with 0; "no rain" can reasonably be recorded as 0
data[data == "NR"] = 0
# Convert the DataFrame to a numpy array
arr_data = data.to_numpy()

# The result is a 2-D array with 12 * 20 * 18 rows and 24 columns per row
arr_data
array([['14', '14', '14', ..., '15', '15', '15'],
       ['1.8', '1.8', '1.8', ..., '1.8', '1.8', '1.8'],
       ['0.51', '0.41', '0.39', ..., '0.35', '0.36', '0.32'],
       ...,
       ['36', '55', '72', ..., '118', '100', '105'],
       ['1.9', '2.4', '1.9', ..., '1.5', '2', '2'],
       ['0.7', '0.8', '1.8', ..., '1.6', '1.8', '2']], dtype=object)
"""来的思路就是:把一个月中的每一天的数据放到宏观的一行上去,这样一个月的每个小时就会连接起来,
题目要求是输入连续的9个小时的数据来预测第10个小时的PM2.5的值,这样做会增加我们的数据量,
因为如果将每一天割裂开来看的话,每天只有24个小时,这样得到每天有15组数据,如果将每个月的20天连在一起看的话,会增加我们的数据量"""
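As a quick sanity check of those counts, here is a minimal sketch (the variable names are hypothetical, not part of the homework code) of the arithmetic above:

# Sample-count arithmetic from the explanation above (hypothetical variable names).
hours_per_month = 20 * 24                      # 20 recorded days per month, 24 hours each
window = 9                                     # 9 consecutive input hours per sample
samples_per_month = hours_per_month - window   # 471: the 10th hour of the last window would fall outside the month
samples_per_day = 24 - window                  # 15: what splitting by day would give
print(samples_per_month, samples_per_day)      # 471 15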

# Create an empty dict that will map each month to its data
month_data = {}

# Fill the dict created above
for month in range(12):
    # Create an empty matrix to hold this month's data
    # Following the idea above, each month's 20 days are concatenated: 18 features per hour give 18 rows, and 20 days of 24 hours give 24 * 20 columns
    sample = np.empty([18, 24 * 20])  # np.empty creates an uninitialized matrix of the given shape
    # Copy each day of the month into the corresponding columns of sample
    for day in range(20):
        # Take this day's 18 rows from arr_data and place them at the day's position in sample
        sample[:, (day * 24):(day + 1) * 24] = arr_data[18 * (month * 20 + day) : 18 * (month * 20 + day + 1), :]
    # Store the month's data in the dict
    month_data[month] = sample

# Create an empty matrix for the training inputs; as explained above, every 9 consecutive hours form one sample
# Each row stores one sample: 9 hours with 18 features each, hence 18 * 9 = 162 columns
# There are 471 samples per month rather than 472 because the last 9-hour window has no 10th hour to use as the label; with 12 months that gives 12 * 471 rows
x = np.empty([12 * 471, 18 * 9], dtype=float)  # np.empty creates an uninitialized matrix; dtype sets the element type

# Create an empty matrix for the labels, i.e. the PM2.5 value of the 10th hour of each sample
y = np.empty([12 * 471, 1], dtype=float)

# Fill the two matrices created above
for month in range(12):
    for day in range(20):
        for hours in range(24):
            # On the last day of a month, windows starting after hour 14 would run past the 480 available hours, so skip them
            if day == 19 and hours > 14:
                continue
            # Left side: the row for this sample; right side: take the 9-hour slice from this month's data and flatten it into one row
            x[month * 471 + day * 24 + hours, :] = month_data[month][:, day * 24 + hours : day * 24 + hours + 9].reshape(1, -1)  # reshape(1, -1) flattens the slice into a single row

            # Write the PM2.5 value of the 10th hour into y (row index 9 of the month matrix is PM2.5)
            y[month * 471 + day * 24 + hours, 0] = month_data[month][9, day * 24 + hours + 9]
    
# Data processing is now done: x holds all the input samples and y holds the corresponding labels.
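As a quick check (a small sketch, assuming the cells above ran as written), the resulting shapes should be:

# Sanity check of the arrays built above.
print(month_data[0].shape)  # (18, 480): 18 features, 20 days * 24 hours
print(x.shape)              # (5652, 162): 12 * 471 samples, 18 * 9 features each
print(y.shape)              # (5652, 1): one PM2.5 label per sample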

Data Standardization and Splitting the Training Set

z-score standardization
z-score standardization, also called standard-deviation standardization, rescales the data using its mean and standard deviation so that the result has mean 0 and standard deviation 1.
The transformation is $x^* = (x - \mu) / \sigma$, where $\mu$ is the mean of all sample values and $\sigma$ is their standard deviation.
The z-score method is appropriate when the minimum and maximum of an attribute are unknown, or when there are outliers beyond the usual range; it assumes the original data is roughly Gaussian, otherwise it works poorly.
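For reference, the formula above can also be applied in one vectorized step. This is a minimal sketch equivalent to the element-wise loop used below; it assumes x is the (12 * 471, 18 * 9) float matrix built earlier and is an alternative to the loop, not meant to be run in addition to it:

# Vectorized z-score standardization (alternative to the loop below).
mean_x = np.mean(x, axis=0)
std_x = np.std(x, axis=0)
safe_std = np.where(std_x == 0, 1.0, std_x)  # avoid dividing by zero for constant features
x_standardized = (x - mean_x) / safe_std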

# Data standardization
# z-score standardization is used here
# Compute the mean of every feature
mean_x = np.mean(x, axis=0)  # axis=i means operate along the i-th axis; for a 2-D matrix, axis=0 works down the columns and axis=1 across the rows
# Compute the standard deviation of every feature
std_x = np.std(x, axis=0)  # np.std computes the standard deviation

for i in range(len(x)):  # len(x) is 471 * 12
    for j in range(len(x[0])):  # len(x[0]) is 18 * 9
        if std_x[j] != 0:
            x[i][j] = (x[i][j] - mean_x[j]) / std_x[j]
# Split the data 8:2 into a training set and a validation set
x_train_set = x[:math.floor(len(x) * 0.8), :]  # math.floor rounds down
y_train_set = y[:math.floor(len(y) * 0.8), :]
x_validation = x[math.floor(len(x) * 0.8):, :]
y_validation = y[math.floor(len(y) * 0.8):, :]
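As a side note, the same split could also be done with scikit-learn; this is a small sketch, not part of the original code, and shuffle=False keeps the sequential split used above (shuffling is scikit-learn's default):

# Alternative 8:2 split using scikit-learn (not used in the rest of this post).
from sklearn.model_selection import train_test_split
x_train_set, x_validation, y_train_set, y_validation = train_test_split(
    x, y, test_size=0.2, shuffle=False)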

Training the Model

# Train with a loss function and gradient descent; the loss function is the root-mean-square error
# dim is the number of parameters; the +1 makes room for the bias term
dim = 18 * 9 + 1
w = np.ones([dim, 1])  # np.ones returns an all-ones matrix, here with dim rows and 1 column
# Prepend a column of ones to x so the bias is handled by the first weight
# ============ Note: the full x is used here, not just the training split ==================
x = np.concatenate((np.ones([12 * 471, 1]), x), axis=1).astype(float)  # concatenate joins arrays; astype converts the element type

# Learning rate
learning_rate = 100

# Number of iterations
iter_time = 1000

# Adagrad divides each parameter's learning rate by the root of the sum of its past squared gradients
adagrad = np.zeros([dim, 1])
eps = 0.0000000001

# Training loop: RMSE loss with Adagrad updates
for t in range(iter_time):
    # Loss function: root-mean-square error over all samples
    loss = np.sqrt(np.sum(np.power(np.dot(x, w) - y, 2)) / (471 * 12))
    if t % 100 == 0:
        print(str(loss))

    # Gradient of the sum of squared errors
    gradient = 2 * np.dot(x.transpose(), np.dot(x, w) - y)  # for a 2-D matrix, transpose is the matrix transpose; for higher dimensions, e.g. A.shape = (2, 2, 4), A.transpose((1, 0, 2)) swaps axes 0 and 1; dot is matrix multiplication
    adagrad += gradient ** 2

    # Update the weights
    w = w - learning_rate * gradient / np.sqrt(adagrad + eps)
np.save('weight.npy', w)  # save writes the array to a binary file
51.65880600054525
20.77437564222039
14.127460058537881
11.303857670788974
9.757354637574426
8.790692587072341
8.132230172649813
7.657473854962996
7.30155272831573
7.027160886807601
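For reference, the loop above implements plain Adagrad on the squared-error objective. The gradient line computes $\nabla_w \sum_i (x_i^\top w - y_i)^2 = 2 X^\top (X w - y)$, while the printed loss is the RMSE of the same residuals. Each weight is then updated as

$$ w \leftarrow w - \frac{\eta}{\sqrt{\sum_{\tau \le t} g_\tau^2 + \epsilon}} \, g_t , $$

where $g_t$ is the current gradient, the sum accumulates the squared gradients element-wise (the adagrad array), $\eta$ is learning_rate, and $\epsilon$ is eps, which keeps the denominator nonzero.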

Validating with the Validation Set

# Load the weights trained in the previous step
w = np.load('weight.npy')

# Prepend the column of ones to the validation inputs as well
x_validation = np.concatenate((np.ones([1131, 1]), x_validation), axis=1).astype(float)

# Compute the loss on the validation set
loss = np.sqrt(np.sum(np.power(np.dot(x_validation, w) - y_validation, 2)) / 1131)
print(loss)
6.396197121704969
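The same check can be written without hard-coding 1131; a small sketch, assuming x_validation still holds the 20% split from before the column of ones was prepended:

# Equivalent validation loss, deriving the row count from the split itself.
n_val = x_validation.shape[0]  # 1131 for the 8:2 split used here
x_val_b = np.concatenate((np.ones([n_val, 1]), x_validation), axis=1).astype(float)
loss = np.sqrt(np.sum(np.power(np.dot(x_val_b, w) - y_validation, 2)) / n_val)
print(loss)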

Predicting on the Test Data

# Load the test data
testdata = pd.read_csv('/Users/tiger/Desktop/study/机器学习/李宏毅机器学习/李宏毅机器学习资料/数据/hw1/data/test.csv', header=None, encoding='big5')  # header=None must be given, otherwise read_csv defaults to header=0 and treats the first row as column names
testdata[testdata == 'NR'] = 0
test_data = testdata.iloc[:, 2:]  # keep all rows and take the columns from the 3rd to the end
test_data = test_data.to_numpy()  # convert to a numpy array
test_x = np.empty([240, 18 * 9], dtype=float)  # empty array of 240 rows and 18 * 9 columns to hold the test inputs
for i in range(240):  # 240 test samples in total
    test_x[i, :] = test_data[18 * i: 18 * (i + 1), :].reshape(1, -1)
# Normalize, using exactly the same statistics as the training data
for i in range(len(test_x)):
    for j in range(len(test_x[0])):
        if std_x[j] != 0:
            test_x[i][j] = (test_x[i][j] - mean_x[j]) / std_x[j]
test_x = np.concatenate((np.ones([240, 1]), test_x), axis=1).astype(float)  # prepend a column of ones, giving 240 rows and 163 columns
# Make predictions
w = np.load('weight.npy')
# Predicted values
ans_y = np.dot(test_x, w)
# Write the predictions to a file
with open('result.csv', mode='w', newline='') as submit_file:
    csv_writer = csv.writer(submit_file)
    header = ["id", "value"]
    csv_writer.writerow(header)
    for i in range(240):
        write_row = ['id_' + str(i), ans_y[i][0]]
        csv_writer.writerow(write_row)

Prediction Results

id,value
id_0,10.684860338423547
id_1,16.792247467027792
id_2,21.13290914573666
id_3,4.786672166264047
id_4,26.432101029543443
id_5,20.661216398470728
id_6,22.82256193838437
id_7,29.214571337489453
id_8,17.088067086260295
id_9,60.673227496857464
id_10,18.46631184461022
id_11,8.434982922072763
id_12,59.35574122805243
id_13,50.02059833321406
id_14,20.79550695621464
id_15,12.053554778198952
id_16,33.754835175080906
id_17,66.96743889308122
id_18,-0.8295276120536883
id_19,14.777015305410803
id_20,43.6175032338172
id_21,71.71164467829185
id_22,6.4929465889962525
id_23,18.460168780066326
id_24,13.740735501504256
id_25,36.64832550171499
id_26,23.929430220148795
id_27,69.22176125529859
id_28,8.820272432213695
id_29,56.83516827299004
id_30,21.78811976062332
id_31,6.508471140832372
id_32,0.7105070939855977
id_33,19.409173742757318
id_34,28.49631194484278
id_35,37.184830919331205
id_36,42.638699501863655
id_37,27.92062068906882
id_38,34.00184464463722
id_39,32.776282584725735
id_40,0.3147471150095953
id_41,36.76773597455136
id_42,31.38574765130327
id_43,51.619654408540725
id_44,14.573671096022181
id_45,36.094422477026114
id_46,26.325604056692505
id_47,10.83157207605515
id_48,26.27368393160752
id_49,32.87018829008646
id_50,21.115252402995164
id_51,9.11658185777294
id_52,24.740215118968372
id_53,53.86433229948033
id_54,5.078052964091924
id_55,35.845988273365194
id_56,30.31828590622372
id_57,22.76174887515322
id_58,56.79572297160596
id_59,19.41980835383982
id_60,14.800465707889728
id_61,46.4298823306427
id_62,12.296255144242231
id_63,56.587068330899214
id_64,24.682926968653796
id_65,18.08471500735758
id_66,15.4747777481983
id_67,-1.5714839546253438
id_68,42.81756663081315
id_69,26.814103351740815
id_70,22.15278339231544
id_71,38.91154498276733
id_72,52.78645518848199
id_73,6.633607211863456
id_74,19.43262375530285
id_75,5.4295937891811406
id_76,38.297243422541634
id_77,13.376968247715695
id_78,21.79866468557152
id_79,22.461171385436437
id_80,24.93810899497257
id_81,36.90735643294475
id_82,26.72709247850098
id_83,86.20651363799146
id_84,31.09202882682575
id_85,26.048078588981255
id_86,22.753852116326858
id_87,29.286232026745235
id_88,23.013275219331362
id_89,22.419187499516244
id_90,40.09240468814562
id_91,35.64262994643662
id_92,10.906921649402973
id_93,40.481793701688154
id_94,49.12761549292607
id_95,21.827891816977342
id_96,32.06701565115583
id_97,11.405158819907697
id_98,21.924576837234063
id_99,6.876519069507893
id_100,16.37286087332243
id_101,28.84495778453531
id_102,13.452609812202255
id_103,18.43709336763687
id_104,23.50015647759229
id_105,36.57113161400477
id_106,16.11513000361527
id_107,8.823958814008407
id_108,10.817902482139923
id_109,78.48626325585083
id_110,46.057198070963494
id_111,13.533693145063895
id_112,27.317702868312853
id_113,18.078612385296374
id_114,16.859622181123385
id_115,25.83431782988086
id_116,24.001917149550522
id_117,16.8228637727887
id_118,17.761982799364375
id_119,18.62930797542638
id_120,90.61014485728737
id_121,19.68936709209532
id_122,17.03879198736964
id_123,25.597393006578464
id_124,7.342614024228427
id_125,37.06062273612895
id_126,10.119869757737142
id_127,20.41252056793337
id_128,29.820283281964144
id_129,64.72084276369596
id_130,21.19751919800778
id_131,24.76473026977343
id_132,70.8984026705134
id_133,12.454808924935392
id_134,14.99264556523324
id_135,4.596491768102348
id_136,10.81158755792682
id_137,62.348475327738136
id_138,17.743374282278605
id_139,10.00552822850095
id_140,28.316004854752478
id_141,26.929132700545072
id_142,49.88891166877117
id_143,18.607014064620024
id_144,14.837381630867483
id_145,28.754307275109028
id_146,13.341812892765828
id_147,48.93036067836921
id_148,23.37488606829497
id_149,34.564794236357244
id_150,9.787989988615191
id_151,11.238277525790686
id_152,24.277577033530036
id_153,7.841813189508759
id_154,12.028278920333422
id_155,42.12054647560005
id_156,16.512983574701853
id_157,35.474835784714784
id_158,11.019309333374501
id_159,19.224440755506585
id_160,38.55506886082005
id_161,17.826861486790918
id_162,10.427116128034768
id_163,8.783153033770331
id_164,50.68155380516629
id_165,33.729738809658116
id_166,0.5017788609262963
id_167,12.55657750611676
id_168,59.756278536297884
id_169,14.511272260568928
id_170,63.81949522375305
id_171,44.98301839936411
id_172,29.255027629788973
id_173,21.49224158468537
id_174,63.05990857749928
id_175,25.942458829875548
id_176,21.369667808738164
id_177,39.694681688461124
id_178,10.534607913797082
id_179,29.86544565576032
id_180,13.23069080071577
id_181,12.74458536516137
id_182,54.73714086762352
id_183,51.894349020844466
id_184,10.295565024339165
id_185,37.73732395348945
id_186,24.231082971070673
id_187,64.13898558696462
id_188,6.957416048140537
id_189,56.81016913625055
id_190,34.09906021029121
id_191,5.1871116156991235
id_192,32.07943714442466
id_193,0.06307467581223403
id_194,18.552046412640696
id_195,-3.1542322273857053
id_196,34.0673173295785
id_197,11.99973031590292
id_198,18.440571671476015
id_199,63.398411475723606
id_200,28.544839293955526
id_201,44.27732518305021
id_202,64.1996483242329
id_203,12.863245683452739
id_204,13.004498714142244
id_205,11.990468419218224
id_206,10.67900663677997
id_207,0.017342340157624037
id_208,121.05055732258415
id_209,13.5391969137977
id_210,11.843844090936052
id_211,17.551825360772064
id_212,36.056116567552635
id_213,38.310344915605555
id_214,20.846409274510982
id_215,31.349503047162237
id_216,74.68413290066617
id_217,0.4144316509729382
id_218,9.000804405713932
id_219,32.364929554155786
id_220,15.48355693014387
id_221,17.661845462257443
id_222,111.38530244120676
id_223,18.208975045563406
id_224,19.727670087479993
id_225,57.18039983803029
id_226,5.515303704573245
id_227,13.080393765911577
id_228,7.993930582824756
id_229,13.086480003576824
id_230,48.993489699833916
id_231,15.634289875845017
id_232,54.58982652477382
id_233,35.97284400113751
id_234,20.92766779248563
id_235,40.46786755315091
id_236,65.63489071420496
id_237,36.31885165054494
id_238,16.44903482540684
id_239,14.116096888299431
  • Finally finished this by reading a more experienced author's blog post and working through it myself.
