李宏毅2020机器学习课程作业1

李宏毅2020机器学习课程作业1

平台:Windows
编程软件:VS Code

完整代码如下(Python):
# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import csv
# Layout constants of the homework data set, as used throughout the script.
_N_FEATURES = 18          # measurements recorded for every hour
_PM25_ROW = 9             # row of the PM2.5 reading inside the 18 features
_N_MONTHS = 12
_DAYS_PER_MONTH = 20      # days of data available per month
_HOURS_PER_DAY = 24
_HOURS_PER_MONTH = _DAYS_PER_MONTH * _HOURS_PER_DAY      # 480
_WINDOW = 9               # hours of history fed to the model
_SAMPLES_PER_MONTH = _HOURS_PER_MONTH - _WINDOW          # 471
_N_TEST_SAMPLES = 240


def _group_by_month(raw_data):
    """Regroup the 4320 x 24 training table into 12 arrays of 18 x 480.

    Each consecutive group of 18 rows in ``raw_data`` is one day (18 features
    by 24 hours).  The 20 days of a month are laid side by side so that every
    month becomes one continuous 480-hour timeline.

    Returns a dict mapping month index (0-11) to its float array.
    """
    month_data = {}
    for month in range(_N_MONTHS):
        # np.empty leaves the buffer uninitialised; every cell is overwritten
        # by the day loop below.
        sample = np.empty([_N_FEATURES, _HOURS_PER_MONTH])
        for day in range(_DAYS_PER_MONTH):
            first_row = _N_FEATURES * (_DAYS_PER_MONTH * month + day)
            sample[:, day * _HOURS_PER_DAY : (day + 1) * _HOURS_PER_DAY] = \
                raw_data[first_row : first_row + _N_FEATURES, :]
        month_data[month] = sample
    return month_data


def _build_samples(month_data):
    """Slide a 9-hour window over every month to build features and targets.

    A 480-hour month yields 471 samples.  Each sample is the flattened
    18 x 9 window (162 values); its target is the PM2.5 value of the hour
    immediately after the window.

    Returns ``(x, y)`` with shapes (12 * 471, 162) and (12 * 471, 1).
    """
    x = np.empty([_N_MONTHS * _SAMPLES_PER_MONTH, _N_FEATURES * _WINDOW], dtype = float)
    y = np.empty([_N_MONTHS * _SAMPLES_PER_MONTH, 1], dtype = float)
    for month in range(_N_MONTHS):
        timeline = month_data[month]
        for start in range(_SAMPLES_PER_MONTH):
            row = month * _SAMPLES_PER_MONTH + start
            x[row, :] = timeline[:, start : start + _WINDOW].reshape(-1)
            y[row, 0] = timeline[_PM25_ROW, start + _WINDOW]
    return x, y


def _standardize(features, mean, std):
    """Standardise ``features`` column-wise, in place, and return it.

    Columns whose standard deviation is zero are left untouched to avoid a
    division by zero.
    """
    usable = std != 0
    features[:, usable] = (features[:, usable] - mean[usable]) / std[usable]
    return features


def _train_adagrad(x, y, learning_rate = 0.3, iter_time = 5000, eps = 0.0000000001):
    """Fit linear-regression weights with Adagrad.

    ``x`` must already contain the leading bias column.  ``eps`` keeps the
    Adagrad denominator away from zero.  The RMSE of every iteration is
    recorded, and printed every 100 iterations.

    Returns ``(w, lossl)``: the weight column vector and the loss history.
    """
    dim = x.shape[1]
    w = np.zeros([dim, 1])
    lossl = np.zeros([iter_time, 1])
    adagrad = np.zeros([dim, 1])
    for t in range(iter_time):
        # Compute the residual once; it feeds both the loss and the gradient.
        residual = np.dot(x, w) - y
        loss = np.sqrt(np.sum(np.power(residual, 2)) / _SAMPLES_PER_MONTH / _N_MONTHS)  # RMSE
        lossl[t] = loss
        if t % 100 == 0:
            print(str(t) + ":" + str(loss))
        gradient = 2 * np.dot(x.transpose(), residual)  # dim x 1
        adagrad += gradient ** 2
        w = w - learning_rate * gradient / np.sqrt(adagrad + eps)
    return w, lossl


def main():
    """Train a PM2.5 linear regressor on train.csv and predict test.csv.

    Reads ``./train.csv`` and ``./test.csv`` (big5 encoded), writes the
    learned weights to ``weight.npy`` and the predictions to ``submit.csv``,
    and shows the training-loss curve.
    """
    data = pd.read_csv('./train.csv', encoding = 'big5')

    # Drop the first three descriptive columns.  Copy explicitly so the masked
    # assignment below works on an independent frame rather than on a slice
    # of ``data`` (avoids pandas' chained-assignment warning).
    data = data.iloc[:, 3:].copy()
    data[data == 'NR'] = 0  # 'NR' entries are treated as 0
    raw_data = data.to_numpy()

    month_data = _group_by_month(raw_data)
    x, y = _build_samples(month_data)

    # Standardisation; the training statistics are reused for the test set.
    mean_x = np.mean(x, axis = 0)  # per-column mean, length 18 * 9
    std_x = np.std(x, axis = 0)    # per-column standard deviation, length 18 * 9
    _standardize(x, mean_x, std_x)

    # 80 / 20 split of the samples.
    # NOTE(review): the split is only printed; training below still uses the
    # complete data set, exactly as the original script did.
    x_train_set = x[: math.floor(len(x) * 0.8), :]
    y_train_set = y[: math.floor(len(y) * 0.8), :]
    x_validation = x[math.floor(len(x) * 0.8): , :]
    y_validation = y[math.floor(len(y) * 0.8): , :]
    print(x_train_set)
    print(y_train_set)
    print(x_validation)
    print(y_validation)
    print(len(x_train_set))
    print(len(y_train_set))
    print(len(x_validation))
    print(len(y_validation))

    # Prepend a column of ones so the first weight acts as the bias term.
    x = np.concatenate((np.ones([_N_MONTHS * _SAMPLES_PER_MONTH, 1]), x), axis = 1).astype(float)
    w, lossl = _train_adagrad(x, y)
    np.save('weight.npy', w)
    plt.plot(lossl)
    plt.show()

    # Testing: build 240 samples of 18 * 9 features (+ bias) the same way.
    testdata = pd.read_csv('./test.csv', header = None, encoding = 'big5')
    test_data = testdata.iloc[:, 2:].copy()  # copy for the same reason as above
    test_data[test_data == 'NR'] = 0
    test_data = test_data.to_numpy()
    test_x = np.empty([_N_TEST_SAMPLES, _N_FEATURES * _WINDOW], dtype = float)
    for i in range(_N_TEST_SAMPLES):
        test_x[i, :] = test_data[_N_FEATURES * i : _N_FEATURES * (i + 1), :].reshape(-1)
    _standardize(test_x, mean_x, std_x)
    test_x = np.concatenate((np.ones([_N_TEST_SAMPLES, 1]), test_x), axis = 1).astype(float)

    # Prediction: reload the saved weights and apply them to the test data.
    w = np.load('weight.npy')
    ans_y = np.dot(test_x, w)

    # Save the predictions to a CSV file, echoing every row to stdout.
    with open('submit.csv', mode='w', newline='') as submit_file:
        csv_writer = csv.writer(submit_file)
        header = ['id', 'value']
        print(header)
        csv_writer.writerow(header)
        for i in range(_N_TEST_SAMPLES):
            row = ['id_' + str(i), ans_y[i][0]]
            csv_writer.writerow(row)
            print(row)

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()

代码及数据集:
链接:https://pan.baidu.com/s/1-bBgIY3kXkVBXVecnHW3xA
提取码:aonp

你可能感兴趣的:(李宏毅2020机器学习课程作业1)