无偏估计 样本均值等于总体均值
有效性 样本方差小 样本相对集中
最小二乘法 误差平方和最小
import csv, os
import numpy as np
import matplotlib.pyplot as plt
from numpy.linalg import inv
import random
import math
import sys
def ada(X, Y, w, eta, iteration, lambdaL2): #adagrad 梯度下降
s_grad = np.zeros(len(X[0])) #np.zeros(i)返回一个i个零的数组
list_cost = []
for i in range(iteration):
hypo = np.dot(X,w) #全部样本进行运算 hypo是个向量里面有471*12个元素
loss = hypo - Y #计算运算后与实际的误差 向量里面有471*12个元素
cost = np.sum(loss**2)/len(X) #求和 均方误差
list_cost.append(cost)
grad = np.dot(X.T, loss)/len(X) + lambdaL2*w #计算梯度的公式 可以通过对loss function对Wi求导就会得出梯度是Xi与loss的乘积 求b的时候没有乘Xi 所以要在X里面加一项1
s_grad += grad**2 #梯度和均方根有同样的元素个数 梯度的元素个数和参数的个数相同 18*9+1=163
ada = np.sqrt(s_grad) #adagrad方法分母为均方根
w = w - eta*grad/ada
return w, list_cost
def SGD(X, Y, w, eta, iteration, lambdaL2):
list_cost = []
for i in range(iteration):
hypo = np.dot(X,w)
loss = hypo - Y
cost = np.sum(loss**2)/len(X)
list_cost.append(cost)
rand = np.random.randint(0, len(X))
grad = X[rand]*loss[rand]/len(X) + lambdaL2*w
w = w - eta*grad
return w, list_cost
def GD(X, Y, w, eta, iteration, lambdaL2):
list_cost = []
for i in range(iteration):
hypo = np.dot(X,w)
loss = hypo - Y
cost = np.sum(loss**2)/len(X)
list_cost.append(cost)
grad = np.dot(X.T, loss)/len(X) + lambdaL2 * w
w = w - eta*grad
return w, list_cost
# 每一个维度储存一种污染物的咨询
data = []
for i in range(18):
data.append([])
#read data
n_row = 0
text = open('data/train.csv', 'r', encoding='big5')
row = csv.reader(text, delimiter=',')
for r in row:
if n_row != 0:
for i in range(3,27):
if r[i] != "NR":
data[(n_row-1)%18].append(float(r[i]))
else:
data[(n_row-1)%18].append(float(0))
n_row = n_row + 1
text.close
#parse data to trainX and trainY
x = []
y = []
for i in range(12): #这是月份
for j in range(471): #每月取多少个时间点 一天24个小时 一共20天 24*20-9 减9是为了能让他取到最后的九个
x.append([])
for t in range(18):
for s in range(9):
x[471*i + j].append(data[t][480*i+j+s]) #每一个时间点下取之后的九个时间点的数据 并取18种污染物
y.append(data[9][480*i+j+9]) #取每个时间点下的第十个小时后的PM2.5
trainX = np.array(x) #每一行有9*18个数 每9个代表9天的某一种污染物
trainY = np.array(y)
#parse test data
test_x = []
n_row = 0
text = open('data/test.csv' ,"r")
row = csv.reader(text , delimiter= ",") #按行读
for r in row: #每种污染物某个时间点前九个小时的值 共18种是一个list
if n_row %18 == 0: #每遍历18行就让test_x加一
test_x.append([])
for i in range(2,11):
test_x[n_row//18].append(float(r[i]) ) #n_row'//'18表示整除
else :
for i in range(2,11):
if r[i] !="NR":
test_x[n_row//18].append(float(r[i]))
else:
test_x[n_row//18].append(0)
n_row = n_row+1
text.close()
test_x = np.array(test_x)
#parse anser
ans_y = []
n_row = 0
text = open('data/ans.csv', "r")
row = csv.reader(text, delimiter=",")
for r in row:
ans_y.append(r[1])
ans_y = ans_y[1:]
ans_y = np.array(list(map(int, ans_y)))
# add bias
test_x = np.concatenate((np.ones((test_x.shape[0],1)),test_x), axis=1)
trainX = np.concatenate((np.ones((trainX.shape[0],1)), trainX), axis=1)
#train data
w = np.zeros(len(trainX[0]))
w_sgd, cost_list_sgd = SGD(trainX, trainY, w, eta=0.0001, iteration=20000, lambdaL2=0)
# w_sgd50, cost_list_sgd50 = SGD(trainX, trainY, w, eta=0.0001, iteration=20000, lambdaL2=50)
w_ada, cost_list_ada = ada(trainX, trainY, w, eta=1, iteration=20000, lambdaL2=0)
# w_gd, cost_list_gd = SGD(trainX, trainY, w, eta=0.0001, iteration=20000, lambdaL2=0)
#close form
w_cf = inv(trainX.T.dot(trainX)).dot(trainX.T).dot(trainY)
cost_wcf = np.sum((trainX.dot(w_cf)-trainY)**2) / len(trainX)
hori = [cost_wcf for i in range(20000-3)]
#output testdata
y_ada = np.dot(test_x, w_ada)
y_sgd = np.dot(test_x, w_sgd)
y_cf = np.dot(test_x, w_cf)
#csv format
ans = []
for i in range(len(test_x)):
ans.append(["id_"+str(i)])
a = np.dot(w_ada,test_x[i])
ans[i].append(a)
filename = "result/predict.csv"
text = open(filename, "w+")
s = csv.writer(text,delimiter=',',lineterminator='\n')
s.writerow(["id","value"])
for i in range(len(ans)):
s.writerow(ans[i])
text.close()
#plot training data with different gradiant method
plt.plot(np.arange(len(cost_list_ada[3:])), cost_list_ada[3:], 'b', label="ada")
plt.plot(np.arange(len(cost_list_sgd[3:])), cost_list_sgd[3:], 'g', label='sgd')
# plt.plot(np.arange(len(cost_list_sgd50[3:])), cost_list_sgd50[3:], 'c', label='sgd50')
# plt.plot(np.arange(len(cost_list_gd[3:])), cost_list_gd[3:], 'r', label='gd')
plt.plot(np.arange(len(cost_list_ada[3:])), hori, 'y--', label='close-form')
plt.title('Train Process')
plt.xlabel('Iteration')
plt.ylabel('Loss Function(Quadratic)')
plt.legend()
plt.savefig(os.path.join(os.path.dirname(__file__), "figures/TrainProcess"))
plt.show()
#plot fianl answer
plt.figure()
plt.subplot(131)
plt.title('CloseForm')
plt.xlabel('dataset')
plt.ylabel('pm2.5')
plt.plot(np.arange((len(ans_y))), ans_y, 'r,')
plt.plot(np.arange(240), y_cf, 'b')
plt.subplot(132)
plt.title('ada')
plt.xlabel('dataset')
plt.ylabel('pm2.5')
plt.plot(np.arange((len(ans_y))), ans_y, 'r,')
plt.plot(np.arange(240), y_ada, 'g')
plt.subplot(133)
plt.title('sgd')
plt.xlabel('dataset')
plt.ylabel('pm2.5')
plt.plot(np.arange((len(ans_y))), ans_y, 'r,')
plt.plot(np.arange(240), y_sgd, 'b')
plt.tight_layout()
plt.savefig(os.path.join(os.path.dirname(__file__), "figures/Compare"))
plt.show()
参考贴https://blog.csdn.net/qq_36183881/article/details/91874895