Machine Learning in Action code (Python 3.6): Regression

# A few small NumPy helpers used below
# 1. mat
#
# mat converts the target data to the matrix type.
#
# 2. zeros
#
# zeros creates an all-zeros array of the given shape.
#
# 3. ones
#
# ones creates an all-ones array of the given shape.
#
# 4. eye
#
# eye creates an identity matrix with the given number of rows.
#
# 5. .T
#
# .T, applied to a matrix, returns its transpose.
#
# 6. tolist
#
# tolist converts a matrix into a nested Python list.
#
# 7. getA()
#
# getA() is a numpy.matrix method that converts the matrix to an ndarray, equivalent to np.asarray(self).
#
# 8. .I
#
# .I returns the inverse of a matrix, which comes up constantly in these computations: for a matrix A, its inverse B is the matrix such that AB = I (I being the identity matrix).
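
A quick demo of these helpers (a standalone sketch, written with the import-numpy-as-np style rather than the wildcard import used below):

import numpy as np

A = np.mat([[1, 2], [3, 4]])  # mat: build a matrix from nested lists
Z = np.zeros((2, 3))          # zeros: 2x3 array of 0.0
O = np.ones((2, 3))           # ones: 2x3 array of 1.0
E = np.eye(2)                 # eye: 2x2 identity matrix
print(A.T)                    # transpose: [[1 3], [2 4]]
print(A.tolist())             # matrix -> nested list: [[1, 2], [3, 4]]
print(A.getA())               # matrix -> ndarray, same as np.asarray(A)
print(A * A.I)                # A times its inverse is (numerically) the identity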


from numpy import *
import matplotlib.pyplot as plt

def load_data_set(file_name):
	num_feat = len(open(file_name).readline().split('\t')) - 1
	data_mat = []
	label_mat = []
	fr = open(file_name)
	for line in fr.readlines():
		line_arr = []
		cur_line = line.strip().split('\t')
		for i in range(num_feat):
			line_arr.append(float(cur_line[i]))
		data_mat.append(line_arr)
		label_mat.append(float(cur_line[-1]))  # the last column of each line is assumed to be the target value
	return data_mat, label_mat
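
For reference, load_data_set expects tab-separated lines whose last field is the target value. A minimal sketch with a throwaway file (the file name tiny.txt is illustrative, not one of the book's data sets):

with open('tiny.txt', 'w') as f:
	f.write('1.0\t0.5\t1.7\n')  # two features, then the target value
	f.write('1.0\t0.9\t2.1\n')
x, y = load_data_set('tiny.txt')
print(x)  # [[1.0, 0.5], [1.0, 0.9]]
print(y)  # [1.7, 2.1]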


def stand_regress(x_arr, y_arr):
	x_mat = mat(x_arr)
	y_mat = mat(y_arr)
	xTx = x_mat.T * x_mat
	if linalg.det(xTx) == 0.0:  # a zero determinant means the matrix has no inverse
		print('this matrix is singular, cannot do inverse')
		return
	ws = xTx.I * (x_mat.T * y_mat.T)  # unlike the book's code, y_mat needs .T here because it was not transposed above
	return ws
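
stand_regress solves the ordinary least squares normal equation ws = (X^T X)^{-1} X^T y. A quick sanity check against numpy.linalg.lstsq (the synthetic data below is illustrative only):

x_demo = [[1.0, 0.0], [1.0, 1.0], [1.0, 2.0]]  # first column of 1s acts as the intercept
y_demo = [1.0, 2.0, 3.1]
print(stand_regress(x_demo, y_demo))                # roughly [[0.98], [1.05]]
print(linalg.lstsq(x_demo, y_demo, rcond=None)[0])  # should agree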

def lwlr(test_point, x_arr, y_arr, k=1.0):
	x_mat = mat(x_arr)
	y_mat = mat(y_arr)
	m = shape(x_mat)[0]
	weights = mat(eye(m))  # start from an m x m identity matrix of weights
	for j in range(m):
		diff_mat = test_point - x_mat[j, :]
		weights[j, j] = exp(diff_mat*diff_mat.T/(-2.0*k**2))  # weights decay exponentially with distance from test_point
	xTx = x_mat.T * (weights*x_mat)
	if linalg.det(xTx) == 0.0:
		print('this matrix is singular, cannot do inverse')
		return
	ws = xTx.I * (x_mat.T * (weights*y_mat.T))
	return test_point*ws
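
For reference, each call to lwlr solves the locally weighted normal equation ws = (X^T W X)^{-1} X^T W y, where W is diagonal with W[j, j] = exp(-||x - x_j||^2 / (2 k^2)); the kernel width k controls how quickly the influence of points far from test_point falls off.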

def lwlr_test(test_arr, x_arr, y_arr, k=1.0):
	m = shape(test_arr)[0]
	y_hat = zeros(m)
	for i in range(m):
		y_hat[i] = lwlr(test_arr[i], x_arr, y_arr, k)  # refit the local model at every query point
	return y_hat

# if __name__ == '__main__':
#     x_arr, y_arr = load_data_set('ex0.txt')
#     # print(x_arr[0:2])
#     # ws = stand_regress(x_arr, y_arr)
#     # print(ws)
#     # x_mat = mat(x_arr)
#     # y_mat = mat(y_arr)
#     # y_hat = x_mat * ws
#     # fig = plt.figure()
#     # ax = fig.add_subplot(111)
#     # ax.scatter(x_mat[:, 1].flatten().A[0], y_mat.T[:, 0].flatten().A[0])
#     # plt.show()
#     #
#     # x_copy = x_mat.copy()
#     # x_copy.sort(0)
#     # y_hat = x_copy * ws
#     # ax.plot(x_copy[:, 1], y_hat)
#     k = float(0.0015)
#     while k < 0.05:
# 	    y_hat = lwlr_test(x_arr, x_arr, y_arr, k)
# 	    x_mat = mat(x_arr)
# 	    y_mat = mat(y_arr)
# 	    srt_ind = x_mat[:, 1].argsort(0)
# 	    x_sort = x_mat[srt_ind][:, 0, :]
# 	    fig = plt.figure()
# 	    ax = fig.add_subplot(111)
# 	    ax.plot(x_sort[:,1], y_hat[srt_ind])
# 	    ax.scatter(x_mat[:, 1].flatten().A[0], y_mat.T.flatten().A[0], s=2, c='red')
# 	    plt.title('K = '+str(k)[0:6])
# 	    plt.savefig('E:\Li_Python\Regression\k_png\k_is_'+str(k)[0:6].replace('.', '_')+'.png')
# 	    plt.show()
# 	    k += 0.0005

def rss_error(y_arr, y_hat_arr):
	return ((y_arr-y_hat_arr)**2).sum()  # residual sum of squares


# if __name__ == '__main__':
#     abx, aby = load_data_set('abalone.txt')
#     y_hat01 = lwlr_test(abx[0:99], abx[0:99], aby[0:99], 0.1)
#     y_hat1 = lwlr_test(abx[0:99], abx[0:99], aby[0:99], 1)
#     y_hat10 = lwlr_test(abx[0:99], abx[0:99], aby[0:99], 10)
#     print('k=0.1 : ', rss_error(aby[0:99], y_hat01.T))
#     print('k=1 : ', rss_error(aby[0:99], y_hat1.T))
#     print('k=10 : ', rss_error(aby[0:99], y_hat10.T))
#
#     y_hat01 = lwlr_test(abx[100:199], abx[0:99], aby[0:99], 0.1)
#     y_hat1 = lwlr_test(abx[100:199], abx[0:99], aby[0:99], 1)
#     y_hat10 = lwlr_test(abx[100:199], abx[0:99], aby[0:99], 10)
#     print('k=0.1 : ', rss_error(aby[100:199], y_hat01.T))
#     print('k=1 : ', rss_error(aby[100:199], y_hat1.T))
#     print('k=10 : ', rss_error(aby[100:199], y_hat10.T))

def ridge_regress(x_mat, y_mat, lam=0.2):
	xTx = x_mat.T * x_mat
	denom = xTx + eye(shape(x_mat)[1])*lam  # add lam down the diagonal
	if linalg.det(denom) == 0.0:
		print('this matrix is singular, cannot do inverse')
		return
	ws = denom.I * (x_mat.T * y_mat)
	return ws
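
Ridge regression solves ws = (X^T X + lam*I)^{-1} X^T y; the lam*I term guarantees an invertible system even when X^T X is singular. A small illustration (the rank-deficient toy matrix is made up for the demo):

x_sing = mat([[1.0, 2.0], [2.0, 4.0]])  # rank 1, so x_sing.T * x_sing is singular
print(linalg.det(x_sing.T * x_sing))                 # 0.0
print(linalg.det(x_sing.T * x_sing + eye(2) * 0.2))  # about 5.04, so now invertible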

def ridge_test(x_arr, y_arr):
	x_mat = mat(x_arr)
	y_mat = mat(y_arr).T
	y_mean = mean(y_mat, 0)  # column mean; see the note on ndarray.mean below
	y_mat = y_mat - y_mean
	x_mean = mean(x_mat, 0)
	x_var = var(x_mat, 0)
	x_mat = (x_mat-x_mean)/x_var  # standardize each feature (the book divides by the variance)
	num_test_pts = 30
	w_mat = zeros((num_test_pts, shape(x_mat)[1]))
	for i in range(num_test_pts):
		ws = ridge_regress(x_mat, y_mat, exp(i-10))  # sweep lambda on a log scale, from exp(-10) up to exp(19)
		w_mat[i,:] = ws.T
	return w_mat
	
	
# if __name__ == '__main__':
#     abx, aby = load_data_set('abalone.txt')
#     ridge_weights = ridge_test(abx, aby)
#     fig = plt.figure()
#     ax = fig.add_subplot(111)
#     ax.plot(ridge_weights)
#     plt.savefig('log_lambda.png')
#     plt.show()


def regularize(x_mat):  # standardize each column: subtract the mean, divide by the variance
	in_mat = x_mat.copy()
	in_means = mean(in_mat, 0)  # column means
	in_var = var(in_mat, 0)     # column variances
	in_mat = (in_mat - in_means)/in_var
	return in_mat


def stage_wise(x_arr, y_arr, eps=0.1, num_it=100):
	x_mat = mat(x_arr)
	y_mat = mat(y_arr).T  # .T is needed here to turn y into a column vector
	y_mean = mean(y_mat, 0)
	y_mat = y_mat - y_mean
	x_mat = regularize(x_mat)
	m, n = shape(x_mat)
	return_mat = zeros((num_it, n))
	ws = zeros((n, 1))
	ws_test = ws.copy()
	ws_max = ws.copy()
	for i in range(num_it):
		print(ws.T)
		lowest_error = inf
		for j in range(n):
			for sign in [-1, 1]:
				ws_test = ws.copy()
				ws_test[j] += eps*sign
				y_test = x_mat * ws_test
				rsse = rss_error(y_mat.A, y_test.A)
				if rsse < lowest_error:
					lowest_error = rsse
					ws_max = ws_test
		ws = ws_max.copy()
		return_mat[i, :] = ws.T
	return return_mat

if __name__ == '__main__':
	x_arr, y_arr = load_data_set('abalone.txt')
	print(stage_wise(x_arr, y_arr, 0.01, 200))
	print('-'*50)
	weights = stage_wise(x_arr, y_arr, 0.001, 5000)  # compute the long run once, then print and plot it
	print(weights)
	fig = plt.figure()
	ax = fig.add_subplot(111)
	ax.plot(weights)
	plt.show()

### Forward stagewise regression (a greedy algorithm)
Pseudocode:

Standardize the data to zero mean and unit variance
For each iteration:
    Set the current lowest error lowest_error to +infinity (inf)
    For each feature:
        For increasing and for decreasing the coefficient:
            Change one coefficient to get a new w
            Compute the error under the new w
            If the error is lower than lowest_error:
                Set w_best to the current w
    Set w to the new w_best

From the NumPy docs: ndarray.mean([axis, dtype, out, keepdims])
Returns the average of the array elements along the given axis.
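
For example, axis=0 averages down the columns and axis=1 across the rows:

m = mat([[1.0, 2.0], [3.0, 5.0]])
print(mean(m, 0))  # [[2.  3.5]] -- column means
print(mean(m, 1))  # [[1.5] [4. ]] -- row means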


The main advantage of stepwise linear regression is that it helps you understand the current model and improve it: once a model is built, you can run the algorithm to find the important features, which makes it possible to stop collecting the unimportant ones in time (see the sketch below).
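
A hedged sketch of that idea: the last row of the matrix returned by stage_wise holds the final coefficients, and features whose weights never move off zero are candidates to drop (the 1e-8 threshold is an arbitrary choice for the demo):

x_arr, y_arr = load_data_set('abalone.txt')
weights = stage_wise(x_arr, y_arr, 0.001, 5000)
final_ws = weights[-1]  # coefficients after the last iteration
for idx, w in enumerate(final_ws):
	status = 'kept' if abs(w) > 1e-8 else 'stayed at zero'
	print('feature %d: weight %.4f (%s)' % (idx, w, status))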

