基于最小二乘法的一般多元线性回归的实战

文章目录

  • 一、说明
  • 二、数据项说明
  • 三、实战部分

一、说明

我是在jupyter完成的,然后导出成markdown格式,ipynb文件导出为markdown的命令如下:

jupyter nbconvert --to markdown  xxx.ipynb

源代码和数据文件,点击这里获取

二、数据项说明

	Name		Data Type	Meas.	Description
	----		---------	-----	-----------
	Sex		nominal			M, F, and I (infant)
	Length		continuous	mm	Longest shell measurement
	Diameter	continuous	mm	perpendicular to length
	Height		continuous	mm	with meat in shell
	Whole weight	continuous	grams	whole abalone
	Shucked weight	continuous	grams	weight of meat
	Viscera weight	continuous	grams	gut weight (after bleeding)
	Shell weight	continuous	grams	after being dried
	Rings		integer			+1.5 gives the age in years

现在有9个数据字段,前面8个是特征值,最后一个Rings为预测目标,具体请查阅文件内容

三、实战部分

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
dataframe01 = pd.read_excel('abalone.xlsx', sheet_name='data')
dataframe01.head(10)
Sex Length Diameter Height Whole weight Shucked weight Viscera weight Shell weight Rings
0 M 0.455 0.365 0.095 0.5140 0.2245 0.1010 0.150 15
1 M 0.350 0.265 0.090 0.2255 0.0995 0.0485 0.070 7
2 F 0.530 0.420 0.135 0.6770 0.2565 0.1415 0.210 9
3 M 0.440 0.365 0.125 0.5160 0.2155 0.1140 0.155 10
4 I 0.330 0.255 0.080 0.2050 0.0895 0.0395 0.055 7
5 I 0.425 0.300 0.095 0.3515 0.1410 0.0775 0.120 8
6 F 0.530 0.415 0.150 0.7775 0.2370 0.1415 0.330 20
7 F 0.545 0.425 0.125 0.7680 0.2940 0.1495 0.260 16
8 M 0.475 0.370 0.125 0.5095 0.2165 0.1125 0.165 9
9 F 0.550 0.440 0.150 0.8945 0.3145 0.1510 0.320 19
# 查看数据容量 
dataframe01.shape
(4177, 9)
dataframe01.columns # 特征名字
Index(['Sex', 'Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight',
       'Viscera weight', 'Shell weight', 'Rings'],
      dtype='object')
# Data cleaning
# Encode the nominal Sex column as integers: I -> 0, F -> 1, M -> 2.
dataframe02 = dataframe01.copy()

# Assign the whole column in one step. The chained form
# `dataframe02.Sex[mask] = value` triggers SettingWithCopyWarning and does not
# write through to the frame when pandas Copy-on-Write is enabled.
# When every value is one of I/F/M the result is an integer column rather than
# an object column; any other value would become NaN.
dataframe02['Sex'] = dataframe01['Sex'].map({'I': 0, 'F': 1, 'M': 2})

dataframe02.head(10)
Sex Length Diameter Height Whole weight Shucked weight Viscera weight Shell weight Rings
0 2 0.455 0.365 0.095 0.5140 0.2245 0.1010 0.150 15
1 2 0.350 0.265 0.090 0.2255 0.0995 0.0485 0.070 7
2 1 0.530 0.420 0.135 0.6770 0.2565 0.1415 0.210 9
3 2 0.440 0.365 0.125 0.5160 0.2155 0.1140 0.155 10
4 0 0.330 0.255 0.080 0.2050 0.0895 0.0395 0.055 7
5 0 0.425 0.300 0.095 0.3515 0.1410 0.0775 0.120 8
6 1 0.530 0.415 0.150 0.7775 0.2370 0.1415 0.330 20
7 1 0.545 0.425 0.125 0.7680 0.2940 0.1495 0.260 16
8 2 0.475 0.370 0.125 0.5095 0.2165 0.1125 0.165 9
9 1 0.550 0.440 0.150 0.8945 0.3145 0.1510 0.320 19
# 导入线性回归的库
from sklearn.linear_model import LinearRegression as LR
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
data_index = list(dataframe01.columns)
data_index
['Sex',
 'Length',
 'Diameter',
 'Height',
 'Whole weight',
 'Shucked weight',
 'Viscera weight',
 'Shell weight',
 'Rings']
# 获取特征矩阵X 的index
X_index = data_index[0:-1]
Y_index = data_index[-1]
X_index, Y_index
(['Sex',
  'Length',
  'Diameter',
  'Height',
  'Whole weight',
  'Shucked weight',
  'Viscera weight',
  'Shell weight'],
 'Rings')
X = dataframe02[X_index]
X.head()
Sex Length Diameter Height Whole weight Shucked weight Viscera weight Shell weight
0 2 0.455 0.365 0.095 0.5140 0.2245 0.1010 0.150
1 2 0.350 0.265 0.090 0.2255 0.0995 0.0485 0.070
2 1 0.530 0.420 0.135 0.6770 0.2565 0.1415 0.210
3 2 0.440 0.365 0.125 0.5160 0.2155 0.1140 0.155
4 0 0.330 0.255 0.080 0.2050 0.0895 0.0395 0.055
Y = dataframe02[Y_index]
Y.head()
0    15
1     7
2     9
3    10
4     7
Name: Rings, dtype: int64
# 划分训练集和测试集
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,Y,test_size=0.2,random_state=420)
Xtrain.head()
Sex Length Diameter Height Whole weight Shucked weight Viscera weight Shell weight
2763 0 0.550 0.425 0.135 0.6560 0.2570 0.1700 0.203
439 2 0.500 0.415 0.165 0.6885 0.2490 0.1380 0.250
1735 2 0.670 0.520 0.165 1.3900 0.7110 0.2865 0.300
751 2 0.485 0.355 0.120 0.5470 0.2150 0.1615 0.140
1626 1 0.570 0.450 0.135 0.7805 0.3345 0.1850 0.210
Ytrain.head()
2763    10
439     13
1735    11
751     10
1626     8
Name: Rings, dtype: int64
# Renumber the rows of every split from 0 after the shuffled split
for split in (Xtrain, Xtest, Ytrain, Ytest):
    split.index = range(len(split))
Xtrain.head()   # 查看X训练集头部
Sex Length Diameter Height Whole weight Shucked weight Viscera weight Shell weight
0 0 0.550 0.425 0.135 0.6560 0.2570 0.1700 0.203
1 2 0.500 0.415 0.165 0.6885 0.2490 0.1380 0.250
2 2 0.670 0.520 0.165 1.3900 0.7110 0.2865 0.300
3 2 0.485 0.355 0.120 0.5470 0.2150 0.1615 0.140
4 1 0.570 0.450 0.135 0.7805 0.3345 0.1850 0.210
Ytrain.head()
0    10
1    13
2    11
3    10
4     8
Name: Rings, dtype: int64
# NOTE: no feature standardisation is applied in this walkthrough. If a scaler
# is added, fit it on the training set only and then use the fitted scaler to
# transform both the training set and the test set.

# Build the model: fit linear regression on the training split
reg = LR().fit(Xtrain, Ytrain)
yhat = reg.predict(Xtest) # predicted values (yhat) for the test split
yhat.min()
4.22923686878166
yhat.max()
22.656846035572762
reg.coef_ # w,系数向量
array([  0.40527178,  -0.88791132,  13.01662939,  10.39250886,
         9.64127293, -20.87747601, -10.50683081,   7.70632772])
Xtrain.columns
Index(['Sex', 'Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight',
       'Viscera weight', 'Shell weight'],
      dtype='object')
[*zip(Xtrain.columns,reg.coef_)]
[('Sex', 0.4052717783379893),
 ('Length', -0.8879113179582045),
 ('Diameter', 13.016629389061475),
 ('Height', 10.39250886428478),
 ('Whole weight', 9.64127293101552),
 ('Shucked weight', -20.87747600529615),
 ('Viscera weight', -10.506830809919672),
 ('Shell weight', 7.706327719866024)]
# 特征说明

Name            Data Type   Meas.   Description
----            ---------   -----   -----------
Sex             nominal             M, F, and I (infant)
Length          continuous  mm      Longest shell measurement
Diameter        continuous  mm      perpendicular to length
Height          continuous  mm      with meat in shell
Whole weight    continuous  grams   whole abalone
Shucked weight  continuous  grams   weight of meat
Viscera weight  continuous  grams   gut weight (after bleeding)
Shell weight    continuous  grams   after being dried
Rings           integer             +1.5 gives the age in years

# 截距
reg.intercept_
2.7888240054011835
# 自定义最小二乘法尝试
def my_least_squares(x_array, y_array):
    '''
    Fit ordinary least squares with an intercept term.

    :param x_array: array-like of m samples by n features (list of lists,
        ndarray or DataFrame); a 1-D sequence is treated as one feature
    :param y_array: array-like holding the m target values
    :return: (coef, intercept) -- coef is a list with the n regression
        coefficients in column order, intercept is a float
    '''
    # Force a float dtype: object-dtype input (for example a DataFrame whose
    # column holds Python ints) is rejected by the numpy.linalg routines.
    features = np.asarray(x_array, dtype=float)
    if features.ndim == 1:
        features = features.reshape(-1, 1)
    targets = np.asarray(y_array, dtype=float).ravel()

    # Prepend a column of ones, so the FIRST solved weight is the intercept
    # and the remaining n weights line up with the feature columns.
    design = np.column_stack((np.ones(features.shape[0]), features))

    # Minimise ||design @ w - targets||^2. This yields the same solution as the
    # normal equations (X^T X)^-1 X^T y, but lstsq does not fail when X^T X is
    # singular and avoids forming an explicit inverse.
    weights = np.linalg.lstsq(design, targets, rcond=None)[0]

    intercept = float(weights[0])
    coef = weights[1:].tolist()

    return coef, intercept
# debug中
my_least_squares(Xtrain,list(Ytrain))
# 梯度下降法尝试
def costFunc(X,Y,theta):
    '''
    Cost function: half the mean of the squared residuals of X*theta.T
    against Y.

    NOTE(review): `*` is used as a matrix product, so X and theta are
    presumably np.matrix objects (m*n and 1*n) and Y an m*1 matrix --
    confirm against the callers.
    '''
    residual = X * theta.T - Y
    squared_total = np.sum(np.power(residual, 2))
    sample_count = len(X)
    return squared_total / (2 * sample_count)

def gradientDescent(X,Y,theta,alpha,iters):
    '''
    Batch gradient descent for linear regression.

    :param X: m*n design matrix (np.matrix or array-like); include a column
        of ones beforehand if an intercept is wanted
    :param Y: the m target values (m*1 matrix or 1-D array-like)
    :param theta: initial parameters, 1*n
    :param alpha: learning rate
    :param iters: number of iterations to run
    :return: (theta, cost) -- theta is the final 1*n np.matrix, cost is an
        ndarray of length iters with the costFunc value after each update
    '''
    # np.asmatrix replaces np.mat, which was removed in NumPy 2.0. Coercing
    # here also lets plain arrays/lists be passed and pins the expected
    # shapes (Y as a column, theta as a row), so a mis-shaped Y cannot
    # silently broadcast into an m*m error matrix.
    X = np.asmatrix(X, dtype=float)
    Y = np.asmatrix(Y, dtype=float).reshape(-1, 1)
    theta = np.asmatrix(theta, dtype=float).reshape(1, -1)

    sample_count = X.shape[0]
    cost = np.zeros(iters)
    for i in range(iters):
        error = X * theta.T - Y
        # error.T * X is the 1*n row of sum(error * X[:, j]) for every j, so
        # all parameters are updated simultaneously from the same error.
        # Rebinding theta (instead of writing into a shared buffer) keeps the
        # caller's initial theta untouched.
        theta = theta - alpha * (error.T * X) / sample_count
        cost[i] = costFunc(X,Y,theta)

    return theta,cost

你可能感兴趣的:(数据挖掘,python数据挖掘)