我是在jupyter完成的,然后导出成markdown格式,ipynb文件导出为markdown的命令如下:
jupyter nbconvert --to markdown xxx.ipynb
源代码和数据文件,点击这里获取
Name Data Type Meas. Description
---- --------- ----- -----------
Sex nominal M, F, and I (infant)
Length continuous mm Longest shell measurement
Diameter continuous mm perpendicular to length
Height continuous mm with meat in shell
Whole weight continuous grams whole abalone
Shucked weight continuous grams weight of meat
Viscera weight continuous grams gut weight (after bleeding)
Shell weight continuous grams after being dried
Rings integer +1.5 gives the age in years
现在有9个数据字段,前面8个是特征值,最后一个Rings为预测目标,具体请查阅文件内容
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
dataframe01 = pd.read_excel('abalone.xlsx', sheet_name='data')
dataframe01.head(10)
| | Sex | Length | Diameter | Height | Whole weight | Shucked weight | Viscera weight | Shell weight | Rings |
---|---|---|---|---|---|---|---|---|---|
0 | M | 0.455 | 0.365 | 0.095 | 0.5140 | 0.2245 | 0.1010 | 0.150 | 15 |
1 | M | 0.350 | 0.265 | 0.090 | 0.2255 | 0.0995 | 0.0485 | 0.070 | 7 |
2 | F | 0.530 | 0.420 | 0.135 | 0.6770 | 0.2565 | 0.1415 | 0.210 | 9 |
3 | M | 0.440 | 0.365 | 0.125 | 0.5160 | 0.2155 | 0.1140 | 0.155 | 10 |
4 | I | 0.330 | 0.255 | 0.080 | 0.2050 | 0.0895 | 0.0395 | 0.055 | 7 |
5 | I | 0.425 | 0.300 | 0.095 | 0.3515 | 0.1410 | 0.0775 | 0.120 | 8 |
6 | F | 0.530 | 0.415 | 0.150 | 0.7775 | 0.2370 | 0.1415 | 0.330 | 20 |
7 | F | 0.545 | 0.425 | 0.125 | 0.7680 | 0.2940 | 0.1495 | 0.260 | 16 |
8 | M | 0.475 | 0.370 | 0.125 | 0.5095 | 0.2165 | 0.1125 | 0.165 | 9 |
9 | F | 0.550 | 0.440 | 0.150 | 0.8945 | 0.3145 | 0.1510 | 0.320 | 19 |
# 查看数据容量
dataframe01.shape
(4177, 9)
dataframe01.columns # 特征名字
Index(['Sex', 'Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight',
'Viscera weight', 'Shell weight', 'Rings'],
dtype='object')
# Clean the data
# Encode the nominal Sex feature as integers: I (infant) -> 0, F -> 1, M -> 2.
# Assign the whole column at once instead of using chained indexing
# (`df.Sex[mask] = value`), which may write to a temporary copy and leave
# dataframe02 unchanged.
dataframe02 = dataframe01.copy()
dataframe02['Sex'] = dataframe01['Sex'].map({'I': 0, 'F': 1, 'M': 2})
dataframe02.head(10)
| | Sex | Length | Diameter | Height | Whole weight | Shucked weight | Viscera weight | Shell weight | Rings |
---|---|---|---|---|---|---|---|---|---|
0 | 2 | 0.455 | 0.365 | 0.095 | 0.5140 | 0.2245 | 0.1010 | 0.150 | 15 |
1 | 2 | 0.350 | 0.265 | 0.090 | 0.2255 | 0.0995 | 0.0485 | 0.070 | 7 |
2 | 1 | 0.530 | 0.420 | 0.135 | 0.6770 | 0.2565 | 0.1415 | 0.210 | 9 |
3 | 2 | 0.440 | 0.365 | 0.125 | 0.5160 | 0.2155 | 0.1140 | 0.155 | 10 |
4 | 0 | 0.330 | 0.255 | 0.080 | 0.2050 | 0.0895 | 0.0395 | 0.055 | 7 |
5 | 0 | 0.425 | 0.300 | 0.095 | 0.3515 | 0.1410 | 0.0775 | 0.120 | 8 |
6 | 1 | 0.530 | 0.415 | 0.150 | 0.7775 | 0.2370 | 0.1415 | 0.330 | 20 |
7 | 1 | 0.545 | 0.425 | 0.125 | 0.7680 | 0.2940 | 0.1495 | 0.260 | 16 |
8 | 2 | 0.475 | 0.370 | 0.125 | 0.5095 | 0.2165 | 0.1125 | 0.165 | 9 |
9 | 1 | 0.550 | 0.440 | 0.150 | 0.8945 | 0.3145 | 0.1510 | 0.320 | 19 |
# 导入线性回归的库
from sklearn.linear_model import LinearRegression as LR
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
data_index = list(dataframe01.columns)
data_index
['Sex',
'Length',
'Diameter',
'Height',
'Whole weight',
'Shucked weight',
'Viscera weight',
'Shell weight',
'Rings']
# 获取特征矩阵X 的index
X_index = data_index[0:-1]
Y_index = data_index[-1]
X_index, Y_index
(['Sex',
'Length',
'Diameter',
'Height',
'Whole weight',
'Shucked weight',
'Viscera weight',
'Shell weight'],
'Rings')
X = dataframe02[X_index]
X.head()
| | Sex | Length | Diameter | Height | Whole weight | Shucked weight | Viscera weight | Shell weight |
---|---|---|---|---|---|---|---|---|
0 | 2 | 0.455 | 0.365 | 0.095 | 0.5140 | 0.2245 | 0.1010 | 0.150 |
1 | 2 | 0.350 | 0.265 | 0.090 | 0.2255 | 0.0995 | 0.0485 | 0.070 |
2 | 1 | 0.530 | 0.420 | 0.135 | 0.6770 | 0.2565 | 0.1415 | 0.210 |
3 | 2 | 0.440 | 0.365 | 0.125 | 0.5160 | 0.2155 | 0.1140 | 0.155 |
4 | 0 | 0.330 | 0.255 | 0.080 | 0.2050 | 0.0895 | 0.0395 | 0.055 |
Y = dataframe02[Y_index]
Y.head()
0 15
1 7
2 9
3 10
4 7
Name: Rings, dtype: int64
# 划分训练集和测试集
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,Y,test_size=0.2,random_state=420)
Xtrain.head()
| | Sex | Length | Diameter | Height | Whole weight | Shucked weight | Viscera weight | Shell weight |
---|---|---|---|---|---|---|---|---|
2763 | 0 | 0.550 | 0.425 | 0.135 | 0.6560 | 0.2570 | 0.1700 | 0.203 |
439 | 2 | 0.500 | 0.415 | 0.165 | 0.6885 | 0.2490 | 0.1380 | 0.250 |
1735 | 2 | 0.670 | 0.520 | 0.165 | 1.3900 | 0.7110 | 0.2865 | 0.300 |
751 | 2 | 0.485 | 0.355 | 0.120 | 0.5470 | 0.2150 | 0.1615 | 0.140 |
1626 | 1 | 0.570 | 0.450 | 0.135 | 0.7805 | 0.3345 | 0.1850 | 0.210 |
Ytrain.head()
2763 10
439 13
1735 11
751 10
1626 8
Name: Rings, dtype: int64
# Renumber every split from 0 to n-1; the split keeps the original row labels
# (e.g. 2763, 439, ...), which are no longer meaningful afterwards.
for part in (Xtrain, Xtest, Ytrain, Ytest):
    part.index = range(part.shape[0])
Xtrain.head()  # inspect the head of the training feature matrix
| | Sex | Length | Diameter | Height | Whole weight | Shucked weight | Viscera weight | Shell weight |
---|---|---|---|---|---|---|---|---|
0 | 0 | 0.550 | 0.425 | 0.135 | 0.6560 | 0.2570 | 0.1700 | 0.203 |
1 | 2 | 0.500 | 0.415 | 0.165 | 0.6885 | 0.2490 | 0.1380 | 0.250 |
2 | 2 | 0.670 | 0.520 | 0.165 | 1.3900 | 0.7110 | 0.2865 | 0.300 |
3 | 2 | 0.485 | 0.355 | 0.120 | 0.5470 | 0.2150 | 0.1615 | 0.140 |
4 | 1 | 0.570 | 0.450 | 0.135 | 0.7805 | 0.3345 | 0.1850 | 0.210 |
Ytrain.head()
0 10
1 13
2 11
3 10
4 8
Name: Rings, dtype: int64
# NOTE: no standardization step appears before the fit below. If one is added,
# fit the scaler on the training set only, then use the fitted scaler to
# transform both the training set and the test set.
# Build the model: fit a linear regression on the training split
reg = LR().fit(Xtrain, Ytrain)
yhat = reg.predict(Xtest) # predicted target values (yhat) for the test split
yhat.min()
4.22923686878166
yhat.max()
22.656846035572762
reg.coef_ # w,系数向量
array([ 0.40527178, -0.88791132, 13.01662939, 10.39250886,
9.64127293, -20.87747601, -10.50683081, 7.70632772])
Xtrain.columns
Index(['Sex', 'Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight',
'Viscera weight', 'Shell weight'],
dtype='object')
[*zip(Xtrain.columns,reg.coef_)]
[('Sex', 0.4052717783379893),
('Length', -0.8879113179582045),
('Diameter', 13.016629389061475),
('Height', 10.39250886428478),
('Whole weight', 9.64127293101552),
('Shucked weight', -20.87747600529615),
('Viscera weight', -10.506830809919672),
('Shell weight', 7.706327719866024)]
# 特征说明
Name Data Type Meas. Description
Sex nominal M, F, and I (infant)
Length continuous mm Longest shell measurement
Diameter continuous mm perpendicular to length
Height continuous mm with meat in shell
Whole weight continuous grams whole abalone
Shucked weight continuous grams weight of meat
Viscera weight continuous grams gut weight (after bleeding)
Shell weight continuous grams after being dried
Rings integer +1.5 gives the age in years
# 截距
reg.intercept_
2.7888240054011835
# 自定义最小二乘法尝试
def my_least_squares(x_array, y_array):
    """Fit ordinary least squares with an intercept term.

    :param x_array: array-like holding an m*n feature matrix
        (m samples, n features)
    :param y_array: array-like holding the m target values (flat or m*1)
    :return: ``(coef, intercept)`` where ``coef`` is a list with the n
        regression coefficients in feature order and ``intercept`` is a float
    """
    # Force a float dtype: a DataFrame that mixes column types converts to an
    # object array, which the linear-algebra routines reject.
    features = np.asarray(x_array, dtype=float)
    targets = np.asarray(y_array, dtype=float).reshape(-1)
    # Prepend a column of ones so that the FIRST solved weight is the
    # intercept and the remaining n weights are the coefficients.
    design = np.column_stack((np.ones(features.shape[0]), features))
    # lstsq minimises ||design @ w - targets|| without forming an explicit
    # inverse of design.T @ design.
    weights = np.linalg.lstsq(design, targets, rcond=None)[0]
    intercept = float(weights[0])
    coef = weights[1:].tolist()
    return coef, intercept
# debug中
my_least_squares(Xtrain,list(Ytrain))
# 梯度下降法尝试
def costFunc(X, Y, theta):
    """Return the halved mean squared error of a linear model.

    Computes ``sum((X @ theta - Y) ** 2) / (2 * m)`` where m is the number
    of rows of X.

    :param X: m*n feature matrix (np.matrix, ndarray or nested list)
    :param Y: the m target values (m*1 or flat)
    :param theta: the n model parameters (1*n or flat)
    :return: the scalar cost
    """
    # Work on plain float arrays with an explicit matrix product: `*` is a
    # matrix product only for np.matrix and is element-wise for ndarrays.
    features = np.asarray(X, dtype=float)
    weights = np.asarray(theta, dtype=float).reshape(-1)
    targets = np.asarray(Y, dtype=float).reshape(-1)
    residual = features @ weights - targets
    return np.sum(np.power(residual, 2)) / (2 * len(features))
def gradientDescent(X, Y, theta, alpha, iters):
    """Run batch gradient descent for linear regression.

    :param X: m*n feature matrix (np.matrix, ndarray or nested list)
    :param Y: the m target values (m*1 or flat)
    :param theta: initial parameters (1*n or flat); the caller's object is
        not modified
    :param alpha: learning rate
    :param iters: number of iterations to run
    :return: ``(theta, cost)`` where ``theta`` is the final 1*n np.matrix of
        parameters and ``cost`` is an ndarray holding the value of
        ``costFunc`` after each iteration
    """
    # np.asmatrix replaces the np.mat alias, which was removed in NumPy 2.0.
    X = np.asmatrix(X, dtype=float)
    Y = np.asmatrix(Y, dtype=float).reshape(-1, 1)
    theta = np.asmatrix(theta, dtype=float).reshape(1, -1)
    sample_count = len(X)
    cost = np.zeros(iters)
    for i in range(iters):
        error = X * theta.T - Y
        # error.T * X is the 1*n row whose j-th entry is sum(error * X[:, j]),
        # so all parameters are updated simultaneously from the same error.
        gradient = (error.T * X) / sample_count
        theta = theta - alpha * gradient
        cost[i] = costFunc(X, Y, theta)
    return theta, cost