多元线性回归
一、不处理数据直接求解
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from sklearn import linear_model
# Load the house-price dataset; the relative path means house_prices.csv must
# be in the current working directory.
data = pd.read_csv('house_prices.csv')
# Preview the first five rows (rendered as the cell output in a notebook).
data.head()
|   | house_id | neighborhood | area | bedrooms | bathrooms | style | price |
|---|---|---|---|---|---|---|---|
| 0 | 1112 | B | 1188 | 3 | 2 | ranch | 598291 |
| 1 | 491 | B | 3512 | 5 | 3 | victorian | 1744259 |
| 2 | 5952 | B | 1134 | 3 | 2 | ranch | 571669 |
| 3 | 3525 | A | 1940 | 4 | 2 | ranch | 493675 |
| 4 | 5108 | B | 2208 | 6 | 4 | victorian | 1101539 |
# Keep every column except the first one (house_id, a row identifier that
# carries no predictive information).
new_data=data.iloc[:,1:]
# Preview the first five rows of the reduced table.
new_data.head()
|   | neighborhood | area | bedrooms | bathrooms | style | price |
|---|---|---|---|---|---|---|
| 0 | B | 1188 | 3 | 2 | ranch | 598291 |
| 1 | B | 3512 | 5 | 3 | victorian | 1744259 |
| 2 | B | 1134 | 3 | 2 | ranch | 571669 |
| 3 | A | 1940 | 4 | 2 | ranch | 493675 |
| 4 | B | 2208 | 6 | 4 | victorian | 1101539 |
# Correlate the numeric columns only. 'neighborhood' and 'style' hold strings;
# pandas >= 2.0 raises on them in corr() instead of silently skipping them, so
# the numeric subset is selected explicitly (works on every pandas version).
new_data.select_dtypes(include='number').corr()
|   | area | bedrooms | bathrooms | price |
|---|---|---|---|---|
| area | 1.000000 | 0.901623 | 0.891481 | 0.823454 |
| bedrooms | 0.901623 | 1.000000 | 0.972768 | 0.743435 |
| bathrooms | 0.891481 | 0.972768 | 1.000000 | 0.735851 |
| price | 0.823454 | 0.743435 | 0.735851 | 1.000000 |
可以发现 area、bedrooms、bathrooms 与 price 的相关系数分别约为 0.823、0.743、0.736,均有很强的相关性。
下面取 area、bedrooms 和 bathrooms 作为 X,price 作为 Y 求线性回归。
# Select features and target by column name rather than by position, so the
# choice stays correct even if the column order of the CSV changes.
x_data = new_data[['area', 'bedrooms', 'bathrooms']]
y_data = new_data['price']
print(x_data, y_data, len(x_data))
area bedrooms bathrooms
0 1188 3 2
1 3512 5 3
2 1134 3 2
3 1940 4 2
4 2208 6 4
... ... ... ...
6023 757 0 0
6024 3540 5 3
6025 1518 2 1
6026 2270 4 2
6027 3355 5 3
[6028 rows x 3 columns] 0 598291
1 1744259
2 571669
3 493675
4 1101539
...
6023 385420
6024 890627
6025 760829
6026 575515
6027 844747
Name: price, Length: 6028, dtype: int64 6028
# Fit an ordinary least-squares regression of price on the three features.
# fit() returns the estimator itself, so construction and fitting are chained.
model = linear_model.LinearRegression().fit(x_data, y_data)
print("回归系数:", model.coef_)
print("截距:", model.intercept_)
# NOTE: the label '*bathromms +' (sic, "bathrooms") is kept unchanged so the
# printed equation still matches the recorded output.
print(
    '回归方程: price=',
    model.coef_[0], '*area +',
    model.coef_[1], '*bedrooms +',
    model.coef_[2], '*bathromms +',
    model.intercept_,
)
回归系数: [ 345.91101884 -2925.80632467 7345.39171369]
截距: 10072.107046726742
回归方程: price= 345.911018840024 *area + -2925.806324666705 *bedrooms + 7345.391713693825 *bathromms + 10072.107046726742
二、对数据进行清洗后再求解
# Independent working copies for the two cleaning strategies. An explicit
# copy() (instead of an iloc slice) guarantees that the in-place row drops
# performed later never touch new_data and never trigger pandas'
# SettingWithCopyWarning.
new_data_Z = new_data.copy()
new_data_IQR = new_data.copy()
def outlier_test(data, column, method=None, z=2):
    """Find the rows of ``data`` whose ``column`` value is an outlier.

    Two detection rules are supported:

    * ``method=None`` (default): the IQR fence rule. The cut-offs are
      ``q1 - 1.5 * IQR`` and ``q3 + 1.5 * IQR``.
    * ``method='z'``: the z-score rule. The cut-offs are
      ``mean - z * std`` and ``mean + z * std``.

    A row is reported when its value is less than or equal to the lower
    cut-off or greater than or equal to the upper cut-off. The statistics
    that were computed are printed to stdout.

    Args:
        data: The full DataFrame to inspect; it is not modified.
        column: Name of the column to test, e.g. ``'price'``.
        method: ``None`` for the IQR rule or ``'z'`` for the z-score rule.
        z: Number of standard deviations used by the z-score rule
            (ignored by the IQR rule). Defaults to 2.

    Returns:
        A tuple ``(outlier, upper, lower)``: the DataFrame holding the
        outlier rows (original index preserved), the upper cut-off and the
        lower cut-off.

    Raises:
        ValueError: If ``method`` is neither ``None`` nor ``'z'``.
    """
    values = data[column]

    if method is None:
        print(f'以 {column} 列为依据,使用 上下截断点法(iqr) 检测异常值...')
        print('=' * 70)
        # Compute each quartile once and derive the inter-quartile range.
        q1, q3 = np.quantile(values, 0.25), np.quantile(values, 0.75)
        column_iqr = q3 - q1
        upper, lower = (q3 + 1.5 * column_iqr), (q1 - 1.5 * column_iqr)
        print(f'第一分位数: {q1}, 第三分位数:{q3}, 四分位极差:{column_iqr}')
        print(f"上截断点:{upper}, 下截断点:{lower}")
    elif method == 'z':
        print(f'以 {column} 列为依据,使用 Z 分数法,z 分位数取 {z} 来检测异常值...')
        print('=' * 70)
        mean, std = np.mean(values), np.std(values)
        upper, lower = (mean + z * std), (mean - z * std)
        print(f"取 {z} 个 Z分数:大于 {upper} 或小于 {lower} 的即可被视为异常值。")
        print('=' * 70)
    else:
        # Fail loudly instead of implicitly returning None, which would only
        # surface later as a confusing unpacking error at the call site.
        raise ValueError(
            f"unsupported method {method!r}; expected None (IQR) or 'z'"
        )

    # Both rules share the same inclusive cut-off test.
    outlier = data[(values <= lower) | (values >= upper)]
    return outlier, upper, lower
# Flag price outliers with the z-score rule (the function's default z=2).
outlier, upper, lower = outlier_test(
    data=new_data_Z,
    column='price',
    method='z',
)
# Summarise the flagged rows, then draw five of them at random.
outlier.info()
outlier.sample(5)
# Remove the flagged rows from the z-score working table in place.
new_data_Z.drop(index=outlier.index, inplace=True)
以 price 列为依据,使用 Z 分数法,z 分位数取 2 来检测异常值...
======================================================================
取 2 个 Z分数:大于 1801467.128762203 或小于 -293051.36101170536 的即可被视为异常值。
======================================================================
Int64Index: 335 entries, 22 to 6018
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 neighborhood 335 non-null object
1 area 335 non-null int64
2 bedrooms 335 non-null int64
3 bathrooms 335 non-null int64
4 style 335 non-null object
5 price 335 non-null int64
dtypes: int64(4), object(2)
memory usage: 18.3+ KB
# Flag price outliers with the default IQR fence rule.
outlier, upper, lower = outlier_test(
    data=new_data_IQR,
    column='price',
)
# Summarise the flagged rows, then draw five of them at random.
outlier.info()
outlier.sample(5)
# Remove the flagged rows from the IQR working table in place.
new_data_IQR.drop(index=outlier.index, inplace=True)
以 price 列为依据,使用 上下截断点法(iqr) 检测异常值...
======================================================================
第一分位数: 364135.0, 第三分位数:966675.25, 四分位极差:602540.25
上截断点:1870485.625, 下截断点:-539675.375
Int64Index: 265 entries, 22 to 6018
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 neighborhood 265 non-null object
1 area 265 non-null int64
2 bedrooms 265 non-null int64
3 bathrooms 265 non-null int64
4 style 265 non-null object
5 price 265 non-null int64
dtypes: int64(4), object(2)
memory usage: 14.5+ KB
print("原数据相关性矩阵")
# Correlate the numeric columns only: the string columns ('neighborhood',
# 'style') make corr() raise on pandas >= 2.0 instead of being skipped.
new_data.select_dtypes(include='number').corr()
原数据相关性矩阵
|   | area | bedrooms | bathrooms | price |
|---|---|---|---|---|
| area | 1.000000 | 0.901623 | 0.891481 | 0.823454 |
| bedrooms | 0.901623 | 1.000000 | 0.972768 | 0.743435 |
| bathrooms | 0.891481 | 0.972768 | 1.000000 | 0.735851 |
| price | 0.823454 | 0.743435 | 0.735851 | 1.000000 |
print("Z方法处理的数据相关性矩阵")
# Correlate the numeric columns only: the string columns ('neighborhood',
# 'style') make corr() raise on pandas >= 2.0 instead of being skipped.
new_data_Z.select_dtypes(include='number').corr()
Z方法处理的数据相关性矩阵
|   | area | bedrooms | bathrooms | price |
|---|---|---|---|---|
| area | 1.000000 | 0.895487 | 0.882985 | 0.787334 |
| bedrooms | 0.895487 | 1.000000 | 0.970403 | 0.739090 |
| bathrooms | 0.882985 | 0.970403 | 1.000000 | 0.724140 |
| price | 0.787334 | 0.739090 | 0.724140 | 1.000000 |
print("IQR方法处理的数据相关性矩阵")
# Correlate the numeric columns only: the string columns ('neighborhood',
# 'style') make corr() raise on pandas >= 2.0 instead of being skipped.
new_data_IQR.select_dtypes(include='number').corr()
IQR方法处理的数据相关性矩阵
|   | area | bedrooms | bathrooms | price |
|---|---|---|---|---|
| area | 1.000000 | 0.896169 | 0.883887 | 0.789651 |
| bedrooms | 0.896169 | 1.000000 | 0.970790 | 0.736541 |
| bathrooms | 0.883887 | 0.970790 | 1.000000 | 0.723202 |
| price | 0.789651 | 0.736541 | 0.723202 | 1.000000 |
# Refit on the z-score-cleaned table. Features and target are selected by
# column name rather than by position, so the choice stays correct even if
# the column order changes.
x_data = new_data_Z[['area', 'bedrooms', 'bathrooms']]
y_data = new_data_Z['price']
model = linear_model.LinearRegression()
model.fit(x_data, y_data)
print("回归系数:", model.coef_)
print("截距:", model.intercept_)
# NOTE: the label '*bathromms +' (sic, "bathrooms") is kept unchanged so the
# printed equation still matches the recorded output.
print('回归方程: price=',model.coef_[0],'*area +',model.coef_[1],'*bedrooms +',model.coef_[2],'*bathromms +',model.intercept_)
回归系数: [ 226.42116974 49931.50311721 -12224.71724497]
截距: 64356.04135007458
回归方程: price= 226.4211697383351 *area + 49931.50311720713 *bedrooms + -12224.71724496588 *bathromms + 64356.04135007458
# Refit on the IQR-cleaned table. Features and target are selected by column
# name rather than by position, so the choice stays correct even if the
# column order changes.
x_data = new_data_IQR[['area', 'bedrooms', 'bathrooms']]
y_data = new_data_IQR['price']
model = linear_model.LinearRegression()
model.fit(x_data, y_data)
print("回归系数:", model.coef_)
print("截距:", model.intercept_)
# NOTE: the label '*bathromms +' (sic, "bathrooms") is kept unchanged so the
# printed equation still matches the recorded output.
print('回归方程: price=',model.coef_[0],'*area +',model.coef_[1],'*bedrooms +',model.coef_[2],'*bathromms +',model.intercept_)
回归系数: [ 242.61115518 41547.43068791 -6415.7825009 ]
截距: 58018.13845504692
回归方程: price= 242.6111551782956 *area + 41547.43068790577 *bedrooms + -6415.78250090158 *bathromms + 58018.13845504692
三、结果对比
1.不作任何处理的数据求解的结果为:
price = 345.911018840024 * area - 2925.806324666705 * bedrooms + 7345.391713693825 * bathrooms + 10072.107046726742
2.采用Z方式清洗数据的求解结果为:
price = 226.4211697383351 * area + 49931.50311720713 * bedrooms - 12224.71724496588 * bathrooms + 64356.04135007458
3.采用IQR方式清洗数据的求解结果为:
price = 242.6111551782956 * area + 41547.43068790577 * bedrooms - 6415.78250090158 * bathrooms + 58018.13845504692