CRIM: 城镇人均犯罪率
ZN: 占地面积超过 25,000 平方英尺的住宅用地所占比例
INDUS: 城镇中非零售商业用地所占比例
CHAS: 查尔斯河虚拟变量(地块临河为 1,否则为 0)
NOX: 氮氧化物浓度(单位:千万分之一)
RM: 每栋住宅的平均房间数
AGE: 1940 年以前建成的自住单位的比例
DIS: 到波士顿 5 个就业中心的加权距离
RAD: 通往放射状高速公路的便利指数
TAX: 每一万美元的全额不动产税率
PTRATIO: 城镇中的学生教师比例
B: 1000(Bk - 0.63)^2,其中 Bk 为城镇中的黑人比例
LSTAT: 人口中社会地位较低者所占的百分比
MEDV: 自住房屋房价中位数(单位:千美元)
# Print the free-text description that ships with the dataset bundle.
dataset_description = boston_data['DESCR']
print(dataset_description)
Boston House Prices dataset
===========================
Notes
------
Data Set Characteristics:

    :Number of Instances: 506

    :Number of Attributes: 13 numeric/categorical predictive

    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
        - B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
        - LSTAT    % lower status of the population
        - MEDV     Median value of owner-occupied homes in $1000's
from sklearn.datasets import load_boston
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
from matplotlib import pyplot as plt
# Load the Boston housing bundle and pull out the feature matrix, the target
# vector and the feature names used by every later step.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs an older scikit-learn (or a replacement data source).
boston_data = load_boston()
x_data = boston_data.data
y_data = boston_data.target
names = boston_data.feature_names

# Derive both counts from the loaded matrix instead of hard-coding 13, so they
# cannot drift from the data that was actually loaded.
FeaturesNums = x_data.shape[1]
DataNums = len(x_data)
以下图表是使用经过筛选后的特征数据绘制的。
# Scatter plot of each feature against the target, one panel per feature.
# NOTE(review): x_train / y_train come from the train/test split cell further
# down; this cell appears to have been executed after it -- confirm cell order.

# Size the grid from the number of features. The previous `231 + i` shorthand
# is only valid for up to six panels and fails for larger feature counts.
grid_cols = 3
grid_rows = max(2, -(-FeaturesNums // grid_cols))  # ceiling division, >= 2 rows

# plt.figure (not plt.subplots) so no extra full-figure axes is created
# underneath the panels.
plt.figure(figsize=(20, 12))
for i in range(FeaturesNums):
    plt.subplot(grid_rows, grid_cols, i + 1)
    plt.scatter(x_train[:, i], y_train, s=20, color='blueviolet')
    plt.title(names[i])
plt.show()
# Histogram of each feature column of x_data, one panel per feature.

# Size the grid from the number of features. The previous `231 + i` shorthand
# is only valid for up to six panels and fails for larger feature counts.
grid_cols = 3
grid_rows = max(2, -(-FeaturesNums // grid_cols))  # ceiling division, >= 2 rows

# plt.figure (not plt.subplots) so no extra full-figure axes is created
# underneath the panels.
plt.figure(figsize=(20, 10))
for i in range(FeaturesNums):
    plt.subplot(grid_rows, grid_cols, i + 1)
    plt.hist(x_data[:, i], color='lightseagreen', width=2)
    plt.xlabel(names[i])
    plt.title(names[i])
plt.show()
from sklearn import preprocessing
# Remove samples whose target value is at or beyond the extremes
# (49 or higher, 1 or lower), keeping x_data and y_data row-aligned.
DelList0 = [
    row for row in range(DataNums)
    if y_data[row] >= 49 or y_data[row] <= 1
]
x_data = np.delete(x_data, DelList0, axis=0)
y_data = np.delete(y_data, DelList0, axis=0)
# Keep the sample counter in step with the trimmed arrays.
DataNums = DataNums - len(DelList0)
# Drop the feature columns that are excluded from modelling, and keep the
# name list and the feature counter consistent with the reduced matrix.
dropped_features = ('ZN', 'INDUS', 'RAD', 'TAX', 'CHAS', 'NOX', 'B', 'PTRATIO')
DelList1 = [
    col for col in range(FeaturesNums)
    if names[col] in dropped_features
]
x_data = np.delete(x_data, DelList1, axis=1)
names = np.delete(names, DelList1)
FeaturesNums = FeaturesNums - len(DelList1)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, scale

# Split first: the scaling below needs x_train / x_test / y_train / y_test,
# which did not exist yet when scaling ran ahead of the split.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3)

# Fit the feature scaler on the training portion only and reuse it for the
# test portion, so both share one scale and no test statistics leak into
# preprocessing.
nms = MinMaxScaler()
x_train = nms.fit_transform(x_train)
x_test = nms.transform(x_test)

# The target gets its own scaler (re-fitting `nms` would discard the feature
# scaling parameters). MinMaxScaler needs 2-D input, so the targets become
# column vectors of shape (n_samples, 1), as before.
y_scaler = MinMaxScaler()
y_train = y_scaler.fit_transform(y_train.reshape(-1, 1))
y_test = y_scaler.transform(y_test.reshape(-1, 1))
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Fit an ordinary least-squares model on the training split and report its
# error and goodness of fit on the held-out split.
model = LinearRegression().fit(x_train, y_train)
y_pred = model.predict(x_test)

mse_value = mean_squared_error(y_test, y_pred)
r2_value = r2_score(y_test, y_pred)
print("MSE =", mse_value, end='\n\n')
print("R2 =", r2_value, end='\n\n')
MSE = 0.013304697805737791
R2 = 0.44625845284900767
# Plot predicted against actual target values; points on the diagonal
# reference line are perfect predictions.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, c="blue", edgecolors="aqua", s=13)

# Diagonal y = x spanning the observed target range. The colour is given once
# via `color`; also passing the 'k' format string defined it twice and made
# matplotlib warn that the keyword takes precedence.
target_range = [y_test.min(), y_test.max()]
ax.plot(target_range, target_range, lw=2, color='navy')

ax.set_xlabel('Reality')
ax.set_ylabel('Prediction')
plt.show()
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_predict, cross_val_score
# 5-fold cross-validation of a linear-kernel SVR on the training split.
linear_svr = SVR(kernel='linear')

# SVR expects a 1-D target. y_train is a column vector after scaling, which
# makes scikit-learn emit a DataConversionWarning on every fit, so flatten it.
linear_svr_target = np.ravel(y_train)

linear_svr_pred = cross_val_predict(linear_svr, x_train, linear_svr_target, cv=5)
linear_svr_score = cross_val_score(linear_svr, x_train, linear_svr_target, cv=5)
linear_svr_meanscore = linear_svr_score.mean()
print ("Linear_SVR_Score =",linear_svr_meanscore,end='\n')
Linear_SVR_Score = 0.6497361775614359
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_predict, cross_val_score
# Polynomial-kernel SVR: fit once on the training split to predict the test
# split, then estimate its score with 5-fold cross-validation.
poly_svr = SVR(kernel='poly')

# SVR expects a 1-D target. y_train is a column vector after scaling, which
# makes scikit-learn emit a DataConversionWarning on every fit, so flatten it.
poly_svr_target = np.ravel(y_train)

poly_svr.fit(x_train, poly_svr_target)
poly_pred = poly_svr.predict(x_test)

poly_svr_pred = cross_val_predict(poly_svr, x_train, poly_svr_target, cv=5)
poly_svr_score = cross_val_score(poly_svr, x_train, poly_svr_target, cv=5)
poly_svr_meanscore = poly_svr_score.mean()
print ('\n',"Poly_SVR_Score =",poly_svr_meanscore,end='\n')
Poly_SVR_Score = 0.5383303049258509