# Original (pre-fix) version of the script: split the Boston data, standardize it,
# fit LinearRegression and SGDRegressor, and print evaluation metrics.
# NOTE(review): `boston` is not defined in this snippet; presumably it was loaded
# earlier with sklearn.datasets.load_boston() — confirm against the full script.
# Import train_test_split from model_selection to split the data
from sklearn.model_selection import train_test_split
import numpy as np
# Feature matrix and target vector of the dataset.
x = boston.data
y = boston.target
# Hold out 25% of the samples for testing; random_state fixes the split.
x_train, x_test, y_train, y_test = train_test_split(x, y , test_size=0.25, random_state=33)
# Basic statistics of the full target vector.
print("The max target value is", np.max(boston.target))
print("The min target value is", np.min(boston.target))
print("The average target value is", np.mean(boston.target))
from sklearn.preprocessing import StandardScaler
# Separate scalers for features and target; each is fitted on the training
# split only and then applied unchanged to the test split.
ss_x = StandardScaler()
ss_y = StandardScaler()
x_train = ss_x.fit_transform(x_train)
x_test = ss_x.transform(x_test)
# NOTE: y_train / y_test are 1-D at this point. With the scikit-learn version
# used in this article the next two lines raise
# "ValueError: Expected 2D array, got 1D array instead".
y_train = ss_y.fit_transform(y_train)
y_test = ss_y.transform(y_test)
from sklearn.linear_model import LinearRegression
# Fit a linear regression on the standardized training data and predict the test split.
lr = LinearRegression()
lr.fit(x_train, y_train)
lr_y_pred = lr.predict(x_test)
from sklearn.linear_model import SGDRegressor
# Same procedure with SGDRegressor, using its default parameters.
sgdr = SGDRegressor()
sgdr.fit(x_train, y_train)
sgdr_y_pred = sgdr.predict(x_test)
# NOTE: this score is computed on the training split, whereas the SGDRegressor
# score further down is computed on the test split.
print('The value of default measurement of LinearRegression is', lr.score(x_train, y_train))
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
# R-squared is computed on the standardized targets; MSE and MAE are computed
# after mapping targets and predictions back with ss_y.inverse_transform.
print('The value of R-sqaured of LinearRegression is', r2_score(y_test, lr_y_pred))
print('The mean squared error of LinearRegression is', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(lr_y_pred)))
print('The mean absolute error of LinearRegression is', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(lr_y_pred)))
# The same four metrics for SGDRegressor, all on the test split.
print('The value of default of SDGRegressor is', sgdr.score(x_test, y_test))
print('The vlue of R-squared of SGDRegressor is', r2_score(y_test, sgdr_y_pred))
print('The mean squared error of SGDRegressor is', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(sgdr_y_pred)))
print('The mean absolute error of SGDRegressor is', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(sgdr_y_pred)))
运行《Python机器学习及实践》一书中利用线性回归对Boston房价进行预测的代码时,会出现如下错误:
Traceback (most recent call last):
File "D:\Python362\a_机器学习及实战\LinearRegression.py", line 22, in <module>
y_train = ss_y.fit_transform(y_train)
File "D:\Python362\lib\site-packages\sklearn\base.py", line 518, in fit_transform
return self.fit(X, **fit_params).transform(X)
File "D:\Python362\lib\site-packages\sklearn\preprocessing\data.py", line 590, in fit
return self.partial_fit(X, y)
File "D:\Python362\lib\site-packages\sklearn\preprocessing\data.py", line 612, in partial_fit
warn_on_dtype=True, estimator=self, dtype=FLOAT_DTYPES)
File "D:\Python362\lib\site-packages\sklearn\utils\validation.py", line 410, in check_array
"if it contains a single sample.".format(array))
ValueError: Expected 2D array, got 1D array instead:
array=[ 33.8 20.3 10.2 22. 21.2 24.2 29. 22.7 21.8 34.9 25.2 20.9
19.4 20. 14. 30.1 33.1 20.6 22.6 33.4 20.1 10.5 15.6 16.8
22.6 34.6 19.8 17.8 22. 17.4 15.4 16.7 22.6 15.1 21.4 15.3
7.4 13.9 17.6 25. 46.7 17.1 23.1 18.7 21.9 18.9 26.7 22.3
25. 14.6 42.8 17.3 22.2 36.5 22.8 19.9 36.2 50. 25. 22.2
17.5 23.9 19.6 24.7 28.4 8.7 21.7 20. 19.9 24.5 15. 7.
15.2 20.4 8.5 17.1 30.1 15. 19.4 23.2 17. 18.9 50. 25.
46. 7.2 17.8 35.1 24.3 5. 16.6 21.8 28.5 22. 20.3 21.7
26.4 30.7 50. 17.2 26.6 21. 23.4 19.5 20.7 23.3 48.8 15.6
19.6 17.4 21.7 14.6 37.9 9.7 17.8 12.1 20.1 29.9 26.4 18.8
32.5 15.7 13.4 21.7 23.6 11.9 13.8 22.2 13. 33.2 50. 22.3
22.4 23.8 29.1 20.8 23.7 19.8 13.9 28.4 45.4 23.7 50. 18.
17.1 18.9 10.4 24.7 23.9 23. 20.2 8.5 14.2 20.3 18.5 12.
19.3 20.6 16.1 12.3 23.1 22.7 20.3 16.7 27.9 21.4 8.1 37.6
15.6 29.6 22.9 24.8 24.4 50. 28.7 50. 16.5 18.2 50. 16.2
14.1 21.2 18.4 25. 50. 21.2 20.4 15.2 22. 19.8 22.1 23.9
24.6 23.9 21.7 44.8 7.2 18.5 20.1 23.3 19.2 29.1 31. 22.9
27.5 39.8 22. 22.8 22.9 14.3 14.5 22.4 19.3 32. 20.1 18.3
24.5 18.4 23.1 22.6 20.2 17.8 31.6 43.5 36.4 11.3 20.5 23.2
29.8 20.6 24.3 18.1 19.1 21.4 31.5 19.2 14.3 24.8 21.1 18.2
48.3 19.4 21.2 10.9 27.5 34.7 14.4 22.8 17.8 50. 24.4 12.8
30.8 28.2 25. 33.1 27.5 12.7 43.1 13.4 21.5 33.4 23.8 21.
26.6 18.5 23. 24.1 20.5 32.2 14.4 11.8 19.5 23.7 13.2 29.
18.2 18.6 23. 42.3 17.2 16.2 20. 30.3 20.9 20.4 24.8 18.7
16.8 22.5 18.8 23.7 23.8 19.6 20.4 16.1 44. 19.3 17.4 10.2
11.7 37.2 11. 23.6 22.8 15. 34.9 17.9 24.4 24.5 6.3 29.4
10.4 38.7 20. 19.4 37. 50. 18.7 48.5 35.4 23.4 7. 50.
20.7 35.4 9.6 25.1 16.1 27. 16.6 13.3 25. 24. 19.6 29.6
21.7 19.1 22. 13.3 27.1 22.9 33.2 13.5 14.5 8.3 41.7 31.2
23.9 23.1 24.3 18.3 20.8 28. 19.5 21.5 13.1 12.5 31.7 13.1
23.1 14.5 22.2 13.1 37.3 22. 10.2 5. 19.3 16. 18.6 50.
31.6 24.1 15.6 19.4 23.3 23.2 13.6].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
这主要是工具包版本更新造成的。针对上面的问题,我们根据错误提示相应地找出出错的两行代码:
y_train = ss_y.fit_transform(y_train)  # y_train is 1-D here, which triggers the ValueError shown above
y_test = ss_y.transform(y_test)  # y_test is 1-D as well
主要问题出现在上面的两行代码中,例如数据格式为[1, 2, 3, 4]就会出错,如果把这行数据转换成[[1], [2], [3], [4]]就不会出错了,所以要对上面导致出错的两行代码做出修改:
y_train = ss_y.fit_transform(y_train.reshape(-1, 1))  # reshape the 1-D target into a single-column 2-D array
y_test = ss_y.transform(y_test.reshape(-1,1))  # same reshape for the test targets
再运行程序,错误消除。但是代码还会出现一些提醒,如下:
Warning (from warnings module):
File "D:\Python362\lib\site-packages\sklearn\linear_model\stochastic_gradient.py", line 84
"and default tol will be 1e-3." % type(self), FutureWarning)
FutureWarning: max_iter and tol parameters have been added in in 0.19. If both are left unset, they default to max_iter=5 and tol=None. If tol is not None, max_iter defaults to max_iter=1000. From 0.21, default max_iter will be 1000, and default tol will be 1e-3.
Warning (from warnings module):
File "D:\Python362\lib\site-packages\sklearn\utils\validation.py", line 547
y = column_or_1d(y, warn=True)
DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
第一个提醒是说,不同版本中SGDRegressor默认的迭代次数不同。为了消除这个提醒,可以把sgdr = SGDRegressor()改为显式指定迭代次数的sgdr = SGDRegressor(max_iter=5)。本文将最大迭代次数设置为5(即提醒中所说的当前默认值),也可以设置成1000,可根据自己的需求调整。这样第一个warning就消除了。
第二个提醒是说,传给fit的目标值y应当是一维数组,需要用ravel()对它做转换。ravel()的作用如下例所示:
>>> x = np.array([[1, 2, 3], [4, 5, 6]])
>>> print(np.ravel(x))
[1 2 3 4 5 6]
也就是说,我的y是二维的形式(n_samples, 1),需要改成一维的形式(n_samples, )。可以对fit输入的y_train作y_train.ravel()这样的转换,即把代码sgdr.fit(x_train, y_train)修改为sgdr.fit(x_train, y_train.ravel()),warning就会消失了。
最终修改后的完整代码为:
# Final version of the script: predict Boston house prices with LinearRegression
# and SGDRegressor, then report their scores and errors.
import numpy as np
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# NOTE(review): load_boston is no longer shipped with recent scikit-learn
# releases — confirm that the installed version still provides it.
boston = load_boston()
print(boston.DESCR)

# Split the data: 25% of the samples are held out for testing, and the fixed
# random_state keeps the split reproducible.
x = boston.data
y = boston.target
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=33
)

# Basic statistics of the full target vector.
print("The max target value is", np.max(y))
print("The min target value is", np.min(y))
print("The average target value is", np.mean(y))

# Standardize features and target with separate scalers. Each scaler is fitted
# on the training split only and then applied unchanged to the test split.
ss_x = StandardScaler()
ss_y = StandardScaler()
x_train = ss_x.fit_transform(x_train)
x_test = ss_x.transform(x_test)
y_train = ss_y.fit_transform(y_train.reshape(-1, 1))  # change 1: the scaler needs a 2-D column
y_test = ss_y.transform(y_test.reshape(-1, 1))  # change 2: same reshape for the test targets

# Ordinary least-squares linear regression.
lr = LinearRegression()
lr.fit(x_train, y_train)
lr_y_pred = lr.predict(x_test)

# Linear model trained by stochastic gradient descent.
sgdr = SGDRegressor(max_iter=5)  # change 3: set max_iter explicitly to silence the FutureWarning
sgdr.fit(x_train, y_train.ravel())  # change 4: pass a 1-D target to avoid DataConversionWarning
sgdr_y_pred = sgdr.predict(x_test)

# --- LinearRegression report ---
# NOTE: this score is computed on the training split, whereas the SGDRegressor
# score below is computed on the test split.
print('The value of default measurement of LinearRegression is', lr.score(x_train, y_train))
print('The value of R-sqaured of LinearRegression is', r2_score(y_test, lr_y_pred))
# MSE / MAE are reported after mapping targets and predictions back with
# ss_y.inverse_transform.
y_test_restored = ss_y.inverse_transform(y_test)
lr_pred_restored = ss_y.inverse_transform(lr_y_pred)
print('The mean squared error of LinearRegression is', mean_squared_error(y_test_restored, lr_pred_restored))
print('The mean absolute error of LinearRegression is', mean_absolute_error(y_test_restored, lr_pred_restored))

# --- SGDRegressor report ---
print('The value of default of SDGRegressor is', sgdr.score(x_test, y_test))
print('The vlue of R-squared of SGDRegressor is', r2_score(y_test, sgdr_y_pred))
sgdr_pred_restored = ss_y.inverse_transform(sgdr_y_pred)
print('The mean squared error of SGDRegressor is', mean_squared_error(y_test_restored, sgdr_pred_restored))
print('The mean absolute error of SGDRegressor is', mean_absolute_error(y_test_restored, sgdr_pred_restored))
最终运行结果:
Boston House Prices dataset
===========================
Notes
------
Data Set Characteristics:
:Number of Instances: 506
:Number of Attributes: 13 numeric/categorical predictive
:Median Value (attribute 14) is usually the target
:Attribute Information (in order):
- CRIM per capita crime rate by town
- ZN proportion of residential land zoned for lots over 25,000 sq.ft.
- INDUS proportion of non-retail business acres per town
- CHAS Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
- NOX nitric oxides concentration (parts per 10 million)
- RM average number of rooms per dwelling
- AGE proportion of owner-occupied units built prior to 1940
- DIS weighted distances to five Boston employment centres
- RAD index of accessibility to radial highways
- TAX full-value property-tax rate per $10,000
- PTRATIO pupil-teacher ratio by town
- B 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
- LSTAT % lower status of the population
- MEDV Median value of owner-occupied homes in $1000's
:Missing Attribute Values: None
:Creator: Harrison, D. and Rubinfeld, D.L.
This is a copy of UCI ML housing dataset.
http://archive.ics.uci.edu/ml/datasets/Housing
This dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.
The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic
prices and the demand for clean air', J. Environ. Economics & Management,
vol.5, 81-102, 1978. Used in Belsley, Kuh & Welsch, 'Regression diagnostics
...', Wiley, 1980. N.B. Various transformations are used in the table on
pages 244-261 of the latter.
The Boston house-price data has been used in many machine learning papers that address regression
problems.
**References**
- Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.
- Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.
- many more! (see http://archive.ics.uci.edu/ml/datasets/Housing)
The max target value is 50.0
The min target value is 5.0
The average target value is 22.5328063241
The value of default measurement of LinearRegression is 0.754922212481
The value of R-sqaured of LinearRegression is 0.6763403831
The mean squared error of LinearRegression is 25.0969856921
The mean absolute error of LinearRegression is 3.5261239964
The value of default of SDGRegressor is 0.655430817284
The vlue of R-squared of SGDRegressor is 0.655430817284
The mean squared error of SGDRegressor is 26.7183404942
The mean absolute error of SGDRegressor is 3.53570681806