《Python机器学习及实践》----监督学习经典模型

本篇博客整理自《Python机器学习及实践》一书中的实例,所有代码均已在本地运行通过。数据或是从该书指定的百度网盘上下载的,或是将sklearn自带的数据集下载到本地后使用的。
代码片段:

# Breast-cancer classification: compare LogisticRegression with SGDClassifier
# on the Wisconsin breast-cancer data.
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18 only ships the old module
    from sklearn.cross_validation import train_test_split

column_names = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape', 'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli', 'Mitoses', 'Class']

# Raw string: the path contains backslashes that are not valid escape
# sequences ('\S', '\m'); the resulting path is unchanged.
data = pd.read_csv(r'D:\Source Code\machinelearn\breast-cancer-wisconsin.txt', sep=',', names=column_names)

# Missing values are encoded as '?'; turn them into NaN and drop those rows.
data = data.replace(to_replace='?', value=np.nan)
data = data.dropna(how='any')
data.shape  # interactive inspection only

# Columns 1..9 are the features, column 10 ('Class') is the label;
# the sample code number in column 0 is not used.
X_train, X_test, Y_train, Y_test = train_test_split(
    data[column_names[1:10]],
    data[column_names[10]],
    test_size=0.25,
    random_state=33,
)
Y_train.value_counts()  # interactive inspection only
Y_test.value_counts()   # interactive inspection only

# Fit the scaler on the training split only, then reuse it for the test split.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

lr = LogisticRegression()
sgdc = SGDClassifier()

lr.fit(X_train, Y_train)
lr_y_predict = lr.predict(X_test)

sgdc.fit(X_train, Y_train)
sgdc_y_predict = sgdc.predict(X_test)

# print(...) with a single pre-formatted string gives the same output on
# Python 2 and Python 3.
print('Accuracy of LR Classifier: %s' % lr.score(X_test, Y_test))
print(classification_report(Y_test, lr_y_predict, target_names=['Benign', 'Malignant']))
print('Accuracy of SGD Classifier: %s' % sgdc.score(X_test, Y_test))
print(classification_report(Y_test, sgdc_y_predict, target_names=['Benign', 'Malignant']))

# Digit classification with a linear support vector classifier.
from sklearn.datasets import load_digits
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18 only ships the old module
    from sklearn.cross_validation import train_test_split

digits = load_digits()
digits.data.shape  # interactive inspection only

x_train, x_test, y_train, y_test = train_test_split(
    digits.data, digits.target, test_size=0.25, random_state=33)
y_train.shape  # interactive inspection only
y_test.shape   # interactive inspection only

# Fit the scaler on the training split only, then reuse it for the test split.
ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.transform(x_test)

lsvc = LinearSVC()
lsvc.fit(x_train, y_train)
y_predict = lsvc.predict(x_test)

# The original label ends with a space and print added another one; both are
# kept so the output is unchanged.
print('The Accuracy of Linear SVC is  %s' % lsvc.score(x_test, y_test))
print(classification_report(y_test, y_predict, target_names=digits.target_names.astype(str)))

# News-text classification with multinomial naive Bayes on bag-of-words counts.
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18 only ships the old module
    from sklearn.cross_validation import train_test_split

# Downloads the corpus on first use.
news = fetch_20newsgroups()
print(len(news.data))
print(news.data[0])

x_train, x_test, y_train, y_test = train_test_split(
    news.data, news.target, test_size=0.25, random_state=33)

# Build the vocabulary from the training documents only.
vec = CountVectorizer()
x_train = vec.fit_transform(x_train)
x_test = vec.transform(x_test)

mnb = MultinomialNB()
mnb.fit(x_train, y_train)
y_predict = mnb.predict(x_test)

# The original label ends with a space and print added another one; both are
# kept so the output is unchanged.
print('The Accuracy of Naive Bayes Classifier is  %s' % mnb.score(x_test, y_test))
print(classification_report(y_test, y_predict, target_names=news.target_names))

# Iris classification with a k-nearest-neighbours classifier.
from sklearn.datasets import load_iris
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18 only ships the old module
    from sklearn.cross_validation import train_test_split

iris = load_iris()
iris.data.shape  # interactive inspection only
print(iris.DESCR)

x_train, x_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.25, random_state=33)

# Fit the scaler on the training split only, then reuse it for the test split.
ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.transform(x_test)

knc = KNeighborsClassifier()
knc.fit(x_train, y_train)
y_predict = knc.predict(x_test)

# The original label ends with a space and print added another one; both are
# kept so the output is unchanged.
print('The Accuracy of K-Nearest Neighbor Classifier is  %s' % knc.score(x_test, y_test))
print(classification_report(y_test, y_predict, target_names=iris.target_names))

# Titanic survival prediction with a single decision tree.
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18 only ships the old module
    from sklearn.cross_validation import train_test_split

titanic = pd.read_csv('D:\\Source Code\\machinelearn\\titanic.txt')
titanic.head()  # interactive inspection only
titanic.info()

# Take an explicit copy: filling values through a slice of `titanic` is a
# chained assignment that pandas may apply to a temporary instead of X.
X = titanic[['pclass', 'age', 'sex']].copy()
Y = titanic['survived']
X.info()

# Replace missing ages with the mean age.
X['age'] = X['age'].fillna(X['age'].mean())
X.info()

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=33)

# DictVectorizer one-hot encodes the string-valued columns and passes numeric
# ones through. 'records' is the full orient name; the abbreviation 'record'
# is rejected by newer pandas releases.
vec = DictVectorizer(sparse=False)
x_train = vec.fit_transform(x_train.to_dict(orient='records'))
print(vec.feature_names_)
x_test = vec.transform(x_test.to_dict(orient='records'))

dtc = DecisionTreeClassifier()
dtc.fit(x_train, y_train)
y_predict = dtc.predict(x_test)

print(dtc.score(x_test, y_test))
# classification_report expects (y_true, y_pred); passing them the other way
# round swaps the reported precision and recall.
print(classification_report(y_test, y_predict, target_names=['died', 'survived']))

# Titanic survival prediction: decision tree vs. random forest vs. gradient
# boosting, all with default settings.
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18 only ships the old module
    from sklearn.cross_validation import train_test_split

titanic = pd.read_csv('D:\\Source Code\\machinelearn\\titanic.txt')

# Take an explicit copy: filling values through a slice of `titanic` is a
# chained assignment that pandas may apply to a temporary instead of X.
X = titanic[['pclass', 'age', 'sex']].copy()
Y = titanic['survived']

# Replace missing ages with the mean age.
X['age'] = X['age'].fillna(X['age'].mean())

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=33)

# 'records' is the full orient name; the abbreviation 'record' is rejected by
# newer pandas releases.
vec = DictVectorizer(sparse=False)
X_train = vec.fit_transform(X_train.to_dict(orient='records'))
X_test = vec.transform(X_test.to_dict(orient='records'))

dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
dtc_y_pred = dtc.predict(X_test)

rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc_y_pred = rfc.predict(X_test)

gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)
gbc_y_pred = gbc.predict(X_test)

# classification_report expects (y_true, y_pred); passing them the other way
# round swaps the reported precision and recall.
print('The accuracy of decision tree is %s' % dtc.score(X_test, y_test))
print(classification_report(y_test, dtc_y_pred))
print('The accuracy of random forest classifier is %s' % rfc.score(X_test, y_test))
print(classification_report(y_test, rfc_y_pred))
print('The accuracy of gradient tree boosting is %s' % gbc.score(X_test, y_test))
print(classification_report(y_test, gbc_y_pred))

# Linear regression on the Boston housing data: LinearRegression vs. SGDRegressor.
# NOTE: load_boston was removed in scikit-learn 1.2, so this section needs an
# older release.
import numpy as np
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler

try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18 only ships the old module
    from sklearn.cross_validation import train_test_split

boston = load_boston()
print(boston.DESCR)

X = boston.data
Y = boston.target
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=33)

# The original labels end with a space and print added another one; both are
# kept so the output is unchanged.
print('The max target value is  %s' % np.max(boston.target))
print('The min target value is  %s' % np.min(boston.target))
print('The average target value is %s' % np.mean(boston.target))

# Standardize features and targets separately, fitting on the training split only.
ss_X = StandardScaler()
ss_Y = StandardScaler()
X_train = ss_X.fit_transform(X_train)
X_test = ss_X.transform(X_test)
# StandardScaler only accepts 2-D input (1-D input is an error from
# scikit-learn 0.19). Scale the targets as a single column, then flatten
# again so the regressors still receive 1-D targets.
Y_train = ss_Y.fit_transform(Y_train.reshape(-1, 1)).ravel()
Y_test = ss_Y.transform(Y_test.reshape(-1, 1)).ravel()


def _unscale(values):
    """Undo the target standardization and return a 1-D array."""
    return ss_Y.inverse_transform(np.reshape(values, (-1, 1))).ravel()


lr = LinearRegression()
lr.fit(X_train, Y_train)
lr_y_predict = lr.predict(X_test)

sgdr = SGDRegressor()
sgdr.fit(X_train, Y_train)
sgdr_y_predict = sgdr.predict(X_test)

# R-squared is computed on the standardized targets; MSE and MAE are computed
# after mapping targets and predictions back through ss_Y.
print('The value of default measurement of LinearRegression is %s' % lr.score(X_test, Y_test))
print('The value of R-squared of LinearRegression is %s' % r2_score(Y_test, lr_y_predict))
print('The mean squared error of LinearRegression is %s' % mean_squared_error(_unscale(Y_test), _unscale(lr_y_predict)))
print('The mean absoluate error of LinearRegression is %s' % mean_absolute_error(_unscale(Y_test), _unscale(lr_y_predict)))

print('The value of default measurement of SGDRegressor is %s' % sgdr.score(X_test, Y_test))
print('The value of R-squared of SGDRegressor is %s' % r2_score(Y_test, sgdr_y_predict))
print('The mean squared error of SGDRegressor is %s' % mean_squared_error(_unscale(Y_test), _unscale(sgdr_y_predict)))
print('The mean absoluate error of SGDRegressor is %s' % mean_absolute_error(_unscale(Y_test), _unscale(sgdr_y_predict)))

# Support vector regression with linear, polynomial and RBF kernels.
# Relies on X_train/X_test/Y_train/Y_test, ss_Y and np from the preceding
# Boston-housing section.
from sklearn.svm import SVR
# mean_squared_error is used below, so it is imported here as well.
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, median_absolute_error


def _unscale(values):
    """Undo the target standardization and return a 1-D array.

    StandardScaler.inverse_transform only accepts 2-D input (1-D input is an
    error from scikit-learn 0.19), hence the reshape to a single column.
    """
    return ss_Y.inverse_transform(np.reshape(values, (-1, 1))).ravel()


linear_svr = SVR(kernel='linear')
linear_svr.fit(X_train, Y_train)
linear_svr_y_predict = linear_svr.predict(X_test)

poly_svr = SVR(kernel='poly')
poly_svr.fit(X_train, Y_train)
poly_svr_y_predict = poly_svr.predict(X_test)

rbf_svr = SVR(kernel='rbf')
rbf_svr.fit(X_train, Y_train)
rbf_svr_y_predict = rbf_svr.predict(X_test)

print('R-squared value of linear SVR is %s' % linear_svr.score(X_test, Y_test))
print('The mean squared error of linear SVR is %s' % mean_squared_error(_unscale(Y_test), _unscale(linear_svr_y_predict)))
print('The mean absoluate error of linear SVR is %s' % mean_absolute_error(_unscale(Y_test), _unscale(linear_svr_y_predict)))
print('R-squared value of Poly SVR is %s' % poly_svr.score(X_test, Y_test))
print('The mean squared error of Poly SVR is %s' % mean_squared_error(_unscale(Y_test), _unscale(poly_svr_y_predict)))
print('The mean absoluate error of Poly SVR is %s' % mean_absolute_error(_unscale(Y_test), _unscale(poly_svr_y_predict)))
print('R-squared value of RBF SVR is %s' % rbf_svr.score(X_test, Y_test))
print('The mean squared error of RBF SVR is %s' % mean_squared_error(_unscale(Y_test), _unscale(rbf_svr_y_predict)))
print('The mean absoluate error of RBF SVR is %s' % mean_absolute_error(_unscale(Y_test), _unscale(rbf_svr_y_predict)))

# k-nearest-neighbour regression with uniform and distance-based weighting.
# Relies on X_train/X_test/Y_train/Y_test, ss_Y and np from the preceding
# Boston-housing section.
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.neighbors import KNeighborsRegressor


def _unscale(values):
    """Undo the target standardization and return a 1-D array.

    StandardScaler.inverse_transform only accepts 2-D input (1-D input is an
    error from scikit-learn 0.19), hence the reshape to a single column.
    """
    return ss_Y.inverse_transform(np.reshape(values, (-1, 1))).ravel()


uni_knr = KNeighborsRegressor(weights='uniform')
uni_knr.fit(X_train, Y_train)
uni_knr_y_predict = uni_knr.predict(X_test)

dis_knr = KNeighborsRegressor(weights='distance')
dis_knr.fit(X_train, Y_train)
dis_knr_y_predict = dis_knr.predict(X_test)

print('R-squared value of uniform-weighted KNeighorRegression: %s' % uni_knr.score(X_test, Y_test))
print('The mean squared error of uniform-weighted KNeighorRegression: %s' % mean_squared_error(_unscale(Y_test), _unscale(uni_knr_y_predict)))
print('The mean absoluate error of uniform-weighted KNeighorRegression %s' % mean_absolute_error(_unscale(Y_test), _unscale(uni_knr_y_predict)))
print('R-squared value of distance-weighted KNeighorRegression: %s' % dis_knr.score(X_test, Y_test))
print('The mean squared error of distance-weighted KNeighorRegression: %s' % mean_squared_error(_unscale(Y_test), _unscale(dis_knr_y_predict)))
print('The mean absoluate error of distance-weighted KNeighorRegression: %s' % mean_absolute_error(_unscale(Y_test), _unscale(dis_knr_y_predict)))

# Regression tree with default settings.
# Relies on X_train/X_test/Y_train/Y_test, ss_Y and np from the preceding
# Boston-housing section.
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.tree import DecisionTreeRegressor


def _unscale(values):
    """Undo the target standardization and return a 1-D array.

    StandardScaler.inverse_transform only accepts 2-D input (1-D input is an
    error from scikit-learn 0.19), hence the reshape to a single column.
    """
    return ss_Y.inverse_transform(np.reshape(values, (-1, 1))).ravel()


dtr = DecisionTreeRegressor()
dtr.fit(X_train, Y_train)
dtr_y_predict = dtr.predict(X_test)

print('R-squared value of DecisionTreeRegressor: %s' % dtr.score(X_test, Y_test))
print('The mean squared error of DecisionTreeRegressor: %s' % mean_squared_error(_unscale(Y_test), _unscale(dtr_y_predict)))
print('The mean absoluate error of DecisionTreeRegressor: %s' % mean_absolute_error(_unscale(Y_test), _unscale(dtr_y_predict)))

# Ensemble regressors: random forest, extremely randomized trees and gradient
# boosting, all with default settings.
# Relies on X_train/X_test/Y_train/Y_test, ss_Y, boston and np from the
# preceding Boston-housing section.
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error


def _unscale(values):
    """Undo the target standardization and return a 1-D array.

    StandardScaler.inverse_transform only accepts 2-D input (1-D input is an
    error from scikit-learn 0.19), hence the reshape to a single column.
    """
    return ss_Y.inverse_transform(np.reshape(values, (-1, 1))).ravel()


# Train a RandomForestRegressor and predict on the test data; the predictions
# are stored in rfr_y_predict.
rfr = RandomForestRegressor()
rfr.fit(X_train, Y_train)
rfr_y_predict = rfr.predict(X_test)

# Train an ExtraTreesRegressor and predict on the test data; the predictions
# are stored in etr_y_predict.
etr = ExtraTreesRegressor()
etr.fit(X_train, Y_train)
etr_y_predict = etr.predict(X_test)

# Train a GradientBoostingRegressor and predict on the test data; the
# predictions are stored in gbr_y_predict.
gbr = GradientBoostingRegressor()
gbr.fit(X_train, Y_train)
gbr_y_predict = gbr.predict(X_test)

# Evaluate the default random forest on the test set with R-squared, MSE and MAE.
print('R-squared value of RandomForestRegressor: %s' % rfr.score(X_test, Y_test))
print('The mean squared error of RandomForestRegressor: %s' % mean_squared_error(_unscale(Y_test), _unscale(rfr_y_predict)))
print('The mean absoluate error of RandomForestRegressor: %s' % mean_absolute_error(_unscale(Y_test), _unscale(rfr_y_predict)))

# Evaluate the default extra-trees model on the test set with R-squared, MSE and MAE.
print('R-squared value of ExtraTreesRegessor: %s' % etr.score(X_test, Y_test))
print('The mean squared error of  ExtraTreesRegessor: %s' % mean_squared_error(_unscale(Y_test), _unscale(etr_y_predict)))
print('The mean absoluate error of ExtraTreesRegessor: %s' % mean_absolute_error(_unscale(Y_test), _unscale(etr_y_predict)))

# Print each feature's importance in the trained extra-trees model, in
# ascending order. Sorting the (importance, name) pairs as tuples keeps every
# importance attached to its own feature; np.sort(..., axis=0) would sort the
# two columns independently and break that pairing.
for importance, feature in sorted(zip(etr.feature_importances_, boston.feature_names)):
    print('%s %s' % (importance, feature))

# Evaluate the default gradient-boosting model on the test set with R-squared, MSE and MAE.
print('R-squared value of GradientBoostingRegressor: %s' % gbr.score(X_test, Y_test))
print('The mean squared error of GradientBoostingRegressor: %s' % mean_squared_error(_unscale(Y_test), _unscale(gbr_y_predict)))
print('The mean absoluate error of GradientBoostingRegressor: %s' % mean_absolute_error(_unscale(Y_test), _unscale(gbr_y_predict)))

其中读取数据集(fetch_20newsgroups)时有一处问题,报错如下:

No handlers could be found for logger "sklearn.datasets.twenty_newsgroups"

解决方法如下:
首先手动下载 http://qwone.com/~jason/20Newsgroups/20news-bydate.tar.gz 这个包,
然后找到 ……\账户名\scikit_learn_data 目录,将下载的包放在该目录下。
之后找到twenty_newsgroups.py文件,该文件负责在线下载这个包;既然我们已经手动下载,把其中负责在线下载的代码注释掉即可。
找到这个函数 download_20newsgroups,注释以下代码

if os.path.exists(archive_path):
        # Download is not complete as the .tar.gz file is removed after
        # download.
        logger.warn("Download was incomplete, downloading again.")
        os.remove(archive_path)

    logger.warn("Downloading dataset from %s (14 MB)", URL)
    opener = urlopen(URL)
    open(archive_path, 'wb').write(opener.read())

观察以下代码,是将下载的文件解压,所以我们注释掉上面在线下载即可

logger.info("Decompressing %s", archive_path)
tarfile.open(archive_path, "r:gz").extractall(path=target_dir)
os.remove(archive_path)

保存后,执行导入该数据集的程序,最终可以发现 ……\账户名\scikit_learn_data 目录下只剩下
20news-bydate.pkz文件,以后再执行程序就不需要重新下载了。

你可能感兴趣的:(Python,python,机器学习,数据)