Predicting benign/malignant breast cancer with linear classification models
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
#Define column names for convenience; the first column is the sample ID and the last column is the class label
column_names=['sample code number','Clump Thickness','Uniformity of Cell Size','Uniformity of Cell Shape','Marginal Adhesion','Single Epithelial Cell Size','Bare Nuclei','Bland Chromatin','Normal Nucleoli','Mitoses','class']
#Download the dataset from the UCI repository with pandas
data=pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data',names=column_names)
#Replace the missing-value marker '?' with NaN and drop incomplete rows
data=data.replace(to_replace='?',value=np.nan)
data=data.dropna(how='any')
#The 'Bare Nuclei' column was read as strings because of the '?' markers; convert everything to numeric
data=data.astype(int)
print(data.shape)
print(data.head())
##################################################################################
x_train,x_test,y_train,y_test=train_test_split(data[column_names[1:10]],data[column_names[10]],test_size=0.25,random_state=33)
#Check the number and class distribution of the training samples
print(y_train.value_counts())
#Check the number and class distribution of the test samples
print(y_test.value_counts())
#Train linear classifiers on the benign/malignant tumor prediction task
#Standardize the data so every feature dimension has zero mean and unit variance, preventing features with large values from dominating the prediction
ss=StandardScaler()
x_train=ss.fit_transform(x_train)
x_test=ss.transform(x_test)  #reuse the scaler fitted on the training data; refitting on the test set would leak its statistics
#Initialize LogisticRegression and SGDClassifier
lr=LogisticRegression()
sgdc=SGDClassifier()
#Train the logistic regression model with fit
lr.fit(x_train,y_train)
#Predict on the test set with the trained lr model
lr_y_predict=lr.predict(x_test)
#Call SGDClassifier's fit function to estimate the model parameters
sgdc.fit(x_train,y_train)
#Predict on x_test with the trained sgdc model
sgdc_y_predict=sgdc.predict(x_test)
#Performance analysis of the linear classifiers on the benign/malignant tumor prediction task
#Use the logistic regression model's built-in score function to obtain its accuracy on the test set
print("Accuracy of LR Classifier:",lr.score(x_test,y_test))
print(classification_report(y_test,lr_y_predict,target_names=['Benign','Malignant']))
#Use the SGD model's built-in score function to obtain its accuracy on the test set
print('Accuracy of SGD Classifier:',sgdc.score(x_test,y_test))
print(classification_report(y_test,sgdc_y_predict,target_names=['Benign','Malignant']))
print("finish")
Recognizing handwritten digits with an SVM
#Sample code for loading the handwritten digits data
#Import the handwritten digits loader from sklearn.datasets
from sklearn.datasets import load_digits
#Load the handwritten digit images into the digits variable
digits=load_digits()
#Check the data scale and feature dimensionality
print(digits.data.shape)
#Sample code for splitting the handwritten digits data
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(digits.data,digits.target,test_size=0.25,random_state=33)
#Check the sizes of the training and test sets
print(y_train.shape)
print(y_test.shape)
#Recognize handwritten digit images with a support vector machine (classification)
#Import the standardization module
from sklearn.preprocessing import StandardScaler
#Import the linear support vector classifier LinearSVC
from sklearn.svm import LinearSVC
#The training and test features still need to be standardized
ss=StandardScaler()
x_train=ss.fit_transform(x_train)
x_test=ss.transform(x_test)
#Initialize the linear support vector classifier LinearSVC
lsvc=LinearSVC()
#Train the model
lsvc.fit(x_train,y_train)
#Predict digit classes for the test samples with the trained model
y_predict=lsvc.predict(x_test)
#Evaluate accuracy with the model's built-in score function
print('The Accuracy of Linear SVC is',lsvc.score(x_test,y_test))
#As before, use classification_report from sklearn.metrics for a more detailed analysis of the predictions
from sklearn.metrics import classification_report
print(classification_report(y_test,y_predict,target_names=digits.target_names.astype(str)))
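#Optional sketch (an addition, not in the original): a confusion matrix shows which
#digit pairs the classifier mixes up, complementing the per-class report above.
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test,y_predict))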
Classifying news data with naive Bayes
#Classify news articles with a naive Bayes model
#Examine the details of the 20-category news text data
#Import the news data fetcher
from sklearn.datasets import fetch_20newsgroups
#Download the data from the internet on the fly
news = fetch_20newsgroups(subset='all')
#Check the data scale and details
print(len(news.data))
print(news.data[0])
#Split the 20-category news text data
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(news.data,news.target,test_size=0.25,random_state=33)
#Use a naive Bayes classifier to predict categories for the news texts
#Import the text-to-vector conversion module
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
x_train = vec.fit_transform(x_train)
x_test = vec.transform(x_test)
#Import the naive Bayes model
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
#Estimate the model parameters from the training data
mnb.fit(x_train,y_train)
#Predict classes for the test samples
y_predict = mnb.predict(x_test)
#Performance evaluation
#Import classification_report for a detailed classification report
from sklearn.metrics import classification_report
print('The Accuracy of Naive Bayes Classifier is', mnb.score(x_test,y_test))
print(classification_report(y_test,y_predict,target_names=news.target_names))
print("done")
Classifying the Iris dataset with the K-nearest-neighbors algorithm
#Classify the Iris dataset
#Import the Iris data loader
from sklearn.datasets import load_iris
iris = load_iris()
#Check the data scale
print(iris.data.shape)
#View the dataset description
print(iris.DESCR)
#Split the Iris dataset
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(iris.data,iris.target,test_size=0.25,random_state=33)
#Predict Iris classes with a K-nearest-neighbors classifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.transform(x_test)
#Fit the K-nearest-neighbors classifier and predict on the test data
knc = KNeighborsClassifier()
knc.fit(x_train,y_train)
y_predict = knc.predict(x_test)
#Evaluate the K-nearest-neighbors classifier's predictive performance on the Iris data
#Evaluate accuracy with the model's built-in score function
print('The accuracy of K-Nearest Neighbor Classifier is',knc.score(x_test,y_test))
from sklearn.metrics import classification_report
print(classification_report(y_test,y_predict,target_names=iris.target_names))
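#Optional sketch (an addition, not in the original): the default n_neighbors=5 is
#arbitrary; cross-validation on the training split is a principled way to pick k.
from sklearn.model_selection import cross_val_score
for k in [1,3,5,7,9]:
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=k),x_train,y_train,cv=5)
    print('k=%d mean CV accuracy=%.3f' % (k,scores.mean()))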
Analyzing the Titanic dataset with a decision tree
#Analyze the Titanic passenger data with a decision tree
import pandas as pd
titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')
print(titanic.head())
#Inspect column types and missing-value counts
print(titanic.info())
#Predict Titanic passenger survival with a decision tree model
x = titanic[['pclass','age','sex']]
y = titanic['survived']
#Explore the currently selected features
print(x.info())
#Fill missing age values with the mean age (assign on a copy to avoid pandas' SettingWithCopyWarning)
x = x.copy()
x['age'] = x['age'].fillna(x['age'].mean())
print(x.info())
#Split the data
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25,random_state=33)
#Feature processing: vectorize the records
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer(sparse=False)
x_train = vec.fit_transform(x_train.to_dict(orient='records'))
x_test = vec.transform(x_test.to_dict(orient='records'))
#Import the decision tree classifier
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc.fit(x_train,y_train)
y_predict = dtc.predict(x_test)
#Performance evaluation
from sklearn.metrics import classification_report
print(dtc.score(x_test,y_test))
print(classification_report(y_test,y_predict,target_names=['died','survived']))
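#Optional sketch (an addition, not in the original): feature_importances_ reports how
#much each vectorized feature contributed to the tree's splits.
for name,importance in zip(vec.get_feature_names_out(),dtc.feature_importances_):  #get_feature_names() in scikit-learn < 1.0
    print(name,importance)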
Predicting Titanic survival with ensemble models
#Predict whether Titanic passengers survived using ensemble models
import pandas as pd
titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')
#Manually select features
x = titanic[['pclass','age','sex']]
y = titanic['survived']
#Fill in the missing data (assign on a copy to avoid pandas' SettingWithCopyWarning)
x = x.copy()
x['age'] = x['age'].fillna(x['age'].mean())
#Split the dataset
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25,random_state=33)
#Convert the categorical features
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer(sparse=False)
x_train = vec.fit_transform(x_train.to_dict(orient='records'))
x_test = vec.transform(x_test.to_dict(orient='records'))
#Train and predict with a single decision tree
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc.fit(x_train,y_train)
dtc_y_predict = dtc.predict(x_test)
#Train and predict with a random forest ensemble
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(x_train,y_train)
rfc_y_pred = rfc.predict(x_test)
#Train and predict with gradient-boosted decision trees
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()
gbc.fit(x_train,y_train)
gbc_y_pred = gbc.predict(x_test)
#Evaluate predictive performance
from sklearn.metrics import classification_report
print("The accuracy of decision tree is",dtc.score(x_test,y_test))
print(classification_report(y_test,dtc_y_predict))
print("The accuracy of random forest classifier is",rfc.score(x_test,y_test))
print(classification_report(y_test,rfc_y_pred))
print("The accuracy of gradient tree boosting is",gbc.score(x_test,y_test))
print(classification_report(y_test,gbc_y_pred))
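#Optional sketch (an addition, not in the original): a single train/test split gives a
#noisy estimate; cross-validation on the training split is a steadier way to compare
#the three models.
from sklearn.model_selection import cross_val_score
for name,model in [('decision tree',dtc),('random forest',rfc),('gradient boosting',gbc)]:
    print(name,cross_val_score(model,x_train,y_train,cv=5).mean())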
Predicting Boston house prices with linear regressors
#Description of the Boston housing data (note: load_boston was removed in scikit-learn 1.2, so this example requires an older release)
from sklearn.datasets import load_boston
boston = load_boston()
print(boston.DESCR)
#Split the Boston housing data
from sklearn.model_selection import train_test_split
import numpy as np
x = boston.data
y = boston.target
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=33,test_size=0.25)
#Analyze the spread of the regression target values
print("The max target value is",np.max(boston.target))
print("The min target value is",np.min(boston.target))
print("The average target value is",np.mean(boston.target))
#Standardize the data
from sklearn.preprocessing import StandardScaler
ss_x = StandardScaler()
ss_y = StandardScaler()
#Standardize the features and target values of the training and test sets separately
x_train = ss_x.fit_transform(x_train)
x_test = ss_x.transform(x_test)
y_train = ss_y.fit_transform(y_train.reshape(-1,1))
y_test = ss_y.transform(y_test.reshape(-1,1))
#Predict Boston house prices with a linear regression model
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(x_train,y_train)
lr_y_predict = lr.predict(x_test)
#Train an SGDRegressor for comparison
from sklearn.linear_model import SGDRegressor
sgdr = SGDRegressor()
sgdr.fit(x_train,y_train.ravel())
sgdr_y_predict = sgdr.predict(x_test)
#Evaluate the regression models' performance
print('The value of default measurement of LinearRegression is',lr.score(x_test,y_test))
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
print("The value of R-squared is",r2_score(y_test,lr_y_predict))
print("The value of MAE is",mean_squared_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(lr_y_predict)))
print("The value of MSE is",mean_absolute_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(lr_y_predict)))
Feature extraction
#DictVectorizer extracts and vectorizes features from data stored as dicts
measurements = [{'city':'Dubai','temperature':33.},{'city':'London','temperature':12.},{'city':'San Fransisco','temperature':18.}]
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer()
print(vec.fit_transform(measurements).toarray())
#Print the meaning of each feature dimension
print(vec.get_feature_names_out())  #get_feature_names() in scikit-learn < 1.0; removed in 1.2
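#For reference, the expected output: DictVectorizer sorts feature names, so the three
#one-hot 'city=...' columns come first, followed by the numeric 'temperature' column:
#[[ 1.  0.  0. 33.]
# [ 0.  1.  0. 12.]
# [ 0.  0.  1. 18.]]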
#Naive Bayes classification performance with CountVectorizer features, stop words not removed
from sklearn.datasets import fetch_20newsgroups
news = fetch_20newsgroups(subset='all')
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(news.data,news.target,test_size=0.25,random_state=33)
from sklearn.feature_extraction.text import CountVectorizer
count_vec = CountVectorizer()
x_count_train = count_vec.fit_transform(x_train)
x_count_test = count_vec.transform(x_test)
from sklearn.naive_bayes import MultinomialNB
mnb_count = MultinomialNB()
mnb_count.fit(x_count_train,y_train)
print('The accuracy of classifying 20newsgroups using Naive Bayes',mnb_count.score(x_count_test,y_test))
y_count_predict = mnb_count.predict(x_count_test)
from sklearn.metrics import classification_report
print(classification_report(y_test,y_count_predict,target_names=news.target_names))
######### Switch to TF-IDF features
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vec = TfidfVectorizer()
x_tfidf_train = tfidf_vec.fit_transform(x_train)
x_tfidf_test = tfidf_vec.transform(x_test)
mnb_tfidf = MultinomialNB()
mnb_tfidf.fit(x_tfidf_train,y_train)
print("tf-idf accuracy",mnb_tfidf.score(x_tfidf_test,y_test))
y_tfidf_predict = mnb_tfidf.predict(x_tfidf_test)
print(classification_report(y_test,y_tfidf_predict,target_names=news.target_names))
#CountVectorizer and TfidfVectorizer with English stop words removed
count_filter_vec,tfidf_filter_vec = CountVectorizer(analyzer='word',stop_words='english'),TfidfVectorizer(analyzer='word',stop_words='english')
x_count_filter_train = count_filter_vec.fit_transform(x_train)
x_count_filter_test = count_filter_vec.transform(x_test)
x_tfidf_filter_train = tfidf_filter_vec.fit_transform(x_train)
x_tfidf_filter_test = tfidf_filter_vec.transform(x_test)
mnb_count_filter = MultinomialNB()
mnb_count_filter.fit(x_count_filter_train,y_train)
print("accuracy",mnb_count_filter.score(x_count_filter_test,y_test))
y_count_filter_predict = mnb_count_filter.predict(x_count_filter_test)
mnb_tfidf_filter = MultinomialNB()
mnb_tfidf_filter.fit(x_tfidf_filter_train,y_train)
print("accuracy",mnb_tfidf_filter.score(x_count_filter_test,y_test))
y_tfidf_filter_predict = mnb_tfidf_filter.predict(x_tfidf_filter_test)
#Performance evaluation
from sklearn.metrics import classification_report
print(classification_report(y_test,y_count_filter_predict,target_names=news.target_names))
print(classification_report(y_test,y_tfidf_filter_predict,target_names=news.target_names))
#Single-threaded (then parallel) grid search over the hyperparameter combinations of a text classification pipeline
#Import the 20-category news text fetcher from sklearn.datasets
from sklearn.datasets import fetch_20newsgroups
import numpy as np
#Fetch the news data
news=fetch_20newsgroups(subset='all')
#Split the dataset
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(news.data[:3000],
news.target[:3000],test_size=0.25,random_state=33)
#Import the support vector classifier SVC
from sklearn.svm import SVC
#Import the TfidfVectorizer text feature extractor
from sklearn.feature_extraction.text import TfidfVectorizer
#Import Pipeline
from sklearn.pipeline import Pipeline
#Use a Pipeline to simplify the workflow, chaining text feature extraction and the classifier
clf=Pipeline([('vect',TfidfVectorizer(stop_words='english',analyzer='word')),('svc',SVC())])
#Hyperparameter grid: four values of svc__gamma and three values of svc__C
parameters={'svc__gamma':np.logspace(-2,1,4),'svc__C':np.logspace(-1,1,3)}
#Import the grid search model GridSearchCV
from sklearn.model_selection import GridSearchCV
#Initialize a single-threaded grid search
gs=GridSearchCV(clf,parameters,verbose=2,refit=True,cv=3)
#Re-initialize as a parallel grid search; n_jobs=-1 uses all of the machine's CPU cores
gs=GridSearchCV(clf,parameters,verbose=2,refit=True,cv=3,n_jobs=-1)
gs.fit(X_train,y_train)
print(gs.best_params_,gs.best_score_)
#Print the best model's accuracy on the test set
print(gs.score(X_test,y_test))
#0.822666666667
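#Optional sketch (an addition, not in the original): cv_results_ records the mean CV
#score for every parameter combination, which shows how sensitive the pipeline is to
#gamma and C.
import pandas as pd
results = pd.DataFrame(gs.cv_results_)
print(results[['param_svc__gamma','param_svc__C','mean_test_score']])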
Using the bag-of-words model
sent1 = 'The cat is walking in the bedroom.'
sent2 = 'A dog was running across the kitchen.'
from sklearn.feature_extraction.text import CountVectorizer
count_vec = CountVectorizer()
sentences = [sent1,sent2]
#Print the vectorized feature representation
print(count_vec.fit_transform(sentences).toarray())
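#Optional sketch (an addition, not in the original): printing the learned vocabulary
#shows which token each count column refers to; note that CountVectorizer lowercases
#the text and its default token pattern drops single-character tokens such as 'A'.
print(count_vec.get_feature_names_out())  #get_feature_names() in scikit-learn < 1.0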
Comparing random forest and XGBoost models on predicting Titanic passenger survival
from xgboost import XGBClassifier
import pandas as pd
titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')
x = titanic[['pclass','age','sex']]
y = titanic['survived']
#Fill missing age values with the mean age (assign on a copy to avoid pandas' SettingWithCopyWarning)
x = x.copy()
x['age'] = x['age'].fillna(x['age'].mean())
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25,random_state=33)
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer(sparse=False)
x_train = vec.fit_transform(x_train.to_dict(orient='records'))
x_test = vec.transform(x_test.to_dict(orient='records'))
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(x_train,y_train)
print("accuracy",rfc.score(x_test,y_test))
#Train an XGBClassifier with the default parameters
xgbc = XGBClassifier()
xgbc.fit(x_train,y_train)
print("accuracy",xgbc.score(x_test,y_test))