#!/usr/bin/python
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
# The raw dataset holds 699 samples in 11 columns:
#   - the first column is a sample id used only for lookup,
#   - the next 9 columns are tumour-related medical features (values 1-10),
#   - the last column encodes the tumour type (2 = benign, 4 = malignant).

# --- Benign / malignant breast-cancer data: preprocessing ---
# Column labels: the id, nine numbered feature columns, then the class label.
column_names = ['sample code number'] + list(map(str, range(1, 10))) + ['class']

# Download the data with pandas; the file has no header row, so the labels
# above are supplied explicitly.
DATA_URL = 'http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data'
raw = pd.read_csv(DATA_URL, names=column_names)

# '?' marks a missing value in the file: turn it into NaN, then drop every
# row that has at least one missing field.
data = raw.replace(to_replace='?', value=np.nan).dropna(how='any')
print(data.shape)  # (683, 11)

# --- Build the training and test sets ---
# Hold out a random 25% of the rows for testing; the other 75% is for training.
feature_columns = column_names[1:10]
label_column = column_names[10]
x_train, x_test, y_train, y_test = train_test_split(
    data[feature_columns],
    data[label_column],
    test_size=0.25,
    random_state=33)

# Check the size and class balance of the training labels:
# 512 training samples (344 benign, 168 malignant).
print(y_train.value_counts())
# The test split holds 171 samples (100 benign, 71 malignant);
# y_test.value_counts() would show that distribution.
# --- Benign / malignant tumour prediction with linear classifiers
#     (logistic regression and stochastic-gradient-descent estimation) ---

# Standardise the data so every feature has zero mean and unit variance;
# this keeps features with large raw values from dominating the prediction.
ss = StandardScaler()
x_train = ss.fit_transform(x_train)
# Apply the mean/variance learned from the training split to the test split.
# Re-fitting the scaler on the test data (fit_transform) would scale the two
# splits differently and leak test-set statistics into the evaluation.
x_test = ss.transform(x_test)

# Initialise the two linear classifiers.
lr = LogisticRegression()
sgdc = SGDClassifier()

# Train logistic regression, then predict the test split.
lr.fit(x_train, y_train)
lr_y_predict = lr.predict(x_test)

# Train the SGD classifier, then predict the test split.
sgdc.fit(x_train, y_train)
sgdc_y_predict = sgdc.predict(x_test)
# Accuracy (the share of correct predictions) is one important performance
# measure for a classifier.
# --- Performance analysis of the linear classifiers on the tumour task ---

# Accuracy of logistic regression on the test set, via the model's own score().
print("Accuracy of LR Classifier: %s" % lr.score(x_test, y_test))
# Precision, recall and f1-score of logistic regression.
print(classification_report(y_test, lr_y_predict, target_names=['Benign', 'Malignant']))
print("finish")

# Accuracy of the SGD classifier on the test set, via the model's own score().
# The label now names the SGD model; it used to repeat the "LR" label, which
# made the two result sections indistinguishable in the output.
print("Accuracy of SGD Classifier: %s" % sgdc.score(x_test, y_test))
# Precision, recall and f1-score of the SGD classifier.
print(classification_report(y_test, sgdc_y_predict, target_names=['Benign', 'Malignant']))
print("finish")
D:/CH2.py
(683, 11)
2 344
4 168
Name: class, dtype: int64
Accuracy of LR Classifier: 0.9707602339181286
precision recall f1-score support
Benign 0.96 0.99 0.98 100
Malignant 0.99 0.94 0.96 71
micro avg 0.97 0.97 0.97 171
macro avg 0.97 0.97 0.97 171
weighted avg 0.97 0.97 0.97 171
finish
Accuracy of LR Classifier: 0.9883040935672515
precision recall f1-score support
Benign 1.00 0.98 0.99 100
Malignant 0.97 1.00 0.99 71
micro avg 0.99 0.99 0.99 171
macro avg 0.99 0.99 0.99 171
weighted avg 0.99 0.99 0.99 171
finish
Process finished with exit code 0
代码中涉及的一些相关函数和知识点归纳拓展:
sklearn的train_test_split()各函数参数含义解释(非常全)
fit_transform,fit,transform区别和作用详解
机器学习中如何处理缺失数据?
#!/usr/bin/python
# -*- coding: utf-8 -*-
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
# Example: handwritten-digit recognition with a linear SVM.

# Load the handwritten-digits dataset bundled with scikit-learn.
digits = load_digits()
# Show the number of samples and the feature dimensionality.
print(digits.data.shape)

# Split: 25% of the samples for testing, the rest for training.
x_train, x_test, y_train, y_test = train_test_split(
    digits.data, digits.target, test_size=0.25, random_state=33)
print(y_train.shape)

# Standardise the features. The scaler is fitted on the training split only
# and then applied unchanged to the test split; re-fitting it on the test
# data (fit_transform) would scale the two splits inconsistently and leak
# test-set statistics into the evaluation.
ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.transform(x_test)

# Linear support-vector classifier.
lsvc = LinearSVC()
# Train the model.
lsvc.fit(x_train, y_train)
# Predict the classes of the test samples.
y_predict = lsvc.predict(x_test)

# Evaluation: overall accuracy, then per-class precision / recall / f1.
print('The Accuracy of Linear SVC is %s' % lsvc.score(x_test, y_test))
print(classification_report(y_test, y_predict, target_names=digits.target_names.astype(str)))
(1797L, 64L)
(1347L,)
The Accuracy of Linear SVC is 0.9488888888888889
precision recall f1-score support
0 0.92 0.97 0.94 35
1 0.95 0.98 0.96 54
2 0.98 1.00 0.99 44
3 0.93 0.93 0.93 46
4 0.97 1.00 0.99 35
5 0.94 0.94 0.94 48
6 0.96 0.98 0.97 51
7 0.90 1.00 0.95 35
8 0.98 0.83 0.90 58
9 0.95 0.91 0.93 44
micro avg 0.95 0.95 0.95 450
macro avg 0.95 0.95 0.95 450
weighted avg 0.95 0.95 0.95 450
D:\My_Program_List(zs)\Python_ML_Kagggle\venv\Scripts\python.exe
Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)
18846
From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu>
Subject: Pens fans reactions
Organization: Post Office, Carnegie Mellon, Pittsburgh, PA
Lines: 12
NNTP-Posting-Host: po4.andrew.cmu.edu
I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final
regular season game. PENS RULE!!!
(4712, 150725)
(14134, 150725)
The Accuracy of Naive Bayes Classifier is 0.8397707979626485
precision recall f1-score support
alt.atheism 0.86 0.86 0.86 201
comp.graphics 0.59 0.86 0.70 250
comp.os.ms-windows.misc 0.89 0.10 0.17 248
comp.sys.ibm.pc.hardware 0.60 0.88 0.72 240
comp.sys.mac.hardware 0.93 0.78 0.85 242
comp.windows.x 0.82 0.84 0.83 263
misc.forsale 0.91 0.70 0.79 257
rec.autos 0.89 0.89 0.89 238
rec.motorcycles 0.98 0.92 0.95 276
rec.sport.baseball 0.98 0.91 0.95 251
rec.sport.hockey 0.93 0.99 0.96 233
sci.crypt 0.86 0.98 0.91 238
sci.electronics 0.85 0.88 0.86 249
sci.med 0.92 0.94 0.93 245
sci.space 0.89 0.96 0.92 221
soc.religion.christian 0.78 0.96 0.86 232
talk.politics.guns 0.88 0.96 0.92 251
talk.politics.mideast 0.90 0.98 0.94 231
talk.politics.misc 0.79 0.89 0.84 188
talk.religion.misc 0.93 0.44 0.60 158
micro avg 0.84 0.84 0.84 4712
macro avg 0.86 0.84 0.82 4712
weighted avg 0.86 0.84 0.82 4712
done
Process finished with exit code 0
#!/usr/bin/python
# -*- coding: utf-8 -*-
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier # K节临近法
from sklearn.metrics import classification_report
# Classifying iris species with the k-nearest-neighbours algorithm.

# Load the dataset and report its shape.
iris = load_iris()
print(iris.data.shape)

# Split the data: 25% held out for testing.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.25,
    random_state=33)

# Standardise: fit the scaler on the training split, then apply the same
# transformation to the test split.
ss = StandardScaler()
ss.fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)

# Fit the classifier and predict the test labels.
knc = KNeighborsClassifier()
knc.fit(X_train, y_train)
y_predict = knc.predict(X_test)

# Accuracy and the per-class report.
knn_accuracy = knc.score(X_test, y_test)
print('The accuracy of K-Nearest Neighbor Classifier is', knn_accuracy)
knn_report = classification_report(y_test, y_predict, target_names=iris.target_names)
print(knn_report)
D:/My_Program_List(zs)/Python_ML_Kagggle/CH2_4_KNN.py
(150L, 4L)
('The accuracy of K-Nearest Neighbor Classifier is', 0.8947368421052632)
precision recall f1-score support
setosa 1.00 1.00 1.00 8
versicolor 0.73 1.00 0.85 11
virginica 1.00 0.79 0.88 19
micro avg 0.89 0.89 0.89 38
macro avg 0.91 0.93 0.91 38
weighted avg 0.92 0.89 0.90 38
Process finished with exit code 0
#!/usr/bin/python
# -*- coding: utf-8 -*-
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
# Load the Titanic training data from the local dataset directory.
titanic = pd.read_csv('../Dataset/Tencent-Datasets/Titanic/train.csv')
print(titanic.head())  # look at the first few rows
# DataFrame.info() prints its summary itself and returns None, so it is
# called directly; wrapping it in print would add a stray "None" line.
titanic.info()

# Feature selection: the features most likely to decide survival.
# .copy() makes X an independent frame, so the imputation below modifies X
# itself rather than a temporary slice of `titanic` (chained assignment).
X = titanic[['Pclass', 'Age', 'Sex']].copy()
y = titanic['Survived']

# Data preparation: 1. fill in missing values; 2. convert the features.
# Missing ages are replaced by the mean age; the result is assigned back
# instead of using inplace=True on a column selection.
X['Age'] = X['Age'].fillna(X['Age'].mean())
# Inspect the frame after imputation.
X.info()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)

# Turn each row into a dict and vectorise it. 'records' is the full name of
# the orient value; the abbreviation 'record' is rejected by pandas >= 2.0.
vec = DictVectorizer(sparse=False)
X_train = vec.fit_transform(X_train.to_dict(orient='records'))
X_test = vec.transform(X_test.to_dict(orient='records'))

# Train a single decision tree.
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
dtc_y_pred = dtc.predict(X_test)

# Train a random forest.
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc_y_pred = rfc.predict(X_test)

# Train gradient-boosted decision trees.
gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)
gbc_y_pred = gbc.predict(X_test)

# Evaluation. classification_report expects (y_true, y_pred); passing the
# predictions first swaps precision and recall and bases the support counts
# on the predictions.
print('-------显示使用决策树预测的结果------------')
print('The accuracy of decision tree is', dtc.score(X_test, y_test))
print(classification_report(y_test, dtc_y_pred))
print('-------显示使用随机树预测的结果------------')
print('The accuracy of random forest classifier is', rfc.score(X_test, y_test))
print(classification_report(y_test, rfc_y_pred))
print('-------显示使用梯度提升决策树预测的结果------------')
print('The accuracy of gradient tree boosting is', gbc.score(X_test, y_test))
print(classification_report(y_test, gbc_y_pred))
# -*- coding: utf-8 -*-
from sklearn.feature_extraction import DictVectorizer
# Feature extraction and vectorisation with DictVectorizer.

# A list of dicts stands for several samples (one dict per sample).
measurements = [
    {'city': 'Dubai', 'temperature': 33.},
    {'city': 'London', 'temperature': 12.},
    {'city': 'ZhengZhou', 'temperature': 26.},
]

# Initialise the feature extractor.
vec = DictVectorizer()

# Print the feature matrix produced by the transformation.
feature_matrix = vec.fit_transform(measurements)
print(feature_matrix.toarray())

# Print the meaning of each column of that matrix.
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out() - confirm against the installed version.
print(vec.get_feature_names())
输出结果:
[[ 1. 0. 0. 33.]
[ 0. 1. 0. 12.]
[ 0. 0. 1. 26.]]
['city=Dubai', 'city=London', 'city=ZhengZhou', 'temperature']
# -*- coding: utf-8 -*-
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
# Classify text with a Naive Bayes classifier after feature extraction.

# Fetch the 20-newsgroups text data; subset='all' requests the whole
# collection (close to 20,000 posts), stored in `news`.
news = fetch_20newsgroups(subset='all')
X_train, X_test, y_train, y_test = train_test_split(
    news.data,
    news.target,
    test_size=0.25,
    random_state=33)

# Default configuration: English stop words are NOT removed.
# To filter stop words, build the vectoriser like this instead:
#     count_vec = CountVectorizer(analyzer='word', stop_words='english')
count_vec = CountVectorizer()

# Convert the raw training and test texts to feature vectors using plain
# term counts only; the vocabulary comes from the training texts.
X_count_train = count_vec.fit_transform(X_train)
X_count_test = count_vec.transform(X_test)

# Create the Naive Bayes classifier and train it.
mnb_count = MultinomialNB()
mnb_count.fit(X_count_train, y_train)

# Print the accuracy of the model.
count_accuracy = mnb_count.score(X_count_test, y_test)
print('The accuracy of the NB with CountVecorizer:', count_accuracy)

# Per-class performance report.
y_predict = mnb_count.predict(X_count_test)
count_report = classification_report(y_test, y_predict, target_names=news.target_names)
print(count_report)
输出结果:
('The accuracy of the NB with CountVecorizer:', 0.8397707979626485)
precision recall f1-score support
alt.atheism 0.86 0.86 0.86 201
comp.graphics 0.59 0.86 0.70 250
comp.os.ms-windows.misc 0.89 0.10 0.17 248
comp.sys.ibm.pc.hardware 0.60 0.88 0.72 240
comp.sys.mac.hardware 0.93 0.78 0.85 242
comp.windows.x 0.82 0.84 0.83 263
misc.forsale 0.91 0.70 0.79 257
rec.autos 0.89 0.89 0.89 238
rec.motorcycles 0.98 0.92 0.95 276
rec.sport.baseball 0.98 0.91 0.95 251
rec.sport.hockey 0.93 0.99 0.96 233
sci.crypt 0.86 0.98 0.91 238
sci.electronics 0.85 0.88 0.86 249
sci.med 0.92 0.94 0.93 245
sci.space 0.89 0.96 0.92 221
soc.religion.christian 0.78 0.96 0.86 232
talk.politics.guns 0.88 0.96 0.92 251
talk.politics.mideast 0.90 0.98 0.94 231
talk.politics.misc 0.79 0.89 0.84 188
talk.religion.misc 0.93 0.44 0.60 158
micro avg 0.84 0.84 0.84 4712
macro avg 0.86 0.84 0.82 4712
weighted avg 0.86 0.84 0.82 4712
# -*- coding: utf-8 -*-
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Fetch the full 20-newsgroups collection and hold out 25% for testing.
news = fetch_20newsgroups(subset='all')
X_train, X_test, y_train, y_test = train_test_split(
    news.data,
    news.target,
    test_size=0.25,
    random_state=33)

# Vectorise the texts with TF-IDF weights; the vectoriser is fitted on the
# training texts and then applied to the test texts.
tfidf = TfidfVectorizer()
X_tfidf_train = tfidf.fit_transform(X_train)
X_tfidf_test = tfidf.transform(X_test)

# Train Naive Bayes on the TF-IDF features.
mnb_count = MultinomialNB()
mnb_count.fit(X_tfidf_train, y_train)

# Accuracy on the test split.
tfidf_accuracy = mnb_count.score(X_tfidf_test, y_test)
print('The accuracy of the NB with TfidfVectorizer:', tfidf_accuracy)

# Per-class performance report.
y_predict = mnb_count.predict(X_tfidf_test)
tfidf_report = classification_report(y_test, y_predict, target_names=news.target_names)
print(tfidf_report)
输出结果:
('The accuracy of the NB with TfidfVectorizer:', 0.8463497453310697)
precision recall f1-score support
alt.atheism 0.84 0.67 0.75 201
comp.graphics 0.85 0.74 0.79 250
comp.os.ms-windows.misc 0.82 0.85 0.83 248
comp.sys.ibm.pc.hardware 0.76 0.88 0.82 240
comp.sys.mac.hardware 0.94 0.84 0.89 242
comp.windows.x 0.96 0.84 0.89 263
misc.forsale 0.93 0.69 0.79 257
rec.autos 0.84 0.92 0.88 238
rec.motorcycles 0.98 0.92 0.95 276
rec.sport.baseball 0.96 0.91 0.94 251
rec.sport.hockey 0.88 0.99 0.93 233
sci.crypt 0.73 0.98 0.83 238
sci.electronics 0.91 0.83 0.87 249
sci.med 0.97 0.92 0.95 245
sci.space 0.89 0.96 0.93 221
soc.religion.christian 0.51 0.97 0.67 232
talk.politics.guns 0.83 0.96 0.89 251
talk.politics.mideast 0.92 0.97 0.95 231
talk.politics.misc 0.98 0.62 0.76 188
talk.religion.misc 0.93 0.16 0.28 158
micro avg 0.85 0.85 0.85 4712
macro avg 0.87 0.83 0.83 4712
weighted avg 0.87 0.85 0.84 4712
# -*- coding: utf-8 -*-
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
import matplotlib.pyplot as plt
# Training data: five (x, y) samples; the plot below labels x as the pizza
# diameter and y as the pizza price.
X_train = [[6], [8], [10], [14], [18]]
y_train = [[7], [9], [13], [17.5], [18]]

# Sample 100 evenly spaced points on the x axis from 0 to 26, shaped as a
# column vector so they can be passed to the feature transformer.
xx = np.linspace(0, 26, 100)
xx = xx.reshape(xx.shape[0], 1)

# Degree-4 polynomial feature generator, fitted on the training inputs.
poly4 = PolynomialFeatures(degree=4)
X_train_poly4 = poly4.fit_transform(X_train)

# Ordinary linear regression on the polynomial features.
regressor_poly4 = LinearRegression()
regressor_poly4.fit(X_train_poly4, y_train)

# Map the plotting points through the same polynomial transformation.
xx_poly4 = poly4.transform(xx)
# Predict with the degree-4 model over the sampled x axis.
yy_poly4 = regressor_poly4.predict(xx_poly4)

# Plot the training points and the degree-4 regression curve.
plt.scatter(X_train, y_train)
# plt.plot returns a list of line objects (one per plotted line); the
# trailing comma unpacks the single element of that list into plt4.
plt4, = plt.plot(xx, yy_poly4, label='Degree=4')
plt.axis([0, 25, 0, 25])
plt.xlabel('Diameter of Pizza')
plt.ylabel('Price of Pizza')
# legend() draws the legend; handles lists the plotted objects to include,
# each shown with its label.
plt.legend(handles=[plt4])
plt.show()

print('4次多项式的R平方值是', regressor_poly4.score(X_train_poly4, y_train))

# Test data.
X_test = [[6], [8], [11], [16]]
y_test = [[8], [12], [15], [18]]
X_test_poly4 = poly4.transform(X_test)
# Score on the TEST features. Scoring X_train_poly4 (5 rows) against y_test
# (4 rows) fails on the length mismatch, and the result was never shown.
print(regressor_poly4.score(X_test_poly4, y_test))
# -*- coding: utf-8 -*-
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

# Five training samples.
X_train = [[6], [8], [10], [14], [18]]
y_train = [[7], [9], [13], [17.5], [18]]

# Four test samples.
X_test = [[6], [8], [11], [16]]
y_test = [[8], [12], [15], [18]]

# Degree-4 polynomial feature generator: fitted on the training inputs,
# then reused unchanged for the test inputs.
poly4 = PolynomialFeatures(degree=4)
X_train_poly4 = poly4.fit_transform(X_train)
X_test_poly4 = poly4.transform(X_test)

# --- No regularisation: plain linear regression on the polynomial features ---
regressor_poly4 = LinearRegression()
regressor_poly4.fit(X_train_poly4, y_train)
plain_score = regressor_poly4.score(X_test_poly4, y_test)
print(plain_score)
# Parameter list of the regression model.
print(regressor_poly4.coef_, '\n')
# Sum of squared parameters, used to compare the models' parameter sizes.
plain_sq_sum = np.sum(regressor_poly4.coef_ ** 2)
print('求平方和,验证参数之间的差异 \n', plain_sq_sum, '\n')

# --- L1-norm regularisation (Lasso) ---
lasso_poly4 = Lasso()
lasso_poly4.fit(X_train_poly4, y_train)
lasso_score = lasso_poly4.score(X_test_poly4, y_test)
print(lasso_score)
# Parameter list of the Lasso model.
print(lasso_poly4.coef_)

# --- L2-norm regularisation (Ridge) ---
ridge_poly4 = Ridge()
ridge_poly4.fit(X_train_poly4, y_train)
ridge_score = ridge_poly4.score(X_test_poly4, y_test)
print(ridge_score)
print(ridge_poly4.coef_, '\n')
ridge_sq_sum = np.sum(ridge_poly4.coef_ ** 2)
print('观察参数间的差异: \n', ridge_sq_sum)